Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: when lingroups are provided, use them for csv_summary #3311

Merged
merged 15 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -637,13 +637,16 @@ sourmash tax metagenome
--taxonomy gtdb-rs202.taxonomy.v2.csv
```

The possible output formats are:
- `human`
- `csv_summary`
- `lineage_summary`
- `krona`
- `kreport`
- `lingroup_report`
The possible output formats are listed below, followed by the file extension used when writing to a file rather than stdout. When using more than one output format, you must provide an output basename (`--output-base`) that will be used to name the output files. If an `--output-dir` is provided, files will output to that directory.

- `human`: ".human.txt",
- `csv_summary`: ".summarized.csv",
- `lineage_summary`: ".lineage_summary.tsv",
- `krona`: ".krona.tsv",
- `kreport`: ".kreport.txt",
- `lingroup`: ".lingroup.tsv",
- `bioboxes`: ".bioboxes.profile",


#### `csv_summary` output format

Expand Down Expand Up @@ -672,6 +675,9 @@ o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus
```
The `query_md5` and `query_filename` columns are omitted here for brevity.

Note: When using `--lins` with a `--lingroup` file, the `csv_summary` file will report
summarization for each specified `lingroup`, rather than all possible `lin` ranks (v4.8.12+).

#### `krona` output format

`krona` format is a tab-separated list of these results at a specific rank.
Expand Down Expand Up @@ -842,6 +848,8 @@ lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000

Related lingroup subpaths will be grouped in output, but exact ordering may change between runs.

Note: this output format requires a single sample only. For a similar output with multiple query samples, provide the `lingroup` file and use the 'csv_summary' output format.

#### `bioboxes` output format

When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o/--output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges.
Expand Down Expand Up @@ -971,7 +979,12 @@ sourmash tax genome
> This command uses the default classification strategy, which uses a
containment threshold of 0.1 (10%).

There are two possible output formats, `csv_summary` and `krona`.
`sourmash tax genome` can produce the following output formats:

- `human`: ".human.txt",
- `csv_summary`: ".classifications.csv",
- `krona`: ".krona.tsv",
- `lineage_summary`: ".lineage_summary.tsv",

#### `csv_summary` output format

Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/cli/tax/metagenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ def subparser(subparsers):
"--lingroups",
metavar="FILE",
default=None,
help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.",
help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. For 'tax metagenome' runs with a single 'gather' file (single query), providing this file will allow us to output a 'lingroup' report containing taxonomic summarization for each group. For multiple queries, we recommend the 'csv_summary' output format.",
)
subparser.add_argument(
"--ictv",
"--ictv-taxonomy",
action="store_true",
default=False,
help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.",
help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.",
)
add_rank_arg(subparser)

Expand Down
24 changes: 15 additions & 9 deletions src/sourmash/tax/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def metagenome(args):
notify("No gather results loaded. Exiting.")
sys.exit(-1)

single_query_output_formats = ["csv_summary", "kreport"]
single_query_output_formats = ["kreport", "lingroup", "bioboxes"]
desired_single_outputs = []
if len(query_gather_results) > 1: # working with multiple queries
desired_single_outputs = [
Expand Down Expand Up @@ -154,6 +154,15 @@ def metagenome(args):
error(f"ERROR: {str(exc)}")
sys.exit(-1)

# if lingroup file is passed in, read it
lingroups = None
if args.lingroup is not None:
try:
lingroups = tax_utils.read_lingroups(args.lingroup)
except ValueError as exc:
error(f"ERROR: {str(exc)}")
sys.exit(-1)

# write summarized output in human-readable format
if "lineage_summary" in args.output_format:
lineage_outfile, limit_float = make_outfile(
Expand Down Expand Up @@ -202,7 +211,10 @@ def metagenome(args):
)
with FileOutputCSV(summary_outfile) as out_fp:
tax_utils.write_summary(
query_gather_results, out_fp, limit_float_decimals=limit_float
query_gather_results,
out_fp,
limit_float_decimals=limit_float,
lingroups=lingroups,
)

# write summarized --> kreport output tsv
Expand All @@ -218,13 +230,7 @@ def metagenome(args):
)

# write summarized --> LINgroup output tsv
if "lingroup" in args.output_format:
try:
lingroups = tax_utils.read_lingroups(args.lingroup)
except ValueError as exc:
error(f"ERROR: {str(exc)}")
sys.exit(-1)

if "lingroup" in args.output_format and lingroups is not None:
lingroupfile, limit_float = make_outfile(
args.output_base, "lingroup", output_dir=args.output_dir
)
Expand Down
38 changes: 34 additions & 4 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,14 +1116,17 @@ def write_summary(
sep=",",
limit_float_decimals=False,
classification=False,
lingroups=None,
):
"""
Write taxonomy-summarized gather results for each rank.
"""
w = None
for q_res in query_gather_results:
header, summary = q_res.make_full_summary(
limit_float=limit_float_decimals, classification=classification
limit_float=limit_float_decimals,
classification=classification,
lingroups=lingroups,
)
if w is None:
w = csv.DictWriter(csv_fp, header, delimiter=sep)
Expand Down Expand Up @@ -2073,9 +2076,19 @@ def as_lineage_dict(self, query_info, ranks):
lD[rank] = lin_name
return lD

def as_summary_dict(self, query_info, limit_float=False):
def as_summary_dict(self, query_info, limit_float=False, lingroups=None):
sD = asdict(self)
sD["lineage"] = self.lineage.display_lineage(null_as_unclassified=True)
# if lingroups, convert lingroup number to lingroup name
if lingroups is not None and sD["lineage"] in lingroups.keys():
sD["lineage"] = lingroups[sD["lineage"]]
elif (
lingroups
and sD["lineage"] != "unclassified"
and sD["lineage"] not in lingroups.keys()
):
return None
# import pdb;pdb.set_trace()
bluegenes marked this conversation as resolved.
Show resolved Hide resolved
sD["query_name"] = query_info.query_name
sD["query_md5"] = query_info.query_md5
sD["query_filename"] = query_info.query_filename
Expand Down Expand Up @@ -2553,7 +2566,9 @@ def make_human_summary(self, display_rank, classification=False):
results.append(res.as_human_friendly_dict(query_info=self.query_info))
return results

def make_full_summary(self, classification=False, limit_float=False):
def make_full_summary(
self, classification=False, limit_float=False, lingroups=None
):
results = []
rD = {}
if classification:
Expand Down Expand Up @@ -2590,16 +2605,31 @@ def make_full_summary(self, classification=False, limit_float=False):
"total_weighted_hashes",
]

lingroup_ranks = set()
if lingroups is not None:
for lin in lingroups.keys():
# e.g. "14;1;0;0;0;0;0;0;0;0" => 9
lin_rank = len(lin.split(";")) - 1
lingroup_ranks.add(lin_rank)

for rank in self.summarized_ranks[::-1]: # descending
# if lingroups are provided, only report summary for specified lingroups
if lingroup_ranks:
if int(rank) not in lingroup_ranks:
continue
unclassified = []
rank_results = self.summarized_lineage_results[rank]
rank_results.sort(
key=lambda res: -res.fraction
) # v5?: f_weighted_at_rank)
for res in rank_results:
rD = res.as_summary_dict(
query_info=self.query_info, limit_float=limit_float
query_info=self.query_info,
limit_float=limit_float,
lingroups=lingroups,
)
if rD is None:
continue
# save unclassified for the end
if rD["lineage"] == "unclassified":
unclassified.append(rD)
Expand Down
Loading
Loading