You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
author = {Nicola Segata and Levi Waldron and Annalisa Ballarini and Vagheesh Narasimhan and Olivier Jousson and Curtis Huttenhower},
107
-
title = {Metagenomic microbial community profiling using unique clade-specific marker genes},
108
-
journal = {Nature Methods},
109
-
volume = {9},
110
-
pages = {811--814},
111
-
year = {2012},
112
-
doi = {10.1038/nmeth.2066}
113
-
}
114
-
115
-
@article{ondov2011interactive,
116
-
author = {Brian D. Ondov and Nicholas H. Bergman and Adam M. Phillippy},
117
-
title = {Interactive metagenomic visualization in a Web browser},
118
-
journal = {BMC Bioinformatics},
119
-
volume = {12},
120
-
pages = {385},
121
-
year = {2011},
122
-
doi = {10.1186/1471-2105-12-385}
123
-
}
124
-
125
-
@article{lindgreen2016evaluation,
126
-
author = {Simon Lindgreen and Karen L. Adair and Paul P. Gardner},
127
-
title = {An evaluation of the accuracy and speed of metagenome analysis tools},
128
-
journal = {Scientific Reports},
129
-
volume = {6},
130
-
pages = {19233},
131
-
year = {2016},
132
-
doi = {10.1038/srep19233}
133
-
}
134
-
135
-
@article{ye2019benchmarking,
136
-
author = {Shaojun H. Ye and Katherine J. Siddle and Daniel J. Park and Pardis C. Sabeti},
137
-
title = {Benchmarking metagenomics tools for taxonomic classification},
138
-
journal = {Cell},
139
-
volume = {178},
140
-
number = {4},
141
-
pages = {779--794},
142
-
year = {2019},
143
-
doi = {10.1016/j.cell.2019.07.010}
144
-
}
145
-
146
105
@article{sczyrba2017cami,
147
106
author = {Alexander Sczyrba and Peter Hofmann and Peter Belmann and David Koslicki and Stefan Janssen and Johannes Dröge and Ivan Gregor and Stephan Majda and Jessika Fiedler and Eik Dahms and others},
148
107
title = {Critical Assessment of Metagenome Interpretation: a benchmark of metagenomics software},
@@ -154,24 +113,38 @@ @article{sczyrba2017cami
154
113
doi = {10.1038/nmeth.4458}
155
114
}
156
115
157
-
@article{meyer2022cami2,
158
-
author = {Fernando Meyer and Adrian Fritz and Zhi-Luo Deng and David Koslicki and Till Robin Lesker and Alexey Gurevich and Gary Robertson and Mohammed Alser and Dmitry Antipov and Francesco Beghini and others},
159
-
title = {Critical Assessment of Metagenome Interpretation: the second round of challenges},
160
-
journal = {Nature Methods},
161
-
volume = {19},
162
-
number = {4},
163
-
pages = {429--440},
164
-
year = {2022},
165
-
doi = {10.1038/s41592-022-01415-5}
166
-
}
167
-
168
-
@article{huson2016megan,
169
-
author = {Daniel H. Huson and Alexander F. Auch and Ji Qi and Stephan C. Schuster},
170
-
title = {MEGAN Community Edition – Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data},
171
-
journal = {PLOS Computational Biology},
172
-
volume = {12},
173
-
number = {6},
174
-
pages = {e1004957},
175
-
year = {2016},
176
-
doi = {10.1371/journal.pcbi.1004957}
116
+
@article{Hunter2007,
117
+
author = {John D. Hunter},
118
+
title = {Matplotlib: A 2D Graphics Environment},
119
+
journal = {Computing in Science \& Engineering},
120
+
volume = {9},
121
+
number = {3},
122
+
pages = {90--95},
123
+
year = {2007},
124
+
publisher = {IEEE Computer Society},
125
+
doi = {10.1109/MCSE.2007.55},
126
+
url = {https://doi.org/10.1109/MCSE.2007.55}
127
+
}
128
+
129
+
@software{reback2020pandas,
130
+
author = {{The pandas development team}},
131
+
title = {pandas-dev/pandas: Pandas},
132
+
month = feb,
133
+
year = 2020,
134
+
publisher = {Zenodo},
135
+
version = {latest},
136
+
doi = {10.5281/zenodo.3509134},
137
+
url = {https://doi.org/10.5281/zenodo.3509134}
138
+
}
139
+
140
+
@article{Waskom2021,
141
+
author = {Michael L. Waskom},
142
+
title = {seaborn: statistical data visualization},
Copy file name to clipboardExpand all lines: paper/paper.md
+11-3Lines changed: 11 additions & 3 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -24,7 +24,7 @@ bibliography: paper.bib
24
24
25
25
Analyzing the taxonomic profiles of metagenomic samples often involves running k-mer based classifiers (like `Kraken2`) that generate detailed reports of read counts and abundances across taxa. These reports, while information-rich, are not immediately convenient for comparative analysis: they list each taxon in a hierarchical format for a single sample, and researchers must manually parse and merge multiple files to compare communities across samples. Existing scripts such as the `KrakenTools` suite [@lu2022kraken] (developed alongside `Kraken`) provide some post-processing functionality, but they require multiple steps and technical expertise to use. Similarly, interactive tools like `Pavian` focus on visualization and exploration of `Kraken` results rather than automated batch processing [@breitwieser2020pavian]. There is a clear need for a streamlined solution to transform raw `Kraken`-family outputs into tidy data matrices and summary statistics that can be readily used in downstream analysis or publication figures. `KrakenParser` fulfills this need by offering an all-in-one pipeline that reads in multiple `Kraken2`/`Bracken`/`Metabuli` reports and outputs clean CSV tables of taxonomic counts or relative abundances, optionally filtering out low-abundance taxa or non-target taxa (e.g. human reads) as specified by the user. This greatly simplifies metagenomic workflows, especially in comparative studies or clinical settings where dozens of samples must be processed consistently. By bridging the gap between raw classifier output and statistical analysis, KrakenParser empowers researchers who may not be bioinformatics experts to leverage high-throughput metagenomics with minimal data wrangling.
26
26
27
-
Metagenomic classification has seen rapid development, with numerous tools available for assigning sequencing reads to taxa. `Kraken` was introduced in 2014 as an ultrafast k-mer based classifier [@wood2014kraken], and its successor `Kraken2` [@wood2019kraken2] further reduced memory usage and improved speed. Other k-mer classifiers include `Bracken` [@lu2017bracken], which refines `Kraken`’s counts to improve abundance estimates, `KrakenUniq` which tracks unique k-mers per taxon to reduce false positives [@breitwieser2018krakenuniq], `Centrifuge` which uses an FM-index to allow classification with compressed databases [@kim2016centrifuge], and `CLARK` which uses discriminative k-mers for fast classification [@ounit2015clark]. More recently, tools like `Kaiju` perform classification in protein space for greater sensitivity (especially on viruses) [@menzel2016kaiju], and `Metabuli` combines DNA and translated amino acid matching to improve accuracy [@kim2024metabuli]. Comprehensive evaluations have benchmarked these methods’ accuracy and speed, and community challenges like `CAMI` have pushed development of improved classifiers [@sczyrba2017cami]. Despite the variety of classifiers, a common challenge remains: the output format. Many tools output reports similar to `Kraken`’s: tab-delimited text with hierarchical labels and counts. To interpret such outputs, researchers often rely on additional scripts or manual processing. `KrakenTools` [@lu2022kraken] provides scripts to combine `Kraken` reports, convert to other formats (e.g., `Krona` for visualization, or `BIOM` for ecological analysis), and filter results. `Pavian` and other interactive platforms allow users to visualize results with `Sankey` diagrams and heatmaps [@breitwieser2020pavian], but require use of a web interface or `R` environment. 
There are also lightweight utilities (e.g., `spideog` and `scrubby`) to convert Kraken reports to CSV or clean them, and researchers adept in programming sometimes write custom parsing scripts. In summary, prior to `KrakenParser`, users had to piece together multiple tools to achieve tasks like merging reports from multiple samples, summing reads at specific taxonomic ranks, and computing relative abundances. `KrakenParser` builds on this state of the field by consolidating the post-processing steps into one tool. It serves as an ideological successor to `KrakenTools` [@lu2022kraken], using some of the same internal conversion steps (like `KrakenTools`’ report-to-MPA conversion) but adding improvements in automation, filtering, and output formatting. By producing standardized CSV tables (with samples as rows and taxa as columns) and by computing percentages automatically, `KrakenParser` greatly accelerates the transition from raw classification data to biological insights. This is particularly valuable given the increasing scale of metagenomic studies (where dozens or hundreds of samples are profiled) and the need for reproducible, efficient analysis pipelines.
27
+
Metagenomic classification has seen rapid development, with numerous tools available for assigning sequencing reads to taxa. `Kraken` was introduced in 2014 as an ultrafast k-mer based classifier [@wood2014kraken], and its successor `Kraken2` [@wood2019kraken2] further reduced memory usage and improved speed. Other k-mer classifiers include `Bracken` [@lu2017bracken], which refines `Kraken`’s counts to improve abundance estimates, `KrakenUniq` which tracks unique k-mers per taxon to reduce false positives [@breitwieser2018krakenuniq], `Centrifuge` which uses an FM-index to allow classification with compressed databases [@kim2016centrifuge], and `CLARK` which uses discriminative k-mers for fast classification [@ounit2015clark]. More recently, tools like `Kaiju` perform classification in protein space for greater sensitivity (especially on viruses) [@menzel2016kaiju], and `Metabuli` combines DNA and translated amino acid matching to improve accuracy [@kim2024metabuli]. Comprehensive evaluations have benchmarked these methods’ accuracy and speed, and community challenges like `CAMI` have pushed development of improved classifiers [@sczyrba2017cami]. Despite the variety of classifiers, a common challenge remains: the output format. Many tools output reports similar to `Kraken`’s: tab-delimited text with hierarchical labels and counts. To interpret such outputs, researchers often rely on additional scripts or manual processing. `KrakenTools` [@lu2022kraken] provides scripts to combine `Kraken` reports and convert them to other formats (e.g., `Krona` for visualization). `Pavian` and other interactive platforms allow users to visualize results with `Sankey` diagrams and heatmaps [@breitwieser2020pavian], but require use of a web interface or `R` environment. There are also lightweight utilities (e.g., [`spideog`](https://github.com/jeanmanguy/spideog)) to convert Kraken reports to CSV or clean them, and researchers adept in programming sometimes write custom parsing scripts. 
In summary, prior to `KrakenParser`, users had to piece together multiple tools to achieve tasks like merging reports from multiple samples, summing reads at specific taxonomic ranks, and computing relative abundances. `KrakenParser` builds on this state of the field by consolidating the post-processing steps into one tool. It serves as an ideological successor to `KrakenTools` [@lu2022kraken], using some of the same internal conversion steps (like `KrakenTools`’ report-to-MPA conversion) but adding improvements in automation, filtering, and output formatting. By producing standardized CSV tables (with samples as rows and taxa as columns) and by computing percentages automatically, `KrakenParser` greatly accelerates the transition from raw classification data to biological insights. This is particularly valuable given the increasing scale of metagenomic studies (where dozens or hundreds of samples are profiled) and the need for reproducible, efficient analysis pipelines.
28
28
29
29
# Implementation
30
30
@@ -37,10 +37,18 @@ Metagenomic classification has seen rapid development, with numerous tools avail
37
37
5. Convert to CSV: The cleaned text tables are converted to CSV files (comma-separated values). In this transpose operation, taxa become columns and sample identifiers become rows, yielding a standard matrix format. This structured CSV is easy to import into statistical software, spreadsheets, or R/Python data frames for further analysis.
38
38
6. Calculate relative abundances: For each count table, `KrakenParser` can create a corresponding relative abundance table (`--relabund` option) by computing percentages of total reads per sample, using the formula: $\text{Relative Abundance} = \left( \frac{\text{Number of reads assigned to the taxon}}{\text{Total number of reads assigned to all taxa}} \right) \times 100$. Users can specify a threshold to group low-abundance taxa into an “Other” category. This results in a normalized profile for each sample, often more interpretable in comparative studies than raw counts.
39
39
40
-
Each of these steps is exposed as a sub-command in the CLI, so advanced users can integrate KrakenParser into custom workflows. By default, running `KrakenParser --complete -i <reports_dir>/kreports` executes all steps sequentially, writing outputs to a structured directory tree (with subfolders for each step). The outputs include one CSV file per rank (e.g. counts_phylum.csv, counts_species.csv) containing absolute read counts, and similarly named files under a `csv_relabund/` directory for percentages if requested. KrakenParser is optimized for speed and memory efficiency given the nature of the task: it processes text files line by line and uses `pandas` data frames for merging and calculations, which easily handle dozens of samples and tens of thousands of taxa on a standard workstation. The reliance on `KrakenTools` for the initial conversion ensures that the parsing logic benefits from the robustness of well-tested scripts, while the unified interface adds convenience. The tool also includes built-in help for each subcommand (`-h`), guiding users on required inputs and options. `KrakenParser`’s design reflects practical needs observed in the metagenomics community—it was tested during the [2025 “Bioinformatics Bootcamp”](https://pish.itmo.ru/genomics-bootcamp) hackathon organized by ITMO University, where teams analyzing metagenomic datasets were able to obtain meaningful results in a short time thanks to `KrakenParser`’s streamlined processing pipeline. By combining established methods with new automation, `KrakenParser` provides an efficient, reproducible, and user-friendly means to handle the otherwise tedious steps of post-classification data processing.
40
+
Each of these steps is exposed as a sub-command in the CLI, so advanced users can integrate KrakenParser into custom workflows. By default, running `KrakenParser --complete -i <reports_dir>/kreports` executes all steps sequentially, writing outputs to a structured directory tree (with subfolders for each step). The outputs include one CSV file per rank (e.g. counts_phylum.csv, counts_species.csv) containing absolute read counts, and similarly named files under a `csv_relabund/` directory for percentages if requested. KrakenParser is optimized for speed and memory efficiency given the nature of the task: it processes text files line by line and uses `pandas` data frames for merging and calculations, which easily handle dozens of samples and tens of thousands of taxa on a standard workstation. The reliance on `KrakenTools` for the initial conversion ensures that the parsing logic benefits from the robustness of well-tested scripts, while the unified interface adds convenience. The tool also includes built-in help for each subcommand (`-h`), guiding users on required inputs and options. `KrakenParser`’s design reflects practical needs observed in the metagenomics community - it was tested during the [2025 “Bioinformatics Bootcamp”](https://pish.itmo.ru/genomics-bootcamp) hackathon organized by ITMO University, where teams analyzing metagenomic datasets were able to obtain meaningful results in a short time thanks to `KrakenParser`’s streamlined processing pipeline. By combining established methods with new automation, `KrakenParser` provides an efficient, reproducible, and user-friendly means to handle the otherwise tedious steps of post-classification data processing.
41
+
42
+
`KrakenParser` also offers a suite of `Python`-based visualization tools to facilitate the interpretation of taxonomic profiles:
43
+
- Stacked Bar Plots: Utilizing `matplotlib` [@Hunter2007] and `pandas` [@reback2020pandas], `KrakenParser` can generate stacked bar plots that display the relative abundances of taxa across multiple samples. These plots provide a clear comparison of taxonomic compositions between samples.
44
+
- Streamgraphs: For a more dynamic representation, `KrakenParser` can create streamgraphs using `matplotlib`’s `stackplot` function [@Hunter2007] with a symmetric baseline. This visualization emphasizes changes in taxa abundances over a series of samples, highlighting temporal or sequential patterns.
45
+
- Combined Visualizations: To offer both detailed and overarching views, `KrakenParser` supports combined plots that integrate stacked bar plots and streamgraphs. This dual representation aids in comprehensive data analysis.
46
+
- Clustermaps: Employing `seaborn` [@Waskom2021], `KrakenParser` can produce clustermaps that perform hierarchical clustering on taxa and samples. These heatmaps reveal patterns and groupings in the data, facilitating the identification of similar taxonomic profiles.
47
+
48
+
These visualization tools are accessible through the `KrakenParser` Python API, allowing users to customize and integrate them into their analysis workflows seamlessly.
41
49
42
50
# Acknowledgements
43
51
44
-
The development of `KrakenParser` and its use in community workshops were supported by the Russian Science Foundation (project no. 25-24-00351).
52
+
The development of `KrakenParser` was supported by the Russian Science Foundation (Project no. 25-24-00351).
0 commit comments