Skip to content

Commit bb95d84

Browse files
committed
add diversity
1 parent 7140385 commit bb95d84

1 file changed

Lines changed: 114 additions & 0 deletions

File tree

krakenparser/diversity.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#!/usr/bin/env python
2+
3+
import pandas as pd
4+
import numpy as np
5+
import sys
6+
import shutil
7+
import argparse
8+
from pathlib import Path
9+
from skbio.diversity import beta_diversity
10+
from skbio.stats import subsample_counts
11+
12+
13+
# Define Shannon index
14+
def shannon_index(counts):
15+
counts = np.array(counts)
16+
counts = counts[counts > 0]
17+
proportions = counts / counts.sum()
18+
return -np.sum(proportions * np.log(proportions))
19+
20+
21+
# Define Pielou's evenness
22+
def pielou_evenness(counts):
23+
counts = np.array(counts)
24+
counts = counts[counts > 0]
25+
H = shannon_index(counts)
26+
S = len(counts)
27+
return H / np.log(S) if S > 1 else 0
28+
29+
30+
# Define Chao1 richness estimator
31+
def chao1_index(counts):
32+
counts = np.array(counts)
33+
S_obs = np.sum(counts > 0)
34+
F1 = np.sum(counts == 1)
35+
F2 = np.sum(counts == 2)
36+
if F2 == 0:
37+
return S_obs + F1 * (F1 - 1) / 2
38+
return S_obs + (F1 * F1) / (2 * F2)
39+
40+
41+
def calc_alpha_div(df, output_path):
42+
results = []
43+
for sample_id, row in df.iterrows():
44+
counts = row.values
45+
results.append(
46+
{
47+
"Sample": sample_id,
48+
"Shannon": shannon_index(counts),
49+
"Pielou": pielou_evenness(counts),
50+
"Chao1": chao1_index(counts),
51+
}
52+
)
53+
alpha_df = pd.DataFrame(results).set_index("Sample")
54+
alpha_df.to_csv(output_path / "alpha_div.csv")
55+
56+
57+
def calc_beta_div(df, output_path, rarefaction_depth):
58+
rarefied_counts = []
59+
sample_ids = []
60+
61+
for sample, row in df.iterrows():
62+
counts = row.values.astype(int)
63+
if counts.sum() >= rarefaction_depth:
64+
rarefied = subsample_counts(counts, n=rarefaction_depth)
65+
rarefied_counts.append(rarefied)
66+
sample_ids.append(sample)
67+
68+
if len(rarefied_counts) < 2:
69+
raise ValueError("Not enough samples passed the rarefaction threshold.")
70+
71+
bray_df = beta_diversity(
72+
"braycurtis", rarefied_counts, ids=sample_ids
73+
).to_data_frame()
74+
jaccard_df = beta_diversity(
75+
"jaccard", rarefied_counts, ids=sample_ids
76+
).to_data_frame()
77+
78+
bray_df.to_csv(output_path / "beta_div_bray.csv")
79+
jaccard_df.to_csv(output_path / "beta_div_jaccard.csv")
80+
81+
82+
if __name__ == "__main__":
83+
parser = argparse.ArgumentParser(description="Calculate α & β-diversities.")
84+
parser.add_argument(
85+
"-i",
86+
"--input",
87+
required=True,
88+
help="Input total count table CSV (species level).",
89+
)
90+
parser.add_argument("-o", "--output", required=True, help="Output directory path.")
91+
parser.add_argument(
92+
"-d",
93+
"--depth",
94+
type=int,
95+
default=1000,
96+
help="Rarefaction depth for β diversity (default: 1000).",
97+
)
98+
args = parser.parse_args()
99+
100+
input_file = Path(args.input)
101+
output_dir = Path(args.output)
102+
output_dir.mkdir(parents=True, exist_ok=True)
103+
104+
df = pd.read_csv(input_file, index_col=0)
105+
106+
calc_alpha_div(df, output_dir)
107+
calc_beta_div(df, output_dir, args.depth)
108+
print(
109+
f"α & β-diversities have been successfully calculated and saved to '{output_dir}'."
110+
)
111+
112+
pycache_dir = Path(__file__).resolve().parent / "__pycache__"
113+
if pycache_dir.exists() and pycache_dir.is_dir():
114+
shutil.rmtree(pycache_dir)

0 commit comments

Comments
 (0)