Skip to content

Commit b820bbb

Browse files
authored
Merge pull request #360 from broadinstitute/jlc_fix_adata.obs_numeric
Ensure numeric metadata in adata.obs are assigned TYPE numeric in metadata fragment files
2 parents 1ce5944 + 4e5a153 commit b820bbb

File tree

4 files changed

+351
-4
lines changed

4 files changed

+351
-4
lines changed

ingest/anndata_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def generate_metadata_file(adata, output_name):
147147
headers = adata.obs.columns.tolist()
148148
types = []
149149
for header in headers:
150-
if pd.api.types.is_number(adata.obs[header]):
150+
if pd.api.types.is_numeric_dtype(adata.obs[header]):
151151
types.append("NUMERIC")
152152
else:
153153
types.append("GROUP")

scripts/sSCP395_h5ad_CAS.qmd

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
---
2+
title: "sSCP395 h5ad for CAS"
3+
format: html
4+
---
5+
6+
7+
```{python}
8+
import numpy as np
9+
import pandas as pd
10+
import scanpy as sc
11+
adata = sc.read_10x_h5("PBMChealthyNoSort3k2.0.0/pbmc_unsorted_3k_filtered_feature_bc_matrix.h5")
12+
adata
13+
```
14+
15+
```{python}
16+
meta = pd.read_csv('prep_files/orig_10xmultiome_metadata4h5ad.tsv', sep="\t", index_col=0)
17+
adata.obs = meta
18+
```
19+
20+
21+
```{python}
22+
umap = pd.read_csv('PBMChealthyNoSort3k2.0.0/analysis/dimensionality_reduction/gex/umap_projection.csv', usecols=[1, 2]).to_numpy()
23+
tsne = pd.read_csv('PBMChealthyNoSort3k2.0.0/analysis/dimensionality_reduction/gex/tsne_projection.csv', usecols=[1, 2]).to_numpy()
24+
```
25+
26+
27+
```{python}
28+
adata.obsm["X_umap"] = umap
29+
adata.obsm["X_tsne"] = tsne
30+
```
31+
32+
33+
```{python}
34+
adata.write_h5ad("sSCP395.h5ad")
35+
```
36+
37+
# start CAS notebook
38+
39+
40+
```{python}
41+
import scanpy as sc
42+
import warnings
43+
warnings.filterwarnings("ignore", category=FutureWarning)
44+
warnings.filterwarnings("ignore", category=UserWarning)
45+
sc.set_figure_params(dpi=80)
46+
```
47+
48+
49+
50+
```{python}
51+
sc.pl.umap(adata, color='gex_cluster')
52+
```
53+
54+
55+
```{python}
56+
with open(".cas-api-token") as f:
57+
api_token = f.read().strip()"
58+
```
59+
60+
61+
```{python}
62+
from cellarium.cas.client import CASClient
63+
64+
cas = CASClient(api_token=api_token)
65+
```
66+
67+
68+
```{python}
69+
cas_model_name = 'cellarium-pca-default-v1'
70+
cas_ontology_aware_response = cas.annotate_matrix_cell_type_ontology_aware_strategy(
71+
matrix=adata,
72+
chunk_size=500,
73+
feature_ids_column_name='gene_ids',
74+
feature_names_column_name='index',
75+
cas_model_name=cas_model_name)
76+
```
77+
78+
```{python}
79+
from cellarium.cas._io import suppress_stderr
80+
from cellarium.cas.visualization import CASCircularTreePlotUMAPDashApp
81+
82+
DASH_SERVER_PORT = 8050
83+
84+
with suppress_stderr():
85+
CASCircularTreePlotUMAPDashApp(
86+
adata, # the AnnData file
87+
cas_ontology_aware_response, # CAS response
88+
cluster_label_obs_column="gex_label", # (optional) The .obs column name containing cluster labels
89+
).run(port=DASH_SERVER_PORT, debug=False, jupyter_width="100%")
90+
```
91+
92+
```{python}
93+
import cellarium.cas.postprocessing.ontology_aware as pp
94+
from cellarium.cas.postprocessing.cell_ontology import CellOntologyCache
95+
96+
with suppress_stderr():
97+
cl = CellOntologyCache()
98+
```
99+
100+
```{python}
101+
from cellarium.cas.postprocessing import insert_cas_ontology_aware_response_into_adata
102+
103+
insert_cas_ontology_aware_response_into_adata(cas_ontology_aware_response, adata, cl)
104+
```
105+
```{python}
106+
107+
adata.obs["gex_cluster"] = adata.obs["gex_cluster"].astype('category')
108+
pp.compute_most_granular_top_k_calls_cluster(
109+
adata=adata,
110+
cl=cl,
111+
min_acceptable_score=0.1, # minimum acceptable score for a call
112+
cluster_label_obs_column='gex_cluster', # .obs column containing cluster labels
113+
top_k=3, # how many top calls to make?
114+
obs_prefix='cell_type_cas_cluster_calls' # .obs column to write the top-k calls to
115+
)
116+
```
117+
118+
```{python}
119+
pp.compute_most_granular_top_k_calls_single(
120+
adata=adata,
121+
cl=cl,
122+
min_acceptable_score=0.1, # minimum acceptable score for a call
123+
top_k=3, # how many top calls to make?
124+
obs_prefix="cas_cell_type_single_calls" # .obs column to write the top-k calls to
125+
)
126+
```
127+
128+
```{python}
129+
import pickle
130+
pickle.dump(cas_ontology_aware_response, open( "ontology_aware_strategy_cas_response.pickle", "wb"))
131+
```
132+
133+
```{python}
134+
test = cas.annotate_matrix_cell_type_summary_statistics_strategy(adata)
135+
```
136+
137+
138+
```{python}
139+
import json
140+
with open("ontology_aware_strategy_cas_response.json", "w") as f:
141+
f.write(json.dumps(cas_ontology_aware_response))
142+
143+
with open("cell_type_summary_statistics_strategy_response.json", "w") as f:
144+
f.write(json.dumps(test))
145+
```
146+
147+
148+
```{python}
149+
150+
adata.write_h5ad("sSCP395_CAS-annotated.h5ad")
151+
```
152+
153+
```{python}
154+
cluster_types = ["umap", "tsne"]
155+
mask = adata.obs.columns.str.contains('.*cell_type.*label.*')
156+
extracted = adata.obs.loc[:,mask]
157+
headers = ["NAME", "X", "Y" ] + extracted.columns.to_list()
158+
types = ["TYPE"] + ["group"] * (len(headers) - 1)
159+
for ctype in cluster_types:
160+
cluster_slot = "X_" + ctype
161+
cluster = pd.DataFrame(adata.obsm[cluster_slot])
162+
cluster.index = adata.obs_names
163+
cluster = pd.concat([cluster,extracted], axis=1)
164+
filename = ctype + ".tsv"
165+
with open(filename, "w") as file:
166+
file.write('\t'.join(map(str,headers)) + "\n")
167+
file.write('\t'.join(map(str,types)) + "\n")
168+
cluster.to_csv(filename, mode='a', sep="\t", index=True, header=False)
169+
```

scripts/sSCP407_CAS_single.qmd

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
---
2+
title: "sSCP407 single CAS"
3+
format: html
4+
---
5+
6+
```{python}
7+
!curl -O https://storage.googleapis.com/cellarium-file-system-public/cellarium-cas-tutorial/pbmc_10x_v3_4k.h5ad
8+
```
9+
10+
```{python}
11+
import numpy as np
12+
import pandas as pd
13+
import scanpy as sc
14+
adata = sc.read_h5ad("pbmc_10x_v3_4k.h5ad")
15+
adata
16+
```
17+
18+
# start CAS notebook
19+
20+
21+
```{python}
22+
import scanpy as sc
23+
import warnings
24+
warnings.filterwarnings("ignore", category=FutureWarning)
25+
warnings.filterwarnings("ignore", category=UserWarning)
26+
sc.set_figure_params(dpi=80)
27+
```
28+
29+
30+
```{python}
31+
with open(".cas-api-token") as f:
32+
api_token = f.read().strip()"
33+
```
34+
35+
36+
```{python}
37+
from cellarium.cas.client import CASClient
38+
39+
cas = CASClient(api_token=api_token)
40+
```
41+
42+
43+
```{python}
44+
cas_model_name = 'cellarium-pca-default-v1'
45+
cas_ontology_aware_response = cas.annotate_matrix_cell_type_ontology_aware_strategy(
46+
matrix=adata,
47+
chunk_size=500,
48+
feature_ids_column_name='gene_ids',
49+
feature_names_column_name='index',
50+
cas_model_name=cas_model_name)
51+
```
52+
53+
```{python}
54+
from cellarium.cas._io import suppress_stderr
55+
from cellarium.cas.visualization import CASCircularTreePlotUMAPDashApp
56+
```
57+
58+
```{python}
59+
import cellarium.cas.postprocessing.ontology_aware as pp
60+
from cellarium.cas.postprocessing.cell_ontology import CellOntologyCache
61+
62+
with suppress_stderr():
63+
cl = CellOntologyCache()
64+
```
65+
66+
```{python}
67+
from cellarium.cas.postprocessing import insert_cas_ontology_aware_response_into_adata
68+
69+
insert_cas_ontology_aware_response_into_adata(cas_ontology_aware_response, adata, cl)
70+
```
71+
72+
```{python}
73+
74+
pp.compute_most_granular_top_k_calls_cluster(
75+
adata=adata,
76+
cl=cl,
77+
min_acceptable_score=0.1, # minimum acceptable score for a call
78+
cluster_label_obs_column='cluster_label', # .obs column containing cluster labels
79+
top_k=3, # how many top calls to make?
80+
obs_prefix='cell_type_cas_cluster_calls' # .obs column to write the top-k calls to
81+
)
82+
```
83+
84+
```{python}
85+
pp.compute_most_granular_top_k_calls_single(
86+
adata=adata,
87+
cl=cl,
88+
min_acceptable_score=0.1, # minimum acceptable score for a call
89+
top_k=3, # how many top calls to make?
90+
obs_prefix="cas_cell_type_single_calls" # .obs column to write the top-k calls to
91+
)
92+
```
93+
94+
95+
```{python}
96+
test = cas.annotate_matrix_cell_type_summary_statistics_strategy(adata)
97+
```
98+
99+
# write out both CAS result files as json
100+
```{python}
101+
import json
102+
with open("ontology_aware_strategy_cas_response.json", "w") as f:
103+
f.write(json.dumps(cas_ontology_aware_response))
104+
105+
with open("cell_type_summary_statistics_strategy_response.json", "w") as f:
106+
f.write(json.dumps(test))
107+
```
108+
109+
# add dummy metadata for SCP convention
110+
# Note: lpp is possibly incorrect
111+
```{python}
112+
adata.obs['biosample_id'] = "sample-1"
113+
adata.obs['donor_id'] = "donor-1"
114+
adata.obs['species'] = "NCBITaxon_9606"
115+
adata.obs['species__ontology_label'] = "Homo sapiens"
116+
adata.obs['disease'] = "PATO_0000461"
117+
adata.obs['disease__ontology_label'] = "normal"
118+
adata.obs['organ'] = "UBERON_0000178"
119+
adata.obs['organ__ontology_label'] = "blood"
120+
adata.obs['library_preparation_protocol'] = "EFO_0030059"
121+
adata.obs['library_preparation_protocol__ontology_label'] = "10x multiome"
122+
adata.obs['sex'] = "female"
123+
```
124+
# update numeric metadata to be of appropriate numeric type in dataframe
125+
126+
```{python}
127+
scores = adata.obs.columns.str.contains('.*cell_type.*score.*')
128+
adata.obs.loc[:, scores] = adata.obs.loc[:, scores].astype(float)
129+
```
130+
# write AnnData to file
131+
132+
```{python}
133+
134+
adata.write_h5ad("sSCP407_CAS.h5ad")
135+
```
136+
137+
# extract cell type label annotations to a clustering file
138+
139+
```{python}
140+
cluster_types = ["umap"]
141+
mask = adata.obs.columns.str.contains('.*cell_type.*label.*')
142+
extracted = adata.obs.loc[:,mask]
143+
headers = ["NAME", "X", "Y" ] + extracted.columns.to_list()
144+
types = ["TYPE"] + ["group"] * (len(headers) - 1)
145+
for ctype in cluster_types:
146+
cluster_slot = "X_" + ctype
147+
cluster = pd.DataFrame(adata.obsm[cluster_slot])
148+
cluster.index = adata.obs_names
149+
cluster = pd.concat([cluster,extracted], axis=1)
150+
filename = "sSCP407_"+ ctype + ".tsv"
151+
with open(filename, "w") as file:
152+
file.write('\t'.join(map(str,headers)) + "\n")
153+
file.write('\t'.join(map(str,types)) + "\n")
154+
cluster.to_csv(filename, mode='a', sep="\t", index=True, header=False)
155+
```
156+

tests/test_anndata.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ def test_generate_metadata_file(self):
136136
)
137137
compressed_file = self.metadata_filename + ".gz"
138138
with gzip.open(compressed_file, "rt", encoding="utf-8-sig") as metadata_body:
139-
line = metadata_body.readline().split("\t")
140-
expected_line = [
139+
name_line = metadata_body.readline().split("\t")
140+
expected_names = [
141141
'NAME',
142142
'n_genes',
143143
'percent_mito',
@@ -156,7 +156,29 @@ def test_generate_metadata_file(self):
156156
"library_preparation_protocol__ontology_label\n",
157157
]
158158
self.assertEqual(
159-
expected_line, line, 'did not get expected headers from metadata body'
159+
expected_names, name_line, 'did not get expected headers from metadata body'
160+
)
161+
type_line = metadata_body.readline().split("\t")
162+
expected_types = [
163+
'TYPE',
164+
'NUMERIC',
165+
'NUMERIC',
166+
'NUMERIC',
167+
'GROUP',
168+
'GROUP',
169+
'GROUP',
170+
'GROUP',
171+
'GROUP',
172+
'GROUP',
173+
'GROUP',
174+
'GROUP',
175+
'GROUP',
176+
'GROUP',
177+
'GROUP',
178+
"GROUP\n",
179+
]
180+
self.assertEqual(
181+
expected_types, type_line, 'did not get expected types from metadata body'
160182
)
161183

162184
def test_gene_id_indexed_generate_processed_matrix(self):

0 commit comments

Comments
 (0)