Skip to content

Commit 663af62

Browse files
authored
Merge pull request #371 from broadinstitute/development
Release 1.37.0
2 parents 5fb00bb + 00109a5 commit 663af62

File tree

4 files changed

+34
-4
lines changed

4 files changed

+34
-4
lines changed

.github/workflows/minify_ontologies.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ name: Minify ontologies
33
on:
44
pull_request:
55
types: [opened] # Only trigger on PR "opened" event
6-
push: # Uncomment, update branches to develop / debug
7-
branches:
8-
jb-metadata-boolean
6+
# push: # Uncomment, update branches to develop / debug
7+
# branches:
8+
# jb-metadata-boolean
99

1010
jobs:
1111
build:

ingest/anndata_.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def generate_processed_matrix(adata):
187187
h5ad_frag.features.processed.tsv
188188
Gzip files for faster delocalization
189189
"""
190-
if adata.var.index.name == 'gene_ids':
190+
if AnnDataIngestor.check_ensembl_index(adata):
191191
# CELLxGENE indexes by Ensembl gene ID, not gene name (i.e. symbol).
192192
# Gene name is encoded in feature_name, which is needed for gene search.
193193
feature_frame = adata.var.feature_name
@@ -216,6 +216,15 @@ def generate_processed_matrix(adata):
216216
)
217217
AnnDataIngestor.compress_file(mtx_filename)
218218

219+
@staticmethod
def check_ensembl_index(adata):
    """Check if an AnnData file is indexed on Ensembl gene IDs instead of gene symbols.

    Ensembl gene IDs look like ``ENSG00000243485``; gene symbols do not carry
    the ``ENS`` prefix.

    :param adata: AnnData-like object exposing ``var.index.name`` and ``var_names``
    :return: True if ``var.index.name`` is ``'gene_ids'``, or if there is at
        least one feature and every feature name starts with ``ENS``;
        False otherwise (including when there are no features at all)
    """
    # An explicitly named index is taken at face value, without inspecting the data
    if adata.var.index.name == 'gene_ids':
        return True

    # Otherwise inspect the feature names themselves.  Stop at the first name
    # that is not Ensembl-like rather than collecting the prefix of every feature.
    found_feature = False
    for gene_id in adata.var_names:
        # str() keeps a non-string label from raising; it simply won't match
        if not str(gene_id).startswith('ENS'):
            return False
        found_feature = True

    # An empty index gives no evidence of Ensembl IDs, so report False
    return found_feature
227+
219228
@staticmethod
220229
def delocalize_extracted_files(
221230
file_path, study_file_id, accession, files_to_delocalize
5.38 MB
Binary file not shown.

tests/test_anndata.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,27 @@ def test_gene_id_indexed_generate_processed_matrix(self):
266266
first_line, expected_first_line, 'Expected Ensembl ID and gene name'
267267
)
268268

269+
def test_check_if_indexed_by_gene_id(self):
    """Exercise AnnDataIngestor.check_ensembl_index against three fixture files."""
    # Each entry pairs a fixture path with the expected check_ensembl_index result
    scenarios = (
        # check var.index.name
        ("../tests/data/anndata/indexed_by_gene_id.h5ad", True),
        # check data inspection
        ("../tests/data/anndata/cellxgene.human_liver_b_cells.h5ad", True),
        # negative test
        ("../tests/data/anndata/anndata_test.h5ad", False),
    )
    for h5ad_path, expect_ensembl in scenarios:
        ingestor = AnnDataIngestor(h5ad_path, self.study_id, self.study_file_id)
        loaded = ingestor.obtain_adata()
        outcome = ingestor.check_ensembl_index(loaded)
        # Use the same assertion the expectation calls for (truthy vs. falsy)
        verify = self.assertTrue if expect_ensembl else self.assertFalse
        verify(outcome)
269290

270291
def test_get_files_to_delocalize(self):
271292
files = AnnDataIngestor.clusterings_to_delocalize(self.valid_kwargs)

0 commit comments

Comments
 (0)