Skip to content

Commit 46c4e17

Browse files
authored
Merge pull request #286 from broadinstitute/development
Release 1.23.3
2 parents 613e26c + f9c1ae2 commit 46c4e17

File tree

5 files changed

+101
-25
lines changed

5 files changed

+101
-25
lines changed

ingest/anndata_.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def generate_cluster_body(adata, clustering_name):
8484
[cluster_cells, pd.DataFrame(adata.obsm[clustering_name])], axis=1
8585
)
8686
pd.DataFrame(cluster_body).to_csv(
87-
f"{clustering_name}.cluster.anndata_segment.tsv",
87+
AnnDataIngestor.set_output_filename(clustering_name),
8888
sep="\t",
8989
mode="a",
9090
header=None,
@@ -94,9 +94,13 @@ def generate_cluster_body(adata, clustering_name):
9494
@staticmethod
def files_to_delocalize(arguments):
    """Build the cluster output filenames for the requested obsm keys.

    :param arguments: mapping whose "obsm_keys" entry lists the obsm key names
    :return: list with one filename per obsm key, in the same order
    """
    # ToDo - check if names using obsm_keys need sanitization
    cluster_file_names = []
    for obsm_key in arguments["obsm_keys"]:
        cluster_file_names.append(AnnDataIngestor.set_output_filename(obsm_key))
    return cluster_file_names
9999

100+
@staticmethod
101+
def set_output_filename(name):
102+
return f"{name}.cluster.anndata_segment.tsv"
103+
100104
@staticmethod
101105
def delocalize_cluster_files(file_path, study_file_id, files_to_delocalize):
102106
""" Copy cluster files to study bucket
19.5 MB
Binary file not shown.
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
NAME disease__time_since_onset disease__time_since_onset__unit organ_region organ_region__ontology_label donor disease__treated species species__ontology_label geographical_region geographical_region__ontology_label library_preparation_protocol library_preparation_protocol__ontology_label organ organ__ontology_label sex is_living organism_age__unit organism_age__unit_label ethnicity__ontology_label ethnicity organism_age disease disease__ontology_label cell_type cell_type__ontology_label donor_id biosample_id biosample_type preservation_method cell_type__custom
22
TYPE numeric group group group group group group group group group group group group group group group group group group group numeric group group group group group group group group group
3-
BM01_16dpp_AAGCAGTGGTAT 12|2 UO_0000035 MBA:000000944 Folium-tuber vermis (VII) BM01 False|False NCBITaxon_9606 human GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 31 MONDO_0005015|MONDO_0006849 diabetes|mastitis CL_0000066 epithelial cell BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh epithelial
4-
BM01_16dpp_TAAGCAGTGGTA 1 UO_0000035 MBA:000000302|MBA:000000294|MBA:000000795 "Superior colliculus, sensory related|Superior colliculus, motor related|Periaqueductal gray" BM01 FALSE NCBITaxon_9606 Homo Sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year white HANCESTRO_0005 31 MONDO_0005709 common cold CL_0000066 epithelial cell BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh epithelial
5-
BM01_16dpp_CTAAGCAGTGGT 24|2 UO_0000035 MBA:000000714|MBA:000000972 BM01 True|False NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year British HANCESTRO_0462 31 MONDO_0005015|MONDO_0005709 diabetes mellitus|common cold CL_0000066 epithelial cell BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh epithelial
6-
BM01_16dpp_CGGTAAACCATT 36|3|1 UO_0000035 MBA:000001041 Paraflocculus BM01 True|False|False NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year HANCESTRO_0462 31 MONDO_0005015|MONDO_0006849|MONDO_0005709 diabetes|breast infection|common cold BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh sub-epithelial
7-
BM01_16dpp_CCGAATTCACCG 0 UO_0000035 MBA:000000909|MBA:000000502 Entorhinal area|Subiculum BM01 FALSE NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year Caucasian HANCESTRO_0005 31 MONDO_0000001 disease or disorder BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh sub-epithelial
3+
BM01_16dpp_AAGCAGTGGTAT 12|2 UO_0000035 MBA:000000944 Folium-tuber vermis (VII) BM01 False|False NCBITaxon_9606 human GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year European HANCESTRO_0005 31 MONDO_0005015|MONDO_0006849 diabetes|mastitis CL_0000066 epithelial cell BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh epithelial
4+
BM01_16dpp_TAAGCAGTGGTA 1 UO_0000035 MBA:000000302|MBA:000000294|MBA:000000795 "Superior colliculus, sensory related|Superior colliculus, motor related|Periaqueductal gray" BM01 FALSE NCBITaxon_9606 Homo Sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year white HANCESTRO_0005 31 MONDO_0005709 common cold CL_0000066 epithelial cell BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh epithelial
5+
BM01_16dpp_CTAAGCAGTGGT 24|2 UO_0000035 MBA:000000714|MBA:000000972 BM01 True|False NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year British HANCESTRO_0462 31 MONDO_0005015|MONDO_0005709 diabetes mellitus|common cold CL_0000066 epithelial cell BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh epithelial
6+
BM01_16dpp_CGGTAAACCATT 36|3|1 UO_0000035 MBA:000001041 Paraflocculus BM01 True|False|False NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year HANCESTRO_0462 31 MONDO_0005015|MONDO_0006849|MONDO_0005709 diabetes|breast infection|common cold BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh sub-epithelial
7+
BM01_16dpp_CCGAATTCACCG 0 UO_0000035 MBA:000000909|MBA:000000502 Entorhinal area|Subiculum BM01 FALSE NCBITaxon_9606 Homo sapiens GAZ_00003181 Boston EFO_0008919 Seq-Well UBERON_0001913 milk female yes UO_0000036 year Caucasian HANCESTRO_0005 31 MONDO_0000001 disease or disorder BM01 BM01_16dpp_r3 PrimaryBioSample_BodyFluid Fresh sub-epithelial

tests/test_anndata.py

Lines changed: 68 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,37 @@
44

55
import unittest
66
import sys
7+
import os
8+
from unittest.mock import patch
79

810
sys.path.append("../ingest")
911
from anndata_ import AnnDataIngestor
10-
12+
from ingest_files import IngestFiles
1113

1214
class TestAnnDataIngestor(unittest.TestCase):
15+
16+
@classmethod
def setup_class(cls):
    """Create the fixtures shared by every test in this class.

    Declared as a classmethod because the hook receives the test class
    itself; every attribute set here is a class attribute that the test
    methods read through ``self``.
    """
    # Paths are relative to the current working directory.
    filepath_valid = "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad"
    filepath_invalid = "../tests/data/anndata/bad.h5"
    cls.study_id = "addedfeed000000000000000"
    cls.study_file_id = "dec0dedfeed0000000000000"
    cls.valid_args = [filepath_valid, cls.study_id, cls.study_file_id]
    cls.invalid_args = [filepath_invalid, cls.study_id, cls.study_file_id]
    cls.cluster_name = 'X_tsne'
    cls.valid_kwargs = {'obsm_keys': [cls.cluster_name]}
    cls.anndata_ingest = AnnDataIngestor(*cls.valid_args, **cls.valid_kwargs)
    # Same file name the ingestor generates for this clustering; the tests
    # read it back and teardown_method removes it.
    cls.output_filename = f"{cls.cluster_name}.cluster.anndata_segment.tsv"
28+
29+
def teardown_method(self, _):
    """Delete the cluster output file a test may have left behind.

    The second positional argument is ignored.
    """
    leftover = self.output_filename
    if not os.path.isfile(leftover):
        return
    os.remove(leftover)
32+
1333
def test_minimal_valid_anndata(self):
    """The shared ingestor built from the valid fixture file must validate."""
    is_valid = self.anndata_ingest.validate()
    self.assertTrue(is_valid, "expect known good file to open with scanpy")
2235

2336
def test_truncated_anndata(self):
24-
truncated_input = AnnDataIngestor(
25-
"../tests/data/anndata/bad.h5",
26-
"addedfeed000000000000000",
27-
"dec0dedfeed0000000000000",
28-
)
37+
truncated_input = AnnDataIngestor(*self.invalid_args)
2938
# passing obtain_data function to assertRaises using lambda
3039
# otherwise truncated_input.obtain_data() is evaluated and triggers
3140
# an exception before assertRaises gets called
@@ -39,8 +48,8 @@ def test_truncated_anndata(self):
3948
def test_input_bad_suffix(self):
4049
bad_input = AnnDataIngestor(
4150
"../tests/data/anndata/bad.foo",
42-
"addedfeed000000000000000",
43-
"dec0dedfeed0000000000000",
51+
self.study_id,
52+
self.study_file_id,
4453
)
4554
# passing obtain_data function to assertRaises using lambda
4655
# otherwise bad_input.obtain_data() is evaluated and triggers
@@ -52,3 +61,46 @@ def test_input_bad_suffix(self):
5261
)
5362
self.assertFalse(bad_input.validate())
5463

64+
def test_set_output_filename(self):
    """set_output_filename appends the cluster segment suffix to the name."""
    cluster_name = "X_Umap"
    generated = AnnDataIngestor.set_output_filename(cluster_name)
    self.assertEqual(generated, "X_Umap.cluster.anndata_segment.tsv")
70+
71+
def test_generate_cluster_header(self):
    """First line of the generated file holds the NAME, X and Y headers."""
    adata = self.anndata_ingest.obtain_adata()
    self.anndata_ingest.generate_cluster_header(adata, self.cluster_name)
    with open(self.output_filename) as header_file:
        first_line = header_file.readline()
    header = first_line.split("\t")
    self.assertEqual(['NAME', 'X', "Y\n"], header, "did not find expected headers")
76+
77+
def test_generate_cluster_type_declaration(self):
    """First line of the generated file declares TYPE, numeric, numeric."""
    adata = self.anndata_ingest.obtain_adata()
    self.anndata_ingest.generate_cluster_type_declaration(adata, self.cluster_name)
    with open(self.output_filename) as header_file:
        first_line = header_file.readline()
    header = first_line.split("\t")
    self.assertEqual(['TYPE', 'numeric', "numeric\n"], header, "did not find expected headers")
82+
83+
def test_generate_cluster_body(self):
    """First body row holds the expected cell name and two coordinates."""
    adata = self.anndata_ingest.obtain_adata()
    self.anndata_ingest.generate_cluster_body(adata, self.cluster_name)
    with open(self.output_filename) as cluster_body:
        first_row = cluster_body.readline()
    line = first_row.split("\t")
    expected_line = ['AAACATACAACCAC-1', '16.009954', "-21.073845\n"]
    self.assertEqual(expected_line, line, 'did not get expected coordinates from cluster body')
89+
90+
def test_get_files_to_delocalize(self):
    """files_to_delocalize returns one output filename per obsm key."""
    expected_files = [self.output_filename]
    self.assertEqual(
        expected_files,
        AnnDataIngestor.files_to_delocalize(self.valid_kwargs),
    )
94+
95+
def test_delocalize_files(self):
    """delocalize_cluster_files copies each listed file exactly once.

    IngestFiles.delocalize_file is patched, so nothing is actually copied.
    """
    # just create header, no reason to run full extract
    self.anndata_ingest.generate_cluster_header(
        self.anndata_ingest.obtain_adata(), self.cluster_name
    )
    files = AnnDataIngestor.files_to_delocalize(self.valid_kwargs)
    with patch('ingest_files.IngestFiles.delocalize_file') as mock_delocalize:
        # Exercise the AnnData wrapper rather than the patched method itself;
        # calling the mock directly would make the count below 1 regardless
        # of what delocalize_cluster_files does.
        # NOTE(review): presumes delocalize_cluster_files calls
        # IngestFiles.delocalize_file once per file -- confirm against
        # its body in anndata_.py.
        AnnDataIngestor.delocalize_cluster_files(
            "gs://fake_bucket", self.study_file_id, files
        )
    self.assertEqual(
        mock_delocalize.call_count,
        1,
        "expected 1 call to delocalize output files",
    )

tests/test_ingest.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,10 @@
4646
validate_arguments,
4747
IngestPipeline,
4848
exit_pipeline,
49-
run_ingest,
49+
run_ingest
5050
)
5151
from expression_files.expression_files import GeneExpression
5252

53-
5453
def mock_load(self, *args, **kwargs):
5554
"""Enables overwriting normal function with this placeholder.
5655
Returning the arguments enables tests to verify that the code invokes
@@ -675,6 +674,27 @@ def test_subsample_no_cell_intersection(self, mock_load_subsample):
675674
exit_pipeline(ingest, status, status_cell_metadata, arguments)
676675
self.assertEqual(cm.exception.code, 1)
677676

677+
def test_extract_cluster_file_from_anndata(self):
    """Running ingest with --extract-cluster writes the X_tsne cluster file.

    The output file is removed before and after the run so the existence
    check reflects this run only and no artifact is left in the working
    directory.
    """
    filename = 'X_tsne.cluster.anndata_segment.tsv'
    # A leftover file from an earlier run would satisfy the final assertion
    # even if this run wrote nothing.
    if os.path.isfile(filename):
        os.remove(filename)
    args = [
        "--study-id",
        "5d276a50421aa9117c982845",
        "--study-file-id",
        "5dd5ae25421aa910a723a337",
        "ingest_anndata",
        "--ingest-anndata",
        "--extract-cluster",
        "--anndata-file",
        "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad",
        "--obsm-keys",
        "['X_tsne']",
    ]
    try:
        _, _, status, _ = self.execute_ingest(args)
        self.assertEqual(len(status), 1)
        self.assertEqual(status[0], 0)
        self.assertTrue(os.path.isfile(filename))
    finally:
        # Clean up even when an assertion above fails.
        if os.path.isfile(filename):
            os.remove(filename)
697+
678698

679699
if __name__ == "__main__":
680700
unittest.main()

0 commit comments

Comments
 (0)