Merge pull request #287 from broadinstitute/jlc_ingest_h5ad_metadata

jlchang · web-flow · commit 9158ec8e59c9 · 2023-01-04T12:32:29.000-05:00
Extract metadata from h5ad and delocalize ingest-intermediate files to study bucket (SCP-4823)
diff --git a/ingest/anndata_.py b/ingest/anndata_.py
@@ -16,12 +16,6 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
         IngestFiles.__init__(
             self, file_path, allowed_file_types=self.ALLOWED_FILE_TYPES
         )
-        # If performing cluster extraction, set obsm_keys
-        extract_cluster = kwargs.get("extract_cluster")
-        if extract_cluster == True:
-            self.obsm_keys = kwargs["obsm_keys"]
-        else:
-            pass
 
     def obtain_adata(self):
         try:
@@ -61,7 +55,8 @@ def generate_cluster_header(adata, clustering_name):
         else:
             msg = f"Too few dimensions for visualization in obsm \"{clustering_name}\", found {clustering_dimension}, expected 2 or 3."
             raise ValueError(msg)
-        with open(f"{clustering_name}.cluster.anndata_segment.tsv", "w") as f:
+        filename = AnnDataIngestor.set_clustering_filename(clustering_name)
+        with open(filename, "w") as f:
             f.write('\t'.join(headers) + '\n')
 
     @staticmethod
@@ -71,7 +66,8 @@ def generate_cluster_type_declaration(adata, clustering_name):
         """
         clustering_dimension = adata.obsm[clustering_name].shape[1]
         types = ["TYPE", *["numeric"] * clustering_dimension]
-        with open(f"{clustering_name}.cluster.anndata_segment.tsv", "a") as f:
+        filename = AnnDataIngestor.set_clustering_filename(clustering_name)
+        with open(filename, "a") as f:
             f.write('\t'.join(types) + '\n')
 
     @staticmethod
@@ -83,27 +79,45 @@ def generate_cluster_body(adata, clustering_name):
         cluster_body = pd.concat(
             [cluster_cells, pd.DataFrame(adata.obsm[clustering_name])], axis=1
         )
+        filename = AnnDataIngestor.set_clustering_filename(clustering_name)
         pd.DataFrame(cluster_body).to_csv(
-            AnnDataIngestor.set_output_filename(clustering_name),
-            sep="\t",
-            mode="a",
-            header=None,
-            index=False,
+            filename, sep="\t", mode="a", header=None, index=False
         )
 
     @staticmethod
-    def files_to_delocalize(arguments):
-        # ToDo - check if names using obsm_keys need sanitization
-        cluster_file_names = [AnnDataIngestor.set_output_filename(name) for name in arguments["obsm_keys"]]
-        return cluster_file_names
+    def set_clustering_filename(name):
+        return f"h5ad_frag.cluster.{name}.tsv"
 
     @staticmethod
-    def set_output_filename(name):
-        return f"{name}.cluster.anndata_segment.tsv"
+    def generate_metadata_file(adata, output_name):
+        """
+        Write adata.obs to output_name as TSV, preceded by NAME and TYPE lines.
+        Numeric (non-boolean) columns are typed NUMERIC, all others GROUP.
+        """
+        # is_number() is False for any Series; test the column dtype instead.
+        # Booleans are written as True/False text by to_csv, so they stay GROUP.
+        obs, ptypes = adata.obs, pd.api.types
+        headers = ["NAME", *obs.columns.tolist()]
+        types = ["TYPE"]
+        for column in headers[1:]:
+            numeric = ptypes.is_numeric_dtype(obs[column]) and not ptypes.is_bool_dtype(obs[column])
+            types.append("NUMERIC" if numeric else "GROUP")
+        with open(output_name, "w") as f:
+            f.write('\t'.join(headers) + '\n')
+            f.write('\t'.join(types) + '\n')
+        adata.obs.to_csv(output_name, sep="\t", mode="a", header=None, index=True)
+
+    @staticmethod
+    def clusterings_to_delocalize(arguments):
+        # ToDo - check if names using obsm_keys need sanitization
+        cluster_file_names = []
+        for name in arguments["obsm_keys"]:
+            cluster_file_names.append(AnnDataIngestor.set_clustering_filename(name))
+        return cluster_file_names
 
     @staticmethod
-    def delocalize_cluster_files(file_path, study_file_id, files_to_delocalize):
-        """ Copy cluster files to study bucket
+    def delocalize_extracted_files(file_path, study_file_id, files_to_delocalize):
+        """ Copy extracted files to study bucket
         """
 
         for file in files_to_delocalize:
@@ -112,5 +126,5 @@ def delocalize_cluster_files(file_path, study_file_id, files_to_delocalize):
                 None,
                 file_path,
                 file,
-                f"_scp_internal/anndata_ingest/{file}",
+                f"_scp_internal/anndata_ingest/{study_file_id}/{file}",
             )
diff --git a/ingest/cli_parser.py b/ingest/cli_parser.py
@@ -354,9 +354,9 @@ def create_parser():
     )
 
     parser_anndata.add_argument(
-        "--extract-cluster",
-        action="store_true",
-        help="Indicates clustering data should be extracted",
+        "--extract",
+        type=ast.literal_eval,
+        help="Array of file types to extract, options include ['cluster', 'metadata']",
     )
 
     parser_expression_writer = subparsers.add_parser(
diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py
@@ -30,7 +30,7 @@
 python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad
 
 # Ingest AnnData file - cluster extraction
-python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --extract-cluster --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad --obsm-keys "['X_tsne']"
+python ingest_pipeline.py  --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --extract "['cluster']" --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad --obsm-keys "['X_tsne']"
 
 # Subsample cluster and metadata file
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_data.csv --name cluster1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
@@ -483,14 +483,18 @@ def subsample(self):
         return 0
 
     @custom_metric(config.get_metric_properties)
-    def ingest_anndata(self):
-        """Ingests anndata files."""
+    def extract_from_anndata(self):
+        """Extract data subsets from anndata file per SCP filetypes."""
         self.anndata = AnnDataIngestor(
             self.anndata_file, self.study_id, self.study_file_id, **self.kwargs
         )
         if self.anndata.validate():
             self.report_validation("success")
-            if self.kwargs["extract_cluster"] == True:
+            study_info = config.get_metric_properties()
+            accession = study_info.get_properties()['studyAccession']
+            file_id = study_info.get_properties()['fileName']
+            outfile_prefix = f"{file_id}.{accession}"
+            if self.kwargs.get("extract") and "cluster" in self.kwargs.get("extract"):
                 if not self.kwargs["obsm_keys"]:
                     self.kwargs["obsm_keys"] = ['X_tsne']
                 for key in self.kwargs["obsm_keys"]:
@@ -499,6 +503,11 @@ def ingest_anndata(self):
                         self.anndata.adata, key
                     )
                     AnnDataIngestor.generate_cluster_body(self.anndata.adata, key)
+            if self.kwargs.get("extract") and "metadata" in self.kwargs.get("extract"):
+                metadata_filename = f"h5ad_frag.metadata.tsv"
+                AnnDataIngestor.generate_metadata_file(
+                    self.anndata.adata, metadata_filename
+                )
             return 0
         # scanpy unable to open AnnData file
         else:
@@ -571,7 +580,7 @@ def run_ingest(ingest, arguments, parsed_args):
     elif "ingest_anndata" in arguments:
         if arguments["ingest_anndata"]:
             config.set_parent_event_name("ingest-pipeline:anndata:ingest")
-            status_anndata = ingest.ingest_anndata()
+            status_anndata = ingest.extract_from_anndata()
             status.append(status_anndata)
     elif "differential_expression" in arguments:
         config.set_parent_event_name("ingest-pipeline:differential-expression")
@@ -618,16 +627,24 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments):
                     file_path, study_file_id, files_to_match
                 )
         # for successful anndata jobs, need to delocalize intermediate ingest files
-        elif (
-            "extract_cluster" in arguments
-            and arguments.get("extract_cluster") == True
-            and all(i < 1 for i in status)
-        ):
+        elif arguments.get("extract") and all(i < 1 for i in status):
             file_path, study_file_id = get_delocalization_info(arguments)
             # append status?
+            files_to_delocalize = []
             if IngestFiles.is_remote_file(file_path):
-                files_to_delocalize = AnnDataIngestor.files_to_delocalize(arguments)
-                AnnDataIngestor.delocalize_cluster_files(
+                # the next 3 lines are copied from 493-495, suggestions
+                # welcomed for how to DRY this up
+                study_info = config.get_metric_properties()
+                accession = study_info.get_properties()['studyAccession']
+                file_id = study_info.get_properties()['fileName']
+                if "cluster" in arguments.get("extract"):
+                    files_to_delocalize.extend(
+                        AnnDataIngestor.clusterings_to_delocalize(arguments)
+                    )
+                if "metadata" in arguments.get("extract"):
+                    metadata_filename = f"h5ad_frag.metadata.tsv"
+                    files_to_delocalize.append(metadata_filename)
+                AnnDataIngestor.delocalize_extracted_files(
                     file_path, study_file_id, files_to_delocalize
                 )
         # all non-DE, non-anndata ingest jobs can exit on success
diff --git a/tests/test_anndata.py b/tests/test_anndata.py
@@ -11,8 +11,8 @@
 from anndata_ import AnnDataIngestor
 from ingest_files import IngestFiles
 
-class TestAnnDataIngestor(unittest.TestCase):
 
+class TestAnnDataIngestor(unittest.TestCase):
     @staticmethod
     def setup_class(self):
         filepath_valid = "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad"
@@ -24,14 +24,16 @@ def setup_class(self):
         self.cluster_name = 'X_tsne'
         self.valid_kwargs = {'obsm_keys': [self.cluster_name]}
         self.anndata_ingest = AnnDataIngestor(*self.valid_args, **self.valid_kwargs)
-        self.output_filename = f"{self.cluster_name}.cluster.anndata_segment.tsv"
+        self.output_filename = f"h5ad_frag.cluster.{self.cluster_name}.tsv"
 
     def teardown_method(self, _):
         if os.path.isfile(self.output_filename):
             os.remove(self.output_filename)
 
     def test_minimal_valid_anndata(self):
-        self.assertTrue(self.anndata_ingest.validate(), "expect known good file to open with scanpy")
+        self.assertTrue(
+            self.anndata_ingest.validate(), "expect known good file to open with scanpy"
+        )
 
     def test_truncated_anndata(self):
         truncated_input = AnnDataIngestor(*self.invalid_args)
@@ -47,9 +49,7 @@ def test_truncated_anndata(self):
 
     def test_input_bad_suffix(self):
         bad_input = AnnDataIngestor(
-            "../tests/data/anndata/bad.foo",
-            self.study_id,
-            self.study_file_id,
+            "../tests/data/anndata/bad.foo", self.study_id, self.study_file_id
         )
         # passing obtain_data function to assertRaises using lambda
         # otherwise bad_input.obtain_data() is evaluated and triggers
@@ -64,40 +64,60 @@ def test_input_bad_suffix(self):
     def test_set_output_filename(self):
         cluster_name = "X_Umap"
         self.assertEqual(
-            AnnDataIngestor.set_output_filename(cluster_name),
-            "X_Umap.cluster.anndata_segment.tsv"
+            AnnDataIngestor.set_clustering_filename(cluster_name),
+            "h5ad_frag.cluster.X_Umap.tsv",
         )
 
     def test_generate_cluster_header(self):
-        self.anndata_ingest.generate_cluster_header(self.anndata_ingest.obtain_adata(), self.cluster_name)
+        self.anndata_ingest.generate_cluster_header(
+            self.anndata_ingest.obtain_adata(), self.cluster_name
+        )
         with open(self.output_filename) as header_file:
             header = header_file.readline().split("\t")
-            self.assertEqual(['NAME', 'X', "Y\n"], header, "did not find expected headers")
+            self.assertEqual(
+                ['NAME', 'X', "Y\n"], header, "did not find expected headers"
+            )
 
     def test_generate_cluster_type_declaration(self):
-        self.anndata_ingest.generate_cluster_type_declaration(self.anndata_ingest.obtain_adata(), self.cluster_name)
+        self.anndata_ingest.generate_cluster_type_declaration(
+            self.anndata_ingest.obtain_adata(), self.cluster_name
+        )
         with open(self.output_filename) as header_file:
             header = header_file.readline().split("\t")
-            self.assertEqual(['TYPE', 'numeric', "numeric\n"], header, "did not find expected headers")
+            self.assertEqual(
+                ['TYPE', 'numeric', "numeric\n"],
+                header,
+                "did not find expected headers",
+            )
 
     def test_generate_cluster_body(self):
-        self.anndata_ingest.generate_cluster_body(self.anndata_ingest.obtain_adata(), self.cluster_name)
+        self.anndata_ingest.generate_cluster_body(
+            self.anndata_ingest.obtain_adata(), self.cluster_name
+        )
         with open(self.output_filename) as cluster_body:
             line = cluster_body.readline().split("\t")
             expected_line = ['AAACATACAACCAC-1', '16.009954', "-21.073845\n"]
-            self.assertEqual(expected_line, line, 'did not get expected coordinates from cluster body')
+            self.assertEqual(
+                expected_line,
+                line,
+                'did not get expected coordinates from cluster body',
+            )
 
     def test_get_files_to_delocalize(self):
-        files = AnnDataIngestor.files_to_delocalize(self.valid_kwargs)
+        files = AnnDataIngestor.clusterings_to_delocalize(self.valid_kwargs)
         expected_files = [self.output_filename]
         self.assertEqual(expected_files, files)
 
     def test_delocalize_files(self):
         # just create header, no reason to run full extract
-        self.anndata_ingest.generate_cluster_header(self.anndata_ingest.obtain_adata(), self.cluster_name)
+        self.anndata_ingest.generate_cluster_header(
+            self.anndata_ingest.obtain_adata(), self.cluster_name
+        )
         with patch('ingest_files.IngestFiles.delocalize_file'):
             AnnDataIngestor.delocalize_file(
-                "gs://fake_bucket", self.study_id, AnnDataIngestor.files_to_delocalize(self.valid_kwargs)
+                "gs://fake_bucket",
+                self.study_id,
+                AnnDataIngestor.clusterings_to_delocalize(self.valid_kwargs),
             )
             self.assertEqual(
                 IngestFiles.delocalize_file.call_count,
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
@@ -46,10 +46,11 @@
     validate_arguments,
     IngestPipeline,
     exit_pipeline,
-    run_ingest
+    run_ingest,
 )
 from expression_files.expression_files import GeneExpression
 
+
 def mock_load(self, *args, **kwargs):
     """Enables overwriting normal function with this placeholder.
     Returning the arguments enables tests to verify that the code invokes
@@ -682,17 +683,17 @@ def test_extract_cluster_file_from_anndata(self):
             "5dd5ae25421aa910a723a337",
             "ingest_anndata",
             "--ingest-anndata",
-            "--extract-cluster",
+            "--extract",
+            "['cluster']",
             "--anndata-file",
             "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad",
             "--obsm-keys",
-            "['X_tsne']"
-
+            "['X_tsne']",
         ]
         ingest, arguments, status, status_cell_metadata = self.execute_ingest(args)
         self.assertEqual(len(status), 1)
         self.assertEqual(status[0], 0)
-        filename = 'X_tsne.cluster.anndata_segment.tsv'
+        filename = 'h5ad_frag.cluster.X_tsne.tsv'
         self.assertTrue(os.path.isfile(filename))
 
 

Original file line number	Diff line number	Diff line change
`@@ -354,9 +354,9 @@ def create_parser():`
`354`	`354`	`)`
`355`	`355`
`356`	`356`	`parser_anndata.add_argument(`
`357`		`- "--extract-cluster",`
`358`		`- action="store_true",`
`359`		`- help="Indicates clustering data should be extracted",`
	`357`	`+ "--extract",`
	`358`	`+ type=ast.literal_eval,`
	`359`	`+ help="Array of file types to extract, options include ['cluster', 'metadata']",`
`360`	`360`	`)`
`361`	`361`
`362`	`362`	`parser_expression_writer = subparsers.add_parser(`