
Commit 613e26c

Merge pull request #282 from broadinstitute/development

Release 1.23.2

2 parents edd971a + 55daf30

15 files changed: +1559 −138 lines

ingest/anndata_.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -18,7 +18,7 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
         )
         # If performing cluster extraction, set obsm_keys
         extract_cluster = kwargs.get("extract_cluster")
-        if extract_cluster:
+        if extract_cluster == True:
            self.obsm_keys = kwargs["obsm_keys"]
        else:
            pass
```
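
Why the strict comparison: `kwargs.get("extract_cluster")` can surface non-boolean values (for example, strings passed through the CLI), which plain truthiness would accept. A minimal runnable sketch of the difference, using hypothetical values:

```python
# Sketch (hypothetical values) of truthiness vs. the strict == True check.
def truthy_check(kwargs):
    return bool(kwargs.get("extract_cluster"))

def strict_check(kwargs):
    # strings like "false" no longer pass; an absent key is also rejected
    return kwargs.get("extract_cluster") == True

for value in (True, False, None, "false"):
    kwargs = {"extract_cluster": value}
    print(repr(value), truthy_check(kwargs), strict_check(kwargs))
# "false" is truthy, so truthy_check returns True while strict_check returns False
```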

ingest/ingest_pipeline.py

Lines changed: 17 additions & 8 deletions

```diff
@@ -14,23 +14,26 @@
 # Takes expression file and stores it into MongoDB
 
 # Ingest cluster file
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cluster --cluster-file ../tests/data/test_1k_cluster_Data.csv --ingest-cluster --name cluster1 --domain-ranges "{'x':[-1, 1], 'y':[-1, 1], 'z':[-1, 1]}"
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cluster --cluster-file ../tests/data/test_1k_cluster_data.csv --ingest-cluster --name cluster1 --domain-ranges "{'x':[-1, 1], 'y':[-1, 1], 'z':[-1, 1]}"
 
 # Ingest Cell Metadata file
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata
 
 # Ingest Cell Metadata file against convention
 !! Please note that you must have a pre-configured BigQuery table available
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata --validate-convention --bq-dataset cell_metadata --bq-table alexandria_convention
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata --validate-convention --bq-dataset cell_metadata --bq-table alexandria_convention
 
 # Ingest dense file
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense
 
-# Ingest AnnData file
+# Ingest AnnData file - basic "does it open in Scanpy" validation
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad
 
+# Ingest AnnData file - cluster extraction
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --extract-cluster --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad --obsm-keys "['X_tsne']"
+
 # Subsample cluster and metadata file
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_Data.csv --name custer1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_data.csv --name cluster1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
 
 # Ingest mtx files
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --matrix-file ../tests/data/mtx/matrix.mtx --matrix-file-type mtx --gene-file ../tests/data/genes.tsv --barcode-file ../tests/data/barcodes.tsv
@@ -487,7 +490,9 @@ def ingest_anndata(self):
         )
         if self.anndata.validate():
             self.report_validation("success")
-            if self.kwargs["extract_cluster"]:
+            if self.kwargs["extract_cluster"] == True:
+                if not self.kwargs["obsm_keys"]:
+                    self.kwargs["obsm_keys"] = ['X_tsne']
                 for key in self.kwargs["obsm_keys"]:
                     AnnDataIngestor.generate_cluster_header(self.anndata.adata, key)
                     AnnDataIngestor.generate_cluster_type_declaration(
@@ -523,7 +528,7 @@ def render_expression_arrays(self):
                 matrix_file_path=self.matrix_file_path,
                 matrix_file_type=self.matrix_file_type,
                 cluster_file_path=self.cluster_file,
-                **self.kwargs
+                **self.kwargs,
             )
             exp_writer.render_artifacts()
         except Exception as e:
@@ -613,7 +618,11 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments):
             file_path, study_file_id, files_to_match
         )
     # for successful anndata jobs, need to delocalize intermediate ingest files
-    elif "extract_cluster" in arguments and all(i < 1 for i in status):
+    elif (
+        "extract_cluster" in arguments
+        and arguments.get("extract_cluster") == True
+        and all(i < 1 for i in status)
+    ):
         file_path, study_file_id = get_delocalization_info(arguments)
         # append status?
         if IngestFiles.is_remote_file(file_path):
```
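
The new `ingest_anndata` branch falls back to `['X_tsne']` when cluster extraction is requested without explicit `--obsm-keys`. A standalone sketch of that fallback; `resolve_obsm_keys` is a hypothetical helper for illustration, not a pipeline function:

```python
# Illustrative standalone version of the obsm_keys fallback added above.
def resolve_obsm_keys(kwargs):
    if kwargs.get("extract_cluster") == True:
        if not kwargs.get("obsm_keys"):
            kwargs["obsm_keys"] = ["X_tsne"]  # default AnnData embedding key, as above
        return kwargs["obsm_keys"]
    return []  # extraction not requested

print(resolve_obsm_keys({"extract_cluster": True, "obsm_keys": None}))        # ['X_tsne']
print(resolve_obsm_keys({"extract_cluster": True, "obsm_keys": ["X_umap"]}))  # ['X_umap']
print(resolve_obsm_keys({"extract_cluster": False}))                          # []
```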
ingest/validation/metadata_validation.py

Lines changed: 161 additions & 0 deletions

```diff
@@ -0,0 +1,161 @@
+"""Validate input metadata TSV file against metadata convention.
+
+DESCRIPTION
+This script takes a TSV metadata file and validates against a metadata convention
+using the python jsonschema library. The metadata convention JSON schema
+represents the rules that should be enforced on metadata files for studies
+participating under the convention.
+
+EXAMPLE
+# Using JSON file for latest Alexandria metadata convention in repo, validate input TSV
+$ python3 metadata_validation.py ../../tests/data/annotation/metadata/convention/valid_array_v2.1.2.txt
+
+# generate an issues.json file to compare with reference test files
+$ python3 metadata_validation.py --issues-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
+
+# generate a BigQuery upload file to compare with reference test files
+$ python3 metadata_validation.py --bq-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
+
+# use a different metadata convention for validation
+$ python3 metadata_validation.py --convention <path to convention json> ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
+
+"""
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+import sys
+import requests
+import urllib.parse as encoder
+import re
+import os
+import numbers
+import time
+import backoff
+import csv
+import copy
+import itertools
+import math
+import pandas as pd
+
+import colorama
+from colorama import Fore
+import jsonschema
+from google.cloud import bigquery
+
+sys.path.append("..")
+try:
+    # Used when importing internally and in tests
+    from cell_metadata import CellMetadata
+    from validation.validate_metadata import (
+        report_issues,
+        validate_input_metadata,
+        write_metadata_to_bq,
+        serialize_issues,
+        exit_if_errors,
+    )
+except ImportError:
+    # Used when importing as external package, e.g. imports in single_cell_portal code
+    from ..cell_metadata import CellMetadata
+
+
+def create_parser():
+    """Parse command line values for validate_metadata
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    # parser.add_argument(
+    #     '--output',
+    #     '-o',
+    #     type=str,
+    #     help='Output file name [optional]',
+    #     default=None
+    # )
+    # parser.add_argument(
+    #     '--key_id',
+    #     '-k'
+    #     type=str,
+    #     help='Key metadata name for parsing; CellID for metadata, BiosampleID for sample sheets [optional]',
+    #     default='CellID'
+    # )
+
+    # helper param to create JSON representation of metadata.issues
+    # to generate reference output for tests
+    parser.add_argument("--issues-json", action="store_true")
+    # helper param to create JSON representation of convention metadata
+    # to generate json for bigquery testing
+    parser.add_argument("--bq-json", action="store_true")
+    # overwrite existing output
+    parser.add_argument("--force", action="store_true")
+    # test BigQuery upload functions
+    parser.add_argument("--upload", action="store_true")
+    # validate_metadata.py CLI only for dev, bogus defaults below shouldn't propagate
+    # make bogus defaults obviously artificial for ease of detection
+    parser.add_argument(
+        "--study-id",
+        help="MongoDB study identifier",
+        default="dec0dedfeed1111111111111",
+    )
+    parser.add_argument(
+        "--study-file-id",
+        help="MongoDB file identifier",
+        default="addedfeed000000000000000",
+    )
+    parser.add_argument(
+        "--study-accession", help="SCP study accession", default="SCPtest"
+    )
+    parser.add_argument(
+        "--bq-dataset", help="BigQuery dataset identifier", default="cell_metadata"
+    )
+    parser.add_argument(
+        "--bq-table", help="BigQuery table identifier", default="alexandria_convention"
+    )
+    parser.add_argument(
+        "--convention",
+        help="Metadata convention JSON file",
+        default="../../schema/alexandria_convention/alexandria_convention_schema.json",
+    )
+    parser.add_argument("input_metadata", help="Metadata TSV file")
+    return parser
+
+
+def check_if_old_output():
+    """Exit if old output files found
+    """
+    output_files = ["bq.json"]
+
+    old_output = False
+    for file in output_files:
+        if os.path.exists(file):
+            print(f"{file} already exists, please delete file and try again")
+            old_output = True
+    if old_output:
+        exit(1)
+
+
+if __name__ == "__main__":
+    args = create_parser().parse_args()
+    arguments = vars(args)
+    if not args.force:
+        check_if_old_output()
+
+    with open(args.convention, "r") as f:
+        convention = json.load(f)
+    metadata = CellMetadata(
+        file_path=args.input_metadata,
+        study_id=args.study_id,
+        study_file_id=args.study_file_id,
+        study_accession=args.study_accession,
+    )
+    metadata.preprocess(True)
+    print("Validating", args.input_metadata)
+
+    validate_input_metadata(metadata, convention, args.bq_json)
+    if args.issues_json:
+        serialize_issues(metadata)
+    report_issues(metadata)
+    if args.upload:
+        write_metadata_to_bq(metadata, args.bq_dataset, args.bq_table)
+    exit_if_errors(metadata)
```
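
The new script is a thin CLI over helpers that remain in `validation/validate_metadata.py`, so the same flow can be driven programmatically. A sketch mirroring the `__main__` block above, assuming it runs from `ingest/validation` inside the repo; the input paths are illustrative and the third argument to `validate_input_metadata` mirrors the CLI's `bq_json` flag:

```python
# Sketch of driving the validation flow without the CLI (repo layout assumed).
import json
import sys

sys.path.append("..")
from cell_metadata import CellMetadata
from validation.validate_metadata import (
    validate_input_metadata,
    report_issues,
    exit_if_errors,
)

with open("../../schema/alexandria_convention/alexandria_convention_schema.json") as f:
    convention = json.load(f)

metadata = CellMetadata(
    file_path="../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv",
    study_id="dec0dedfeed1111111111111",       # bogus dev default, as in the CLI
    study_file_id="addedfeed000000000000000",  # bogus dev default, as in the CLI
    study_accession="SCPtest",
)
metadata.preprocess(True)

validate_input_metadata(metadata, convention, False)  # False: skip BigQuery JSON output
report_issues(metadata)
exit_if_errors(metadata)
```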

ingest/validation/validate_metadata.py

Lines changed: 0 additions & 106 deletions

```diff
@@ -1,24 +1,4 @@
 """Validate input metadata TSV file against metadata convention.
-
-DESCRIPTION
-This CLI takes a TSV metadata file and validates against a metadata convention
-using the python jsonschema library. The metadata convention JSON schema
-represents the rules that should be enforced on metadata files for studies
-participating under the convention.
-
-EXAMPLE
-# Using JSON file for latest Alexandria metadata convention in repo, validate input TSV
-$ python3 validate_metadata.py ../../tests/data/valid_no_array_v2.0.0.tsv
-
-# generate an issues.json file to compare with reference test files
-$ python3 validate_metadata.py --issues-json ../../tests/data/valid_no_array_v2.0.0.tsv
-
-# generate a BigQuery upload file to compare with reference test files
-$ python3 validate_metadata.py --bq-json ../../tests/data/valid_no_array_v2.0.0.tsv
-
-# use a different metadata convention for validation
-$ python3 validate_metadata.py --convention <path to convention json> ../../tests/data/valid_no_array_v2.0.0.tsv
-
 """
 
 import argparse
@@ -67,67 +47,6 @@
 MAX_HTTP_ATTEMPTS = 8
 
 
-def create_parser():
-    """Parse command line values for validate_metadata
-    """
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    # parser.add_argument(
-    #     '--output',
-    #     '-o',
-    #     type=str,
-    #     help='Output file name [optional]',
-    #     default=None
-    # )
-    # parser.add_argument(
-    #     '--key_id',
-    #     '-k'
-    #     type=str,
-    #     help='Key metadata name for parsing; CellID for metadata, BiosampleID for sample sheets [optional]',
-    #     default='CellID'
-    # )
-
-    # helper param to create JSON representation of metadata.issues
-    # to generate reference output for tests
-    parser.add_argument("--issues-json", action="store_true")
-    # helper param to create JSON representation of convention metadata
-    # to generate json for bigquery testing
-    parser.add_argument("--bq-json", action="store_true")
-    # overwrite existing output
-    parser.add_argument("--force", action="store_true")
-    # test BigQuery upload functions
-    parser.add_argument("--upload", action="store_true")
-    # validate_metadata.py CLI only for dev, bogus defaults below shouldn't propagate
-    # make bogus defaults obviously artificial for ease of detection
-    parser.add_argument(
-        "--study-id",
-        help="MongoDB study identifier",
-        default="dec0dedfeed1111111111111",
-    )
-    parser.add_argument(
-        "--study-file-id",
-        help="MongoDB file identifier",
-        default="addedfeed000000000000000",
-    )
-    parser.add_argument(
-        "--study-accession", help="SCP study accession", default="SCPtest"
-    )
-    parser.add_argument(
-        "--bq-dataset", help="BigQuery dataset identifier", default="cell_metadata"
-    )
-    parser.add_argument(
-        "--bq-table", help="BigQuery table identifier", default="alexandria_convention"
-    )
-    parser.add_argument(
-        "--convention",
-        help="Metadata convention JSON file",
-        default="../../schema/alexandria_convention/alexandria_convention_schema.json",
-    )
-    parser.add_argument("input_metadata", help="Metadata TSV file")
-    return parser
-
-
 ######################## ONTOLOGY RETRIVER #########################
 # TODO: move code in this section to a separate file
 
@@ -1558,28 +1477,3 @@ def check_if_old_output():
     if old_output:
         exit(1)
 
-
-if __name__ == "__main__":
-    args = create_parser().parse_args()
-    arguments = vars(args)
-    if not args.force:
-        check_if_old_output()
-
-    with open(args.convention, "r") as f:
-        convention = json.load(f)
-    metadata = CellMetadata(
-        file_path=args.input_metadata,
-        study_id=args.study_id,
-        study_file_id=args.study_file_id,
-        study_accession=args.study_accession,
-    )
-    metadata.preprocess(True)
-    print("Validating", args.input_metadata)
-
-    validate_input_metadata(metadata, convention, args.bq_json)
-    if args.issues_json:
-        serialize_issues(metadata)
-    report_issues(metadata)
-    if args.upload:
-        write_metadata_to_bq(metadata, args.bq_dataset, args.bq_table)
-    exit_if_errors(metadata)
```
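
Both the removed `__main__` block and the new CLI refuse to run over stale output via `check_if_old_output` unless `--force` is given. A runnable standalone sketch of that guard; the `--force` handling here is a simplified stand-in for the argparse flag:

```python
# Standalone sketch of the stale-output guard kept in validate_metadata.py.
import os
import sys

def check_if_old_output(output_files=("bq.json",)):
    """Exit if old output files are found."""
    old_output = False
    for file in output_files:
        if os.path.exists(file):
            print(f"{file} already exists, please delete file and try again")
            old_output = True
    if old_output:
        sys.exit(1)

if "--force" not in sys.argv:  # the real CLI checks args.force from argparse
    check_if_old_output()
```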

schema/README.md

Lines changed: 12 additions & 9 deletions

````diff
@@ -13,29 +13,32 @@
 
 * copy scp_bq_inputs.json from previous snapshot directory, update with new SCP-internal terms if appropriate
 
-* In the scripts `scp-ingest-pipeline` directory, run
+* In the scripts `scp-ingest-pipeline/schema` directory, run
+
 ```
-python serialize_convention.py <project> <version>
+python ../scripts/serialize_convention.py <project> <version>
 ```
 
 * Copy the new convention JSON and TSV files to the <project>_convention directory
 
-
 Notes:
+
 * Tests in test_validate_metadata.py use current metadata convention (except for invalid metadata convention test)
 
-* Specifically test_bigquery_json_content is expected to fail when the metadata convention is updated. The reference file, bq_test.json, must be updated (replace existing file with the generated addedfeed000000000000000.json file)
+* Specifically `test_bigquery_json_content` is expected to fail when the metadata convention is updated. The reference file, bq_test.json, must be updated (replace existing file with the generated addedfeed000000000000000.json file)
+
 ```
-python validate_metadata.py --bq-json <path to metadata file>
+python metadata_validation.py --bq-json <path to metadata file>
 ```
 
 * To create updated issues.json files to update reference files for tests, in the ingest/validation directory, run
+
 ```
-python validate_metadata.py --issues-json <path to metadata file>
+python metadata_validation.py --issues-json <path to metadata file>
 ```
 
-* To run validate_metadata.py against a different convention file:
+* To run metadata_validation.py against a different convention file:
+
 ```
-python validate_metadata.py --convention <path to convention file> <path to metadata file>
+python metadata_validation.py --convention <path to convention file> <path to metadata file>
 ```
-
````
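
Taken together, the updated README workflow runs end to end. A hedged walk-through from the repo root; `alexandria`, `2.1.2`, and the metadata file path are illustrative placeholders, not prescribed values:

```
# from the repo root; substitute a real project name, version, and metadata file
cd schema
python ../scripts/serialize_convention.py alexandria 2.1.2
cd ../ingest/validation
python metadata_validation.py --bq-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
python metadata_validation.py --issues-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
```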
