
Commit 613e26c

Merge pull request #282 from broadinstitute/development

Release 1.23.2

2 parents edd971a + 55daf30

15 files changed: +1559 −138 lines

ingest/anndata_.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -18,7 +18,7 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
         )
         # If performing cluster extraction, set obsm_keys
         extract_cluster = kwargs.get("extract_cluster")
-        if extract_cluster:
+        if extract_cluster == True:
            self.obsm_keys = kwargs["obsm_keys"]
        else:
            pass
```
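
Why the strict comparison: `kwargs.get("extract_cluster")` can surface non-boolean values (for example, strings passed through the CLI), which plain truthiness would accept. A minimal runnable sketch of the difference, using hypothetical values:

```python
# Sketch (hypothetical values) of truthiness vs. the strict == True check.
def truthy_check(kwargs):
    return bool(kwargs.get("extract_cluster"))

def strict_check(kwargs):
    # strings like "false" no longer pass; an absent key is also rejected
    return kwargs.get("extract_cluster") == True

for value in (True, False, None, "false"):
    kwargs = {"extract_cluster": value}
    print(repr(value), truthy_check(kwargs), strict_check(kwargs))
# "false" is truthy, so truthy_check returns True while strict_check returns False
```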

ingest/ingest_pipeline.py

Lines changed: 17 additions & 8 deletions

```diff
@@ -14,23 +14,26 @@
 # Takes expression file and stores it into MongoDB
 
 # Ingest cluster file
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cluster --cluster-file ../tests/data/test_1k_cluster_Data.csv --ingest-cluster --name cluster1 --domain-ranges "{'x':[-1, 1], 'y':[-1, 1], 'z':[-1, 1]}"
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cluster --cluster-file ../tests/data/test_1k_cluster_data.csv --ingest-cluster --name cluster1 --domain-ranges "{'x':[-1, 1], 'y':[-1, 1], 'z':[-1, 1]}"
 
 # Ingest Cell Metadata file
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata
 
 # Ingest Cell Metadata file against convention
 !! Please note that you must have a pre-configured BigQuery table available
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata --validate-convention --bq-dataset cell_metadata --bq-table alexandria_convention
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_cell_metadata --cell-metadata-file ../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.txt --study-accession SCP123 --ingest-cell-metadata --validate-convention --bq-dataset cell_metadata --bq-table alexandria_convention
 
 # Ingest dense file
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_1000_cells.txt --matrix-file-type dense
 
-# Ingest AnnData file
+# Ingest AnnData file - basic "does it open in Scanpy" validation
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad
 
+# Ingest AnnData file - cluster extraction
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_anndata --extract-cluster --ingest-anndata --anndata-file ../tests/data/anndata/test.h5ad --obsm-keys "['X_tsne']"
+
 # Subsample cluster and metadata file
-python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_Data.csv --name custer1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
+python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_subsample --cluster-file ../tests/data/test_1k_cluster_data.csv --name cluster1 --cell-metadata-file ../tests/data/test_1k_metadata_Data.csv --subsample
 
 # Ingest mtx files
 python ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --matrix-file ../tests/data/mtx/matrix.mtx --matrix-file-type mtx --gene-file ../tests/data/genes.tsv --barcode-file ../tests/data/barcodes.tsv
@@ -487,7 +490,9 @@ def ingest_anndata(self):
         )
         if self.anndata.validate():
             self.report_validation("success")
-            if self.kwargs["extract_cluster"]:
+            if self.kwargs["extract_cluster"] == True:
+                if not self.kwargs["obsm_keys"]:
+                    self.kwargs["obsm_keys"] = ['X_tsne']
                 for key in self.kwargs["obsm_keys"]:
                     AnnDataIngestor.generate_cluster_header(self.anndata.adata, key)
                     AnnDataIngestor.generate_cluster_type_declaration(
@@ -523,7 +528,7 @@ def render_expression_arrays(self):
                 matrix_file_path=self.matrix_file_path,
                 matrix_file_type=self.matrix_file_type,
                 cluster_file_path=self.cluster_file,
-                **self.kwargs
+                **self.kwargs,
             )
             exp_writer.render_artifacts()
         except Exception as e:
@@ -613,7 +618,11 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments):
             file_path, study_file_id, files_to_match
         )
     # for successful anndata jobs, need to delocalize intermediate ingest files
-    elif "extract_cluster" in arguments and all(i < 1 for i in status):
+    elif (
+        "extract_cluster" in arguments
+        and arguments.get("extract_cluster") == True
+        and all(i < 1 for i in status)
+    ):
         file_path, study_file_id = get_delocalization_info(arguments)
         # append status?
         if IngestFiles.is_remote_file(file_path):
```
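
The new `ingest_anndata` branch falls back to `['X_tsne']` when cluster extraction is requested without explicit `--obsm-keys`. A standalone sketch of that fallback; `resolve_obsm_keys` is a hypothetical helper for illustration, not a pipeline function:

```python
# Illustrative standalone version of the obsm_keys fallback added above.
def resolve_obsm_keys(kwargs):
    if kwargs.get("extract_cluster") == True:
        if not kwargs.get("obsm_keys"):
            kwargs["obsm_keys"] = ["X_tsne"]  # default AnnData embedding key, as above
        return kwargs["obsm_keys"]
    return []  # extraction not requested

print(resolve_obsm_keys({"extract_cluster": True, "obsm_keys": None}))        # ['X_tsne']
print(resolve_obsm_keys({"extract_cluster": True, "obsm_keys": ["X_umap"]}))  # ['X_umap']
print(resolve_obsm_keys({"extract_cluster": False}))                          # []
```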
ingest/validation/metadata_validation.py

Lines changed: 161 additions & 0 deletions

```diff
@@ -0,0 +1,161 @@
+"""Validate input metadata TSV file against metadata convention.
+
+DESCRIPTION
+This script takes a TSV metadata file and validates against a metadata convention
+using the python jsonschema library. The metadata convention JSON schema
+represents the rules that should be enforced on metadata files for studies
+participating under the convention.
+
+EXAMPLE
+# Using JSON file for latest Alexandria metadata convention in repo, validate input TSV
+$ python3 metadata_validation.py ../../tests/data/annotation/metadata/convention/valid_array_v2.1.2.txt
+
+# generate an issues.json file to compare with reference test files
+$ python3 metadata_validation.py --issues-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
+
+# generate a BigQuery upload file to compare with reference test files
+$ python3 metadata_validation.py --bq-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
+
+# use a different metadata convention for validation
+$ python3 metadata_validation.py --convention <path to convention json> ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
+
+"""
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+import sys
+import requests
+import urllib.parse as encoder
+import re
+import os
+import numbers
+import time
+import backoff
+import csv
+import copy
+import itertools
+import math
+import pandas as pd
+
+import colorama
+from colorama import Fore
+import jsonschema
+from google.cloud import bigquery
+
+sys.path.append("..")
+try:
+    # Used when importing internally and in tests
+    from cell_metadata import CellMetadata
+    from validation.validate_metadata import (
+        report_issues,
+        validate_input_metadata,
+        write_metadata_to_bq,
+        serialize_issues,
+        exit_if_errors,
+    )
+except ImportError:
+    # Used when importing as external package, e.g. imports in single_cell_portal code
+    from ..cell_metadata import CellMetadata
+
+
+def create_parser():
+    """Parse command line values for validate_metadata
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    # parser.add_argument(
+    #     '--output',
+    #     '-o',
+    #     type=str,
+    #     help='Output file name [optional]',
+    #     default=None
+    # )
+    # parser.add_argument(
+    #     '--key_id',
+    #     '-k'
+    #     type=str,
+    #     help='Key metadata name for parsing; CellID for metadata, BiosampleID for sample sheets [optional]',
+    #     default='CellID'
+    # )
+
+    # helper param to create JSON representation of metadata.issues
+    # to generate reference output for tests
+    parser.add_argument("--issues-json", action="store_true")
+    # helper param to create JSON representation of convention metadata
+    # to generate json for bigquery testing
+    parser.add_argument("--bq-json", action="store_true")
+    # overwrite existing output
+    parser.add_argument("--force", action="store_true")
+    # test BigQuery upload functions
+    parser.add_argument("--upload", action="store_true")
+    # validate_metadata.py CLI only for dev, bogus defaults below shouldn't propagate
+    # make bogus defaults obviously artificial for ease of detection
+    parser.add_argument(
+        "--study-id",
+        help="MongoDB study identifier",
+        default="dec0dedfeed1111111111111",
+    )
+    parser.add_argument(
+        "--study-file-id",
+        help="MongoDB file identifier",
+        default="addedfeed000000000000000",
+    )
+    parser.add_argument(
+        "--study-accession", help="SCP study accession", default="SCPtest"
+    )
+    parser.add_argument(
+        "--bq-dataset", help="BigQuery dataset identifier", default="cell_metadata"
+    )
+    parser.add_argument(
+        "--bq-table", help="BigQuery table identifier", default="alexandria_convention"
+    )
+    parser.add_argument(
+        "--convention",
+        help="Metadata convention JSON file",
+        default="../../schema/alexandria_convention/alexandria_convention_schema.json",
+    )
+    parser.add_argument("input_metadata", help="Metadata TSV file")
+    return parser
+
+
+def check_if_old_output():
+    """Exit if old output files found
+    """
+    output_files = ["bq.json"]
+
+    old_output = False
+    for file in output_files:
+        if os.path.exists(file):
+            print(f"{file} already exists, please delete file and try again")
+            old_output = True
+    if old_output:
+        exit(1)
+
+
+if __name__ == "__main__":
+    args = create_parser().parse_args()
+    arguments = vars(args)
+    if not args.force:
+        check_if_old_output()
+
+    with open(args.convention, "r") as f:
+        convention = json.load(f)
+    metadata = CellMetadata(
+        file_path=args.input_metadata,
+        study_id=args.study_id,
+        study_file_id=args.study_file_id,
+        study_accession=args.study_accession,
+    )
+    metadata.preprocess(True)
+    print("Validating", args.input_metadata)
+
+    validate_input_metadata(metadata, convention, args.bq_json)
+    if args.issues_json:
+        serialize_issues(metadata)
+    report_issues(metadata)
+    if args.upload:
+        write_metadata_to_bq(metadata, args.bq_dataset, args.bq_table)
+    exit_if_errors(metadata)
```
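
The new script is a thin CLI over helpers that remain in `validation/validate_metadata.py`, so the same flow can be driven programmatically. A sketch mirroring the `__main__` block above, assuming it runs from `ingest/validation` inside the repo; the input paths are illustrative and the third argument to `validate_input_metadata` mirrors the CLI's `bq_json` flag:

```python
# Sketch of driving the validation flow without the CLI (repo layout assumed).
import json
import sys

sys.path.append("..")
from cell_metadata import CellMetadata
from validation.validate_metadata import (
    validate_input_metadata,
    report_issues,
    exit_if_errors,
)

with open("../../schema/alexandria_convention/alexandria_convention_schema.json") as f:
    convention = json.load(f)

metadata = CellMetadata(
    file_path="../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv",
    study_id="dec0dedfeed1111111111111",       # bogus dev default, as in the CLI
    study_file_id="addedfeed000000000000000",  # bogus dev default, as in the CLI
    study_accession="SCPtest",
)
metadata.preprocess(True)

validate_input_metadata(metadata, convention, False)  # False: skip BigQuery JSON output
report_issues(metadata)
exit_if_errors(metadata)
```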

ingest/validation/validate_metadata.py

Lines changed: 0 additions & 106 deletions

```diff
@@ -1,24 +1,4 @@
 """Validate input metadata TSV file against metadata convention.
-
-DESCRIPTION
-This CLI takes a TSV metadata file and validates against a metadata convention
-using the python jsonschema library. The metadata convention JSON schema
-represents the rules that should be enforced on metadata files for studies
-participating under the convention.
-
-EXAMPLE
-# Using JSON file for latest Alexandria metadata convention in repo, validate input TSV
-$ python3 validate_metadata.py ../../tests/data/valid_no_array_v2.0.0.tsv
-
-# generate an issues.json file to compare with reference test files
-$ python3 validate_metadata.py --issues-json ../../tests/data/valid_no_array_v2.0.0.tsv
-
-# generate a BigQuery upload file to compare with reference test files
-$ python3 validate_metadata.py --bq-json ../../tests/data/valid_no_array_v2.0.0.tsv
-
-# use a different metadata convention for validation
-$ python3 validate_metadata.py --convention <path to convention json> ../../tests/data/valid_no_array_v2.0.0.tsv
-
 """
 
 import argparse
@@ -67,67 +47,6 @@
 MAX_HTTP_ATTEMPTS = 8
 
 
-def create_parser():
-    """Parse command line values for validate_metadata
-    """
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    # parser.add_argument(
-    #     '--output',
-    #     '-o',
-    #     type=str,
-    #     help='Output file name [optional]',
-    #     default=None
-    # )
-    # parser.add_argument(
-    #     '--key_id',
-    #     '-k'
-    #     type=str,
-    #     help='Key metadata name for parsing; CellID for metadata, BiosampleID for sample sheets [optional]',
-    #     default='CellID'
-    # )
-
-    # helper param to create JSON representation of metadata.issues
-    # to generate reference output for tests
-    parser.add_argument("--issues-json", action="store_true")
-    # helper param to create JSON representation of convention metadata
-    # to generate json for bigquery testing
-    parser.add_argument("--bq-json", action="store_true")
-    # overwrite existing output
-    parser.add_argument("--force", action="store_true")
-    # test BigQuery upload functions
-    parser.add_argument("--upload", action="store_true")
-    # validate_metadata.py CLI only for dev, bogus defaults below shouldn't propagate
-    # make bogus defaults obviously artificial for ease of detection
-    parser.add_argument(
-        "--study-id",
-        help="MongoDB study identifier",
-        default="dec0dedfeed1111111111111",
-    )
-    parser.add_argument(
-        "--study-file-id",
-        help="MongoDB file identifier",
-        default="addedfeed000000000000000",
-    )
-    parser.add_argument(
-        "--study-accession", help="SCP study accession", default="SCPtest"
-    )
-    parser.add_argument(
-        "--bq-dataset", help="BigQuery dataset identifier", default="cell_metadata"
-    )
-    parser.add_argument(
-        "--bq-table", help="BigQuery table identifier", default="alexandria_convention"
-    )
-    parser.add_argument(
-        "--convention",
-        help="Metadata convention JSON file",
-        default="../../schema/alexandria_convention/alexandria_convention_schema.json",
-    )
-    parser.add_argument("input_metadata", help="Metadata TSV file")
-    return parser
-
-
 ######################## ONTOLOGY RETRIVER #########################
 # TODO: move code in this section to a separate file
 
@@ -1558,28 +1477,3 @@ def check_if_old_output():
     if old_output:
         exit(1)
 
-
-if __name__ == "__main__":
-    args = create_parser().parse_args()
-    arguments = vars(args)
-    if not args.force:
-        check_if_old_output()
-
-    with open(args.convention, "r") as f:
-        convention = json.load(f)
-    metadata = CellMetadata(
-        file_path=args.input_metadata,
-        study_id=args.study_id,
-        study_file_id=args.study_file_id,
-        study_accession=args.study_accession,
-    )
-    metadata.preprocess(True)
-    print("Validating", args.input_metadata)
-
-    validate_input_metadata(metadata, convention, args.bq_json)
-    if args.issues_json:
-        serialize_issues(metadata)
-    report_issues(metadata)
-    if args.upload:
-        write_metadata_to_bq(metadata, args.bq_dataset, args.bq_table)
-    exit_if_errors(metadata)
```
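
Both the removed `__main__` block and the new CLI refuse to run over stale output via `check_if_old_output` unless `--force` is given. A runnable standalone sketch of that guard; the `--force` handling here is a simplified stand-in for the argparse flag:

```python
# Standalone sketch of the stale-output guard kept in validate_metadata.py.
import os
import sys

def check_if_old_output(output_files=("bq.json",)):
    """Exit if old output files are found."""
    old_output = False
    for file in output_files:
        if os.path.exists(file):
            print(f"{file} already exists, please delete file and try again")
            old_output = True
    if old_output:
        sys.exit(1)

if "--force" not in sys.argv:  # the real CLI checks args.force from argparse
    check_if_old_output()
```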

schema/README.md

Lines changed: 12 additions & 9 deletions

````diff
@@ -13,29 +13,32 @@
 
 * copy scp_bq_inputs.json from previous snapshot directory, update with new SCP-internal terms if appropriate
 
-* In the scripts `scp-ingest-pipeline` directory, run
+* In the scripts `scp-ingest-pipeline/schema` directory, run
+
 ```
-python serialize_convention.py <project> <version>
+python ../scripts/serialize_convention.py <project> <version>
 ```
 
 * Copy the new convention JSON and TSV files to the <project>_convention directory
 
-
 Notes:
+
 * Tests in test_validate_metadata.py use current metadata convention (except for invalid metadata convention test)
 
-* Specifically test_bigquery_json_content is expected to fail when the metadata convention is updated. The reference file, bq_test.json, must be updated (replace existing file with the generated addedfeed000000000000000.json file)
+* Specifically `test_bigquery_json_content` is expected to fail when the metadata convention is updated. The reference file, bq_test.json, must be updated (replace existing file with the generated addedfeed000000000000000.json file)
+
 ```
-python validate_metadata.py --bq-json <path to metadata file>
+python metadata_validation.py --bq-json <path to metadata file>
 ```
 
 * To create updated issues.json files to update reference files for tests, in the ingest/validation directory, run
+
 ```
-python validate_metadata.py --issues-json <path to metadata file>
+python metadata_validation.py --issues-json <path to metadata file>
 ```
 
-* To run validate_metadata.py against a different convention file:
+* To run metadata_validation.py against a different convention file:
+
 ```
-python validate_metadata.py --convention <path to convention file> <path to metadata file>
+python metadata_validation.py --convention <path to convention file> <path to metadata file>
 ```
-
````
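
Taken together, the updated README workflow runs end to end. A hedged walk-through from the repo root; `alexandria`, `2.1.2`, and the metadata file path are illustrative placeholders, not prescribed values:

```
# from the repo root; substitute a real project name, version, and metadata file
cd schema
python ../scripts/serialize_convention.py alexandria 2.1.2
cd ../ingest/validation
python metadata_validation.py --bq-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
python metadata_validation.py --issues-json ../../tests/data/annotation/metadata/convention/valid_no_array_v2.0.0.tsv
```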
