From d0e07a5d4efe85f944ec85f49db3cc50bf65f38f Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Tue, 5 May 2026 14:13:23 -0400 Subject: [PATCH 1/4] Refactor custom_excel for portal query --- dcicutils/submitr/custom_excel.py | 346 ++++++++++++++++-------------- 1 file changed, 186 insertions(+), 160 deletions(-) diff --git a/dcicutils/submitr/custom_excel.py b/dcicutils/submitr/custom_excel.py index 9f4ea5291..c200e7bc0 100644 --- a/dcicutils/submitr/custom_excel.py +++ b/dcicutils/submitr/custom_excel.py @@ -2,17 +2,46 @@ import io import json import os -from requests import get as requests_get from typing import Any, List, Optional from dcicutils.data_readers import Excel, ExcelSheetReader from dcicutils.misc_utils import to_boolean, to_float, to_integer -# This module implements a custom Excel spreadsheet class which support "custom column mappings", -# meaning that, and a very low/early level in processing, the columns/values in the spreadsheet -# can be redefined/remapped to different columns/values. The mapping is defined by a JSON config -# file (by default in config/custom_column_mappings.json). It can be thought of as a virtual -# preprocessing step on the spreadsheet. This was first implemented to support the simplified QC -# columns/values. For EXAMPLE, so the spreadsheet author can specify single columns like this: +# This module implements a custom Excel spreadsheet class which supports "custom column mappings", +# meaning that, at a very low/early level in processing, the columns/values in the spreadsheet +# can be redefined/remapped to different columns/values. +# +# The mapping config is fetched live from the portal via: +# +# GET /search/?type=GenericQcConfig&tags=external_quality_metrics +# +# Each GenericQcConfig item returned carries the complete ready-to-use config in its "body" +# field, which already has the exact structure this module expects: +# +# { +# "sheet_mappings": { +# "DuplexSeq_ExternalQualityMetric": "duplexseq_external_quality_metric", +# "DSA_ExternalQualityMetric": "dsa_external_quality_metric" +# }, +# "column_mappings": { +# "duplexseq_external_quality_metric": { +# "total_raw_reads_sequenced": { +# "qc_values#.derived_from": "{name}", +# "qc_values#.value": "{value:integer}", +# "qc_values#.key": "Total Raw Reads Sequenced", +# "qc_values#.tooltip": "# of reads (150bp)" +# }, +# ... +# }, +# ... +# } +# } +# +# If multiple GenericQcConfig items are returned the one with the highest "version" +# (integer-parsed) is used. If the portal query fails or returns nothing, the bundled +# local JSON file (config/custom_column_mappings.json) is used as a fallback. +# +# The mapping can be thought of as a virtual preprocessing step on the spreadsheet. +# For EXAMPLE, so the spreadsheet author can specify single columns like this: # # total_raw_reads_sequenced: 11870183 # total_raw_bases_sequenced: 44928835584 @@ -28,68 +57,55 @@ # qc_values#1.key: Total Raw Bases Sequenced # qc_values#1.tooltip: None # -# The relevant portion of the controlling config file (config/custom_column_mappings.json) -# for the above example looks something like this: -# -# "sheet_mappings": { -# "ExternalQualityMetric": "external_quality_metric" -# }, -# "column_mappings": { -# "external_quality_metric": { -# "total_raw_reads_sequenced": { -# "qc_values#.derived_from": "{name}", -# "qc_values#.value": "{value:integer}", -# "qc_values#.key": "Total Raw Reads Sequenced", -# "qc_values#.tooltip": "# of reads (150bp)" -# }, -# "total_raw_bases_sequenced": { -# "qc_values#.derived_from": "{name}", -# "qc_values#.value": "{value:integer}", -# "qc_values#.key": "Total Raw Bases Sequenced", -# "qc_values#.tooltip": null -# }, -# "et cetera": "..." -# } -# } -# -# This says that for the ExternalQualityMetric sheet (only) the mappings with the config file -# section column_mappings.external_quality_metric will be applied. The "qc_values#" portion of -# the mapped columns names will be expanded to "qc_values#0" for total_raw_reads_sequenced items, -# and to "qc_values#1" for the total_raw_bases_sequenced items, and so on. This will be based on -# the ACTUAL columns present in the sheet; so if total_raw_reads_sequenced were not present in -# the sheet, then the total_raw_bases_sequenced items would be expanded to "qc_values#0". -# Note the special "{name}" and "{value}" values ("macros") for the target (synthetic) properties; -# these will be evaluated (here) to the name of the original property name and value, respectively. -# -# Since the (first) actual use-case of this is in fact for these qc_values, and since these have -# effectively untyped values (i.e. the ExternalQualityMetric schema specifies all primitive types -# as possible/acceptable types for qc_values.value), we also allow a ":TYPE" suffix for the -# special "{value}" macro, so that a specific primitive type may be specified, e.g. "{value:integer}" -# will evaluate the original property value as an integer (if it cannot be converted to an integer -# then whatever its value is, will be passed on through as a string). -# # The hook for this is to pass the CustomExcel type to StructuredDataSet in submission.py. -# Note that the config file is fetched from GitHub, with a fallback to config/custom_column_mappings.json. # # ALSO ... # This CustomExcel class also handles multiple sheets within a spreadsheet representing # the same (portal) type; see comments below near the ExcelSheetName class definition. -CUSTOM_COLUMN_MAPPINGS_BASE_URL = "https://raw.githubusercontent.com/smaht-dac/submitr/refs/heads" -CUSTOM_COLUMN_MAPPINGS_BRANCH = "master" -CUSTOM_COLUMN_MAPPINGS_PATH = "submitr/config/custom_column_mappings.json" -CUSTOM_COLUMN_MAPPINGS_URL = f"{CUSTOM_COLUMN_MAPPINGS_BASE_URL}/{CUSTOM_COLUMN_MAPPINGS_BRANCH}/{CUSTOM_COLUMN_MAPPINGS_PATH}" # noqa -CUSTOM_COLUMN_MAPPINGS_LOCAL = False +CUSTOM_COLUMN_MAPPINGS_LOCAL_CONFIG = os.path.join( + os.path.dirname(__file__), "config", "custom_column_mappings.json" +) COLUMN_NAME_ARRAY_SUFFIX_CHAR = "#" COLUMN_NAME_SEPARATOR = "." +# The portal search used to retrieve EQM column-mapping configs. +GENERIC_QC_CONFIG_SEARCH = "search/?type=GenericQcConfig&tags=external_quality_metrics" + + +def _get_most_recent_config_version(items: list) -> Optional[dict]: + """Return the GenericQcConfig item with the highest integer version number.""" + def parse_version(item): + try: + return int(item.get("version", 0)) + except (ValueError, TypeError): + return 0 + return max(items, key=parse_version, default=None) + class CustomExcel(Excel): - def __init__(self, *args, **kwargs): + def __init__(self, *args, portal=None, **kwargs): super().__init__(*args, **kwargs) - self._custom_column_mappings = CustomExcel._get_custom_column_mappings() + self._custom_column_mappings = CustomExcel._get_custom_column_mappings(portal=portal) + + @classmethod + def with_portal(cls, portal): + """Return a subclass of CustomExcel with portal baked in. + + Use this when passing excel_class to StructuredDataSet, which requires + a real class (it calls issubclass() on the argument internally): + + excel_class=CustomExcel.with_portal(portal) + """ + class _CustomExcelWithPortal(cls): + def __init__(self, *args, **kwargs): + kwargs.setdefault("portal", portal) + super().__init__(*args, **kwargs) + _CustomExcelWithPortal.__name__ = "CustomExcel" + _CustomExcelWithPortal.__qualname__ = "CustomExcel" + return _CustomExcelWithPortal def sheet_reader(self, sheet_name: str) -> ExcelSheetReader: return CustomExcelSheetReader(self, sheet_name=sheet_name, workbook=self._workbook, @@ -102,47 +118,62 @@ def effective_sheet_name(sheet_name: str) -> str: return sheet_name @staticmethod - def _get_custom_column_mappings() -> Optional[dict]: + def _get_custom_column_mappings(portal=None) -> Optional[dict]: - def fetch_custom_column_mappings(): - custom_column_mappings = None - if CUSTOM_COLUMN_MAPPINGS_LOCAL is not True: - # Fetch config file directly from GitHub (yes this repo is public). - try: - custom_column_mappings = requests_get(CUSTOM_COLUMN_MAPPINGS_URL).json() - except Exception: - pass - if not custom_column_mappings: - # Fallback to the actual config file in this package. - try: - file = os.path.join(os.path.dirname(__file__), "config", "custom_column_mappings.json") - with io.open(file, "r") as f: - custom_column_mappings = json.load(f) - except Exception: - custom_column_mappings = None - if not isinstance(custom_column_mappings, dict): - custom_column_mappings = {} - return custom_column_mappings + def fetch_from_portal(portal) -> Optional[dict]: + """Query the portal for GenericQcConfig items. The most recent item's + "body" field is the complete ready-to-use config dict.""" + if portal is None: + return None + try: + results = portal.get_metadata(GENERIC_QC_CONFIG_SEARCH) + if not isinstance(results, dict): + return None + items = results.get("@graph", []) + if not items: + return None + item = _get_most_recent_config_version(items) + if item is None: + return None + body = item.get("body") + if not isinstance(body, dict): + return None + return body + except Exception: + return None - def post_process_custom_column_mappings(custom_column_mappings: dict) -> Optional[dict]: - if isinstance(column_mappings := custom_column_mappings.get("column_mappings"), dict): - if isinstance(sheet_mappings := custom_column_mappings.get("sheet_mappings"), dict): - for sheet_name in list(sheet_mappings.keys()): - if isinstance(sheet_mappings[sheet_name], str): - if isinstance(column_mappings.get(sheet_mappings[sheet_name]), dict): - sheet_mappings[sheet_name] = column_mappings.get(sheet_mappings[sheet_name]) - else: - del sheet_mappings[sheet_name] - elif not isinstance(sheet_mappings[sheet_name], dict): - del sheet_mappings[sheet_name] - return sheet_mappings - return None + def fetch_from_local_json() -> Optional[dict]: + """Fall back to the bundled static JSON config.""" + try: + with io.open(CUSTOM_COLUMN_MAPPINGS_LOCAL_CONFIG, "r") as f: + return json.load(f) + except Exception: + return None - if not (custom_column_mappings := fetch_custom_column_mappings()): - return None - if not (custom_column_mappings := post_process_custom_column_mappings(custom_column_mappings)): + def post_process(raw_config: dict) -> Optional[dict]: + """Resolve sheet_mappings string references into the actual column-mapping dicts.""" + if not isinstance(raw_config, dict): + return None + column_mappings = raw_config.get("column_mappings") + sheet_mappings = raw_config.get("sheet_mappings") + if not isinstance(column_mappings, dict) or not isinstance(sheet_mappings, dict): + return None + for sheet_name in list(sheet_mappings.keys()): + mapping_key = sheet_mappings[sheet_name] + if isinstance(mapping_key, str): + resolved = column_mappings.get(mapping_key) + if isinstance(resolved, dict): + sheet_mappings[sheet_name] = resolved + else: + del sheet_mappings[sheet_name] + elif not isinstance(mapping_key, dict): + del sheet_mappings[sheet_name] + return sheet_mappings if sheet_mappings else None + + raw_config = fetch_from_portal(portal) or fetch_from_local_json() + if not raw_config: return None - return custom_column_mappings + return post_process(raw_config) class CustomExcelSheetReader(ExcelSheetReader): @@ -154,12 +185,11 @@ def __init__(self, *args, **kwargs) -> None: if ARGUMENT_NAME_CUSTOM_COLUMN_MAPPINGS in kwargs: def lookup_custom_column_mappings(custom_column_mappings: dict, sheet_name: str) -> Optional[dict]: if isinstance(custom_column_mappings, dict) and isinstance(sheet_name, str): - if isinstance(found_custom_column_mappings := custom_column_mappings.get(sheet_name), dict): - return found_custom_column_mappings - if (effective_sheet_name := CustomExcel.effective_sheet_name(sheet_name)) != sheet_name: - if isinstance(found_custom_column_mappings := - custom_column_mappings.get(effective_sheet_name), dict): - return found_custom_column_mappings + if isinstance(found := custom_column_mappings.get(sheet_name), dict): + return found + if (effective := CustomExcel.effective_sheet_name(sheet_name)) != sheet_name: + if isinstance(found := custom_column_mappings.get(effective), dict): + return found return None custom_column_mappings = kwargs[ARGUMENT_NAME_CUSTOM_COLUMN_MAPPINGS] del kwargs[ARGUMENT_NAME_CUSTOM_COLUMN_MAPPINGS] @@ -174,58 +204,38 @@ def lookup_custom_column_mappings(custom_column_mappings: dict, sheet_name: str) def _define_header(self, header: List[Optional[Any]]) -> None: def fixup_custom_column_mappings(custom_column_mappings: dict, actual_column_names: List[str]) -> dict: - - # This fixes up the custom column mappings config for this particular sheet based - # on the actual (header) column names, i.e. e.g. in particular for the array - # specifiers like mapping "qc_values#.value" to qc_values#0.value". - - def fixup_custom_array_column_mappings(custom_column_mappings: dict) -> None: - - def get_simple_array_column_name_component(column_name: str) -> Optional[str]: - if isinstance(column_name, str): - if column_name_components := column_name.split(COLUMN_NAME_SEPARATOR): - if (suffix := column_name_components[0].find(COLUMN_NAME_ARRAY_SUFFIX_CHAR)) > 0: - if (suffix + 1) == len(column_name_components[0]): - return column_name_components[0][:suffix] - return None - - synthetic_array_column_names = {} - for column_name in custom_column_mappings: - for synthetic_column_name in list(custom_column_mappings[column_name].keys()): - synthetic_array_column_name = get_simple_array_column_name_component(synthetic_column_name) - if synthetic_array_column_name: - if synthetic_array_column_name not in synthetic_array_column_names: - synthetic_array_column_names[synthetic_array_column_name] = \ - {"index": 0, "columns": [column_name]} - elif (column_name not in - synthetic_array_column_names[synthetic_array_column_name]["columns"]): - synthetic_array_column_names[synthetic_array_column_name]["index"] += 1 - synthetic_array_column_names[synthetic_array_column_name]["columns"].append(column_name) - synthetic_array_column_index = \ - synthetic_array_column_names[synthetic_array_column_name]["index"] - synthetic_array_column_name = synthetic_column_name.replace( - f"{synthetic_array_column_name}#", - f"{synthetic_array_column_name}#{synthetic_array_column_index}") - custom_column_mappings[column_name][synthetic_array_column_name] = \ - custom_column_mappings[column_name][synthetic_column_name] - del custom_column_mappings[column_name][synthetic_column_name] - + # Array indices (the N in qc_values#N.key) are intentionally left unassigned here. + # They are assigned dynamically per-row in _iter_mapper so that only non-empty + # columns receive an index, producing a compact 0-based array with no gaps. custom_column_mappings = deepcopy(custom_column_mappings) for custom_column_name in list(custom_column_mappings.keys()): if custom_column_name not in actual_column_names: del custom_column_mappings[custom_column_name] - fixup_custom_array_column_mappings(custom_column_mappings) return custom_column_mappings super()._define_header(header) if self._custom_column_mappings: self._custom_column_mappings = fixup_custom_column_mappings(self._custom_column_mappings, self.header) self._original_header = self.header + # Build an expanded header that _StructuredRowTemplate will use to register + # set_value functions. Each source column that has a mapping contributes N + # synthetic entries (one per mapped column in the sheet), with consecutive + # numeric indices. This is the *maximum* possible index range; _iter_mapper + # will only populate the slots that correspond to non-empty source values, so + # the actual array in any given row will be shorter — but every index it might + # emit must be pre-registered here or structured_data silently drops the value. self.header = [] + index = 0 for column_name in header: if column_name in self._custom_column_mappings: - synthetic_column_names = list(self._custom_column_mappings[column_name].keys()) - self.header += synthetic_column_names + for synthetic_key in self._custom_column_mappings[column_name]: + array_name, _, rest = synthetic_key.partition(COLUMN_NAME_ARRAY_SUFFIX_CHAR) + if rest: # bare placeholder: "qc_values#.key" -> "qc_values#.key" + self.header.append( + f"{array_name}{COLUMN_NAME_ARRAY_SUFFIX_CHAR}{index}{COLUMN_NAME_SEPARATOR}{rest.lstrip(COLUMN_NAME_SEPARATOR)}") + else: + self.header.append(synthetic_key) + index += 1 else: self.header.append(column_name) @@ -238,24 +248,45 @@ def _iter_mapper(self, row: dict) -> List[str]: if self._custom_column_mappings: synthetic_columns = {} columns_to_delete = [] + # Track per-array-name indices so each non-empty column gets the next + # available slot (e.g. qc_values#0, qc_values#1, ...). Indices are + # assigned here at row-processing time rather than statically at header + # time so that empty columns are simply skipped and leave no gap. + array_indices: dict = {} for column_name in row: - if column_name in self._custom_column_mappings: - column_mapping = self._custom_column_mappings[column_name] - for synthetic_column_name in column_mapping: - synthetic_column_value = column_mapping[synthetic_column_name] - if synthetic_column_value == "{name}": - synthetic_columns[synthetic_column_name] = column_name - elif (column_value := self._parse_value_specifier(synthetic_column_value, - row[column_name])) is not None: - synthetic_columns[synthetic_column_name] = column_value - else: - synthetic_columns[synthetic_column_name] = synthetic_column_value - columns_to_delete.append(column_name) - if columns_to_delete: - for column_to_delete in columns_to_delete: - del row[column_to_delete] - if synthetic_columns: - row.update(synthetic_columns) + if column_name not in self._custom_column_mappings: + continue + columns_to_delete.append(column_name) + if not row[column_name]: + continue + column_mapping = self._custom_column_mappings[column_name] + # Determine the array name (e.g. "qc_values") used by this mapping + # group and assign it the next sequential index for this row. + array_name = None + for synthetic_column_name in column_mapping: + prefix, _, _ = synthetic_column_name.partition(COLUMN_NAME_ARRAY_SUFFIX_CHAR) + if _ and prefix: + array_name = prefix + break + if array_name is not None: + index = array_indices.get(array_name, 0) + array_indices[array_name] = index + 1 + for synthetic_column_name, synthetic_column_value in column_mapping.items(): + # Replace bare "array_name#" placeholder with the assigned index. + if array_name is not None: + synthetic_column_name = synthetic_column_name.replace( + f"{array_name}{COLUMN_NAME_ARRAY_SUFFIX_CHAR}", + f"{array_name}{COLUMN_NAME_ARRAY_SUFFIX_CHAR}{index}", 1) + if synthetic_column_value == "{name}": + synthetic_columns[synthetic_column_name] = column_name + elif (column_value := self._parse_value_specifier(synthetic_column_value, + row[column_name])) is not None: + synthetic_columns[synthetic_column_name] = column_value + else: + synthetic_columns[synthetic_column_name] = synthetic_column_value + for column_to_delete in columns_to_delete: + del row[column_to_delete] + row.update(synthetic_columns) return row @staticmethod @@ -280,7 +311,7 @@ def _parse_value_specifier(value_specifier: Optional[Any], value: Optional[Any]) # This ExcelSheetName class is used to represent an Excel sheet name; it is simply a str type with an -# additional "original" property. The value of this will be given string with any prefix preceeding an +# additional "original" property. The value of this will be given string with any prefix preceding an # underscore removed; and the "original" property will evaluate to the original/given string. This is # used to support the use of sheet names of the form "XYZ_TypeName", where "XYZ" is an arbitrary string # and "TypeName" is the virtual name of the sheet, which will be used by StructuredDataSet/etc, and which @@ -289,11 +320,6 @@ def _parse_value_specifier(value_specifier: Optional[Any], value: Optional[Any]) # this would otherwise not be possible; this provides a way for a spreadsheet to partition items/rows of # a particular fixed type across multiple sheets. # -# If this requirement was known at the beginning (or if we had more foresight) we would not support this -# feature this way; we would build it in from the start; this mechanism here merely provides a hook for -# this feature with minimal disruption (the only real tricky part being to make sure the original sheet -# name is reported in error messages); doing is this was minimizes risk of disruption. -# class ExcelSheetName(str): def __new__(cls, value: str): value = value if isinstance(value, str) else str(value) From 055a998e175fdf4c2a2a41171328a77d69d23028 Mon Sep 17 00:00:00 2001 From: sarahgonicholson Date: Thu, 7 May 2026 11:13:00 -0400 Subject: [PATCH 2/4] fix formatting --- dcicutils/submitr/custom_excel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dcicutils/submitr/custom_excel.py b/dcicutils/submitr/custom_excel.py index c200e7bc0..e28cbe0b7 100644 --- a/dcicutils/submitr/custom_excel.py +++ b/dcicutils/submitr/custom_excel.py @@ -232,7 +232,9 @@ def fixup_custom_column_mappings(custom_column_mappings: dict, actual_column_nam array_name, _, rest = synthetic_key.partition(COLUMN_NAME_ARRAY_SUFFIX_CHAR) if rest: # bare placeholder: "qc_values#.key" -> "qc_values#.key" self.header.append( - f"{array_name}{COLUMN_NAME_ARRAY_SUFFIX_CHAR}{index}{COLUMN_NAME_SEPARATOR}{rest.lstrip(COLUMN_NAME_SEPARATOR)}") + f"{array_name}{COLUMN_NAME_ARRAY_SUFFIX_CHAR}{index}" + f"{COLUMN_NAME_SEPARATOR}{rest.lstrip(COLUMN_NAME_SEPARATOR)}" + ) else: self.header.append(synthetic_key) index += 1 From 29004d2f423bec5daeea2430f2f95c69cbf0b96a Mon Sep 17 00:00:00 2001 From: aschroed Date: Tue, 9 Jun 2026 13:08:06 -0400 Subject: [PATCH 3/4] update main GA to pull new OIDC creds --- .github/workflows/main.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 733966a36..2b2e090f8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,7 +12,9 @@ on: # Allows you to run this workflow manually from the Actions tab workflow_dispatch: - +permissions: #Needed for OIDC authentication + id-token: write + contents: read # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" @@ -42,10 +44,14 @@ jobs: # show loaded versions of various poetry-related libraries pip freeze --all | egrep '(pip|poetry(.[a-z]+)?|tomlkit)==' + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }} + aws-region: us-east-1 + - name: QA env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} S3_ENCRYPT_KEY: ${{ secrets.S3_ENCRYPT_KEY }} GLOBAL_ENV_BUCKET: foursight-envs run: | From a2e0cb14209e74487af971805fd9ee6856e67e35 Mon Sep 17 00:00:00 2001 From: aschroed Date: Wed, 17 Jun 2026 17:17:58 -0400 Subject: [PATCH 4/4] fix for [0] string values being converted into arrays --- dcicutils/structured_data.py | 3 ++- test/test_structured_data.py | 24 ++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py index 45b11586f..9ca255eed 100644 --- a/dcicutils/structured_data.py +++ b/dcicutils/structured_data.py @@ -619,6 +619,7 @@ def set_value_backtrack_object(path_index: int, path_element: str) -> None: original_data = data json_value = None + is_schema_string = typeinfo is not None and typeinfo.get("type") == "string" if isinstance(path[-1], int) and (json_value := load_json_if(value, is_array=True)): path = right_trim(path, remove=lambda value: isinstance(value, int)) for i, p in enumerate(path[:-1]): @@ -631,7 +632,7 @@ def set_value_backtrack_object(path_index: int, path_element: str) -> None: values = [mapv(value, src) for value in values] merge_objects(data, values) else: - if json_value or (json_value := load_json_if(value, is_array=True, is_object=True)): + if json_value or (not is_schema_string and (json_value := load_json_if(value, is_array=True, is_object=True))): data[p] = json_value else: if isinstance(p, str) and (not isinstance(data, dict) or p not in data): diff --git a/test/test_structured_data.py b/test/test_structured_data.py index 453ce32bc..2979ae8ca 100644 --- a/test/test_structured_data.py +++ b/test/test_structured_data.py @@ -1065,7 +1065,7 @@ def _pytest_kwargs(kwargs: List[dict]) -> List[dict]: { "indigo": ["abc", "def", "ghi", "123456890"], "juliet": [[[0], [12, 34], [5], [67, 8, 90]], [[123]]], - "alfa": {"bravo": {"foo": 123}} + "alfa": {"bravo": '{"foo": 123}'} }, { "indigo": ["prufrock", "j.", "alfred"], @@ -1074,13 +1074,29 @@ def _pytest_kwargs(kwargs: List[dict]) -> List[dict]: } ] }, - "expected_errors": [{'src': {'type': 'SomeTypeFour', 'row': 1}, - 'error': "Validation error at '$.alfa.bravo': {'foo': 123} is not of type 'string'"}, - {'src': {'type': 'SomeTypeFour', 'row': 2}, + "expected_errors": [{'src': {'type': 'SomeTypeFour', 'row': 2}, 'error': "Validation error at '$.alfa.bravo': " "{'charlie': {'delta': 'hellocharlie'}} is not of type 'string'"}] # noqa }, # ---------------------------------------------------------------------------------------------- + # Verify that string values resembling JSON arrays/objects (e.g. '[0]', '[10, 50]') are stored + # as literal strings and not parsed into Python lists when the schema type is string. + { + "rows": [ + "alfa.bravo", + "[0]", + "[10, 50]", + ], + "as_file_name": "some_type_four.tsv", + "schemas": [_load_json_from_file("some_type_four.json")], + "expected": { + "SomeTypeFour": [ + {"alfa": {"bravo": "[0]"}}, + {"alfa": {"bravo": "[10, 50]"}}, + ] + } + }, + # ---------------------------------------------------------------------------------------------- { "ignore": True, "rows": [