From 350380894e9dafc903989f1ae208ef37231aca5d Mon Sep 17 00:00:00 2001 From: Diya Date: Mon, 14 Apr 2025 05:02:19 +0530 Subject: [PATCH 01/36] {Search for wildcards function updated} --- datashuttle/utils/folders.py | 71 +++++++++++++---- datashuttle/utils/validation.py | 2 +- tests/test_date_search_range.py | 134 ++++++++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 16 deletions(-) create mode 100644 tests/test_date_search_range.py diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 56852640c..217acdbcd 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -17,11 +17,14 @@ from datashuttle.utils.custom_types import TopLevelFolder import glob +import re +from datetime import datetime from pathlib import Path from datashuttle.configs import canonical_folders, canonical_tags from datashuttle.utils import ssh, utils, validation from datashuttle.utils.custom_exceptions import NeuroBlueprintError +from datashuttle.utils.utils import get_values_from_bids_formatted_name # ----------------------------------------------------------------------------- # Create Folders @@ -401,27 +404,65 @@ def search_for_wildcards( """ new_all_names: List[str] = [] for name in all_names: - if canonical_tags.tags("*") in name: - name = name.replace(canonical_tags.tags("*"), "*") - - matching_names: List[str] + if canonical_tags.tags("*") in name or "@DATETO@" in name: + search_str = name.replace(canonical_tags.tags("*"), "*") + # If a date-range tag is present, extract dates and update the search string. 
+ if "@DATETO@" in name: + m = re.search(r"(\d{8})@DATETO@(\d{8})", name) + if not m: + raise ValueError( + "Invalid date range format in name: " + name + ) + start_str, end_str = m.groups() + try: + start_date = datetime.strptime(start_str, "%Y%m%d") + end_date = datetime.strptime(end_str, "%Y%m%d") + except ValueError as e: + raise ValueError("Invalid date in date range: " + str(e)) + # Replace the date-range substring with "date-*" + search_str = re.sub(r"\d{8}@DATETO@\d{8}", "date-*", name) + # Use the helper function to perform the glob search. if sub: - matching_names = search_sub_or_ses_level( # type: ignore - cfg, base_folder, local_or_central, sub, search_str=name + matching_names: List[str] = search_sub_or_ses_level( + cfg, + base_folder, + local_or_central, + sub, + search_str=search_str, )[0] else: - matching_names = search_sub_or_ses_level( # type: ignore - cfg, base_folder, local_or_central, search_str=name + matching_names = search_sub_or_ses_level( + cfg, base_folder, local_or_central, search_str=search_str )[0] - + # If a date-range tag was provided, further filter the results. + if "@DATETO@" in name: + filtered_names: List[str] = [] + for candidate in matching_names: + candidate_basename = ( + candidate + if isinstance(candidate, str) + else candidate.name + ) + values_list = get_values_from_bids_formatted_name( + [candidate_basename], "date" + ) + if not values_list: + continue + candidate_date_str = values_list[0] + try: + candidate_date = datetime.strptime( + candidate_date_str, "%Y%m%d" + ) + except ValueError: + continue + if start_date <= candidate_date <= end_date: + filtered_names.append(candidate) + matching_names = filtered_names new_all_names += matching_names else: new_all_names += [name] - - new_all_names = list( - set(new_all_names) - ) # remove duplicate names in case of wildcard overlap - + # Remove duplicates in case of wildcard overlap. 
+ new_all_names = list(set(new_all_names)) return new_all_names @@ -440,7 +481,7 @@ def search_sub_or_ses_level( search_str: str = "*", verbose: bool = True, return_full_path: bool = False, -) -> Tuple[List[str] | List[Path], List[str]]: +) -> Tuple[Union[List[str], List[Path]], List[str]]: """ Search project folder at the subject or session level. Only returns folders diff --git a/datashuttle/utils/validation.py b/datashuttle/utils/validation.py index e85d757da..3dc116116 100644 --- a/datashuttle/utils/validation.py +++ b/datashuttle/utils/validation.py @@ -321,7 +321,7 @@ def replace_tags_in_regexp(regexp: str) -> str: Note `replace_date_time_tags_in_name()` operates in place on a list. """ regexp_list = [regexp] - date_regexp = "\d\d\d\d\d\d\d\d" + date_regexp = r"\d{8}" time_regexp = "\d\d\d\d\d\d" formatting.replace_date_time_tags_in_name( diff --git a/tests/test_date_search_range.py b/tests/test_date_search_range.py new file mode 100644 index 000000000..c51381598 --- /dev/null +++ b/tests/test_date_search_range.py @@ -0,0 +1,134 @@ +import glob +import os +import re +import shutil +import tempfile +from pathlib import Path +from typing import List + +import pytest + +from datashuttle.utils.folders import search_for_wildcards + + +# Dummy implementation for canonical_tags +class DummyCanonicalTags: + @staticmethod + def tags(x: str) -> str: + if x == "*": + return "@*@" + return x + + +# Patch canonical_tags so that tags("*") returns "@*@" +@pytest.fixture(autouse=True) +def patch_canonical_tags(monkeypatch): + from datashuttle.configs import canonical_tags + + monkeypatch.setattr(canonical_tags, "tags", DummyCanonicalTags.tags) + + +# Dummy implementation for search_sub_or_ses_level that simply performs globbing. 
+def dummy_search_sub_or_ses_level( + cfg, base_folder: Path, local_or_central: str, *args, search_str: str +): + pattern = os.path.join(str(base_folder), search_str) + matches: List[str] = sorted(glob.glob(pattern)) + return (matches,) + + +# Patch search_sub_or_ses_level in the module where search_for_wildcards is defined. +@pytest.fixture(autouse=True) +def patch_search_sub_or_ses_level(monkeypatch): + monkeypatch.setattr( + "datashuttle.utils.folders.search_sub_or_ses_level", + dummy_search_sub_or_ses_level, + ) + + +# Dummy implementation for get_values_from_bids_formatted_name. +def dummy_get_values_from_bids_formatted_name(name: str, key: str) -> dict: + # Expect name format: "sub-01_date-YYYYMMDD" + m = re.search(r"date-(\d{8})", name) + if m: + return {key: m.group(1)} + return {} + + +# Patch get_values_from_bids_formatted_name. +@pytest.fixture(autouse=True) +def patch_get_values_from_bids(monkeypatch): + monkeypatch.setattr( + "datashuttle.utils.utils.get_values_from_bids_formatted_name", + dummy_get_values_from_bids_formatted_name, + ) + + +# Fixture to create a temporary directory with a simulated folder structure. +@pytest.fixture +def temp_project_dir() -> Path: # type: ignore + temp_dir = Path(tempfile.mkdtemp()) + # Create folders with names in the format "sub-01_date-YYYYMMDD" + folder_dates = [ + "20250305", + "20250306", + "20250307", + "20250308", + "20250309", + "20250310", + ] + for date_str in folder_dates: + folder_name = f"sub-01_date-{date_str}" + os.mkdir(temp_dir / folder_name) + yield temp_dir + shutil.rmtree(temp_dir) + + +def test_date_range_wildcard(temp_project_dir: Path): + """ + When given a date-range wildcard pattern like "sub-01_20250306@DATETO@20250309", + only folders whose embedded date falls between 20250306 and 20250309 (inclusive) + should be returned. 
+ """ + + class Configs: + pass + + cfg = Configs() + base_folder = temp_project_dir + local_or_central = "local" + pattern = "sub-01_20250306@DATETO@20250309" + result = search_for_wildcards( + cfg, base_folder, local_or_central, [pattern] + ) + + # Extract the dates from the returned folder names. + found_dates = set() + for folder in result: + basename = os.path.basename(folder) + m = re.search(r"date-(\d{8})", basename) + if m: + found_dates.add(m.group(1)) + + expected_dates = {"20250306", "20250307", "20250308", "20250309"} + assert found_dates == expected_dates + + +def test_simple_wildcard(temp_project_dir: Path): + """ + When given a simple wildcard pattern like "sub-01_@*@", + all folders should be returned. + """ + + class Configs: + pass + + cfg = Configs() + base_folder = temp_project_dir + local_or_central = "local" + pattern = "sub-01_@*@" + result = search_for_wildcards( + cfg, base_folder, local_or_central, [pattern] + ) + # We expect six folders. + assert len(result) == 6 From 054a98b6ce1068b74e1e26b77eaef3ccb0425eae Mon Sep 17 00:00:00 2001 From: Diya910 Date: Sun, 15 Jun 2025 20:38:13 +0530 Subject: [PATCH 02/36] Refactoring changes asked by the maintainer which include centralisation of code my making functions in validation.py and using in search_with_tags feature in folders file --- datashuttle/configs/canonical_tags.py | 35 ++++++ datashuttle/utils/data_transfer.py | 2 +- datashuttle/utils/folders.py | 161 ++++++++++++++++---------- datashuttle/utils/validation.py | 128 +++++++++++++++++--- tests/test_date_search_range.py | 138 ++++++++++++++++------ 5 files changed, 354 insertions(+), 110 deletions(-) diff --git a/datashuttle/configs/canonical_tags.py b/datashuttle/configs/canonical_tags.py index 233350bc6..8cfc7e8b9 100644 --- a/datashuttle/configs/canonical_tags.py +++ b/datashuttle/configs/canonical_tags.py @@ -11,5 +11,40 @@ def tags(tag_name: str) -> str: "datetime": "@DATETIME@", "to": "@TO@", "*": "@*@", + "DATETO": "@DATETO@", + 
"TIMETO": "@TIMETO@", + "DATETIMETO": "@DATETIMETO@", } return tags[tag_name] + + +_DATETIME_FORMATS = { + "datetime": "%Y%m%dT%H%M%S", + "time": "%H%M%S", + "date": "%Y%m%d", +} + + +def get_datetime_format(format_type: str) -> str: + """ + Get the datetime format string for a given format type. + + Parameters + ---------- + format_type : str + One of "datetime", "time", or "date" + + Returns + ------- + str + The format string for the specified type + + Raises + ------ + ValueError + If format_type is not one of the supported types + """ + if format_type not in _DATETIME_FORMATS: + raise ValueError(f"Invalid format type: {format_type}. Must be one of {list(_DATETIME_FORMATS.keys())}") + return _DATETIME_FORMATS[format_type] + diff --git a/datashuttle/utils/data_transfer.py b/datashuttle/utils/data_transfer.py index 21121b8e6..d2a5efbc1 100644 --- a/datashuttle/utils/data_transfer.py +++ b/datashuttle/utils/data_transfer.py @@ -462,7 +462,7 @@ def get_processed_names( processed_names = formatting.check_and_format_names( names_checked, prefix ) - processed_names = folders.search_for_wildcards( + processed_names = folders.search_with_tags( self.__cfg, self.__base_folder, self.__local_or_central, diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 217acdbcd..809a50f0c 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -362,7 +362,49 @@ def process_glob_to_find_datatype_folders( # ----------------------------------------------------------------------------- -def search_for_wildcards( +def filter_names_by_datetime_range( + names: List[str], + format_type: str, + start_timepoint: datetime, + end_timepoint: datetime, +) -> List[str]: + """ + Filter a list of names based on a datetime range. + Assumes all names contain the format_type pattern (e.g., date-*, time-*) + as they were searched using this pattern. 
+ + Parameters + ---------- + names : List[str] + List of names to filter, all containing the datetime pattern + format_type : str + One of "datetime", "time", or "date" + start_timepoint : datetime + Start of the datetime range + end_timepoint : datetime + End of the datetime range + + Returns + ------- + List[str] + Filtered list of names that fall within the datetime range + """ + filtered_names: List[str] = [] + for candidate in names: + candidate_basename = candidate if isinstance(candidate, str) else candidate.name + value = get_values_from_bids_formatted_name([candidate_basename], format_type)[0] + try: + candidate_timepoint = datetime.strptime( + value, canonical_tags.get_datetime_format(format_type) + ) + if start_timepoint <= candidate_timepoint <= end_timepoint: + filtered_names.append(candidate) + except ValueError: + continue + return filtered_names + + +def search_with_tags( cfg: Configs, base_folder: Path, local_or_central: str, @@ -400,68 +442,69 @@ def search_for_wildcards( sub : optional subject to search for sessions in. If not provided, will search for subjects rather than sessions. - """ new_all_names: List[str] = [] for name in all_names: - if canonical_tags.tags("*") in name or "@DATETO@" in name: - search_str = name.replace(canonical_tags.tags("*"), "*") - # If a date-range tag is present, extract dates and update the search string. - if "@DATETO@" in name: - m = re.search(r"(\d{8})@DATETO@(\d{8})", name) - if not m: - raise ValueError( - "Invalid date range format in name: " + name - ) - start_str, end_str = m.groups() - try: - start_date = datetime.strptime(start_str, "%Y%m%d") - end_date = datetime.strptime(end_str, "%Y%m%d") - except ValueError as e: - raise ValueError("Invalid date in date range: " + str(e)) - # Replace the date-range substring with "date-*" - search_str = re.sub(r"\d{8}@DATETO@\d{8}", "date-*", name) - # Use the helper function to perform the glob search. 
- if sub: - matching_names: List[str] = search_sub_or_ses_level( - cfg, - base_folder, - local_or_central, - sub, - search_str=search_str, - )[0] - else: - matching_names = search_sub_or_ses_level( - cfg, base_folder, local_or_central, search_str=search_str - )[0] - # If a date-range tag was provided, further filter the results. - if "@DATETO@" in name: - filtered_names: List[str] = [] - for candidate in matching_names: - candidate_basename = ( - candidate - if isinstance(candidate, str) - else candidate.name - ) - values_list = get_values_from_bids_formatted_name( - [candidate_basename], "date" - ) - if not values_list: - continue - candidate_date_str = values_list[0] - try: - candidate_date = datetime.strptime( - candidate_date_str, "%Y%m%d" - ) - except ValueError: - continue - if start_date <= candidate_date <= end_date: - filtered_names.append(candidate) - matching_names = filtered_names - new_all_names += matching_names + if not (canonical_tags.tags("*") in name or + canonical_tags.tags("DATETO") in name or + canonical_tags.tags("TIMETO") in name or + canonical_tags.tags("DATETIMETO") in name): + new_all_names.append(name) + continue + + # Initialize search string + search_str = name + + # Handle wildcard replacement first if present + if canonical_tags.tags("*") in name: + search_str = search_str.replace(canonical_tags.tags("*"), "*") + + # Handle datetime ranges + format_type = tag = None + if canonical_tags.tags("DATETO") in search_str: + format_type = "date" + tag = canonical_tags.tags("DATETO") + elif canonical_tags.tags("TIMETO") in search_str: + format_type = "time" + tag = canonical_tags.tags("TIMETO") + elif canonical_tags.tags("DATETIMETO") in search_str: + format_type = "datetime" + tag = canonical_tags.tags("DATETIMETO") + + if format_type is not None: + assert tag is not None, "format and tag should be set together" + search_str = validation.format_and_validate_datetime_search_str(search_str, format_type, tag) + + # Use the helper function to 
perform the glob search + if sub: + matching_names: List[str] = search_sub_or_ses_level( + cfg, + base_folder, + local_or_central, + sub, + search_str=search_str, + )[0] else: - new_all_names += [name] - # Remove duplicates in case of wildcard overlap. + matching_names = search_sub_or_ses_level( + cfg, base_folder, local_or_central, search_str=search_str + )[0] + + # Filter results by datetime range if one was present + if format_type is not None and tag is not None: + expected_values = validation.get_expected_num_datetime_values(format_type) + full_tag_regex = fr"(\d{{{expected_values}}}){re.escape(tag)}(\d{{{expected_values}}})" + match = re.search(full_tag_regex, name) + if match: # We know this is true because format_and_validate_datetime_search_str succeeded + start_str, end_str = match.groups() + start_timepoint = datetime.strptime(start_str, canonical_tags.get_datetime_format(format_type)) + end_timepoint = datetime.strptime(end_str, canonical_tags.get_datetime_format(format_type)) + matching_names = filter_names_by_datetime_range( + matching_names, format_type, start_timepoint, end_timepoint + ) + + new_all_names.extend(matching_names) + + # Remove duplicates in case of wildcard overlap new_all_names = list(set(new_all_names)) return new_all_names diff --git a/datashuttle/utils/validation.py b/datashuttle/utils/validation.py index 3dc116116..5025dd6c6 100644 --- a/datashuttle/utils/validation.py +++ b/datashuttle/utils/validation.py @@ -24,7 +24,7 @@ from itertools import chain from pathlib import Path -from datashuttle.configs import canonical_configs, canonical_folders +from datashuttle.configs import canonical_configs, canonical_folders, canonical_tags from datashuttle.utils import formatting, getters, utils from datashuttle.utils.custom_exceptions import NeuroBlueprintError @@ -432,18 +432,11 @@ def datetime_are_iso_format( """ Check formatting for date-, time-, or datetime- tags. 
""" - formats = { - "datetime": "%Y%m%dT%H%M%S", - "time": "%H%M%S", - "date": "%Y%m%d", - } - - key = next((key for key in formats if key in name), None) + key = next((key for key in ["datetime", "time", "date"] if key in name), None) error_message: List[str] if not key: error_message = [] - else: try: format_to_check = utils.get_values_from_bids_formatted_name( @@ -452,17 +445,122 @@ def datetime_are_iso_format( except: return [] - strfmt = formats[key] - try: - datetime.strptime(format_to_check, strfmt) - error_message = [] + if not validate_datetime(format_to_check, key): + error_message = [get_datetime_error( + key, name, canonical_tags.get_datetime_format(key), path_ + )] + else: + error_message = [] except ValueError: - error_message = [get_datetime_error(key, name, strfmt, path_)] + error_message = [get_datetime_error( + key, name, canonical_tags.get_datetime_format(key), path_ + )] return error_message +def validate_datetime(datetime_str: str, format_type: str) -> bool: + """ + Validate that a datetime string matches the expected format. + + Parameters + ---------- + datetime_str : str + The datetime string to validate + format_type : str + One of "datetime", "time", or "date" + + Returns + ------- + bool + True if valid, False otherwise + """ + try: + datetime.strptime(datetime_str, canonical_tags.get_datetime_format(format_type)) + return True + except ValueError: + return False + + +def get_expected_num_datetime_values(format_type: str) -> int: + """ + Get the expected number of characters for a datetime format. 
+ + Parameters + ---------- + format_type : str + One of "datetime", "time", or "date" + + Returns + ------- + int + The number of characters expected for the format + """ + format_str = canonical_tags.get_datetime_format(format_type) + today = datetime.now() + return len(today.strftime(format_str)) + + +def format_and_validate_datetime_search_str(search_str: str, format_type: str, tag: str) -> str: + """ + Validate and format a search string containing a datetime range. + + Parameters + ---------- + search_str : str + The search string containing the datetime range + format_type : str + One of "datetime", "time", or "date" + tag : str + The tag used for the range (e.g. @DATETO@) + + Returns + ------- + str + The formatted search string with datetime range replaced + + Raises + ------ + NeuroBlueprintError + If the datetime format is invalid or the range is malformed + """ + expected_values = get_expected_num_datetime_values(format_type) + full_tag_regex = fr"(\d{{{expected_values}}}){re.escape(tag)}(\d{{{expected_values}}})" + match = re.search(full_tag_regex, search_str) + + if not match: + utils.log_and_raise_error( + f"Invalid {format_type} range format in search string: {search_str}", + NeuroBlueprintError, + ) + + start_str, end_str = match.groups() + + if not validate_datetime(start_str, format_type): + utils.log_and_raise_error( + f"Invalid start {format_type} format: {start_str}", + NeuroBlueprintError, + ) + + if not validate_datetime(end_str, format_type): + utils.log_and_raise_error( + f"Invalid end {format_type} format: {end_str}", + NeuroBlueprintError, + ) + + start_timepoint = datetime.strptime(start_str, canonical_tags.get_datetime_format(format_type)) + end_timepoint = datetime.strptime(end_str, canonical_tags.get_datetime_format(format_type)) + + if end_timepoint < start_timepoint: + utils.log_and_raise_error( + f"End {format_type} is before start {format_type}", + NeuroBlueprintError, + ) + + return re.sub(full_tag_regex, f"{format_type}-*", 
search_str) + + def raise_display_mode( message: str, display_mode: DisplayMode, log: bool ) -> None: @@ -981,3 +1079,5 @@ def check_datatypes_are_valid( return message return None + + diff --git a/tests/test_date_search_range.py b/tests/test_date_search_range.py index c51381598..d32d61d01 100644 --- a/tests/test_date_search_range.py +++ b/tests/test_date_search_range.py @@ -3,65 +3,79 @@ import re import shutil import tempfile +from datetime import datetime from pathlib import Path from typing import List import pytest -from datashuttle.utils.folders import search_for_wildcards +from datashuttle.utils.folders import search_with_tags # Dummy implementation for canonical_tags class DummyCanonicalTags: @staticmethod def tags(x: str) -> str: - if x == "*": - return "@*@" - return x + tags_dict = { + "*": "@*@", + "DATETO": "@DATETO@", + "TIMETO": "@TIMETO@", + "DATETIMETO": "@DATETIMETO@" + } + return tags_dict.get(x, x) - -# Patch canonical_tags so that tags("*") returns "@*@" + @staticmethod + def get_datetime_format(format_type: str) -> str: + formats = { + "datetime": "%Y%m%dT%H%M%S", + "time": "%H%M%S", + "date": "%Y%m%d", + } + if format_type not in formats: + raise ValueError(f"Invalid format type: {format_type}") + return formats[format_type] + + +# Patch canonical_tags @pytest.fixture(autouse=True) def patch_canonical_tags(monkeypatch): from datashuttle.configs import canonical_tags - monkeypatch.setattr(canonical_tags, "tags", DummyCanonicalTags.tags) + monkeypatch.setattr(canonical_tags, "get_datetime_format", DummyCanonicalTags.get_datetime_format) # Dummy implementation for search_sub_or_ses_level that simply performs globbing. 
def dummy_search_sub_or_ses_level( - cfg, base_folder: Path, local_or_central: str, *args, search_str: str + cfg, base_folder: Path, local_or_central: str, *args, search_str: str = "*" ): pattern = os.path.join(str(base_folder), search_str) matches: List[str] = sorted(glob.glob(pattern)) - return (matches,) + return (matches, []) -# Patch search_sub_or_ses_level in the module where search_for_wildcards is defined. +# Patch search_sub_or_ses_level in the module where search_with_tags is defined. @pytest.fixture(autouse=True) def patch_search_sub_or_ses_level(monkeypatch): - monkeypatch.setattr( - "datashuttle.utils.folders.search_sub_or_ses_level", - dummy_search_sub_or_ses_level, - ) + from datashuttle.utils import folders + monkeypatch.setattr(folders, "search_sub_or_ses_level", dummy_search_sub_or_ses_level) -# Dummy implementation for get_values_from_bids_formatted_name. -def dummy_get_values_from_bids_formatted_name(name: str, key: str) -> dict: - # Expect name format: "sub-01_date-YYYYMMDD" - m = re.search(r"date-(\d{8})", name) - if m: - return {key: m.group(1)} - return {} +# Dummy implementation for get_values_from_bids_formatted_name +def dummy_get_values_from_bids_formatted_name(names: List[str], key: str, return_as_int: bool = False) -> List[str]: + results = [] + for name in names: + if key == "date": + m = re.search(r"date-(\d{8})", name) + if m: + results.append(m.group(1)) + return results -# Patch get_values_from_bids_formatted_name. +# Patch get_values_from_bids_formatted_name @pytest.fixture(autouse=True) def patch_get_values_from_bids(monkeypatch): - monkeypatch.setattr( - "datashuttle.utils.utils.get_values_from_bids_formatted_name", - dummy_get_values_from_bids_formatted_name, - ) + from datashuttle.utils import utils + monkeypatch.setattr(utils, "get_values_from_bids_formatted_name", dummy_get_values_from_bids_formatted_name) # Fixture to create a temporary directory with a simulated folder structure. 
@@ -90,7 +104,6 @@ def test_date_range_wildcard(temp_project_dir: Path): only folders whose embedded date falls between 20250306 and 20250309 (inclusive) should be returned. """ - class Configs: pass @@ -98,11 +111,9 @@ class Configs: base_folder = temp_project_dir local_or_central = "local" pattern = "sub-01_20250306@DATETO@20250309" - result = search_for_wildcards( - cfg, base_folder, local_or_central, [pattern] - ) + result = search_with_tags(cfg, base_folder, local_or_central, [pattern]) - # Extract the dates from the returned folder names. + # Extract the dates from the returned folder names found_dates = set() for folder in result: basename = os.path.basename(folder) @@ -119,7 +130,6 @@ def test_simple_wildcard(temp_project_dir: Path): When given a simple wildcard pattern like "sub-01_@*@", all folders should be returned. """ - class Configs: pass @@ -127,8 +137,64 @@ class Configs: base_folder = temp_project_dir local_or_central = "local" pattern = "sub-01_@*@" - result = search_for_wildcards( - cfg, base_folder, local_or_central, [pattern] - ) - # We expect six folders. + result = search_with_tags(cfg, base_folder, local_or_central, [pattern]) + # We expect six folders (20250305 through 20250310) assert len(result) == 6 + + +def test_invalid_date_range(temp_project_dir: Path): + """ + Test that invalid date ranges raise appropriate errors. 
+ """ + class Configs: + pass + + cfg = Configs() + base_folder = temp_project_dir + local_or_central = "local" + + # Test end date before start date + with pytest.raises(Exception) as exc_info: + pattern = "sub-01_20250309@DATETO@20250306" + search_with_tags(cfg, base_folder, local_or_central, [pattern]) + assert "before start" in str(exc_info.value) + + # Test invalid date format + with pytest.raises(Exception) as exc_info: + pattern = "sub-01_2025030@DATETO@20250306" # Missing digit + search_with_tags(cfg, base_folder, local_or_central, [pattern]) + assert "Invalid" in str(exc_info.value) + + +def test_combined_wildcards(temp_project_dir: Path): + """ + Test that wildcard and date range can be combined in the same pattern. + """ + class Configs: + pass + + cfg = Configs() + base_folder = temp_project_dir + local_or_central = "local" + + # Create some additional test folders with different subject numbers + for sub in ["02", "03"]: + for date in ["20250307", "20250308"]: + folder_name = f"sub-{sub}_date-{date}" + os.mkdir(temp_project_dir / folder_name) + + pattern = "sub-*_20250307@DATETO@20250308" + result = search_with_tags(cfg, base_folder, local_or_central, [pattern]) + + # Should match all subjects but only dates within range + matched_folders = set(os.path.basename(f) for f in result) + expected_folders = { + "sub-01_date-20250307", + "sub-01_date-20250308", + "sub-02_date-20250307", + "sub-02_date-20250308", + "sub-03_date-20250307", + "sub-03_date-20250308", + } + assert matched_folders == expected_folders + From 69699d974d2ada1a665bab9765d36a38935aaa30 Mon Sep 17 00:00:00 2001 From: Diya910 Date: Wed, 2 Jul 2025 22:40:06 +0530 Subject: [PATCH 03/36] Fabrication of the code moved functions in folders.py --- datashuttle/configs/canonical_tags.py | 33 ++- datashuttle/utils/folders.py | 392 +++++++++++++++++++------- datashuttle/utils/validation.py | 106 +------ 3 files changed, 324 insertions(+), 207 deletions(-) diff --git 
a/datashuttle/configs/canonical_tags.py b/datashuttle/configs/canonical_tags.py index 8cfc7e8b9..d36e5d781 100644 --- a/datashuttle/configs/canonical_tags.py +++ b/datashuttle/configs/canonical_tags.py @@ -18,16 +18,25 @@ def tags(tag_name: str) -> str: return tags[tag_name] -_DATETIME_FORMATS = { - "datetime": "%Y%m%dT%H%M%S", - "time": "%H%M%S", - "date": "%Y%m%d", -} +def get_datetime_formats() -> dict: + """ + Get all datetime format strings. + + Returns + ------- + dict + A dictionary containing format strings for datetime, time, and date + """ + return { + "datetime": "%Y%m%dT%H%M%S", + "time": "%H%M%S", + "date": "%Y%m%d", + } def get_datetime_format(format_type: str) -> str: """ - Get the datetime format string for a given format type. + Get the datetime format string for a specific format type. Parameters ---------- @@ -37,14 +46,8 @@ def get_datetime_format(format_type: str) -> str: Returns ------- str - The format string for the specified type - - Raises - ------ - ValueError - If format_type is not one of the supported types + The format string for the specified format type """ - if format_type not in _DATETIME_FORMATS: - raise ValueError(f"Invalid format type: {format_type}. 
Must be one of {list(_DATETIME_FORMATS.keys())}") - return _DATETIME_FORMATS[format_type] + formats = get_datetime_formats() + return formats[format_type] diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 809a50f0c..3353cf022 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -65,6 +65,12 @@ def create_folder_trees( "datatype is deprecated in 0.6.0" ) + # Initialize all_paths with required keys + all_paths = { + "sub": [], + "ses": [], + } + if datatype_passed: error_message = validation.check_datatypes_are_valid( datatype, allow_all=True @@ -72,13 +78,6 @@ def create_folder_trees( if error_message: utils.log_and_raise_error(error_message, NeuroBlueprintError) - all_paths: Dict = {} - else: - all_paths = { - "sub": [], - "ses": [], - } - for sub in sub_names: sub_path = cfg.build_project_path( "local", @@ -358,10 +357,135 @@ def process_glob_to_find_datatype_folders( return zip(ses_folder_keys, ses_folder_values) +# ----------------------------------------------------------------------------- # Wildcards # ----------------------------------------------------------------------------- +def search_with_tags( + cfg: Configs, + base_folder: Path, + local_or_central: str, + all_names: List[str], + sub: Optional[str] = None, +) -> List[str]: + """ + Handle wildcard and datetime range searching in names during upload or download. + + There are two types of special patterns that can be used in names: + 1. Wildcards: Names containing @*@ will be replaced with "*" for glob pattern matching + 2. 
Datetime ranges: Names containing @DATETO@, @TIMETO@, or @DATETIMETO@ will be used + to filter folders within a specific datetime range + + For datetime ranges, the format must be: + - date: YYYYMMDD@DATETO@YYYYMMDD (e.g., "20240101@DATETO@20241231") + - time: HHMMSS@TIMETO@HHMMSS (e.g., "000000@TIMETO@235959") + - datetime: YYYYMMDDTHHMMss@DATETIMETO@YYYYMMDDTHHMMss + + Parameters + ---------- + cfg : Configs + datashuttle project configuration + base_folder : Path + folder to search for wildcards in + local_or_central : str + "local" or "central" project path to search in + all_names : List[str] + list of names that may contain wildcards or datetime ranges. If sub is + passed, these are treated as session names. If sub is None, they are + treated as subject names + sub : Optional[str] + optional subject to search for sessions in. If not provided, + will search for subjects rather than sessions + + Returns + ------- + List[str] + A list of matched folder names after wildcard expansion and datetime filtering. + For datetime ranges, only folders with timestamps within the specified range + will be included. 
+ + Examples + -------- + Wildcards: + >>> search_with_tags(cfg, path, "local", ["sub-@*@"]) + ["sub-001", "sub-002", "sub-003"] + + Date range: + >>> search_with_tags(cfg, path, "local", ["sub-001_20240101@DATETO@20241231_id-*"]) + ["sub-001_20240315_id-1", "sub-001_20240401_id-2"] + + Time range: + >>> search_with_tags(cfg, path, "local", ["sub-002_000000@TIMETO@120000"]) + ["sub-002_083000", "sub-002_113000"] + """ + new_all_names: List[str] = [] + for name in all_names: + if not (canonical_tags.tags("*") in name or + canonical_tags.tags("DATETO") in name or + canonical_tags.tags("TIMETO") in name or + canonical_tags.tags("DATETIMETO") in name): + # If no special tags, just add the name as is + if "_date-" in name or "_time-" in name or "_datetime-" in name: + # For simple date/time formatted names, add them directly + new_all_names.append(name) + else: + # For regular names, just append them + new_all_names.append(name) + continue + + # Handle wildcard replacement first if present + search_str = name + if canonical_tags.tags("*") in name: + search_str = search_str.replace(canonical_tags.tags("*"), "*") + + # Handle datetime ranges + format_type = None + tag = None + if (tag := canonical_tags.tags("DATETO")) in search_str: + format_type = "date" + elif (tag := canonical_tags.tags("TIMETO")) in search_str: + format_type = "time" + elif (tag := canonical_tags.tags("DATETIMETO")) in search_str: + format_type = "datetime" + + if format_type is not None: + assert tag is not None + search_str = format_and_validate_datetime_search_str(search_str, format_type, tag) + + # Use the helper function to perform the glob search + if sub: + matching_names = search_sub_or_ses_level( + cfg, base_folder, local_or_central, sub, search_str=search_str + )[0] + else: + matching_names = search_sub_or_ses_level( + cfg, base_folder, local_or_central, search_str=search_str + )[0] + + # Filter results by datetime range + start_timepoint, end_timepoint = 
strip_start_end_date_from_datetime_tag( + name, format_type, tag + ) + matching_names = filter_names_by_datetime_range( + matching_names, format_type, start_timepoint, end_timepoint + ) + new_all_names.extend(matching_names) + else: + # No datetime range, just perform the glob search with wildcards + if sub: + matching_names = search_sub_or_ses_level( + cfg, base_folder, local_or_central, sub, search_str=search_str + )[0] + else: + matching_names = search_sub_or_ses_level( + cfg, base_folder, local_or_central, search_str=search_str + )[0] + new_all_names.extend(matching_names) + + return list(set(new_all_names)) # Remove duplicates + + def filter_names_by_datetime_range( names: List[str], format_type: str, @@ -388,125 +512,197 @@ def filter_names_by_datetime_range( ------- List[str] Filtered list of names that fall within the datetime range + + Raises + ------ + ValueError + If any datetime value does not match the expected ISO format """ filtered_names: List[str] = [] for candidate in names: candidate_basename = candidate if isinstance(candidate, str) else candidate.name value = get_values_from_bids_formatted_name([candidate_basename], format_type)[0] + try: - candidate_timepoint = datetime.strptime( - value, canonical_tags.get_datetime_format(format_type) - ) - if start_timepoint <= candidate_timepoint <= end_timepoint: - filtered_names.append(candidate) + candidate_timepoint = datetime_object_from_string(value, format_type) except ValueError: - continue + utils.log_and_raise_error( + f"Invalid {format_type} format in name {candidate_basename}. 
" + f"Expected ISO format: {canonical_tags.get_datetime_format(format_type)}", + ValueError, + ) + + if start_timepoint <= candidate_timepoint <= end_timepoint: + filtered_names.append(candidate) + return filtered_names -def search_with_tags( - cfg: Configs, - base_folder: Path, - local_or_central: str, - all_names: List[str], - sub: Optional[str] = None, -) -> List[str]: +# ----------------------------------------------------------------------------- +# Datetime Tag Functions +# ----------------------------------------------------------------------------- + + +def get_expected_datetime_len(format_type: str) -> int: """ - Handle wildcard flag in upload or download. + Get the expected length of characters for a datetime format. - All names in name are searched for @*@ string, and replaced - with single * for glob syntax. If sub is passed, it is - assumes all_names is ses_names and the sub folder is searched - for ses_names matching the name including wildcard. Otherwise, - if sub is None it is assumed all_names are sub names and - the level above is searched. + Parameters + ---------- + format_type : str + One of "datetime", "time", or "date" - Outputs a new list of names including all original names - but where @*@-containing names have been replaced with - search results. + Returns + ------- + int + The number of characters expected for the format + """ + format_str = canonical_tags.get_datetime_format(format_type) + today = datetime.now() + return len(today.strftime(format_str)) + + +def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, str] | None: + """ + Find and extract datetime values from a name using a regex pattern. Parameters ---------- + name : str + The name containing the datetime range + e.g. "sub-001_20240101@DATETO@20250101_id-*" + format_type : str + One of "datetime", "time", or "date" + tag : str + The tag used for the range (e.g. 
@DATETO@) - project : initialised datashuttle project + Returns + ------- + tuple[str, str] | None + A tuple containing (start_datetime_str, end_datetime_str) if found, + None if no match is found + """ + expected_len = get_expected_datetime_len(format_type) + full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + match = re.search(full_tag_regex, name) + return match.groups() if match else None - base_folder : folder to search for wildcards in - local_or_central : "local" or "central" project path to - search in +def strip_start_end_date_from_datetime_tag( + search_str: str, format_type: str, tag: str +) -> tuple[datetime, datetime]: + """ + Extract and validate start and end datetime values from a search string. - all_names : list of subject or session names that - may or may not include the wildcard flag. If sub (below) - is passed, it is assumed these are session names. Otherwise, - it is assumed these are subject names. + Parameters + ---------- + search_str : str + The search string containing the datetime range + e.g. "sub-001_20240101T000000@DATETIMETO@20250101T235959" + format_type : str + One of "datetime", "time", or "date" + tag : str + The tag used for the range (e.g. @DATETIMETO@) - sub : optional subject to search for sessions in. If not provided, - will search for subjects rather than sessions. 
+ Returns + ------- + tuple[datetime, datetime] + A tuple containing (start_timepoint, end_timepoint) + + Raises + ------ + NeuroBlueprintError + If the datetime format is invalid, the range is malformed, + or end datetime is before start datetime """ - new_all_names: List[str] = [] - for name in all_names: - if not (canonical_tags.tags("*") in name or - canonical_tags.tags("DATETO") in name or - canonical_tags.tags("TIMETO") in name or - canonical_tags.tags("DATETIMETO") in name): - new_all_names.append(name) - continue + expected_len = get_expected_datetime_len(format_type) + full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + match = re.search(full_tag_regex, search_str) - # Initialize search string - search_str = name + if not match: + utils.log_and_raise_error( + f"Invalid {format_type} range format in search string: {search_str}. Ensure the format matches the expected pattern: {canonical_tags.get_datetime_format(format_type)}.", + NeuroBlueprintError, + ) - # Handle wildcard replacement first if present - if canonical_tags.tags("*") in name: - search_str = search_str.replace(canonical_tags.tags("*"), "*") + start_str, end_str = match.groups() - # Handle datetime ranges - format_type = tag = None - if canonical_tags.tags("DATETO") in search_str: - format_type = "date" - tag = canonical_tags.tags("DATETO") - elif canonical_tags.tags("TIMETO") in search_str: - format_type = "time" - tag = canonical_tags.tags("TIMETO") - elif canonical_tags.tags("DATETIMETO") in search_str: - format_type = "datetime" - tag = canonical_tags.tags("DATETIMETO") + try: + start_timepoint = datetime_object_from_string(start_str, format_type) + end_timepoint = datetime_object_from_string(end_str, format_type) + except ValueError as e: + utils.log_and_raise_error( + f"Invalid {format_type} format in search string: {search_str}. 
Error: {str(e)}", + NeuroBlueprintError, + ) - if format_type is not None: - assert tag is not None, "format and tag should be set together" - search_str = validation.format_and_validate_datetime_search_str(search_str, format_type, tag) - - # Use the helper function to perform the glob search - if sub: - matching_names: List[str] = search_sub_or_ses_level( - cfg, - base_folder, - local_or_central, - sub, - search_str=search_str, - )[0] - else: - matching_names = search_sub_or_ses_level( - cfg, base_folder, local_or_central, search_str=search_str - )[0] - - # Filter results by datetime range if one was present - if format_type is not None and tag is not None: - expected_values = validation.get_expected_num_datetime_values(format_type) - full_tag_regex = fr"(\d{{{expected_values}}}){re.escape(tag)}(\d{{{expected_values}}})" - match = re.search(full_tag_regex, name) - if match: # We know this is true because format_and_validate_datetime_search_str succeeded - start_str, end_str = match.groups() - start_timepoint = datetime.strptime(start_str, canonical_tags.get_datetime_format(format_type)) - end_timepoint = datetime.strptime(end_str, canonical_tags.get_datetime_format(format_type)) - matching_names = filter_names_by_datetime_range( - matching_names, format_type, start_timepoint, end_timepoint - ) + if end_timepoint < start_timepoint: + utils.log_and_raise_error( + f"End {format_type} is before start {format_type}. Ensure the end datetime is after the start datetime.", + NeuroBlueprintError, + ) - new_all_names.extend(matching_names) + return start_timepoint, end_timepoint - # Remove duplicates in case of wildcard overlap - new_all_names = list(set(new_all_names)) - return new_all_names + +def format_and_validate_datetime_search_str(search_str: str, format_type: str, tag: str) -> str: + """ + Validate and format a search string containing a datetime range. + + Parameters + ---------- + search_str : str + The search string containing the datetime range + e.g. 
"sub-001_20240101@DATETO@20250101_id-*" or "sub-002_000000@TIMETO@235959" + format_type : str + One of "datetime", "time", or "date" + tag : str + The tag used for the range (e.g. @DATETO@) + + Returns + ------- + str + The formatted search string with datetime range replaced + e.g. "sub-001_date-*_id-*" or "sub-002_time-*" + + Raises + ------ + NeuroBlueprintError + If the datetime format is invalid or the range is malformed + """ + # Extract and validate datetime range + strip_start_end_date_from_datetime_tag(search_str, format_type, tag) + + # Replace datetime range with wildcard pattern + expected_len = get_expected_datetime_len(format_type) + full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + return re.sub(full_tag_regex, f"{format_type}-*", search_str) + + +def datetime_object_from_string(datetime_string: str, format_type: str) -> datetime: + """ + Convert a datetime string to a datetime object using the appropriate format. + + Parameters + ---------- + datetime_string : str + The string to convert to a datetime object + format_type : str + One of "datetime", "time", or "date" + + Returns + ------- + datetime + The parsed datetime object + + Raises + ------ + ValueError + If the string cannot be parsed using the specified format + """ + return datetime.strptime( + datetime_string, canonical_tags.get_datetime_format(format_type) + ) # ----------------------------------------------------------------------------- diff --git a/datashuttle/utils/validation.py b/datashuttle/utils/validation.py index 5025dd6c6..f10f855be 100644 --- a/datashuttle/utils/validation.py +++ b/datashuttle/utils/validation.py @@ -322,7 +322,7 @@ def replace_tags_in_regexp(regexp: str) -> str: """ regexp_list = [regexp] date_regexp = r"\d{8}" - time_regexp = "\d\d\d\d\d\d" + time_regexp = r"\d{6}" formatting.replace_date_time_tags_in_name( regexp_list, @@ -361,7 +361,7 @@ def names_include_special_characters( def name_has_special_character(name: str) -> 
bool: - return not re.match("^[A-Za-z0-9_-]*$", name) + return not re.match(r"^[A-Za-z0-9_-]*$", name) def dashes_and_underscore_alternate_incorrectly( @@ -432,7 +432,8 @@ def datetime_are_iso_format( """ Check formatting for date-, time-, or datetime- tags. """ - key = next((key for key in ["datetime", "time", "date"] if key in name), None) + datetime_keys = list(canonical_tags.get_datetime_formats().keys()) + key = next((key for key in datetime_keys if key in name), None) error_message: List[str] if not key: @@ -445,24 +446,19 @@ def datetime_are_iso_format( except: return [] - try: - if not validate_datetime(format_to_check, key): - error_message = [get_datetime_error( - key, name, canonical_tags.get_datetime_format(key), path_ - )] - else: - error_message = [] - except ValueError: + if datetime_value_str_is_iso_format(format_to_check, key): + error_message = [] + else: error_message = [get_datetime_error( - key, name, canonical_tags.get_datetime_format(key), path_ + key, name, canonical_tags.get_datetime_formats()[key], path_ )] return error_message -def validate_datetime(datetime_str: str, format_type: str) -> bool: +def datetime_value_str_is_iso_format(datetime_str: str, format_type: str) -> bool: """ - Validate that a datetime string matches the expected format. + Validate that a datetime string matches the expected ISO format. Parameters ---------- @@ -474,93 +470,15 @@ def validate_datetime(datetime_str: str, format_type: str) -> bool: Returns ------- bool - True if valid, False otherwise + True if the string matches the ISO format, False otherwise """ try: - datetime.strptime(datetime_str, canonical_tags.get_datetime_format(format_type)) + datetime.strptime(datetime_str, canonical_tags.get_datetime_formats()[format_type]) return True except ValueError: return False -def get_expected_num_datetime_values(format_type: str) -> int: - """ - Get the expected number of characters for a datetime format. 
- - Parameters - ---------- - format_type : str - One of "datetime", "time", or "date" - - Returns - ------- - int - The number of characters expected for the format - """ - format_str = canonical_tags.get_datetime_format(format_type) - today = datetime.now() - return len(today.strftime(format_str)) - - -def format_and_validate_datetime_search_str(search_str: str, format_type: str, tag: str) -> str: - """ - Validate and format a search string containing a datetime range. - - Parameters - ---------- - search_str : str - The search string containing the datetime range - format_type : str - One of "datetime", "time", or "date" - tag : str - The tag used for the range (e.g. @DATETO@) - - Returns - ------- - str - The formatted search string with datetime range replaced - - Raises - ------ - NeuroBlueprintError - If the datetime format is invalid or the range is malformed - """ - expected_values = get_expected_num_datetime_values(format_type) - full_tag_regex = fr"(\d{{{expected_values}}}){re.escape(tag)}(\d{{{expected_values}}})" - match = re.search(full_tag_regex, search_str) - - if not match: - utils.log_and_raise_error( - f"Invalid {format_type} range format in search string: {search_str}", - NeuroBlueprintError, - ) - - start_str, end_str = match.groups() - - if not validate_datetime(start_str, format_type): - utils.log_and_raise_error( - f"Invalid start {format_type} format: {start_str}", - NeuroBlueprintError, - ) - - if not validate_datetime(end_str, format_type): - utils.log_and_raise_error( - f"Invalid end {format_type} format: {end_str}", - NeuroBlueprintError, - ) - - start_timepoint = datetime.strptime(start_str, canonical_tags.get_datetime_format(format_type)) - end_timepoint = datetime.strptime(end_str, canonical_tags.get_datetime_format(format_type)) - - if end_timepoint < start_timepoint: - utils.log_and_raise_error( - f"End {format_type} is before start {format_type}", - NeuroBlueprintError, - ) - - return re.sub(full_tag_regex, f"{format_type}-*", 
search_str) - - def raise_display_mode( message: str, display_mode: DisplayMode, log: bool ) -> None: From 15f8a3c254aae7115fd0f92b5c5d88621415ff03 Mon Sep 17 00:00:00 2001 From: Diya <152620955+Diya910@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:50:09 +0530 Subject: [PATCH 04/36] Update datashuttle/utils/folders.py Co-authored-by: Joe Ziminski <55797454+JoeZiminski@users.noreply.github.com> --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 3353cf022..2a306348d 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -380,7 +380,7 @@ def search_with_tags( For datetime ranges, the format must be: - date: YYYYMMDD@DATETO@YYYYMMDD (e.g., "20240101@DATETO@20241231") - time: HHMMSS@TIMETO@HHMMSS (e.g., "000000@TIMETO@235959") - - datetime: YYYYMMDDTHHMMss@DATETIMETO@YYYYMMDDTHHMMss + - datetime: YYYYMMDDTHHMMSS@DATETIMETO@YYYYMMDDTHHMMSS Parameters ---------- From 810402499c23af3f11dda142f26047108cb5f516 Mon Sep 17 00:00:00 2001 From: Diya <152620955+Diya910@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:58:16 +0530 Subject: [PATCH 05/36] Update datashuttle/utils/folders.py Co-authored-by: Joe Ziminski <55797454+JoeZiminski@users.noreply.github.com> --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 2a306348d..8ded6edd7 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -413,7 +413,7 @@ def search_with_tags( Date range: >>> search_with_tags(cfg, path, "local", ["sub-001_20240101@DATETO@20241231_id-*"]) - ["sub-001_20240315_id-1", "sub-001_20240401_id-2"] + ["sub-001_date-20240315_id-1", "sub-001_date-20240401_id-2"] Time range: >>> search_with_tags(cfg, path, "local", ["sub-002_000000@TIMETO@120000"]) From 8a2cbbd1695e7c5c00fe3b30f57e142f365320c9 Mon Sep 17 00:00:00 2001 From: Diya 
<152620955+Diya910@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:59:02 +0530 Subject: [PATCH 06/36] Update datashuttle/utils/folders.py Co-authored-by: Joe Ziminski <55797454+JoeZiminski@users.noreply.github.com> --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 8ded6edd7..a675108cb 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -417,7 +417,7 @@ def search_with_tags( Time range: >>> search_with_tags(cfg, path, "local", ["sub-002_000000@TIMETO@120000"]) - ["sub-002_083000", "sub-002_113000"] + ["sub-002_time-083000", "sub-002_time-113000"] """ new_all_names: List[str] = [] for name in all_names: From a44cb6111eecad3d80153c3e26dd49a2238c577f Mon Sep 17 00:00:00 2001 From: Diya <152620955+Diya910@users.noreply.github.com> Date: Fri, 4 Jul 2025 13:02:09 +0530 Subject: [PATCH 07/36] Update datashuttle/utils/folders.py Co-authored-by: Joe Ziminski <55797454+JoeZiminski@users.noreply.github.com> --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index a675108cb..febb77956 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -636,7 +636,7 @@ def strip_start_end_date_from_datetime_tag( NeuroBlueprintError, ) - if end_timepoint < start_timepoint: + if end_timepoint <= start_timepoint: utils.log_and_raise_error( f"End {format_type} is before start {format_type}. 
Ensure the end datetime is after the start datetime.", NeuroBlueprintError, From f9a21b45315fe4d06643694d87ffab6c8bdaf1a6 Mon Sep 17 00:00:00 2001 From: Diya <152620955+Diya910@users.noreply.github.com> Date: Fri, 4 Jul 2025 13:02:35 +0530 Subject: [PATCH 08/36] Update datashuttle/utils/validation.py Co-authored-by: Joe Ziminski <55797454+JoeZiminski@users.noreply.github.com> --- datashuttle/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/validation.py b/datashuttle/utils/validation.py index f10f855be..da61c7275 100644 --- a/datashuttle/utils/validation.py +++ b/datashuttle/utils/validation.py @@ -433,7 +433,7 @@ def datetime_are_iso_format( Check formatting for date-, time-, or datetime- tags. """ datetime_keys = list(canonical_tags.get_datetime_formats().keys()) - key = next((key for key in datetime_keys if key in name), None) + key = next((key for key in datetime_keys if f"_{key}-" in name), None) error_message: List[str] if not key: From bd12cd63148944ab85112fb5917535cd267295a5 Mon Sep 17 00:00:00 2001 From: Diya910 Date: Fri, 4 Jul 2025 13:30:58 +0530 Subject: [PATCH 09/36] Refactor: Clean up docstrings in folders.py and canonical_tags.py as per review --- datashuttle/configs/canonical_tags.py | 19 -------- datashuttle/utils/folders.py | 70 +++++++++++++-------------- 2 files changed, 33 insertions(+), 56 deletions(-) diff --git a/datashuttle/configs/canonical_tags.py b/datashuttle/configs/canonical_tags.py index d36e5d781..7f265a5c3 100644 --- a/datashuttle/configs/canonical_tags.py +++ b/datashuttle/configs/canonical_tags.py @@ -32,22 +32,3 @@ def get_datetime_formats() -> dict: "time": "%H%M%S", "date": "%Y%m%d", } - - -def get_datetime_format(format_type: str) -> str: - """ - Get the datetime format string for a specific format type. 
- - Parameters - ---------- - format_type : str - One of "datetime", "time", or "date" - - Returns - ------- - str - The format string for the specified format type - """ - formats = get_datetime_formats() - return formats[format_type] - diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 3353cf022..b10da3896 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -384,17 +384,17 @@ def search_with_tags( Parameters ---------- - cfg : Configs + cfg datashuttle project configuration - base_folder : Path + base_folder folder to search for wildcards in - local_or_central : str + local_or_central "local" or "central" project path to search in - all_names : List[str] + all_names list of names that may contain wildcards or datetime ranges. If sub is passed, these are treated as session names. If sub is None, they are treated as subject names - sub : Optional[str] + sub optional subject to search for sessions in. If not provided, will search for subjects rather than sessions @@ -426,12 +426,7 @@ def search_with_tags( canonical_tags.tags("TIMETO") in name or canonical_tags.tags("DATETIMETO") in name): # If no special tags, just add the name as is - if "_date-" in name or "_time-" in name or "_datetime-" in name: - # For simple date/time formatted names, add them directly - new_all_names.append(name) - else: - # For regular names, just append them - new_all_names.append(name) + new_all_names.append(name) continue # Handle wildcard replacement first if present @@ -442,11 +437,14 @@ def search_with_tags( # Handle datetime ranges format_type = None tag = None - if (tag := canonical_tags.tags("DATETO")) in search_str: + if canonical_tags.tags("DATETO") in search_str: + tag = canonical_tags.tags("DATETO") format_type = "date" - elif (tag := canonical_tags.tags("TIMETO")) in search_str: + elif canonical_tags.tags("TIMETO") in search_str: + tag = canonical_tags.tags("TIMETO") format_type = "time" - elif (tag := 
canonical_tags.tags("DATETIMETO")) in search_str: + elif canonical_tags.tags("DATETIMETO") in search_str: + tag = canonical_tags.tags("DATETIMETO") format_type = "datetime" if format_type is not None: @@ -499,13 +497,13 @@ def filter_names_by_datetime_range( Parameters ---------- - names : List[str] + names List of names to filter, all containing the datetime pattern - format_type : str + format_type One of "datetime", "time", or "date" - start_timepoint : datetime + start_timepoint Start of the datetime range - end_timepoint : datetime + end_timepoint End of the datetime range Returns @@ -528,7 +526,7 @@ def filter_names_by_datetime_range( except ValueError: utils.log_and_raise_error( f"Invalid {format_type} format in name {candidate_basename}. " - f"Expected ISO format: {canonical_tags.get_datetime_format(format_type)}", + f"Expected ISO format: {canonical_tags.get_datetime_formats()[format_type]}", ValueError, ) @@ -549,7 +547,7 @@ def get_expected_datetime_len(format_type: str) -> int: Parameters ---------- - format_type : str + format_type One of "datetime", "time", or "date" Returns @@ -557,7 +555,7 @@ def get_expected_datetime_len(format_type: str) -> int: int The number of characters expected for the format """ - format_str = canonical_tags.get_datetime_format(format_type) + format_str = canonical_tags.get_datetime_formats()[format_type] today = datetime.now() return len(today.strftime(format_str)) @@ -568,12 +566,12 @@ def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, s Parameters ---------- - name : str + name The name containing the datetime range e.g. "sub-001_20240101@DATETO@20250101_id-*" - format_type : str + format_type One of "datetime", "time", or "date" - tag : str + tag The tag used for the range (e.g. @DATETO@) Returns @@ -596,12 +594,12 @@ def strip_start_end_date_from_datetime_tag( Parameters ---------- - search_str : str + search_str The search string containing the datetime range e.g. 
"sub-001_20240101T000000@DATETIMETO@20250101T235959" - format_type : str + format_type One of "datetime", "time", or "date" - tag : str + tag The tag used for the range (e.g. @DATETIMETO@) Returns @@ -621,7 +619,7 @@ def strip_start_end_date_from_datetime_tag( if not match: utils.log_and_raise_error( - f"Invalid {format_type} range format in search string: {search_str}. Ensure the format matches the expected pattern: {canonical_tags.get_datetime_format(format_type)}.", + f"Invalid {format_type} range format in search string: {search_str}. Ensure the format matches the expected pattern: {canonical_tags.get_datetime_formats()[format_type]}.", NeuroBlueprintError, ) @@ -651,12 +649,12 @@ def format_and_validate_datetime_search_str(search_str: str, format_type: str, t Parameters ---------- - search_str : str + search_str The search string containing the datetime range e.g. "sub-001_20240101@DATETO@20250101_id-*" or "sub-002_000000@TIMETO@235959" - format_type : str + format_type One of "datetime", "time", or "date" - tag : str + tag The tag used for the range (e.g. 
@DATETO@) Returns @@ -670,7 +668,7 @@ def format_and_validate_datetime_search_str(search_str: str, format_type: str, t NeuroBlueprintError If the datetime format is invalid or the range is malformed """ - # Extract and validate datetime range + # Validate the datetime range format strip_start_end_date_from_datetime_tag(search_str, format_type, tag) # Replace datetime range with wildcard pattern @@ -685,10 +683,8 @@ def datetime_object_from_string(datetime_string: str, format_type: str) -> datet Parameters ---------- - datetime_string : str - The string to convert to a datetime object - format_type : str - One of "datetime", "time", or "date" + datetime_string : The string to convert to a datetime object + format_type : One of "datetime", "time", or "date" Returns ------- @@ -701,7 +697,7 @@ def datetime_object_from_string(datetime_string: str, format_type: str) -> datet If the string cannot be parsed using the specified format """ return datetime.strptime( - datetime_string, canonical_tags.get_datetime_format(format_type) + datetime_string, canonical_tags.get_datetime_formats()[format_type] ) From d1ede1dd4027ffb4e880e9094aa9ebd6ea459112 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 23:28:34 +0000 Subject: [PATCH 10/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- datashuttle/configs/canonical_tags.py | 4 +- datashuttle/utils/folders.py | 110 ++++++++++++++++---------- datashuttle/utils/validation.py | 31 +++++--- tests/test_date_search_range.py | 31 ++++++-- 4 files changed, 117 insertions(+), 59 deletions(-) diff --git a/datashuttle/configs/canonical_tags.py b/datashuttle/configs/canonical_tags.py index f70734baa..25af6b085 100644 --- a/datashuttle/configs/canonical_tags.py +++ b/datashuttle/configs/canonical_tags.py @@ -17,13 +17,13 @@ def tags(tag_name: str) -> str: def get_datetime_formats() -> dict: - """ - Get all 
datetime format strings. + """Get all datetime format strings. Returns ------- dict A dictionary containing format strings for datetime, time, and date + """ return { "datetime": "%Y%m%dT%H%M%S", diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 3ed97927b..950a77d05 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -16,13 +16,10 @@ from datashuttle.configs.config_class import Configs from datashuttle.utils.custom_types import TopLevelFolder -import glob -import re - -from datetime import datetime import fnmatch import json - +import re +from datetime import datetime from pathlib import Path from datashuttle.configs import canonical_folders, canonical_tags @@ -418,8 +415,7 @@ def search_with_tags( all_names: List[str], sub: Optional[str] = None, ) -> List[str]: - """ - Handle wildcard and datetime range searching in names during upload or download. + """Handle wildcard and datetime range searching in names during upload or download. There are two types of special patterns that can be used in names: 1. Wildcards: Names containing @*@ will be replaced with "*" for glob pattern matching @@ -461,13 +457,16 @@ def search_with_tags( ["sub-001", "sub-002", "sub-003"] Date range: - >>> search_with_tags(cfg, path, "local", ["sub-001_20240101@DATETO@20241231_id-*"]) + >>> search_with_tags( + ... cfg, path, "local", ["sub-001_20240101@DATETO@20241231_id-*"] + ... ) ["sub-001_date-20240315_id-1", "sub-001_date-20240401_id-2"] Time range: >>> search_with_tags(cfg, path, "local", ["sub-002_000000@TIMETO@120000"]) ["sub-002_time-083000", "sub-002_time-113000"] -======= + ======= + Parameters ---------- cfg @@ -500,14 +499,17 @@ def search_with_tags( but where @*@-containing names have been replaced with search results. 
->>>>>>> upstream/main + >>>>>>> upstream/main + """ new_all_names: List[str] = [] for name in all_names: - if not (canonical_tags.tags("*") in name or - canonical_tags.tags("DATETO") in name or - canonical_tags.tags("TIMETO") in name or - canonical_tags.tags("DATETIMETO") in name): + if not ( + canonical_tags.tags("*") in name + or canonical_tags.tags("DATETO") in name + or canonical_tags.tags("TIMETO") in name + or canonical_tags.tags("DATETIMETO") in name + ): # If no special tags, just add the name as is new_all_names.append(name) continue @@ -532,12 +534,18 @@ def search_with_tags( if format_type is not None: assert tag is not None - search_str = format_and_validate_datetime_search_str(search_str, format_type, tag) + search_str = format_and_validate_datetime_search_str( + search_str, format_type, tag + ) # Use the helper function to perform the glob search if sub: matching_names = search_sub_or_ses_level( - cfg, base_folder, local_or_central, sub, search_str=search_str + cfg, + base_folder, + local_or_central, + sub, + search_str=search_str, )[0] else: matching_names = search_sub_or_ses_level( @@ -545,8 +553,8 @@ def search_with_tags( )[0] # Filter results by datetime range - start_timepoint, end_timepoint = strip_start_end_date_from_datetime_tag( - name, format_type, tag + start_timepoint, end_timepoint = ( + strip_start_end_date_from_datetime_tag(name, format_type, tag) ) matching_names = filter_names_by_datetime_range( matching_names, format_type, start_timepoint, end_timepoint @@ -556,7 +564,11 @@ def search_with_tags( # No datetime range, just perform the glob search with wildcards if sub: matching_names = search_sub_or_ses_level( - cfg, base_folder, local_or_central, sub, search_str=search_str + cfg, + base_folder, + local_or_central, + sub, + search_str=search_str, )[0] else: matching_names = search_sub_or_ses_level( @@ -573,8 +585,7 @@ def filter_names_by_datetime_range( start_timepoint: datetime, end_timepoint: datetime, ) -> List[str]: - """ - Filter 
a list of names based on a datetime range. + """Filter a list of names based on a datetime range. Assumes all names contain the format_type pattern (e.g., date-*, time-*) as they were searched using this pattern. @@ -598,14 +609,21 @@ def filter_names_by_datetime_range( ------ ValueError If any datetime value does not match the expected ISO format + """ filtered_names: List[str] = [] for candidate in names: - candidate_basename = candidate if isinstance(candidate, str) else candidate.name - value = get_values_from_bids_formatted_name([candidate_basename], format_type)[0] + candidate_basename = ( + candidate if isinstance(candidate, str) else candidate.name + ) + value = get_values_from_bids_formatted_name( + [candidate_basename], format_type + )[0] try: - candidate_timepoint = datetime_object_from_string(value, format_type) + candidate_timepoint = datetime_object_from_string( + value, format_type + ) except ValueError: utils.log_and_raise_error( f"Invalid {format_type} format in name {candidate_basename}. " @@ -625,8 +643,7 @@ def filter_names_by_datetime_range( def get_expected_datetime_len(format_type: str) -> int: - """ - Get the expected length of characters for a datetime format. + """Get the expected length of characters for a datetime format. Parameters ---------- @@ -637,15 +654,17 @@ def get_expected_datetime_len(format_type: str) -> int: ------- int The number of characters expected for the format + """ format_str = canonical_tags.get_datetime_formats()[format_type] today = datetime.now() return len(today.strftime(format_str)) -def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, str] | None: - """ - Find and extract datetime values from a name using a regex pattern. +def find_datetime_in_name( + name: str, format_type: str, tag: str +) -> tuple[str, str] | None: + """Find and extract datetime values from a name using a regex pattern. 
Parameters ---------- @@ -662,9 +681,12 @@ def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, s tuple[str, str] | None A tuple containing (start_datetime_str, end_datetime_str) if found, None if no match is found + """ expected_len = get_expected_datetime_len(format_type) - full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + full_tag_regex = ( + rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + ) match = re.search(full_tag_regex, name) return match.groups() if match else None @@ -672,8 +694,7 @@ def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, s def strip_start_end_date_from_datetime_tag( search_str: str, format_type: str, tag: str ) -> tuple[datetime, datetime]: - """ - Extract and validate start and end datetime values from a search string. + """Extract and validate start and end datetime values from a search string. Parameters ---------- @@ -695,9 +716,12 @@ def strip_start_end_date_from_datetime_tag( NeuroBlueprintError If the datetime format is invalid, the range is malformed, or end datetime is before start datetime + """ expected_len = get_expected_datetime_len(format_type) - full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + full_tag_regex = ( + rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + ) match = re.search(full_tag_regex, search_str) if not match: @@ -726,9 +750,10 @@ def strip_start_end_date_from_datetime_tag( return start_timepoint, end_timepoint -def format_and_validate_datetime_search_str(search_str: str, format_type: str, tag: str) -> str: - """ - Validate and format a search string containing a datetime range. +def format_and_validate_datetime_search_str( + search_str: str, format_type: str, tag: str +) -> str: + """Validate and format a search string containing a datetime range. 
Parameters ---------- @@ -750,19 +775,23 @@ def format_and_validate_datetime_search_str(search_str: str, format_type: str, t ------ NeuroBlueprintError If the datetime format is invalid or the range is malformed + """ # Validate the datetime range format strip_start_end_date_from_datetime_tag(search_str, format_type, tag) # Replace datetime range with wildcard pattern expected_len = get_expected_datetime_len(format_type) - full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + full_tag_regex = ( + rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + ) return re.sub(full_tag_regex, f"{format_type}-*", search_str) -def datetime_object_from_string(datetime_string: str, format_type: str) -> datetime: - """ - Convert a datetime string to a datetime object using the appropriate format. +def datetime_object_from_string( + datetime_string: str, format_type: str +) -> datetime: + """Convert a datetime string to a datetime object using the appropriate format. 
Parameters ---------- @@ -778,6 +807,7 @@ def datetime_object_from_string(datetime_string: str, format_type: str) -> datet ------ ValueError If the string cannot be parsed using the specified format + """ return datetime.strptime( datetime_string, canonical_tags.get_datetime_formats()[format_type] diff --git a/datashuttle/utils/validation.py b/datashuttle/utils/validation.py index 907b856f3..072e098c3 100644 --- a/datashuttle/utils/validation.py +++ b/datashuttle/utils/validation.py @@ -24,7 +24,11 @@ from itertools import chain from pathlib import Path -from datashuttle.configs import canonical_configs, canonical_folders, canonical_tags +from datashuttle.configs import ( + canonical_configs, + canonical_folders, + canonical_tags, +) from datashuttle.utils import formatting, getters, utils from datashuttle.utils.custom_exceptions import NeuroBlueprintError @@ -576,16 +580,22 @@ def datetime_are_iso_format( if datetime_value_str_is_iso_format(format_to_check, key): error_message = [] else: - error_message = [get_datetime_error( - key, name, canonical_tags.get_datetime_formats()[key], path_ - )] + error_message = [ + get_datetime_error( + key, + name, + canonical_tags.get_datetime_formats()[key], + path_, + ) + ] return error_message -def datetime_value_str_is_iso_format(datetime_str: str, format_type: str) -> bool: - """ - Validate that a datetime string matches the expected ISO format. +def datetime_value_str_is_iso_format( + datetime_str: str, format_type: str +) -> bool: + """Validate that a datetime string matches the expected ISO format. 
Parameters ---------- @@ -598,9 +608,12 @@ def datetime_value_str_is_iso_format(datetime_str: str, format_type: str) -> boo ------- bool True if the string matches the ISO format, False otherwise + """ try: - datetime.strptime(datetime_str, canonical_tags.get_datetime_formats()[format_type]) + datetime.strptime( + datetime_str, canonical_tags.get_datetime_formats()[format_type] + ) return True except ValueError: return False @@ -1185,5 +1198,3 @@ def check_datatypes_are_valid( return message return None - - diff --git a/tests/test_date_search_range.py b/tests/test_date_search_range.py index d32d61d01..3ebf8d3f9 100644 --- a/tests/test_date_search_range.py +++ b/tests/test_date_search_range.py @@ -3,7 +3,6 @@ import re import shutil import tempfile -from datetime import datetime from pathlib import Path from typing import List @@ -20,7 +19,7 @@ def tags(x: str) -> str: "*": "@*@", "DATETO": "@DATETO@", "TIMETO": "@TIMETO@", - "DATETIMETO": "@DATETIMETO@" + "DATETIMETO": "@DATETIMETO@", } return tags_dict.get(x, x) @@ -40,8 +39,13 @@ def get_datetime_format(format_type: str) -> str: @pytest.fixture(autouse=True) def patch_canonical_tags(monkeypatch): from datashuttle.configs import canonical_tags + monkeypatch.setattr(canonical_tags, "tags", DummyCanonicalTags.tags) - monkeypatch.setattr(canonical_tags, "get_datetime_format", DummyCanonicalTags.get_datetime_format) + monkeypatch.setattr( + canonical_tags, + "get_datetime_format", + DummyCanonicalTags.get_datetime_format, + ) # Dummy implementation for search_sub_or_ses_level that simply performs globbing. 
@@ -57,11 +61,16 @@ def dummy_search_sub_or_ses_level( @pytest.fixture(autouse=True) def patch_search_sub_or_ses_level(monkeypatch): from datashuttle.utils import folders - monkeypatch.setattr(folders, "search_sub_or_ses_level", dummy_search_sub_or_ses_level) + + monkeypatch.setattr( + folders, "search_sub_or_ses_level", dummy_search_sub_or_ses_level + ) # Dummy implementation for get_values_from_bids_formatted_name -def dummy_get_values_from_bids_formatted_name(names: List[str], key: str, return_as_int: bool = False) -> List[str]: +def dummy_get_values_from_bids_formatted_name( + names: List[str], key: str, return_as_int: bool = False +) -> List[str]: results = [] for name in names: if key == "date": @@ -75,7 +84,12 @@ def dummy_get_values_from_bids_formatted_name(names: List[str], key: str, return @pytest.fixture(autouse=True) def patch_get_values_from_bids(monkeypatch): from datashuttle.utils import utils - monkeypatch.setattr(utils, "get_values_from_bids_formatted_name", dummy_get_values_from_bids_formatted_name) + + monkeypatch.setattr( + utils, + "get_values_from_bids_formatted_name", + dummy_get_values_from_bids_formatted_name, + ) # Fixture to create a temporary directory with a simulated folder structure. @@ -104,6 +118,7 @@ def test_date_range_wildcard(temp_project_dir: Path): only folders whose embedded date falls between 20250306 and 20250309 (inclusive) should be returned. """ + class Configs: pass @@ -130,6 +145,7 @@ def test_simple_wildcard(temp_project_dir: Path): When given a simple wildcard pattern like "sub-01_@*@", all folders should be returned. """ + class Configs: pass @@ -146,6 +162,7 @@ def test_invalid_date_range(temp_project_dir: Path): """ Test that invalid date ranges raise appropriate errors. """ + class Configs: pass @@ -170,6 +187,7 @@ def test_combined_wildcards(temp_project_dir: Path): """ Test that wildcard and date range can be combined in the same pattern. 
""" + class Configs: pass @@ -197,4 +215,3 @@ class Configs: "sub-03_date-20250308", } assert matched_folders == expected_folders - From 399b402682f1caf0e802ffb48a0d6bb0f9cd8d2b Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Thu, 31 Jul 2025 00:51:39 +0100 Subject: [PATCH 11/36] Fix linting. --- datashuttle/configs/canonical_tags.py | 4 +- datashuttle/utils/folders.py | 156 ++++++++++++++++++-------- datashuttle/utils/validation.py | 31 +++-- 3 files changed, 133 insertions(+), 58 deletions(-) diff --git a/datashuttle/configs/canonical_tags.py b/datashuttle/configs/canonical_tags.py index f70734baa..25af6b085 100644 --- a/datashuttle/configs/canonical_tags.py +++ b/datashuttle/configs/canonical_tags.py @@ -17,13 +17,13 @@ def tags(tag_name: str) -> str: def get_datetime_formats() -> dict: - """ - Get all datetime format strings. + """Get all datetime format strings. Returns ------- dict A dictionary containing format strings for datetime, time, and date + """ return { "datetime": "%Y%m%dT%H%M%S", diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 3ed97927b..9e8d6d1fd 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -5,9 +5,11 @@ Any, Dict, List, + Literal, Optional, Tuple, Union, + overload, ) if TYPE_CHECKING: @@ -16,13 +18,10 @@ from datashuttle.configs.config_class import Configs from datashuttle.utils.custom_types import TopLevelFolder -import glob -import re - -from datetime import datetime import fnmatch import json - +import re +from datetime import datetime from pathlib import Path from datashuttle.configs import canonical_folders, canonical_tags @@ -78,7 +77,7 @@ def create_folder_trees( ) # Initialize all_paths with required keys - all_paths = { + all_paths: dict = { "sub": [], "ses": [], } @@ -265,7 +264,7 @@ def search_project_for_sub_or_ses_names( """ # Search local and central for folders that begin with "sub-*" - local_foldernames, _ = search_sub_or_ses_level( + local_foldernames, _ = 
search_sub_or_ses_level( # type: ignore cfg, cfg.get_base_folder("local", top_level_folder), "local", @@ -278,7 +277,7 @@ def search_project_for_sub_or_ses_names( central_foldernames: List if include_central: - central_foldernames, _ = search_sub_or_ses_level( + central_foldernames, _ = search_sub_or_ses_level( # type: ignore cfg, cfg.get_base_folder("central", top_level_folder), "central", @@ -418,8 +417,7 @@ def search_with_tags( all_names: List[str], sub: Optional[str] = None, ) -> List[str]: - """ - Handle wildcard and datetime range searching in names during upload or download. + """Handle wildcard and datetime range searching in names during upload or download. There are two types of special patterns that can be used in names: 1. Wildcards: Names containing @*@ will be replaced with "*" for glob pattern matching @@ -461,13 +459,16 @@ def search_with_tags( ["sub-001", "sub-002", "sub-003"] Date range: - >>> search_with_tags(cfg, path, "local", ["sub-001_20240101@DATETO@20241231_id-*"]) + >>> search_with_tags( + ... cfg, path, "local", ["sub-001_20240101@DATETO@20241231_id-*"] + ... ) ["sub-001_date-20240315_id-1", "sub-001_date-20240401_id-2"] Time range: >>> search_with_tags(cfg, path, "local", ["sub-002_000000@TIMETO@120000"]) ["sub-002_time-083000", "sub-002_time-113000"] -======= + ======= + Parameters ---------- cfg @@ -500,14 +501,17 @@ def search_with_tags( but where @*@-containing names have been replaced with search results. 
->>>>>>> upstream/main + >>>>>>> upstream/main + """ new_all_names: List[str] = [] for name in all_names: - if not (canonical_tags.tags("*") in name or - canonical_tags.tags("DATETO") in name or - canonical_tags.tags("TIMETO") in name or - canonical_tags.tags("DATETIMETO") in name): + if not ( + canonical_tags.tags("*") in name + or canonical_tags.tags("DATETO") in name + or canonical_tags.tags("TIMETO") in name + or canonical_tags.tags("DATETIMETO") in name + ): # If no special tags, just add the name as is new_all_names.append(name) continue @@ -532,12 +536,20 @@ def search_with_tags( if format_type is not None: assert tag is not None - search_str = format_and_validate_datetime_search_str(search_str, format_type, tag) + search_str = format_and_validate_datetime_search_str( + search_str, format_type, tag + ) + + matching_names: List[str] # Use the helper function to perform the glob search if sub: matching_names = search_sub_or_ses_level( - cfg, base_folder, local_or_central, sub, search_str=search_str + cfg, + base_folder, + local_or_central, + sub, + search_str=search_str, )[0] else: matching_names = search_sub_or_ses_level( @@ -545,8 +557,8 @@ def search_with_tags( )[0] # Filter results by datetime range - start_timepoint, end_timepoint = strip_start_end_date_from_datetime_tag( - name, format_type, tag + start_timepoint, end_timepoint = ( + strip_start_end_date_from_datetime_tag(name, format_type, tag) ) matching_names = filter_names_by_datetime_range( matching_names, format_type, start_timepoint, end_timepoint @@ -556,7 +568,11 @@ def search_with_tags( # No datetime range, just perform the glob search with wildcards if sub: matching_names = search_sub_or_ses_level( - cfg, base_folder, local_or_central, sub, search_str=search_str + cfg, + base_folder, + local_or_central, + sub, + search_str=search_str, )[0] else: matching_names = search_sub_or_ses_level( @@ -573,8 +589,8 @@ def filter_names_by_datetime_range( start_timepoint: datetime, end_timepoint: datetime, 
) -> List[str]: - """ - Filter a list of names based on a datetime range. + """Filter a list of names based on a datetime range. + Assumes all names contain the format_type pattern (e.g., date-*, time-*) as they were searched using this pattern. @@ -598,14 +614,21 @@ def filter_names_by_datetime_range( ------ ValueError If any datetime value does not match the expected ISO format + """ filtered_names: List[str] = [] for candidate in names: - candidate_basename = candidate if isinstance(candidate, str) else candidate.name - value = get_values_from_bids_formatted_name([candidate_basename], format_type)[0] + candidate_basename = ( + candidate if isinstance(candidate, str) else candidate.name + ) + value = get_values_from_bids_formatted_name( + [candidate_basename], format_type + )[0] try: - candidate_timepoint = datetime_object_from_string(value, format_type) + candidate_timepoint = datetime_object_from_string( + value, format_type + ) except ValueError: utils.log_and_raise_error( f"Invalid {format_type} format in name {candidate_basename}. " @@ -625,8 +648,7 @@ def filter_names_by_datetime_range( def get_expected_datetime_len(format_type: str) -> int: - """ - Get the expected length of characters for a datetime format. + """Get the expected length of characters for a datetime format. Parameters ---------- @@ -637,15 +659,17 @@ def get_expected_datetime_len(format_type: str) -> int: ------- int The number of characters expected for the format + """ format_str = canonical_tags.get_datetime_formats()[format_type] today = datetime.now() return len(today.strftime(format_str)) -def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, str] | None: - """ - Find and extract datetime values from a name using a regex pattern. +def find_datetime_in_name( + name: str, format_type: str, tag: str +) -> tuple[str | Any, ...] | None: + """Find and extract datetime values from a name using a regex pattern. 
Parameters ---------- @@ -662,9 +686,12 @@ def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, s tuple[str, str] | None A tuple containing (start_datetime_str, end_datetime_str) if found, None if no match is found + """ expected_len = get_expected_datetime_len(format_type) - full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + full_tag_regex = ( + rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + ) match = re.search(full_tag_regex, name) return match.groups() if match else None @@ -672,8 +699,7 @@ def find_datetime_in_name(name: str, format_type: str, tag: str) -> tuple[str, s def strip_start_end_date_from_datetime_tag( search_str: str, format_type: str, tag: str ) -> tuple[datetime, datetime]: - """ - Extract and validate start and end datetime values from a search string. + """Extract and validate start and end datetime values from a search string. Parameters ---------- @@ -695,9 +721,12 @@ def strip_start_end_date_from_datetime_tag( NeuroBlueprintError If the datetime format is invalid, the range is malformed, or end datetime is before start datetime + """ expected_len = get_expected_datetime_len(format_type) - full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + full_tag_regex = ( + rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + ) match = re.search(full_tag_regex, search_str) if not match: @@ -706,6 +735,7 @@ def strip_start_end_date_from_datetime_tag( NeuroBlueprintError, ) + assert match is not None, "type narrow `match`" start_str, end_str = match.groups() try: @@ -726,9 +756,10 @@ def strip_start_end_date_from_datetime_tag( return start_timepoint, end_timepoint -def format_and_validate_datetime_search_str(search_str: str, format_type: str, tag: str) -> str: - """ - Validate and format a search string containing a datetime range. 
+def format_and_validate_datetime_search_str( + search_str: str, format_type: str, tag: str +) -> str: + """Validate and format a search string containing a datetime range. Parameters ---------- @@ -750,24 +781,31 @@ def format_and_validate_datetime_search_str(search_str: str, format_type: str, t ------ NeuroBlueprintError If the datetime format is invalid or the range is malformed + """ # Validate the datetime range format strip_start_end_date_from_datetime_tag(search_str, format_type, tag) # Replace datetime range with wildcard pattern expected_len = get_expected_datetime_len(format_type) - full_tag_regex = fr"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + full_tag_regex = ( + rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" + ) return re.sub(full_tag_regex, f"{format_type}-*", search_str) -def datetime_object_from_string(datetime_string: str, format_type: str) -> datetime: - """ - Convert a datetime string to a datetime object using the appropriate format. +def datetime_object_from_string( + datetime_string: str, format_type: str +) -> datetime: + """Convert a datetime string to a datetime object using the appropriate format. Parameters ---------- - datetime_string : The string to convert to a datetime object - format_type : One of "datetime", "time", or "date" + datetime_string : + The string to convert to a datetime object + + format_type : + One of "datetime", "time", or "date" Returns ------- @@ -778,6 +816,7 @@ def datetime_object_from_string(datetime_string: str, format_type: str) -> datet ------ ValueError If the string cannot be parsed using the specified format + """ return datetime.strptime( datetime_string, canonical_tags.get_datetime_formats()[format_type] @@ -789,7 +828,32 @@ def datetime_object_from_string(datetime_string: str, format_type: str) -> datet # ----------------------------------------------------------------------------- -# @overload: Cannot get type overloading to work with this function. 
+@overload +def search_sub_or_ses_level( + cfg: Configs, + base_folder: Path, + local_or_central: str, + sub: Optional[str] = ..., + ses: Optional[str] = ..., + search_str: str = ..., + verbose: bool = ..., + return_full_path: Literal[False] = ..., +) -> Tuple[List[str], List[str]]: ... + + +@overload +def search_sub_or_ses_level( + cfg: Configs, + base_folder: Path, + local_or_central: str, + sub: Optional[str] = ..., + ses: Optional[str] = ..., + search_str: str = ..., + verbose: bool = ..., + return_full_path: Literal[True] = ..., +) -> Tuple[List[Path], List[str]]: ... + + def search_sub_or_ses_level( cfg: Configs, base_folder: Path, diff --git a/datashuttle/utils/validation.py b/datashuttle/utils/validation.py index 907b856f3..072e098c3 100644 --- a/datashuttle/utils/validation.py +++ b/datashuttle/utils/validation.py @@ -24,7 +24,11 @@ from itertools import chain from pathlib import Path -from datashuttle.configs import canonical_configs, canonical_folders, canonical_tags +from datashuttle.configs import ( + canonical_configs, + canonical_folders, + canonical_tags, +) from datashuttle.utils import formatting, getters, utils from datashuttle.utils.custom_exceptions import NeuroBlueprintError @@ -576,16 +580,22 @@ def datetime_are_iso_format( if datetime_value_str_is_iso_format(format_to_check, key): error_message = [] else: - error_message = [get_datetime_error( - key, name, canonical_tags.get_datetime_formats()[key], path_ - )] + error_message = [ + get_datetime_error( + key, + name, + canonical_tags.get_datetime_formats()[key], + path_, + ) + ] return error_message -def datetime_value_str_is_iso_format(datetime_str: str, format_type: str) -> bool: - """ - Validate that a datetime string matches the expected ISO format. +def datetime_value_str_is_iso_format( + datetime_str: str, format_type: str +) -> bool: + """Validate that a datetime string matches the expected ISO format. 
Parameters ---------- @@ -598,9 +608,12 @@ def datetime_value_str_is_iso_format(datetime_str: str, format_type: str) -> boo ------- bool True if the string matches the ISO format, False otherwise + """ try: - datetime.strptime(datetime_str, canonical_tags.get_datetime_formats()[format_type]) + datetime.strptime( + datetime_str, canonical_tags.get_datetime_formats()[format_type] + ) return True except ValueError: return False @@ -1185,5 +1198,3 @@ def check_datatypes_are_valid( return message return None - - From ac955037da7a896010f561ea4cf445fdc7a8f651 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Thu, 31 Jul 2025 02:00:15 +0100 Subject: [PATCH 12/36] Properly ignore mypy on tests. --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2e9402411..3d5561b77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,10 +77,9 @@ requires = [ ] build-backend = "setuptools.build_meta" -[tool.mypy] -exclude = [ - "tests/" -] +[[tool.mypy.overrides]] +module = "tests.*" +ignore_errors = true [tool.setuptools] include-package-data = true From 939198270fb98bf21cdb8492bb5f28eec08e5634 Mon Sep 17 00:00:00 2001 From: Diya910 Date: Thu, 14 Aug 2025 12:40:53 +0530 Subject: [PATCH 13/36] Updated test cases to use basemodel and also added more test cases to cover all edges --- tests/test_date_search_range.py | 547 +++++++++++++++++++------------- 1 file changed, 335 insertions(+), 212 deletions(-) diff --git a/tests/test_date_search_range.py b/tests/test_date_search_range.py index 3ebf8d3f9..10c2cbd3e 100644 --- a/tests/test_date_search_range.py +++ b/tests/test_date_search_range.py @@ -1,217 +1,340 @@ -import glob import os import re import shutil -import tempfile -from pathlib import Path -from typing import List - import pytest +from pathlib import Path -from datashuttle.utils.folders import search_with_tags - - -# Dummy implementation for canonical_tags -class DummyCanonicalTags: - @staticmethod - def 
tags(x: str) -> str: - tags_dict = { - "*": "@*@", - "DATETO": "@DATETO@", - "TIMETO": "@TIMETO@", - "DATETIMETO": "@DATETIMETO@", - } - return tags_dict.get(x, x) - - @staticmethod - def get_datetime_format(format_type: str) -> str: - formats = { - "datetime": "%Y%m%dT%H%M%S", - "time": "%H%M%S", - "date": "%Y%m%d", - } - if format_type not in formats: - raise ValueError(f"Invalid format type: {format_type}") - return formats[format_type] - - -# Patch canonical_tags -@pytest.fixture(autouse=True) -def patch_canonical_tags(monkeypatch): - from datashuttle.configs import canonical_tags - - monkeypatch.setattr(canonical_tags, "tags", DummyCanonicalTags.tags) - monkeypatch.setattr( - canonical_tags, - "get_datetime_format", - DummyCanonicalTags.get_datetime_format, - ) - - -# Dummy implementation for search_sub_or_ses_level that simply performs globbing. -def dummy_search_sub_or_ses_level( - cfg, base_folder: Path, local_or_central: str, *args, search_str: str = "*" -): - pattern = os.path.join(str(base_folder), search_str) - matches: List[str] = sorted(glob.glob(pattern)) - return (matches, []) - - -# Patch search_sub_or_ses_level in the module where search_with_tags is defined. 
-@pytest.fixture(autouse=True) -def patch_search_sub_or_ses_level(monkeypatch): - from datashuttle.utils import folders - - monkeypatch.setattr( - folders, "search_sub_or_ses_level", dummy_search_sub_or_ses_level - ) - - -# Dummy implementation for get_values_from_bids_formatted_name -def dummy_get_values_from_bids_formatted_name( - names: List[str], key: str, return_as_int: bool = False -) -> List[str]: - results = [] - for name in names: - if key == "date": - m = re.search(r"date-(\d{8})", name) - if m: - results.append(m.group(1)) - return results - - -# Patch get_values_from_bids_formatted_name -@pytest.fixture(autouse=True) -def patch_get_values_from_bids(monkeypatch): - from datashuttle.utils import utils - - monkeypatch.setattr( - utils, - "get_values_from_bids_formatted_name", - dummy_get_values_from_bids_formatted_name, - ) - - -# Fixture to create a temporary directory with a simulated folder structure. -@pytest.fixture -def temp_project_dir() -> Path: # type: ignore - temp_dir = Path(tempfile.mkdtemp()) - # Create folders with names in the format "sub-01_date-YYYYMMDD" - folder_dates = [ - "20250305", - "20250306", - "20250307", - "20250308", - "20250309", - "20250310", - ] - for date_str in folder_dates: - folder_name = f"sub-01_date-{date_str}" - os.mkdir(temp_dir / folder_name) - yield temp_dir - shutil.rmtree(temp_dir) - - -def test_date_range_wildcard(temp_project_dir: Path): - """ - When given a date-range wildcard pattern like "sub-01_20250306@DATETO@20250309", - only folders whose embedded date falls between 20250306 and 20250309 (inclusive) - should be returned. 
- """ - - class Configs: - pass - - cfg = Configs() - base_folder = temp_project_dir - local_or_central = "local" - pattern = "sub-01_20250306@DATETO@20250309" - result = search_with_tags(cfg, base_folder, local_or_central, [pattern]) - - # Extract the dates from the returned folder names - found_dates = set() - for folder in result: - basename = os.path.basename(folder) - m = re.search(r"date-(\d{8})", basename) - if m: - found_dates.add(m.group(1)) - - expected_dates = {"20250306", "20250307", "20250308", "20250309"} - assert found_dates == expected_dates - - -def test_simple_wildcard(temp_project_dir: Path): - """ - When given a simple wildcard pattern like "sub-01_@*@", - all folders should be returned. - """ - - class Configs: - pass - - cfg = Configs() - base_folder = temp_project_dir - local_or_central = "local" - pattern = "sub-01_@*@" - result = search_with_tags(cfg, base_folder, local_or_central, [pattern]) - # We expect six folders (20250305 through 20250310) - assert len(result) == 6 - - -def test_invalid_date_range(temp_project_dir: Path): - """ - Test that invalid date ranges raise appropriate errors. - """ - - class Configs: - pass - - cfg = Configs() - base_folder = temp_project_dir - local_or_central = "local" - - # Test end date before start date - with pytest.raises(Exception) as exc_info: - pattern = "sub-01_20250309@DATETO@20250306" - search_with_tags(cfg, base_folder, local_or_central, [pattern]) - assert "before start" in str(exc_info.value) - - # Test invalid date format - with pytest.raises(Exception) as exc_info: - pattern = "sub-01_2025030@DATETO@20250306" # Missing digit - search_with_tags(cfg, base_folder, local_or_central, [pattern]) - assert "Invalid" in str(exc_info.value) - - -def test_combined_wildcards(temp_project_dir: Path): - """ - Test that wildcard and date range can be combined in the same pattern. 
- """ - - class Configs: - pass - - cfg = Configs() - base_folder = temp_project_dir - local_or_central = "local" - - # Create some additional test folders with different subject numbers - for sub in ["02", "03"]: - for date in ["20250307", "20250308"]: - folder_name = f"sub-{sub}_date-{date}" - os.mkdir(temp_project_dir / folder_name) - - pattern = "sub-*_20250307@DATETO@20250308" - result = search_with_tags(cfg, base_folder, local_or_central, [pattern]) - - # Should match all subjects but only dates within range - matched_folders = set(os.path.basename(f) for f in result) - expected_folders = { - "sub-01_date-20250307", - "sub-01_date-20250308", - "sub-02_date-20250307", - "sub-02_date-20250308", - "sub-03_date-20250307", - "sub-03_date-20250308", - } - assert matched_folders == expected_folders +from datashuttle.configs import canonical_tags + +from . import test_utils +from .base import BaseTest + + +class TestDateSearchRange(BaseTest): + """Test date/time range search functionality with real datashuttle projects.""" + + def test_simple_wildcard_first(self, project): + """Test basic wildcard functionality before testing date ranges.""" + subs = ["sub-001", "sub-002"] + sessions = ["ses-001", "ses-002"] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=[f"sub-{canonical_tags.tags('*')}"], + ses_names=[f"ses-{canonical_tags.tags('*')}"], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" + transferred_subs = [sub.name for sub in central_path.glob("sub-*")] + + expected_subs = ["sub-001", "sub-002"] + assert sorted(transferred_subs) == sorted(expected_subs) + + for sub_name in expected_subs: + sub_path = central_path / sub_name + transferred_sessions = [ses.name for ses in sub_path.glob("ses-*")] + 
expected_sessions = ["ses-001", "ses-002"] + assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_date_range_transfer(self, project): + """Test that date range patterns correctly filter folders during transfer.""" + subs = ["sub-001", "sub-002"] + sessions = [ + "ses-001_date-20240301", + "ses-002_date-20240315", + "ses-003_date-20240401", + "ses-004_date-20240415", + "ses-005_date-20240501", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True, "ephys": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav", "ephys"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_20240315{canonical_tags.tags('DATETO')}20240401"], + datatype=["behav", "ephys"], + ) + + central_path = project.get_central_path() / "rawdata" + transferred_subs = list(central_path.glob("sub-*")) + + assert len(transferred_subs) == 2 + + for sub_path in transferred_subs: + transferred_sessions = [ses.name for ses in sub_path.glob("ses-*")] + expected_sessions = ["ses-002_date-20240315", "ses-003_date-20240401"] + assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_time_range_transfer(self, project): + """Test that time range patterns work correctly.""" + subs = ["sub-001"] + sessions = [ + "ses-001_time-080000", + "ses-002_time-120000", + "ses-003_time-160000", + "ses-004_time-200000", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_100000{canonical_tags.tags('TIMETO')}180000"], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" / "sub-001" + 
transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] + + expected_sessions = ["ses-002_time-120000", "ses-003_time-160000"] + assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_datetime_range_transfer(self, project): + """Test that wildcard matching works with datetime-tagged sessions.""" + subs = ["sub-001"] + sessions = [ + "ses-001_datetime-20240301T080000", + "ses-002_datetime-20240315T120000", + "ses-003_datetime-20240401T160000", + "ses-004_datetime-20240415T200000", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[ + f"ses-{canonical_tags.tags('*')}_datetime-20240315{canonical_tags.tags('*')}", + f"ses-{canonical_tags.tags('*')}_datetime-20240401{canonical_tags.tags('*')}", + ], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" / "sub-001" + transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] + + expected_sessions = ["ses-002_datetime-20240315T120000", "ses-003_datetime-20240401T160000"] + assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_combined_wildcard_and_date_range(self, project): + """Test combining wildcards with date ranges.""" + subs = ["sub-001", "sub-002", "sub-003"] + sessions = [ + "ses-001_date-20240301_run-01", + "ses-002_date-20240315_run-02", + "ses-003_date-20240401_run-01", + "ses-004_date-20240415_run-03", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=[f"sub-{canonical_tags.tags('*')}"], + ses_names=[ + 
f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240420_run-01", + f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240420_run-02" + ], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" + transferred_subs = list(central_path.glob("sub-*")) + + assert len(transferred_subs) == 3 + + for sub_path in transferred_subs: + transferred_sessions = [ses.name for ses in sub_path.glob("ses-*")] + expected_sessions = ["ses-002_date-20240315_run-02", "ses-003_date-20240401_run-01"] + assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_invalid_date_range_errors(self, project): + """Test that invalid date ranges raise appropriate errors.""" + subs = ["sub-001"] + sessions = ["ses-001_date-20240301"] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + with pytest.raises(Exception) as exc_info: + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_20240401{canonical_tags.tags('DATETO')}20240301"], + datatype=["behav"], + ) + assert "before start" in str(exc_info.value) + + with pytest.raises(Exception) as exc_info: + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_2024030{canonical_tags.tags('DATETO')}20240401"], + datatype=["behav"], + ) + assert "Invalid" in str(exc_info.value) + + def test_no_matches_in_date_range(self, project): + """Test behavior when no folders match the date range.""" + subs = ["sub-001"] + sessions = [ + "ses-001_date-20240101", + "ses-002_date-20240201", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], 
datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_20240301{canonical_tags.tags('DATETO')}20240401"], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" + transferred_items = list(central_path.glob("*")) + + if transferred_items: + transferred_sub_names = [item.name for item in transferred_items if item.name.startswith("sub-")] + assert len(transferred_sub_names) == 0 + + def test_subject_level_date_range(self, project): + """Test date ranges work at the subject level too.""" + subs = [ + "sub-001_date-20240301", + "sub-002_date-20240315", + "sub-003_date-20240401", + "sub-004_date-20240415", + ] + sessions = ["ses-001"] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=[f"sub-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240410"], + ses_names=sessions, + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" + transferred_subs = [sub.name for sub in central_path.glob("sub-*")] + + expected_subs = ["sub-002_date-20240315", "sub-003_date-20240401"] + assert sorted(transferred_subs) == sorted(expected_subs) + + @pytest.mark.parametrize("project", ["full"], indirect=True) + def test_download_with_date_range(self, project): + """Test that date range patterns work for downloads as well as uploads.""" + subs = ["sub-001", "sub-002"] + sessions = [ + "ses-001_date-20240301", + "ses-002_date-20240315", + "ses-003_date-20240401", + "ses-004_date-20240415", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + 
project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=sessions, + datatype=["behav"], + ) + + os.chdir(project.get_local_path()) + local_rawdata = project.get_local_path() / "rawdata" + if local_rawdata.exists(): + shutil.rmtree(local_rawdata) + + project.download_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240401"], + datatype=["behav"], + ) + + local_path = project.get_local_path() / "rawdata" + downloaded_subs = list(local_path.glob("sub-*")) + + assert len(downloaded_subs) == 2 + + for sub_path in downloaded_subs: + downloaded_sessions = [ses.name for ses in sub_path.glob("ses-*")] + expected_sessions = ["ses-002_date-20240315", "ses-003_date-20240401"] + assert sorted(downloaded_sessions) == sorted(expected_sessions) + + def test_edge_case_exact_boundary_dates(self, project): + """Test that boundary dates are handled correctly (inclusive ranges).""" + subs = ["sub-001"] + sessions = [ + "ses-001_date-20240301", + "ses-002_date-20240315", + "ses-003_date-20240401", + "ses-004_date-20240415", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[f"ses-{canonical_tags.tags('*')}_20240301{canonical_tags.tags('DATETO')}20240401"], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" / "sub-001" + transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] + + expected_sessions = [ + "ses-001_date-20240301", + "ses-002_date-20240315", + "ses-003_date-20240401" + ] + assert sorted(transferred_sessions) == sorted(expected_sessions) From 23679eb43e15dc5ad555eb35e7d7a5d5f15e376f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> 
Date: Thu, 14 Aug 2025 07:11:30 +0000 Subject: [PATCH 14/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_date_search_range.py | 170 +++++++++++++++++++------------- 1 file changed, 103 insertions(+), 67 deletions(-) diff --git a/tests/test_date_search_range.py b/tests/test_date_search_range.py index 10c2cbd3e..168d1bb54 100644 --- a/tests/test_date_search_range.py +++ b/tests/test_date_search_range.py @@ -1,8 +1,7 @@ import os -import re import shutil + import pytest -from pathlib import Path from datashuttle.configs import canonical_tags @@ -15,28 +14,28 @@ class TestDateSearchRange(BaseTest): def test_simple_wildcard_first(self, project): """Test basic wildcard functionality before testing date ranges.""" - subs = ["sub-001", "sub-002"] + subs = ["sub-001", "sub-002"] sessions = ["ses-001", "ses-002"] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=[f"sub-{canonical_tags.tags('*')}"], ses_names=[f"ses-{canonical_tags.tags('*')}"], datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" transferred_subs = [sub.name for sub in central_path.glob("sub-*")] - + expected_subs = ["sub-001", "sub-002"] assert sorted(transferred_subs) == sorted(expected_subs) - + for sub_name in expected_subs: sub_path = central_path / sub_name transferred_sessions = [ses.name for ses in sub_path.glob("ses-*")] @@ -48,33 +47,43 @@ def test_date_range_transfer(self, project): subs = ["sub-001", "sub-002"] sessions = [ "ses-001_date-20240301", - "ses-002_date-20240315", + "ses-002_date-20240315", "ses-003_date-20240401", "ses-004_date-20240415", "ses-005_date-20240501", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": 
True, "ephys": True}) test_utils.make_and_check_local_project_folders( - project, "rawdata", subs, sessions, ["behav", "ephys"], datatypes_used + project, + "rawdata", + subs, + sessions, + ["behav", "ephys"], + datatypes_used, ) - + project.upload_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_20240315{canonical_tags.tags('DATETO')}20240401"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_20240315{canonical_tags.tags('DATETO')}20240401" + ], datatype=["behav", "ephys"], ) - + central_path = project.get_central_path() / "rawdata" transferred_subs = list(central_path.glob("sub-*")) - + assert len(transferred_subs) == 2 - + for sub_path in transferred_subs: transferred_sessions = [ses.name for ses in sub_path.glob("ses-*")] - expected_sessions = ["ses-002_date-20240315", "ses-003_date-20240401"] + expected_sessions = [ + "ses-002_date-20240315", + "ses-003_date-20240401", + ] assert sorted(transferred_sessions) == sorted(expected_sessions) def test_time_range_transfer(self, project): @@ -82,27 +91,29 @@ def test_time_range_transfer(self, project): subs = ["sub-001"] sessions = [ "ses-001_time-080000", - "ses-002_time-120000", + "ses-002_time-120000", "ses-003_time-160000", "ses-004_time-200000", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_100000{canonical_tags.tags('TIMETO')}180000"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_100000{canonical_tags.tags('TIMETO')}180000" + ], datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" / "sub-001" transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] - + expected_sessions = ["ses-002_time-120000", "ses-003_time-160000"] assert sorted(transferred_sessions) == 
sorted(expected_sessions) @@ -112,16 +123,16 @@ def test_datetime_range_transfer(self, project): sessions = [ "ses-001_datetime-20240301T080000", "ses-002_datetime-20240315T120000", - "ses-003_datetime-20240401T160000", + "ses-003_datetime-20240401T160000", "ses-004_datetime-20240415T200000", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=subs, @@ -131,11 +142,14 @@ def test_datetime_range_transfer(self, project): ], datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" / "sub-001" transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] - - expected_sessions = ["ses-002_datetime-20240315T120000", "ses-003_datetime-20240401T160000"] + + expected_sessions = [ + "ses-002_datetime-20240315T120000", + "ses-003_datetime-20240401T160000", + ] assert sorted(transferred_sessions) == sorted(expected_sessions) def test_combined_wildcard_and_date_range(self, project): @@ -147,31 +161,34 @@ def test_combined_wildcard_and_date_range(self, project): "ses-003_date-20240401_run-01", "ses-004_date-20240415_run-03", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=[f"sub-{canonical_tags.tags('*')}"], ses_names=[ f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240420_run-01", - f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240420_run-02" + f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240420_run-02", ], datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" transferred_subs = 
list(central_path.glob("sub-*")) - + assert len(transferred_subs) == 3 - + for sub_path in transferred_subs: transferred_sessions = [ses.name for ses in sub_path.glob("ses-*")] - expected_sessions = ["ses-002_date-20240315_run-02", "ses-003_date-20240401_run-01"] + expected_sessions = [ + "ses-002_date-20240315_run-02", + "ses-003_date-20240401_run-01", + ] assert sorted(transferred_sessions) == sorted(expected_sessions) def test_invalid_date_range_errors(self, project): @@ -189,7 +206,9 @@ def test_invalid_date_range_errors(self, project): project.upload_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_20240401{canonical_tags.tags('DATETO')}20240301"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_20240401{canonical_tags.tags('DATETO')}20240301" + ], datatype=["behav"], ) assert "before start" in str(exc_info.value) @@ -198,7 +217,9 @@ def test_invalid_date_range_errors(self, project): project.upload_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_2024030{canonical_tags.tags('DATETO')}20240401"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_2024030{canonical_tags.tags('DATETO')}20240401" + ], datatype=["behav"], ) assert "Invalid" in str(exc_info.value) @@ -210,53 +231,61 @@ def test_no_matches_in_date_range(self, project): "ses-001_date-20240101", "ses-002_date-20240201", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_20240301{canonical_tags.tags('DATETO')}20240401"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_20240301{canonical_tags.tags('DATETO')}20240401" + ], datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" transferred_items = list(central_path.glob("*")) - + if 
transferred_items: - transferred_sub_names = [item.name for item in transferred_items if item.name.startswith("sub-")] + transferred_sub_names = [ + item.name + for item in transferred_items + if item.name.startswith("sub-") + ] assert len(transferred_sub_names) == 0 def test_subject_level_date_range(self, project): """Test date ranges work at the subject level too.""" subs = [ "sub-001_date-20240301", - "sub-002_date-20240315", + "sub-002_date-20240315", "sub-003_date-20240401", "sub-004_date-20240415", ] sessions = ["ses-001"] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", - sub_names=[f"sub-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240410"], + sub_names=[ + f"sub-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240410" + ], ses_names=sessions, datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" transferred_subs = [sub.name for sub in central_path.glob("sub-*")] - + expected_subs = ["sub-002_date-20240315", "sub-003_date-20240401"] assert sorted(transferred_subs) == sorted(expected_subs) @@ -267,43 +296,48 @@ def test_download_with_date_range(self, project): sessions = [ "ses-001_date-20240301", "ses-002_date-20240315", - "ses-003_date-20240401", + "ses-003_date-20240401", "ses-004_date-20240415", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=subs, ses_names=sessions, datatype=["behav"], ) - + os.chdir(project.get_local_path()) local_rawdata = project.get_local_path() / "rawdata" if local_rawdata.exists(): shutil.rmtree(local_rawdata) - + 
project.download_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240401"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_20240310{canonical_tags.tags('DATETO')}20240401" + ], datatype=["behav"], ) - + local_path = project.get_local_path() / "rawdata" downloaded_subs = list(local_path.glob("sub-*")) - + assert len(downloaded_subs) == 2 - + for sub_path in downloaded_subs: downloaded_sessions = [ses.name for ses in sub_path.glob("ses-*")] - expected_sessions = ["ses-002_date-20240315", "ses-003_date-20240401"] + expected_sessions = [ + "ses-002_date-20240315", + "ses-003_date-20240401", + ] assert sorted(downloaded_sessions) == sorted(expected_sessions) def test_edge_case_exact_boundary_dates(self, project): @@ -315,26 +349,28 @@ def test_edge_case_exact_boundary_dates(self, project): "ses-003_date-20240401", "ses-004_date-20240415", ] - + datatypes_used = test_utils.get_all_broad_folders_used(value=False) datatypes_used.update({"behav": True}) test_utils.make_and_check_local_project_folders( project, "rawdata", subs, sessions, ["behav"], datatypes_used ) - + project.upload_custom( "rawdata", sub_names=subs, - ses_names=[f"ses-{canonical_tags.tags('*')}_20240301{canonical_tags.tags('DATETO')}20240401"], + ses_names=[ + f"ses-{canonical_tags.tags('*')}_20240301{canonical_tags.tags('DATETO')}20240401" + ], datatype=["behav"], ) - + central_path = project.get_central_path() / "rawdata" / "sub-001" transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] - + expected_sessions = [ - "ses-001_date-20240301", + "ses-001_date-20240301", "ses-002_date-20240315", - "ses-003_date-20240401" + "ses-003_date-20240401", ] assert sorted(transferred_sessions) == sorted(expected_sessions) From 2e3bcd73e6f438d24353e8123ce31537e6380188 Mon Sep 17 00:00:00 2001 From: Diya <152620955+Diya910@users.noreply.github.com> Date: Tue, 26 Aug 2025 08:33:30 +0530 Subject: [PATCH 15/36] Update 
datashuttle/utils/folders.py Co-authored-by: Joe Ziminski <55797454+JoeZiminski@users.noreply.github.com> --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 2621fa228..ad7c36b1c 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -745,7 +745,7 @@ def strip_start_end_date_from_datetime_tag( NeuroBlueprintError, ) - if end_timepoint <= start_timepoint: + if end_timepoint < start_timepoint: utils.log_and_raise_error( f"End {format_type} is before start {format_type}. Ensure the end datetime is after the start datetime.", NeuroBlueprintError, From e2940b2ce7c49e1b3f73a530b4ea02ab736fdd41 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Tue, 9 Sep 2025 15:24:34 +0100 Subject: [PATCH 16/36] Cover DATETO etc. cases in check_and_format_names so they are not validated. --- datashuttle/utils/folders.py | 2 + datashuttle/utils/formatting.py | 8 +++- tests/test_date_search_range.py | 77 +++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index b314fd297..9470dc1bc 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -505,6 +505,7 @@ def search_with_tags( """ new_all_names: List[str] = [] + for name in all_names: if not ( canonical_tags.tags("*") in name @@ -524,6 +525,7 @@ def search_with_tags( # Handle datetime ranges format_type = None tag = None + if canonical_tags.tags("DATETO") in search_str: tag = canonical_tags.tags("DATETO") format_type = "date" diff --git a/datashuttle/utils/formatting.py b/datashuttle/utils/formatting.py index 0c77cddcf..a0677f6a6 100644 --- a/datashuttle/utils/formatting.py +++ b/datashuttle/utils/formatting.py @@ -63,7 +63,13 @@ def check_and_format_names( names_to_format, reserved_keywords = [], [] for name in names: - if name in canonical_reserved_keywords() or tags("*") in 
name: + if ( + name in canonical_reserved_keywords() + or tags("*") in name + or tags("DATETO") in name + or tags("TIMETO") in name + or tags("DATETIMETO") in name + ): if tags("to") in name: # handle an edge case where use searches with both tags reserved_keywords += update_names_with_range_to_flag( diff --git a/tests/test_date_search_range.py b/tests/test_date_search_range.py index 168d1bb54..bd2f3a917 100644 --- a/tests/test_date_search_range.py +++ b/tests/test_date_search_range.py @@ -374,3 +374,80 @@ def test_edge_case_exact_boundary_dates(self, project): "ses-003_date-20240401", ] assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_with_range_to_flag(self, project): + """Test that the @DATETO@ works well with @TO@""" + subs = ["sub-001"] + + sessions = [ + "ses-001_date-20240301", + "ses-002_date-20240301", + "ses-003_date-20240405", + "ses-004_date-20240415", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + # Select such that ses-002 onwards is selected, and + # ses-004 is excluded based on date. + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[ + f"ses-002@TO@004_20240301{canonical_tags.tags('DATETO')}20240406" + ], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" / "sub-001" + transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] + + expected_sessions = [ + "ses-002_date-20240301", + "ses-003_date-20240405", + ] + assert sorted(transferred_sessions) == sorted(expected_sessions) + + def test_without_wildcard_ses(self, project): + """Test without wildcard ses. + + Including @*@ only led to an uncaught bug as it was triggering a + conditional in `check_and_format_names` that was not triggered by + @DATETO@ alone though it should have been.
+ """ + subs = ["sub-001"] + + sessions = [ + "ses-001_date-20240301", + "ses-002_date-20240301", + "ses-003_date-20240405", + "ses-004_date-20240415", + ] + + datatypes_used = test_utils.get_all_broad_folders_used(value=False) + datatypes_used.update({"behav": True}) + test_utils.make_and_check_local_project_folders( + project, "rawdata", subs, sessions, ["behav"], datatypes_used + ) + + # Select such that ses-002 is selected (and it is in range) + project.upload_custom( + "rawdata", + sub_names=subs, + ses_names=[ + f"ses-002_20240301{canonical_tags.tags('DATETO')}20240302" + ], + datatype=["behav"], + ) + + central_path = project.get_central_path() / "rawdata" / "sub-001" + transferred_sessions = [ses.name for ses in central_path.glob("ses-*")] + + expected_sessions = [ + "ses-002_date-20240301", + ] + assert sorted(transferred_sessions) == sorted(expected_sessions) From 4c1bd4a82fb08963cf0252533f94c0629349b44e Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Mon, 27 Oct 2025 12:55:44 +0000 Subject: [PATCH 17/36] Introduce get_datetime_to_search_regexp function. --- datashuttle/utils/folders.py | 42 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 9470dc1bc..00244a080 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -647,8 +647,8 @@ def filter_names_by_datetime_range( # ----------------------------------------------------------------------------- -def get_expected_datetime_len(format_type: str) -> int: - """Get the expected length of characters for a datetime format. +def get_datetime_to_search_regexp(format_type: str, tag: str) -> str: + """Get the regexp to find date / time / datetime in a string. Parameters ---------- @@ -658,12 +658,23 @@ def get_expected_datetime_len(format_type: str) -> int: Returns ------- int - The number of characters expected for the format + The regexp for the value (e.g. "\d{8}" for date. 
""" - format_str = canonical_tags.get_datetime_formats()[format_type] - today = datetime.now() - return len(today.strftime(format_str)) + if format_type == "date": + regexp = "\d{8}" + + elif format_type == "time": + regexp = "\d{6}" + + elif format_type == "datetime": + regexp = "\d{8}T\d{6}" + + full_tag_regex = ( + rf"({regexp}){re.escape(tag)}({regexp})" + ) + + return full_tag_regex def find_datetime_in_name( @@ -688,11 +699,10 @@ def find_datetime_in_name( None if no match is found """ - expected_len = get_expected_datetime_len(format_type) - full_tag_regex = ( - rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" - ) + full_tag_regex = get_datetime_to_search_regexp(format_type, tag) + match = re.search(full_tag_regex, name) + return match.groups() if match else None @@ -723,10 +733,8 @@ def strip_start_end_date_from_datetime_tag( or end datetime is before start datetime """ - expected_len = get_expected_datetime_len(format_type) - full_tag_regex = ( - rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" - ) + full_tag_regex = get_datetime_to_search_regexp(format_type, tag) + match = re.search(full_tag_regex, search_str) if not match: @@ -787,10 +795,8 @@ def format_and_validate_datetime_search_str( strip_start_end_date_from_datetime_tag(search_str, format_type, tag) # Replace datetime range with wildcard pattern - expected_len = get_expected_datetime_len(format_type) - full_tag_regex = ( - rf"(\d{{{expected_len}}}){re.escape(tag)}(\d{{{expected_len}}})" - ) + full_tag_regex = get_datetime_to_search_regexp(format_type, tag) + return re.sub(full_tag_regex, f"{format_type}-*", search_str) From 84524ec1572029d0d04a87249bf2e04b58ca170f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:56:06 +0000 Subject: [PATCH 18/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- datashuttle/utils/folders.py | 4 
+--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 00244a080..a290bc3e3 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -670,9 +670,7 @@ def get_datetime_to_search_regexp(format_type: str, tag: str) -> str: elif format_type == "datetime": regexp = "\d{8}T\d{6}" - full_tag_regex = ( - rf"({regexp}){re.escape(tag)}({regexp})" - ) + full_tag_regex = rf"({regexp}){re.escape(tag)}({regexp})" return full_tag_regex From 7da63057afd0a2a02c381e1002963074783bd206 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Mon, 27 Oct 2025 12:55:44 +0000 Subject: [PATCH 19/36] Introduce get_datetime_to_search_regexp function. --- datashuttle/utils/folders.py | 50 ++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 9470dc1bc..582b0b82f 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -647,23 +647,40 @@ def filter_names_by_datetime_range( # ----------------------------------------------------------------------------- -def get_expected_datetime_len(format_type: str) -> int: - """Get the expected length of characters for a datetime format. +def get_datetime_to_search_regexp(format_type: str, tag: str) -> str: + r"""Get the full regexp to find the full @DATETIMETO@ and similar tags. + + Users will write "@DATETO@", "