Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions multipleye_settings_preprocessing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@ include_pilots: True
expected_sampling_rate_hz: 1000 # Hz


# NOT YET IN USE; IGNORE. if any sessions do not use the standard stimulus version, specify so here
session_to_stimuli:
- "session":

default_stimulus_version:
# if any sessions do not use the standard stimulus version, uncomment and specify here
stimuli_version_to_session:
# default_stimuli_version: "v3"
# other_versions_mapping:
# v1:
# - "008"
# - "010"
# v2:
# - "011"
# - "012"
# - "013"
2 changes: 2 additions & 0 deletions preprocessing/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

EXPECTED_SAMPLING_RATE_HZ = user_configs["expected_sampling_rate_hz"]

STIMULI_VERSION_TO_SESSION = user_configs["stimuli_version_to_session"]

# GENERAL SETTINGS
TRIAL_COLS = ["trial", "stimulus", "page"]

Expand Down
37 changes: 35 additions & 2 deletions preprocessing/data_collection/multipleye_data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
EYETRACKER_NAMES,
MESSAGE_REGEX,
STIMULUS_NAME_MAPPING,
STIMULI_VERSION_TO_SESSION,
)
from ..utils.conversion import convert_to_time_str
from ..checks.et_quality_checks import (
Expand Down Expand Up @@ -98,6 +99,7 @@ def __init__(
session_folder_regex: str,
included_sessions: list[str] | None = None,
excluded_sessions: list[str] | None = None,
stimuli_version_mapping: dict[str, str] | None = None,
# stimuli: list[Stimulus],
**kwargs,
):
Expand Down Expand Up @@ -134,6 +136,7 @@ def __init__(
self.psychometric_tests = kwargs.get("psychometric_tests", [])
self.excluded_sessions = excluded_sessions
self.included_sessions = included_sessions
self.stimuli_version_mapping = stimuli_version_mapping

open(self.data_root.parent / "preprocessing_logs.txt", "w").close()

Expand Down Expand Up @@ -395,7 +398,21 @@ def create_from_data_folder(
r"\d\d\d" + f"_{stimulus_language}_{country}_{lab_number}" + r"_ET\d"
)

stimulus_folder_path = data_dir / f"stimuli_{data_folder_name}"
if STIMULI_VERSION_TO_SESSION:
stimulus_folder_path = (
data_dir
/ f"stimuli_{data_folder_name}_{STIMULI_VERSION_TO_SESSION['default_stimuli_version']}"
)
stim_version_mapping = {
item: key
for key, values in STIMULI_VERSION_TO_SESSION[
"other_versions_mapping"
].items()
for item in values
}
else:
stimulus_folder_path = data_dir / f"stimuli_{data_folder_name}"

config_file = (
stimulus_folder_path
/ "config"
Expand Down Expand Up @@ -444,6 +461,9 @@ def create_from_data_folder(
ps_tests_path=ps_tests_path,
included_sessions=included_sessions,
excluded_sessions=excluded_sessions,
stimuli_version_mapping=stim_version_mapping
if STIMULI_VERSION_TO_SESSION
else None,
)

def create_sanity_check_report(
Expand Down Expand Up @@ -636,8 +656,21 @@ def prepare_session_level_information(self):
f"session {session}. Please check the files carefully."
)

stimulus_dir = self.stimulus_dir

if self.stimuli_version_mapping:
if session[:3] in self.stimuli_version_mapping:
# replace the stimulus order version in the stimulus folder name with the one from the mapping
old_stimulus_dir_name = self.stimulus_dir.name
new_stimulus_dir_name = re.sub(
r"v(\d+)",
f"{self.stimuli_version_mapping[session[:3]]}",
old_stimulus_dir_name,
)
stimulus_dir = self.stimulus_dir.parent / new_stimulus_dir_name

self.sessions[session].stimuli = self._load_session_stimuli(
self.stimulus_dir,
stimulus_dir,
self.language,
self.country,
self.lab_number,
Expand Down
146 changes: 108 additions & 38 deletions preprocessing/scripts/prepare_language_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,55 +102,125 @@ def prepare_language_folder(data_collection_name):
# remove the zip file after extraction
pilot_participant_folder.unlink()

stimulus_folder_path = data_folder_path / f"stimuli_{data_collection_name}"

if not stimulus_folder_path.exists():
print(
f"The stimulus folder stimuli_{data_collection_name} does not exist. Check and if necessary, ask team to upload."
)
else:
config_path = stimulus_folder_path / "config"
if not config_path.exists():
raise FileNotFoundError(
f"The stimulus config folder not found in '{stimulus_folder_path}'. "
"Please check and restructure or possibly unzip the stimulus folder."
stimuli_version_to_session = constants.STIMULI_VERSION_TO_SESSION
if stimuli_version_to_session:
if not stimuli_version_to_session.get(
"default_stimuli_version"
) or not stimuli_version_to_session.get("other_versions_mapping"):
raise ValueError(
"Invalid stimuli version mapping in the config. Please provide both 'default_stimuli_version' and 'other_versions_mapping' keys with valid values."
)

# if aoi files are not yet split into questions and texts, do it here:
aoi_path = (
data_folder_path
/ stimulus_folder_path
/ f"aoi_stimuli_{lang}_{country}_{lab_no}"
)
# check that the values of default_stimuli_version and other_versions_mapping.keys use correct format
version_pattern = r"^v\d+$"
if not re.match(
version_pattern, stimuli_version_to_session["default_stimuli_version"]
):
raise ValueError(
f"Invalid format for default_stimuli_version: '{stimuli_version_to_session['default_stimuli_version']}'. Expected format is 'v<number>'."
)
for version in stimuli_version_to_session["other_versions_mapping"].keys():
if not re.match(version_pattern, version):
raise ValueError(
f"Invalid format for stimuli version in other_versions_mapping: '{version}'. Expected format is 'v<number>'."
)

# get all aoi files, if there are only 12 files, they are not yet split
aoi_files = list(aoi_path.glob("*.csv"))
if len(aoi_files) == 12:
print("Splitting AOI files into text and question AOIs...")
for aoi_file in aoi_files:
aoi_df = pd.read_csv(aoi_file)
# split the aoi_df into two parts, one for the stimulus and one for the questions
aoi_df_texts = aoi_df[~aoi_df["page"].str.contains("question", na=False)]
aoi_df_texts.drop(
columns=["question_image_version"], inplace=True, errors="ignore"
# check that the default_stimuli_version is not also included in the other_versions_mapping keys
if (
stimuli_version_to_session["default_stimuli_version"]
in stimuli_version_to_session["other_versions_mapping"].keys()
):
raise ValueError(
f"The default_stimuli_version '{stimuli_version_to_session['default_stimuli_version']}' should not be included in the other_versions_mapping keys."
)
aoi_df_questions = aoi_df[aoi_df["page"].str.contains("question", na=False)]

aoi_df_texts.to_csv(aoi_file, sep=",", index=False, encoding="UTF-8")
# check that there are no duplicate versions in the other_versions_mapping keys
other_versions = list(
stimuli_version_to_session["other_versions_mapping"].keys()
)
if len(other_versions) != len(set(other_versions)):
raise ValueError(
"Duplicate stimulus versions found in other_versions_mapping keys. Please ensure each version appears only once."
)

question_path = aoi_path / (aoi_file.stem + "_questions" + aoi_file.suffix)
aoi_df_questions.to_csv(
question_path, sep=",", index=False, encoding="UTF-8"
stimulus_folder_paths = [
data_folder_path
/ f"stimuli_{data_collection_name}_{stimuli_version_to_session['default_stimuli_version']}"
]
for version in stimuli_version_to_session["other_versions_mapping"].keys():
stimulus_folder_paths.append(
data_folder_path / f"stimuli_{data_collection_name}_{version}"
)

elif len(aoi_files) == 24:
pass
# check that there are no sessions that are mapped to multiple stimulus versions
all_mapped_sessions = []
for version, sessions in stimuli_version_to_session[
"other_versions_mapping"
].items():
for session in sessions:
if session in all_mapped_sessions:
raise ValueError(
f"Session '{session}' is mapped to multiple stimulus versions. Please ensure each session is mapped to only one version."
)
all_mapped_sessions.append(session)
else:
raise ValueError(
f"Unexpected number of AOI files ({len(aoi_files)}) found in '{aoi_path}'. "
"Expected 12 (not split) or 24 (already split into texts and questions)."
stimulus_folder_paths = [data_folder_path / f"stimuli_{data_collection_name}"]

for stimulus_folder_path in stimulus_folder_paths:
if not stimulus_folder_path.exists():
print(
f"The stimulus folder '{stimulus_folder_path}' does not exist. Check and if necessary, ask the team to upload. If you have more than one stimulus version, make sure to specify the mapping in the config file."
)
else:
config_path = stimulus_folder_path / "config"
if not config_path.exists():
raise FileNotFoundError(
f"The stimulus config folder not found in '{stimulus_folder_path}'. "
"Please check and restructure or possibly unzip the stimulus folder."
)

# if aoi files are not yet split into questions and texts, do it here:
aoi_path = (
data_folder_path
/ stimulus_folder_path
/ f"aoi_stimuli_{lang}_{country}_{lab_no}"
)

# get all AOI files; if there are only 12, they have not yet been split
aoi_files = list(aoi_path.glob("*.csv"))
if len(aoi_files) == 12:
print(
f"Splitting AOI files into text and question AOIs for {stimulus_folder_path.name}..."
)
for aoi_file in aoi_files:
aoi_df = pd.read_csv(aoi_file)
# split the aoi_df into two parts, one for the stimulus and one for the questions
aoi_df_texts = aoi_df[
~aoi_df["page"].str.contains("question", na=False)
]
aoi_df_texts.drop(
columns=["question_image_version"], inplace=True, errors="ignore"
)
aoi_df_questions = aoi_df[
aoi_df["page"].str.contains("question", na=False)
]

aoi_df_texts.to_csv(aoi_file, sep=",", index=False, encoding="UTF-8")

question_path = aoi_path / (
aoi_file.stem + "_questions" + aoi_file.suffix
)
aoi_df_questions.to_csv(
question_path, sep=",", index=False, encoding="UTF-8"
)
elif len(aoi_files) == 24:
pass
else:
raise ValueError(
f"Unexpected number of AOI files ({len(aoi_files)}) found in '{aoi_path}'. "
"Expected 12 (not split) or 24 (already split into texts and questions)."
)


def extract_stimulus_version_number_from_asc(asc_file_path: Path) -> int:
pattern = r"MSG\s+\d+\s+stimulus_order_version:\s+(?P<version_num>\d\d?\d?)\n"
Expand Down