Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 115 additions & 12 deletions protzilla/importing/ms_data_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@


def max_quant_import(
file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum"
file_path: str,
intensity_name: str,
map_to_uniprot=False,
aggregation_method: str = "Sum",
) -> dict:
assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
try:
Expand All @@ -34,15 +37,28 @@ def max_quant_import(
c[len(intensity_name) + 1 :] for c in intensity_df.columns
]
intensity_df = intensity_df.assign(**{"Protein ID": protein_groups})
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
return transform_and_clean(
intensity_df, intensity_name, map_to_uniprot, aggregation_method
)

except Exception as e:
msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid Max Quant file."
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
return dict(
messages=[
dict(
level=logging.ERROR,
msg=msg,
trace=format_trace(traceback.format_exception(e)),
)
]
)


def ms_fragger_import(
file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum"
file_path: str,
intensity_name: str,
map_to_uniprot=False,
aggregation_method: str = "Sum",
) -> dict:
assert intensity_name in [
"Intensity",
Expand Down Expand Up @@ -87,13 +103,25 @@ def ms_fragger_import(
)
intensity_df = intensity_df.assign(**{"Protein ID": protein_groups})

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
return transform_and_clean(
intensity_df, intensity_name, map_to_uniprot, aggregation_method
)
except Exception as e:
msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid MS Fragger file."
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
return dict(
messages=[
dict(
level=logging.ERROR,
msg=msg,
trace=format_trace(traceback.format_exception(e)),
)
]
)


def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum") -> dict:
def diann_import(
file_path, map_to_uniprot=False, aggregation_method: str = "Sum"
) -> dict:
try:
df = pd.read_csv(
file_path,
Expand All @@ -117,14 +145,86 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum"

intensity_name = "Intensity"

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
return transform_and_clean(
intensity_df, intensity_name, map_to_uniprot, aggregation_method
)
except Exception as e:
msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid DIA-NN MS file."
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
return dict(
messages=[
dict(
level=logging.ERROR,
msg=msg,
trace=format_trace(traceback.format_exception(e)),
)
]
)


def simple_csv_import(
    file_path: str, map_to_uniprot=False, aggregation_method: str = "Sum"
) -> dict:
    """
    Read a minimal CSV layout: a "Protein ID" column plus one intensity
    column per sample, and hand the result to the shared cleaning pipeline.

    :param file_path: Path to the CSV file
    :type file_path: str
    :param map_to_uniprot: Whether to map protein IDs to UniProt IDs
    :type map_to_uniprot: bool
    :param aggregation_method: Method to aggregate duplicate protein groups ("Sum", "Mean", or "Median")
    :type aggregation_method: str
    :return: Dictionary containing the processed dataframe and metadata
    """
    try:
        # NOTE(review): 0 is treated as a missing value (na_values), mirroring
        # the other importers in this module — confirm this is intended for CSV.
        raw_df = pd.read_csv(
            file_path,
            sep=",",
            low_memory=False,
            na_values=["", 0],
            keep_default_na=True,
        )

        # The protein identifier column is mandatory.
        if "Protein ID" not in raw_df.columns:
            msg = "Column 'Protein ID' not found in the provided file. Please check your file format."
            return dict(messages=[dict(level=logging.ERROR, msg=msg)])

        # Every remaining column is interpreted as one sample's intensities.
        sample_columns = [column for column in raw_df.columns if column != "Protein ID"]
        if len(sample_columns) == 0:
            msg = "No sample columns found in the provided file. Please check your file format."
            return dict(messages=[dict(level=logging.ERROR, msg=msg)])

        # Keep only the identifier plus the sample columns, in a fixed order.
        intensity_df = raw_df[["Protein ID", *sample_columns]]

        # The simple format carries no intensity-type information, so a
        # generic name is used for the long-format output.
        return transform_and_clean(
            intensity_df, "Intensity", map_to_uniprot, aggregation_method
        )

    except Exception as e:
        msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid CSV file."
        return dict(
            messages=[
                dict(
                    level=logging.ERROR,
                    msg=msg,
                    trace=format_trace(traceback.format_exception(e)),
                )
            ]
        )


def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum"
df: pd.DataFrame,
intensity_name: str,
map_to_uniprot: bool,
aggregation_method: str = "Sum",
) -> dict:
"""
Transforms a dataframe that is read from a file in wide format into long format,
Expand Down Expand Up @@ -158,7 +258,9 @@ def transform_and_clean(
# applies the selected aggregation to duplicate protein groups, NaN if all are NaN, aggregation of numbers otherwise
aggregation_method = aggregation_method.lower()
agg_kwargs = {"sum": {"min_count": 1}, "median": {}, "mean": {}}
df = df.groupby("Protein ID", as_index=False).agg(aggregation_method, **agg_kwargs[aggregation_method])
df = df.groupby("Protein ID", as_index=False).agg(
aggregation_method, **agg_kwargs[aggregation_method]
)

df = df.assign(Gene=lambda _: np.nan) # add deprecated genes column

Expand Down Expand Up @@ -222,6 +324,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
for group in found_ids_per_group:
all_ids_of_group = []
for old_id in group:
# Issue 574: ENSEMBL ids are not mapped to uniprot
if uniprot_regex.search(old_id):
all_ids_of_group.append(old_id)
elif map_to_uniprot:
Expand All @@ -230,7 +333,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
all_ids_of_group.extend(new_ids)
else:
all_ids_of_group.append(old_id)
new_groups.append(all_ids_of_group[0] if all_ids_of_group else '')
new_groups.append(all_ids_of_group[0] if all_ids_of_group else "")
return new_groups, removed_protein_ids


Expand Down
22 changes: 20 additions & 2 deletions protzilla/methods/importing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
diann_import,
max_quant_import,
ms_fragger_import,
simple_csv_import,
)
from protzilla.importing.peptide_import import peptide_import, evidence_import
from protzilla.steps import Step, StepManager
Expand Down Expand Up @@ -51,7 +52,9 @@ def method(self, inputs):
class MsFraggerImport(ImportingStep):
display_name = "MS Fragger Combined Protein Import"
operation = "Protein Data Import"
method_description = "Import the combined_protein.tsv file form output of MS Fragger"
method_description = (
"Import the combined_protein.tsv file form output of MS Fragger"
)

input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"]
output_keys = ["protein_df"]
Expand All @@ -60,6 +63,21 @@ def method(self, inputs):
return ms_fragger_import(**inputs)


class SimpleCSVImport(ImportingStep):
    """Importing step that reads protein intensities from a plain CSV file."""

    display_name = "Simple CSV Intensities Import"
    operation = "Protein Data Import"
    method_description = (
        "Import protein intensities from a csv file. The csv requires a column 'Protein ID', "
        "the remaining column names should be the sample names. The values should be the intensities."
    )

    # Keys that are forwarded from the user-provided inputs to the importer.
    input_keys = ["file_path", "map_to_uniprot", "aggregation_method"]
    output_keys = ["protein_df"]

    def method(self, inputs):
        # Delegate the actual parsing to the importing module.
        result = simple_csv_import(**inputs)
        return result


class MetadataImport(ImportingStep):
display_name = "Metadata Import"
operation = "metadataimport"
Expand Down Expand Up @@ -139,4 +157,4 @@ class EvidenceImport(ImportingStep):
output_keys = ["peptide_df"]

def method(self, inputs):
return evidence_import(**inputs)
return evidence_import(**inputs)
119 changes: 115 additions & 4 deletions tests/protzilla/importing/test_ms_data_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,117 @@
from protzilla.importing import ms_data_import


def simple_csv_import_intensity_df():
    """Build the dataframe expected from importing the simple CSV fixture."""
    proteins = ["A2A5R2", "A2A7S8", "A2A863", "A2AGT5", "A2AJ76"]
    sample1_intensities = [18210618.0, 4133918.5, 144354336.0, 5645782.0, 9055790.0]
    sample2_intensities = [25468630.0, 7812505.5, 139428224.0, 3202878.8, 19467296.0]
    df = pd.DataFrame(
        {
            "Sample": ["Sample1"] * 5 + ["Sample2"] * 5,
            "Protein ID": proteins * 2,
            # Gene column is deprecated and holds only placeholders.
            "Gene": [np.nan] * 10,
            "Intensity": sample1_intensities + sample2_intensities,
        }
    )
    # Match the ordering produced by the importer.
    return df.sort_values(by=["Sample", "Protein ID"], ignore_index=True)


def test_simple_csv_import():
    """Importing the simple CSV fixture yields the expected long-format dataframe."""
    outputs = ms_data_import.simple_csv_import(
        file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data.csv",
    )

    # The Gene column only holds placeholder NaNs, so it is excluded here.
    expected_df = simple_csv_import_intensity_df().drop(columns=["Gene"])
    actual_df = outputs["protein_df"].drop(columns=["Gene"])

    pd.testing.assert_frame_equal(expected_df, actual_df)


def test_simple_csv_import_file_not_exist():
    """A nonexistent path produces an error message instead of a dataframe."""
    outputs = ms_data_import.simple_csv_import(
        file_path="non_existent_file_path",
    )

    assert "protein_df" not in outputs
    assert "messages" in outputs
    messages = outputs["messages"]
    assert logging.ERROR in [message["level"] for message in messages]
    assert any("found" in message["msg"].lower() for message in messages)


def test_simple_csv_import_no_protein_id_column():
    """A CSV without a 'Protein ID' column is rejected with an error message."""
    csv_path = (
        f"{PROJECT_PATH}/tests/test_data/simple_csv_data/"
        "simple_protein_data_no_protein_column.csv"
    )
    outputs = ms_data_import.simple_csv_import(file_path=csv_path)

    assert "protein_df" not in outputs
    assert "messages" in outputs
    error_messages = [m for m in outputs["messages"] if m["level"] == logging.ERROR]
    assert error_messages
    assert any("Protein ID" in m["msg"] for m in outputs["messages"])


def test_simple_csv_import_no_sample_columns():
    """A CSV containing only the ID column is rejected with an error message."""
    csv_path = (
        f"{PROJECT_PATH}/tests/test_data/simple_csv_data/"
        "simple_protein_data_no_samples.csv"
    )
    outputs = ms_data_import.simple_csv_import(file_path=csv_path)

    assert "protein_df" not in outputs
    assert "messages" in outputs
    error_levels = [message["level"] for message in outputs["messages"]]
    assert logging.ERROR in error_levels
    assert any(
        "No sample columns" in message["msg"] for message in outputs["messages"]
    )


def test_simple_csv_import_aggregation_methods():
    """Each supported aggregation method completes on data with duplicates."""
    csv_path = (
        f"{PROJECT_PATH}/tests/test_data/simple_csv_data/"
        "simple_protein_data_duplicates.csv"
    )
    for aggregation in ("Sum", "Mean", "Median"):
        outputs = ms_data_import.simple_csv_import(
            file_path=csv_path,
            aggregation_method=aggregation,
        )

        # Exact aggregated values depend on the fixture; here we only
        # verify that processing completes and yields a dataframe.
        assert "protein_df" in outputs
        assert outputs["protein_df"] is not None


def test_simple_csv_import_filters_contaminants():
    """Contaminant (CON__) proteins are removed and reported separately."""
    outputs = ms_data_import.simple_csv_import(
        file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_contaminants.csv",
    )

    protein_groups = outputs["protein_df"]["Protein ID"].unique().tolist()

    # No remaining protein group may still contain a CON__ member.
    for group in protein_groups:
        assert not any(member.startswith("CON__") for member in group.split(";"))

    # The filtered contaminants must be reported back to the caller.
    assert len(outputs["contaminants"]) > 0


# Issue 574: ENSEMBL ids are not mapped to uniprot. Once resolved, uncomment this
# @patch("protzilla.importing.ms_data_import.map_ids_to_uniprot")
# def test_simple_csv_import_with_mapping(ids_to_uniprot_mock):
# """Test UniProt ID mapping functionality"""
# ids_to_uniprot_mock.return_value = {"ENSP12345678901": ["P54321"]}
#
# outputs = ms_data_import.simple_csv_import(
# file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_ensembl.csv",
# map_to_uniprot=True)
#
# # Check that the mapped IDs are present in the output
# protein_ids = outputs["protein_df"]["Protein ID"].unique().tolist()
# assert "P54321" in protein_ids


def ms_fragger_import_intensity_df(intensity_name):
ms_fragger_list = (
["DDM_0pt1_01", "A2A5R2", "Arfgef2"],
Expand Down Expand Up @@ -218,7 +329,9 @@ def test_max_quant_import_no_protein_ids_column():
assert "protein_df" not in outputs
assert "messages" in outputs
assert any(message["level"] == logging.ERROR for message in outputs["messages"])
assert any("Majority protein IDs" in message["msg"] for message in outputs["messages"])
assert any(
"Majority protein IDs" in message["msg"] for message in outputs["messages"]
)


def test_max_quant_import_invalid_data():
Expand Down Expand Up @@ -310,9 +423,7 @@ def test_transform_and_clean():
["C", "Q11111", np.nan],
]
df = pd.DataFrame(data, columns=columns)
outputs = ms_data_import.transform_and_clean(
df, "intensity", map_to_uniprot=False
)
outputs = ms_data_import.transform_and_clean(df, "intensity", map_to_uniprot=False)
expected_df = pd.DataFrame(expected_output, columns=out_col)

# we do not care about the genes column, it is deprecated (and replaced by nan)
Expand Down
6 changes: 6 additions & 0 deletions tests/test_data/simple_csv_data/simple_protein_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Protein ID,Sample1,Sample2
A2A5R2,18210618,25468630
A2A7S8,4133918.5,7812505.5
A2A863,144354336,139428224
A2AGT5,5645782,3202878.8
A2AJ76,9055790,19467296
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Protein ID,Sample1,Sample2
A2A5R2,18210618,25468630
A2A7S8,4133918.5,7812505.5
A2A863,144354336,139428224
A2AGT5,5645782,3202878.8
CON__A2AJ76,9055790,19467296
Loading
Loading