Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 115 additions & 12 deletions protzilla/importing/ms_data_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@


def max_quant_import(
file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum"
file_path: str,
intensity_name: str,
map_to_uniprot=False,
aggregation_method: str = "Sum",
) -> dict:
assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"]
try:
Expand All @@ -34,15 +37,28 @@ def max_quant_import(
c[len(intensity_name) + 1 :] for c in intensity_df.columns
]
intensity_df = intensity_df.assign(**{"Protein ID": protein_groups})
return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
return transform_and_clean(
intensity_df, intensity_name, map_to_uniprot, aggregation_method
)

except Exception as e:
msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid Max Quant file."
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
return dict(
messages=[
dict(
level=logging.ERROR,
msg=msg,
trace=format_trace(traceback.format_exception(e)),
)
]
)


def ms_fragger_import(
file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum"
file_path: str,
intensity_name: str,
map_to_uniprot=False,
aggregation_method: str = "Sum",
) -> dict:
assert intensity_name in [
"Intensity",
Expand Down Expand Up @@ -87,13 +103,25 @@ def ms_fragger_import(
)
intensity_df = intensity_df.assign(**{"Protein ID": protein_groups})

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
return transform_and_clean(
intensity_df, intensity_name, map_to_uniprot, aggregation_method
)
except Exception as e:
msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid MS Fragger file."
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
return dict(
messages=[
dict(
level=logging.ERROR,
msg=msg,
trace=format_trace(traceback.format_exception(e)),
)
]
)


def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum") -> dict:
def diann_import(
file_path, map_to_uniprot=False, aggregation_method: str = "Sum"
) -> dict:
try:
df = pd.read_csv(
file_path,
Expand All @@ -117,14 +145,86 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum"

intensity_name = "Intensity"

return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method)
return transform_and_clean(
intensity_df, intensity_name, map_to_uniprot, aggregation_method
)
except Exception as e:
msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid DIA-NN MS file."
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
return dict(
messages=[
dict(
level=logging.ERROR,
msg=msg,
trace=format_trace(traceback.format_exception(e)),
)
]
)


def simple_csv_import(
    file_path: str, map_to_uniprot=False, aggregation_method: str = "Sum"
) -> dict:
    """
    Read a minimal CSV layout: a "Protein ID" column plus one intensity
    column per sample, and hand the result to the shared cleaning pipeline.

    :param file_path: Path to the CSV file
    :type file_path: str
    :param map_to_uniprot: Whether to map protein IDs to UniProt IDs
    :type map_to_uniprot: bool
    :param aggregation_method: Method to aggregate duplicate protein groups ("Sum", "Mean", or "Median")
    :type aggregation_method: str
    :return: Dictionary containing the processed dataframe and metadata
    """
    try:
        # NOTE(review): 0 is treated as a missing value (na_values), mirroring
        # the other importers in this module — confirm this is intended for CSV.
        raw_df = pd.read_csv(
            file_path,
            sep=",",
            low_memory=False,
            na_values=["", 0],
            keep_default_na=True,
        )

        # The protein identifier column is mandatory.
        if "Protein ID" not in raw_df.columns:
            msg = "Column 'Protein ID' not found in the provided file. Please check your file format."
            return dict(messages=[dict(level=logging.ERROR, msg=msg)])

        # Every remaining column is interpreted as one sample's intensities.
        sample_columns = [column for column in raw_df.columns if column != "Protein ID"]
        if len(sample_columns) == 0:
            msg = "No sample columns found in the provided file. Please check your file format."
            return dict(messages=[dict(level=logging.ERROR, msg=msg)])

        # Keep only the identifier plus the sample columns, in a fixed order.
        intensity_df = raw_df[["Protein ID", *sample_columns]]

        # The simple format carries no intensity-type information, so a
        # generic name is used for the long-format output.
        return transform_and_clean(
            intensity_df, "Intensity", map_to_uniprot, aggregation_method
        )

    except Exception as e:
        msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid CSV file."
        return dict(
            messages=[
                dict(
                    level=logging.ERROR,
                    msg=msg,
                    trace=format_trace(traceback.format_exception(e)),
                )
            ]
        )


def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum"
df: pd.DataFrame,
intensity_name: str,
map_to_uniprot: bool,
aggregation_method: str = "Sum",
) -> dict:
"""
Transforms a dataframe that is read from a file in wide format into long format,
Expand Down Expand Up @@ -158,7 +258,9 @@ def transform_and_clean(
# applies the selected aggregation to duplicate protein groups, NaN if all are NaN, aggregation of numbers otherwise
aggregation_method = aggregation_method.lower()
agg_kwargs = {"sum": {"min_count": 1}, "median": {}, "mean": {}}
df = df.groupby("Protein ID", as_index=False).agg(aggregation_method, **agg_kwargs[aggregation_method])
df = df.groupby("Protein ID", as_index=False).agg(
aggregation_method, **agg_kwargs[aggregation_method]
)

df = df.assign(Gene=lambda _: np.nan) # add deprecated genes column

Expand Down Expand Up @@ -222,6 +324,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
for group in found_ids_per_group:
all_ids_of_group = []
for old_id in group:
# Issue 574: ENSEMBL ids are not mapped to uniprot
if uniprot_regex.search(old_id):
all_ids_of_group.append(old_id)
elif map_to_uniprot:
Expand All @@ -230,7 +333,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
all_ids_of_group.extend(new_ids)
else:
all_ids_of_group.append(old_id)
new_groups.append(all_ids_of_group[0] if all_ids_of_group else '')
new_groups.append(all_ids_of_group[0] if all_ids_of_group else "")
return new_groups, removed_protein_ids


Expand Down
22 changes: 20 additions & 2 deletions protzilla/methods/importing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
diann_import,
max_quant_import,
ms_fragger_import,
simple_csv_import,
)
from protzilla.importing.peptide_import import peptide_import, evidence_import
from protzilla.steps import Step, StepManager
Expand Down Expand Up @@ -51,7 +52,9 @@ def method(self, inputs):
class MsFraggerImport(ImportingStep):
display_name = "MS Fragger Combined Protein Import"
operation = "Protein Data Import"
method_description = "Import the combined_protein.tsv file form output of MS Fragger"
method_description = (
"Import the combined_protein.tsv file form output of MS Fragger"
)

input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"]
output_keys = ["protein_df"]
Expand All @@ -60,6 +63,21 @@ def method(self, inputs):
return ms_fragger_import(**inputs)


class SimpleCSVImport(ImportingStep):
    """Importing step that reads protein intensities from a plain CSV file."""

    display_name = "Simple CSV Intensities Import"
    operation = "Protein Data Import"
    method_description = (
        "Import protein intensities from a csv file. The csv requires a column 'Protein ID', "
        "the remaining column names should be the sample names. The values should be the intensities."
    )

    # Keys that are forwarded from the user-provided inputs to the importer.
    input_keys = ["file_path", "map_to_uniprot", "aggregation_method"]
    output_keys = ["protein_df"]

    def method(self, inputs):
        # Delegate the actual parsing to the importing module.
        result = simple_csv_import(**inputs)
        return result


class MetadataImport(ImportingStep):
display_name = "Metadata Import"
operation = "metadataimport"
Expand Down Expand Up @@ -139,4 +157,4 @@ class EvidenceImport(ImportingStep):
output_keys = ["peptide_df"]

def method(self, inputs):
return evidence_import(**inputs)
return evidence_import(**inputs)
119 changes: 115 additions & 4 deletions tests/protzilla/importing/test_ms_data_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,117 @@
from protzilla.importing import ms_data_import


def simple_csv_import_intensity_df():
    """Build the dataframe expected from importing the simple CSV fixture."""
    proteins = ["A2A5R2", "A2A7S8", "A2A863", "A2AGT5", "A2AJ76"]
    sample1_intensities = [18210618.0, 4133918.5, 144354336.0, 5645782.0, 9055790.0]
    sample2_intensities = [25468630.0, 7812505.5, 139428224.0, 3202878.8, 19467296.0]
    df = pd.DataFrame(
        {
            "Sample": ["Sample1"] * 5 + ["Sample2"] * 5,
            "Protein ID": proteins * 2,
            # Gene column is deprecated and holds only placeholders.
            "Gene": [np.nan] * 10,
            "Intensity": sample1_intensities + sample2_intensities,
        }
    )
    # Match the ordering produced by the importer.
    return df.sort_values(by=["Sample", "Protein ID"], ignore_index=True)


def test_simple_csv_import():
    """Importing the simple CSV fixture yields the expected long-format dataframe."""
    outputs = ms_data_import.simple_csv_import(
        file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data.csv",
    )

    # The Gene column only holds placeholder NaNs, so it is excluded here.
    expected_df = simple_csv_import_intensity_df().drop(columns=["Gene"])
    actual_df = outputs["protein_df"].drop(columns=["Gene"])

    pd.testing.assert_frame_equal(expected_df, actual_df)


def test_simple_csv_import_file_not_exist():
    """A nonexistent path produces an error message instead of a dataframe."""
    outputs = ms_data_import.simple_csv_import(
        file_path="non_existent_file_path",
    )

    assert "protein_df" not in outputs
    assert "messages" in outputs
    messages = outputs["messages"]
    assert logging.ERROR in [message["level"] for message in messages]
    assert any("found" in message["msg"].lower() for message in messages)


def test_simple_csv_import_no_protein_id_column():
    """A CSV without a 'Protein ID' column is rejected with an error message."""
    csv_path = (
        f"{PROJECT_PATH}/tests/test_data/simple_csv_data/"
        "simple_protein_data_no_protein_column.csv"
    )
    outputs = ms_data_import.simple_csv_import(file_path=csv_path)

    assert "protein_df" not in outputs
    assert "messages" in outputs
    error_messages = [m for m in outputs["messages"] if m["level"] == logging.ERROR]
    assert error_messages
    assert any("Protein ID" in m["msg"] for m in outputs["messages"])


def test_simple_csv_import_no_sample_columns():
    """A CSV containing only the ID column is rejected with an error message."""
    csv_path = (
        f"{PROJECT_PATH}/tests/test_data/simple_csv_data/"
        "simple_protein_data_no_samples.csv"
    )
    outputs = ms_data_import.simple_csv_import(file_path=csv_path)

    assert "protein_df" not in outputs
    assert "messages" in outputs
    error_levels = [message["level"] for message in outputs["messages"]]
    assert logging.ERROR in error_levels
    assert any(
        "No sample columns" in message["msg"] for message in outputs["messages"]
    )


def test_simple_csv_import_aggregation_methods():
    """Each supported aggregation method completes on data with duplicates."""
    csv_path = (
        f"{PROJECT_PATH}/tests/test_data/simple_csv_data/"
        "simple_protein_data_duplicates.csv"
    )
    for aggregation in ("Sum", "Mean", "Median"):
        outputs = ms_data_import.simple_csv_import(
            file_path=csv_path,
            aggregation_method=aggregation,
        )

        # Exact aggregated values depend on the fixture; here we only
        # verify that processing completes and yields a dataframe.
        assert "protein_df" in outputs
        assert outputs["protein_df"] is not None


def test_simple_csv_import_filters_contaminants():
    """Contaminant (CON__) proteins are removed and reported separately."""
    outputs = ms_data_import.simple_csv_import(
        file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_contaminants.csv",
    )

    protein_groups = outputs["protein_df"]["Protein ID"].unique().tolist()

    # No remaining protein group may still contain a CON__ member.
    for group in protein_groups:
        assert not any(member.startswith("CON__") for member in group.split(";"))

    # The filtered contaminants must be reported back to the caller.
    assert len(outputs["contaminants"]) > 0


# Issue 574: ENSEMBL ids are not mapped to uniprot. Once resolved, uncomment this
# @patch("protzilla.importing.ms_data_import.map_ids_to_uniprot")
# def test_simple_csv_import_with_mapping(ids_to_uniprot_mock):
# """Test UniProt ID mapping functionality"""
# ids_to_uniprot_mock.return_value = {"ENSP12345678901": ["P54321"]}
#
# outputs = ms_data_import.simple_csv_import(
# file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_ensembl.csv",
# map_to_uniprot=True)
#
# # Check that the mapped IDs are present in the output
# protein_ids = outputs["protein_df"]["Protein ID"].unique().tolist()
# assert "P54321" in protein_ids


def ms_fragger_import_intensity_df(intensity_name):
ms_fragger_list = (
["DDM_0pt1_01", "A2A5R2", "Arfgef2"],
Expand Down Expand Up @@ -218,7 +329,9 @@ def test_max_quant_import_no_protein_ids_column():
assert "protein_df" not in outputs
assert "messages" in outputs
assert any(message["level"] == logging.ERROR for message in outputs["messages"])
assert any("Majority protein IDs" in message["msg"] for message in outputs["messages"])
assert any(
"Majority protein IDs" in message["msg"] for message in outputs["messages"]
)


def test_max_quant_import_invalid_data():
Expand Down Expand Up @@ -310,9 +423,7 @@ def test_transform_and_clean():
["C", "Q11111", np.nan],
]
df = pd.DataFrame(data, columns=columns)
outputs = ms_data_import.transform_and_clean(
df, "intensity", map_to_uniprot=False
)
outputs = ms_data_import.transform_and_clean(df, "intensity", map_to_uniprot=False)
expected_df = pd.DataFrame(expected_output, columns=out_col)

# we do not care about the genes column, it is deprecated (and replaced by nan)
Expand Down
6 changes: 6 additions & 0 deletions tests/test_data/simple_csv_data/simple_protein_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Protein ID,Sample1,Sample2
A2A5R2,18210618,25468630
A2A7S8,4133918.5,7812505.5
A2A863,144354336,139428224
A2AGT5,5645782,3202878.8
A2AJ76,9055790,19467296
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Protein ID,Sample1,Sample2
A2A5R2,18210618,25468630
A2A7S8,4133918.5,7812505.5
A2A863,144354336,139428224
A2AGT5,5645782,3202878.8
CON__A2AJ76,9055790,19467296
Loading
Loading