cschlaffner · henninggaertner · Jun 21, 2024 · Jun 21, 2024
diff --git a/protzilla/importing/metadata_import.py b/protzilla/importing/metadata_import.py
@@ -122,27 +122,34 @@ def metadata_import_method_diann(
         return dict(
             messages=[dict(level=logging.ERROR, msg=msg)],
         )
+    meta_df.rename(columns={"sample name": "Sample"}, inplace=True)
 
     if file_path.startswith(
         f"{PROJECT_PATH}/tests/protzilla/importing/conversion_tmp_"
     ):
         os.remove(file_path)
 
     if groupby_sample:
-        # we want to take the median of all MS runs (column "Sample" in the intensity df) for each Sample
-        # (column "sample name" in the metadata df)
+        # Take the median intensity over all MS runs (protein df column "Sample", renamed to "MS run" below)
+        # for each sample (column "Sample" in the metadata df).
+        protein_df.rename(columns={"Sample": "MS run"}, inplace=True)
         protein_df = pd.merge(
             protein_df,
-            meta_df[["MS run", "sample name"]],
-            left_on="Sample",
+            meta_df[["MS run", "Sample"]],
+            left_on="MS run",
             right_on="MS run",
             how="left",
         )
-        protein_df = protein_df.groupby(
-            ["Protein ID", "sample name"], as_index=False
-        ).median()
-        protein_df.rename(columns={"sample name": "Sample"}, inplace=True)
+        protein_df = (
+            protein_df.groupby(["Protein ID", "Sample"])["Intensity"]
+            .median()
+            .reset_index()
+        )
         return dict(protein_df=protein_df, metadata_df=meta_df)
+    else:
+        meta_df.rename(
+            columns={"MS run": "Sample", "Sample": "Sample Group"}, inplace=True
+        )
 
     return dict(protein_df=protein_df, metadata_df=meta_df)
 

diff --git a/protzilla/run.py b/protzilla/run.py
@@ -5,6 +5,7 @@
 import traceback
 
 import protzilla.constants.paths as paths
+from protzilla.constants.protzilla_logging import logger
 from protzilla.steps import Messages, Output, Plots, Step
 from protzilla.utilities import format_trace
 
@@ -31,6 +32,8 @@ def __enter__(self):
         def __exit__(self, exc_type, exc_value, tb):
             if exc_type:
                 formatted_trace = format_trace(traceback.format_exception(exc_value))
+                # print the traceback to the console
+                logger.exception(exc_value)
                 if (
                     hasattr(self.run, "current_step")
                     and self.run.current_step is not None

diff --git a/protzilla/runner.py b/protzilla/runner.py
@@ -92,7 +92,8 @@ def compute_workflow(self):
             if not step.finished:
                 break
 
-            self.run.step_next()
+            if self.run.steps.current_step_index < len(self.run.steps.all_steps) - 1:
+                self.run.step_next()
         self.run._run_write()
 
     def _insert_commandline_inputs(self, step):
@@ -123,6 +124,8 @@ def _perform_current_step(self, params=None):
 
     def _save_plots_html(self, step):
         for i, plot in enumerate(step.plots):
+            if isinstance(plot, bytes):
+                continue
             plot_path = f"{self.plots_path}/{self.run.steps.current_step_index}-{step.section}-{step.operation}-{step.display_name}-{i}.html"
             plot.write_html(plot_path)
 

diff --git a/protzilla/steps.py b/protzilla/steps.py
@@ -12,6 +12,7 @@
 import plotly
 from PIL import Image
 
+from protzilla.constants.protzilla_logging import logger
 from protzilla.utilities import format_trace
 
 
@@ -79,6 +80,7 @@ def calculate(self, steps: StepManager, inputs: dict) -> None:
 
             self.validate_outputs()
         except NotImplementedError as e:
+            logger.exception(e)
             self.messages.append(
                 dict(
                     level=logging.ERROR,
@@ -87,6 +89,7 @@ def calculate(self, steps: StepManager, inputs: dict) -> None:
                 )
             )
         except ValueError as e:
+            logger.exception(e)
             self.messages.append(
                 dict(
                     level=logging.ERROR,
@@ -95,24 +98,14 @@ def calculate(self, steps: StepManager, inputs: dict) -> None:
                 )
             )
         except TypeError as e:
+            logger.exception(e)
             self.messages.append(
                 dict(
                     level=logging.ERROR,
                     msg=f"Please check the implementation of this steps method class (especially the input_keys): {e}.",
                     trace=format_trace(traceback.format_exception(e)),
                 )
             )
-        except Exception as e:
-            self.messages.append(
-                dict(
-                    level=logging.ERROR,
-                    msg=(
-                        f"An error occurred while calculating this step: {e.__class__.__name__} {e} "
-                        f"Please check your parameters or report a potential programming issue."
-                    ),
-                    trace=format_trace(traceback.format_exception(e)),
-                )
-            )
 
     def method(self, **kwargs) -> dict:
         raise NotImplementedError("This method must be implemented in a subclass.")

diff --git a/tests/proteinGroups_small.txt b/tests/proteinGroups_small.txt
diff --git a/tests/protzilla/importing/test_metadata_import.py b/tests/protzilla/importing/test_metadata_import.py
@@ -57,7 +57,7 @@ def test_metadata_import_diann(run_empty):
         test_metadata, run_empty.current_outputs["metadata_df"]
     )
     pd.testing.assert_frame_equal(
-        test_protein_df, run_empty.steps.get_step_output(DiannImport, "protein_df")
+        test_protein_df, run_empty.current_outputs["protein_df"]
     )
 
 

diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py
@@ -12,13 +12,13 @@
 sys.path.append(f"{PROJECT_PATH}")
 
 from protzilla.runner import Runner, _serialize_graphs
-from runner_cli import args_parser
 from protzilla.steps import Output, Plots
+from runner_cli import args_parser
 
 
 @pytest.fixture
 def ms_data_path():
-    return "tests/proteinGroups_small_cut.txt"
+    return "tests/proteinGroups_small.txt"
 
 
 @pytest.fixture
@@ -43,7 +43,8 @@ def mock_current_parameters(*args, **kwargs):
 
         # side effect to mark the step as finished
         runner.run.current_step.output = Output(
-            {key: "mock_output_value" for key in runner.run.current_step.output_keys})
+            {key: "mock_output_value" for key in runner.run.current_step.output_keys}
+        )
         if len(runner.run.current_step.output_keys) == 0:
             runner.run.current_step.plots = Plots(["mock_plot"])
 
@@ -69,7 +70,7 @@ def test_runner_imports(
     monkeypatch, tests_folder_name, ms_data_path, metadata_path, peptide_path
 ):
     importing_args = [
-        "standard",  # expects max-quant import, metadata import
+        ".standard",  # expects max-quant import, metadata import
         ms_data_path,
         f"--run_name={tests_folder_name}/test_runner_{random_string()}",
         f"--meta_data_path={metadata_path}",
@@ -88,34 +89,87 @@ def test_runner_imports(
     runner.compute_workflow()
 
     expected_methods = [
-        'MaxQuantImport',
-        'MetadataImport',
-        'FilterProteinsBySamplesMissing',
-        'FilterSamplesByProteinIntensitiesSum',
-        'ImputationByKNN',
-        'OutlierDetectionByLocalOutlierFactor',
-        'NormalisationByMedian',
-        'TransformationLog',
-        'PlotProtQuant',
-        'DifferentialExpressionTTest',
-        'PlotVolcano',
-        'EnrichmentAnalysisGOAnalysisWithString',
-        'PlotGOEnrichmentBarPlot'
+        "MaxQuantImport",
+        "MetadataImport",
+        "FilterProteinsBySamplesMissing",
+        "FilterSamplesByProteinIntensitiesSum",
+        "ImputationByKNN",
+        "OutlierDetectionByLocalOutlierFactor",
+        "NormalisationByMedian",
+        "TransformationLog",
+        "PlotProtQuant",
+        "DifferentialExpressionTTest",
+        "PlotVolcano",
+        "EnrichmentAnalysisGOAnalysisWithString",
+        "PlotGOEnrichmentBarPlot",
     ]
     expected_method_parameters = [
-        call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'file_path': 'tests/proteinGroups_small_cut.txt'}),
-        call({'feature_orientation': 'Columns (samples in rows, features in columns)', 'file_path': 'tests/metadata_cut_columns.csv'}),
-        call({'percentage': 0.5}),
-        call({'deviation_threshold': 2.0}),
-        call({'number_of_neighbours': 5}),
-        call({'number_of_neighbors': 20}),
-        call({'percentile': 0.5}),
-        call({'log_base': 'log2'}),
-        call({'similarity_measure': 'euclidean distance'}),
-        call({'alpha': 0.05}),
-        call({'fc_threshold': 1}),
-        call({'differential_expression_threshold': 1, 'direction': 'both', 'gene_sets_restring': [], 'organism': 9606}),
-        call({'colors': [], 'cutoff': 0.05, 'gene_sets': ['Process', 'Component', 'Function', 'KEGG'], 'top_terms': 10, 'value': 'p-value'})
+        call(
+            {
+                "intensity_name": "iBAQ",
+                "map_to_uniprot": False,
+                "file_path": "tests/proteinGroups_small.txt",
+            }
+        ),
+        call(
+            {
+                "feature_orientation": "Columns (samples in rows, features in columns)",
+                "file_path": "tests/metadata_cut_columns.csv",
+            }
+        ),
+        call({"percentage": 0.5}),
+        call({"deviation_threshold": 2.0}),
+        call({"number_of_neighbours": 5}),
+        call({"number_of_neighbors": 20}),
+        call({"percentile": 0.5}),
+        call({"log_base": "log2"}),
+        call(
+            {
+                "input_df": "TransformationLog_1",
+                "protein_group": "P10636;P10636-9",
+                "similarity": 1,
+                "similarity_measure": "euclidean distance",
+            }
+        ),
+        call(
+            {
+                "alpha": 0.05,
+                "group1": "AD",
+                "group2": "CTR",
+                "grouping": "Group",
+                "multiple_testing_correction_method": "Benjamini-Hochberg",
+                "protein_df": "TransformationLog_1",
+                "ttest_type": "Welch's t-Test",
+            }
+        ),
+        call(
+            {
+                "fc_threshold": 1,
+                "input_dict": "DifferentialExpressionTTest_1",
+                "proteins_of_interest": [],
+            }
+        ),
+        call(
+            {
+                "background_path": None,
+                "differential_expression_threshold": 1,
+                "direction": "both",
+                "gene_sets_restring": ["Component", "Function", "KEGG", "Process"],
+                "organism": 9606,
+                "protein_df": "DifferentialExpressionTTest_1",
+            }
+        ),
+        call(
+            {
+                "colors": [],
+                "cutoff": 0.05,
+                "gene_sets": ["Process", "Component", "Function", "KEGG"],
+                "input_df_step_instance": "EnrichmentAnalysisGOAnalysisWithString_1",
+                "title": None,
+                "top_terms": 10,
+                "value": "p-value",
+            }
+        ),
     ]
 
     assert mock_method.call_count == 13
@@ -168,10 +222,20 @@ def test_runner_calculates(monkeypatch, tests_folder_name, ms_data_path, metadat
         "FilterProteinsBySamplesMissing",
     ]
     assert mock_method.call_args_list == [
-        call({'intensity_name': 'iBAQ', 'map_to_uniprot': False, 'file_path': 'tests/proteinGroups_small_cut.txt'}),
-        call({'feature_orientation': 'Columns (samples in rows, features in columns)',
-              'file_path': 'tests/metadata_cut_columns.csv'}),
-        call({'percentage': 0.5})
+        call(
+            {
+                "intensity_name": "iBAQ",
+                "map_to_uniprot": False,
+                "file_path": "tests/proteinGroups_small.txt",
+            }
+        ),
+        call(
+            {
+                "feature_orientation": "Columns (samples in rows, features in columns)",
+                "file_path": "tests/metadata_cut_columns.csv",
+            }
+        ),
+        call({"percentage": 0.5}),
     ]
     mock_plot.assert_not_called()
 
@@ -251,11 +315,13 @@ def test_serialize_workflow_graphs():
             assert _serialize_graphs(step["graphs"]) == serial_filter_graphs
 
 
-def test_integration_runner(metadata_path, ms_data_path, tests_folder_name, monkeypatch):
+def test_integration_runner(
+    metadata_path, ms_data_path, tests_folder_name, monkeypatch
+):
     name = tests_folder_name + "/test_runner_integration_" + random_string()
     runner = Runner(
         **{
-            "workflow": "standard",
+            "workflow": ".standard",
             "ms_data_path": f"{PROJECT_PATH}/{ms_data_path}",
             "meta_data_path": f"{PROJECT_PATH}/{metadata_path}",
             "peptides_path": None,
@@ -276,7 +342,7 @@ def test_integration_runner_no_plots(metadata_path, ms_data_path, tests_folder_n
     name = tests_folder_name + "/test_runner_integration" + random_string()
     runner = Runner(
         **{
-            "workflow": "standard",
+            "workflow": ".standard",
             "ms_data_path": f"{PROJECT_PATH}/{ms_data_path}",
             "meta_data_path": f"{PROJECT_PATH}/{metadata_path}",
             "peptides_path": None,

diff --git a/tests/test_data/DIANN_data/correct_metadata_table.csv b/tests/test_data/DIANN_data/correct_metadata_table.csv
@@ -1,4 +1,4 @@
-MS run,sample name,replicate
+MS run,Sample,replicate
 LM07061,24h wt DMSO,1
 LM07062,24h ko DMSO,1
 LM07063,24h wt Prodi,1