import logging

import numpy as np
import pandas as pd

from protzilla.utilities import default_intensity_column


def _split_off_non_positive(
    df: pd.DataFrame, column: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Separate the rows that cannot be log-transformed from the rest.

    The split is done with a boolean mask instead of index labels, so it is
    also correct for data frames whose index contains duplicate labels
    (e.g. the result of ``pd.concat`` without ``ignore_index=True``).

    :param df: the data frame to split; it is not modified
    :type df: pd.DataFrame
    :param column: name of the column holding the intensity values
    :type column: str

    :return: the rows whose value is positive or NaN (with a fresh
        0..n-1 index) and the rows whose value is zero or negative
    :rtype: tuple[pd.DataFrame, pd.DataFrame]
    """
    # NaN <= 0 evaluates to False, so missing values stay in the kept part.
    non_positive = df[column] <= 0
    return df[~non_positive].reset_index(drop=True), df[non_positive]


def by_log(
    protein_df: pd.DataFrame, peptide_df: pd.DataFrame | None, log_base="log10"
) -> dict:
    """
    Log-transform the intensities of a protein (and optionally a peptide)
    data frame to the base of 2 or 10.

    Rows with a zero or negative intensity cannot be log-transformed. They are
    dropped before the transformation and a warning message is added to the
    returned messages for each affected data frame. Rows with a missing (NaN)
    intensity are kept. The input data frames are not modified.

    :param protein_df: a protein data frame in long format
    :type protein_df: pd.DataFrame
    :param peptide_df: a peptide data frame with an "Intensity" column,
        or None if no peptide data is available
    :type peptide_df: pd.DataFrame | None
    :param log_base: the base of the logarithm, either "log2" or "log10"
    :type log_base: str

    :return: a dict with the transformed protein data frame ("protein_df"),
        the transformed peptide data frame or None ("peptide_df") and a list
        of warning messages about dropped rows ("messages")
    :rtype: dict
    :raises ValueError: if log_base is neither "log2" nor "log10"
    """
    intensity_name = default_intensity_column(protein_df)

    if log_base == "log2":
        log_function = np.log2
    elif log_base == "log10":
        log_function = np.log10
    else:
        raise ValueError("Unknown log_base. Known log methods are 'log2' and 'log10'.")

    messages = []

    transformed_df, untransformable_data_df = _split_off_non_positive(
        protein_df, intensity_name
    )

    transformed_peptide_df = None
    if peptide_df is not None:
        transformed_peptide_df, untransformable_peptide_data_df = (
            _split_off_non_positive(peptide_df, "Intensity")
        )
        if not untransformable_peptide_data_df.empty:
            messages.append(
                dict(
                    msg=f"Warning: {len(untransformable_peptide_data_df)} data points of peptide data with zero or negative intensity values were found and will be dropped. "
                    f"Please adapt your preprocessing workflow if this is unexpected.",
                    level=logging.WARNING,
                )
            )

    if not untransformable_data_df.empty:
        # nunique() counts each protein group once; len() would merely repeat
        # the number of dropped rows.
        messages.append(
            dict(
                msg=f"Warning: {len(untransformable_data_df)} data points of {untransformable_data_df['Protein ID'].nunique()} distinct protein groups with zero or negative intensity values were found and will be dropped. "
                f"Please adapt your preprocessing pipeline if this is unexpected.",
                level=logging.WARNING,
            )
        )

    transformed_df[intensity_name] = log_function(transformed_df[intensity_name])
    if transformed_peptide_df is not None:
        transformed_peptide_df["Intensity"] = log_function(
            transformed_peptide_df["Intensity"]
        )

    return dict(
        protein_df=transformed_df, peptide_df=transformed_peptide_df, messages=messages
    )


# --- tests/protzilla/data_preprocessing/test_transformation.py ---


def test_log_by_0_transformation():
    """A zero intensity must be dropped (with a warning) instead of becoming -inf."""
    df = pd.DataFrame(
        data=(["Sample1", "Protein1", "Gene1", 0.0],),
        columns=["Sample", "Protein ID", "Gene", "Intensity"],
    )

    method_outputs = by_log(df, None, log_base="log2")
    assert method_outputs["protein_df"].empty, "The protein DataFrame should be empty."
    assert (
        len(method_outputs["messages"]) == 1
    ), "Dropping the zero intensity should produce exactly one warning."
def test_log2_transformation_with_negative_values(log2_transformation_df, peptides_df):
    """
    Rows with zero or negative intensities must be dropped from both data
    frames, every other row (including NaN intensities) must be kept, and a
    warning must be returned for each affected data frame.
    """
    # ignore_index=True keeps the index unique; without it the appended row
    # would reuse label 0 and share it with an unrelated, valid row.
    protein_input_df = pd.concat(
        [
            log2_transformation_df,
            pd.DataFrame(
                [["Sample5", "Protein5", "Gene5", -2]],
                columns=log2_transformation_df.columns,
            ),
        ],
        ignore_index=True,
    )
    peptide_input_df = pd.concat(
        [
            peptides_df,
            pd.DataFrame(
                [["Sample5", "Protein1", "Peptide5", -2, 0.037779]],
                columns=["Sample", "Protein ID", "Sequence", "Intensity", "PEP"],
            ),
        ],
        ignore_index=True,
    )

    # Everything that is not <= 0 (positive values and NaN) has to survive.
    expected_protein_rows = int((~(protein_input_df["Intensity"] <= 0)).sum())
    expected_peptide_rows = int((~(peptide_input_df["Intensity"] <= 0)).sum())

    method_outputs = by_log(
        protein_df=protein_input_df,
        peptide_df=peptide_input_df,
        log_base="log2",
    )

    result_df = method_outputs["protein_df"]
    result_peptide_df = method_outputs["peptide_df"]

    # Only the untransformable rows may be removed.
    assert (
        len(result_df) == expected_protein_rows
    ), "Unexpected number of rows in the protein DataFrame"
    assert (
        len(result_peptide_df) == expected_peptide_rows
    ), "Unexpected number of rows in the peptide DataFrame"

    # No -inf may be left behind by a transformed zero.
    assert not np.isinf(
        result_df["Intensity"]
    ).any(), "Zero values were not removed from the protein DataFrame"
    assert not np.isinf(
        result_peptide_df["Intensity"]
    ).any(), "Zero values were not removed from the peptide DataFrame"

    # One warning for the protein data and one for the peptide data.
    assert (
        len(method_outputs["messages"]) == 2
    ), "Expected one warning per DataFrame with dropped values"