Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
73e13a2
added sample size calculation in methods\data_analysis.py and forms\d…
selenabr Jun 5, 2024
f133c87
enabled possibility to choose one protein for calculation dependent o…
selenabr Jun 17, 2024
49c7f0e
fixed errors with missing inputs
selenabr Jun 18, 2024
6d8c9a8
added variance calculation and testing function and edited sample siz…
selenabr Jun 18, 2024
0b95cf0
fixed some errors
selenabr Jun 19, 2024
22c293d
output field for result
selenabr Jun 20, 2024
fd756df
Merge branch 'dev' into bachelor-thesis-selena
selenabr Jun 20, 2024
b22b6e7
further implementation of output field for result
selenabr Jun 21, 2024
c6a2f3b
display display_output in output field
selenabr Jun 23, 2024
032286c
display_output field displayed in the same size and position as the o…
selenabr Jun 25, 2024
e90fab3
test function for sample_size_calculation
selenabr Jun 25, 2024
01eba42
Merge branch 'dev' into bachelor-thesis-selena
selenabr Jun 25, 2024
d3cf9d8
edited description of function
selenabr Jun 26, 2024
3ce4ae1
check if implemented function of Paper (Cairns et al., 2009) and libr…
selenabr Jul 8, 2024
f78b0b9
power calculation and test of library-function and implemented paper-…
selenabr Jul 8, 2024
e3dd1c3
added test for power_calculation method
selenabr Aug 20, 2024
2e3de5a
fixed constructor error
selenabr Aug 21, 2024
a46a074
sample size calculation for different group sizes (Cohen 1988) and mo…
selenabr Aug 23, 2024
3446be3
code formatting, resolved comments (output not a float, significant_p…
selenabr Aug 26, 2024
cb25777
feature: user can choose whether metadata contains a column for indiv…
selenabr Aug 28, 2024
52ef105
adapted test for power_calculation and sample_size_calculation and ch…
selenabr Aug 28, 2024
ac9e783
added function that calculates sample size for all proteins and shows…
selenabr Sep 3, 2024
e54c767
formatting
selenabr Sep 3, 2024
2faa972
commented the dataframe-output-stuff out, otherwise violin plot could…
selenabr Sep 3, 2024
25cf2b2
changed color of violinplot and added axis-description
selenabr Sep 3, 2024
ae4e8cb
changed color of violinplot and removed axis-description
selenabr Sep 5, 2024
5c63008
resolved comments
selenabr Sep 5, 2024
0adc15c
Added function to get dataframes with sample size column as output
selenabr Sep 5, 2024
dcba877
Added power_calculation_for_all_proteins to calculate minimum power f…
selenabr Sep 6, 2024
eb32984
Fixed hover display of violin plots
selenabr Sep 8, 2024
1adda1b
fixed typo and removed unnecessary comment
selenabr Oct 8, 2024
d0ec174
calculations for thesis (should be removed before merging into dev)
selenabr Oct 29, 2024
776dc55
calculations for thesis (should be removed before merging into dev)
selenabr Nov 4, 2024
7131d3b
put calculation for thesis into comment and changed description of me…
selenabr Nov 18, 2024
6e2daa3
Add files via upload
selenabr Nov 18, 2024
6778796
Merge branch 'dev' into bachelor-thesis-selena
selenabr Mar 4, 2025
01e9d5f
merge bachelor-thesis-selena into dev
selenabr Mar 4, 2025
412dfd1
fixed error in power_analysis.py (constants.color) and commented file…
selenabr Mar 4, 2025
7b6c159
changed steps to new format
Jonas0000 Mar 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions meta_individual_column.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
Sample,Group,Batch,Individual
AD01_C1_INSOLUBLE_01,AD,C1,AD01_
AD01_C1_INSOLUBLE_02,AD,C1,AD01_
AD01_C1_INSOLUBLE_03,AD,C1,AD01_
AD01_C2_INSOLUBLE_01,AD,C2,AD01_
AD02_C1_INSOLUBLE_01,AD,C1,AD02_
AD02_C1_INSOLUBLE_02,AD,C1,AD02_
AD02_C2_INSOLUBLE_01,AD,C2,AD02_
AD03_C1_INSOLUBLE_01,AD,C1,AD03_
AD03_C1_INSOLUBLE_02,AD,C1,AD03_
AD03_C1_INSOLUBLE_03,AD,C1,AD03_
AD03_C2_INSOLUBLE_01,AD,C2,AD03_
AD04_C1_INSOLUBLE_01,AD,C1,AD04_
AD04_C2_INSOLUBLE_01,AD,C2,AD04_
AD05_C2_INSOLUBLE_01,AD,C2,AD05_
AD06_C1_INSOLUBLE_01,AD,C1,AD06_
AD07_C1_INSOLUBLE_01,AD,C1,AD07_
AD07_C1_INSOLUBLE_02,AD,C1,AD07_
AD07_C1_INSOLUBLE_03,AD,C1,AD07_
AD07_C2_INSOLUBLE_01,AD,C2,AD07_
AD08_C2_INSOLUBLE_01,AD,C2,AD08_
AD09_C1_INSOLUBLE_01,AD,C1,AD09_
AD10_C1_INSOLUBLE_01,AD,C1,AD10_
AD10_C2_INSOLUBLE_01,AD,C2,AD10_
AD11_C2_INSOLUBLE_01,AD,C2,AD11_
AD12_C2_INSOLUBLE_01,AD,C2,AD12_
AD13_C2_INSOLUBLE_01,AD,C2,AD13_
AD14_C2_INSOLUBLE_01,AD,C2,AD14_
AD15_C2_INSOLUBLE_01,AD,C2,AD15_
AD16_C2_INSOLUBLE_01,AD,C2,AD16_
AD17_C2_INSOLUBLE_01,AD,C2,AD17_
AD18_C2_INSOLUBLE_01,AD,C2,AD18_
AD19_C2_INSOLUBLE_01,AD,C2,AD19_
AD20_C1_INSOLUBLE_01,AD,C1,AD20_
AD21_C1_INSOLUBLE_01,AD,C1,AD21_
AD21_C2_INSOLUBLE_01,AD,C2,AD21_
AD22_C1_INSOLUBLE_01,AD,C1,AD22_
AD23_C1_INSOLUBLE_01,AD,C1,AD23_
AD23_C1_INSOLUBLE_02,AD,C1,AD23_
AD23_C2_INSOLUBLE_01,AD,C2,AD23_
AD24_C1_INSOLUBLE_01,AD,C1,AD24_
AD24_C1_INSOLUBLE_02,AD,C1,AD24_
AD25_C1_INSOLUBLE_01,AD,C1,AD25_
AD26_C1_INSOLUBLE_01,AD,C1,AD26_
AD27_C1_INSOLUBLE_01,AD,C1,AD27_
AD27_C1_INSOLUBLE_02,AD,C1,AD27_
AD28_C2_INSOLUBLE_01,AD,C2,AD28_
AD29_C1_INSOLUBLE_01,AD,C1,AD29_
AD30_C1_INSOLUBLE_01,AD,C1,AD30_
AD30_C1_INSOLUBLE_02,AD,C1,AD30_
AD30_C2_INSOLUBLE_01,AD,C2,AD30_
AD31_C2_INSOLUBLE_01,AD,C2,AD31_
AD32_C2_INSOLUBLE_01,AD,C2,AD32_
AD33_C2_INSOLUBLE_01,AD,C2,AD33_
AD34_C1_INSOLUBLE_01,AD,C1,AD34_
AD34_C1_INSOLUBLE_02,AD,C1,AD34_
AD35_C1_INSOLUBLE_01,AD,C1,AD35_
AD35_C1_INSOLUBLE_02,AD,C1,AD35_
AD36_C1_INSOLUBLE_01,AD,C1,AD36_
AD37_C1_INSOLUBLE_01,AD,C1,AD37_
AD37_C2_INSOLUBLE_01,AD,C2,AD37_
AD38_C1_INSOLUBLE_01,AD,C1,AD38_
AD38_C1_INSOLUBLE_02,AD,C1,AD38_
AD38_C1_INSOLUBLE_03,AD,C1,AD38_
AD39_C2_INSOLUBLE_01,AD,C2,AD39_
AD40_C2_INSOLUBLE_01,AD,C2,AD40_
AD41_C2_INSOLUBLE_01,AD,C2,AD41_
AD42_C2_INSOLUBLE_01,AD,C2,AD42_
AD43_C1_INSOLUBLE_01,AD,C1,AD43_
AD44_C1_INSOLUBLE_01,AD,C1,AD44_
AD44_C1_INSOLUBLE_02,AD,C1,AD44_
AD44_C1_INSOLUBLE_03,AD,C1,AD44_
AD44_C1_INSOLUBLE_04,AD,C1,AD44_
AD45_C1_INSOLUBLE_01,AD,C1,AD45_
AD45_C1_INSOLUBLE_02,AD,C1,AD45_
AD46_C1_INSOLUBLE_01,AD,C1,AD46_
AD46_C1_INSOLUBLE_02,AD,C1,AD46_
AD46_C1_INSOLUBLE_03,AD,C1,AD46_
AD46_C2_INSOLUBLE_01,AD,C2,AD46_
AD47_C1_INSOLUBLE_01,AD,C1,AD47_
AD48_C2_INSOLUBLE_01,AD,C2,AD48_
AD49_C2_INSOLUBLE_01,AD,C2,AD49_
CTR01_C1_INSOLUBLE_01,CTR,C1,CTR01
CTR02_C1_INSOLUBLE_01,CTR,C1,CTR02
CTR03_C1_INSOLUBLE_01,CTR,C1,CTR03
CTR04_C1_INSOLUBLE_01,CTR,C1,CTR04
CTR05_C2_INSOLUBLE_01,CTR,C2,CTR05
CTR06_C2_INSOLUBLE_01,CTR,C2,CTR06
CTR07_C1_INSOLUBLE_01,CTR,C1,CTR07
CTR08_C1_INSOLUBLE_01,CTR,C1,CTR08
CTR08_C2_INSOLUBLE_01,CTR,C2,CTR08
CTR09_C2_INSOLUBLE_01,CTR,C2,CTR09
CTR10_C1_INSOLUBLE_01,CTR,C1,CTR10
CTR10_C2_INSOLUBLE_01,CTR,C2,CTR10
CTR11_C2_INSOLUBLE_01,CTR,C2,CTR11
CTR12_C2_INSOLUBLE_01,CTR,C2,CTR12
CTR13_C2_INSOLUBLE_01,CTR,C2,CTR13
CTR14_C2_INSOLUBLE_01,CTR,C2,CTR14
CTR15_C2_INSOLUBLE_01,CTR,C2,CTR15
CTR16_C2_INSOLUBLE_01,CTR,C2,CTR16
CTR17_C2_INSOLUBLE_01,CTR,C2,CTR17
CTR18_C2_INSOLUBLE_01,CTR,C2,CTR18
CTR19_C1_INSOLUBLE_01,CTR,C1,CTR19
CTR20_C1_INSOLUBLE_01,CTR,C1,CTR20
CTR21_C2_INSOLUBLE_01,CTR,C2,CTR21
CTR22_C2_INSOLUBLE_01,CTR,C2,CTR22
CTR23_C2_INSOLUBLE_01,CTR,C2,CTR23
CTR24_C1_INSOLUBLE_01,CTR,C1,CTR24
CTR25_C1_INSOLUBLE_01,CTR,C1,CTR25
CTR26_C2_INSOLUBLE_01,CTR,C2,CTR26
CTR27_C1_INSOLUBLE_01,CTR,C1,CTR27
CTR28_C1_INSOLUBLE_01,CTR,C1,CTR28
CTR28_C1_INSOLUBLE_02,CTR,C1,CTR28
CTR28_C2_INSOLUBLE_01,CTR,C2,CTR28
CTR29_C1_INSOLUBLE_01,CTR,C1,CTR29
CTR29_C1_INSOLUBLE_02,CTR,C1,CTR29
CTR29_C1_INSOLUBLE_03,CTR,C1,CTR29
CTR30_C1_INSOLUBLE_01,CTR,C1,CTR30
CTR30_C1_INSOLUBLE_02,CTR,C1,CTR30
CTR30_C2_INSOLUBLE_01,CTR,C2,CTR30
CTR31_C1_INSOLUBLE_01,CTR,C1,CTR31
CTR31_C2_INSOLUBLE_01,CTR,C2,CTR31
CTR32_C1_INSOLUBLE_01,CTR,C1,CTR32
CTR32_C2_INSOLUBLE_01,CTR,C2,CTR32
CTR33_C1_INSOLUBLE_01,CTR,C1,CTR33
CTR34_C1_INSOLUBLE_01,CTR,C1,CTR34
CTR34_C2_INSOLUBLE_01,CTR,C2,CTR34
CTR35_C1_INSOLUBLE_01,CTR,C1,CTR35
CTR36_C1_INSOLUBLE_01,CTR,C1,CTR36
CTR36_C1_INSOLUBLE_02,CTR,C1,CTR36
CTR37_C1_INSOLUBLE_01,CTR,C1,CTR37
CTR38_C1_INSOLUBLE_01,CTR,C1,CTR38
CTR39_C1_INSOLUBLE_01,CTR,C1,CTR39
CTR40_C1_INSOLUBLE_01,CTR,C1,CTR40
CTR40_C1_INSOLUBLE_02,CTR,C1,CTR40
CTR40_C1_INSOLUBLE_03,CTR,C1,CTR40
CTR41_C1_INSOLUBLE_01,CTR,C1,CTR41
CTR41_C1_INSOLUBLE_02,CTR,C1,CTR41
CTR41_C1_INSOLUBLE_03,CTR,C1,CTR41
CTR42_C1_INSOLUBLE_01,CTR,C1,CTR42
CTR42_C1_INSOLUBLE_02,CTR,C1,CTR42
CTR42_C1_INSOLUBLE_03,CTR,C1,CTR42
CTR43_C2_INSOLUBLE_01,CTR,C2,CTR43
CTR44_C1_INSOLUBLE_01,CTR,C1,CTR44
100 changes: 61 additions & 39 deletions protzilla/data_analysis/differential_expression_mann_whitney.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,24 @@
import pandas as pd
from scipy import stats

from protzilla.data_analysis.differential_expression_helper import _map_log_base, apply_multiple_testing_correction, \
merge_differential_expression_and_significant_df, normalize_ptm_df
from protzilla.data_analysis.differential_expression_helper import (
_map_log_base,
apply_multiple_testing_correction,
normalize_ptm_df,
)
from protzilla.utilities.transform_dfs import long_to_wide


def mann_whitney_test_on_intensity_data(
protein_df: pd.DataFrame,
metadata_df: pd.DataFrame,
grouping: str,
group1: str,
group2: str,
log_base: str = None,
alpha=0.05,
multiple_testing_correction_method: str = "Benjamini-Hochberg",
p_value_calculation_method: str = "auto"
protein_df: pd.DataFrame,
metadata_df: pd.DataFrame,
grouping: str,
group1: str,
group2: str,
log_base: str = None,
alpha=0.05,
multiple_testing_correction_method: str = "Benjamini-Hochberg",
p_value_calculation_method: str = "auto",
) -> dict:
"""
Perform Mann-Whitney U test on all proteins in the given intensity data frame.
Expand Down Expand Up @@ -57,15 +60,26 @@ def mann_whitney_test_on_intensity_data(
alpha=alpha,
multiple_testing_correction_method=multiple_testing_correction_method,
columns_name="Protein ID",
p_value_calculation_method=p_value_calculation_method
p_value_calculation_method=p_value_calculation_method,
)
differentially_expressed_proteins_df = pd.merge(
protein_df,
outputs["differential_expressed_columns_df"],
on="Protein ID",
how="left",
)
differentially_expressed_proteins_df = pd.merge(protein_df, outputs["differential_expressed_columns_df"], on="Protein ID", how="left")
differentially_expressed_proteins_df = differentially_expressed_proteins_df.loc[
differentially_expressed_proteins_df["Protein ID"].isin(outputs["differential_expressed_columns_df"]["Protein ID"])
differentially_expressed_proteins_df["Protein ID"].isin(
outputs["differential_expressed_columns_df"]["Protein ID"]
)
]
significant_proteins_df = pd.merge(protein_df, outputs["significant_columns_df"], on="Protein ID", how="left")
significant_proteins_df = pd.merge(
protein_df, outputs["significant_columns_df"], on="Protein ID", how="left"
)
significant_proteins_df = significant_proteins_df.loc[
significant_proteins_df["Protein ID"].isin(outputs["significant_columns_df"]["Protein ID"])
significant_proteins_df["Protein ID"].isin(
outputs["significant_columns_df"]["Protein ID"]
)
]

return dict(
Expand All @@ -80,14 +94,14 @@ def mann_whitney_test_on_intensity_data(


def mann_whitney_test_on_ptm_data(
ptm_df: pd.DataFrame,
metadata_df: pd.DataFrame,
grouping: str,
group1: str,
group2: str,
alpha=0.05,
multiple_testing_correction_method: str = "Benjamini-Hochberg",
p_value_calculation_method: str = "auto"
ptm_df: pd.DataFrame,
metadata_df: pd.DataFrame,
grouping: str,
group1: str,
group2: str,
alpha=0.05,
multiple_testing_correction_method: str = "Benjamini-Hochberg",
p_value_calculation_method: str = "auto",
) -> dict:
"""
Perform Mann-Whitney U test on all PTMs in the given PTM data frame.
Expand Down Expand Up @@ -126,7 +140,7 @@ def mann_whitney_test_on_ptm_data(
alpha=alpha,
multiple_testing_correction_method=multiple_testing_correction_method,
columns_name="PTM",
p_value_calculation_method=p_value_calculation_method
p_value_calculation_method=p_value_calculation_method,
)

return dict(
Expand All @@ -141,16 +155,16 @@ def mann_whitney_test_on_ptm_data(


def mann_whitney_test_on_columns(
df: pd.DataFrame,
metadata_df: pd.DataFrame,
grouping: str,
group1: str,
group2: str,
log_base: str = None,
alpha=0.05,
multiple_testing_correction_method: str = "Benjamini-Hochberg",
columns_name: str = "Protein ID",
p_value_calculation_method: str = "auto"
df: pd.DataFrame,
metadata_df: pd.DataFrame,
grouping: str,
group1: str,
group2: str,
log_base: str = None,
alpha=0.05,
multiple_testing_correction_method: str = "Benjamini-Hochberg",
columns_name: str = "Protein ID",
p_value_calculation_method: str = "auto",
) -> dict:
"""
Perform Mann-Whitney U test on all columns of the data frame.
Expand Down Expand Up @@ -197,8 +211,12 @@ def mann_whitney_test_on_columns(
for column in data_columns:
group1_data = df_with_groups[df_with_groups[grouping] == group1][column]
group2_data = df_with_groups[df_with_groups[grouping] == group2][column]
u_statistic, p_value = (
stats.mannwhitneyu(group1_data, group2_data, alternative="two-sided", method=p_value_calculation_method))
u_statistic, p_value = stats.mannwhitneyu(
group1_data,
group2_data,
alternative="two-sided",
method=p_value_calculation_method,
)

if not np.isnan(p_value):
log2_fold_change = (
Expand Down Expand Up @@ -243,9 +261,13 @@ def mann_whitney_test_on_columns(

significant_columns_df = combined_df[
combined_df["corrected_p_value"] <= corrected_alpha
]
]

messages = [dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")] if invalid_columns else []
messages = (
[dict(level=logging.INFO, msg=f"Invalid columns: {invalid_columns}")]
if invalid_columns
else []
)

return dict(
differential_expressed_columns_df=combined_df,
Expand Down
Loading