diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py
index eec08b1b..98daf656 100644
--- a/protzilla/constants/colors.py
+++ b/protzilla/constants/colors.py
@@ -1,8 +1,44 @@
# Discrete categorical palette: 20 muted hue families with 5 shades each
# (a base shade plus darker and lighter variants), 100 colors in total.
# Consumers step through this list in strides of 5 to pick one hue family
# per plotted group.
PROTZILLA_DISCRETE_COLOR_SEQUENCE = [
    # Set 1: Muted Dark Slate
    "#252935", "#3A3F50", "#50556A", "#6B7186", "#858DA2",
    # Set 2: Muted Indian Red
    "#CE5A5A", "#B24C4C", "#9D3F3F", "#E07272", "#F48D8D",
    # Set 3: Muted Light Steel Blue
    "#51646F", "#6A7D89", "#7F92A0", "#96A9B8", "#ADBFCD",
    # Set 4: Muted Sienna
    "#804538", "#6F3C31", "#5F342A", "#A05748", "#B66E5E",
    # Set 5: Muted Sandy Brown
    "#715236", "#63472F", "#57402B", "#96755A", "#A98575",
    # Set 6: Muted Olive
    "#6E6B48", "#5D5B3E", "#4E4D36", "#89875C", "#A1A16E",
    # Set 7: Muted Teal
    "#3B6B6A", "#315B5B", "#274C4C", "#507E7E", "#6B9898",
    # Set 8: Muted Taupe
    "#8B7E74", "#776F65", "#675E56", "#A09085", "#B9AAA1",
    # Set 9: Muted Burgundy
    "#7B3A4F", "#6A3345", "#582C3C", "#925664", "#A8737E",
    # Set 10: Muted Forest Green
    "#3D5047", "#35453E", "#2D3B35", "#5F7267", "#7B8D80",
    # Set 11: Muted Navy
    "#2F3E4C", "#283442", "#222B38", "#485669", "#627185",
    # Set 12: Muted Mustard
    "#BFA054", "#A98F4A", "#927D3F", "#D7BA75", "#E2CD96",
    # Set 13: Muted Dusty Rose
    "#C18394", "#AA727E", "#93616C", "#D69BA7", "#E4B8C2",
    # Set 14: Muted Lavender
    "#8A729D", "#7A638C", "#6A547C", "#A591B3", "#BDA9C8",
    # Set 15: Muted Charcoal
    "#404040", "#353535", "#2B2B2B", "#585858", "#707070",
    # Set 16: Muted Emerald Green
    "#4D7456", "#426448", "#37563B", "#6A9177", "#85A990",
    # Set 17: Muted Peach
    "#D89B83", "#C2866F", "#A7725E", "#E3B39C", "#ECC7B6",
    # Set 18: Muted Plum
    "#704F6E", "#634464", "#563A59", "#876A87", "#A18AA1",
    # Set 19: Muted Periwinkle
    "#7E8DAF", "#6F7B98", "#616A82", "#97A3BF", "#B0B9D1",
    # Set 20: Muted Coral
    "#CC7A5E", "#B26951", "#9A5A45", "#DD937C", "#EBAA99"
]

# High-contrast slate/red pair used to separate outliers from regular points.
PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"]
diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
new file mode 100644
index 00000000..e643fe93
--- /dev/null
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -0,0 +1,15 @@
from datetime import datetime


def convert_time_to_hours(time_str):
    """
    Convert a time string to the number of hours since midnight.

    BUG FIX: the conversion logic was commented out and the function returned
    its input unchanged, contradicting its own docstring. Inputs that are not
    '%H:%M:%S' strings (e.g. already-numeric time points) are still passed
    through unchanged so existing callers keep working.

    :param time_str: The time to convert, expected in '%H:%M:%S' format.

    :return: Hours since midnight as a float, or the input unchanged if it
        cannot be parsed as '%H:%M:%S'.
    """
    try:
        time_obj = datetime.strptime(time_str, "%H:%M:%S")
    except (TypeError, ValueError):
        # Not an '%H:%M:%S' string (e.g. a numeric visit index) - pass through.
        return time_str
    return time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600
diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py
new file mode 100644
index 00000000..37c8ad34
--- /dev/null
+++ b/protzilla/data_analysis/time_series_plots.py
@@ -0,0 +1,179 @@
+import pandas as pd
+import plotly.graph_objects as go
+from scipy import stats
+from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
+
+from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time
+from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
+
# Layout color palette shared by the plot functions in this module.
colors = dict(
    plot_bgcolor="white",
    gridcolor="#F1F1F1",
    linecolor="#F1F1F1",
    annotation_text_color="#ffffff",
    annotation_proteins_of_interest="#4A536A",
)
+
def time_quant_plot(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    time_column: str,
    protein_group: str,
    similarity: float = 1.0,
    similarity_measure: str = "euclidean distance",
) -> dict:
    """
    Create a line diagram of protein quantifications over time across all samples.

    The selected protein group is drawn as a highlighted line. All protein
    groups whose z-scored intensity profile is within the given similarity
    threshold of the selected group are drawn as additional lines, and the
    overall min/max intensity range of all groups is drawn as a grey polygon
    in the background.

    :param intensity_df: A dataframe in protzilla format; converted to wide
        format (rows = time points, columns = protein groups) internally.
    :param metadata_df: A dataframe containing the metadata of the samples.
    :param time_column: The name of the column in the metadata_df that contains
        the time information.
    :param protein_group: Protein IDs as the column header of the dataframe.
    :param similarity_measure: Method to compare the chosen protein group with
        all others; either "cosine similarity" or "euclidean distance".
    :param similarity: Similarity threshold for the chosen measure.

    :return: A dictionary containing a list with a plotly figure.
    :raises ValueError: If the protein group is unknown or the similarity
        threshold is outside the valid range of the chosen measure.
    """
    # Attach the time values to every sample row.
    intensity_df = pd.merge(
        left=intensity_df,
        right=metadata_df[["Sample", time_column]],
        on="Sample",
        copy=False,
    )

    # Fill missing intensities by linear interpolation, then reshape so rows
    # are time points and columns are protein groups.
    wide_df = intensity_df.interpolate(method="linear", axis=0)
    if is_long_format(wide_df, time_column=time_column):
        wide_df = long_to_wide_time(wide_df, time_column=time_column)

    if protein_group not in wide_df.columns:
        raise ValueError("Please select a valid protein group.")
    elif similarity_measure == "euclidean distance" and similarity < 0:
        raise ValueError(
            "Similarity for euclidean distance should be greater than or equal to 0."
        )
    elif similarity_measure == "cosine similarity" and (
        similarity < -1 or similarity > 1
    ):
        raise ValueError("Similarity for cosine similarity should be between -1 and 1")

    fig = go.Figure()

    # Trace the upper intensity bound left-to-right, then the lower bound
    # right-to-left, so the filled trace forms a closed background polygon.
    # (The previously defined but unused `color_mapping` local was removed.)
    lower_upper_x = [wide_df.index[0]]
    lower_upper_y = [wide_df.iloc[0].min()]

    for index, row in wide_df.iterrows():
        lower_upper_x.append(index)
        lower_upper_y.append(row.max())

    for index, row in reversed(list(wide_df.iterrows())):
        lower_upper_x.append(index)
        lower_upper_y.append(row.min())

    fig.add_trace(
        go.Scatter(
            x=lower_upper_x,
            y=lower_upper_y,
            fill="toself",
            name="Intensity Range",
            line=dict(color="silver"),
        )
    )

    # Collect every other protein group whose z-scored profile is similar
    # enough to the selected group under the chosen measure.
    similar_groups = []
    for group_to_compare in wide_df.columns:
        if group_to_compare == protein_group:
            continue
        if similarity_measure == "euclidean distance":
            distance = euclidean_distances(
                stats.zscore(wide_df[protein_group]).values.reshape(1, -1),
                stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1),
            )[0][0]
            if distance <= similarity:
                similar_groups.append(group_to_compare)
        else:
            distance = cosine_similarity(
                stats.zscore(wide_df[protein_group]).values.reshape(1, -1),
                stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1),
            )[0][0]
            if distance >= similarity:
                similar_groups.append(group_to_compare)

    # Show individual legend entries only while the legend stays readable.
    show_individual_legends = len(similar_groups) <= 7
    for group in similar_groups:
        fig.add_trace(
            go.Scatter(
                x=wide_df.index,
                y=wide_df[group],
                mode="lines",
                name=group[:15] + "..." if len(group) > 15 else group,
                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]),
                showlegend=show_individual_legends,
            )
        )

    if not show_individual_legends:
        # One summary legend entry replacing the suppressed individual ones.
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="lines",
                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]),
                name="Similar Protein Groups",
            )
        )

    formatted_protein_name = (
        protein_group[:15] + "..." if len(protein_group) > 15 else protein_group
    )
    fig.add_trace(
        go.Scatter(
            x=wide_df.index,
            y=wide_df[protein_group],
            mode="lines",
            name=formatted_protein_name,
            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]),
        )
    )
    fig.update_layout(
        title=f"Time Series of {formatted_protein_name} in all samples",
        plot_bgcolor=colors["plot_bgcolor"],
        xaxis_gridcolor=colors["gridcolor"],
        yaxis_gridcolor=colors["gridcolor"],
        xaxis_linecolor=colors["linecolor"],
        yaxis_linecolor=colors["linecolor"],
        xaxis_title=time_column,
        yaxis_title="Intensity",
        legend_title="Legend",
        xaxis=dict(
            tickmode="array",
            tickangle=0,
            tickvals=wide_df.index,
            # BUG FIX: the previous comprehension used `wide_df[time_column]`
            # as its loop target, rebinding a dataframe column per iteration
            # and producing array tick labels. Label each tick with its time
            # value instead (the wide dataframe is indexed by time).
            ticktext=[str(time_point) for time_point in wide_df.index],
        ),
        autosize=True,
        margin=dict(l=100, r=300, t=100, b=100),
        legend=dict(
            x=1.05,
            y=1,
            bgcolor="rgba(255, 255, 255, 0.5)",
            orientation="v",
        ),
    )

    return dict(plots=[fig])
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
new file mode 100644
index 00000000..898f82f9
--- /dev/null
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -0,0 +1,1008 @@
+import logging
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+
+from protzilla.utilities import default_intensity_column
+from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
+
+from sklearn.linear_model import LinearRegression, RANSACRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+from statsmodels.tsa.arima.model import ARIMA
+from statsmodels.tsa.stattools import adfuller
+from pmdarima import auto_arima
+
# Layout color palette shared by the regression plot functions in this module.
colors = dict(
    plot_bgcolor="white",
    gridcolor="#F1F1F1",
    linecolor="#F1F1F1",
    annotation_text_color="#4c4c4c",
    annotation_proteins_of_interest="#4A536A",
)
+
+
def time_series_linear_regression(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    time_column: str,
    train_size: float,
    protein_group: str,
    grouping: str,
    grouping_column: str,
):
    """
    Perform linear regression on the time series data for a given protein group.

    :param intensity_df: Peptide dataframe which contains the intensity of each sample
    :param metadata_df: Metadata dataframe which contains the timestamps
    :param time_column: The name of the column containing the time values
    :param train_size: The proportion of the dataset to include in the train split
    :param protein_group: Protein group to perform the analysis on
    :param grouping: Option to select whether regression should be performed on the
        entire dataset or separately on the control and experimental groups
    :param grouping_column: The name of the column containing the grouping information

    :return: A dictionary containing the per-group scores (root mean squared error
        and r2 score for the training and test sets), the plotly figure and messages
    :raises ValueError: If train_size is not between 0 and 1
    """
    messages = []
    color_index = 0
    # BUG FIX: the message previously said "Test size" although train_size is checked.
    if train_size < 0 or train_size > 1:
        raise ValueError("Train size should be between 0 and 1")

    # Restrict to the selected protein group and attach the time metadata.
    intensity_df = intensity_df[intensity_df["Protein ID"] == protein_group]
    intensity_column_name = default_intensity_column(intensity_df)
    intensity_df = pd.merge(
        left=intensity_df,
        right=metadata_df,
        on="Sample",
        copy=False,
    )

    # Reproducible shuffle so the (unshuffled) train/test split is not biased
    # by the original row order.
    intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True)

    X = intensity_df[[time_column]]
    y = intensity_df[intensity_column_name]

    fig = go.Figure()
    scores = []

    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
        # One independent regression per experimental group.
        for group in intensity_df[grouping_column].unique():
            group_df = intensity_df[intensity_df[grouping_column] == group]
            X_group = group_df[[time_column]]
            y_group = group_df[intensity_column_name]

            X_train, X_test, y_train, y_test = train_test_split(
                X_group, y_group, train_size=train_size, shuffle=False
            )
            model = LinearRegression()
            model.fit(X_train, y_train)

            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
            test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
            train_r2 = r2_score(y_train, y_pred_train)
            test_r2 = r2_score(y_test, y_pred_test)

            train_df = pd.DataFrame(
                {
                    time_column: X_train[time_column],
                    "Intensity": y_train,
                    "Predicted": y_pred_train,
                    "Type": "Train",
                }
            )
            test_df = pd.DataFrame(
                {
                    time_column: X_test[time_column],
                    "Intensity": y_test,
                    "Predicted": y_pred_test,
                    "Type": "Test",
                }
            )
            plot_df = pd.concat([train_df, test_df])

            # One hue family (5 shades) per group; wrap around when exhausted.
            color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[
                color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)
            ]
            color_index += 5

            fig.add_trace(
                go.Scatter(
                    x=plot_df[time_column],
                    y=plot_df["Intensity"],
                    mode="markers",
                    name=f"Actual Intensity ({group})",
                    marker=dict(color=color),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=plot_df[time_column],
                    y=plot_df["Predicted"],
                    mode="lines",
                    name=f"Predicted Intensity ({group})",
                    line=dict(color=color),
                )
            )

            scores.append(
                {
                    "group": group,
                    "train_root_mean_squared": train_rmse,
                    "test_root_mean_squared": test_rmse,
                    "train_r2_score": train_r2,
                    "test_r2_score": test_r2,
                }
            )
    else:
        # Single regression over the whole dataset.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, shuffle=False
        )
        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)

        train_df = pd.DataFrame(
            {
                time_column: X_train[time_column],
                "Intensity": y_train,
                "Predicted": y_pred_train,
                "Type": "Train",
            }
        )
        test_df = pd.DataFrame(
            {
                time_column: X_test[time_column],
                "Intensity": y_test,
                "Predicted": y_pred_test,
                "Type": "Test",
            }
        )
        plot_df = pd.concat([train_df, test_df])

        fig.add_trace(
            go.Scatter(
                x=plot_df[time_column],
                y=plot_df["Intensity"],
                mode="markers",
                name="Actual Intensity",
                marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=plot_df[time_column],
                y=plot_df["Predicted"],
                mode="lines",
                name="Predicted Intensity",
                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[5]),
            )
        )

        scores.append(
            {
                "group": "Overall",
                "train_root_mean_squared": train_rmse,
                "test_root_mean_squared": test_rmse,
                "train_r2_score": train_r2,
                "test_r2_score": test_r2,
            }
        )

    # BUG FIX: these string literals previously contained raw line breaks
    # (the "<br>" tags were lost), which is a syntax error; rebuilt as valid
    # f-strings with plotly-style <br> separators.
    annotation_text = "<br>".join(
        [
            f"Group: {res['group']} (Train/Test)"
            f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
            f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
            for res in scores
        ]
    )

    fig.update_layout(
        title=f"Intensity over Time for {protein_group}",
        plot_bgcolor=colors["plot_bgcolor"],
        xaxis_gridcolor=colors["gridcolor"],
        yaxis_gridcolor=colors["gridcolor"],
        xaxis_linecolor=colors["linecolor"],
        yaxis_linecolor=colors["linecolor"],
        xaxis_title=time_column,
        yaxis_title="Intensity",
        legend_title="Legend",
        autosize=True,
        margin=dict(l=100, r=300, t=100, b=100),
        # Consistency fix: place the legend right of the plot area like the
        # sibling time-series plots (was y=1.05, x=1).
        legend=dict(
            x=1.05,
            y=1,
            bgcolor="rgba(255, 255, 255, 0.5)",
            orientation="v",
        ),
    )

    messages.append(
        {
            "level": logging.INFO,
            "msg": annotation_text,
        }
    )

    return dict(
        scores=scores,
        plots=[fig],
        messages=messages,
    )
+
+
def time_series_ransac_regression(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    time_column: str,
    protein_group: str,
    max_trials: int,
    stop_probability: float,
    loss: str,
    train_size: float,
    grouping: str,
    grouping_column: str,
):
    """
    Perform RANSAC regression on the time series data for a given protein group.

    :param intensity_df: Peptide dataframe which contains the intensity of each sample
    :param metadata_df: Metadata dataframe which contains the timestamps
    :param time_column: The name of the column containing the time values
    :param protein_group: Protein group to perform the analysis on
    :param max_trials: The maximum number of RANSAC iterations to perform
    :param stop_probability: The probability at which to stop the RANSAC algorithm
    :param loss: The loss function to use
    :param train_size: The proportion of the dataset to include in the train split
    :param grouping: Option to select whether regression should be performed on the
        entire dataset or separately on the control and experimental groups
    :param grouping_column: The name of the column containing the grouping information

    :return: A dictionary containing the per-group scores (root mean squared error
        and r2 score for the training and test sets), the plotly figure and messages
    :raises ValueError: If train_size is not between 0 and 1
    """
    messages = []
    color_index = 0
    # BUG FIX: the message previously said "Test size" although train_size is checked.
    if train_size < 0 or train_size > 1:
        raise ValueError("Train size should be between 0 and 1")

    intensity_df = intensity_df[intensity_df["Protein ID"] == protein_group]
    intensity_column_name = default_intensity_column(intensity_df)

    intensity_df = pd.merge(
        left=intensity_df,
        right=metadata_df,
        on="Sample",
        copy=False,
    )

    # Reproducible shuffle so the (unshuffled) split is not biased by row order.
    intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True)

    X = intensity_df[[time_column]]
    y = intensity_df[intensity_column_name]

    fig = go.Figure()
    scores = []
    palette = PROTZILLA_DISCRETE_COLOR_SEQUENCE
    n_colors = len(palette)

    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
        for group in intensity_df[grouping_column].unique():
            group_df = intensity_df[intensity_df[grouping_column] == group]
            X_group = group_df[[time_column]]
            y_group = group_df[intensity_column_name]

            X_train, X_test, y_train, y_test = train_test_split(
                X_group, y_group, train_size=train_size, shuffle=False
            )
            # NOTE(review): sklearn >= 1.1 renamed base_estimator to estimator
            # (base_estimator removed in 1.2) - confirm the pinned version.
            model = RANSACRegressor(
                max_trials=max_trials,
                stop_probability=stop_probability,
                loss=loss,
                base_estimator=LinearRegression(),
            )
            model.fit(X_train, y_train)

            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            inlier_mask_train = model.inlier_mask_
            # RANSAC only labels inliers for the training data; all test points
            # are treated as inliers. (The previous element-wise comparison of
            # a prediction with itself was vacuously all-True; made explicit.)
            test_inlier_mask = np.ones(len(y_test), dtype=bool)

            train_rmse = np.sqrt(
                mean_squared_error(
                    y_train[inlier_mask_train], y_pred_train[inlier_mask_train]
                )
            )
            test_rmse = np.sqrt(
                mean_squared_error(
                    y_test[test_inlier_mask], y_pred_test[test_inlier_mask]
                )
            )
            train_r2 = r2_score(
                y_train[inlier_mask_train], y_pred_train[inlier_mask_train]
            )
            test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])

            # Prepare DataFrames for plotting.
            train_df = pd.DataFrame(
                {
                    time_column: X_train[time_column],
                    "Intensity": y_train,
                    "Predicted": y_pred_train,
                    "Type": "Train",
                }
            )
            test_df = pd.DataFrame(
                {
                    time_column: X_test[time_column],
                    "Intensity": y_test,
                    "Predicted": y_pred_test,
                    "Type": "Test",
                }
            )
            train_df["Inlier"] = inlier_mask_train
            test_df["Inlier"] = test_inlier_mask
            plot_df = pd.concat([train_df, test_df])

            # Wrap palette indices so many groups cannot run off the list.
            fig.add_trace(
                go.Scatter(
                    x=plot_df[plot_df["Inlier"] == True][time_column],
                    y=plot_df[plot_df["Inlier"] == True]["Intensity"],
                    mode="markers",
                    name=f"Inliers ({group})",
                    marker=dict(color=palette[color_index % n_colors]),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=plot_df[time_column],
                    y=plot_df["Predicted"],
                    mode="lines",
                    name=f"Predicted Intensity ({group})",
                    line=dict(color=palette[(color_index + 2) % n_colors]),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=plot_df[plot_df["Inlier"] == False][time_column],
                    y=plot_df[plot_df["Inlier"] == False]["Intensity"],
                    mode="markers",
                    name="Outliers",
                    marker=dict(color=palette[(color_index + 4) % n_colors]),
                )
            )

            color_index += 5

            scores.append(
                {
                    "group": group,
                    "train_root_mean_squared": train_rmse,
                    "test_root_mean_squared": test_rmse,
                    "train_r2_score": train_r2,
                    "test_r2_score": test_r2,
                }
            )
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=train_size, shuffle=False
        )
        # BUG FIX: this branch previously dropped max_trials, stop_probability
        # and loss, silently ignoring the user's parameters.
        model = RANSACRegressor(
            max_trials=max_trials,
            stop_probability=stop_probability,
            loss=loss,
            base_estimator=LinearRegression(),
        )
        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        inlier_mask_train = model.inlier_mask_
        # See note above: all test points are treated as inliers.
        test_inlier_mask = np.ones(len(y_test), dtype=bool)

        train_rmse = np.sqrt(
            mean_squared_error(
                y_train[inlier_mask_train], y_pred_train[inlier_mask_train]
            )
        )
        test_rmse = np.sqrt(
            mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])
        )
        train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])
        test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])

        # Prepare DataFrames for plotting.
        train_df = pd.DataFrame(
            {
                time_column: X_train[time_column],
                "Intensity": y_train,
                "Predicted": y_pred_train,
                "Type": "Train",
            }
        )
        test_df = pd.DataFrame(
            {
                time_column: X_test[time_column],
                "Intensity": y_test,
                "Predicted": y_pred_test,
                "Type": "Test",
            }
        )
        train_df["Inlier"] = inlier_mask_train
        test_df["Inlier"] = test_inlier_mask
        plot_df = pd.concat([train_df, test_df])

        fig.add_trace(
            go.Scatter(
                x=plot_df[plot_df["Inlier"] == True][time_column],
                y=plot_df[plot_df["Inlier"] == True]["Intensity"],
                mode="markers",
                name="Inliers",
                marker=dict(color=palette[0]),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=plot_df[time_column],
                y=plot_df["Predicted"],
                mode="lines",
                name="Predicted Intensity",
                line=dict(color=palette[0]),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=plot_df[plot_df["Inlier"] == False][time_column],
                y=plot_df[plot_df["Inlier"] == False]["Intensity"],
                mode="markers",
                name="Outliers",
                marker=dict(color=palette[3]),
            )
        )

        scores.append(
            {
                "group": "Overall",
                "train_root_mean_squared": train_rmse,
                "test_root_mean_squared": test_rmse,
                "train_r2_score": train_r2,
                "test_r2_score": test_r2,
            }
        )

    # BUG FIX: these string literals previously contained raw line breaks
    # (the "<br>" tags were lost), which is a syntax error; rebuilt as valid
    # f-strings with plotly-style <br> separators.
    annotation_text = "<br>".join(
        [
            f"Group: {res['group']} (Train/Test)"
            f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
            f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
            for res in scores
        ]
    )

    fig.update_layout(
        title=f"Intensity over Time for {protein_group}",
        plot_bgcolor=colors["plot_bgcolor"],
        xaxis_gridcolor=colors["gridcolor"],
        yaxis_gridcolor=colors["gridcolor"],
        xaxis_linecolor=colors["linecolor"],
        yaxis_linecolor=colors["linecolor"],
        xaxis_title=time_column,
        yaxis_title="Intensity",
        legend_title="Legend",
        autosize=True,
        margin=dict(l=100, r=100, t=100, b=50),
        legend=dict(
            x=1.05,
            y=1,
            bgcolor="rgba(255, 255, 255, 0.5)",
            orientation="v",
        ),
    )

    messages.append(
        {
            "level": logging.INFO,
            "msg": annotation_text,
        }
    )

    return dict(
        scores=scores,
        plots=[fig],
        messages=messages,
    )
+
+
def adfuller_test(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    time_column: str,
    protein_group: str,
    alpha: float = 0.05,
) -> dict:
    """
    Run the Augmented Dickey-Fuller stationarity test on a protein's time series.

    :param intensity_df: The dataframe containing the time series data.
    :param metadata_df: The dataframe containing the metadata.
    :param time_column: The column representing time (e.g., 'visit', 'timepoint').
    :param protein_group: The protein group to perform the test on.
    :param alpha: The significance level for the test (default is 0.05).

    :return: A dictionary containing:
        - test_statistic: The test statistic from the ADF test.
        - p_value: The p-value from the ADF test.
        - critical_values: The critical values for different significance levels.
        - is_stationary: A boolean indicating if the series is stationary.
        - messages: A list of messages for the user.
    """
    # Keep only the rows belonging to the requested protein group.
    group_rows = intensity_df[intensity_df["Protein ID"] == protein_group]
    intensity_column_name = default_intensity_column(group_rows)

    # Join the time information onto the intensities.
    with_time = group_rows[["Sample", intensity_column_name]].merge(
        metadata_df[["Sample", time_column]], on="Sample", copy=False
    )

    # Order chronologically and average replicate samples per time point so
    # the values form a proper time series.
    per_time_mean = (
        with_time.sort_values(by=time_column)
        .groupby(time_column)[intensity_column_name]
        .mean()
        .reset_index()
    )
    time_series = per_time_mean[intensity_column_name].dropna()

    # adfuller returns (statistic, p-value, usedlag, nobs, critical values, ...).
    adf_result = adfuller(time_series)
    test_statistic, p_value = adf_result[0], adf_result[1]
    critical_values = adf_result[4]

    # Reject the unit-root null hypothesis at the chosen significance level.
    is_stationary = p_value < alpha

    if is_stationary:
        level = logging.INFO
        msg = f"The time series is stationary (p-value: {p_value:.5f})."
    else:
        level = logging.WARNING
        msg = f"The time series is not stationary (p-value: {p_value:.5f})."
    messages = [{"level": level, "msg": msg}]

    return dict(
        test_statistic=test_statistic,
        p_value=p_value,
        critical_values=critical_values,
        is_stationary=is_stationary,
        messages=messages,
    )
+
+
def time_series_auto_arima(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    time_column: str,
    protein_group: str,
    seasonal: str,
    m: int,
    train_size: float,
    grouping: str,
    grouping_column: str,
) -> dict:
    """
    Select and fit an ARIMA model automatically for a protein group's time series.

    :param intensity_df: Peptide dataframe which contains the intensity of each sample
    :param metadata_df: Metadata dataframe which contains the timestamps
    :param time_column: The name of the column containing the time values
    :param protein_group: Protein group to perform the analysis on
    :param seasonal: "Yes"/"No" flag controlling whether the model is seasonal
    :param m: The number of time steps for a single seasonal period
        (ignored if not seasonal)
    :param train_size: The proportion of the dataset to include in the train split
    :param grouping: Whether to fit one model per group or a single overall model
    :param grouping_column: The name of the column containing the grouping information

    :return: A dictionary containing the per-group scores (root mean squared error
        and r2 score for the training and test sets), the plotly figure and messages
    :raises ValueError: If train_size is not between 0 and 1
    """
    messages = []
    color_index = 0

    if train_size < 0 or train_size > 1:
        raise ValueError("Train size should be between 0 and 1")
    # The UI passes the seasonal flag as a string.
    seasonal = seasonal == "Yes"

    intensity_df = intensity_df[intensity_df["Protein ID"] == protein_group]
    intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True)
    intensity_column_name = default_intensity_column(intensity_df)

    intensity_df = pd.merge(
        left=intensity_df,
        right=metadata_df,
        on="Sample",
        copy=False,
    )

    fig = go.Figure()
    scores = []
    palette = PROTZILLA_DISCRETE_COLOR_SEQUENCE
    n_colors = len(palette)

    if grouping == "With Grouping" and grouping_column in intensity_df.columns:
        for group in intensity_df[grouping_column].unique():
            group_df = intensity_df[intensity_df[grouping_column] == group]

            split_index = int(len(group_df) * train_size)
            train_df, test_df = group_df[:split_index], group_df[split_index:]

            train_df = train_df.set_index(time_column)[intensity_column_name]
            test_df = test_df.set_index(time_column)[intensity_column_name]

            # Fit the ARIMA model (auto_arima searches (p,d,q) stepwise).
            model = auto_arima(
                train_df,
                seasonal=seasonal,
                m=m,
                trace=True,
                error_action="ignore",
                suppress_warnings=True,
                stepwise=True,
            )

            # Forecast the test set.
            forecast = model.predict(n_periods=test_df.shape[0])
            parameters = model.get_params()

            test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
            test_r2 = r2_score(test_df, forecast)
            train_rmse = np.sqrt(
                mean_squared_error(train_df, model.predict_in_sample())
            )
            train_r2 = r2_score(train_df, model.predict_in_sample())

            # Average multiple forecasts per time point for the line trace.
            forecast_plot = pd.Series(
                forecast.reset_index(drop=True).values, index=test_df.index
            )
            forecast_plot = forecast_plot.groupby(forecast_plot.index).mean()

            fig.add_trace(
                go.Scatter(
                    x=test_df.index,
                    y=test_df,
                    mode="markers",
                    name=f"Actual Intensity ({group})",
                    marker=dict(color=palette[color_index % n_colors]),
                )
            )
            # BUG FIX: marker-mode traces previously set line= (which plotly
            # ignores for markers), so the intended color was never applied.
            fig.add_trace(
                go.Scatter(
                    x=test_df.index,
                    y=forecast,
                    mode="markers",
                    name=f"Predicted Intensity ({group})",
                    marker=dict(color=palette[(color_index + 4) % n_colors]),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=forecast_plot.index,
                    y=forecast_plot,
                    mode="lines",
                    name=f"Mean Predicted Intensity ({group})",
                    line=dict(color=palette[(color_index + 4) % n_colors]),
                )
            )

            color_index += 5

            scores.append(
                {
                    "group": group,
                    "train_root_mean_squared": train_rmse,
                    "test_root_mean_squared": test_rmse,
                    "train_r2_score": train_r2,
                    "test_r2_score": test_r2,
                }
            )

            aa_order = parameters["order"]
            aa_seasonal_order = parameters["seasonal_order"]
            messages.append(
                {
                    "level": logging.INFO,
                    "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
                }
            )
            if seasonal:
                messages.append(
                    {
                        "level": logging.INFO,
                        "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
                    }
                )
    else:
        # BUG FIX: this branch previously overwrote the train_size fraction
        # with the absolute row count; use a separate split index instead.
        split_index = int(len(intensity_df) * train_size)
        train_df, test_df = intensity_df[:split_index], intensity_df[split_index:]

        train_df = train_df.set_index(time_column)[intensity_column_name]
        test_df = test_df.set_index(time_column)[intensity_column_name]

        # Fit the ARIMA model.
        model = auto_arima(
            train_df,
            seasonal=seasonal,
            m=m,
            trace=True,
            error_action="ignore",
            suppress_warnings=True,
            stepwise=True,
        )

        # Forecast the test set.
        forecast = model.predict(n_periods=test_df.shape[0])
        parameters = model.get_params()

        aa_order = parameters["order"]
        aa_seasonal_order = parameters["seasonal_order"]
        messages.append(
            {
                "level": logging.INFO,
                "msg": f"Auto Arima Order (p,d,q): {aa_order}.",
            }
        )
        if seasonal:
            messages.append(
                {
                    "level": logging.INFO,
                    "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.",
                }
            )

        test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
        test_r2 = r2_score(test_df, forecast)
        train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample()))
        train_r2 = r2_score(train_df, model.predict_in_sample())

        forecast_plot = pd.Series(
            forecast.reset_index(drop=True).values, index=test_df.index
        )
        forecast_plot = forecast_plot.groupby(forecast_plot.index).mean()

        fig.add_trace(
            go.Scatter(
                x=test_df.index,
                y=test_df,
                mode="markers",
                name="Actual Intensity",
                marker=dict(color=palette[0]),
            )
        )
        # BUG FIX: was line=dict(...) on a marker-mode trace (ignored).
        fig.add_trace(
            go.Scatter(
                x=test_df.index,
                y=forecast,
                mode="markers",
                name="Predicted Intensity",
                marker=dict(color=palette[3]),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=forecast_plot.index,
                y=forecast_plot,
                mode="lines",
                name="Mean Predicted Intensity",
                line=dict(color=palette[4]),
            )
        )

        scores.append(
            {
                "group": "Overall",
                "train_root_mean_squared": train_rmse,
                "test_root_mean_squared": test_rmse,
                "train_r2_score": train_r2,
                "test_r2_score": test_r2,
            }
        )

    # BUG FIX: these string literals previously contained raw line breaks
    # (the "<br>" tags were lost), which is a syntax error; rebuilt as valid
    # f-strings with plotly-style <br> separators.
    annotation_text = "<br>".join(
        [
            f"Group: {res['group']} (Train/Test)"
            f"<br>RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}<br>"
            f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}<br>"
            for res in scores
        ]
    )

    fig.update_layout(
        title=f"Intensity over Time for {protein_group}",
        plot_bgcolor=colors["plot_bgcolor"],
        xaxis_gridcolor=colors["gridcolor"],
        yaxis_gridcolor=colors["gridcolor"],
        xaxis_linecolor=colors["linecolor"],
        yaxis_linecolor=colors["linecolor"],
        xaxis_title=time_column,
        yaxis_title="Intensity",
        legend_title="Legend",
        autosize=True,
        margin=dict(l=100, r=100, t=100, b=50),
        legend=dict(
            x=1.05,
            y=1,
            bgcolor="rgba(255, 255, 255, 0.5)",
            orientation="v",
        ),
    )

    messages.append(
        {
            "level": logging.INFO,
            "msg": annotation_text,
        }
    )

    return dict(
        scores=scores,
        plots=[fig],
        messages=messages,
    )
+
+
+def time_series_arima(
+ intensity_df: pd.DataFrame,
+ metadata_df: pd.DataFrame,
+ time_column: str,
+ protein_group: str,
+ seasonal: str,
+ p: int,
+ d: int,
+ q: int,
+ P: int,
+ D: int,
+ Q: int,
+ s: int,
+ train_size: float,
+ grouping: str,
+ grouping_column: str,
+) -> dict:
+
+ """
+ Perform ARIMA model selection on the time series data for a given protein group.
+ :param intensity_df: Peptide dataframe which contains the intensity of each sample
+ :param metadata_df: Metadata dataframe which contains the timestamps
+ :param time_column: The name of the column containing the time values
+ :param protein_group: Protein group to perform the analysis on
+ :param seasonal: Whether the ARIMA model should be seasonal
+ :param p: ARIMA p parameter
+ :param d: ARIMA d parameter
+ :param q: ARIMA q parameter
+ :param P: ARIMA seasonal P parameter
+ :param D: ARIMA seasonal D parameter
+ :param Q: ARIMA seasonal Q parameter
+ :param s: ARIMA seasonal s parameter
+    :param train_size: The proportion of the dataset to include in the train split (between 0 and 1)
+ :param grouping_column: The name of the column containing the grouping information
+ :param grouping: Whether to group the data by the 'Group' column
+
+ :return: A dictionary containing the root mean squared error and r2 score for the training and test sets
+ """
+ messages = []
+ color_index = 0
+
+ if train_size < 0 or train_size > 1:
+ raise ValueError("Train size should be between 0 and 1")
+
+ intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group]
+ intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True)
+ intensity_column_name = default_intensity_column(intensity_df)
+
+ intensity_df = pd.merge(left=intensity_df, right=metadata_df, on="Sample", copy=False)
+
+ fig = go.Figure()
+ scores = []
+
+ if grouping == "With Grouping" and grouping_column in intensity_df.columns:
+ groups = intensity_df[grouping_column].unique()
+ for group in groups:
+ group_df = intensity_df[intensity_df[grouping_column] == group]
+
+ train_df_size = int(len(group_df) * train_size)
+ train_df, test_df = group_df[:train_df_size], group_df[train_df_size:]
+
+ train_df = train_df.set_index(time_column)[intensity_column_name]
+ test_df = test_df.set_index(time_column)[intensity_column_name]
+
+ if seasonal == "Yes":
+ model = ARIMA(
+ train_df,
+ order=(p, d, q),
+ seasonal_order=(P, D, Q, s)
+ )
+ else:
+ model = ARIMA(
+ train_df,
+ order=(p, d, q)
+ )
+
+ model_fit = model.fit()
+
+ forecast = model_fit.forecast(steps=len(test_df))
+
+ test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
+ test_r2 = r2_score(test_df, forecast)
+ train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues))
+ train_r2 = r2_score(train_df, model_fit.fittedvalues)
+
+ forecast_reset = forecast.reset_index(drop=True)
+ forecast_plot = pd.Series(forecast_reset.values, index=test_df.index)
+ forecast_mean_plot = forecast_plot.groupby(forecast_plot.index).mean()
+
+ fig.add_trace(go.Scatter(
+ x=test_df.index,
+ y=test_df,
+ mode='markers',
+ name=f'Actual Intensity ({group})',
+ marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index])
+ )
+ )
+
+ fig.add_trace(go.Scatter(
+ x=forecast_plot.index,
+ y=forecast_plot,
+ mode='markers',
+ name= f'Predicted Intensity ({group})',
+ line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
+ )
+ )
+
+ fig.add_trace(go.Scatter(
+ x = forecast_mean_plot.index,
+ y = forecast_mean_plot,
+ mode = 'lines',
+ name = f'Mean Predicted Intensity ({group})',
+ line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4])
+ )
+ )
+
+ color_index += 5
+
+ scores.append({
+ 'group': group,
+ 'train_root_mean_squared': train_rmse,
+ 'test_root_mean_squared': test_rmse,
+ 'train_r2_score': train_r2,
+ 'test_r2_score': test_r2,
+ })
+
+ else:
+ train_size = int(len(intensity_df) * train_size)
+ train_df, test_df = intensity_df[:train_size], intensity_df[train_size:]
+
+ train_df = train_df.set_index(time_column)[intensity_column_name]
+ test_df = test_df.set_index(time_column)[intensity_column_name]
+
+ if seasonal == "Yes":
+ model = ARIMA(
+ train_df,
+ order=(p, d, q),
+ seasonal_order = (P, D, Q, s),
+ )
+ else:
+ model = ARIMA(train_df, order=(p, d, q))
+
+ model_fit = model.fit()
+
+ forecast = model_fit.forecast(steps=len(test_df))
+
+ test_rmse = np.sqrt(mean_squared_error(test_df, forecast))
+ test_r2 = r2_score(test_df, forecast)
+ train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues))
+ train_r2 = r2_score(train_df, model_fit.fittedvalues)
+
+ forecast_reset = forecast.reset_index(drop=True)
+ forecast_plot = pd.Series(forecast_reset.values, index=test_df.index)
+ forecast_plot = forecast_plot.groupby(forecast_plot.index).mean()
+
+ fig.add_trace(go.Scatter(
+ x=test_df.index,
+ y=test_df,
+ mode='markers',
+ name='Actual Intensity',
+ marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0])
+ )
+ )
+
+ fig.add_trace(go.Scatter(
+ x=test_df.index,
+ y=forecast,
+ mode='markers',
+ name='Predicted Intensity',
+ line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3])
+ )
+ )
+
+ fig.add_trace(go.Scatter(
+ x=forecast_plot.index,
+ y=forecast_plot,
+ mode='lines',
+ name='Mean Predicted Intensity',
+ line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4])
+ )
+ )
+
+ scores.append({
+ 'group': 'Overall',
+ 'train_root_mean_squared': train_rmse,
+ 'test_root_mean_squared': test_rmse,
+ 'train_r2_score': train_r2,
+ 'test_r2_score': test_r2,
+ })
+
+ annotation_text = "
".join([
+ f"Group: {res['group']} (Train/Test)"
+ f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
"
+ f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
"
+ for res in scores
+ ])
+
+ fig.update_layout(
+ title=f"Intensity over Time for {protein_group}",
+ plot_bgcolor=colors["plot_bgcolor"],
+ xaxis_gridcolor=colors["gridcolor"],
+ yaxis_gridcolor=colors["gridcolor"],
+ xaxis_linecolor=colors["linecolor"],
+ yaxis_linecolor=colors["linecolor"],
+ xaxis_title=time_column,
+ yaxis_title="Intensity",
+ legend_title="Legend",
+ autosize=True,
+ margin=dict(l=100, r=100, t=100, b=50),
+ legend=dict(
+ x=1.05,
+ y=1,
+ bgcolor="rgba(255, 255, 255, 0.5)",
+ orientation="v",
+ ),
+ )
+
+ messages.append(
+ {
+ "level": logging.INFO,
+ "msg": annotation_text,
+ }
+ )
+
+ return dict(
+ scores=scores,
+ plots=[fig],
+ messages=messages,
+ )
diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py
index 3b1caee9..67745d15 100644
--- a/protzilla/data_preprocessing/peptide_filter.py
+++ b/protzilla/data_preprocessing/peptide_filter.py
@@ -50,3 +50,67 @@ def by_pep_value_plot(method_inputs, method_outputs, graph_type):
elif graph_type == "Bar chart":
fig = create_bar_plot(**value_dict)
return [fig]
+
+def by_samples_missing(
+ protein_df: pd.DataFrame | None,
+ peptide_df: pd.DataFrame | None,
+ percentage: float = 0.5,
+) -> dict:
+ """
+ This function filters proteins based on the amount of samples with nan values, if the percentage of nan values
+ is below a threshold (percentage).
+
+ :param protein_df: the protein dataframe that should be filtered
+ :param peptide_df: the peptide dataframe that should be filtered in accordance to the intensity dataframe (optional)
+ :param percentage: ranging from 0 to 1. Defining the relative share of samples the proteins need to be present in,
+ in order for the protein to be kept.
+ :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs that were discarded
+ and a list of Protein IDs that were kept
+ """
+
+    filter_threshold: int = int(percentage * len(protein_df.Sample.unique()))
+ transformed_df = long_to_wide(protein_df)
+
+ remaining_proteins_list = transformed_df.dropna(
+ axis=1, thresh=filter_threshold
+ ).columns.tolist()
+ filtered_proteins_list = (
+ transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist()
+ )
+ filtered_df = protein_df[
+ (protein_df["Protein ID"].isin(remaining_proteins_list))
+ ]
+ filtered_peptide_df = None
+ if peptide_df is not None:
+ filtered_peptide_df = peptide_df[
+ (peptide_df["Protein ID"].isin(remaining_proteins_list))
+ ]
+ return dict(
+ protein_df=filtered_df,
+ peptide_df=filtered_peptide_df,
+ filtered_proteins=filtered_proteins_list,
+ remaining_proteins=remaining_proteins_list,
+ )
+
+
+def _build_pie_bar_plot(remaining_proteins, filtered_proteins, graph_type):
+ if graph_type == "Pie chart":
+ fig = create_pie_plot(
+ values_of_sectors=[
+ len(remaining_proteins),
+ len(filtered_proteins),
+ ],
+ names_of_sectors=["Proteins kept", "Proteins filtered"],
+ heading="Number of Filtered Proteins",
+ )
+ elif graph_type == "Bar chart":
+ fig = create_bar_plot(
+ values_of_sectors=[
+ len(remaining_proteins),
+ len(filtered_proteins),
+ ],
+ names_of_sectors=["Proteins kept", "Proteins filtered"],
+ heading="Number of Filtered Proteins",
+ y_title="Number of Proteins",
+ )
+ return [fig]
\ No newline at end of file
diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py
index c3d9136f..595aacbb 100644
--- a/protzilla/importing/ms_data_import.py
+++ b/protzilla/importing/ms_data_import.py
@@ -123,6 +123,79 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum"
return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
+def tmt_data_import(
+ file_path: str, intensity_name: str = "Reporter intensity", map_to_uniprot=False,
+ aggregation_method: str = "Sum"
+) -> dict:
+ try:
+ # Read the file
+ df = pd.read_csv(
+ file_path,
+ sep="\t",
+ low_memory=False,
+ na_values=["", 0],
+ keep_default_na=True,
+ )
+
+ # Debug step: Print the column names to check the actual names in the data
+ print("Columns in the file:", df.columns.tolist())
+
+ # Try to handle different possible names for the 'Protein ID' column
+ protein_column = None
+ possible_names = ["Majority protein IDs"]
+
+ for name in possible_names:
+ if name in df.columns:
+ protein_column = name
+ break
+
+ if protein_column is None:
+ raise KeyError("No valid 'Protein ID' or equivalent column found in the data.")
+
+ df = df.rename(columns={protein_column: "Protein ID"})
+
+ # Extract protein or gene identifiers
+ protein_groups = df["Protein ID"]
+
+ # Drop columns that are not relevant
+ columns_to_drop = [
+ "Combined Spectral Count",
+ "Combined Unique Spectral Count",
+ "Combined Total Spectral Count",
+ ]
+ existing_columns = set(df.columns)
+ columns_to_drop_existing = [col for col in columns_to_drop if col in existing_columns]
+ df = df.drop(columns=columns_to_drop_existing)
+ print("Columns after dropping irrelevant ones:", df.columns.tolist())
+
+    # Use regex to find columns matching the TMT pattern with visits for both NP and TD samples
+ intensity_columns = df.filter(
+ regex=f"{intensity_name} \\d+ (NP\\d{{2}}|TD\\d{{2}})", axis=1
+ )
+
+ # Debug step: Print the intensity columns that were matched
+ print("Matched intensity columns:", intensity_columns.columns.tolist())
+
+    # Rename columns to the format 'NPXX_1' or 'TDXX_1'
+ intensity_columns.columns = [
+ re.sub(f"{intensity_name} (\\d+) (NP\\d{{2}}|TD\\d{{2}})",
+ lambda m: f"{m.group(2)}_{int(m.group(1)) + 1}", col) for col in intensity_columns.columns
+ ]
+
+ # Debug step: Print the renamed intensity columns
+ print("Renamed intensity columns:", intensity_columns.columns.tolist())
+ # Add back the protein identifiers to the dataframe
+ intensity_columns = intensity_columns.assign(**{"Protein ID": protein_groups})
+
+ # Apply transformation, clean-up, or aggregation (depending on your logic)
+ return transform_and_clean(intensity_columns, intensity_name, map_to_uniprot, aggregation_method)
+
+ except Exception as e:
+ msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid TMT data file."
+ return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))])
+
+
+
def transform_and_clean(
df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum"
) -> dict:
@@ -197,9 +270,15 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True):
found_ids_per_group = []
# go through all groups and find the valid proteins
# non uniprot ids are put into extracted_ids, so they can be mapped
+ extract_protein_id_regex = re.compile(r'\|([^|]+)\|')
+
+ # Function to extract protein IDs from the formatted string
+ def extract_protein_ids(protein_group_str):
+ return extract_protein_id_regex.findall(protein_group_str)
+
for group in protein_groups:
found_in_group = []
- for protein_id in group.split(";"):
+ for protein_id in extract_protein_ids(group) or group.split(";"):
if not protein_id.startswith("ENSP") and (
match := uniprot_regex.search(protein_id)
):
diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py
index 7ad45ddd..4907bbf6 100644
--- a/protzilla/methods/data_analysis.py
+++ b/protzilla/methods/data_analysis.py
@@ -12,6 +12,15 @@
mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data)
from protzilla.data_analysis.differential_expression_t_test import t_test
from protzilla.data_analysis.dimension_reduction import t_sne, umap
+from protzilla.data_analysis.time_series_regression_analysis import (
+ time_series_linear_regression,
+ time_series_ransac_regression,
+ adfuller_test,
+ time_series_auto_arima,
+ time_series_arima,
+)
+from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \
+ ptms_per_protein_and_sample
from protzilla.data_analysis.model_evaluation import evaluate_classification_model
from protzilla.data_analysis.plots import (
clustergram_plot,
@@ -19,6 +28,7 @@
prot_quant_plot,
scatter_plot,
)
+from protzilla.data_analysis.time_series_plots import time_quant_plot
from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph
from protzilla.data_analysis.ptm_analysis import (
filter_peptides_of_protein,
@@ -765,6 +775,195 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
return inputs
+class PlotTimeQuant(PlotStep):
+ display_name = "Time Quantification Plot For Protein"
+ operation = "Time series analysis"
+ method_description = (
+ "Creates a line chart for intensity across Time for protein groups"
+ )
+
+ input_keys = [
+ "intensity_df",
+ "metadata_df",
+ "time_column",
+ "protein_group",
+ "similarity_measure",
+ "similarity"
+ ]
+ output_keys = []
+
+ def method(self, inputs: dict) -> dict:
+ return time_quant_plot(**inputs)
+
+
+ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+ inputs["intensity_df"] = steps.protein_df
+ inputs["metadata_df"] = steps.metadata_df
+ return inputs
+
+
+class TimeSeriesLinearRegression(PlotStep):
+ display_name = "Linear Regression"
+ operation = "Time series analysis"
+ method_description = ("A function to fit a linear model using ordinary least squares for each protein. "
+ "The linear model fits the protein intensities on Y axis and the Time on X. "
+ "The p-values are corrected for multiple testing.")
+
+ input_keys = [
+ "intensity_df",
+ "metadata_df",
+ "time_column",
+ "protein_group",
+ "train_size",
+ "grouping",
+ "grouping_column",
+ ]
+ output_keys = [
+ "scores",
+ ]
+
+ def method(self, inputs: dict) -> dict:
+ return time_series_linear_regression(**inputs)
+
+ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+ inputs["intensity_df"] = steps.protein_df
+ inputs["metadata_df"] = steps.metadata_df
+ return inputs
+
+
+class TimeSeriesRANSACRegression(PlotStep):
+ display_name = "RANSAC Regression"
+ operation = "Time series analysis"
+ method_description = " Perform RANSAC regression on the time series data for a given protein group."
+
+ input_keys = [
+ "intensity_df",
+ "metadata_df",
+ "time_column",
+ "protein_group",
+ "max_trials",
+ "stop_probability",
+ "loss",
+ "train_size",
+ "grouping",
+ "grouping_column",
+ ]
+ output_keys = [
+ "scores",
+ ]
+ def method(self, inputs: dict) -> dict:
+ return time_series_ransac_regression(**inputs)
+
+ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+ inputs["intensity_df"] = steps.protein_df
+ inputs["metadata_df"] = steps.metadata_df
+ return inputs
+
+
+class TimeSeriesADFullerTest(DataAnalysisStep):
+ display_name = "Augmented Dickey-Fuller Test"
+ operation = "Time series analysis"
+ method_description = (
+ "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test "
+ "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the "
+ "time series can be represented by a unit root, which implies that the time series is not stationary. "
+ "The alternative hypothesis is that the time series is stationary. If the p-value is less than the "
+ "significance level, the null hypothesis can be rejected and the time series is considered stationary."
+ "Dickey, D. & Fuller, Wayne. (1979). Distribution of the Estimators for Autoregressive Time Series With a Unit Root. "
+ "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. "
+ )
+
+ input_keys = [
+ "intensity_df",
+ "metadata_df",
+ "time_column",
+ "protein_group",
+ "alpha",
+ ]
+ output_keys = [
+ "test_statistic",
+ "p_value",
+ "critical_values",
+ "is_stationary",
+ ]
+
+ def method(self, inputs: dict) -> dict:
+ return adfuller_test(**inputs)
+
+ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+ inputs["intensity_df"] = steps.protein_df
+ inputs["metadata_df"] = steps.metadata_df
+ return inputs
+
+
+class TimeSeriesAutoARIMA(PlotStep):
+ display_name = "Auto ARIMA (AutoRegressive Integrated Moving Average)"
+ operation = "Time series analysis"
+ method_description = (
+ "Perform Auto ARIMA on the time series data for a given protein group."
+ )
+
+ input_keys = [
+ "intensity_df",
+ "metadata_df",
+ "time_column",
+ "protein_group",
+ "seasonal",
+ "m",
+ "train_size",
+ "grouping",
+ "grouping_column",
+ ]
+ output_keys = [
+ "scores",
+ ]
+
+ def method(self, inputs: dict) -> dict:
+ return time_series_auto_arima(**inputs)
+
+ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+ inputs["intensity_df"] = steps.protein_df
+ inputs["metadata_df"] = steps.metadata_df
+ return inputs
+
+
+class TimeSeriesARIMA(PlotStep):
+ display_name = "ARIMA (AutoRegressive Integrated Moving Average)"
+ operation = "Time series analysis"
+ method_description = (
+ "Perform ARIMA on the time series data for a given protein group."
+ )
+
+ input_keys = [
+ "intensity_df",
+ "metadata_df",
+ "time_column",
+ "protein_group",
+ "seasonal",
+ "p",
+ "d",
+ "q",
+ "P",
+ "D",
+ "Q",
+ "s",
+ "train_size",
+ "grouping",
+ "grouping_column",
+ ]
+ output_keys = [
+ "scores",
+ ]
+
+ def method(self, inputs: dict) -> dict:
+ return time_series_arima(**inputs)
+
+ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
+ inputs["intensity_df"] = steps.protein_df
+ inputs["metadata_df"] = steps.metadata_df
+ return inputs
+
+
class PTMsPerSample(DataAnalysisStep):
display_name = "PTMs per Sample"
operation = "Peptide analysis"
@@ -813,3 +1012,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict:
Step, "peptide_df", inputs["peptide_df"]
)
return inputs
+
diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py
index 0565eaf0..50373899 100644
--- a/protzilla/methods/data_preprocessing.py
+++ b/protzilla/methods/data_preprocessing.py
@@ -329,8 +329,8 @@ class FilterPeptidesByPEPThreshold(DataPreprocessingStep):
operation = "filter_peptides"
method_description = "Filter by PEP-threshold"
- input_keys = ["protein_df", "peptide_df", "threshold"]
- output_keys = ["protein_df", "peptide_df", "filtered_peptides"]
+ input_keys = ["peptide_df", "threshold"]
+ output_keys = ["peptide_df", "filtered_peptides"]
def method(self, inputs):
return peptide_filter.by_pep_value(**inputs)
diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py
index 7cde1ba0..f94218f4 100644
--- a/protzilla/methods/importing.py
+++ b/protzilla/methods/importing.py
@@ -9,6 +9,7 @@
diann_import,
max_quant_import,
ms_fragger_import,
+ tmt_data_import,
)
from protzilla.importing.peptide_import import peptide_import, evidence_import
from protzilla.steps import Step, StepManager
@@ -60,6 +61,17 @@ def method(self, inputs):
return ms_fragger_import(**inputs)
+class TMTImport(ImportingStep):
+ display_name = "TMT"
+ operation = "msdataimport"
+ method_description = "TMT data import"
+ input_keys = ["file_path", "map_to_uniprot", "aggregation_method"]
+ output_keys = ["protein_df"]
+
+ def method(self, inputs):
+ return tmt_data_import(**inputs)
+
+
class MetadataImport(ImportingStep):
display_name = "Metadata import"
operation = "metadataimport"
@@ -96,7 +108,8 @@ class MetadataColumnAssignment(ImportingStep):
display_name = "Metadata column assignment"
operation = "metadataimport"
method_description = (
- "Assign columns to metadata categories, repeatable for each category"
+ "Protzilla uses a unique metadata column name to identify certain features in the metadata. "
+ "This step assigns the metadata columns to the correct feature."
)
input_keys = [
diff --git a/protzilla/steps.py b/protzilla/steps.py
index d5fb124e..32ce93b3 100644
--- a/protzilla/steps.py
+++ b/protzilla/steps.py
@@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None):
self.messages: Messages = Messages([])
self.output: Output = Output()
self.plots: Plots = Plots()
+ self.display_output: DisplayOutput = DisplayOutput()
self.instance_identifier = instance_identifier
if self.instance_identifier is None:
@@ -310,6 +311,27 @@ def export(self, format_):
exports.append(BytesIO(base64.b64decode(plot)))
return exports
+class DisplayOutput:
+
+ def __init__(self, display_output: dict = None):
+ if display_output is None:
+ display_output = {}
+ self.display_output = display_output
+ def __iter__(self):
+ return iter(self.display_output)
+ def __repr__(self):
+ return f"DisplayOutput: {self.display_output}"
+ def __contains__(self, key):
+ return key in self.display_output
+ def __getitem__(self, key):
+ return self.display_output[key]
+ def __setitem__(self, key, value):
+ self.display_output[key] = value
+ def is_empty(self) -> bool:
+ return len(self.display_output) == 0
+
+
+
class StepManager:
def __repr__(self):
diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py
index 59a83259..fdb931e7 100644
--- a/protzilla/utilities/transform_dfs.py
+++ b/protzilla/utilities/transform_dfs.py
@@ -12,6 +12,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
:param intensity_df: the dataframe that should be transformed into
long format
:type intensity_df: pd.DataFrame
+ :param value_name: the name of the column in the metadata_df that contains the intensity information.
:return: returns dataframe in wide format suitable for use by
packages such as sklearn
@@ -23,6 +24,31 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
)
+def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column: str = None):
+ """
+ This function transforms the dataframe to a wide format that
+ can be more easily handled by packages such as sklearn.
+ Each sample gets one row with all observations as columns.
+
+ :param intensity_df: the dataframe that should be transformed into
+ long format
+ :type intensity_df: pd.DataFrame
+ :param value_name: the name of the column in the metadata_df that contains the intensity information.
+ :param time_column: the name of the column in the metadata_df that contains the time information.
+
+ :return: returns dataframe in wide format suitable for use by
+ packages such as sklearn
+ :rtype: pd.DataFrame
+ """
+ if intensity_df.duplicated(subset=[time_column, "Protein ID"]).any():
+ intensity_df = intensity_df.groupby([time_column, "Protein ID"]).mean().reset_index()
+ values_name = default_intensity_column(intensity_df) if value_name is None else value_name
+ intensity_df = pd.pivot(
+ intensity_df, index=time_column, columns="Protein ID", values=values_name
+ )
+ intensity_df = intensity_df.fillna(intensity_df.mean())
+ return intensity_df
+
def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
"""
This functions transforms the dataframe from a wide
@@ -58,15 +84,16 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
return intensity_df
-def is_long_format(df: pd.DataFrame):
- return set(df.columns[:3]) == {"Sample", "Protein ID", "Gene"}
+def is_long_format(df: pd.DataFrame, time_column: str = None):
+ required_columns = {"Sample", "Protein ID"}
+ additional_columns = {"Gene", time_column}
+ return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns)
def is_intensity_df(df: pd.DataFrame):
"""
Checks if the dataframe is an intensity dataframe.
- An intensity dataframe should have the columns "Sample", "Protein ID" and
- and intensity column.
+ An intensity dataframe should have the columns "Sample", "Protein ID" and intensity column.
:param df: the dataframe that should be checked
:type df: pd.DataFrame
diff --git a/requirements.txt b/requirements.txt
index bc175e2a..e7f0c7ed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,6 +21,7 @@ restring==0.1.20
scikit-learn==1.2.2
scipy==1.10.1
statsmodels==0.13.5
+pmdarima==2.0.4
umap-learn==0.5.3
Werkzeug==2.2.3
numba==0.57.0
diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py
new file mode 100644
index 00000000..7bdebbda
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_time_series_analysis.py
@@ -0,0 +1,414 @@
+import pandas as pd
+import pytest
+
+from protzilla.data_analysis.time_series_regression_analysis import (
+ time_series_linear_regression,
+ time_series_ransac_regression,
+ adfuller_test,
+ time_series_auto_arima,
+ time_series_arima,
+)
+
+
+@pytest.fixture
+def time_series_test_data():
+ test_intensity_list = (
+ ["Sample1", "Protein1", "Gene1", 20],
+ ["Sample1", "Protein2", "Gene1", 16],
+ ["Sample1", "Protein3", "Gene1", 1],
+ ["Sample1", "Protein4", "Gene1", 14],
+ ["Sample2", "Protein1", "Gene1", 20],
+ ["Sample2", "Protein2", "Gene1", 15],
+ ["Sample2", "Protein3", "Gene1", 2],
+ ["Sample2", "Protein4", "Gene1", 15],
+ ["Sample3", "Protein1", "Gene1", 22],
+ ["Sample3", "Protein2", "Gene1", 14],
+ ["Sample3", "Protein3", "Gene1", 3],
+ ["Sample3", "Protein4", "Gene1", 16],
+ ["Sample4", "Protein1", "Gene1", 8],
+ ["Sample4", "Protein2", "Gene1", 15],
+ ["Sample4", "Protein3", "Gene1", 1],
+ ["Sample4", "Protein4", "Gene1", 9],
+ ["Sample5", "Protein1", "Gene1", 10],
+ ["Sample5", "Protein2", "Gene1", 14],
+ ["Sample5", "Protein3", "Gene1", 2],
+ ["Sample5", "Protein4", "Gene1", 10],
+ ["Sample6", "Protein1", "Gene1", 12],
+ ["Sample6", "Protein2", "Gene1", 13],
+ ["Sample6", "Protein3", "Gene1", 3],
+ ["Sample6", "Protein4", "Gene1", 11],
+ ["Sample7", "Protein1", "Gene1", 12],
+ ["Sample7", "Protein2", "Gene1", 13],
+ ["Sample7", "Protein3", "Gene1", 3],
+ ["Sample7", "Protein4", "Gene1", 11],
+ ["Sample1", "Protein1", "Gene2", 10],
+ ["Sample1", "Protein2", "Gene2", 14],
+ ["Sample1", "Protein3", "Gene2", 2],
+ ["Sample1", "Protein4", "Gene2", 10],
+ ["Sample2", "Protein1", "Gene2", 12],
+ ["Sample2", "Protein1", "Gene3", 13],
+
+ )
+
+ test_intensity_df = pd.DataFrame(
+ data=test_intensity_list,
+ columns=["Sample", "Protein ID", "Gene", "Intensity"],
+ )
+
+ test_metadata_df = (
+ ["Sample1", "2", "1"],
+ ["Sample2", "6", "1"],
+ ["Sample3", "7", "1"],
+ ["Sample4", "8", "1"],
+ ["Sample5", "2", "2"],
+ ["Sample6", "6", "2"],
+ ["Sample7", "7", "2"],
+ )
+ test_metadata_df = pd.DataFrame(
+ data=test_metadata_df,
+ columns=["Sample", "Time", "Group"],
+ )
+ return test_intensity_df, test_metadata_df
+
+def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_linear_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ 0.8,
+ "Protein1",
+ "Group",
+ "With Grouping"
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_linear_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ 0.8,
+ "Protein1",
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_linear_regression_plot_invalid_train_size(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ with pytest.raises(ValueError):
+ time_series_linear_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ 2,
+ "Protein1",
+ "With Grouping",
+ "Group",
+ )
+ return
+
+def test_linear_regression_outputs(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_linear_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ 0.8,
+ "Protein1",
+ "With Grouping",
+ "Group",
+ )
+ assert "scores" in outputs
+ return
+
+
+def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_ransac_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ 100,
+ 0.99,
+ "absolute_error",
+ 0.8,
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_ransac_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ 100,
+ 0.99,
+ "absolute_error",
+ 0.8,
+ "With Grouping",
+ "Group",
+
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_ransac_plot_invalid_train_size(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ with pytest.raises(ValueError):
+ time_series_ransac_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ 100,
+ 0.99,
+ "absolute_error",
+ 2,
+ "With Grouping",
+ "Group",
+ )
+ return
+
+def test_ransac_regression_outputs(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_ransac_regression(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ 100,
+ 0.99,
+ "absolute_error",
+ 0.8,
+ "With Grouping",
+ "Group",
+ )
+ assert "scores" in outputs
+ return
+
+
+def test_adfuller_test(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = adfuller_test(test_intensity, test_metadata, "Time", "Protein1")
+
+ assert "test_statistic" in outputs
+ assert "p_value" in outputs
+ assert "critical_values" in outputs
+ assert "is_stationary" in outputs
+ assert "messages" in outputs
+ return
+
+
+def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_auto_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_auto_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_auto_arima_plot_invalid_train_size(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ with pytest.raises(ValueError):
+ time_series_auto_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 2,
+ "With Grouping",
+ "Group",
+ )
+ return
+
+
+def test_auto_arima_outputs(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_auto_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "scores" in outputs
+ return
+
+
+def test_arima_plot_with_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_arima_plot_without_grouping(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_arima_plot_invalid_train_size(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ with pytest.raises(ValueError):
+ time_series_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 2,
+ "With Grouping",
+ "Group",
+ )
+ return
+
+
+def test_arima_outputs(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_series_arima(
+ test_intensity,
+ test_metadata,
+ "Time",
+ "Protein1",
+ "No",
+ 1,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0.5,
+ "With Grouping",
+ "Group",
+ )
+ assert "scores" in outputs
+ return
\ No newline at end of file
diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
new file mode 100644
index 00000000..ca3fe4fa
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import pytest
+
+from protzilla.data_analysis.time_series_plots import time_quant_plot
+
+
+@pytest.fixture
+def time_series_test_data():
+ test_intensity_list = (
+ ["Sample1", "Protein1", "Gene1", 20],
+ ["Sample1", "Protein2", "Gene1", 16],
+ ["Sample1", "Protein3", "Gene1", 1],
+ ["Sample1", "Protein4", "Gene1", 14],
+ ["Sample2", "Protein1", "Gene1", 20],
+ ["Sample2", "Protein2", "Gene1", 15],
+ ["Sample2", "Protein3", "Gene1", 2],
+ ["Sample2", "Protein4", "Gene1", 15],
+ ["Sample3", "Protein1", "Gene1", 22],
+ ["Sample3", "Protein2", "Gene1", 14],
+ ["Sample3", "Protein3", "Gene1", 3],
+ ["Sample3", "Protein4", "Gene1", 16],
+ ["Sample4", "Protein1", "Gene1", 8],
+ ["Sample4", "Protein2", "Gene1", 15],
+ ["Sample4", "Protein3", "Gene1", 1],
+ ["Sample4", "Protein4", "Gene1", 9],
+ ["Sample5", "Protein1", "Gene1", 10],
+ ["Sample5", "Protein2", "Gene1", 14],
+ ["Sample5", "Protein3", "Gene1", 2],
+ ["Sample5", "Protein4", "Gene1", 10],
+ ["Sample6", "Protein1", "Gene1", 12],
+ ["Sample6", "Protein2", "Gene1", 13],
+ ["Sample6", "Protein3", "Gene1", 3],
+ ["Sample6", "Protein4", "Gene1", 11],
+ ["Sample7", "Protein1", "Gene1", 12],
+ ["Sample7", "Protein2", "Gene1", 13],
+ ["Sample7", "Protein3", "Gene1", 3],
+ ["Sample7", "Protein4", "Gene1", 11],
+ )
+
+ test_intensity_df = pd.DataFrame(
+ data=test_intensity_list,
+ columns=["Sample", "Protein ID", "Gene", "Intensity"],
+ )
+
+ test_metadata_df = (
+ ["Sample1", "2", 1],
+ ["Sample2", "6", 1],
+ ["Sample3", "7", 1],
+ ["Sample4", "10", 1],
+ )
+ test_metadata_df = pd.DataFrame(
+ data=test_metadata_df,
+ columns=["Sample", "Time", "Day"],
+ )
+ return test_intensity_df, test_metadata_df
+
+def test_time_series_plot(show_figures, time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ outputs = time_quant_plot(test_intensity, test_metadata, "Time","Protein1")
+ assert "plots" in outputs
+ fig = outputs["plots"][0]
+ if show_figures:
+ fig.show()
+ return
+
+def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ with pytest.raises(ValueError):
+ time_quant_plot(test_intensity, test_metadata, "Time", "Protein1", similarity=-1, similarity_measure="euclidean distance")
+ return
+
+def test_time_series_plot_invalid_cosine_similarity(time_series_test_data):
+ test_intensity, test_metadata = time_series_test_data
+ with pytest.raises(ValueError):
+ time_quant_plot(test_intensity, test_metadata, "Time","Protein1", similarity=2, similarity_measure="cosine similarity")
+ return
diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py
index b5de3148..18080f48 100644
--- a/tests/protzilla/test_runner.py
+++ b/tests/protzilla/test_runner.py
@@ -94,8 +94,8 @@ def test_runner_imports(
'FilterSamplesByProteinIntensitiesSum',
'ImputationByKNN',
'OutlierDetectionByLocalOutlierFactor',
- 'NormalisationByMedian',
'TransformationLog',
+ 'NormalisationByMedian',
'PlotProtQuant',
'DifferentialExpressionTTest',
'PlotVolcano',
@@ -109,8 +109,8 @@ def test_runner_imports(
call({'deviation_threshold': 2.0}),
call({'number_of_neighbours': 5}),
call({'number_of_neighbors': 20}),
- call({'percentile': 0.5}),
call({'log_base': 'log2'}),
+ call({'percentile': 0.5}),
call({'similarity_measure': 'euclidean distance'}),
call({'alpha': 0.05}),
call({'fc_threshold': 1}),
diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py
index bc78fd71..a0e58689 100644
--- a/ui/runs/form_mapping.py
+++ b/ui/runs/form_mapping.py
@@ -17,6 +17,7 @@
importing.MaxQuantImport: importing_forms.MaxQuantImportForm,
importing.DiannImport: importing_forms.DiannImportForm,
importing.MsFraggerImport: importing_forms.MSFraggerImportForm,
+ importing.TMTImport: importing_forms.TMTImportForm,
importing.MetadataImport: importing_forms.MetadataImportForm,
importing.MetadataImportMethodDiann: importing_forms.MetadataImportMethodDiannForm,
importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm,
@@ -49,6 +50,7 @@
data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm,
data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm,
data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm,
+ data_analysis.PlotTimeQuant: data_analysis_forms.PlotTimeQuantForm,
data_analysis.PlotPrecisionRecallCurve: data_analysis_forms.PlotPrecisionRecallCurveForm,
data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm,
data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm,
@@ -65,6 +67,11 @@
data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm,
data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm,
data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm,
+ data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm,
+ data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm,
+ data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm,
+ data_analysis.TimeSeriesAutoARIMA: data_analysis_forms.TimeSeriesAutoARIMAForm,
+ data_analysis.TimeSeriesARIMA: data_analysis_forms.TimeSeriesARIMAForm,
data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms,
data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm,
data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm,
diff --git a/ui/runs/forms/custom_fields.py b/ui/runs/forms/custom_fields.py
index 7171f173..7370b64b 100644
--- a/ui/runs/forms/custom_fields.py
+++ b/ui/runs/forms/custom_fields.py
@@ -1,6 +1,8 @@
+import json
import logging
from enum import Enum
+import django.forms as forms
from django.forms import (
BooleanField,
CharField,
@@ -126,3 +128,32 @@ class CustomFloatField(FloatField):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.widget.attrs.update({"class": "form-control mb-2"})
+
+
+from django import forms
+from django.utils.safestring import mark_safe
+
+
+class TextDisplayWidget(forms.Widget):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.attrs.update()
+
+ def render(self, name, value, attrs=None, renderer=None):
+ display_text = self.attrs.get("data-display-text", "")
+ return mark_safe(f"