diff --git a/protzilla/constants/colors.py b/protzilla/constants/colors.py index eec08b1b..98daf656 100644 --- a/protzilla/constants/colors.py +++ b/protzilla/constants/colors.py @@ -1,8 +1,44 @@ PROTZILLA_DISCRETE_COLOR_SEQUENCE = [ - "#4A536A", - "#87A8B9", - "#CE5A5A", - "#8E3325", - "#E2A46D", + # Set 1: Muted Dark Slate + "#252935", "#3A3F50", "#50556A", "#6B7186", "#858DA2", + # Set 2: Muted Indian Red + "#CE5A5A", "#B24C4C", "#9D3F3F", "#E07272", "#F48D8D", + # Set 3: Muted Light Steel Blue + "#51646F", "#6A7D89", "#7F92A0", "#96A9B8", "#ADBFCD", + # Set 4: Muted Sienna + "#804538", "#6F3C31", "#5F342A", "#A05748", "#B66E5E", + # Set 5: Muted Sandy Brown + "#715236", "#63472F", "#57402B", "#96755A", "#A98575", + # Set 6: Muted Olive + "#6E6B48", "#5D5B3E", "#4E4D36", "#89875C", "#A1A16E", + # Set 7: Muted Teal + "#3B6B6A", "#315B5B", "#274C4C", "#507E7E", "#6B9898", + # Set 8: Muted Taupe + "#8B7E74", "#776F65", "#675E56", "#A09085", "#B9AAA1", + # Set 9: Muted Burgundy + "#7B3A4F", "#6A3345", "#582C3C", "#925664", "#A8737E", + # Set 10: Muted Forest Green + "#3D5047", "#35453E", "#2D3B35", "#5F7267", "#7B8D80", + # Set 11: Muted Navy + "#2F3E4C", "#283442", "#222B38", "#485669", "#627185", + # Set 12: Muted Mustard + "#BFA054", "#A98F4A", "#927D3F", "#D7BA75", "#E2CD96", + # Set 13: Muted Dusty Rose + "#C18394", "#AA727E", "#93616C", "#D69BA7", "#E4B8C2", + # Set 14: Muted Lavender + "#8A729D", "#7A638C", "#6A547C", "#A591B3", "#BDA9C8", + # Set 15: Muted Charcoal + "#404040", "#353535", "#2B2B2B", "#585858", "#707070", + # Set 16: Muted Emerald Green + "#4D7456", "#426448", "#37563B", "#6A9177", "#85A990", + # Set 17: Muted Peach + "#D89B83", "#C2866F", "#A7725E", "#E3B39C", "#ECC7B6", + # Set 18: Muted Plum + "#704F6E", "#634464", "#563A59", "#876A87", "#A18AA1", + # Set 19: Muted Periwinkle + "#7E8DAF", "#6F7B98", "#616A82", "#97A3BF", "#B0B9D1", + # Set 20: Muted Coral + "#CC7A5E", "#B26951", "#9A5A45", "#DD937C", "#EBAA99" ] + 
PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"]
diff --git a/protzilla/data_analysis/time_series_helper.py b/protzilla/data_analysis/time_series_helper.py
new file mode 100644
index 00000000..e643fe93
--- /dev/null
+++ b/protzilla/data_analysis/time_series_helper.py
@@ -0,0 +1,12 @@
+from datetime import datetime
+
+
+def convert_time_to_hours(time_str):
+    """
+    Convert a '%H:%M:%S' time string to the number of hours since midnight.
+
+    :param time_str: The time string to convert, in format '%H:%M:%S'
+    :return: Number of hours since midnight as a float
+    """
+    time_obj = datetime.strptime(time_str, '%H:%M:%S')
+    return time_obj.hour + time_obj.minute / 60 + time_obj.second / 3600
diff --git a/protzilla/data_analysis/time_series_plots.py b/protzilla/data_analysis/time_series_plots.py
new file mode 100644
index 00000000..37c8ad34
--- /dev/null
+++ b/protzilla/data_analysis/time_series_plots.py
@@ -0,0 +1,179 @@
+import pandas as pd
+import plotly.graph_objects as go
+from scipy import stats
+from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
+
+from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time
+from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
+
+# Define color constants
+colors = {
+    "plot_bgcolor": "white",
+    "gridcolor": "#F1F1F1",
+    "linecolor": "#F1F1F1",
+    "annotation_text_color": "#ffffff",
+    "annotation_proteins_of_interest": "#4A536A",
+}
+
+def time_quant_plot(
+    intensity_df: pd.DataFrame,
+    metadata_df: pd.DataFrame,
+    time_column: str,
+    protein_group: str,
+    similarity: float = 1.0,
+    similarity_measure: str = "euclidean distance",
+) -> dict:
+    """
+    A function to create a graph visualising protein quantifications across all samples
+    as a line diagram using time.
It's possible to select one proteingroup + that will be displayed in orange and choose a similarity measurement with a similarity score + to get all proteingroups that are similar displayed in another color in this line diagram. + All other proteingroups are displayed in the background as a grey polygon. + + :param intensity_df: A dataframe in protzilla wide format, where each row + represents a sample and each column represents a feature. + :param metadata_df: A dataframe containing the metadata of the samples. + :param time_column: The name of the column in the metadata_df that contains the time information. + :param protein_group: Protein IDs as the columnheader of the dataframe + :param similarity_measure: method to compare the chosen proteingroup with all others. The two + methods are "cosine similarity" and "euclidean distance". + :param similarity: similarity score of the chosen similarity measurement method. + + :return: returns a dictionary containing a list with a plotly figure and/or a list of messages + """ + + intensity_df = pd.merge( + left=intensity_df, + right=metadata_df[["Sample", time_column]], + on="Sample", + copy=False, + ) + + wide_df = intensity_df.interpolate(method='linear', axis=0) + wide_df = long_to_wide_time(wide_df, time_column=time_column) if is_long_format(wide_df, time_column=time_column) else wide_df + + + if protein_group not in wide_df.columns: + raise ValueError("Please select a valid protein group.") + elif similarity_measure == "euclidean distance" and similarity < 0: + raise ValueError( + "Similarity for euclidean distance should be greater than or equal to 0." 
+ ) + elif similarity_measure == "cosine similarity" and ( + similarity < -1 or similarity > 1 + ): + raise ValueError("Similarity for cosine similarity should be between -1 and 1") + + fig = go.Figure() + + color_mapping = { + "A": PROTZILLA_DISCRETE_COLOR_SEQUENCE[0], + "C": PROTZILLA_DISCRETE_COLOR_SEQUENCE[4], + } + + lower_upper_x = [] + lower_upper_y = [] + + lower_upper_x.append(wide_df.index[0]) + lower_upper_y.append(wide_df.iloc[0].min()) + + for index, row in wide_df.iterrows(): + lower_upper_x.append(index) + lower_upper_y.append(row.max()) + + for index, row in reversed(list(wide_df.iterrows())): + lower_upper_x.append(index) + lower_upper_y.append(row.min()) + + fig.add_trace( + go.Scatter( + x=lower_upper_x, + y=lower_upper_y, + fill="toself", + name="Intensity Range", + line=dict(color="silver"), + ) + ) + + similar_groups = [] + for group_to_compare in wide_df.columns: + if group_to_compare != protein_group: + if similarity_measure == "euclidean distance": + distance = euclidean_distances( + stats.zscore(wide_df[protein_group]).values.reshape(1, -1), + stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1), + )[0][0] + else: + distance = cosine_similarity( + stats.zscore(wide_df[protein_group]).values.reshape(1, -1), + stats.zscore(wide_df[group_to_compare]).values.reshape(1, -1), + )[0][0] + if similarity_measure == "euclidean distance": + if distance <= similarity: + similar_groups.append(group_to_compare) + else: + if distance >= similarity: + similar_groups.append(group_to_compare) + + for group in similar_groups: + fig.add_trace( + go.Scatter( + x=wide_df.index, + y=wide_df[group], + mode="lines", + name=group[:15] + "..." 
if len(group) > 15 else group,
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]),
+                showlegend=len(similar_groups) <= 7,
+            )
+        )
+
+    if len(similar_groups) > 7:
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]),
+                name="Similar Protein Groups",
+            )
+        )
+
+    formatted_protein_name = (
+        protein_group[:15] + "..." if len(protein_group) > 15 else protein_group
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=wide_df.index,
+            y=wide_df[protein_group],
+            mode="lines",
+            name=formatted_protein_name,
+            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]),
+        )
+    )
+    fig.update_layout(
+        title=f"Time Series of {formatted_protein_name} in all samples",
+        plot_bgcolor=colors["plot_bgcolor"],
+        xaxis_gridcolor=colors["gridcolor"],
+        yaxis_gridcolor=colors["gridcolor"],
+        xaxis_linecolor=colors["linecolor"],
+        yaxis_linecolor=colors["linecolor"],
+        xaxis_title=time_column,
+        yaxis_title="Intensity",
+        legend_title="Legend",
+        xaxis=dict(
+            tickmode="array",
+            tickangle=0,
+            tickvals=wide_df.index,
+            ticktext=[str(t) for t in wide_df.index],  # label ticks with the time index values
+        ),
+        autosize=True,
+        margin=dict(l=100, r=300, t=100, b=100),
+        legend=dict(
+            x=1.05,
+            y=1,
+            bgcolor="rgba(255, 255, 255, 0.5)",
+            orientation="v",
+        ),
+    )
+
+    return dict(plots=[fig])
\ No newline at end of file
diff --git a/protzilla/data_analysis/time_series_regression_analysis.py b/protzilla/data_analysis/time_series_regression_analysis.py
new file mode 100644
index 00000000..898f82f9
--- /dev/null
+++ b/protzilla/data_analysis/time_series_regression_analysis.py
@@ -0,0 +1,1008 @@
+import logging
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+
+from protzilla.utilities import default_intensity_column
+from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE
+
+from sklearn.linear_model import LinearRegression, RANSACRegressor
+from sklearn.model_selection import
train_test_split +from sklearn.metrics import mean_squared_error, r2_score +from statsmodels.tsa.arima.model import ARIMA +from statsmodels.tsa.stattools import adfuller +from pmdarima import auto_arima + +colors = { + "plot_bgcolor": "white", + "gridcolor": "#F1F1F1", + "linecolor": "#F1F1F1", + "annotation_text_color": "#4c4c4c", + "annotation_proteins_of_interest": "#4A536A", +} + + +def time_series_linear_regression( + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + time_column: str, + train_size: float, + protein_group: str, + grouping: str, + grouping_column: str, +): + """ + Perform linear regression on the time series data for a given protein group. + :param intensity_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column: The name of the column containing the time values + :param protein_group: Protein group to perform the analysis on + :param train_size: The proportion of the dataset to include in the test split + :param grouping_column: The name of the column containing the grouping information + :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + messages = [] + color_index = 0 + if train_size < 0 or train_size > 1: + raise ValueError("Test size should be between 0 and 1") + + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_column_name = default_intensity_column(intensity_df) + intensity_df = pd.merge( + left=intensity_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) + + X = intensity_df[[time_column]] + y = intensity_df[intensity_column_name] + + fig = go.Figure() + + scores = [] + + if grouping == 
"With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() + for group in groups: + group_df = intensity_df[intensity_df[grouping_column] == group] + X_group = group_df[[time_column]] + y_group = group_df[intensity_column_name] + + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train, y_pred_train) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame({time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame({time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + plot_df = pd.concat([train_df, test_df]) + + color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index % len(PROTZILLA_DISCRETE_COLOR_SEQUENCE)] + color_index += 5 + + fig.add_trace(go.Scatter( + x=plot_df[time_column], + y=plot_df['Intensity'], + mode='markers', + name=f'Actual Intensity ({group})', + marker=dict(color=color) + ) + ) + + fig.add_trace(go.Scatter( + x=plot_df[time_column], + y=plot_df['Predicted'], + mode='lines', + name=f'Predicted Intensity ({group})', + line=dict(color=color) + ) + ) + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + else: + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=False) + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + train_rmse = np.sqrt(mean_squared_error(y_train, 
y_pred_train)) + test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test)) + train_r2 = r2_score(y_train, y_pred_train) + test_r2 = r2_score(y_test, y_pred_test) + + train_df = pd.DataFrame( + {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, + 'Type': 'Train'}) + test_df = pd.DataFrame( + {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + plot_df = pd.concat([train_df, test_df]) + + fig.add_trace(go.Scatter( + x=plot_df[time_column], + y=plot_df['Intensity'], + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ) + ) + + fig.add_trace(go.Scatter( + x=plot_df[time_column], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[5]) + ) + ) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + # Add annotation text as a separate trace in the subplot + annotation_text = "
".join([ + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + for res in scores + ]) + + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title=time_column, + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=300, t=100, b=100), + legend=dict( + y=1.05, + x=1, + bgcolor = "rgba(255, 255, 255, 0.5)", + orientation = "v", + ) + ) + + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) + + return dict( + scores=scores, + plots=[fig], + messages=messages, + ) + + +def time_series_ransac_regression( + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + time_column: str, + protein_group: str, + max_trials: int, + stop_probability: float, + loss: str, + train_size: float, + grouping: str, + grouping_column: str, +): + """ + Perform RANSAC regression on the time series data for a given protein group. 
+ :param intensity_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column: The name of the column containing the time values + :param max_trials: The maximum number of iterations to perform + :param stop_probability: The probability to stop the RANSAC algorithm + :param loss: The loss function to use + :param protein_group: Protein group to perform the analysis on + :param train_size: The proportion of the dataset to include in the test split + :param grouping_column: The name of the column containing the grouping information + :param grouping: Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + messages = [] + color_index = 0 + if train_size < 0 or train_size > 1: + raise ValueError("Test size should be between 0 and 1") + + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_column_name = default_intensity_column(intensity_df) + + intensity_df = pd.merge( + left=intensity_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + intensity_df = intensity_df.sample(frac=1, random_state = 42).reset_index(drop=True) + + X = intensity_df[[time_column]] + y = intensity_df[intensity_column_name] + + fig = go.Figure() + + scores = [] + + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() + for group in groups: + group_df = intensity_df[intensity_df[grouping_column] == group] + X_group = group_df[[time_column]] + y_group = group_df[intensity_column_name] + + X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, train_size=train_size, shuffle=False) + model = RANSACRegressor(max_trials = max_trials, stop_probability = stop_probability, loss = loss, 
base_estimator=LinearRegression()) + model.fit(X_train, y_train) + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + inlier_mask_train = model.inlier_mask_ + + # Predict the inliers for the test set + test_inlier_mask = model.predict( + X_test) == y_pred_test + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])) + test_rmse = np.sqrt(mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])) + train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train]) + test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask]) + + # Prepare DataFrames for plotting + train_df = pd.DataFrame( + {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame( + {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask_train + test_df['Inlier'] = test_inlier_mask + plot_df = pd.concat([train_df, test_df]) + + # Add main plot traces + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == True][time_column], + y=plot_df[plot_df['Inlier'] == True]['Intensity'], + mode='markers', + name=f'Inliers ({group})', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) + ) + ) + + fig.add_trace(go.Scatter( + x=plot_df[time_column], + y=plot_df['Predicted'], + mode='lines', + name=f'Predicted Intensity ({group})', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 2]) + ) + ) + + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == False][time_column], + y=plot_df[plot_df['Inlier'] == False]['Intensity'], + mode='markers', + name='Outliers', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) + + color_index += 5 + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': 
train_r2, + 'test_r2_score': test_r2, + }) + + else: + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, shuffle=False) + model = RANSACRegressor(base_estimator=LinearRegression()) + model.fit(X_train, y_train) + + + y_pred_train = model.predict(X_train) + y_pred_test = model.predict(X_test) + + inlier_mask_train = model.inlier_mask_ + + # Predict the inliers for the test set + test_inlier_mask = model.predict(X_test) == y_pred_test + + train_rmse = np.sqrt(mean_squared_error(y_train[inlier_mask_train], y_pred_train[inlier_mask_train])) + test_rmse = np.sqrt(mean_squared_error(y_test[test_inlier_mask], y_pred_test[test_inlier_mask])) + train_r2 = r2_score(y_train[inlier_mask_train], y_pred_train[inlier_mask_train]) + test_r2 = r2_score(y_test[test_inlier_mask], y_pred_test[test_inlier_mask]) + + # Prepare DataFrames for plotting + train_df = pd.DataFrame( + {time_column: X_train[time_column], 'Intensity': y_train, 'Predicted': y_pred_train, 'Type': 'Train'}) + test_df = pd.DataFrame( + {time_column: X_test[time_column], 'Intensity': y_test, 'Predicted': y_pred_test, 'Type': 'Test'}) + train_df['Inlier'] = inlier_mask_train + test_df['Inlier'] = test_inlier_mask + plot_df = pd.concat([train_df, test_df]) + + # Add main plot traces + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == True][time_column], + y=plot_df[plot_df['Inlier'] == True]['Intensity'], + mode='markers', + name='Inliers', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ) + ) + + fig.add_trace(go.Scatter( + x=plot_df[time_column], + y=plot_df['Predicted'], + mode='lines', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ) + ) + + fig.add_trace(go.Scatter( + x=plot_df[plot_df['Inlier'] == False][time_column], + y=plot_df[plot_df['Inlier'] == False]['Intensity'], + mode='markers', + name='Outliers', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ) + ) + + scores.append({ + 'group': 'Overall', + 
'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + # Add annotation text as a separate trace in the subplot + annotation_text = "
".join([ + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + for res in scores + ]) + + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title=time_column, + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), + ) + + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) + + return dict( + scores=scores, + plots=[fig], + messages=messages + ) + + +def adfuller_test( + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + time_column: str, + protein_group: str, + alpha: float = 0.05, +) -> dict: + """ + Perform the Augmented Dickey-Fuller test to check for stationarity in a time series. + :param intensity_df: The dataframe containing the time series data. + :param metadata_df: The dataframe containing the metadata. + :param protein_group: The protein group to perform the test on. + :param time_column: The column representing time (e.g., 'visit', 'timepoint'). + :param alpha: The significance level for the test (default is 0.05). + + :return: A dictionary containing: + - test_statistic: The test statistic from the ADF test. + - p_value: The p-value from the ADF test. + - critical_values: The critical values for different significance levels. + - is_stationary: A boolean indicating if the series is stationary. + - messages: A list of messages for the user. 
+ """ + + messages = [] + # Filter for the specific protein group + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_column_name = default_intensity_column(intensity_df) + + # Merge with metadata to include time information + merged_df = pd.merge( + left=intensity_df[["Sample", intensity_column_name]], + right=metadata_df[["Sample", time_column]], + on="Sample", + copy=False, + ) + + # Sort the data by time to ensure it is treated as a time series + merged_df = merged_df.sort_values(by=time_column) + grouped_df = merged_df.groupby(time_column)[intensity_column_name].mean().reset_index() + + # Extract the time series (after aggregation) + time_series = grouped_df[intensity_column_name].dropna() + + # Perform the ADF test + result = adfuller(time_series) + test_statistic = result[0] + p_value = result[1] + critical_values = result[4] + + # Determine if the series is stationary + is_stationary = p_value < alpha + + # Create a message for the user + if is_stationary: + messages.append( + { + "level": logging.INFO, + "msg": f"The time series is stationary (p-value: {p_value:.5f}).", + } + ) + else: + messages.append( + { + "level": logging.WARNING, + "msg": f"The time series is not stationary (p-value: {p_value:.5f}).", + } + ) + + return dict( + test_statistic=test_statistic, + p_value=p_value, + critical_values=critical_values, + is_stationary=is_stationary, + messages=messages, + ) + + +def time_series_auto_arima( + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + time_column: str, + protein_group: str, + seasonal: str, + m: int, + train_size: float, + grouping: str, + grouping_column: str, +) -> dict: + """ + Perform an automatic ARIMA model selection on the time series data for a given protein group. 
+ :param intensity_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column: The name of the column containing the time values + :param protein_group: Protein group to perform the analysis on + :param seasonal: Whether the ARIMA model should be seasonal + :param m: The number of time steps for a single seasonal period (ignored if seasonal=False) + :param train_size: The proportion of the dataset to include in the test split + :param grouping_column: The name of the column containing the grouping information + :param grouping: Whether to group the data by the 'Group' column + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + + messages = [] + color_index = 0 + + if train_size < 0 or train_size > 1: + raise ValueError("Train size should be between 0 and 1") + if seasonal == "Yes": + seasonal = True + else: + seasonal = False + + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True) + intensity_column_name = default_intensity_column(intensity_df) + + intensity_df = pd.merge( + left=intensity_df, + right=metadata_df, + on="Sample", + copy=False, + ) + + fig = go.Figure() + scores = [] + + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() + for group in groups: + group_df = intensity_df[intensity_df[grouping_column] == group] + + train_df_size = int(len(group_df) * train_size) + train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] + + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] + + # Fit the ARIMA model + model = auto_arima( + train_df, + seasonal=seasonal, + m=m, + trace=True, + error_action='ignore', + 
suppress_warnings=True, + stepwise=True, + ) + + # Forecast the test set + forecast = model.predict(n_periods=test_df.shape[0]) + parameters = model.get_params() + + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample())) + train_r2 = r2_score(train_df, model.predict_in_sample()) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name=f'Actual Intensity ({group})', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) + ) + ) + + fig.add_trace(go.Scatter( + x=test_df.index, + y=forecast, + mode='markers', + name=f'Predicted Intensity ({group})', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) + + fig.add_trace(go.Scatter( + x = forecast_plot.index, + y = forecast_plot, + mode = 'lines', + name = f'Mean Predicted Intensity ({group})', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) + + color_index += 5 + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + aa_order = parameters['order'] + aa_seasonal_order = parameters['seasonal_order'] + + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Order (p,d,q): {aa_order}.", + } + ) + if seasonal: + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.", + } + ) + + else: + train_size = int(len(intensity_df) * train_size) + train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] + + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = 
test_df.set_index(time_column)[intensity_column_name] + + # Fit the ARIMA model + model = auto_arima( + train_df, + seasonal=seasonal, + m=m, + trace=True, + error_action='ignore', + suppress_warnings=True, + stepwise=True, + ) + + # Forecast the test set + forecast = model.predict(n_periods=test_df.shape[0]) + parameters = model.get_params() + + aa_order = parameters['order'] + aa_seasonal_order = parameters['seasonal_order'] + + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Order (p,d,q): {aa_order}.", + } + ) + if seasonal: + messages.append( + { + "level": logging.INFO, + "msg": f"Auto Arima Seasonal Order (P,D,Q,s): {aa_seasonal_order}.", + } + ) + + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model.predict_in_sample())) + train_r2 = r2_score(train_df, model.predict_in_sample()) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ) + ) + + fig.add_trace(go.Scatter( + x=test_df.index, + y=forecast, + mode='markers', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ) + ) + + fig.add_trace(go.Scatter( + x=forecast_plot.index, + y=forecast_plot, + mode='lines', + name='Mean Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]) + ) + ) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + # Add annotation text as a separate trace in the subplot + annotation_text = "
".join([ + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + for res in scores + ]) + + + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title=time_column, + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), + ) + + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) + + return dict( + scores=scores, + plots=[fig], + messages=messages, + ) + + +def time_series_arima( + intensity_df: pd.DataFrame, + metadata_df: pd.DataFrame, + time_column: str, + protein_group: str, + seasonal: str, + p: int, + d: int, + q: int, + P: int, + D: int, + Q: int, + s: int, + train_size: float, + grouping: str, + grouping_column: str, +) -> dict: + + """ + Perform ARIMA model selection on the time series data for a given protein group. 
+ :param intensity_df: Peptide dataframe which contains the intensity of each sample + :param metadata_df: Metadata dataframe which contains the timestamps + :param time_column: The name of the column containing the time values + :param protein_group: Protein group to perform the analysis on + :param seasonal: Whether the ARIMA model should be seasonal + :param p: ARIMA p parameter + :param d: ARIMA d parameter + :param q: ARIMA q parameter + :param P: ARIMA seasonal P parameter + :param D: ARIMA seasonal D parameter + :param Q: ARIMA seasonal Q parameter + :param s: ARIMA seasonal s parameter + :param train_size: The proportion of the dataset to include in the test split + :param grouping_column: The name of the column containing the grouping information + :param grouping: Whether to group the data by the 'Group' column + + :return: A dictionary containing the root mean squared error and r2 score for the training and test sets + """ + messages = [] + color_index = 0 + + if train_size < 0 or train_size > 1: + raise ValueError("Train size should be between 0 and 1") + + intensity_df = intensity_df[intensity_df['Protein ID'] == protein_group] + intensity_df = intensity_df.sample(frac=1, random_state=42).reset_index(drop=True) + intensity_column_name = default_intensity_column(intensity_df) + + intensity_df = pd.merge(left=intensity_df, right=metadata_df, on="Sample", copy=False) + + fig = go.Figure() + scores = [] + + if grouping == "With Grouping" and grouping_column in intensity_df.columns: + groups = intensity_df[grouping_column].unique() + for group in groups: + group_df = intensity_df[intensity_df[grouping_column] == group] + + train_df_size = int(len(group_df) * train_size) + train_df, test_df = group_df[:train_df_size], group_df[train_df_size:] + + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] + + if seasonal == "Yes": + model = ARIMA( + train_df, + order=(p, d, q), + 
seasonal_order=(P, D, Q, s) + ) + else: + model = ARIMA( + train_df, + order=(p, d, q) + ) + + model_fit = model.fit() + + forecast = model_fit.forecast(steps=len(test_df)) + + test_rmse = np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues)) + train_r2 = r2_score(train_df, model_fit.fittedvalues) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_mean_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name=f'Actual Intensity ({group})', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index]) + ) + ) + + fig.add_trace(go.Scatter( + x=forecast_plot.index, + y=forecast_plot, + mode='markers', + name= f'Predicted Intensity ({group})', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) + + fig.add_trace(go.Scatter( + x = forecast_mean_plot.index, + y = forecast_mean_plot, + mode = 'lines', + name = f'Mean Predicted Intensity ({group})', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[color_index + 4]) + ) + ) + + color_index += 5 + + scores.append({ + 'group': group, + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + else: + train_size = int(len(intensity_df) * train_size) + train_df, test_df = intensity_df[:train_size], intensity_df[train_size:] + + train_df = train_df.set_index(time_column)[intensity_column_name] + test_df = test_df.set_index(time_column)[intensity_column_name] + + if seasonal == "Yes": + model = ARIMA( + train_df, + order=(p, d, q), + seasonal_order = (P, D, Q, s), + ) + else: + model = ARIMA(train_df, order=(p, d, q)) + + model_fit = model.fit() + + forecast = model_fit.forecast(steps=len(test_df)) + + test_rmse = 
np.sqrt(mean_squared_error(test_df, forecast)) + test_r2 = r2_score(test_df, forecast) + train_rmse = np.sqrt(mean_squared_error(train_df, model_fit.fittedvalues)) + train_r2 = r2_score(train_df, model_fit.fittedvalues) + + forecast_reset = forecast.reset_index(drop=True) + forecast_plot = pd.Series(forecast_reset.values, index=test_df.index) + forecast_plot = forecast_plot.groupby(forecast_plot.index).mean() + + fig.add_trace(go.Scatter( + x=test_df.index, + y=test_df, + mode='markers', + name='Actual Intensity', + marker=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[0]) + ) + ) + + fig.add_trace(go.Scatter( + x=test_df.index, + y=forecast, + mode='markers', + name='Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[3]) + ) + ) + + fig.add_trace(go.Scatter( + x=forecast_plot.index, + y=forecast_plot, + mode='lines', + name='Mean Predicted Intensity', + line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]) + ) + ) + + scores.append({ + 'group': 'Overall', + 'train_root_mean_squared': train_rmse, + 'test_root_mean_squared': test_rmse, + 'train_r2_score': train_r2, + 'test_r2_score': test_r2, + }) + + annotation_text = "
".join([ + f"Group: {res['group']} (Train/Test)" + f"
RMSE: {res['train_root_mean_squared']:.3f} / {res['test_root_mean_squared']:.3f}
" + f"R²: {res['train_r2_score']:.3f} / {res['test_r2_score']:.3f}
" + for res in scores + ]) + + fig.update_layout( + title=f"Intensity over Time for {protein_group}", + plot_bgcolor=colors["plot_bgcolor"], + xaxis_gridcolor=colors["gridcolor"], + yaxis_gridcolor=colors["gridcolor"], + xaxis_linecolor=colors["linecolor"], + yaxis_linecolor=colors["linecolor"], + xaxis_title=time_column, + yaxis_title="Intensity", + legend_title="Legend", + autosize=True, + margin=dict(l=100, r=100, t=100, b=50), + legend=dict( + x=1.05, + y=1, + bgcolor="rgba(255, 255, 255, 0.5)", + orientation="v", + ), + ) + + messages.append( + { + "level": logging.INFO, + "msg": annotation_text, + } + ) + + return dict( + scores=scores, + plots=[fig], + messages=messages, + ) diff --git a/protzilla/data_preprocessing/peptide_filter.py b/protzilla/data_preprocessing/peptide_filter.py index 3b1caee9..67745d15 100644 --- a/protzilla/data_preprocessing/peptide_filter.py +++ b/protzilla/data_preprocessing/peptide_filter.py @@ -50,3 +50,67 @@ def by_pep_value_plot(method_inputs, method_outputs, graph_type): elif graph_type == "Bar chart": fig = create_bar_plot(**value_dict) return [fig] + +def by_samples_missing( + protein_df: pd.DataFrame | None, + peptide_df: pd.DataFrame | None, + percentage: float = 0.5, +) -> dict: + """ + This function filters proteins based on the amount of samples with nan values, if the percentage of nan values + is below a threshold (percentage). + + :param protein_df: the protein dataframe that should be filtered + :param peptide_df: the peptide dataframe that should be filtered in accordance to the intensity dataframe (optional) + :param percentage: ranging from 0 to 1. Defining the relative share of samples the proteins need to be present in, + in order for the protein to be kept. 
+ :return: returns the filtered df as a Dataframe and a dict with a list of Protein IDs that were discarded + and a list of Protein IDs that were kept + """ + + filter_threshold: int = percentage * len(protein_df.Sample.unique()) + transformed_df = long_to_wide(protein_df) + + remaining_proteins_list = transformed_df.dropna( + axis=1, thresh=filter_threshold + ).columns.tolist() + filtered_proteins_list = ( + transformed_df.drop(remaining_proteins_list, axis=1).columns.unique().tolist() + ) + filtered_df = protein_df[ + (protein_df["Protein ID"].isin(remaining_proteins_list)) + ] + filtered_peptide_df = None + if peptide_df is not None: + filtered_peptide_df = peptide_df[ + (peptide_df["Protein ID"].isin(remaining_proteins_list)) + ] + return dict( + protein_df=filtered_df, + peptide_df=filtered_peptide_df, + filtered_proteins=filtered_proteins_list, + remaining_proteins=remaining_proteins_list, + ) + + +def _build_pie_bar_plot(remaining_proteins, filtered_proteins, graph_type): + if graph_type == "Pie chart": + fig = create_pie_plot( + values_of_sectors=[ + len(remaining_proteins), + len(filtered_proteins), + ], + names_of_sectors=["Proteins kept", "Proteins filtered"], + heading="Number of Filtered Proteins", + ) + elif graph_type == "Bar chart": + fig = create_bar_plot( + values_of_sectors=[ + len(remaining_proteins), + len(filtered_proteins), + ], + names_of_sectors=["Proteins kept", "Proteins filtered"], + heading="Number of Filtered Proteins", + y_title="Number of Proteins", + ) + return [fig] \ No newline at end of file diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index c3d9136f..595aacbb 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -123,6 +123,79 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum" return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) +def tmt_data_import( + 
file_path: str, intensity_name: str = "Reporter intensity", map_to_uniprot=False, + aggregation_method: str = "Sum" +) -> dict: + try: + # Read the file + df = pd.read_csv( + file_path, + sep="\t", + low_memory=False, + na_values=["", 0], + keep_default_na=True, + ) + + # Debug step: Print the column names to check the actual names in the data + print("Columns in the file:", df.columns.tolist()) + + # Try to handle different possible names for the 'Protein ID' column + protein_column = None + possible_names = ["Majority protein IDs"] + + for name in possible_names: + if name in df.columns: + protein_column = name + break + + if protein_column is None: + raise KeyError("No valid 'Protein ID' or equivalent column found in the data.") + + df = df.rename(columns={protein_column: "Protein ID"}) + + # Extract protein or gene identifiers + protein_groups = df["Protein ID"] + + # Drop columns that are not relevant + columns_to_drop = [ + "Combined Spectral Count", + "Combined Unique Spectral Count", + "Combined Total Spectral Count", + ] + existing_columns = set(df.columns) + columns_to_drop_existing = [col for col in columns_to_drop if col in existing_columns] + df = df.drop(columns=columns_to_drop_existing) + print("Columns after dropping irrelevant ones:", df.columns.tolist()) + + # Use regex to find columns matching the TMT pattern with visits for both NP and T1D samples + intensity_columns = df.filter( + regex=f"{intensity_name} \\d+ (NP\\d{{2}}|TD\\d{{2}})", axis=1 + ) + + # Debug step: Print the intensity columns that were matched + print("Matched intensity columns:", intensity_columns.columns.tolist()) + + # Rename columns to the format 'NPXX_1' or 'T1DXX_1' + intensity_columns.columns = [ + re.sub(f"{intensity_name} (\\d+) (NP\\d{{2}}|TD\\d{{2}})", + lambda m: f"{m.group(2)}_{int(m.group(1)) + 1}", col) for col in intensity_columns.columns + ] + + # Debug step: Print the renamed intensity columns + print("Renamed intensity columns:", 
intensity_columns.columns.tolist()) + # Add back the protein identifiers to the dataframe + intensity_columns = intensity_columns.assign(**{"Protein ID": protein_groups}) + + # Apply transformation, clean-up, or aggregation (depending on your logic) + return transform_and_clean(intensity_columns, intensity_name, map_to_uniprot, aggregation_method) + + except Exception as e: + msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid TMT data file." + return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + + + def transform_and_clean( df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum" ) -> dict: @@ -197,9 +270,15 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True): found_ids_per_group = [] # go through all groups and find the valid proteins # non uniprot ids are put into extracted_ids, so they can be mapped + extract_protein_id_regex = re.compile(r'\|([^|]+)\|') + + # Function to extract protein IDs from the formatted string + def extract_protein_ids(protein_group_str): + return extract_protein_id_regex.findall(protein_group_str) + for group in protein_groups: found_in_group = [] - for protein_id in group.split(";"): + for protein_id in extract_protein_ids(group) or group.split(";"): if not protein_id.startswith("ENSP") and ( match := uniprot_regex.search(protein_id) ): diff --git a/protzilla/methods/data_analysis.py b/protzilla/methods/data_analysis.py index 7ad45ddd..4907bbf6 100644 --- a/protzilla/methods/data_analysis.py +++ b/protzilla/methods/data_analysis.py @@ -12,6 +12,15 @@ mann_whitney_test_on_intensity_data, mann_whitney_test_on_ptm_data) from protzilla.data_analysis.differential_expression_t_test import t_test from protzilla.data_analysis.dimension_reduction import t_sne, umap +from protzilla.data_analysis.time_series_regression_analysis import ( + time_series_linear_regression, + 
time_series_ransac_regression, + adfuller_test, + time_series_auto_arima, + time_series_arima, +) +from protzilla.data_analysis.ptm_analysis import filter_peptides_of_protein, ptms_per_sample, \ + ptms_per_protein_and_sample from protzilla.data_analysis.model_evaluation import evaluate_classification_model from protzilla.data_analysis.plots import ( clustergram_plot, @@ -19,6 +28,7 @@ prot_quant_plot, scatter_plot, ) +from protzilla.data_analysis.time_series_plots import time_quant_plot from protzilla.data_analysis.protein_graphs import peptides_to_isoform, variation_graph from protzilla.data_analysis.ptm_analysis import ( filter_peptides_of_protein, @@ -765,6 +775,195 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: return inputs +class PlotTimeQuant(PlotStep): + display_name = "Time Quantification Plot For Protein" + operation = "Time series analysis" + method_description = ( + "Creates a line chart for intensity across Time for protein groups" + ) + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column", + "protein_group", + "similarity_measure", + "similarity" + ] + output_keys = [] + + def method(self, inputs: dict) -> dict: + return time_quant_plot(**inputs) + + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + +class TimeSeriesLinearRegression(PlotStep): + display_name = "Linear Regression" + operation = "Time series analysis" + method_description = ("A function to fit a linear model using ordinary least squares for each protein. " + "The linear model fits the protein intensities on Y axis and the Time on X. 
" + "The p-values are corrected for multiple testing.") + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column", + "protein_group", + "train_size", + "grouping", + "grouping_column", + ] + output_keys = [ + "scores", + ] + + def method(self, inputs: dict) -> dict: + return time_series_linear_regression(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + +class TimeSeriesRANSACRegression(PlotStep): + display_name = "RANSAC Regression" + operation = "Time series analysis" + method_description = " Perform RANSAC regression on the time series data for a given protein group." + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column", + "protein_group", + "max_trials", + "stop_probability", + "loss", + "train_size", + "grouping", + "grouping_column", + ] + output_keys = [ + "scores", + ] + def method(self, inputs: dict) -> dict: + return time_series_ransac_regression(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + +class TimeSeriesADFullerTest(DataAnalysisStep): + display_name = "Augmented Dickey-Fuller Test" + operation = "Time series analysis" + method_description = ( + "The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The test " + "determines how strongly a time series is defined by a trend. The null hypothesis of the test is that the " + "time series can be represented by a unit root, which implies that the time series is not stationary. " + "The alternative hypothesis is that the time series is stationary. If the p-value is less than the " + "significance level, the null hypothesis can be rejected and the time series is considered stationary." + "Dickey, D. & Fuller, Wayne. (1979). 
Distribution of the Estimators for Autoregressive Time Series With a Unit Root. " + "JASA. Journal of the American Statistical Association. 74. 10.2307/2286348. " + ) + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column", + "protein_group", + "alpha", + ] + output_keys = [ + "test_statistic", + "p_value", + "critical_values", + "is_stationary", + ] + + def method(self, inputs: dict) -> dict: + return adfuller_test(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + +class TimeSeriesAutoARIMA(PlotStep): + display_name = "Auto ARIMA (AutoRegressive Integrated Moving Average)" + operation = "Time series analysis" + method_description = ( + "Perform Auto ARIMA on the time series data for a given protein group." + ) + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column", + "protein_group", + "seasonal", + "m", + "train_size", + "grouping", + "grouping_column", + ] + output_keys = [ + "scores", + ] + + def method(self, inputs: dict) -> dict: + return time_series_auto_arima(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + +class TimeSeriesARIMA(PlotStep): + display_name = "ARIMA (AutoRegressive Integrated Moving Average)" + operation = "Time series analysis" + method_description = ( + "Perform ARIMA on the time series data for a given protein group." 
+ ) + + input_keys = [ + "intensity_df", + "metadata_df", + "time_column", + "protein_group", + "seasonal", + "p", + "d", + "q", + "P", + "D", + "Q", + "s", + "train_size", + "grouping", + "grouping_column", + ] + output_keys = [ + "scores", + ] + + def method(self, inputs: dict) -> dict: + return time_series_arima(**inputs) + + def insert_dataframes(self, steps: StepManager, inputs) -> dict: + inputs["intensity_df"] = steps.protein_df + inputs["metadata_df"] = steps.metadata_df + return inputs + + class PTMsPerSample(DataAnalysisStep): display_name = "PTMs per Sample" operation = "Peptide analysis" @@ -813,3 +1012,4 @@ def insert_dataframes(self, steps: StepManager, inputs) -> dict: Step, "peptide_df", inputs["peptide_df"] ) return inputs + diff --git a/protzilla/methods/data_preprocessing.py b/protzilla/methods/data_preprocessing.py index 0565eaf0..50373899 100644 --- a/protzilla/methods/data_preprocessing.py +++ b/protzilla/methods/data_preprocessing.py @@ -329,8 +329,8 @@ class FilterPeptidesByPEPThreshold(DataPreprocessingStep): operation = "filter_peptides" method_description = "Filter by PEP-threshold" - input_keys = ["protein_df", "peptide_df", "threshold"] - output_keys = ["protein_df", "peptide_df", "filtered_peptides"] + input_keys = ["peptide_df", "threshold"] + output_keys = ["peptide_df", "filtered_peptides"] def method(self, inputs): return peptide_filter.by_pep_value(**inputs) diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 7cde1ba0..f94218f4 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -9,6 +9,7 @@ diann_import, max_quant_import, ms_fragger_import, + tmt_data_import, ) from protzilla.importing.peptide_import import peptide_import, evidence_import from protzilla.steps import Step, StepManager @@ -60,6 +61,17 @@ def method(self, inputs): return ms_fragger_import(**inputs) +class TMTImport(ImportingStep): + display_name = "TMT" + operation = "msdataimport" + method_description = 
"TMT data import" + input_keys = ["file_path", "map_to_uniprot", "aggregation_method"] + output_keys = ["protein_df"] + + def method(self, inputs): + return tmt_data_import(**inputs) + + class MetadataImport(ImportingStep): display_name = "Metadata import" operation = "metadataimport" @@ -96,7 +108,8 @@ class MetadataColumnAssignment(ImportingStep): display_name = "Metadata column assignment" operation = "metadataimport" method_description = ( - "Assign columns to metadata categories, repeatable for each category" + "Protzilla uses a unique metadata column name to identify certain features in the metadata. " + "This step assigns the metadata columns to the correct feature." ) input_keys = [ diff --git a/protzilla/steps.py b/protzilla/steps.py index d5fb124e..32ce93b3 100644 --- a/protzilla/steps.py +++ b/protzilla/steps.py @@ -36,6 +36,7 @@ def __init__(self, instance_identifier: str | None = None): self.messages: Messages = Messages([]) self.output: Output = Output() self.plots: Plots = Plots() + self.display_output: DisplayOutput = DisplayOutput() self.instance_identifier = instance_identifier if self.instance_identifier is None: @@ -310,6 +311,27 @@ def export(self, format_): exports.append(BytesIO(base64.b64decode(plot))) return exports +class DisplayOutput: + + def __init__(self, display_output: dict = None): + if display_output is None: + display_output = {} + self.display_output = display_output + def __iter__(self): + return iter(self.display_output) + def __repr__(self): + return f"DisplayOutput: {self.display_output}" + def __contains__(self, key): + return key in self.display_output + def __getitem__(self, key): + return self.display_output[key] + def __setitem__(self, key, value): + self.display_output[key] = value + def is_empty(self) -> bool: + return len(self.display_output) == 0 + + + class StepManager: def __repr__(self): diff --git a/protzilla/utilities/transform_dfs.py b/protzilla/utilities/transform_dfs.py index 59a83259..fdb931e7 100644 --- 
a/protzilla/utilities/transform_dfs.py +++ b/protzilla/utilities/transform_dfs.py @@ -12,6 +12,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): :param intensity_df: the dataframe that should be transformed into long format :type intensity_df: pd.DataFrame + :param value_name: the name of the column in the metadata_df that contains the intensity information. :return: returns dataframe in wide format suitable for use by packages such as sklearn @@ -23,6 +24,31 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): ) +def long_to_wide_time(intensity_df: pd.DataFrame, value_name: str = None, time_column: str = None): + """ + This function transforms the dataframe to a wide format that + can be more easily handled by packages such as sklearn. + Each sample gets one row with all observations as columns. + + :param intensity_df: the dataframe that should be transformed into + long format + :type intensity_df: pd.DataFrame + :param value_name: the name of the column in the metadata_df that contains the intensity information. + :param time_column: the name of the column in the metadata_df that contains the time information. 
+ + :return: returns dataframe in wide format suitable for use by + packages such as sklearn + :rtype: pd.DataFrame + """ + if intensity_df.duplicated(subset=[time_column, "Protein ID"]).any(): + intensity_df = intensity_df.groupby([time_column, "Protein ID"]).mean().reset_index() + values_name = default_intensity_column(intensity_df) if value_name is None else value_name + intensity_df = pd.pivot( + intensity_df, index=time_column, columns="Protein ID", values=values_name + ) + intensity_df = intensity_df.fillna(intensity_df.mean()) + return intensity_df + def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): """ This functions transforms the dataframe from a wide @@ -58,15 +84,16 @@ def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): return intensity_df -def is_long_format(df: pd.DataFrame): - return set(df.columns[:3]) == {"Sample", "Protein ID", "Gene"} +def is_long_format(df: pd.DataFrame, time_column: str = None): + required_columns = {"Sample", "Protein ID"} + additional_columns = {"Gene", time_column} + return required_columns.issubset(df.columns) and any(col in df.columns for col in additional_columns) def is_intensity_df(df: pd.DataFrame): """ Checks if the dataframe is an intensity dataframe. - An intensity dataframe should have the columns "Sample", "Protein ID" and - and intensity column. + An intensity dataframe should have the columns "Sample", "Protein ID" and intensity column. 
:param df: the dataframe that should be checked :type df: pd.DataFrame diff --git a/requirements.txt b/requirements.txt index bc175e2a..e7f0c7ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ restring==0.1.20 scikit-learn==1.2.2 scipy==1.10.1 statsmodels==0.13.5 +pmdarima==2.0.4 umap-learn==0.5.3 Werkzeug==2.2.3 numba==0.57.0 diff --git a/tests/protzilla/data_analysis/test_time_series_analysis.py b/tests/protzilla/data_analysis/test_time_series_analysis.py new file mode 100644 index 00000000..7bdebbda --- /dev/null +++ b/tests/protzilla/data_analysis/test_time_series_analysis.py @@ -0,0 +1,414 @@ +import pandas as pd +import pytest + +from protzilla.data_analysis.time_series_regression_analysis import ( + time_series_linear_regression, + time_series_ransac_regression, + adfuller_test, + time_series_auto_arima, + time_series_arima, +) + + +@pytest.fixture +def time_series_test_data(): + test_intensity_list = ( + ["Sample1", "Protein1", "Gene1", 20], + ["Sample1", "Protein2", "Gene1", 16], + ["Sample1", "Protein3", "Gene1", 1], + ["Sample1", "Protein4", "Gene1", 14], + ["Sample2", "Protein1", "Gene1", 20], + ["Sample2", "Protein2", "Gene1", 15], + ["Sample2", "Protein3", "Gene1", 2], + ["Sample2", "Protein4", "Gene1", 15], + ["Sample3", "Protein1", "Gene1", 22], + ["Sample3", "Protein2", "Gene1", 14], + ["Sample3", "Protein3", "Gene1", 3], + ["Sample3", "Protein4", "Gene1", 16], + ["Sample4", "Protein1", "Gene1", 8], + ["Sample4", "Protein2", "Gene1", 15], + ["Sample4", "Protein3", "Gene1", 1], + ["Sample4", "Protein4", "Gene1", 9], + ["Sample5", "Protein1", "Gene1", 10], + ["Sample5", "Protein2", "Gene1", 14], + ["Sample5", "Protein3", "Gene1", 2], + ["Sample5", "Protein4", "Gene1", 10], + ["Sample6", "Protein1", "Gene1", 12], + ["Sample6", "Protein2", "Gene1", 13], + ["Sample6", "Protein3", "Gene1", 3], + ["Sample6", "Protein4", "Gene1", 11], + ["Sample7", "Protein1", "Gene1", 12], + ["Sample7", "Protein2", "Gene1", 13], + ["Sample7", 
"Protein3", "Gene1", 3], + ["Sample7", "Protein4", "Gene1", 11], + ["Sample1", "Protein1", "Gene2", 10], + ["Sample1", "Protein2", "Gene2", 14], + ["Sample1", "Protein3", "Gene2", 2], + ["Sample1", "Protein4", "Gene2", 10], + ["Sample2", "Protein1", "Gene2", 12], + ["Sample2", "Protein1", "Gene3", 13], + + ) + + test_intensity_df = pd.DataFrame( + data=test_intensity_list, + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + + test_metadata_df = ( + ["Sample1", "2", "1"], + ["Sample2", "6", "1"], + ["Sample3", "7", "1"], + ["Sample4", "8", "1"], + ["Sample5", "2", "2"], + ["Sample6", "6", "2"], + ["Sample7", "7", "2"], + ) + test_metadata_df = pd.DataFrame( + data=test_metadata_df, + columns=["Sample", "Time", "Group"], + ) + return test_intensity_df, test_metadata_df + +def test_linear_regression_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + 0.8, + "Protein1", + "Group", + "With Grouping" + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_linear_regression_plot_without_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + 0.8, + "Protein1", + "With Grouping", + "Group", + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_linear_regression_plot_invalid_train_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + 2, + "Protein1", + "With Grouping", + "Group", + ) + return + +def test_linear_regression_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = 
time_series_linear_regression( + test_intensity, + test_metadata, + "Time", + 0.8, + "Protein1", + "With Grouping", + "Group", + ) + assert "scores" in outputs + return + + +def test_ransac_regression_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_ransac_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", + 100, + 0.99, + "absolute_error", + 0.8, + "With Grouping", + "Group", + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_ransac_regression_plot_without_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_ransac_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", + 100, + 0.99, + "absolute_error", + 0.8, + "With Grouping", + "Group", + + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_ransac_plot_invalid_train_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_ransac_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", + 100, + 0.99, + "absolute_error", + 2, + "With Grouping", + "Group", + ) + return + +def test_ransac_regression_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_ransac_regression( + test_intensity, + test_metadata, + "Time", + "Protein1", + 100, + 0.99, + "absolute_error", + 0.8, + "With Grouping", + "Group", + ) + assert "scores" in outputs + return + + +def test_adfuller_test(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = adfuller_test(test_intensity, test_metadata, "Time", "Protein1") + + assert "test_statistic" in outputs + assert "p_value" in outputs + assert "critical_values" in outputs + assert "is_stationary" in 
outputs + assert "messages" in outputs + return + + +def test_auto_arima_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_auto_arima( + test_intensity, + test_metadata, + "Time", + "Protein1", + "No", + 1, + 0.5, + "With Grouping", + "Group", + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_auto_arima_plot_without_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_auto_arima( + test_intensity, + test_metadata, + "Time", + "Protein1", + "No", + 1, + 0.5, + "With Grouping", + "Group", + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_auto_arima_plot_invalid_train_size(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_series_auto_arima( + test_intensity, + test_metadata, + "Time", + "Protein1", + "No", + 1, + 2, + "With Grouping", + "Group", + ) + return + + +def test_auto_arima_outputs(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_auto_arima( + test_intensity, + test_metadata, + "Time", + "Protein1", + "No", + 1, + 0.5, + "With Grouping", + "Group", + ) + assert "scores" in outputs + return + + +def test_arima_plot_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_arima( + test_intensity, + test_metadata, + "Time", + "Protein1", + "No", + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0.5, + "With Grouping", + "Group", + ) + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_arima_plot_seasonal_with_grouping(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_series_arima( 
+        test_intensity,
+        test_metadata,
+        "Time",
+        "Protein1",
+        "Yes",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "With Grouping",
+        "Group",
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_arima_plot_without_grouping(show_figures, time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_arima(
+        test_intensity,
+        test_metadata,
+        "Time",
+        "Protein1",
+        "No",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "Without Grouping",
+        "Group",
+    )
+    assert "plots" in outputs
+    fig = outputs["plots"][0]
+    if show_figures:
+        fig.show()
+    return
+
+def test_arima_plot_invalid_train_size(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    with pytest.raises(ValueError):
+        time_series_arima(
+            test_intensity,
+            test_metadata,
+            "Time",
+            "Protein1",
+            "No",
+            1,
+            1,
+            1,
+            0,
+            0,
+            0,
+            0,
+            2,
+            "With Grouping",
+            "Group",
+        )
+    return
+
+
+def test_arima_outputs(time_series_test_data):
+    test_intensity, test_metadata = time_series_test_data
+    outputs = time_series_arima(
+        test_intensity,
+        test_metadata,
+        "Time",
+        "Protein1",
+        "No",
+        1,
+        1,
+        1,
+        0,
+        0,
+        0,
+        0,
+        0.5,
+        "With Grouping",
+        "Group",
+    )
+    assert "scores" in outputs
+    return
\ No newline at end of file
diff --git a/tests/protzilla/data_analysis/test_time_series_plots.py b/tests/protzilla/data_analysis/test_time_series_plots.py
new file mode 100644
index 00000000..ca3fe4fa
--- /dev/null
+++ b/tests/protzilla/data_analysis/test_time_series_plots.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import pytest
+
+from protzilla.data_analysis.time_series_plots import time_quant_plot
+
+
+@pytest.fixture
+def time_series_test_data():
+    test_intensity_list = (
+        ["Sample1", "Protein1", "Gene1", 20],
+        ["Sample1", "Protein2", "Gene1", 16],
+        ["Sample1", "Protein3", "Gene1", 1],
+        ["Sample1", "Protein4", "Gene1", 14],
+        ["Sample2", "Protein1", "Gene1", 20],
+        ["Sample2", "Protein2",
"Gene1", 15], + ["Sample2", "Protein3", "Gene1", 2], + ["Sample2", "Protein4", "Gene1", 15], + ["Sample3", "Protein1", "Gene1", 22], + ["Sample3", "Protein2", "Gene1", 14], + ["Sample3", "Protein3", "Gene1", 3], + ["Sample3", "Protein4", "Gene1", 16], + ["Sample4", "Protein1", "Gene1", 8], + ["Sample4", "Protein2", "Gene1", 15], + ["Sample4", "Protein3", "Gene1", 1], + ["Sample4", "Protein4", "Gene1", 9], + ["Sample5", "Protein1", "Gene1", 10], + ["Sample5", "Protein2", "Gene1", 14], + ["Sample5", "Protein3", "Gene1", 2], + ["Sample5", "Protein4", "Gene1", 10], + ["Sample6", "Protein1", "Gene1", 12], + ["Sample6", "Protein2", "Gene1", 13], + ["Sample6", "Protein3", "Gene1", 3], + ["Sample6", "Protein4", "Gene1", 11], + ["Sample7", "Protein1", "Gene1", 12], + ["Sample7", "Protein2", "Gene1", 13], + ["Sample7", "Protein3", "Gene1", 3], + ["Sample7", "Protein4", "Gene1", 11], + ) + + test_intensity_df = pd.DataFrame( + data=test_intensity_list, + columns=["Sample", "Protein ID", "Gene", "Intensity"], + ) + + test_metadata_df = ( + ["Sample1", "2", 1], + ["Sample2", "6", 1], + ["Sample3", "7", 1], + ["Sample4", "10", 1], + ) + test_metadata_df = pd.DataFrame( + data=test_metadata_df, + columns=["Sample", "Time", "Day"], + ) + return test_intensity_df, test_metadata_df + +def test_time_series_plot(show_figures, time_series_test_data): + test_intensity, test_metadata = time_series_test_data + outputs = time_quant_plot(test_intensity, test_metadata, "Time","Protein1") + assert "plots" in outputs + fig = outputs["plots"][0] + if show_figures: + fig.show() + return + +def test_time_series_plot_invalid_euclidean_similarity(time_series_test_data): + test_intensity, test_metadata = time_series_test_data + with pytest.raises(ValueError): + time_quant_plot(test_intensity, test_metadata, "Time", "Protein1", similarity=-1, similarity_measure="euclidean distance") + return + +def test_time_series_plot_invalid_cosine_similarity(time_series_test_data): + test_intensity, test_metadata 
= time_series_test_data + with pytest.raises(ValueError): + time_quant_plot(test_intensity, test_metadata, "Time","Protein1", similarity=2, similarity_measure="cosine similarity") + return \ No newline at end of file diff --git a/tests/protzilla/test_runner.py b/tests/protzilla/test_runner.py index b5de3148..18080f48 100644 --- a/tests/protzilla/test_runner.py +++ b/tests/protzilla/test_runner.py @@ -94,8 +94,8 @@ def test_runner_imports( 'FilterSamplesByProteinIntensitiesSum', 'ImputationByKNN', 'OutlierDetectionByLocalOutlierFactor', - 'NormalisationByMedian', 'TransformationLog', + 'NormalisationByMedian', 'PlotProtQuant', 'DifferentialExpressionTTest', 'PlotVolcano', @@ -109,8 +109,8 @@ def test_runner_imports( call({'deviation_threshold': 2.0}), call({'number_of_neighbours': 5}), call({'number_of_neighbors': 20}), - call({'percentile': 0.5}), call({'log_base': 'log2'}), + call({'percentile': 0.5}), call({'similarity_measure': 'euclidean distance'}), call({'alpha': 0.05}), call({'fc_threshold': 1}), diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index bc78fd71..a0e58689 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -17,6 +17,7 @@ importing.MaxQuantImport: importing_forms.MaxQuantImportForm, importing.DiannImport: importing_forms.DiannImportForm, importing.MsFraggerImport: importing_forms.MSFraggerImportForm, + importing.TMTImport: importing_forms.TMTImportForm, importing.MetadataImport: importing_forms.MetadataImportForm, importing.MetadataImportMethodDiann: importing_forms.MetadataImportMethodDiannForm, importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm, @@ -49,6 +50,7 @@ data_analysis.PlotScatterPlot: data_analysis_forms.PlotScatterPlotForm, data_analysis.PlotClustergram: data_analysis_forms.PlotClustergramForm, data_analysis.PlotProtQuant: data_analysis_forms.PlotProtQuantForm, + data_analysis.PlotTimeQuant: data_analysis_forms.PlotTimeQuantForm, data_analysis.PlotPrecisionRecallCurve: 
data_analysis_forms.PlotPrecisionRecallCurveForm, data_analysis.PlotROC: data_analysis_forms.PlotROCCurveForm, data_analysis.ClusteringKMeans: data_analysis_forms.ClusteringKMeansForm, @@ -65,6 +67,11 @@ data_analysis.FLEXIQuantLF: data_analysis_forms.FLEXIQuantLFForm, data_analysis.PTMsPerSample: data_analysis_forms.PTMsPerSampleForm, data_analysis.PTMsProteinAndPerSample: data_analysis_forms.PTMsPerProteinAndSampleForm, + data_analysis.TimeSeriesLinearRegression: data_analysis_forms.TimeSeriesLinearRegressionForm, + data_analysis.TimeSeriesRANSACRegression: data_analysis_forms.TimeSeriesRANSACRegressionForm, + data_analysis.TimeSeriesADFullerTest: data_analysis_forms.TimeSeriesADFullerTestForm, + data_analysis.TimeSeriesAutoARIMA: data_analysis_forms.TimeSeriesAutoARIMAForm, + data_analysis.TimeSeriesARIMA: data_analysis_forms.TimeSeriesARIMAForm, data_preprocessing.ImputationByMinPerSample: data_preprocessing_forms.ImputationByMinPerSampleForms, data_integration.EnrichmentAnalysisGOAnalysisWithString: data_integration_forms.EnrichmentAnalysisGOAnalysisWithStringForm, data_integration.EnrichmentAnalysisGOAnalysisWithEnrichr: data_integration_forms.EnrichmentAnalysisGOAnalysisWithEnrichrForm, diff --git a/ui/runs/forms/custom_fields.py b/ui/runs/forms/custom_fields.py index 7171f173..7370b64b 100644 --- a/ui/runs/forms/custom_fields.py +++ b/ui/runs/forms/custom_fields.py @@ -1,6 +1,8 @@ +import json import logging from enum import Enum +import django.forms as forms from django.forms import ( BooleanField, CharField, @@ -126,3 +128,32 @@ class CustomFloatField(FloatField): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.widget.attrs.update({"class": "form-control mb-2"}) + + +from django import forms +from django.utils.safestring import mark_safe + + +class TextDisplayWidget(forms.Widget): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.attrs.update() + + def render(self, name, value, attrs=None, 
renderer=None): + display_text = self.attrs.get("data-display-text", "") + return mark_safe(f"
{display_text}
") + + +class TextDisplayField(forms.Field): + widget = TextDisplayWidget + + def __init__(self, *args, **kwargs): + self.text = kwargs.pop("text", "") + kwargs["required"] = False + super().__init__(*args, **kwargs) + self.update_text() + + def update_text(self, text=None): + if text is not None: + self.text = text + self.widget.attrs["data-display-text"] = self.text diff --git a/ui/runs/forms/data_analysis.py b/ui/runs/forms/data_analysis.py index 2363832d..2b8fcf64 100644 --- a/ui/runs/forms/data_analysis.py +++ b/ui/runs/forms/data_analysis.py @@ -21,6 +21,7 @@ CustomFloatField, CustomMultipleChoiceField, CustomNumberField, + TextDisplayField ) @@ -151,6 +152,14 @@ class DimensionReductionMetric(Enum): cosine = "cosine" havensine = "havensine" +class TimeSeriesGrouping(Enum): + with_grouping = "With Grouping" + without_grouping = "Without Grouping" + +class TimeSeriesRANSACLoss(Enum): + absolute_error = "absolute_error" + squared_error = "squared_error" + class DifferentialExpressionANOVAForm(MethodForm): is_dynamic = True @@ -1152,3 +1161,414 @@ def fill_form(self, run: Run) -> None: ) if single_protein_peptides: self.fields["peptide_df"].initial = single_protein_peptides[0] + + +class PlotTimeQuantForm(MethodForm): + is_dynamic = True + + intensity_df = CustomChoiceField( + choices=[], + label="Choose dataframe to be plotted", + ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: choose highlighted protein group", + ) + similarity_measure = CustomChoiceField( + choices=SimilarityMeasure, + label="Similarity Measurement: choose how to compare protein groups", + initial=SimilarityMeasure.euclidean_distance, + ) + similarity = CustomNumberField( + label="Similarity", min_value=-1, max_value=999, step_size=1, initial=1 + ) + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = 
fill_helper.get_choices_for_protein_df_steps( + run + ) + + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + + similarity_measure = self.data.get( + "similarity_measure", self.fields["similarity_measure"].choices[0][0] + ) + self.data = self.data.copy() + if similarity_measure == SimilarityMeasure.cosine_similarity: + self.fields["similarity"] = CustomFloatField( + label="Cosine Similarity", + min_value=-1, + max_value=1, + step_size=0.1, + initial=0, + ) + if ( + "similarity" not in self.data + or float(self.data["similarity"]) < -1 + or float(self.data["similarity"]) > 1 + ): + self.data["similarity"] = 0 + else: + self.fields["similarity"] = CustomNumberField( + label="Euclidean Distance", + min_value=0, + max_value=999, + step_size=1, + initial=1, + ) + if ( + "similarity" not in self.data + or float(self.data["similarity"]) < 0 + or float(self.data["similarity"]) > 999 + ): + self.data["similarity"] = 1 + + + + +class TimeSeriesLinearRegressionForm(MethodForm): + is_dynamic = True + intensity_df = CustomChoiceField( + choices=[], + label="Intensity dataframe", + ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the linear regression on", + ) + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.8 + ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression 
should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( + run + ) + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column", False) + + +class TimeSeriesRANSACRegressionForm(MethodForm): + is_dynamic = True + intensity_df = CustomChoiceField( + choices=[], + label="Intensity dataframe", + ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the RANSAC regression on", + ) + max_trials = CustomNumberField( + label="Max trials: the maximum number of iterations for random sample selection", + min_value=1, + step_size=1, + initial=100, + ) + stop_probability = CustomFloatField( + label="Stop Probability: the probability that the algorithm stops after a certain number of iterations if at least one outlier-free set of the training data is sampled", + min_value=0, + max_value=1, + step_size=0.01, + initial=0.99 + ) + loss = CustomChoiceField( + choices= 
TimeSeriesRANSACLoss, + label="Loss function: the loss function to be used for fitting the linear model", + initial=TimeSeriesRANSACLoss.absolute_error, + ) + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.8 + ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( + run + ) + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column", False) + + +class TimeSeriesADFullerTestForm(MethodForm): + is_dynamic = True + intensity_df = CustomChoiceField( + choices=[], + label="Intensity dataframe", + ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the ADFuller test on", + ) + alpha = CustomFloatField( + label="Significance level", + 
min_value=0, + max_value=1, + initial=0.05 + ) + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( + run + ) + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + +class TimeSeriesAutoARIMAForm(MethodForm): + is_dynamic = True + intensity_df = CustomChoiceField( + choices=[], + label="Intensity dataframe", + ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the AutoARIMA on", + ) + seasonal = CustomChoiceField( + choices=YesNo, + label="Seasonal: Whether the ARIMA model should be seasonal", + initial=YesNo.no + ) + m = CustomNumberField( + label = "The number of time steps for a single seasonal period (ignored if seasonal=No)", + min_value=1, + step_size=1, + initial=1, + ) + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.8, + ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = 
fill_helper.get_choices_for_protein_df_steps( + run + ) + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column", False) + + +class TimeSeriesARIMAForm(MethodForm): + is_dynamic = True + intensity_df = CustomChoiceField( + choices=[], + label="Intensity dataframe", + ) + time_column = CustomChoiceField(choices=[], label="Time: The column name from metadata that represents time") + protein_group = CustomChoiceField( + choices=[], + label="Protein group: which protein group to perform the AutoARIMA on", + ) + seasonal = CustomChoiceField( + choices=YesNo, + label="Seasonal: Whether the ARIMA model should be seasonal", + initial=YesNo.no + ) + p = CustomNumberField( + label = "Autoregressive Order: The number of lag observations included in the model", + min_value=0, + step_size=1, + initial=1, + ) + d = CustomNumberField( + label = "Differencing Order: The number of times that the raw observations are differenced", + min_value=0, + step_size=1, + initial=1, + ) + q = CustomNumberField( + label = "Moving Average Order: The size of the moving average window", + min_value=0, + step_size=1, + initial=1, + ) + P = CustomNumberField( + label = "Seasonal Autoregressive Order: The number of seasonal lag observations included in the model", + min_value=0, + step_size=1, + initial=0, + required=False + ) + D = CustomNumberField( + label = "Seasonal Differencing Order: The number of times 
that the seasonal observations are differenced", + min_value=0, + step_size=1, + initial=0, + required=False + ) + Q = CustomNumberField( + label = "Seasonal Moving Average Order: The size of the seasonal moving average window", + min_value=0, + step_size=1, + initial=0, + required=False + ) + s = CustomNumberField( + label = "Seasonal Period: The number of periods for a single seasonal cycle", + min_value=0, + step_size=1, + initial=0, + required=False + ) + train_size = CustomFloatField( + label="Train size: proportion of the dataset to include in the test split", + min_value=0, + max_value=1, + step_size=0.1, + initial=0.8, + ) + grouping = CustomChoiceField( + choices= TimeSeriesGrouping, + label="Option to select whether regression should be performed on the entire dataset or separately on the control and experimental groups", + initial=TimeSeriesGrouping.with_grouping + ) + grouping_column = CustomChoiceField(choices=[], label="Grouping from metadata: The column name from metadata that represents the grouping") + + + def fill_form(self, run: Run) -> None: + self.fields["intensity_df"].choices = fill_helper.get_choices_for_protein_df_steps( + run + ) + input_df_instance_id = self.data.get( + "intensity_df", self.fields["intensity_df"].choices[0][0] + ) + self.fields[ + "time_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields[ + "grouping_column" + ].choices = fill_helper.get_choices_for_metadata_non_sample_columns(run) + + self.fields["protein_group"].choices = fill_helper.to_choices( + run.steps.get_step_output( + step_type=Step, + output_key="protein_df", + instance_identifier=input_df_instance_id, + )["Protein ID"].unique() + ) + grouping = self.data.get("grouping") + if grouping == "Without Grouping": + self.toggle_visibility("grouping_column", False) + seasonal = self.data.get("seasonal") + if seasonal == "No": + self.toggle_visibility("P", False) + self.toggle_visibility("D", False) + 
self.toggle_visibility("Q", False) + self.toggle_visibility("s", False) \ No newline at end of file diff --git a/ui/runs/forms/data_preprocessing.py b/ui/runs/forms/data_preprocessing.py index 08590b79..40ca2f78 100644 --- a/ui/runs/forms/data_preprocessing.py +++ b/ui/runs/forms/data_preprocessing.py @@ -469,7 +469,6 @@ class FilterPeptidesByPEPThresholdForm(MethodForm): threshold = CustomFloatField( label="Threshold value for PEP", min_value=0, initial=0 ) - peptide_df = CustomChoiceField(choices=EmptyEnum, label="peptide_df") class FilterPeptidesByPEPThresholdPlotForm(MethodForm): diff --git a/ui/runs/forms/fill_helper.py b/ui/runs/forms/fill_helper.py index eef83be6..4e0a3be9 100644 --- a/ui/runs/forms/fill_helper.py +++ b/ui/runs/forms/fill_helper.py @@ -14,6 +14,10 @@ def get_choices_for_protein_df_steps(run: Run) -> list[tuple[str, str]]: return reversed(to_choices(run.steps.get_instance_identifiers(Step, "protein_df"))) +def get_choices_for_peptide_df_steps(run: Run) -> list[tuple[str, str]]: + return reversed(to_choices(run.steps.get_instance_identifiers(Step, "peptide_df"))) + + def get_choices( run: Run, output_key: str, step_type: type[Step] = Step ) -> list[tuple[str, str]]: diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py index cc799be3..07c8d74c 100644 --- a/ui/runs/forms/importing.py +++ b/ui/runs/forms/importing.py @@ -75,6 +75,24 @@ class MSFraggerImportForm(MethodForm): choices=AggregationMethods, label="Aggregation method", initial="Sum" ) +class TMTImportForm(MethodForm): + file_path = CustomFileField(label="TMT intensities file") + map_to_uniprot = CustomBooleanField( + label="Map to Uniprot IDs using Biomart (online)", required=False + ) + aggregation_method = CustomChoiceField( + choices=AggregationMethods, label="Aggregation method", initial="Sum" + ) + +class DiannImportForm(MethodForm): + file_path = CustomFileField(label="DIA-NN intensities file:") + map_to_uniprot = CustomBooleanField( + label="Map to Uniprot IDs 
using Biomart (online)", required=False + ) + aggregation_method = CustomChoiceField( + choices=AggregationMethods, label="Aggregation method", initial="Sum" + ) + class MetadataImportForm(MethodForm): file_path = CustomFileField(label="Metadata file") @@ -93,12 +111,12 @@ class MetadataImportMethodDiannForm(MethodForm): class MetadataColumnAssignmentForm(MethodForm): metadata_required_column = CustomChoiceField( choices=EmptyEnum, - label="Missing, but required metadata columns", + label="Columns in Metadata that needs to be assigned", required=False, ) metadata_unknown_column = CustomChoiceField( choices=EmptyEnum, - label="Existing, but unknown metadata columns", + label="Available columns in Metadata that can be assigned", required=False, ) @@ -111,7 +129,7 @@ def fill_form(self, run: Run) -> None: if metadata is not None: self.fields["metadata_required_column"].choices = [ (col, col) - for col in ["Sample", "Group", "Batch"] + for col in ["Sample", "Group", "Batch", "Time"] if col not in metadata.columns ] if len(self.fields["metadata_required_column"].choices) == 0: @@ -122,7 +140,7 @@ def fill_form(self, run: Run) -> None: unknown_columns = list( metadata.columns[ - ~metadata.columns.isin(["Sample", "Group", "Batch"]) + ~metadata.columns.isin(["Sample", "Group", "Batch", "Time"]) ].unique() ) diff --git a/ui/runs/static/runs/style.css b/ui/runs/static/runs/style.css index 63d66a0b..477e0f11 100644 --- a/ui/runs/static/runs/style.css +++ b/ui/runs/static/runs/style.css @@ -75,3 +75,10 @@ html, body { #gsea_enrichment_plot_img { width: 800px; } + +.display-output-textarea { + display: flex; + width: 100%; + height: auto; + resize: none; +} \ No newline at end of file diff --git a/ui/runs/templates/runs/details.html b/ui/runs/templates/runs/details.html index 5809d356..84ec3cfd 100644 --- a/ui/runs/templates/runs/details.html +++ b/ui/runs/templates/runs/details.html @@ -209,6 +209,14 @@

{{ display_name }}

{% endif %} + {% if display_output %} +
+ + +
+ {% endif %} {% endif %} diff --git a/ui/runs/views.py b/ui/runs/views.py index b95be756..6d98d025 100644 --- a/ui/runs/views.py +++ b/ui/runs/views.py @@ -121,6 +121,12 @@ def detail(request: HttpRequest, run_name: str): and Path(run.current_outputs["graph_path"]).exists() ) + display_output_form = ( + run.steps.current_step.display_output is not None + and not run.current_step.display_output.is_empty() + ) + display_output_text = next(iter(run.current_step.display_output.display_output.values()), None) + return render( request, "runs/details.html", @@ -156,6 +162,8 @@ def detail(request: HttpRequest, run_name: str): method_form=method_form, is_form_dynamic=method_form.is_dynamic, plot_form=plot_form, + display_output=display_output_form, + display_output_result=display_output_text, ), ) diff --git a/user_data/workflows/standard.yaml b/user_data/workflows/standard.yaml index 0b93bd80..52d754b7 100644 --- a/user_data/workflows/standard.yaml +++ b/user_data/workflows/standard.yaml @@ -36,20 +36,20 @@ steps: plot_inputs: { } type: OutlierDetectionByLocalOutlierFactor - form_inputs: - percentile: 0.5 + log_base: log2 inputs: { } plot_inputs: - graph_type: Boxplot + graph_type: Histogram group_by: None - visual_transformation: log10 - type: NormalisationByMedian + type: TransformationLog - form_inputs: - log_base: log2 + percentile: 0.5 inputs: { } plot_inputs: - graph_type: Histogram + graph_type: Boxplot group_by: None - type: TransformationLog + visual_transformation: log10 + type: NormalisationByMedian - form_inputs: similarity_measure: euclidean distance inputs: { } diff --git a/user_data/workflows/workflow_Kuganash-BA.yaml b/user_data/workflows/workflow_Kuganash-BA.yaml new file mode 100644 index 00000000..ef2d26bb --- /dev/null +++ b/user_data/workflows/workflow_Kuganash-BA.yaml @@ -0,0 +1,77 @@ +df_mode: disk_memory +steps: +- form_inputs: + aggregation_mode: Sum + intensity_name: iBAQ + map_to_uniprot: false + inputs: {} + instance_identifier: 
MaxQuantImport_1 + type: MaxQuantImport +- form_inputs: + feature_orientation: Columns (samples in rows, features in columns) + inputs: {} + instance_identifier: MetadataImport_1 + type: MetadataImport +- form_inputs: + percentage: 0.5 + inputs: {} + instance_identifier: FilterProteinsBySamplesMissing_1 + plot_inputs: + graph_type: Bar chart + type: FilterProteinsBySamplesMissing +- form_inputs: + deviation_threshold: 2.0 + inputs: {} + instance_identifier: FilterSamplesByProteinIntensitiesSum_1 + plot_inputs: + graph_type: Bar chart + type: FilterSamplesByProteinIntensitiesSum +- form_inputs: + number_of_neighbours: 5 + inputs: {} + instance_identifier: ImputationByKNN_1 + plot_inputs: + graph_type: Boxplot + graph_type_quantities: Bar chart + group_by: None + visual_transformation: log10 + type: ImputationByKNN +- form_inputs: + number_of_neighbors: 20 + inputs: {} + instance_identifier: OutlierDetectionByLocalOutlierFactor_1 + plot_inputs: {} + type: OutlierDetectionByLocalOutlierFactor +- form_inputs: + log_base: log2 + inputs: {} + instance_identifier: TransformationLog_1 + plot_inputs: + graph_type: Histogram + group_by: None + type: TransformationLog +- form_inputs: + percentile: 0.5 + inputs: {} + instance_identifier: NormalisationByMedian_1 + plot_inputs: + graph_type: Boxplot + group_by: None + visual_transformation: log10 + type: NormalisationByMedian +- form_inputs: {} + inputs: {} + instance_identifier: PlotTimeQuant_1 + type: PlotTimeQuant +- form_inputs: {} + inputs: {} + instance_identifier: TimeSeriesLinearRegression_1 + type: TimeSeriesLinearRegression +- form_inputs: {} + inputs: {} + instance_identifier: TimeSeriesADFullerTest_1 + type: TimeSeriesADFullerTest +- form_inputs: {} + inputs: {} + instance_identifier: TimeSeriesAutoARIMA_1 + type: TimeSeriesAutoARIMA