Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
ae8a7b3
Fixed import for circadian mouse data
RogerAK Jun 21, 2024
53d6baf
Plotquantplot for peptide
RogerAK Jul 2, 2024
b446181
Merge branch 'refs/heads/dev' into bachelor-thesis-kuganash
RogerAK Jul 2, 2024
786da45
Plotquantplot for peptide
RogerAK Jul 10, 2024
cdcb739
Merge branch 'refs/heads/dev' into bachelor-thesis-kuganash
RogerAK Jul 10, 2024
dc9fc74
updated transform_dfs.py so that it supports peptide DFs
RogerAK Jul 10, 2024
286023d
updated transform_dfs.py so that it supports peptide DFs
RogerAK Jul 10, 2024
4b940f0
Implemented Protquantplot with retention time instead of Intensities
RogerAK Jul 10, 2024
3d2bb30
Implemented Timequantplot
RogerAK Jul 11, 2024
468ac23
Renamed the plot to time series plot
RogerAK Jul 11, 2024
8a419b2
Fixed Tests
RogerAK Jul 11, 2024
61b6df8
Implemented test for time series plot
RogerAK Jul 11, 2024
935f0b6
Implemented time series regression analysis
RogerAK Jul 18, 2024
9434b72
Merge branch 'refs/heads/dev' into bachelor-thesis-kuganash
RogerAK Jul 18, 2024
de89b56
Implemented time series regression analysis
RogerAK Jul 19, 2024
97ace4a
Added Docstrings
RogerAK Jul 19, 2024
38eb985
Implemented tests
RogerAK Jul 19, 2024
d752225
Implemented tests
RogerAK Jul 19, 2024
47556f3
made some minor changes
RogerAK Jul 19, 2024
4f8737d
Implemented RANSAC regression
RogerAK Jul 24, 2024
c3fae9b
output field for result
selenabr Jun 20, 2024
67c59c7
further implementation of output field for result
selenabr Jun 21, 2024
20b5e69
display display_output in output field
selenabr Jun 23, 2024
3aa711d
display_output field displayed in the same size and position as the o…
selenabr Jun 25, 2024
2b483f9
Changed is_dynamic to True
RogerAK Jul 25, 2024
5553ca7
Made some minor changes to the Plot positioning
RogerAK Jul 25, 2024
14dac5e
Made some minor changes to the Plot positioning
RogerAK Jul 25, 2024
e9c9acf
Created a thesis Workflow and added some tests for RANSAC
RogerAK Jul 25, 2024
37180af
Implemented Augmented Dickey-Fuller test to check if a time series da…
RogerAK Jul 25, 2024
21468fc
Implemented the option to do regression on each group
henninggaertner Jul 10, 2024
0153a1a
Cherry picked Text Field from Henning's BA
RogerAK Jul 31, 2024
31bd7af
Added info box for ADFuller Test
RogerAK Jul 31, 2024
fb476e4
Fixed Tests
RogerAK Aug 1, 2024
f7da4aa
implemented Auto ARIMA
RogerAK Aug 1, 2024
2e22aa6
implemented Auto ARIMA
RogerAK Aug 8, 2024
82a550a
implemented ARIMA
RogerAK Aug 8, 2024
5c0c157
Fixed RANSAC tests
RogerAK Aug 8, 2024
0b54ee8
Updated ARIMA so that it supports seasonal parameters
RogerAK Aug 14, 2024
e1b77dc
Corrected the output text for the scores
RogerAK Aug 15, 2024
d89b236
Implemented tests for auto ARIMA and ARIMA
RogerAK Aug 15, 2024
dfcbcbb
Merge branch 'refs/heads/dev' into bachelor-thesis-kuganash
RogerAK Aug 15, 2024
ec9b783
Implemented a dynamic field where the user can select the time column…
RogerAK Aug 18, 2024
4059f58
Fixed Tests
RogerAK Aug 18, 2024
9f624f8
Fixed Time Series Analysis
RogerAK Sep 5, 2024
671633d
Implemented TMT data import for PROTzilla
RogerAK Sep 14, 2024
1e1b50f
Updated TimeQuant plot
RogerAK Sep 14, 2024
a3ffe29
Updated a test
RogerAK Sep 14, 2024
7762995
Mapped TMT import
RogerAK Sep 14, 2024
aaeed09
Added an option for the user to select the Time and Grouping column n…
RogerAK Sep 14, 2024
25c37d6
Resolved some comments from Hendrik
RogerAK Sep 14, 2024
f8b556c
Fixed Tests
RogerAK Sep 15, 2024
58ec156
Updated a variable name
RogerAK Sep 15, 2024
8dda742
Updated some methods
RogerAK Sep 24, 2024
0e20447
Updated Test
RogerAK Sep 24, 2024
f3b00e3
Removed unwanted lines
RogerAK Sep 27, 2024
f8bab05
Fixed Tests
RogerAK Sep 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 41 additions & 5 deletions protzilla/constants/colors.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,44 @@
# Discrete color palette, given as hex strings. The labelled sets below each
# hold five shades of one hue.
# NOTE(review): consumers appear to pick colors by list position, so inserting,
# removing or reordering entries changes which color they get - confirm against
# the call sites before editing.
PROTZILLA_DISCRETE_COLOR_SEQUENCE = [
# NOTE(review): the five unlabelled entries below are not part of any "Set N"
# group and look like the previous palette shown as removed lines in a diff
# view - confirm whether they still belong in the list.
"#4A536A",
"#87A8B9",
"#CE5A5A",
"#8E3325",
"#E2A46D",
# Set 1: Muted Dark Slate
"#252935", "#3A3F50", "#50556A", "#6B7186", "#858DA2",
# Set 2: Muted Indian Red
"#CE5A5A", "#B24C4C", "#9D3F3F", "#E07272", "#F48D8D",
# Set 3: Muted Light Steel Blue
"#51646F", "#6A7D89", "#7F92A0", "#96A9B8", "#ADBFCD",
# Set 4: Muted Sienna
"#804538", "#6F3C31", "#5F342A", "#A05748", "#B66E5E",
# Set 5: Muted Sandy Brown
"#715236", "#63472F", "#57402B", "#96755A", "#A98575",
# Set 6: Muted Olive
"#6E6B48", "#5D5B3E", "#4E4D36", "#89875C", "#A1A16E",
# Set 7: Muted Teal
"#3B6B6A", "#315B5B", "#274C4C", "#507E7E", "#6B9898",
# Set 8: Muted Taupe
"#8B7E74", "#776F65", "#675E56", "#A09085", "#B9AAA1",
# Set 9: Muted Burgundy
"#7B3A4F", "#6A3345", "#582C3C", "#925664", "#A8737E",
# Set 10: Muted Forest Green
"#3D5047", "#35453E", "#2D3B35", "#5F7267", "#7B8D80",
# Set 11: Muted Navy
"#2F3E4C", "#283442", "#222B38", "#485669", "#627185",
# Set 12: Muted Mustard
"#BFA054", "#A98F4A", "#927D3F", "#D7BA75", "#E2CD96",
# Set 13: Muted Dusty Rose
"#C18394", "#AA727E", "#93616C", "#D69BA7", "#E4B8C2",
# Set 14: Muted Lavender
"#8A729D", "#7A638C", "#6A547C", "#A591B3", "#BDA9C8",
# Set 15: Muted Charcoal
"#404040", "#353535", "#2B2B2B", "#585858", "#707070",
# Set 16: Muted Emerald Green
"#4D7456", "#426448", "#37563B", "#6A9177", "#85A990",
# Set 17: Muted Peach
"#D89B83", "#C2866F", "#A7725E", "#E3B39C", "#ECC7B6",
# Set 18: Muted Plum
"#704F6E", "#634464", "#563A59", "#876A87", "#A18AA1",
# Set 19: Muted Periwinkle
"#7E8DAF", "#6F7B98", "#616A82", "#97A3BF", "#B0B9D1",
# Set 20: Muted Coral
"#CC7A5E", "#B26951", "#9A5A45", "#DD937C", "#EBAA99"
]

# Two-color sequence.
# NOTE(review): presumably one color for regular points and one for outliers,
# judging by the name - confirm against the plots that use it.
PROTZILLA_DISCRETE_COLOR_OUTLIER_SEQUENCE = ["#4A536A", "#CE5A5A"]
15 changes: 15 additions & 0 deletions protzilla/data_analysis/time_series_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from datetime import datetime

def convert_time_to_hours(time_str):
    """
    Return the given time value unchanged.

    Despite its name, this function currently performs no conversion: the
    parsing of '%H:%M:%S' strings into hours since midnight is disabled, so
    callers receive exactly the value they passed in.

    :param time_str: The time value of a sample; it is not parsed or validated.

    :return: ``time_str`` itself, unmodified
    """
    # NOTE(review): kept as a pass-through so existing callers keep working;
    # confirm whether the '%H:%M:%S' conversion should be re-enabled.
    return time_str
179 changes: 179 additions & 0 deletions protzilla/data_analysis/time_series_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import pandas as pd
import plotly.graph_objects as go
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

from protzilla.utilities.transform_dfs import is_long_format, long_to_wide_time
from protzilla.constants.colors import PROTZILLA_DISCRETE_COLOR_SEQUENCE

# Define color constants
# Layout colors shared by the figure built in this module: background, grid
# and axis lines, plus two annotation colors.
colors = dict(
    plot_bgcolor="white",
    gridcolor="#F1F1F1",
    linecolor="#F1F1F1",
    annotation_text_color="#ffffff",
    annotation_proteins_of_interest="#4A536A",
)

def time_quant_plot(
    intensity_df: pd.DataFrame,
    metadata_df: pd.DataFrame,
    time_column: str,
    protein_group: str,
    similarity: float = 1.0,
    similarity_measure: str = "euclidean distance",
) -> dict:
    """
    Plot the intensity of one protein group over time as a line diagram.

    The selected protein group is drawn as a highlighted line. Every other
    protein group whose z-scored profile is similar enough to it (according
    to ``similarity_measure`` and ``similarity``) is drawn as an additional
    line in a second color. The range between the per-row minimum and maximum
    over all protein groups is drawn in the background as a silver polygon.

    :param intensity_df: A dataframe with a "Sample" column; it is merged with
        the metadata on that column and converted with ``long_to_wide_time``
        when ``is_long_format`` reports long format.
    :param metadata_df: A dataframe containing the metadata of the samples. It
        must contain the columns "Sample" and ``time_column``.
    :param time_column: The name of the column in metadata_df that contains
        the time information.
    :param protein_group: The protein group to highlight; must be a column of
        the wide dataframe.
    :param similarity: Threshold for the chosen measure. For "euclidean
        distance" groups with a distance <= similarity are selected (must be
        >= 0); for "cosine similarity" groups with a similarity >= similarity
        are selected (must be within [-1, 1]).
    :param similarity_measure: "euclidean distance" or "cosine similarity".
        Any value other than "euclidean distance" is treated as cosine
        similarity.

    :return: a dictionary with the key "plots" holding a list that contains
        the plotly figure
    :raises ValueError: if ``protein_group`` is not a column of the wide
        dataframe or ``similarity`` is outside the range of the chosen measure
    """
    intensity_df = pd.merge(
        left=intensity_df,
        right=metadata_df[["Sample", time_column]],
        on="Sample",
        copy=False,
    )

    wide_df = intensity_df.interpolate(method="linear", axis=0)
    if is_long_format(wide_df, time_column=time_column):
        wide_df = long_to_wide_time(wide_df, time_column=time_column)

    if protein_group not in wide_df.columns:
        raise ValueError("Please select a valid protein group.")
    elif similarity_measure == "euclidean distance" and similarity < 0:
        raise ValueError(
            "Similarity for euclidean distance should be greater than or equal to 0."
        )
    elif similarity_measure == "cosine similarity" and (
        similarity < -1 or similarity > 1
    ):
        raise ValueError("Similarity for cosine similarity should be between -1 and 1")

    fig = go.Figure()

    # Outline of the min/max range: start at the first row's minimum, walk the
    # per-row maxima forwards and the per-row minima backwards so that the
    # polygon closes on its starting point.
    time_points = list(wide_df.index)
    row_maxima = wide_df.max(axis=1).tolist()
    row_minima = wide_df.min(axis=1).tolist()
    lower_upper_x = [wide_df.index[0]] + time_points + time_points[::-1]
    lower_upper_y = [row_minima[0]] + row_maxima + row_minima[::-1]

    fig.add_trace(
        go.Scatter(
            x=lower_upper_x,
            y=lower_upper_y,
            fill="toself",
            name="Intensity Range",
            line=dict(color="silver"),
        )
    )

    use_euclidean = similarity_measure == "euclidean distance"
    # The reference profile does not change, so standardise it only once.
    reference = _zscore_row(wide_df[protein_group])
    similar_groups = []
    for group_to_compare in wide_df.columns:
        if group_to_compare == protein_group:
            continue
        candidate = _zscore_row(wide_df[group_to_compare])
        if use_euclidean:
            # Smaller distance means more similar.
            is_similar = euclidean_distances(reference, candidate)[0][0] <= similarity
        else:
            # Larger cosine similarity means more similar.
            is_similar = cosine_similarity(reference, candidate)[0][0] >= similarity
        if is_similar:
            similar_groups.append(group_to_compare)

    similar_color = PROTZILLA_DISCRETE_COLOR_SEQUENCE[9]
    # With many similar groups the legend would overflow, so the individual
    # entries are hidden and replaced by one summary entry below.
    show_individual_entries = len(similar_groups) <= 7
    for group in similar_groups:
        fig.add_trace(
            go.Scatter(
                x=wide_df.index,
                y=wide_df[group],
                mode="lines",
                name=_abbreviate_label(group),
                line=dict(color=similar_color),
                showlegend=show_individual_entries,
            )
        )

    if not show_individual_entries:
        # Empty trace that only contributes the summary legend entry.
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="lines",
                line=dict(color=similar_color),
                name="Similar Protein Groups",
            )
        )

    formatted_protein_name = _abbreviate_label(protein_group)
    # Added last so the selected protein group is drawn on top.
    fig.add_trace(
        go.Scatter(
            x=wide_df.index,
            y=wide_df[protein_group],
            mode="lines",
            name=formatted_protein_name,
            line=dict(color=PROTZILLA_DISCRETE_COLOR_SEQUENCE[4]),
        )
    )
    fig.update_layout(
        title=f"Time Series of {formatted_protein_name} in all samples",
        plot_bgcolor=colors["plot_bgcolor"],
        xaxis_gridcolor=colors["gridcolor"],
        yaxis_gridcolor=colors["gridcolor"],
        xaxis_linecolor=colors["linecolor"],
        yaxis_linecolor=colors["linecolor"],
        xaxis_title=time_column,
        yaxis_title="Intensity",
        legend_title="Legend",
        xaxis=dict(
            tickmode="array",
            tickangle=0,
            tickvals=wide_df.index,
            # One label per tick, taken from the index value itself. The
            # dataframe is no longer modified while the labels are built.
            ticktext=[str(time_point) for time_point in wide_df.index],
        ),
        autosize=True,
        margin=dict(l=100, r=300, t=100, b=100),
        legend=dict(
            x=1.05,
            y=1,
            bgcolor="rgba(255, 255, 255, 0.5)",
            orientation="v",
        ),
    )

    return dict(plots=[fig])


def _abbreviate_label(name, max_length=15):
    """Shorten names longer than ``max_length`` to that length plus "..."."""
    return name[:max_length] + "..." if len(name) > max_length else name


def _zscore_row(series):
    """Return the z-scores of ``series`` as a 2-D array with a single row."""
    return stats.zscore(series.to_numpy()).reshape(1, -1)
Loading