diff --git a/common b/common index 79b884b4..67da19a3 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 79b884b4c7fed300972d83a6ca025abb6116cbdc +Subproject commit 67da19a36ae56ea068804d15ccadec88a06da920 diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 979440c9..baa19b65 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -17,16 +17,15 @@ cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_cyto_batch_integration/datasets/**/state.yaml rename_keys: 'input_censored_split1:output_censored_split1;input_censored_split2:output_censored_split2;input_unintegrated:output_unintegrated' output_state: "state.yaml" -settings: '{"metrics_exclude": ["cms"], "methods_include": ["mnnpy", "cytovi"]}' publish_dir: "$publish_dir" HERE tw launch https://github.com/openproblems-bio/task_cyto_batch_integration.git \ - --revision build/fix_failed_stuff \ + --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ - --labels task_cyto_batch_integration,mnnnpy + --labels task_cyto_batch_integration,test_subset diff --git a/src/control_methods/shuffle_integration/config.vsh.yaml b/src/control_methods/shuffle_integration/config.vsh.yaml index 23f22899..9ba34a08 100644 --- a/src/control_methods/shuffle_integration/config.vsh.yaml +++ b/src/control_methods/shuffle_integration/config.vsh.yaml @@ -3,10 +3,9 @@ name: shuffle_integration label: Shuffle Integration summary: Randomly shuffle cells in the whole dataset. description: | - This negative control randomly permutes cell-to-sample (hence batch) - assignments while keeping each cell's measured markers unchanged. - This destroys any biological and batch specific structure but preserves marker expression. - + This negative control randomly shuffles all cells in the input data, + destroying any biological structure (e.g., sample to cell mapping or batch assignments). + Purpose: - Provide a baseline to verify that integration methods outperform random assignment of cells to batches. diff --git a/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml index 9bf6a1d1..4f8ba2ea 100644 --- a/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml +++ b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml @@ -3,11 +3,9 @@ name: shuffle_integration_by_batch label: Shuffle Integration — within batches summary: Randomly reassign cells to any samples within the same batch. description: | - This negative-control method randomly permutes cell-to-cell type assignments. - Cells remain assigned to their original batch (batch effects preserved). - Within each batch, cells are reassigned to random samples, destroying - biological/sample-specific structure (e.g., KO vs WT differences). - + This negative-control method randomly shuffles cells within each batch independently, + destroying cell to sample mapping while preserving batch-specific distributions. + Purpose: - Evaluate whether an integration method preserves differences between samples and biological groups while removing batch effects. 
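A minimal sketch of the within-batch shuffle described in the config above, assuming an AnnData with obs columns `batch` and `sample` (this is an illustration, not the component's actual script): permuting the sample labels within each batch is equivalent to reassigning cells to random samples from the same batch while leaving marker expression and batch composition untouched.

```python
# Hypothetical sketch of a "shuffle within batch" negative control.
# Assumes obs columns "batch" and "sample"; not the component's actual implementation.
import anndata as ad
import numpy as np


def shuffle_within_batch(adata: ad.AnnData, seed: int = 0) -> ad.AnnData:
    rng = np.random.default_rng(seed)
    shuffled = adata.copy()
    samples = shuffled.obs["sample"].to_numpy()
    for batch in shuffled.obs["batch"].unique():
        idx = np.where((shuffled.obs["batch"] == batch).to_numpy())[0]
        # Permute which cell carries which sample label within this batch, destroying
        # the cell-to-sample mapping while keeping the batch composition intact.
        samples[idx] = samples[rng.permutation(idx)]
    shuffled.obs["sample"] = samples
    return shuffled
```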
diff --git a/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml index 47bc46df..bb9862bd 100644 --- a/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml +++ b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml @@ -3,17 +3,21 @@ name: shuffle_integration_by_cell_type label: Shuffle Integration — within cell type summary: Randomly reassign cells to any cell types description: | - This negative-control method randomly permutes cell-to-cell type assignments. - Cells will be assigned to any cell types, regardless of their original cell type - or sample of origin or batch of origin. + This negative-control method randomly shuffles cells within each cell type independently, + destroying batch structure while preserving cell type-specific distributions. + This serves as a negative control that maintains biological groupings but + eliminates batch grouping in each cell type. Purpose: - Evaluate whether an integration method preserves differences between cell types while removing batch effects. Example: - - A Neutrophil from a KO sample in batch 1 may be reassigned to any cell type - (B cell, T cell, Monocyte, etc.) from any sample in any batch. + - A Neutrophil in batch 1 from a KO sample may be reassigned to a Neutrophil in a KO or WT + sample from batch 1 or batch 2, including staying in its original sample or being assigned + to a different donor, but it will never be reassigned to another cell type. + # status: disabled resources: - type: python_script diff --git a/src/methods/cytovi/config.vsh.yaml b/src/methods/cytovi/config.vsh.yaml index eb6ff7d8..2d9a097f 100644 --- a/src/methods/cytovi/config.vsh.yaml +++ b/src/methods/cytovi/config.vsh.yaml @@ -38,13 +38,13 @@ arguments: type: integer default: 1 description: Number of layers. - - name: --n_clusters + - name: --max_epochs type: integer - default: 20 - description: Number of clusters to use for subsampling. - - name: --subsample_fraction + default: 1000 + description: Number of epochs to train the model. + - name: --train_size type: double - default: 0.5 + default: 0.9 - description: Fraction of cells to subsample from each cluster for training. + description: Fraction of cells to use for training the model.
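Analogous to the within-batch sketch after the previous chunk, the within-cell-type behaviour described above (a cell may swap sample, batch, or donor, but never cell type) could be sketched as below. The obs column names are assumptions based on the descriptions in this diff; this is not the component's actual implementation.

```python
# Hypothetical sketch of the "shuffle within cell type" control.
# Assumes obs columns "cell_type", "sample", "batch", and "donor".
import anndata as ad
import numpy as np


def shuffle_within_cell_type(adata: ad.AnnData, seed: int = 0) -> ad.AnnData:
    rng = np.random.default_rng(seed)
    shuffled = adata.copy()
    labels = shuffled.obs[["sample", "batch", "donor"]].to_numpy()
    for ct in shuffled.obs["cell_type"].unique():
        idx = np.where((shuffled.obs["cell_type"] == ct).to_numpy())[0]
        # One joint permutation per cell type: a cell may swap sample, batch, and donor
        # with another cell of the same type, but its cell type never changes.
        labels[idx] = labels[rng.permutation(idx)]
    shuffled.obs[["sample", "batch", "donor"]] = labels
    return shuffled
```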
# Resources required to run the component @@ -68,11 +68,10 @@ engines: packages: - anndata>=0.11.0 - scanpy[skmisc]>=1.10 - - scvi-tools==1.4.0 + - scvi-tools==1.4.0.post1 - pyyaml - requests - jsonschema - - scikit-learn github: - openproblems-bio/core#subdirectory=packages/python/openproblems diff --git a/src/methods/cytovi/script.py b/src/methods/cytovi/script.py index 6a9768af..dc8b6e07 100644 --- a/src/methods/cytovi/script.py +++ b/src/methods/cytovi/script.py @@ -1,8 +1,13 @@ +import time + import anndata as ad import numpy as np +import scvi +import torch from scvi.external import cytovi -from sklearn.cluster import KMeans -from threadpoolctl import threadpool_limits + +# from sklearn.cluster import KMeans +# from threadpoolctl import threadpool_limits ## VIASH START par = { @@ -10,16 +15,23 @@ "output": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/output_cytovi_split2.h5ad", "n_hidden": 128, "n_layers": 1, - "n_clusters": 10, - "subsample_fraction": 0.5, + "max_epochs": 1000, + "train_size": 0.9, } meta = {"name": "cytovi"} ## VIASH END +# setting calculation to TF32 to speed up training +torch.backends.cuda.matmul.allow_tf32 = True + +# increase num workers for data loading +scvi.settings.num_workers = 95 + print("Reading and preparing input files", flush=True) adata = ad.read_h5ad(par["input"]) adata.obs["batch_str"] = adata.obs["batch"].astype(str) +adata.obs["sample_key_str"] = adata.obs["sample"].astype(str) markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy() markers_not_correct = adata.var[~adata.var["to_correct"]].index.to_numpy() @@ -33,41 +45,36 @@ adata=adata_to_correct, transformed_layer_key="preprocessed", batch_key="batch_str", + scaled_layer_key="scaled", inplace=True, ) -print("Clustering using k-means with k =", par["n_clusters"], flush=True) -# cluster data using Kmeans -with threadpool_limits(limits=1): - adata_to_correct.obs["clusters"] = ( - KMeans(n_clusters=par["n_clusters"], random_state=0) - .fit_predict(adata_to_correct.layers["scaled"]) - .astype(str) - ) -# concatenate obs so we can use it for subsampling -adata_to_correct.obs["sample_cluster"] = ( - adata_to_correct.obs["sample"].astype(str) + "_" + adata_to_correct.obs["clusters"] -) -# subsample cells without replacement -print("Subsampling cells", flush=True) -subsampled_cells = adata_to_correct.obs.groupby("sample_cluster")[ - "sample_cluster" -].apply(lambda x: x.sample(n=round(len(x) * par["subsample_fraction"]), replace=False)) -# need the cell id included in the subsample -subsampled_cells_idx = [x[1] for x in subsampled_cells.index.to_list()] - -adata_subsampled = adata_to_correct[subsampled_cells_idx, :].copy() - print( - f"Train CytoVI on subsampled data containing {adata_subsampled.shape[0]} cells", + f"Train CytoVI on {adata_to_correct.shape[0]} cells", flush=True, ) -cytovi.CYTOVI.setup_anndata(adata_subsampled, layer="scaled", batch_key="batch_str") +cytovi.CYTOVI.setup_anndata( + adata_to_correct, + layer="scaled", + batch_key="batch_str", + sample_key="sample_key_str", +) + model = cytovi.CYTOVI( - adata=adata_subsampled, n_hidden=par["n_hidden"], n_layers=par["n_layers"] + adata_to_correct, n_hidden=par["n_hidden"], n_layers=par["n_layers"] +) + +print("Start training CytoVI model", flush=True) + +start = time.time() +model.train( + batch_size=8192, + max_epochs=par["max_epochs"], + train_size=par["train_size"], ) -model.train() +end = time.time() +print(f"Training took {end - start:.2f} seconds", flush=True) # get batch corrected 
data print("Correcting data", flush=True) diff --git a/src/methods/harmonypy/script.py b/src/methods/harmonypy/script.py index 8fc0cccc..5ac69a5d 100644 --- a/src/methods/harmonypy/script.py +++ b/src/methods/harmonypy/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { - "input": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/censored_split2.h5ad", - "output": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/output_harmony_split2.h5ad", + "input": "/Users/putri.g/Documents/cytobenchmark/debug_general/_viash_par/input_1/censored_split1.h5ad", + "output": "/Users/putri.g/Documents/cytobenchmark/debug_general/_viash_par/output_1/output_harmony_split1.h5ad", } meta = {"name": "harmonypy"} ## VIASH END @@ -13,6 +13,7 @@ print("Reading and preparing input files", flush=True) adata = ad.read_h5ad(par["input"]) +# harmony can't handle integer batch labels adata.obs["batch_str"] = adata.obs["batch"].astype(str) markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy() @@ -21,10 +22,13 @@ adata_to_correct = adata[:, markers_to_correct].copy() print("Run harmony", flush=True) -# harmony can't handle integer batch labels + +# TODO numerical instability in kmeans causing problem with harmony. +# so adding a very small value to all entries to make sure there are no zeros +epsilon = 1e-20 out = harmonypy.run_harmony( - data_mat=adata_to_correct.layers["preprocessed"], + data_mat=adata_to_correct.layers["preprocessed"] + epsilon, meta_data=adata_to_correct.obs, vars_use="batch_str", ) diff --git a/src/metrics/bras/config.vsh.yaml b/src/metrics/bras/config.vsh.yaml index fed70ef3..a99b43f5 100644 --- a/src/metrics/bras/config.vsh.yaml +++ b/src/metrics/bras/config.vsh.yaml @@ -1,16 +1,9 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_metric.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. name: bras - - - +status: disabled # Metadata for your component info: metrics: @@ -56,13 +49,6 @@ info: # Whether a higher value represents a 'better' solution (required) maximize: true -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - # Resources required to run the component resources: # The script of your component (required) @@ -73,6 +59,14 @@ resources: engines: # Specifications for the Docker image for this component. 
+ # testing gpu jax version + # - type: docker + # image: openproblems/base_pytorch_nvidia:1.1 + # setup: + # - type: python + # packages: + # - jax[cuda_12_pip] + # - scib-metrics~=0.5.6 - type: docker image: python:3.11 setup: diff --git a/src/metrics/bras/script.py b/src/metrics/bras/script.py index 98229ad0..db3428a1 100644 --- a/src/metrics/bras/script.py +++ b/src/metrics/bras/script.py @@ -57,6 +57,7 @@ labels=ct_labels_s1, batch=batch_labels_s1, metric="euclidean", + chunk_size=512, ) batch_labels_s2 = integrated_s2.obs["batch"].values @@ -67,6 +68,7 @@ labels=ct_labels_s2, batch=batch_labels_s2, metric="euclidean", + chunk_size=512, ) bras_score = np.mean([bras_s1, bras_s2]) diff --git a/src/metrics/n_inconsistent_peaks/config.vsh.yaml b/src/metrics/n_inconsistent_peaks/config.vsh.yaml index 569be185..9c8bae0e 100644 --- a/src/metrics/n_inconsistent_peaks/config.vsh.yaml +++ b/src/metrics/n_inconsistent_peaks/config.vsh.yaml @@ -2,7 +2,7 @@ __merge__: ../../api/comp_metric.yaml name: n_inconsistent_peaks -# status: disabled +status: disabled info: metrics: diff --git a/src/metrics/ratio_inconsistent_peaks/config.vsh.yaml b/src/metrics/ratio_inconsistent_peaks/config.vsh.yaml new file mode 100644 index 00000000..d4ca192e --- /dev/null +++ b/src/metrics/ratio_inconsistent_peaks/config.vsh.yaml @@ -0,0 +1,70 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: ratio_inconsistent_peaks + +# Metadata for your component info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: ratio_inconsistent_peaks + label: Ratio of inconsistent peaks + summary: "Ratio of cell-type marker-expression cases whose number of peaks becomes inconsistent between the two data splits after batch integration." + description: | + The metric compares the number of cell type specific marker expression peaks between unintegrated and batch-integrated data. + The (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`), + and peaks are then identified using the `scipy.signal.find_peaks` function. + For peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05*max_density. + The ratio of inconsistent peaks is defined as the number of cases where the number of peaks differs between the two splits in the + batch-normalized data, divided by the total number of cases. + Cases where the number of peaks differs between the two splits in the unintegrated data are excluded from the denominator. + A lower score indicates better performance, meaning there are fewer cases with inconsistent peaks after batch integration. + + references: + doi: + - 10.1038/s41592-019-0686-2 + links: + # URL to the documentation for this metric (required). + documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html#scipy.signal.find_peaks + # URL to the code repository for this metric (required).
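The per-marker peak-consistency check described in the metric above boils down to a few scipy calls. The snippet below is a simplified, self-contained illustration on made-up data, with the standardisation step omitted and thresholds taken from the description; the component's actual implementation is the helper.py added further down in this diff.

```python
# Simplified illustration of the peak-consistency idea: smooth each split's marker
# expression with a KDE, count peaks, and compare the counts between splits.
import numpy as np
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde


def count_peaks(values: np.ndarray, n_grid: int = 100) -> int:
    kde = gaussian_kde(values, bw_method="scott")
    grid = np.linspace(values.min(), values.max(), n_grid)
    density = kde(grid)
    peaks, _ = find_peaks(density, prominence=0.1, height=0.05 * density.max())
    return len(peaks)


rng = np.random.default_rng(0)
split1 = np.concatenate([rng.normal(0, 1, 500), rng.normal(4, 1, 500)])  # bimodal marker
split2 = rng.normal(0, 1, 1000)                                          # unimodal marker
consistent = count_peaks(split1) == count_peaks(split2)
print(consistent)  # False: the two splits disagree on the number of peaks
```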
+ repository: https://github.com/scipy/scipy/blob/v1.15.2/scipy/signal/_peak_finding.py#L0-L1 + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: +.inf + # Whether a higher value represents a 'better' solution (required) + maximize: false + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + - path: helper.py + - path: /src/utils/helper_functions.py + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + packages: + - scikit-tda + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/metrics/ratio_inconsistent_peaks/helper.py b/src/metrics/ratio_inconsistent_peaks/helper.py new file mode 100644 index 00000000..461f6915 --- /dev/null +++ b/src/metrics/ratio_inconsistent_peaks/helper.py @@ -0,0 +1,137 @@ +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from ripser import ripser +from scipy.signal import find_peaks +from scipy.stats import gaussian_kde + + +def standardise_marker_expression(dist_1, dist_2): + """ + Standardises the marker expression values from two distributions. + + Inputs: + dist_1: array of values (1D) representing the marker expression from distribution 1 + dist_2: array of values (1D) representing the marker expression from distribution 2 + + Outputs: + std_dist_1: array of standardised values for distribution 1 + std_dist_2: array of standardised values for distribution 2 + """ + + pooled = np.concatenate([dist_1, dist_2]) + mu, sd = pooled.mean(), pooled.std() + std_dist_1 = (dist_1 - mu) / (sd) + std_dist_2 = (dist_2 - mu) / (sd) + + return std_dist_1, std_dist_2 + + +def get_kde_density(expression_array, return_xgrid=False, plot=False): + """ + Returns the density of the array using a gaussian kernel density estimation. + + Inputs: + expression_array: array of values (1D) representing the marker expression + return_xgrid: boolean, if True, also return the x_grid values used for density estimation + plot: boolean, if True, plot the density estimation + + Outputs: + density: array of values representing the density of marker expression + x_grid (optional): array of x values where the density is evaluated + """ + + min_val = expression_array.min() + max_val = expression_array.max() + marker_values = np.reshape(expression_array, (1, -1)) # Reshape array for KDE + kde = gaussian_kde(marker_values, bw_method="scott") + x_grid = np.linspace(min_val, max_val, 100) + density = kde(x_grid) + + # If the highest value is at the first bin, shift bins by one and adjust x_grid + if np.argmax(density) == 0 and density.size > 1: + print("Shifting KDE bins by one as the highest density is at the first bin.") + # orig_x_grid = x_grid.copy() + # recale the grid so we only have 99 bins and shift everything by one to the right.. + x_grid = np.linspace(min_val, max_val, 99) + density = kde(x_grid) + + # Prepend a zero so the beginning, but remove the last value to keep size consistent + # as otherwise we will end up with an extra bin... + density = np.concatenate([[0.0], density]) + # Have to use actual grid spacing to keep uniform spacing in x_grid. + # Can't just blindly add 1. 
+ step = (max_val - min_val) / (len(x_grid)) if len(x_grid) > 1 else 0.0 + x_grid = np.concatenate([[min_val - step], x_grid]) + + if plot: + fig, ax = plt.subplots() + sns.scatterplot(x=x_grid, y=density, ax=ax) + ax.set_title("KDE Density Estimation") + ax.set_xlabel("Marker Expression") + ax.set_ylabel("Density") + fig.tight_layout() + fig.show() + + if return_xgrid: + # handy for plotting later on and maybe even save in the AnnData object + return density, x_grid + else: + return density + + +def call_peaks(density): + """ + Returns the number of peaks in the density using scipy.signal.find_peaks. + + Inputs: + density: array of values representing the density of marker expression + + Outputs: + num_peaks: integer number of peaks detected in the density + """ + + height_trsh = 0.1 + prom_trsh = 0.01 + + peaks, _ = find_peaks(density, prominence=prom_trsh, height=height_trsh) + num_peaks = len(peaks) + + return num_peaks + + +def persistent_peak_count(ys, persistence_cutoff=0.08): + """ + Counts robust peaks in a 1D dataset using persistent homology. + + Args: + ys (np.ndarray): KDE of a marker expression (1D array) + persistence_cutoff (float): a threshold that decides which peaks are “significant enough” to count. + A large persistence peak survives over many levels of smoothing (i.e. a strong, real peak). + A small persistence peak quickly merges into a neighbor — likely noise. + 0.01: very low threshold counts even weak bumps as peaks + 0.05: moderate threshold counts clearly separated peaks + 0.1–0.2: high threshold counts only strong, dominant peaks + Defaults to 0.08 to bias towards strong peaks without being overly strict. + + Returns: + int: number of significant peaks + """ + + y = np.asarray(ys) + if y.size == 0: + return 0 + + # Shift if max is at the first bin + if y.size > 1 and np.argmax(y) == 0: + y = np.concatenate([[0.0], y[:-1]]) + + # Invert to turn peaks into "holes" for 0D persistence + Y = -y.reshape(-1, 1) + diagram = ripser(Y, maxdim=0)["dgms"][0] + persistence = diagram[:, 1] - diagram[:, 0] + + # Define significance threshold relative to data range + threshold = persistence_cutoff * np.ptp(y) + n_peaks = np.sum(persistence > threshold) + return n_peaks diff --git a/src/metrics/ratio_inconsistent_peaks/script.py b/src/metrics/ratio_inconsistent_peaks/script.py new file mode 100644 index 00000000..e1aeae75 --- /dev/null +++ b/src/metrics/ratio_inconsistent_peaks/script.py @@ -0,0 +1,239 @@ +import sys +from collections import defaultdict + +import anndata as ad +import numpy as np +import pandas as pd + +## VIASH START +# The following code has been auto-generated by Viash.
+par = { + "input_unintegrated": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/unintegrated.h5ad", + # "input_unintegrated": "/Users/putri.g/Documents/cytobenchmark/benchmark_out_20251015/human_blood_mass_cytometry/unintegrated.h5ad", + "input_integrated_split1": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/integrated_split1.h5ad", + "input_integrated_split2": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/integrated_split2.h5ad", + "output": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/score.h5ad", +} +meta = { + "name": "ratio_inconsistent_peaks", +} + +# for local testing only +# import src.metrics.ratio_inconsistent_peaks.helper as metric_helper +# from src.utils.helper_functions import ( +# get_obs_var_for_integrated, +# remove_unlabelled, +# subset_markers_tocorrect, +# subset_nocontrols, +# ) + +## VIASH END + +sys.path.append(meta["resources_dir"]) + +import helper as metric_helper + +# from helper import call_peaks, get_kde_density +from helper_functions import ( + get_obs_var_for_integrated, + remove_unlabelled, + subset_markers_tocorrect, + subset_nocontrols, +) + +print("Reading input files", flush=True) +integrated_s1 = ad.read_h5ad(par["input_integrated_split1"]) +integrated_s2 = ad.read_h5ad(par["input_integrated_split2"]) +unintegrated = ad.read_h5ad(par["input_unintegrated"]) + +print("Formatting input files", flush=True) +integrated_s1, integrated_s2 = get_obs_var_for_integrated( + integrated_s1, integrated_s2, unintegrated +) + +integrated_s1 = subset_nocontrols(integrated_s1) +integrated_s1 = subset_markers_tocorrect(integrated_s1) +integrated_s1 = subset_nocontrols(integrated_s1) +integrated_s1 = remove_unlabelled(integrated_s1) + +integrated_s2 = subset_nocontrols(integrated_s2) +integrated_s2 = subset_markers_tocorrect(integrated_s2) +integrated_s2 = subset_nocontrols(integrated_s2) +integrated_s2 = remove_unlabelled(integrated_s2) + +donor_list = integrated_s1.obs["donor"].unique() + +print("Compute metric (per cell type)", flush=True) + +# case 1 = consistent peaks in unintegrated and also in integrated +# case 3 = consistent peaks in unintegrated but inconsistent in integrated +# not recording case 2 or 4 where unintegrated is inconsistent +n_case1 = 0 +n_case3 = 0 + +# so we can see where each cases comes from +case_details = defaultdict(list) + +for donor in donor_list: + # for testing only + # donor = donor_list[0] + + print("Processing donor", donor, flush=True) + + u_view = unintegrated[unintegrated.obs["donor"] == donor] + + # process per split + s1_view = integrated_s1[integrated_s1.obs["donor"] == donor] + s2_view = integrated_s2[integrated_s2.obs["donor"] == donor] + + celltype_list = s1_view.obs["cell_type"].unique() + + for celltype in celltype_list: + # for testing only + # celltype = celltype_list[0] + + print(f"Processing celltype {celltype}", flush=True) + + u_view_ct = u_view[u_view.obs["cell_type"] == celltype] + s1_view_ct = s1_view[s1_view.obs["cell_type"] == celltype] + s2_view_ct = s2_view[s2_view.obs["cell_type"] == celltype] + + if s1_view_ct.shape[0] < 100 or s2_view_ct.shape[0] < 100: + print(f"Skipping celltype {celltype} and donor {donor}.", flush=True) + if s1_view_ct.shape[0] < 100: + print( + f"Because n_cells in s1 is {s1_view_ct.shape[0]}, less than 100", + flush=True, + ) + else: + print( + f"Because n_cells in s2 is {s2_view_ct.shape[0]}, less than 100", + flush=True, + ) + # TODO uncomment me when 
done + continue + + # unintegrated for split 1 + u_view_ct_s1 = u_view_ct[u_view_ct.obs["split"] == 1] + u_view_ct_s2 = u_view_ct[u_view_ct.obs["split"] == 2] + + for marker in s1_view_ct.var.index: + # for testing only + # marker = u_view_ct.var.index[0] + + print(f"Processing marker {marker} for celltype {celltype}", flush=True) + + print("--------------------------------", flush=True) + print("Computing peaks for unintegrated", flush=True) + + print("Standardising marker expression", flush=True) + # standardise marker expression based on pooled mean and sd of + # unscaled marker expression for unintegrated data for split 1 and 2 + u_s1_unscaled = np.array(u_view_ct_s1[:, marker].layers["preprocessed"]) + u_s2_unscaled = np.array(u_view_ct_s2[:, marker].layers["preprocessed"]) + + u_s1_scaled, u_s2_scaled = metric_helper.standardise_marker_expression( + u_s1_unscaled, + u_s2_unscaled, + ) + print("Computing KDE density", flush=True) + density_dist_u_s1 = metric_helper.get_kde_density( + expression_array=u_s1_scaled + ) + density_dist_u_s2 = metric_helper.get_kde_density( + expression_array=u_s2_scaled + ) + + print("Calling peaks", flush=True) + + peaks_u_s1 = metric_helper.call_peaks(density_dist_u_s1) + peaks_u_s2 = metric_helper.call_peaks(density_dist_u_s2) + + print("--------------------------------", flush=True) + + print("\n", flush=True) + + print("--------------------------------", flush=True) + print("Computing peaks for integrated", flush=True) + + print("Standardising marker expression", flush=True) + # standardise marker expression based on pooled mean and sd of + # unscaled marker expression for unintegrated data for split 1 and 2 + s1_unscaled = np.array(s1_view_ct[:, marker].layers["integrated"]) + s2_unscaled = np.array(s2_view_ct[:, marker].layers["integrated"]) + + s1_scaled, s2_scaled = metric_helper.standardise_marker_expression( + s1_unscaled, + s2_unscaled, + ) + print("Computing KDE density", flush=True) + density_dist_s1 = metric_helper.get_kde_density(s1_scaled) + density_dist_s2 = metric_helper.get_kde_density(s2_scaled) + + print("Calling peaks", flush=True) + + peaks_s1 = metric_helper.call_peaks(density_dist_s1) + peaks_s2 = metric_helper.call_peaks(density_dist_s2) + + print("--------------------------------", flush=True) + + print("\n", flush=True) + + print( + f"Comparing peaks between unintegrated and integrated for {donor}, {celltype}, {marker}", + flush=True, + ) + + # case 1 or 3 where we have consistent peaks in unintegrated + if peaks_u_s1 == peaks_u_s2: + if peaks_s1 != peaks_s2: + n_case3 += 1 + case_details["case3"].append((donor, celltype, marker)) + else: + n_case1 += 1 + case_details["case1"].append((donor, celltype, marker)) + else: + print( + "WARNING! Inconsistent peaks detected in unintegrated data (case 2 or 4). Skipping calculation", + flush=True, + ) + print( + f"Number of peaks in unintegrated split 1: {peaks_u_s1}, split 2: {peaks_u_s2}", + flush=True, + ) + case_details["case2or4"].append((donor, celltype, marker)) + + print("Done comparing peaks.", flush=True) + print("\n", flush=True) + +print("Done processing all celltypes and donors", flush=True) +print("Calculating ratio", flush=True) + +if n_case1 + n_case3 == 0: + print( + "Only case 2 or 4 are found!. 
Cannot calculate metric.", + flush=True, + ) + metric_val = np.nan +else: + metric_val = n_case3 / (n_case1 + n_case3) + + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": integrated_s1.uns["dataset_id"], + "method_id": integrated_s1.uns["method_id"], + "metric_ids": [meta["name"]], + "metric_values": [metric_val], + "n_cases": { + "case1": n_case1, + "case3": n_case3, + "case2or4": len(case_details["case2or4"]), + }, + "case_details": dict(case_details), + } +) +output.write_h5ad(par["output"], compression="gzip") + +# print(uns_metric_ids, uns_metric_values) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index c424489d..de188e22 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -109,11 +109,13 @@ dependencies: - name: methods/rpca_to_mid - name: methods/cytovi - name: metrics/emd - - name: metrics/n_inconsistent_peaks + # - name: metrics/bras + # - name: metrics/n_inconsistent_peaks + - name: metrics/ratio_inconsistent_peaks - name: metrics/average_batch_r2 - name: metrics/flowsom_mapping_similarity - name: metrics/lisi - - name: metrics/bras + runners: - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index be484ce2..1bb72733 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -40,11 +40,12 @@ methods = [ // construct list of metrics metrics = [ emd, - n_inconsistent_peaks, + // bras, + // n_inconsistent_peaks, + ratio_inconsistent_peaks, average_batch_r2, flowsom_mapping_similarity, - lisi, - bras + lisi ] workflow run_wf {
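To make the case bookkeeping in the new script concrete, the classification rule can be restated as a standalone function. This is a distilled illustration of the logic above, not code from the component; the tuple input and the example values are hypothetical.

```python
# Distilled version of the case classification used by ratio_inconsistent_peaks:
# only (donor, cell type, marker) combinations with consistent peak counts in the
# unintegrated splits enter the denominator (case 1 or 3); of those, the ones whose
# integrated splits disagree are the inconsistent cases (case 3).
import numpy as np


def ratio_inconsistent(cases):
    """cases: iterable of tuples (peaks_u_s1, peaks_u_s2, peaks_s1, peaks_s2)."""
    n_case1 = n_case3 = 0
    for peaks_u_s1, peaks_u_s2, peaks_s1, peaks_s2 in cases:
        if peaks_u_s1 != peaks_u_s2:
            continue  # case 2 or 4: unintegrated data already inconsistent, ignored
        if peaks_s1 != peaks_s2:
            n_case3 += 1  # integration introduced an inconsistency
        else:
            n_case1 += 1  # peak counts stayed consistent
    if n_case1 + n_case3 == 0:
        return np.nan  # only case 2 or 4 found, metric undefined
    return n_case3 / (n_case1 + n_case3)


# Three evaluable cases, one of which becomes inconsistent after integration,
# plus one ignored case: ratio = 1/3.
print(ratio_inconsistent([(2, 2, 2, 2), (1, 1, 1, 2), (2, 2, 2, 2), (1, 2, 1, 1)]))
```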