31 commits
65a35a9
first prototype of new implementation
ghar1821 Oct 12, 2025
3996bfa
add scikit tda and persistent peak as alternative output
ghar1821 Oct 12, 2025
4a793f6
update description
ghar1821 Oct 12, 2025
0bb39a0
update workflow script
ghar1821 Oct 12, 2025
d36e9c1
add missing comma - facepalm
ghar1821 Oct 12, 2025
f7be6f2
add small epsilon to harmonypy to fix kmeans bug
ghar1821 Oct 13, 2025
a0740cc
increase bras chunk to 1000
ghar1821 Oct 13, 2025
b6beb83
testing bras with jax gpu
ghar1821 Oct 14, 2025
8e65e03
testing bras with cuda
ghar1821 Oct 14, 2025
9569f6b
missing pip and sci-b metrics *facepalm
ghar1821 Oct 14, 2025
42e8bd2
downgrading image
ghar1821 Oct 14, 2025
a7cef47
update the image name
ghar1821 Oct 14, 2025
913a70b
testing openproblems image
ghar1821 Oct 14, 2025
f8c9a4d
undo changes to run script
ghar1821 Oct 14, 2025
320fd3c
downgrade scib metrics package
ghar1821 Oct 14, 2025
7c98d96
reverting as gpu doesn't work
ghar1821 Oct 14, 2025
d9e146f
testing bin shifting
ghar1821 Nov 1, 2025
ea44a38
increase chunk size for bras
ghar1821 Nov 1, 2025
c50a0c0
small changes to the bin
ghar1821 Nov 3, 2025
695562a
disable metrics
ghar1821 Nov 4, 2025
48fb41c
disable two metrics again
ghar1821 Nov 4, 2025
9563252
update cytovi
ghar1821 Nov 18, 2025
d361dd0
switched training to TF32
ghar1821 Nov 19, 2025
328929e
remove persistent peaks
ghar1821 Nov 20, 2025
51552a4
removed scaling from cytovi
ghar1821 Nov 20, 2025
165dc99
increase batch size
ghar1821 Nov 22, 2025
65e48a6
reduce max epochs and train size
ghar1821 Nov 22, 2025
e94a30e
reverting config to default values
ghar1821 Nov 22, 2025
96d688b
update description
ghar1821 Nov 23, 2025
962143a
Merge branch 'main' into update_n_inconsistent_peak
ghar1821 Nov 24, 2025
ca35329
adding scaling back into cytovi
ghar1821 Nov 24, 2025
5 changes: 2 additions & 3 deletions scripts/run_benchmark/run_full_seqeracloud.sh
@@ -17,16 +17,15 @@ cat > /tmp/params.yaml << HERE
input_states: s3://openproblems-data/resources/task_cyto_batch_integration/datasets/**/state.yaml
rename_keys: 'input_censored_split1:output_censored_split1;input_censored_split2:output_censored_split2;input_unintegrated:output_unintegrated'
output_state: "state.yaml"
settings: '{"metrics_exclude": ["cms"], "methods_include": ["mnnpy", "cytovi"]}'
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_cyto_batch_integration.git \
--revision build/fix_failed_stuff \
--revision build/main \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--params-file /tmp/params.yaml \
--entry-name auto \
--config common/nextflow_helpers/labels_tw.config \
--labels task_cyto_batch_integration,mnnnpy
--labels task_cyto_batch_integration,test_subset
7 changes: 3 additions & 4 deletions src/control_methods/shuffle_integration/config.vsh.yaml
@@ -3,10 +3,9 @@ name: shuffle_integration
label: Shuffle Integration
summary: Randomly shuffle cells in the whole dataset.
description: |
This negative control randomly permutes cell-to-sample (hence batch)
assignments while keeping each cell's measured markers unchanged.
This destroys any biological and batch specific structure but preserves marker expression.

This negative control randomly shuffles all cells in the input data,
destroying any biological structure (e.g., the sample-to-cell mapping or batch assignments).

Purpose:
- Provide a baseline to verify that integration methods outperform
random assignment of cells to batches.
src/control_methods/shuffle_integration_by_batch/config.vsh.yaml
@@ -3,11 +3,9 @@ name: shuffle_integration_by_batch
label: Shuffle Integration — within batches
summary: Randomly reassign cells to any samples within the same batch.
description: |
This negative-control method randomly permutes cell-to-cell type assignments.
Cells remain assigned to their original batch (batch effects preserved).
Within each batch, cells are reassigned to random samples, destroying
biological/sample-specific structure (e.g., KO vs WT differences).

This negative-control method randomly shuffles cells within each batch independently,
destroying the cell-to-sample mapping while preserving batch-specific distributions.

Purpose:
- Evaluate whether an integration method preserves differences between samples
and biological groups while removing batch effects.
src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml
@@ -3,17 +3,21 @@ name: shuffle_integration_by_cell_type
label: Shuffle Integration — within cell type
summary: Randomly reassign cells to other samples within the same cell type
description: |
This negative-control method randomly permutes cell-to-cell type assignments.
Cells will be assigned to any cell types, regardless of their original cell type
or sample of origin or batch of origin.
This negative-control method randomly shuffles cells within each cell type independently,
destroying batch structure while preserving cell-type-specific distributions.
It maintains biological groupings but eliminates batch grouping
within each cell type.

Purpose:
- Evaluate whether an integration method preserves differences between cell types
while removing batch effects.

Example:
- A Neutrophil from a KO sample in batch 1 may be reassigned to any cell type
(B cell, T cell, Monocyte, etc.) from any sample in any batch.
- A Neutrophil in a KO sample from batch 1 may be reassigned to a KO or WT sample
in batch 2, to a different donor or a WT sample within batch 1, or remain in its
original sample, but it will never be reassigned to another cell type.
(A code sketch of this shuffling follows this file's diff.)

# status: disabled
resources:
- type: python_script
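To make these shuffle controls concrete, here is a minimal sketch of group-wise label shuffling on an AnnData object, as referenced in the example above. It assumes obs columns named "sample", "batch", and "cell_type"; the column names and all details are illustrative assumptions, not the components' actual scripts.

import anndata as ad
import numpy as np

rng = np.random.default_rng(0)

def shuffle_labels(adata: ad.AnnData, within: str | None, cols: tuple) -> ad.AnnData:
    """Permute the given obs columns among cells, optionally within groups."""
    adata = adata.copy()
    obs = adata.obs
    if within is None:
        index_groups = [np.arange(adata.n_obs)]  # one group: the whole dataset
    else:
        index_groups = list(obs.groupby(within, observed=True).indices.values())
    for idx in index_groups:
        perm = rng.permutation(idx)
        for col in cols:
            # reassign labels among cells of the same group; marker
            # expression (adata.X) stays attached to each cell
            obs.iloc[idx, obs.columns.get_loc(col)] = obs[col].to_numpy()[perm]
    return adata

adata = ad.read_h5ad("input.h5ad")  # hypothetical input path
whole = shuffle_labels(adata, within=None, cols=("sample", "batch"))  # shuffle_integration
by_batch = shuffle_labels(adata, within="batch", cols=("sample",))  # shuffle_integration_by_batch
by_cell_type = shuffle_labels(adata, within="cell_type", cols=("sample", "batch"))  # this component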
13 changes: 6 additions & 7 deletions src/methods/cytovi/config.vsh.yaml
@@ -38,13 +38,13 @@ arguments:
type: integer
default: 1
description: Number of layers.
- name: --n_clusters
- name: --max_epochs
type: integer
default: 20
description: Number of clusters to use for subsampling.
- name: --subsample_fraction
default: 1000
description: Number of epochs to train the model.
- name: --train_size
type: double
default: 0.5
default: 0.9
description: Fraction of cells used for training.

# Resources required to run the component
@@ -68,11 +68,10 @@ engines:
packages:
- anndata>=0.11.0
- scanpy[skmisc]>=1.10
- scvi-tools==1.4.0
- scvi-tools==1.4.0.post1
- pyyaml
- requests
- jsonschema
- scikit-learn
github:
- openproblems-bio/core#subdirectory=packages/python/openproblems

67 changes: 37 additions & 30 deletions src/methods/cytovi/script.py
@@ -1,25 +1,37 @@
import time

import anndata as ad
import numpy as np
import scvi
import torch
from scvi.external import cytovi
from sklearn.cluster import KMeans
from threadpoolctl import threadpool_limits

# from sklearn.cluster import KMeans
# from threadpoolctl import threadpool_limits

## VIASH START
par = {
"input": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/censored_split2.h5ad",
"output": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/output_cytovi_split2.h5ad",
"n_hidden": 128,
"n_layers": 1,
"n_clusters": 10,
"subsample_fraction": 0.5,
"max_epochs": 1000,
"train_size": 0.9,
}
meta = {"name": "cytovi"}
## VIASH END

# setting calculation to TF32 to speed up training
torch.backends.cuda.matmul.allow_tf32 = True

# increase num workers for data loading
scvi.settings.num_workers = 95

print("Reading and preparing input files", flush=True)
adata = ad.read_h5ad(par["input"])

adata.obs["batch_str"] = adata.obs["batch"].astype(str)
adata.obs["sample_key_str"] = adata.obs["sample"].astype(str)

markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
markers_not_correct = adata.var[~adata.var["to_correct"]].index.to_numpy()
@@ -33,41 +45,36 @@
adata=adata_to_correct,
transformed_layer_key="preprocessed",
batch_key="batch_str",
scaled_layer_key="scaled",
inplace=True,
)

print("Clustering using k-means with k =", par["n_clusters"], flush=True)
# cluster data using Kmeans
with threadpool_limits(limits=1):
adata_to_correct.obs["clusters"] = (
KMeans(n_clusters=par["n_clusters"], random_state=0)
.fit_predict(adata_to_correct.layers["scaled"])
.astype(str)
)
# concatenate obs so we can use it for subsampling
adata_to_correct.obs["sample_cluster"] = (
adata_to_correct.obs["sample"].astype(str) + "_" + adata_to_correct.obs["clusters"]
)
# subsample cells without replacement
print("Subsampling cells", flush=True)
subsampled_cells = adata_to_correct.obs.groupby("sample_cluster")[
"sample_cluster"
].apply(lambda x: x.sample(n=round(len(x) * par["subsample_fraction"]), replace=False))
# need the cell id included in the subsample
subsampled_cells_idx = [x[1] for x in subsampled_cells.index.to_list()]

adata_subsampled = adata_to_correct[subsampled_cells_idx, :].copy()

print(
f"Train CytoVI on subsampled data containing {adata_subsampled.shape[0]} cells",
f"Train CytoVI on {adata_to_correct.shape[0]} cells",
flush=True,
)

cytovi.CYTOVI.setup_anndata(adata_subsampled, layer="scaled", batch_key="batch_str")
cytovi.CYTOVI.setup_anndata(
adata_to_correct,
layer="scaled",
batch_key="batch_str",
sample_key="sample_key_str",
)

model = cytovi.CYTOVI(
adata=adata_subsampled, n_hidden=par["n_hidden"], n_layers=par["n_layers"]
adata_to_correct, n_hidden=par["n_hidden"], n_layers=par["n_layers"]
)

print("Start training CytoVI model", flush=True)

start = time.time()
model.train(
batch_size=8192,
max_epochs=par["max_epochs"],
train_size=par["train_size"],
)
model.train()
end = time.time()
print(f"Training took {end - start:.2f} seconds", flush=True)

# get batch corrected data
print("Correcting data", flush=True)
12 changes: 8 additions & 4 deletions src/methods/harmonypy/script.py
@@ -4,15 +4,16 @@

## VIASH START
par = {
"input": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/censored_split2.h5ad",
"output": "resources_test/task_cyto_batch_integration/mouse_spleen_flow_cytometry_subset/output_harmony_split2.h5ad",
"input": "/Users/putri.g/Documents/cytobenchmark/debug_general/_viash_par/input_1/censored_split1.h5ad",
"output": "/Users/putri.g/Documents/cytobenchmark/debug_general/_viash_par/output_1/output_harmony_split1.h5ad",
}
meta = {"name": "harmonypy"}
## VIASH END

print("Reading and preparing input files", flush=True)
adata = ad.read_h5ad(par["input"])

# harmony can't handle integer batch labels
adata.obs["batch_str"] = adata.obs["batch"].astype(str)

markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
@@ -21,10 +22,13 @@
adata_to_correct = adata[:, markers_to_correct].copy()

print("Run harmony", flush=True)
# harmony can't handle integer batch labels

# TODO: numerical instability in k-means causes problems with harmony,
# so add a very small value to all entries to ensure there are no zeros
epsilon = 1e-20

out = harmonypy.run_harmony(
data_mat=adata_to_correct.layers["preprocessed"],
data_mat=adata_to_correct.layers["preprocessed"] + epsilon,
meta_data=adata_to_correct.obs,
vars_use="batch_str",
)
24 changes: 9 additions & 15 deletions src/metrics/bras/config.vsh.yaml
@@ -1,16 +1,9 @@
# The API specifies which type of component this is.
# It contains specifications for:
# - The input/output files
# - Common parameters
# - A unit test
__merge__: ../../api/comp_metric.yaml

# A unique identifier for your component (required).
# Can contain only lowercase letters or underscores.
name: bras



status: disabled
# Metadata for your component
info:
metrics:
@@ -56,13 +49,6 @@ info:
# Whether a higher value represents a 'better' solution (required)
maximize: true

# Component-specific parameters (optional)
# arguments:
# - name: "--n_neighbors"
# type: "integer"
# default: 5
# description: Number of neighbors to use.

# Resources required to run the component
resources:
# The script of your component (required)
@@ -73,6 +59,14 @@

engines:
# Specifications for the Docker image for this component.
# testing gpu jax version
# - type: docker
# image: openproblems/base_pytorch_nvidia:1.1
# setup:
# - type: python
# packages:
# - jax[cuda_12_pip]
# - scib-metrics~=0.5.6
- type: docker
image: python:3.11
setup:
2 changes: 2 additions & 0 deletions src/metrics/bras/script.py
@@ -57,6 +57,7 @@
labels=ct_labels_s1,
batch=batch_labels_s1,
metric="euclidean",
chunk_size=512,
)

batch_labels_s2 = integrated_s2.obs["batch"].values
@@ -67,6 +68,7 @@
labels=ct_labels_s2,
batch=batch_labels_s2,
metric="euclidean",
chunk_size=512,
)

bras_score = np.mean([bras_s1, bras_s2])
2 changes: 1 addition & 1 deletion src/metrics/n_inconsistent_peaks/config.vsh.yaml
@@ -2,7 +2,7 @@
__merge__: ../../api/comp_metric.yaml

name: n_inconsistent_peaks
# status: disabled
status: disabled

info:
metrics:
70 changes: 70 additions & 0 deletions src/metrics/ratio_inconsistent_peaks/config.vsh.yaml
@@ -0,0 +1,70 @@
# The API specifies which type of component this is.
# It contains specifications for:
# - The input/output files
# - Common parameters
# - A unit test
__merge__: ../../api/comp_metric.yaml

# A unique identifier for your component (required).
# Can contain only lowercase letters or underscores.
name: ratio_inconsistent_peaks

# Metadata for your component
info:
metrics:
# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
- name: ratio_inconsistent_peaks
label: Ratio of inconsistent peaks
summary: "Ratio of the number of cell‑type marker‑expression peaks between unintegrated and batch-integrated data."
description: |
The metric compares the number of cell-type-specific marker-expression peaks between unintegrated and batch-integrated data.
The (cell type) marker expression profiles are first smoothed using kernel density estimation (KDE) (`scipy.stats.gaussian_kde`),
and peaks are then identified using the `scipy.signal.find_peaks` function.
For peak calling, the `prominence` parameter is set to 0.1 and the `height` parameter is set to 0.05 * max_density.
The ratio of inconsistent peaks is defined as the number of cases where the number of peaks differs between the two splits
in the batch-integrated data, divided by the total number of cases.
Cases where the two splits already have different numbers of peaks in the unintegrated data are excluded from the denominator.
A lower score indicates better performance: fewer cases have inconsistent peaks after batch integration.
(A sketch of the peak-counting step follows this file.)

references:
doi:
- 10.1038/s41592-019-0686-2
links:
# URL to the documentation for this metric (required).
documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html#scipy.signal.find_peaks
# URL to the code repository for this metric (required).
repository: https://github.com/scipy/scipy/blob/v1.15.2/scipy/signal/_peak_finding.py#L0-L1
# The minimum possible value for this metric (required)
min: 0
# The maximum possible value for this metric (required)
max: +.inf
# Whether a higher value represents a 'better' solution (required)
maximize: false

# Resources required to run the component
resources:
# The script of your component (required)
- type: python_script
path: script.py
- path: helper.py
- path: /src/utils/helper_functions.py

engines:
# Specifications for the Docker image for this component.
- type: docker
image: openproblems/base_python:1
setup:
- type: python
packages:
- scikit-tda

runners:
# This platform allows running the component natively
- type: executable
# Allows turning the component into a Nextflow module / pipeline.
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
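A minimal sketch of the peak-counting step described above, for illustration. Only the `prominence` and `height` thresholds come from the description; the KDE evaluation grid and the example data are assumptions, not the component's actual script.

import numpy as np
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde

def count_peaks(expression: np.ndarray, grid_size: int = 512) -> int:
    """Count marker-expression peaks after KDE smoothing."""
    grid = np.linspace(expression.min(), expression.max(), grid_size)
    density = gaussian_kde(expression)(grid)
    # thresholds as stated in the description:
    # prominence 0.1, height 0.05 * max density
    peaks, _ = find_peaks(density, prominence=0.1, height=0.05 * density.max())
    return len(peaks)

# Hypothetical case: one (cell type, marker) pair compared across two splits
rng = np.random.default_rng(0)
split1 = rng.normal(0.0, 1.0, 5_000)                    # unimodal
split2 = np.concatenate([rng.normal(-2.0, 0.5, 2_500),
                         rng.normal(2.0, 0.5, 2_500)])  # bimodal
inconsistent = count_peaks(split1) != count_peaks(split2)
print(inconsistent)  # True: this case would count toward the numerator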