From 1a2285c56de95b953f4d4141c8417c398b4d85cb Mon Sep 17 00:00:00 2001 From: Bartosz Czech Date: Tue, 14 Apr 2026 07:02:38 +0000 Subject: [PATCH 1/5] refactor: add support for metadata --- R/merge_SE.R | 146 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 136 insertions(+), 10 deletions(-) diff --git a/R/merge_SE.R b/R/merge_SE.R index 5a288c9f..87ff2ef7 100644 --- a/R/merge_SE.R +++ b/R/merge_SE.R @@ -6,12 +6,16 @@ #' that can arise from multiple projects. #' @param discard_keys Character vector of strings that will be discarded #' during creating BumpyMatrix object. +#' @param title String specifying the final DataSetDB title. If NULL, auto-generates. +#' @param description String specifying the final DataSetDB description. If NULL, auto-generates. +#' @param source_name String specifying the standard DSDB source name. If NULL, auto-detects or uses "merged_analysis". +#' @param source_id String specifying the unique DSDB source ID. If NULL, uses "merged_dataset". #' @keywords SE_operators #' #' @examples #' mae1 <- get_synthetic_data("finalMAE_combo_2dose_nonoise") #' mae2 <- get_synthetic_data("finalMAE_combo_2dose_nonoise") -#' merge_MAE(list(mae1 = mae1, mae2 = mae2)) +#' merge_MAE(list(mae1 = mae1, mae2 = mae2), title = "Test", description = "Test MAE") #' #' @return Merged MultiAssayExperiment object. #' @export @@ -25,28 +29,112 @@ merge_MAE <- function(MAElist, "control_type", "iso_level", "conc_1", - "conc_2")) { + "conc_2"), + title = NULL, + description = NULL, + source_name = NULL, + source_id = NULL) { checkmate::assert_list(MAElist, types = "MultiAssayExperiment") + checkmate::assert_string(title, null.ok = TRUE) + checkmate::assert_string(description, null.ok = TRUE) + checkmate::assert_string(source_name, null.ok = TRUE) + checkmate::assert_string(source_id, null.ok = TRUE) experiments <- unique(unlist(lapply(MAElist, names))) merged_SE_assays <- lapply(experiments, function(exp_name) { exp_list <- lapply(MAElist, function(mae) { - if (exp_name %in% names(mae)) { - mae[[exp_name]] - } else { - NULL - } + if (exp_name %in% names(mae)) mae[[exp_name]] else NULL }) exp_list <- exp_list[!vapply(exp_list, is.null, FUN.VALUE = logical(1))] merge_SE(exp_list) }) names(merged_SE_assays) <- experiments + mae_names <- names(MAElist) + if (is.null(mae_names) || all(trimws(mae_names) == "")) { + mae_names <- paste0("Dataset_", seq_along(MAElist)) + } + + all_sources <- list() + original_titles <- c() + + for (mae in MAElist) { + for (exp in names(mae)) { + meta <- as.list(S4Vectors::metadata(mae[[exp]])$experiment_metadata) + if (length(meta) > 0) { + if (is.list(meta$sources)) all_sources <- c(all_sources, meta$sources) + if (!is.null(meta$title)) original_titles <- c(original_titles, meta$title) + } + } + } + + + if (is.null(title)) { + title <- sprintf("Merged MAE: %s", paste(mae_names, collapse = " + ")) + } + + if (is.null(description)) { + description <- sprintf("Synthetically merged dataset originating from: %s.", paste(mae_names, collapse = ", ")) + unique_titles <- unique(original_titles) + if (length(unique_titles) > 0) { + description <- paste0(description, " Original Titles: [", paste(unique_titles, collapse = " | "), "]") + } + } + + if (is.null(source_name)) { + if (length(all_sources) > 0) { + unique_names <- unique(vapply(all_sources, function(s) { + if (!is.null(s$name)) s$name else "unknown" + }, character(1))) + + source_name <- if (length(unique_names) == 1 && unique_names[1] != "unknown") { + unique_names[1] + } else { + "merged_analysis" + } + } else { + source_name <- "merged_analysis" + } + } + + if (is.null(source_id)) { + source_id <- "merged_dataset" + } + + synthetic_experiment_metadata <- list( + title = title, + description = description, + experimentalist = Sys.info()[["user"]], + sources = list(list(name = source_name, id = source_id)) + ) + + for (i in seq_along(merged_SE_assays)) { + meta_list <- as.list(S4Vectors::metadata(merged_SE_assays[[i]])) + meta_list$experiment_metadata <- synthetic_experiment_metadata + S4Vectors::metadata(merged_SE_assays[[i]]) <- meta_list + } + + base_metadata <- as.list(S4Vectors::metadata(MAElist[[1]])) + if (length(base_metadata) == 0) base_metadata <- list() + + if (!is.null(base_metadata$.internal$DataSetDB$dataset)) { + ds_meta <- as.list(base_metadata$.internal$DataSetDB$dataset) + ds_meta$title <- synthetic_experiment_metadata$title + ds_meta$description <- synthetic_experiment_metadata$description + ds_meta$sources <- synthetic_experiment_metadata$sources + + internal_meta <- as.list(base_metadata$.internal) + internal_meta$DataSetDB <- as.list(internal_meta$DataSetDB) + internal_meta$DataSetDB$dataset <- ds_meta + + base_metadata$.internal <- internal_meta + } + MultiAssayExperiment::MultiAssayExperiment( experiments = MultiAssayExperiment::ExperimentList(merged_SE_assays), - metadata = Reduce(c, lapply(MAElist, S4Vectors::metadata)) + metadata = base_metadata ) } @@ -207,7 +295,6 @@ merge_assay <- function(SElist, list(DT = DT, BM = BM) } - #' Identify unique metadata fields from a list of \code{SummarizedExperiment}s #' #' @param SElist named list of \code{SummarizedExperiment}s @@ -233,7 +320,6 @@ identify_unique_se_metadata_fields <- function(SElist) { }))) } - #' Merge metadata #' #' @param SElist named list of \code{SummarizedExperiment}s @@ -261,12 +347,52 @@ merge_metadata <- function(SElist, checkmate::assert_character(metadata_fields) all_metadata <- lapply(metadata_fields, function(x) { + + if (x %in% c("experiment_metadata", ".internal")) { + + valid_metas <- lapply(SElist, function(se) S4Vectors::metadata(se)[[x]]) + valid_metas <- valid_metas[!vapply(valid_metas, is.null, FUN.VALUE = logical(1))] + + if (length(valid_metas) == 0) return(list()) + + if (x == "experiment_metadata") { + synth <- as.list(valid_metas[[1]]) + + all_sources <- list() + for (vm in valid_metas) { + vm_list <- as.list(vm) + if (is.list(vm_list$sources)) all_sources <- c(all_sources, vm_list$sources) + } + + if (length(all_sources) > 0) { + unique_names <- unique(vapply(all_sources, function(s) { + if (!is.null(s$name)) s$name else "unknown" + }, character(1))) + + std_name <- if (length(unique_names) == 1 && unique_names[1] != "unknown") { + unique_names[1] + } else { + "merged_analysis" + } + + synth$sources <- list(list(name = std_name, id = "merged_dataset")) + } else { + synth$sources <- list() + } + + return(synth) + } + + return(as.list(valid_metas[[1]])) + } + do.call(c, lapply(names(SElist), function(SE) { meta <- list(S4Vectors::metadata(SElist[[SE]])[[x]]) names(meta) <- SE meta })) }) + names(all_metadata) <- metadata_fields all_metadata } From 093446db985d4856d0c5a8a77296ac7e5630e0cb Mon Sep 17 00:00:00 2001 From: Bartosz Czech Date: Tue, 14 Apr 2026 07:02:48 +0000 Subject: [PATCH 2/5] doc: reoxygenate --- man/merge_MAE.Rd | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/man/merge_MAE.Rd b/man/merge_MAE.Rd index 91588318..b0744b20 100644 --- a/man/merge_MAE.Rd +++ b/man/merge_MAE.Rd @@ -8,7 +8,11 @@ merge_MAE( MAElist, additional_col_name = "data_source", discard_keys = c("normalization_type", "fit_source", "record_id", "isDay0", "swap_sa", - "control_type", "iso_level", "conc_1", "conc_2") + "control_type", "iso_level", "conc_1", "conc_2"), + title = NULL, + description = NULL, + source_name = NULL, + source_id = NULL ) } \arguments{ @@ -20,6 +24,14 @@ that can arise from multiple projects.} \item{discard_keys}{Character vector of strings that will be discarded during creating BumpyMatrix object.} + +\item{title}{String specifying the final DataSetDB title. If NULL, auto-generates.} + +\item{description}{String specifying the final DataSetDB description. If NULL, auto-generates.} + +\item{source_name}{String specifying the standard DSDB source name. If NULL, auto-detects or uses "merged_analysis".} + +\item{source_id}{String specifying the unique DSDB source ID. If NULL, uses "merged_dataset".} } \value{ Merged MultiAssayExperiment object. @@ -30,7 +42,7 @@ Merge multiple MultiAssayExperiment objects \examples{ mae1 <- get_synthetic_data("finalMAE_combo_2dose_nonoise") mae2 <- get_synthetic_data("finalMAE_combo_2dose_nonoise") -merge_MAE(list(mae1 = mae1, mae2 = mae2)) +merge_MAE(list(mae1 = mae1, mae2 = mae2), title = "Test", description = "Test MAE") } \keyword{SE_operators} From 2a3f3398b3ee9a2ad54e5be9ecc361fa05922548 Mon Sep 17 00:00:00 2001 From: Bartosz Czech Date: Tue, 14 Apr 2026 07:03:01 +0000 Subject: [PATCH 3/5] test: add unit tests --- tests/testthat/test-merge_SE.R | 41 +++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/tests/testthat/test-merge_SE.R b/tests/testthat/test-merge_SE.R index 5be6bbc4..5876c44e 100644 --- a/tests/testthat/test-merge_SE.R +++ b/tests/testthat/test-merge_SE.R @@ -5,7 +5,7 @@ listSE <- lapply(listMAE, function(x) x[[2]]) names(listSE) <- c("combo1", "combo2") listMAE2 <- lapply(list.files(system.file(package = "gDRtestData", "testdata"), - "final", full.names = TRUE)[1:2], qs::qread) + "final", full.names = TRUE)[1:2], qs::qread) listSE2 <- lapply(listMAE, function(x) x[[1]]) names(listSE2) <- c("combo1", "combo2") @@ -28,8 +28,13 @@ test_that("merge_metadata and identify_unique_se_metadata_fields work as expecte metadata_fields <- identify_unique_se_metadata_fields(listSE) mergedMetadata <- merge_metadata(listSE, metadata_fields) expect_identical(names(mergedMetadata), metadata_fields) - expect_identical(names(mergedMetadata$experiment_metadata), names(listSE)) - + + # NEW: experiment_metadata is no longer nested by names(listSE). + # It is now synthesized, so we check for standard keys like 'sources'. + if ("experiment_metadata" %in% names(mergedMetadata)) { + expect_true("sources" %in% names(mergedMetadata$experiment_metadata)) + } + listSE2 <- listSE newMetaName <- "dummy_meta" S4Vectors::metadata(listSE2$combo1)[[newMetaName]] <- list() @@ -45,15 +50,16 @@ test_that("merge_SE works as expected", { checkmate::expect_class(mergedSE$result, "SummarizedExperiment") S4Vectors::metadata(mergedSE$result)[["df_raw_data"]] <- list(NULL) validate_SE(mergedSE$result) + additional_col_name <- "QCS" mergedSE2 <- purrr::quietly(merge_SE)(listSE, additional_col_name) assayNormalized <- convert_se_assay_to_dt(mergedSE2$result, "Metrics") expect_true(additional_col_name %in% names(assayNormalized)) expect_identical(unique(assayNormalized[[additional_col_name]]), names(listSE)) expect_identical(SummarizedExperiment::assayNames(listSE[[1]]), - SummarizedExperiment::assayNames(mergedSE[[1]])) + SummarizedExperiment::assayNames(mergedSE$result)) reset_env_identifiers() - }) +}) test_that("merge_SE works as expected with combo matrix data", { @@ -61,13 +67,14 @@ test_that("merge_SE works as expected with combo matrix data", { checkmate::expect_class(mergedSE$result, "SummarizedExperiment") S4Vectors::metadata(mergedSE$result)[["df_raw_data"]] <- list(NULL) validate_SE(mergedSE$result) + additional_col_name <- "QCS" mergedSE2 <- purrr::quietly(merge_SE)(listSE2, additional_col_name) assayNormalized <- convert_se_assay_to_dt(mergedSE2$result, "Metrics") expect_true(additional_col_name %in% names(assayNormalized)) expect_identical(unique(assayNormalized[[additional_col_name]]), names(listSE)) expect_identical(SummarizedExperiment::assayNames(listSE2[[1]]), - SummarizedExperiment::assayNames(mergedSE[[1]])) + SummarizedExperiment::assayNames(mergedSE$result)) }) test_that("merge_SE works as expected with mixed data types", { @@ -85,10 +92,28 @@ test_that("merge_SE works with data with additional perturbations", { expect_equal(dim(mergedSE$result), c(10, 5)) }) -test_that("merge_MAE works as expected", { - mergedMAE <- purrr::quietly(merge_MAE)(listMAE) +test_that("merge_MAE works as expected with synthetic metadata injection", { + custom_title <- "Unit Test Merged MAE" + custom_source_id <- "test_dataset_001" + + mergedMAE <- purrr::quietly(merge_MAE)( + listMAE, + title = custom_title, + source_id = custom_source_id + ) checkmate::expect_class(mergedMAE$result, "MultiAssayExperiment") validate_MAE(mergedMAE$result) + + mae_meta <- S4Vectors::metadata(mergedMAE$result) + if (!is.null(mae_meta$.internal$DataSetDB$dataset)) { + expect_equal(mae_meta$.internal$DataSetDB$dataset$title, custom_title) + expect_equal(mae_meta$.internal$DataSetDB$dataset$sources[[1]]$id, custom_source_id) + } + + se_meta <- S4Vectors::metadata(mergedMAE$result[[1]])$experiment_metadata + expect_equal(se_meta$title, custom_title) + expect_equal(se_meta$sources[[1]]$id, custom_source_id) + expect_identical( SummarizedExperiment::assayNames(MultiAssayExperiment::experiments(listMAE[[1]])[[1]]), SummarizedExperiment::assayNames(MultiAssayExperiment::experiments(mergedMAE$result)[[1]]) From e38c8e7f8722f466c86ca701dbc2f6cee25e1bbd Mon Sep 17 00:00:00 2001 From: Bartosz Czech Date: Tue, 14 Apr 2026 07:03:07 +0000 Subject: [PATCH 4/5] chore: bump version and update NEWS.md --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 31898b68..d16cf302 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: gDRutils Type: Package Title: A package with helper functions for processing drug response data -Version: 1.9.6 -Date: 2026-03-23 +Version: 1.9.7 +Date: 2026-04-14 Authors@R: c(person("Bartosz", "Czech", role=c("aut"), comment = c(ORCID = "0000-0002-9908-3007")), person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="gladki.arkadiusz@gmail.com", diff --git a/NEWS.md b/NEWS.md index cd269886..a579b6cc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +## gDRutils 1.9.7 - 2026-04-14 +* add support for metadata in `merge_MAE` + ## gDRutils 1.9.6 - 2026-03-23 * `standardize_MAE` standardizes also internal identifiers From 96d8ba1116215c892280be4fdb7c04152792ac87 Mon Sep 17 00:00:00 2001 From: Bartosz Czech Date: Tue, 14 Apr 2026 07:05:16 +0000 Subject: [PATCH 5/5] test: update unit tests --- tests/testthat/test-merge_SE.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/testthat/test-merge_SE.R b/tests/testthat/test-merge_SE.R index 5876c44e..c7d7928b 100644 --- a/tests/testthat/test-merge_SE.R +++ b/tests/testthat/test-merge_SE.R @@ -28,9 +28,7 @@ test_that("merge_metadata and identify_unique_se_metadata_fields work as expecte metadata_fields <- identify_unique_se_metadata_fields(listSE) mergedMetadata <- merge_metadata(listSE, metadata_fields) expect_identical(names(mergedMetadata), metadata_fields) - - # NEW: experiment_metadata is no longer nested by names(listSE). - # It is now synthesized, so we check for standard keys like 'sources'. + if ("experiment_metadata" %in% names(mergedMetadata)) { expect_true("sources" %in% names(mergedMetadata$experiment_metadata)) }