diff --git a/R/compute2-extra.R b/R/compute2-extra.R index d852f8da..4e24bc6f 100644 --- a/R/compute2-extra.R +++ b/R/compute2-extra.R @@ -169,7 +169,7 @@ compute_extra <- function(pgx, extra = c( remove(res) message("<<< done!") } - + # This requires libx.dir to be set (or sigdb passed) to find sigdb-.h5 files if ("connectivity" %in% extra) { # try to find sigdb in libx dir if not specified @@ -383,33 +383,35 @@ compute_deconvolution <- function(pgx, rna.counts = pgx$counts, full = FALSE) { #' } #' @export compute_cellcycle_gender <- function(pgx, rna.counts = pgx$counts) { + if (!is.null(pgx$organism)) { - is.human <- (tolower(pgx$organism) == "human") + is.human <- any(tolower(pgx$organism) == "human") } else { - is.human <- (tolower(pgx.getOrganism(pgx)) == "human") + is.human <- any(tolower(pgx.getOrganism(pgx)) == "human") } + if (is.human) { + + species <- unique(pgx$genes$species) + jj <- 1:nrow(pgx$counts) + if (length(species) > 1) jj <- which(tolower(pgx$genes$species) == "human") + rna.counts <- rna.counts[jj, , drop = FALSE] + message("estimating cell cycle (using Seurat)...") pgx$samples$cell.cycle <- NULL pgx$samples$.cell.cycle <- NULL if (!(".cell_cycle" %in% colnames(pgx$samples))) { counts <- rna.counts - ## message("length(unique(rownames(counts))): ", length(unique(rownames(counts)))) - ## message("dim(counts): ", dim(counts)) - ## In multi-species now use symbol, and deduplicate in case - ## use retains feature as "gene_name/rowname" rownames(counts) <- toupper(pgx$genes[rownames(counts), "symbol"]) - counts <- counts[which(!is.na(rownames(counts))), ] + counts <- counts[which(!is.na(rownames(counts))), , drop = FALSE] if (any(duplicated(rownames(counts)))) { message("Deduplicate counts for cell cycle and gender inference") counts <- rowmean(counts, group = rownames(counts)) counts[which(is.nan(counts))] <- NA } - res <- try(pgx.inferCellCyclePhase(counts)) - if (!inherits(res, "try-error")) { - pgx$samples$.cell_cycle <- res - } + res <- try(pgx.inferCellCyclePhase(counts), silent = TRUE) + if (!inherits(res, "try-error")) pgx$samples$.cell_cycle <- res } else { message("cell cycle already estimated. skipping...") } @@ -418,13 +420,16 @@ compute_cellcycle_gender <- function(pgx, rna.counts = pgx$counts) { message("estimating gender...") pgx$samples$.gender <- NULL X <- log2(1 + rna.counts) - gene_symbol <- pgx$genes[rownames(X), "symbol"] # Use gene-symbol also for gender + gene_symbol <- pgx$genes[rownames(X), "symbol"] pgx$samples$.gender <- pgx.inferGender(X, gene_symbol) } else { message("gender already estimated. skipping...") } + } + return(pgx) + } diff --git a/R/pgx-annot.R b/R/pgx-annot.R index 98074a21..5d743eba 100644 --- a/R/pgx-annot.R +++ b/R/pgx-annot.R @@ -100,14 +100,14 @@ getProbeAnnotation <- function(organism, datatype, probetype = "", annot_table = NULL) { + if (is.null(datatype)) datatype <- "unknown" if (is.null(probetype)) probetype <- "unknown" - - unknown.organism <- (tolower(organism) %in% c("no organism", "custom", "unkown")) + + unknown.organism <- all(tolower(organism) %in% c("no organism", "custom", "unkown")) unknown.datatype <- (datatype %in% c("custom", "unkown")) - unknown.probetype <- (probetype %in% c("custom", "unkown")) + unknown.probetype <- all(probetype %in% c("custom", "unkown")) annot.unknown <- unknown.organism || unknown.datatype || unknown.probetype - annot.unknown ## clean probe names probes <- trimws(probes) @@ -117,86 +117,123 @@ getProbeAnnotation <- function(organism, if (!is.null(annot_table)) { rownames(annot_table) <- make_unique(rownames(annot_table)) } + + species <- organism + if (!is.null(annot_table)) { + kk <- intersect(c("organism", "species"), tolower(colnames(annot_table)))[1] + if (length(kk) > 0) species <- as.character(unique(annot_table[,kk])) + } - genes <- NULL - if (annot.unknown) { - # annotation table is mandatory for 'No organism' (until server side - # can handle missing genesets) - info("[getProbeAnnotation] annotating with custom annotation") - genes <- getCustomAnnotation2(probes0, annot_table) - } else if (datatype == "metabolomics") { - dbg("[getProbeAnnotation] annotating for metabolomics") - mx.check <- mx.check_mapping( - probes, - all.db = c("playdata", "annothub", "refmet"), check.first = TRUE - ) - mx.check <- mean(!is.na(mx.check)) > 0.01 - mx.check - if (mx.check) { - ## Directly annotate if probes are recognized - genes <- getMetaboliteAnnotation( - probes, - extra_annot = TRUE, - annot_table = NULL - ) + i=1; annot.list=list() + for(i in 1:length(species)) { + + message("[playbase::getProbeAnnotation] Annotating organism: ", species[i]) + + jj <- 1:length(probes) + if (length(species) > 1) { + jj <- which(as.character(annot_table[,kk]) == species[i]) + } + + genes <- NULL + + if (annot.unknown) { + # annotation table is mandatory for 'No organism' (until server side + # can handle missing genesets) + info("[getProbeAnnotation] annotating with custom annotation") + genes <- getCustomAnnotation2(probes0[jj], annot_table[jj, , drop = FALSE]) + } else if (datatype == "metabolomics") { + dbg("[getProbeAnnotation] annotating for metabolomics") + all.db <- c("playdata", "annothub", "refmet") + mx.check <- mx.check_mapping(probes[jj], all.db = all.db, check.first = TRUE) + mx.check <- mean(!is.na(mx.check)) > 0.01 + if (mx.check) { + ## Directly annotate if probes are recognized + genes <- getMetaboliteAnnotation(probes[jj], extra_annot = TRUE, annot_table = NULL) + } else { + ## Fallback on custom + dbg("[getProbeAnnotation] WARNING: not able to map metabolomics probes") + } + } else if (datatype == "multi-omics") { + dbg("[getProbeAnnotation] annotating for multi-omics") + genes <- getMultiOmicsProbeAnnotation(species[i], probes[jj]) } else { - ## Fallback on custom - dbg("[getProbeAnnotation] WARNING: not able to map metabolomics probes") + dbg("[getProbeAnnotation] annotating for transcriptomics") + genes <- getGeneAnnotation(organism = species[i], probes = probes[jj]) + } + + if (is.null(genes)) { + dbg("[getProbeAnnotation] WARNING: fallback to UNKNOWN probes") + genes <- getCustomAnnotation(probes0[jj], custom_annot = NULL) } - } else if (datatype == "multi-omics") { - dbg("[getProbeAnnotation] annotating for multi-omics") - genes <- getMultiOmicsProbeAnnotation(organism, probes) - } else { - dbg("[getProbeAnnotation] annotating for transcriptomics") - genes <- getGeneAnnotation(organism = organism, probes = probes) - } - ## final fallback is genes==NULL - if (is.null(genes)) { - dbg("[getProbeAnnotation] WARNING: fallback to UNKNOWN probes") - genes <- getCustomAnnotation(probes0, custom_annot = NULL) - } - - ## if annot_table is provided we (priority) override our annotation - ## and append any extra columns. - if (!is.null(genes) && !is.null(annot_table)) { - dbg("[getProbeAnnotation] merging custom annotation table") - colnames(annot_table) <- sub("^ortholog$", "human_ortholog", - colnames(annot_table), - ignore.case = TRUE - ) - colnames(annot_table) <- sub("^Symbol$|^gene$|^gene_name$", "symbol", - colnames(annot_table), - ignore.case = TRUE - ) - genes <- merge_annot_table(genes, annot_table, priority = 2) - } + ## if annot_table is provided we (priority) override our annotation + ## and append any extra columns. + if (!is.null(genes) && !is.null(annot_table)) { + dbg("[getProbeAnnotation] merging custom annotation table") + cl <- sub("^ortholog$", "human_ortholog", colnames(annot_table), ignore.case = TRUE) + colnames(annot_table) <- cl + cl <- sub("^Symbol$|^gene$|^gene_name$", "symbol", colnames(annot_table), ignore.case = TRUE) + colnames(annot_table) <- cl + genes <- merge_annot_table(genes, annot_table[jj, , drop = FALSE], priority = 2) + } - ## ensure full dimensions - genes <- genes[match(probes, genes$feature),] + ## ensure full dimensions; + ## restore original probe names; + ## clean up entries and reorder columns + genes <- genes[match(probes[jj], genes$feature), , drop = FALSE] + rownames(genes) <- probes0[jj] + genes <- cleanupAnnotation(genes) - ## restore original probe names - rownames(genes) <- probes0 + cm <- intersect(c("organism", "species"), tolower(colnames(genes)))[1] + if (length(cm) == 0) genes$species <- species[i] + annot.list[[species[i]]] <- genes + rm(genes); gc() - ## cleanup entries and reorder columns - genes <- cleanupAnnotation(genes) + message("getProbeAnnotation] Annotation for organism: ", species[i], " completed\n\n") + } + + ## Conform + kk <- lapply(annot.list, colnames) + kk <- unique(unlist(unname(kk))) + for(i in 1:length(annot.list)) { + add <- setdiff(kk, colnames(annot.list[[i]])) + if (length(add) > 0) { + annot.list[[i]] <- cbind(annot.list[[i]], NA) + colnames(annot.list[[i]])[ncol(annot.list[[i]])] <- add + } + } + + genes <- do.call(rbind, annot.list) + genes <- genes[, colnames(genes) != "row.names", drop = FALSE] + ss <- paste(paste0(unique(species), "."), collapse = "|") + rownames(genes) <- gsub(ss, "", rownames(genes)) + rm(annot.list); gc() + return(genes) + } #' Get gene annotation data using annothub or orthogene. #' #' @export -getGeneAnnotation <- function( - organism, - probes, - use.ah = NULL, - verbose = TRUE, - methods = c("annothub", "gprofiler")) { - if (tolower(organism) == "human") organism <- "Homo sapiens" - if (tolower(organism) == "mouse") organism <- "Mus musculus" - if (tolower(organism) == "rat") organism <- "Rattus norvegicus" - if (tolower(organism) == "dog") organism <- "Canis familiaris" +getGeneAnnotation <- function(organism, + probes, + use.ah = NULL, + verbose = TRUE, + methods = c("annothub", "gprofiler")) { + + if (any(tolower(organism) == "human")) + organism[tolower(organism) == "human"] <- "Homo sapiens" + + if (any(tolower(organism) == "mouse")) + organism[tolower(organism) == "mouse"] <- "Mus musculus" + + if (any(tolower(organism) == "rat")) + organism[tolower(organism) == "rat"] <- "Rattus norvegicus" + + if (any(tolower(organism) == "dog")) + organism[tolower(organism) == "dog"] <- "Canis familiaris" probes <- trimws(probes) probes[probes == "" | is.na(probes)] <- "NA" @@ -206,14 +243,14 @@ getGeneAnnotation <- function( probes <- sub("^[a-zA-Z0-9]+:", "", probes) } - # init empty (all missings) annot <- data.frame(feature = probes, stringsAsFactors = FALSE) missing <- rep(TRUE, length(probes)) - + for (method in methods) { + if (any(missing)) { - # annotation for current method missing_probes <- probes[which(missing)] + missing_annot <- try(switch(method, "annothub" = getGeneAnnotation.ANNOTHUB( organism = organism, @@ -229,8 +266,10 @@ getGeneAnnotation <- function( stop("Unknown method: ", method) )) - annot_ok <- !inherits(missing_annot, "try-error") && - !is.null(missing_annot) && nrow(missing_annot) > 0 + c1 <- !inherits(missing_annot, "try-error") + c2 <- !is.null(missing_annot) + c3 <- nrow(missing_annot) > 0 + annot_ok <- c1 && c2 && c3 if (annot_ok) { # not all methods have the same columns new_cols <- setdiff(colnames(missing_annot), colnames(annot)) @@ -241,22 +280,21 @@ getGeneAnnotation <- function( annot[missing, ] <- mm[, colnames(annot)] missing <- is.na(annot$symbol) | annot$symbol == "" } + } + } if (all(missing)) { # unsuccessful annotation - message("[getGeneAnnotation] WARNING. all missing??? missing.ratio=", mean(missing)) + message("[getGeneAnnotation] WARNING: all missing. Missing.ratio=", mean(missing)) annot <- NULL } - ## clean up - if (!is.null(annot)) { - annot <- cleanupAnnotation(annot) - } + if (!is.null(annot)) annot <- cleanupAnnotation(annot) return(annot) -} +} #' Get gene annotation data using AnnotationHub #' @@ -295,13 +333,13 @@ getGeneAnnotation <- function( #' head(result) #' } #' @export -getGeneAnnotation.ANNOTHUB <- function( - organism, - probes, - use.ah = NULL, - probe_type = NULL, - second.pass = TRUE, - verbose = TRUE) { +getGeneAnnotation.ANNOTHUB <- function(organism, + probes, + use.ah = NULL, + probe_type = NULL, + second.pass = TRUE, + verbose = TRUE) { + if (is.null(organism)) { warning("[getGeneAnnotation.ANNOTHUB] Please specify organism") return(NULL) diff --git a/R/pgx-compute.R b/R/pgx-compute.R index 84d35ba1..753fca52 100644 --- a/R/pgx-compute.R +++ b/R/pgx-compute.R @@ -201,10 +201,8 @@ pgx.createPGX <- function(counts, if (is.null(samples)) stop("[pgx.createPGX] FATAL: samples must be provided") if (is.null(organism)) stop("[pgx.createPGX] FATAL: organism must be provided") - message("[pgx.createPGX] dim.counts: ", dim(counts)[1], " x ", dim(counts)[2]) - message("[pgx.createPGX] class.counts: ", class(counts)) - message("[pgx.createPGX] counts has ", sum(is.na(counts)), " missing values") - + message("[pgx.createPGX] dim.counts: ", nrow(counts), "x" ,ncol(counts)) + ndup <- sum(duplicated(rownames(counts))) if (ndup > 0) { if (average.duplicated) { @@ -248,15 +246,13 @@ pgx.createPGX <- function(counts, if (nrow(annot_table) != nrow(counts)) { message("[pgx.createPGX] WARNING: annot_table has different nrows. forcing dimensions.") ii <- match(rownames(counts), rownames(annot_table)) - annot_table <- annot_table[ii, ] + annot_table <- annot_table[ii, , drop = FALSE] rownames(annot_table) <- rownames(counts) } } - if (sum(is.na(X)) > 0) { - message("[pgx.createPGX] X has ", sum(is.na(X)), " missing values") - } - + message("[pgx.createPGX] X has ", sum(is.na(X)), " missing values") + if (!is.null(X) && !all(dim(counts) == dim(X))) { stop("[pgx.createPGX] dimension of counts and X do not match\n") } @@ -282,9 +278,7 @@ pgx.createPGX <- function(counts, contrasts <- contrasts.convertToLabelMatrix(contrasts, samples) contrasts <- fixContrastMatrix(contrasts) - ## --------------------------------------------------------------------- - ## Time series conducted if user checked the box during upload - ## --------------------------------------------------------------------- + ## Time series if (dotimeseries) contrasts <- contrasts.addTimeInteraction(contrasts, samples) ## ------------------------------------------------------------------- @@ -376,7 +370,7 @@ pgx.createPGX <- function(counts, pgx <- list( name = name, organism = organism, - version = packageVersion("playbase"), # useless, just keep for back compatibility + version = packageVersion("playbase"), date = format(Sys.time(), "%Y-%m-%d %H:%M:%S"), creator = creator, datatype = datatype, @@ -398,16 +392,20 @@ pgx.createPGX <- function(counts, ## ------------------------------------------------------------------- pgx$genes <- NULL pgx$probe_type <- probe_type - - message("[createPGX] annotating genes") - pgx$genes <- getProbeAnnotation( + + message("[createPGX] Annotating genes: getProbeAnnotation") + pgx$genes <- getProbeAnnotation( organism = pgx$organism, probes = rownames(pgx$counts), datatype = pgx$datatype, probetype = pgx$probe_type, annot_table = annot_table ) + message("[createPGX] Annotating genes: process completed") + ## Conform feature names + rownames(pgx$counts) <- rownames(pgx$X) <- rownames(pgx$genes) + if (is.null(pgx$genes)) stop("[pgx.createPGX] FATAL: Could not build gene annotation") if (!"symbol" %in% colnames(pgx$genes) && "gene_name" %in% colnames(pgx$genes)) { @@ -489,27 +487,75 @@ pgx.createPGX <- function(counts, rownames(pgx$X) <- new.names } + ##-------------------------------------- + ## Multispecies data: append species ID + ##-------------------------------------- + if (length(unique(pgx$genes$species)) > 1) { + ff <- paste0(rownames(pgx$counts), sep = "_", as.character(pgx$genes$species)) + rownames(pgx$genes) <- pgx$genes$feature <- pgx$genes$gene_name <- ff + rownames(pgx$counts) <- rownames(pgx$X) <- ff + } + ## ------------------------------------------------------------------- ## Infer cell cycle/gender here (before any batchcorrection) ## ------------------------------------------------------------------- - info("[createPGX] infer cell cycle") + info("[createPGX] infer cell cycle and gender") pgx <- compute_cellcycle_gender(pgx, pgx$counts) + + species <- unique(pgx$organism) + norm_cols <- length(species) == 1 + i=1; LL=list() + for (i in 1:length(species)) { + + pgx0 <- pgx + jj <- 1:nrow(pgx$genes) + if (length(species) > 1) jj <- which(pgx0$genes$species == species[i]) + pgx0$organism <- species[i] + pgx0$counts <- pgx0$counts[jj, , drop = FALSE] + pgx0$X <- pgx0$X[jj, , drop = FALSE] + pgx0$genes <- pgx0$genes[jj, , drop = FALSE] + + ## ------------------------------------------------------------------- + ## Add GMT + ## ------------------------------------------------------------------- + ## create empty GMT if: no organism & no custom annot table & no custom geneset. + unknown.organism <- (species[i] %in% c("No organism", "custom", "unkown")) + unknown.datatype <- (pgx0$datatype %in% c("custom", "unkown")) + no3 <- unknown.organism && is.null(annot_table) && is.null(custom.geneset) + if (no3 || unknown.datatype || !add.gmt) { + message("[pgx.createPGX] WARNING: empty GMT matrix. No gene sets for ", species[i]) + pgx0$GMT <- Matrix::Matrix(0, nrow = 0, ncol = 0, sparse = TRUE) + } else { + message("[pgx.createPGX] Adding GMT for ", species[i]) + pgx0 <- pgx.add_GMT(pgx = pgx0, custom.geneset = custom.geneset, + max.genesets = max.genesets, normalize_cols = norm_cols) + } + + LL[[species[i]]] <- list(GMT = pgx0$GMT, custom.geneset = pgx0$custom.geneset) + rm(pgx0); gc() - ## ------------------------------------------------------------------- - ## Add GMT - ## ------------------------------------------------------------------- - ## If no organism, no custom annotation table and no custom geneset, - ## then create empty GMT - unknown.organism <- (pgx$organism %in% c("No organism", "custom", "unkown")) - unknown.datatype <- (pgx$datatype %in% c("custom", "unkown")) - no3 <- unknown.organism && is.null(annot_table) && is.null(custom.geneset) - if (no3 || unknown.datatype || !add.gmt) { - message("[pgx.createPGX] WARNING: empty GMT matrix. No gene sets. ") - pgx$GMT <- Matrix::Matrix(0, nrow = 0, ncol = 0, sparse = TRUE) - } else { - pgx <- pgx.add_GMT(pgx = pgx, custom.geneset = custom.geneset, max.genesets = max.genesets) } + GMT <- LL[[1]]$GMT + custom.geneset <- LL[[1]]$custom.geneset + if (length(LL) > 1) { + nfeatures <- unname(unlist(lapply(LL, function(x) rownames(x$GMT)))) + ngsets <- unname(unlist(lapply(LL, function(x) colnames(x$GMT)))) + GMT <- matrix(0, nrow = length(nfeatures), ncol = length(ngsets)) + rownames(GMT) <- nfeatures + colnames(GMT) <- ngsets + i=1 + for(i in 1:length(LL)) { + ii <- match(rownames(LL[[i]]$GMT), nfeatures) + kk <- match(colnames(LL[[i]]$GMT), ngsets) + GMT[ii, kk] <- as.matrix(LL[[i]]$GMT) + } + GMT <- normalize_cols(as(GMT, "sparseMatrix")) + } + pgx$GMT <- GMT + pgx$custom.geneset <- custom.geneset + rm(LL); gc() + ## -------------------------------- ## rm NA contrasts ## -------------------------------- @@ -589,6 +635,7 @@ pgx.createPGX <- function(counts, message("\n\n") return(pgx) + } @@ -655,7 +702,7 @@ pgx.computePGX <- function(pgx, message("\n") message("[pgx.computePGX] Starting pgx.computePGX") message("\n") - + if (!"contrasts" %in% names(pgx)) { stop("[pgx.computePGX] FATAL:: no contrasts in object") } @@ -799,8 +846,9 @@ pgx.computePGX <- function(pgx, ## ------------------ gene set tests ----------------------- if (!is.null(progress)) progress$inc(0.2, detail = "testing gene sets") - if ((pgx$organism != "No organism" && !is.null(pgx$GMT) && nrow(pgx$GMT) > 0) || - (pgx$organism == "No organism" && !is.null(custom.geneset$gmt))) { + c1 <- (any(pgx$organism != "No organism") && !is.null(pgx$GMT) && nrow(pgx$GMT) > 0) + c2 <- (any(pgx$organism == "No organism") && !is.null(custom.geneset$gmt)) + if (c1 || c2) { message("[pgx.computePGX] testing genesets...") pgx <- compute_testGenesets( @@ -961,7 +1009,7 @@ pgx.filterLowExpressed <- function(pgx, prior.cpm = 1) { pgx$counts <- pgx$counts[keep, , drop = FALSE] message("filtering out ", sum(!keep), " low-expressed genes") message("keeping ", sum(keep), " expressed genes") - if (!is.null(pgx$X)) { + if (!is.null(pgx$gX)) { ## WARNING: counts and X should match dimensions. pgx$X <- pgx$X[which(keep), , drop = FALSE] } @@ -1008,8 +1056,11 @@ pgx.filterLowExpressed <- function(pgx, prior.cpm = 1) { return(G) } -pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { - +pgx.add_GMT <- function(pgx, + custom.geneset = NULL, + max.genesets = 20000, + normalize_cols = TRUE) { + if (!"symbol" %in% colnames(pgx$genes)) { message(paste( "[pgx.add_GMT] ERROR: could not find 'symbol' column.", @@ -1023,8 +1074,7 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { ## ----------------------------------------------------------- message("[pgx.add_GMT] Creating GMT matrix... ") - # Load geneset matrix from playdata. add metabolomics if data.type - # is metabolomics + # Load geneset matrix from playdata. add metabolomics if data.type==metabolomics target <- c("human_ortholog", "symbol", "gene_name", "rownames") ortho.col <- intersect(target, colnames(pgx$genes)) if (length(ortho.col) == 0) { @@ -1075,9 +1125,11 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { # create a feature list that will be used to filter and reduce dimensions of G full_feature_list <- c( - pgx$genes$human_ortholog, pgx$genes$symbol, + pgx$genes$human_ortholog, + pgx$genes$symbol, rownames(pgx$genes) ) + full_feature_list <- full_feature_list[!is.na(full_feature_list)] full_feature_list <- full_feature_list[full_feature_list != ""] full_feature_list <- unique(full_feature_list) @@ -1087,7 +1139,6 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { G <- G[, Matrix::colSums(G != 0) > 0, drop = FALSE] if (nrow(G) == 0 || ncol(G) == 0) G <- NULL } - ## Add organism specific GO gene sets. This is species gene symbol. if (has.px2) { @@ -1095,12 +1146,12 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { go.genesets <- NULL info("[pgx.add_GMT] Adding species GO for organism", pgx$organism) go.genesets <- tryCatch( - { - getOrganismGO(pgx$organism) - }, - error = function(e) { - message("Error in getOrganismsGO:", e) - } + { + getOrganismGO(pgx$organism) + }, + error = function(e) { + message("Error in getOrganismsGO:", e) + } ) if (!is.null(go.genesets)) { @@ -1114,7 +1165,7 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { G <- .append_gmt_to_matrix(go.genesets, G, all_genes, minsize=15, maxsize=400) } ## end-if go.genesets } ## end-if !metabolics - + ## Add custom gene sets if provided if (!is.null(custom.geneset$gmt)) { message("[pgx.add_GMT] Adding custom genesets...") @@ -1124,7 +1175,7 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { all_genes <- unique(pgx$genes$symbol) G <- .append_gmt_to_matrix(customG, G, all_genes, minsize=3, maxsize=9999) } - + ## ----------------------------------------------------------- ## Prioritize gene sets by fast rank-correlation ## ----------------------------------------------------------- @@ -1135,8 +1186,11 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { if (is.null(max.genesets)) max.genesets <- 20000 if (max.genesets < 0) max.genesets <- 20000 + if (!is.null(G) && ncol(G) > max.genesets) { + message("[pgx.add_GMT] Matching gene set matrix...") + # we use SYMBOL as rownames gX <- pgx$X if (!all(rownames(gX) %in% pgx$genes$symbol)) { @@ -1201,6 +1255,7 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { jj <- jj[order(colnames(G)[jj])] ## sort alphabetically G <- G[, jj, drop = FALSE] rm(gsetX.bygroup, gsetX) + } ## ----------------------------------------------------------------------- @@ -1209,6 +1264,7 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { # final check: drop genesets in G based on geneset size if (!is.null(G)) { + gmt.size <- Matrix::colSums(G != 0) has.metabolites <- sum(grepl("^[0-9]+$|CHEBI|LIPID", rownames(G))) >= 10 has.metabolites @@ -1224,16 +1280,20 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { # add all custom genesets to size.ok idx_custom_gmt <- grep("CUSTOM", colnames(G)) + # make sure we dont miss CUSTOM genesets due to size.ok exclusion if (length(idx_custom_gmt) > 0) { names(idx_custom_gmt) <- colnames(G)[idx_custom_gmt] size.ok <- union(size.ok, idx_custom_gmt) } + G <- G[, size.ok, drop = FALSE] + } # add random genesets if G is too small if (is.null(G) || ncol(G) < 30 || nrow(G) < 3) { + add.gmt <- NULL rr <- sample(3:400, 50) gg <- pgx$genes$symbol @@ -1254,19 +1314,18 @@ pgx.add_GMT <- function(pgx, custom.geneset = NULL, max.genesets = 20000) { ) } - + # normalize columns (required for some methods downstream)log2foldchange - G <- normalize_cols(G) + if (normalize_cols) G <- normalize_cols(G) pgx$GMT <- G pgx$custom.geneset <- custom.geneset message(glue::glue("[pgx.add_GMT] Final GMT: {nrow(G)} x {ncol(G)}")) - rm(G) + rm(G); gc() - gc() return(pgx) -} +} ## ---------------------------------------------------------------------- diff --git a/R/pgx-functions.R b/R/pgx-functions.R index d4ef4cda..70259f79 100644 --- a/R/pgx-functions.R +++ b/R/pgx-functions.R @@ -1092,9 +1092,9 @@ pgx.getOrganism <- function(pgx, capitalise = FALSE) { org <- ifelse(is.mouse, "mouse", "human") } - if (capitalise && org %in% c("mouse", "human")) { - if (org == "mouse") org <- "Mouse" - if (org == "human") org <- "Human" + if (capitalise && any(org %in% c("mouse", "human"))) { + if (any(org == "mouse")) org[which(org == "mouse")] <- "Mouse" + if (any(org == "human")) org[which(org == "human")] <- "Human" } return(org) } diff --git a/R/pgx-init.R b/R/pgx-init.R index ab7cba64..6471fbb5 100644 --- a/R/pgx-init.R +++ b/R/pgx-init.R @@ -212,7 +212,7 @@ pgx.initialize <- function(pgx) { } } - if (pgx$organism %in% c("Human", "human") | !is.null(pgx$version)) { + if (any(pgx$organism %in% c("Human", "human")) | !is.null(pgx$version)) { pgx$families <- lapply(playdata::FAMILIES, function(x) { intersect(x, genes) })