diff --git a/.gitignore b/.gitignore index 5b6a065..16b31c3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,7 @@ .Rhistory .RData .Ruserdata +.pre-commit-config.yaml +.ignore_spelling.txt +.flake8 +.lintr diff --git a/.ignore_spelling.txt b/.ignore_spelling.txt new file mode 100644 index 0000000..e69de29 diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..d8bf25c --- /dev/null +++ b/.lintr @@ -0,0 +1 @@ +linters: linters_with_defaults(object_name_linter("dotted.case"), line_length_linter(132), object_usage_linter=NULL, cyclocomp_linter(20)) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3588136..efdcc4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### Fixed +- linting and spelling errors resolved with pre-commit usage. ### Deprecated diff --git a/R/encoding.R b/R/encoding.R index cf3f6d2..fe8f5cc 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -1,3 +1,31 @@ +#' For affecteds: Take genetic variant and determine the category of the combo. +#' @param variant Variant for individual. genotypes, phased genotypes, or binary encodings accepted. +assign.a <- function(variant) { + alt.codes <- c("0/1", "1/1", "1", "0|1", "1|0", "1|1") + ref.codes <- c("0/0", "0", "0|0") + if (variant %in% alt.codes) { + return("A.c") + } else if (variant %in% ref.codes) { + return("A.i") + } else { + stop("Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'") + } +} + +#' For unaffecteds: Take a genetic variant and determine the category of the combo. +#' @param variant Variant for individual. genotypes, phased genotypes, or binary encodings accepted. +assign.u <- function(variant) { + alt.codes <- c("0/1", "1/1", "1", "0|1", "1|0", "1|1") + ref.codes <- c("0/0", "0", "0|0") + + if (variant %in% alt.codes) { + return("U.i") + } else if (variant %in% ref.codes) { + return("U.c") + } else { + stop("Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'") + } +} #' Take a disease status and a genetic variant and determine which category the combo falls in. @@ -19,38 +47,32 @@ #' assign.status("A", "0/1") == "A.c" #' assign.status("A", "0|0") == "A.i" #' assign.status("U", 1) == "U.i" -#' assign.status("U", "0|0") =="U.c" +#' assign.status("U", "0|0") == "U.c" #' @export -assign.status <- function(status, variant, theoretical.max=FALSE){ - - var.err<-"Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'" - if(status == "A"){ - if(theoretical.max){ - return("A.c") - } - #NOTE - Once in a while 1/0 genotypes crop up; also 0/2 etc. if derived from multi-allelics. This edge case not covered at present. - else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ +assign.status <- function(status, variant, theoretical.max = FALSE) { + var.err <- "Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'" + if (status == "A") { + if (theoretical.max) { return("A.c") - }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ - return("A.i") - }else{ - stop(var.err) + # NOTE - Once in a while 1/0 genotypes crop up; also 0/2 etc. if derived from multi-allelics. + # This edge case not covered at present. + } else { + assign.a(variant) } - }else if (status == "U"){ - if(theoretical.max){ - return("U.c") - } else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ - return("U.i") - }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ + } else if (status == "U") { + if (theoretical.max) { return("U.c") - }else{ - stop(var.err) + } else { + assign.u(variant) } - }else{ + } else { stop("Status must be one of: U or A") } } + + + #' Take the dataframe with variants and status and determine which indivudals #' are scored correctly and which are scored incorrectly. #' Assign an A.c, A.i, U.c, U.i, unk @@ -70,23 +92,21 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ #' Default is FALSE. #' @return Copy of input dataframe, with dataframe with the status categroies added as a new column "statvar.cat" #' @examples -#' #TODO - add +#' # TODO - add #' @export -score.variant.status <- function(indiv.df, theoretical.max=FALSE){ - - #when encoding theoretical max, dummy perfect associating variant generated to see what a family could score. - if(theoretical.max){ - indiv.df$statvar.cat <- unlist(lapply(1:nrow(indiv.df), function(i){ - assign.status(indiv.df$status[[i]], indiv.df$variant[[i]] , theoretical.max=TRUE ) +score.variant.status <- function(indiv.df, theoretical.max = FALSE) { + # when encoding theoretical max, dummy perfect associating variant generated to see what a family could score. + if (theoretical.max) { + indiv.df$statvar.cat <- unlist(lapply(seq_len(nrow(indiv.df)), function(i) { + assign.status(indiv.df$status[[i]], indiv.df$variant[[i]], theoretical.max = TRUE) })) - - }else{ - indiv.df$statvar.cat <- unlist(lapply(1:nrow(indiv.df), function(i){ - assign.status(indiv.df$status[[i]], indiv.df$variant[[i]] ) + } else { + indiv.df$statvar.cat <- unlist(lapply(seq_len(nrow(indiv.df)), function(i) { + assign.status(indiv.df$status[[i]], indiv.df$variant[[i]]) })) } -return(indiv.df) + return(indiv.df) } @@ -99,23 +119,22 @@ return(indiv.df) #' #' @return A list with the categorized relationship/variant information. #' @export -build.relation.dict <- function( mat.row, name.stat.dict, drop.unrelated=TRUE){ - indiv.rels = list( +build.relation.dict <- function(mat.row, name.stat.dict, drop.unrelated = TRUE) { + indiv.rels <- list( "A.c" = c(), "A.i" = c(), "U.c" = c(), "U.i" = c() ) - for(i in seq_along(mat.row)){ - + for (i in seq_along(mat.row)) { status.i <- name.stat.dict[[names(mat.row)[[i]]]] rel.i <- mat.row[[i]] - if (rel.i != -1 || drop.unrelated == FALSE){ + if (rel.i != -1 || drop.unrelated == FALSE) { indiv.rels[[status.i]] <- c(indiv.rels[[status.i]], rel.i) } - } + } return(indiv.rels) } @@ -129,20 +148,15 @@ build.relation.dict <- function( mat.row, name.stat.dict, drop.unrelated=TRUE){ #' @return A dictionary with the per-individual relationship lists. #' One value for each row of the matrix. #' @export -encode.rows <- function(relation.mat, status.df, ...){ - +encode.rows <- function(relation.mat, status.df, ...) { name.stat.dict <- status.df$statvar.cat names(name.stat.dict) <- status.df$name - score.dicts <- lapply(1:nrow(relation.mat), function(i){ - build.relation.dict(relation.mat[i,], name.stat.dict) + score.dicts <- lapply(seq_len(nrow(relation.mat)), function(i) { + build.relation.dict(relation.mat[i, ], name.stat.dict) }) names(score.dicts) <- colnames(relation.mat) return(score.dicts) } - - - - diff --git a/R/relatedness.r b/R/relatedness.r index b12974e..59693ae 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -1,5 +1,3 @@ - - #' Calculate a relatedness-weighted score for a given rare variant. #' #' These scores can be used to compare variants of interest within a family. @@ -59,60 +57,61 @@ #' @return A list with three components: score, score.for, score.against. #' #' @examples -#' relations<-list("A.c" = c(0, 1, 3, 1),"A.i" = c(3),"U.c" = c(1, 2),"U.i" = c(1)) +#' relations <- list("A.c" = c(0, 1, 3, 1), "A.i" = c(3), "U.c" = c(1, 2), "U.i" = c(1)) #' rv.scores <- calc.rv.score(relations) #' @export -calc.rv.score <- function(fam.list, affected.weight=1, unaffected.weight=0.5, unaffected.max=8, max.err=4){ +calc.rv.score <- function(fam.list, affected.weight = 1, unaffected.weight = 0.5, unaffected.max = 8, max.err = 4) { + relatedness <- list() - relatedness = list() - - for(i in 0:8){ - relatedness[i+1] = 1 / (2 ** (i)) - } + for (i in 0:8) { + relatedness[i + 1] <- 1 / (2^(i)) + } - score.dict = list() - - score.dict[["A.c"]] - score.dict[["A.i"]] - score.dict[["U.c"]] - score.dict[["U.i"]] - - #TODO - change this to apply the different formula for the unaffecteds - for (n in names(fam.list)){ - scores = c() - if(n == "A.c" || n == "A.i"){ - for (x in fam.list[[n]]){ - #for affected, importance score gets higher the further from reference individual you get. - scores = c(scores, 1/relatedness[[x+1]]) - } - }else if ( n=="U.c" || n == "U.i"){ - for (x in fam.list[[n]]){ - #for unaffected, importance score gets lower the further from reference individual you get. - scores = c(scores, (unaffected.max*2) * relatedness[[x+1]]) - } - } - score.dict[[n]] = scores + score.dict <- list() + + score.dict[["A.c"]] + score.dict[["A.i"]] + score.dict[["U.c"]] + score.dict[["U.i"]] + + # TODO - change this to apply the different formula for the unaffecteds + for (n in names(fam.list)) { + scores <- c() + if (n == "A.c" || n == "A.i") { + for (x in fam.list[[n]]) { + # for affected, importance score gets higher the further from reference individual you get. + scores <- c(scores, 1 / relatedness[[x + 1]]) + } + } else if (n == "U.c" || n == "U.i") { + for (x in fam.list[[n]]) { + # for unaffected, importance score gets lower the further from reference individual you get. + scores <- c(scores, (unaffected.max * 2) * relatedness[[x + 1]]) + } } + score.dict[[n]] <- scores + } - weighted.for = sum(score.dict[["A.c"]]*affected.weight ) + - sum(score.dict[["U.c"]]*unaffected.weight) + weighted.for <- sum(score.dict[["A.c"]] * affected.weight) + + sum(score.dict[["U.c"]] * unaffected.weight) - weighted.against = sum(score.dict[["A.i"]]*affected.weight ) + - sum(score.dict[["U.i"]]*unaffected.weight) + weighted.against <- sum(score.dict[["A.i"]] * affected.weight) + + sum(score.dict[["U.i"]] * unaffected.weight) - out.list <- list("score" = weighted.for - weighted.against, - "score.for" = weighted.for, - "score.against" = weighted.against ) + out.list <- list( + "score" = weighted.for - weighted.against, + "score.for" = weighted.for, + "score.against" = weighted.against + ) - n.incor <- length(score.dict[["A.i"]]) + length(score.dict[["U.i"]]) - if(n.incor > max.err){ - out.list[["score"]] <- 0 - } - return( - out.list - ) + n.incor <- length(score.dict[["A.i"]]) + length(score.dict[["U.i"]]) + if (n.incor > max.err) { + out.list[["score"]] <- 0 + } + return( + out.list + ) } @@ -121,7 +120,7 @@ calc.rv.score <- function(fam.list, affected.weight=1, unaffected.weight=0.5, un #' @param mat.df The full matrix file to subset #' @param status.df The list of sampled individuals, matrix is subset to only these individuals. #' @return A subset of the input matrix. -subset.mat <- function(mat.df, status.df){ +subset.mat <- function(mat.df, status.df) { sub.ids <- rownames(mat.df)[rownames(mat.df) %in% status.df$name] sub.mat <- mat.df[sub.ids, sub.ids] return(sub.mat) @@ -134,7 +133,7 @@ subset.mat <- function(mat.df, status.df){ #' #' By default all individuals are treated as the reference 'proband' and #' the given variant's score is calculated based on relationships to all other individuals. -#' e.g. for each row in the relationship matrix. calc.rv.score is run, with the row name indiciating the +#' e.g. for each row in the relationship matrix. calc.rv.score is run, with the row name indicating the #' reference individual that the calculation is relative to. #' Note that the relation.mat can include more individuals than are present within the status.df, but the matrix will #' be subset to include only those individuals that have status information provided. @@ -153,42 +152,42 @@ subset.mat <- function(mat.df, status.df){ #' @param unaffected.weight A coefficient to multiply the U.c and U.i relatedness values by. #' @param return.sums Boolean indicating if sum of family variant scores should be returned (default = FALSE). #' @param return.means Boolean indicating if mean of all family variant scores should be returned (default = TRUE). -#' @param affected.only Boolean indicating if the family score should be calculated using only the affected individuals? (default = TRUE). +#' @param affected.only Boolean indicating if family score should be calculated using only affected individuals (default = TRUE). #' @param max.err A heuristic cap of the number of incorrect assignments allowed when scoring. When the total number #' of incorrect (sum of affected and unaffected) is exceeded, the variant's score is set to 0, regardless of the number #' of points for or against. This simplifies scoring and allows for fast filtering of poor quality variants. Default is 4. #' @return A labelled vector with names: score, score.for, score.against #' @examples -#' mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') -#' tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') +#' mat.name1 <- system.file("extdata/1234_ex2.mat", package = "KinformR") +#' tsv.name1 <- system.file("extdata/1234_ex2.tsv", package = "KinformR") #' mat.df <- read.relation.mat(mat.name1) #' ind.df <- read.indiv(tsv.name1) -#' ind.df.status <- score.variant.status(ind.df) +#' ind.df.status <- score.variant.status(ind.df) #' score.default <- score.fam(mat.df, ind.df.status) #' @export -score.fam <- function(relation.mat, status.df, affected.weight=1, unaffected.weight=0.5, - return.sums = FALSE, return.means = TRUE, - affected.only = TRUE, max.err=4){ - - #The family encoding matrix needs to be subset to include only the individuals in the status dataframe +score.fam <- function(relation.mat, status.df, affected.weight = 1, unaffected.weight = 0.5, + return.sums = FALSE, return.means = TRUE, + affected.only = TRUE, max.err = 4) { + # The family encoding matrix needs to be subset to include only the individuals in the status dataframe sub.relation.mat <- subset.mat(relation.mat, status.df) - encoded.dat <- encode.rows(sub.relation.mat, status.df, drop.unrelated=TRUE) + encoded.dat <- encode.rows(sub.relation.mat, status.df, drop.unrelated = TRUE) per.indv.scores <- lapply(encoded.dat, calc.rv.score, - affected.weight=affected.weight, unaffected.weight=unaffected.weight, - max.err=max.err) + affected.weight = affected.weight, unaffected.weight = unaffected.weight, + max.err = max.err + ) - scores <- do.call(rbind.data.frame, per.indv.scores) + scores <- do.call(rbind.data.frame, per.indv.scores) - if (affected.only){ - affected.indiv = status.df[status.df$status == "A",] - scores<-scores[row.names(scores) %in% affected.indiv$name,] + if (affected.only) { + affected.indiv <- status.df[status.df$status == "A", ] + scores <- scores[row.names(scores) %in% affected.indiv$name, ] } - if(return.sums){ + if (return.sums) { return(colSums(scores)) - }else if(return.means){ + } else if (return.means) { return(colMeans(scores)) } return(scores) @@ -203,17 +202,15 @@ score.fam <- function(relation.mat, status.df, affected.weight=1, unaffected.wei #' score.fam1 <- c("score" = 1.0, "score.for" = 2.0, "score.against" = 1.0) #' score.fam2 <- c("score" = 1.0, "score.for" = 3.0, "score.against" = 2.0) #' # out <- add.fam.scores(c(score.fam1, score.fam2)) -#' #returns: c("score" = 2.0,"score.for" = 5.0, "score.against" = 3.0) +#' # returns: c("score" = 2.0,"score.for" = 5.0, "score.against" = 3.0) #' @export -add.fam.scores <- function(score.vec){ - outvec<-tapply(score.vec, names(score.vec), sum) - - sorted.out<-c("score" = outvec[["score"]], - "score.for" = outvec[["score.for"]], - "score.against" = outvec[["score.against"]]) +add.fam.scores <- function(score.vec) { + outvec <- tapply(score.vec, names(score.vec), sum) + + sorted.out <- c( + "score" = outvec[["score"]], + "score.for" = outvec[["score.for"]], + "score.against" = outvec[["score.against"]] + ) return(sorted.out) } - - - - diff --git a/inst/extdata/example_vcf_extract_5678.tsv b/inst/extdata/example_vcf_extract_5678.tsv index c5894d2..7e85885 100644 --- a/inst/extdata/example_vcf_extract_5678.tsv +++ b/inst/extdata/example_vcf_extract_5678.tsv @@ -1,2 +1,2 @@ #CHROM POS REF ALT MS-5678-1001_A MS-5678-1002_U MS-5678-1004_U MS-5678-6001_U MS-5678-1003_A -chr3 46203838 G A 0/1 0/0 0/0 0/0 0/1 +chr13 10171738 G A 0/1 0/0 0/0 0/0 0/1 diff --git a/inst/extdata/example_vcf_extract_9876.tsv b/inst/extdata/example_vcf_extract_9876.tsv index d3063fe..5f6ab8d 100644 --- a/inst/extdata/example_vcf_extract_9876.tsv +++ b/inst/extdata/example_vcf_extract_9876.tsv @@ -1,2 +1,2 @@ #CHROM POS REF ALT MS-9876-1002_U MS-9876-1009_U MS-9876-1006_A MS-9876-1004_A MS-9876-1007_U MS-9876-1003_U MS-9876-1001_A MS-9876-1008_U MS-9876-1005_U -chr12 120155305 C T 0|0 0|0 0|0 0|1 0|0 0|0 0|1 0|0 0|1 +chr10 123456789 C T 0|0 0|0 0|0 0|1 0|0 0|0 0|1 0|0 0|1 diff --git a/man/add.fam.scores.Rd b/man/add.fam.scores.Rd index 847583e..ac5e3a6 100644 --- a/man/add.fam.scores.Rd +++ b/man/add.fam.scores.Rd @@ -21,5 +21,5 @@ For use in instances where one wishes to combine scores from multiple families. score.fam1 <- c("score" = 1.0, "score.for" = 2.0, "score.against" = 1.0) score.fam2 <- c("score" = 1.0, "score.for" = 3.0, "score.against" = 2.0) # out <- add.fam.scores(c(score.fam1, score.fam2)) -#returns: c("score" = 2.0,"score.for" = 5.0, "score.against" = 3.0) +# returns: c("score" = 2.0,"score.for" = 5.0, "score.against" = 3.0) } diff --git a/man/assign.a.Rd b/man/assign.a.Rd new file mode 100644 index 0000000..ba51fcb --- /dev/null +++ b/man/assign.a.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/encoding.R +\name{assign.a} +\alias{assign.a} +\title{For affecteds: Take genetic variant and determine the category of the combo.} +\usage{ +assign.a(variant) +} +\arguments{ +\item{variant}{Variant for individual. genotypes, phased genotypes, or binary encodings accepted.} +} +\description{ +For affecteds: Take genetic variant and determine the category of the combo. +} diff --git a/man/assign.status.Rd b/man/assign.status.Rd index b53a95a..d7372cb 100644 --- a/man/assign.status.Rd +++ b/man/assign.status.Rd @@ -39,5 +39,5 @@ These encoding can then be used show what a family's max score would be. assign.status("A", "0/1") == "A.c" assign.status("A", "0|0") == "A.i" assign.status("U", 1) == "U.i" -assign.status("U", "0|0") =="U.c" +assign.status("U", "0|0") == "U.c" } diff --git a/man/assign.u.Rd b/man/assign.u.Rd new file mode 100644 index 0000000..7058b52 --- /dev/null +++ b/man/assign.u.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/encoding.R +\name{assign.u} +\alias{assign.u} +\title{For unaffecteds: Take a genetic variant and determine the category of the combo.} +\usage{ +assign.u(variant) +} +\arguments{ +\item{variant}{Variant for individual. genotypes, phased genotypes, or binary encodings accepted.} +} +\description{ +For unaffecteds: Take a genetic variant and determine the category of the combo. +} diff --git a/man/calc.rv.score.Rd b/man/calc.rv.score.Rd index 5352054..52ade48 100644 --- a/man/calc.rv.score.Rd +++ b/man/calc.rv.score.Rd @@ -79,6 +79,6 @@ of unaffected individuals that have a variant rather than affected individuals t Input: } \examples{ -relations<-list("A.c" = c(0, 1, 3, 1),"A.i" = c(3),"U.c" = c(1, 2),"U.i" = c(1)) +relations <- list("A.c" = c(0, 1, 3, 1), "A.i" = c(3), "U.c" = c(1, 2), "U.i" = c(1)) rv.scores <- calc.rv.score(relations) } diff --git a/man/score.fam.Rd b/man/score.fam.Rd index 3e7d09a..f704823 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -29,7 +29,7 @@ score.fam( \item{return.means}{Boolean indicating if mean of all family variant scores should be returned (default = TRUE).} -\item{affected.only}{Boolean indicating if the family score should be calculated using only the affected individuals? (default = TRUE).} +\item{affected.only}{Boolean indicating if family score should be calculated using only affected individuals (default = TRUE).} \item{max.err}{A heuristic cap of the number of incorrect assignments allowed when scoring. When the total number of incorrect (sum of affected and unaffected) is exceeded, the variant's score is set to 0, regardless of the number @@ -41,7 +41,7 @@ A labelled vector with names: score, score.for, score.against \description{ By default all individuals are treated as the reference 'proband' and the given variant's score is calculated based on relationships to all other individuals. -e.g. for each row in the relationship matrix. calc.rv.score is run, with the row name indiciating the +e.g. for each row in the relationship matrix. calc.rv.score is run, with the row name indicating the reference individual that the calculation is relative to. Note that the relation.mat can include more individuals than are present within the status.df, but the matrix will be subset to include only those individuals that have status information provided. @@ -57,10 +57,10 @@ NOTE: if affected.only = True, the averages and sums are calculated using only t } } \examples{ -mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') -tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') +mat.name1 <- system.file("extdata/1234_ex2.mat", package = "KinformR") +tsv.name1 <- system.file("extdata/1234_ex2.tsv", package = "KinformR") mat.df <- read.relation.mat(mat.name1) ind.df <- read.indiv(tsv.name1) -ind.df.status <- score.variant.status(ind.df) +ind.df.status <- score.variant.status(ind.df) score.default <- score.fam(mat.df, ind.df.status) } diff --git a/man/score.variant.status.Rd b/man/score.variant.status.Rd index 6d2bf1a..60777c9 100644 --- a/man/score.variant.status.Rd +++ b/man/score.variant.status.Rd @@ -31,5 +31,5 @@ using a dummy perfect associatng variant generated to see what a family could sc TODO - switch to numbers 1-4 and -1? } \examples{ -#TODO - add +# TODO - add }