From bccf26cd4c9d712b6090d2273c28b965b4388f6b Mon Sep 17 00:00:00 2001
From: Bob Verity <bobverity1@gmail.com>
Date: Fri, 28 Nov 2025 22:25:59 +0000
Subject: [PATCH] all operations changed to only work on unique entries

---
 R/main.R | 337 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 253 insertions(+), 84 deletions(-)

diff --git a/R/main.R b/R/main.R
index 609da30..a7400f9 100644
--- a/R/main.R
+++ b/R/main.R
@@ -49,6 +49,7 @@ check_variant_string <- function(x) {
   #     - contains only allowed characters
   # - codon position (second element):
   #     - contains only allowed characters
+  #     - if there are hyphens denoting a range, checks that a single internal hyphen only
   #     - positions are non-zero
   #     - positions are not duplicated
   #     - positions are sorted increasing
@@ -77,23 +78,27 @@ check_variant_string <- function(x) {
   # is character vector
   stopifnot(all(is.character(x)))
 
+  # only work on unique elements in x
+  x_unique <- unique(x)
+  n_unique <- length(x_unique)
+
   # create objects for storing which rows fail. This allows for more
   # informative error messages in which several issues can be listed at once
   # rather than breaking at the first issue
-  n <- length(x)
-  valid <- rep(TRUE, n)
-  reason <- rep(NA, n)
+  valid <- rep(TRUE, n_unique)
+  reason <- rep(NA, n_unique)
 
   # get list of valid amino acid characters. Do this once here to avoid
   # repetition for every element of x
-  IUPAC_df <- allowed_amino_acids()
-  valid_amino_characters <- paste0("^[", paste(IUPAC_df$IUPAC_amino_acid_code, collapse = ""), "_/|]+$")
+  #IUPAC_df <- allowed_amino_acids()
+  #valid_amino_characters <- paste0("^[", paste(IUPAC_df$IUPAC_amino_acid_code, collapse = ""), "_/|]+$")
+  valid_amino_characters <- "^[ACDEFGHIKLMNPQRSTVWY_/|]+$"
 
   # input may be a single string or a vector of strings. Loop through elements
-  for (i in 1:n) {
+  for (i in 1:n_unique) {
 
     # split into genes
-    s1 <- stringr::str_split(x[i], ";", n = Inf, simplify = FALSE)[[1]]
+    s1 <- stringr::str_split(x_unique[i], ";", n = Inf, simplify = FALSE)[[1]]
     n_genes <- length(s1)
 
     if (any(s1 == "")) {
@@ -140,16 +145,32 @@ check_variant_string <- function(x) {
 
       # ------------------------------------------------------------
       # check codon positions
-
-      if (!grepl("^[0-9_]+$", codon_pos_string)) {
+      if (!grepl("^[0-9_]$|^[0-9_][0-9_-]*[0-9_]$", codon_pos_string)) {
         valid[i] <- FALSE
         reason[i] <- sprintf("codon position %s contains invalid characters", codon_pos_string)
         next()
       }
 
-      # split codon position into separate loci and make numeric
-      codon_pos <- stringr::str_split(codon_pos_string, "_", n = Inf, simplify = FALSE)[[1]] |>
-        as.numeric()
+      # split codon position into separate loci
+      codon_pos <- stringr::str_split(codon_pos_string, "_", n = Inf, simplify = FALSE)[[1]]
+
+      # make numeric, dealing with ranges where needed
+      if (any(grepl("-", codon_pos))) {
+        if (any(stringr::str_count(codon_pos, "-") > 1)) {
+          valid[i] <- FALSE
+          reason[i] <- sprintf("codon position %s does not define a valid range", codon_pos)
+          next()
+        }
+        codon_pos <- mapply(function(x) {
+          r <- as.numeric(strsplit(x, "-")[[1]])
+          if (length(r) > 1) {
+            r <- r[1]:r[2]
+          }
+          r
+        }, codon_pos, SIMPLIFY = FALSE) |>
+          unlist()
+      }
+      codon_pos <- as.numeric(codon_pos)
 
       if (any(codon_pos == 0)) {
         valid[i] <- FALSE
@@ -346,7 +367,7 @@ check_variant_string <- function(x) {
     message("The following issues were found:")
     for (i in seq_along(valid)) {
       if (!valid[i]) {
-        message(sprintf("  - entry %s: %s", i, reason[i]))
+        message(sprintf("  - %s: %s", x_unique[i], reason[i]))
       }
     }
 
@@ -380,6 +401,7 @@ check_position_string <- function(x) {
   #     - no duplicated gene names
   # - codon position (second element):
   #     - contains only allowed characters
+  #     - if there are hyphens denoting a range, a single internal hyphen only
   #     - positions are non-zero
   #     - positions are not duplicated
   #     - positions are sorted increasing
@@ -387,15 +409,18 @@ check_position_string <- function(x) {
   # is character vector
   stopifnot(all(is.character(x)))
 
+  # only work on unique elements in x
+  x_unique <- unique(x)
+  n_unique <- length(x_unique)
+
   # create objects for storing which rows fail. This allows for more
   # informative error messages in which several issues can be listed at once
   # rather than breaking at the first issue
-  n <- length(x)
-  valid <- rep(TRUE, n)
-  reason <- rep(NA, n)
+  valid <- rep(TRUE, n_unique)
+  reason <- rep(NA, n_unique)
 
   # input may be a single string or a vector of strings. Loop through elements
-  for (i in 1:n) {
+  for (i in 1:n_unique) {
 
     # split into genes
     s1 <- stringr::str_split(x[i], ";", n = Inf, simplify = FALSE)[[1]]
@@ -448,8 +473,25 @@ check_position_string <- function(x) {
       }
 
       # split codon position into separate loci and make numeric
-      codon_pos <- stringr::str_split(codon_pos_string, "_", n = Inf, simplify = FALSE)[[1]] |>
-        as.numeric()
+      codon_pos <- stringr::str_split(codon_pos_string, "_", n = Inf, simplify = FALSE)[[1]]
+
+      # make numeric, dealing with ranges where needed
+      if (any(grepl("-", codon_pos))) {
+        if (any(stringr::str_count(codon_pos, "-") > 1)) {
+          valid[i] <- FALSE
+          reason[i] <- sprintf("codon position %s does not define a valid range", codon_pos)
+          next()
+        }
+        codon_pos <- mapply(function(x) {
+          r <- as.numeric(strsplit(x, "-")[[1]])
+          if (length(r) > 1) {
+            r <- r[1]:r[2]
+          }
+          r
+        }, codon_pos, SIMPLIFY = FALSE) |>
+          unlist()
+      }
+      codon_pos <- as.numeric(codon_pos)
 
       if (any(codon_pos == 0)) {
         valid[i] <- FALSE
@@ -484,7 +526,7 @@ check_position_string <- function(x) {
     message("The following issues were found:")
     for (i in seq_along(valid)) {
       if (!valid[i]) {
-        message(sprintf("  - entry %s: %s", i, reason[i]))
+        message(sprintf("  - %s: %s", x_unique[i], reason[i]))
       }
     }
 
@@ -509,16 +551,32 @@ check_position_string <- function(x) {
 
 variant_to_long <- function(x) {
 
+  # basic checks
+  stopifnot(is.character(x))
+
+  # work on unique elements in x and map back at end
+  x_unique <- unique(x)
+
   # expand each element into a list
-  ret <- mapply(function(s1) {
+  l_unique <- mapply(function(s1) {
     mapply(function(s2) {
 
       # extract gene name
       gene_name <- s2[1]
 
       # extract codon positions to vector
-      pos_vec <- strsplit(s2[2], "_")[[1]] |>
-        as.numeric()
+      pos_vec <- strsplit(s2[2], "_")[[1]]
+      if (any(grepl("-", pos_vec))) {
+        pos_vec <- mapply(function(x) {
+          r <- as.numeric(x)
+          if (length(r) > 1) {
+            r <- r[1]:r[2]
+          }
+          r
+        }, strsplit(pos_vec, "-"), SIMPLIFY = FALSE) |>
+          unlist()
+      }
+      pos_vec <- as.numeric(pos_vec)
 
       # extract amino acids to list
       aa_list <- mapply(function(s3) {
@@ -560,7 +618,10 @@ variant_to_long <- function(x) {
       return(ret)
     }, strsplit(s1, ":"), SIMPLIFY = FALSE) |>
       dplyr::bind_rows()
-  }, strsplit(x, ";"), SIMPLIFY = FALSE)
+  }, strsplit(x_unique, ";"), SIMPLIFY = FALSE)
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
 
   return(ret)
 }
@@ -581,7 +642,10 @@ long_to_variant <- function(x) {
   # check input is list
   stopifnot(is.list(x))
 
-  ret <- mapply(function(y1) {
+  # work on unique elements in x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y1) {
 
     # check inputs
     stopifnot(is.data.frame(y1))
@@ -635,11 +699,11 @@ long_to_variant <- function(x) {
     }
 
     return(ret)
-  }, x, SIMPLIFY = FALSE) |>
+  }, x_unique, SIMPLIFY = FALSE) |>
     unlist()
 
-  # final checks on return object
-  check_variant_string(ret)
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
 
   return(ret)
 }
@@ -659,14 +723,38 @@ long_to_variant <- function(x) {
 
 position_to_long <- function(x) {
 
+  # basic checks
+  stopifnot(is.character(x))
+
+  # work on unique elements in x and map back at end
+  x_unique <- unique(x)
+
   # expand each element into a list
-  ret <- mapply(function(s1) {
+  l_unique <- mapply(function(s1) {
     mapply(function(s2) {
+
+      # extract codon positions to vector
+      pos_vec <- strsplit(s2[2], "_")[[1]]
+      if (any(grepl("-", pos_vec))) {
+        pos_vec <- mapply(function(x) {
+          r <- as.numeric(x)
+          if (length(r) > 1) {
+            r <- r[1]:r[2]
+          }
+          r
+        }, strsplit(pos_vec, "-"), SIMPLIFY = FALSE) |>
+          unlist()
+      }
+      pos_vec <- as.numeric(pos_vec)
+
       data.frame(gene = s2[1],
-                 pos = as.numeric(strsplit(s2[2], "_")[[1]]))
+                 pos = pos_vec)
     }, strsplit(s1, ":"), SIMPLIFY = FALSE) |>
       dplyr::bind_rows()
-  }, strsplit(x, ";"), SIMPLIFY = FALSE)
+  }, strsplit(x_unique, ";"), SIMPLIFY = FALSE)
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
 
   return(ret)
 }
@@ -687,7 +775,10 @@ long_to_position <- function(x) {
   # check input is list
   stopifnot(is.list(x))
 
-  ret <- mapply(function(y1) {
+  # work on unique elements in x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y1) {
 
     # check inputs
     stopifnot(is.data.frame(y1))
@@ -702,11 +793,11 @@ long_to_position <- function(x) {
       pull(variant) |>
       paste(collapse = ";")
 
-  }, x, SIMPLIFY = FALSE) |>
+  }, x_unique, SIMPLIFY = FALSE) |>
     unlist()
 
-  # final checks on return object
-  check_position_string(ret)
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
 
   return(ret)
 }
@@ -725,12 +816,20 @@ long_to_position <- function(x) {
 
 position_from_variant_string <- function(x) {
 
-  mapply(function(y1) {
+  # work on unique elements in x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y1) {
     y1 |>
       group_by(gene) |>
       reframe(pos = unique(pos))
-  }, variant_to_long(x), SIMPLIFY = FALSE) |>
+  }, variant_to_long(x_unique), SIMPLIFY = FALSE) |>
     long_to_position()
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
+
+  return(ret)
 }
 
 #------------------------------------------------
@@ -756,7 +855,10 @@ subset_position <- function(position_string, variant_strings) {
   # get position string in long form
   df_position <- position_to_long(position_string)[[1]]
 
-  ret <- mapply(function(x) {
+  # work on unique variant strings and map back at end
+  v_unique <- unique(variant_strings)
+
+  l_unique <- mapply(function(x) {
     df_diff <- anti_join(df_position, x, by = c("gene", "pos"))
     if (nrow(df_diff) == 0) {
       df_sub <- semi_join(x, df_position, by = c("gene", "pos"))
@@ -765,9 +867,12 @@ subset_position <- function(position_string, variant_strings) {
       ret <- NA
     }
     ret
-  }, variant_to_long(variant_strings), SIMPLIFY = FALSE) |>
+  }, variant_to_long(v_unique), SIMPLIFY = FALSE) |>
     unlist()
 
+  # map uniques back to original
+  ret <- l_unique[match(variant_strings, v_unique)]
+
   return(ret)
 }
 
@@ -786,10 +891,18 @@ subset_position <- function(position_string, variant_strings) {
 
 order_variant_string <- function(x) {
 
-  mapply(function(y) {
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y) {
     arrange(y, gene, pos, aa)
-  }, variant_to_long(x), SIMPLIFY = FALSE) |>
+  }, variant_to_long(x_unique), SIMPLIFY = FALSE) |>
     long_to_variant()
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
+
+  return(ret)
 }
 
 #------------------------------------------------
@@ -806,10 +919,18 @@ order_variant_string <- function(x) {
 
 order_position_string <- function(x) {
 
-  mapply(function(y) {
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y) {
     arrange(y, gene, pos)
-  }, position_to_long(x), SIMPLIFY = FALSE) |>
+  }, position_to_long(x_unique), SIMPLIFY = FALSE) |>
     long_to_position()
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
+
+  return(ret)
 }
 
 #------------------------------------------------
@@ -825,14 +946,22 @@ order_position_string <- function(x) {
 
 count_unphased_hets <- function(x) {
 
-  mapply(function(y1) {
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
+  h_unique <- mapply(function(y1) {
     y1 |>
       group_by(gene, pos) |>
       summarise(n = (het[1] == TRUE) & (phased[1] == FALSE),
                 .groups = "drop") |>
       pull(n) |>
       sum()
-  }, variant_to_long(x))
+  }, variant_to_long(x_unique))
+
+  # map uniques back to original
+  ret <- h_unique[match(x, x_unique)]
+
+  return(ret)
 }
 
 #------------------------------------------------
@@ -848,14 +977,22 @@ count_unphased_hets <- function(x) {
 
 count_phased_hets <- function(x) {
 
-  mapply(function(y1) {
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
+  h_unique <- mapply(function(y1) {
     y1 |>
       group_by(gene, pos) |>
       summarise(n = (het[1] == TRUE) & (phased[1] == TRUE),
                 .groups = "drop") |>
       pull(n) |>
       sum()
-  }, variant_to_long(x))
+  }, variant_to_long(x_unique))
+
+  # map uniques back to original
+  ret <- h_unique[match(x, x_unique)]
+
+  return(ret)
 }
 
 #------------------------------------------------
@@ -870,11 +1007,19 @@ count_phased_hets <- function(x) {
 
 drop_read_counts <- function(x) {
 
-  mapply(function(y) {
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y) {
     y$read_count <- NA
     y
-  }, variant_to_long(x), SIMPLIFY = FALSE) |>
+  }, variant_to_long(x_unique), SIMPLIFY = FALSE) |>
     long_to_variant()
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
+
+  return(ret)
 }
 
 #------------------------------------------------
@@ -909,15 +1054,18 @@ compare_variant_string <- function(target_string, comparison_strings) {
   df_target <- variant_to_long(target_string)[[1]] |>
     select(gene, pos, aa)
 
-  # loop over all comparison strings
-  n <- length(comparison_strings)
-  ret <- data.frame(match = rep(NA, n),
-                    ambiguous = NA,
-                    prop = NA)
-  for (i in 1:n) {
+  # work on unique comparison_strings and map back at end
+  c_unique <- unique(comparison_strings)
+
+  # loop over all unique comparison strings
+  n_unique <- length(c_unique)
+  df_unique <- data.frame(match = rep(NA, n_unique),
+                          ambiguous = NA,
+                          prop = NA)
+  for (i in 1:n_unique) {
 
     # get this comparison string in long form
-    df_comparison <- variant_to_long(comparison_strings[i])[[1]]
+    df_comparison <- variant_to_long(c_unique[i])[[1]]
 
     # compare target against comparison
     df_target_match <- df_target |>
@@ -932,9 +1080,9 @@ compare_variant_string <- function(target_string, comparison_strings) {
 
     # simple exit if not match
     if (!all(df_target_match$match)) {
-      ret$match[i] <- FALSE
-      ret$ambiguous[i] <- FALSE
-      ret$prop[i] <- 0
+      df_unique$match[i] <- FALSE
+      df_unique$ambiguous[i] <- FALSE
+      df_unique$prop[i] <- 0
       next
     }
 
@@ -959,24 +1107,24 @@ compare_variant_string <- function(target_string, comparison_strings) {
     n_unphased_het <- sum(df_comparison_match$het * !df_comparison_match$phased)
 
     # determine if unambiguous match
-    ret$match[i] <- TRUE
-    ret$ambiguous[i] <- ((n_phased_het > 0) + n_unphased_het) > 1
+    df_unique$match[i] <- TRUE
+    df_unique$ambiguous[i] <- ((n_phased_het > 0) + n_unphased_het) > 1
 
     # get proportion if calculable
-    ret$prop[i] <- NA
-    if (!ret$ambiguous[i]) {
+    df_unique$prop[i] <- NA
+    if (!df_unique$ambiguous[i]) {
       if (any(is.na(df_comparison_match$read_count))) {
         if ((n_phased_het + n_unphased_het) == 0) {
-          ret$prop[i] <- 1
+          df_unique$prop[i] <- 1
         }
       } else {
         prop <- df_comparison_match$read_count / df_comparison_match$read_count_total
         if (all(prop == 1)) {
-          ret$prop[i] <- 1
+          df_unique$prop[i] <- 1
         } else {
           u_prop <- unique(prop[prop != 1])
           if (length(u_prop) == 1) {
-            ret$prop[i] <- u_prop
+            df_unique$prop[i] <- u_prop
           }
         }
       }
@@ -984,6 +1132,9 @@ compare_variant_string <- function(target_string, comparison_strings) {
 
   }
 
+  # map uniques back to original
+  ret <- df_unique[match(comparison_strings, c_unique),]
+
   return(ret)
 }
 
@@ -1009,16 +1160,19 @@ compare_position_string <- function(target_string, comparison_strings) {
   # get target in long form
   df_target <- position_to_long(target_string)[[1]]
 
+  # work on unique comparison_strings and map back at end
+  c_unique <- unique(comparison_strings)
+
   # get comparisons in long form
-  list_comparison <- position_from_variant_string(comparison_strings) |>
+  list_comparison <- position_from_variant_string(c_unique) |>
     position_to_long()
 
-  # loop over all comparison strings
-  n <- length(comparison_strings)
-  ret <- rep(NA, n)
-  for (i in 1:n) {
+  # loop over all unique comparison strings
+  n_unique <- length(c_unique)
+  ret_unique <- rep(NA, n_unique)
+  for (i in 1:n_unique) {
 
-    ret[i] <- mapply(function(j) {
+    ret_unique[i] <- mapply(function(j) {
       any((df_target$gene[j] == list_comparison[[i]]$gene) &
         (df_target$pos[j] == list_comparison[[i]]$pos))
     }, 1:nrow(df_target), SIMPLIFY = FALSE) |>
@@ -1026,6 +1180,9 @@ compare_position_string <- function(target_string, comparison_strings) {
       all()
   }
 
+  # map uniques back to original
+  ret <- ret_unique[match(comparison_strings, c_unique)]
+
   return(ret)
 }
 
@@ -1044,9 +1201,15 @@ compare_position_string <- function(target_string, comparison_strings) {
 
 extract_single_locus_variants <- function(x) {
 
-  mapply(function(y) {
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
+  l_unique <- mapply(function(y) {
     sprintf("%s:%s:%s", y$gene, y$pos, y$aa)
-  }, variant_to_long(x), SIMPLIFY = FALSE)
+  }, variant_to_long(x_unique), SIMPLIFY = FALSE)
+
+  # map uniques back to original
+  ret <- l_unique[match(x, x_unique)]
 }
 
 #------------------------------------------------
@@ -1066,21 +1229,24 @@ extract_single_locus_variants <- function(x) {
 
 get_component_variants <- function(x) {
 
+  # work on unique x and map back at end
+  x_unique <- unique(x)
+
   # focus on unambiguous
-  unphased_hets <- count_unphased_hets(x)
-  phased_hets <- count_phased_hets(x)
+  unphased_hets <- count_unphased_hets(x_unique)
+  phased_hets <- count_phased_hets(x_unique)
 
   # get all variants into long format and drop any read counts
   variant_list <- mapply(function(y) {
     y$read_count <- NA
     y
-  }, variant_to_long(x), SIMPLIFY = FALSE)
+  }, variant_to_long(x_unique), SIMPLIFY = FALSE)
 
-  n <- length(x)
-  ret <- replicate(n, NA, simplify = FALSE)
-  for (i in 1:n) {
+  n_unique <- length(x_unique)
+  df_ret <- replicate(n_unique, NA, simplify = FALSE)
+  for (i in 1:n_unique) {
     if ((unphased_hets[i] == 0) & (phased_hets[i] == 0)) {
-      ret[[i]] <- long_to_variant(variant_list[i])
+      df_ret[[i]] <- long_to_variant(variant_list[i])
 
     } else if ((unphased_hets[i] == 1) && (phased_hets[i] == 0)) {
 
@@ -1113,7 +1279,7 @@ get_component_variants <- function(x) {
                   .groups = "drop") |>
         pull(variant)
 
-      ret[[i]] <- components
+      df_ret[[i]] <- components
 
     } else if ((unphased_hets[i] == 0) && (phased_hets[i] > 0)) {
 
@@ -1148,14 +1314,17 @@ get_component_variants <- function(x) {
                   .groups = "drop") |>
         pull(variant)
 
-      ret[[i]] <- components
+      df_ret[[i]] <- components
 
     } else {
-      ret[[i]] <- NA
+      df_ret[[i]] <- NA
 
     }
   }
 
+  # map uniques back to original
+  ret <- df_ret[match(x, x_unique)]
+
   return(ret)
 }