From 5f31e6315e989af1fc8094c98c2565e37e2ee010 Mon Sep 17 00:00:00 2001
From: Bob Verity <bobverity1@gmail.com>
Date: Fri, 28 Nov 2025 14:59:51 +0000
Subject: [PATCH 1/4] improved documentation, and removed some safety rails
 from functions that could cause them to run very slowly. Checks now pushed
 back onto the user, as flagged in the updated documentation

---
 DESCRIPTION |  2 +-
 R/main.R    | 84 ++++++++++++++---------------------------------------
 README.Rmd  | 31 +++++++++++---------
 README.md   | 67 +++++++++++++++++++++++++++++-------------
 4 files changed, 88 insertions(+), 96 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1474336..8651a06 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: variantstring
 Type: Package
 Title: Functions for working with variant string format
-Version: 1.8.2
+Version: 1.8.3
 Authors@R: c(
     person("Bob", "Verity", email = "r.verity@imperial.ac.uk", role = c("aut", "cre"))
     )
diff --git a/R/main.R b/R/main.R
index 92d733b..609da30 100644
--- a/R/main.R
+++ b/R/main.R
@@ -85,7 +85,7 @@ check_variant_string <- function(x) {
   reason <- rep(NA, n)
 
   # get list of valid amino acid characters. Do this once here to avoid
-  # repetition for element of x
+  # repetition for every element of x
   IUPAC_df <- allowed_amino_acids()
   valid_amino_characters <- paste0("^[", paste(IUPAC_df$IUPAC_amino_acid_code, collapse = ""), "_/|]+$")
 
@@ -174,7 +174,7 @@ check_variant_string <- function(x) {
 
       if (!grepl(valid_amino_characters, codon_aa_string)) {
         valid[i] <- FALSE
-        reason[i] <- "amino acid sequence contains invalid characters. See ?allowed_amino_acids()"
+        reason[i] <- "amino acid sequence contains invalid characters. See ?variantstring::allowed_amino_acids()"
         next()
       }
 
@@ -501,7 +501,7 @@ check_position_string <- function(x) {
 #' Takes a vector of variant strings and expands into a list of data.frames
 #' containing the same information in long form.
 #'
-#' @param x a vector of variant strings.
+#' @param x a vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @import dplyr
 #'
@@ -509,9 +509,6 @@ check_position_string <- function(x) {
 
 variant_to_long <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   # expand each element into a list
   ret <- mapply(function(s1) {
     mapply(function(s2) {
@@ -572,10 +569,10 @@ variant_to_long <- function(x) {
 #' @title Take long form information and convert to variant string
 #'
 #' @description
-#' Takes a list of data.frames in long form and converts each to variant string
+#' Takes a list of data frames in long form and converts each to variant string
 #' format.
 #'
-#' @param x a list of data.frames.
+#' @param x a list of data frames corresponding to variant strings.
 #'
 #' @export
 
@@ -654,7 +651,7 @@ long_to_variant <- function(x) {
 #' Takes a vector of position strings and expands into a list of data.frames
 #' containing the same information in long form.
 #'
-#' @param x a vector of position strings.
+#' @param x a vector of position strings. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).
 #'
 #' @import dplyr
 #'
@@ -662,9 +659,6 @@ long_to_variant <- function(x) {
 
 position_to_long <- function(x) {
 
-  # checks
-  check_position_string(x)
-
   # expand each element into a list
   ret <- mapply(function(s1) {
     mapply(function(s2) {
@@ -681,10 +675,10 @@ position_to_long <- function(x) {
 #' @title Take long form information and convert to position string
 #'
 #' @description
-#' Takes a list of data.frames in long form and converts each to position string
+#' Takes a list of data frames in long form and converts each to position string
 #' format.
 #'
-#' @param x a list of data.frames.
+#' @param x a list of data frames corresponding to position strings.
 #'
 #' @export
 
@@ -723,7 +717,7 @@ long_to_position <- function(x) {
 #' @description
 #' Extract a position string from a variant string by stripping the amino acids.
 #'
-#' @param x a character string or vector of character strings.
+#' @param x a character string or vector of character strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @import dplyr
 #'
@@ -731,16 +725,12 @@ long_to_position <- function(x) {
 
 position_from_variant_string <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   mapply(function(y1) {
     y1 |>
       group_by(gene) |>
       reframe(pos = unique(pos))
   }, variant_to_long(x), SIMPLIFY = FALSE) |>
     long_to_position()
-
 }
 
 #------------------------------------------------
@@ -751,8 +741,8 @@ position_from_variant_string <- function(x) {
 #' variant strings to only the genes and codons in the position string. Retains
 #' read counts at these positions if present.
 #'
-#' @param position_string a single position string.
-#' @param variant_strings a variant string or vector of variant strings.
+#' @param position_string a single position string. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).
+#' @param variant_strings a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @import dplyr
 #'
@@ -761,9 +751,7 @@ position_from_variant_string <- function(x) {
 subset_position <- function(position_string, variant_strings) {
 
   # checks
-  check_position_string(position_string)
   stopifnot(length(position_string) == 1)
-  check_variant_string(variant_strings)
 
   # get position string in long form
   df_position <- position_to_long(position_string)[[1]]
@@ -792,20 +780,16 @@ subset_position <- function(position_string, variant_strings) {
 #' useful when checking for duplicated strings as the same information may be
 #' presented in a different order.
 #'
-#' @param x a variant string or vector of variant strings.
+#' @param x a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @export
 
 order_variant_string <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   mapply(function(y) {
     arrange(y, gene, pos, aa)
   }, variant_to_long(x), SIMPLIFY = FALSE) |>
     long_to_variant()
-
 }
 
 #------------------------------------------------
@@ -816,20 +800,16 @@ order_variant_string <- function(x) {
 #' when checking for duplicated strings as the same information may be presented
 #' in a different order.
 #'
-#' @param x a position string or vector of position strings.
+#' @param x a position string or vector of position strings. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).
 #'
 #' @export
 
 order_position_string <- function(x) {
 
-  # checks
-  check_position_string(x)
-
   mapply(function(y) {
     arrange(y, gene, pos)
   }, position_to_long(x), SIMPLIFY = FALSE) |>
     long_to_position()
-
 }
 
 #------------------------------------------------
@@ -839,15 +819,12 @@ order_position_string <- function(x) {
 #' Count the number of unphased heterozygous loci in each variant string. Return
 #' the number as a vector.
 #'
-#' @param x a variant string or vector of variant strings.
+#' @param x a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @export
 
 count_unphased_hets <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   mapply(function(y1) {
     y1 |>
       group_by(gene, pos) |>
@@ -865,15 +842,12 @@ count_unphased_hets <- function(x) {
 #' Count the number of phased heterozygous loci in each variant string. Return
 #' the number as a vector.
 #'
-#' @param x a variant string or vector of variant strings.
+#' @param x a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @export
 
 count_phased_hets <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   mapply(function(y1) {
     y1 |>
       group_by(gene, pos) |>
@@ -890,15 +864,12 @@ count_phased_hets <- function(x) {
 #' @description
 #' Takes a vector of variant strings and strips and information on read counts.
 #'
-#' @param x a variant string or vector of variant strings.
+#' @param x a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @export
 
 drop_read_counts <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   mapply(function(y) {
     y$read_count <- NA
     y
@@ -918,9 +889,9 @@ drop_read_counts <- function(x) {
 #' but a second output also flags this as an ambiguous match.
 #'
 #' @param target_string a single variant string that we want to compare. Cannot
-#'   contain any heterozygous calls.
+#'   contain any heterozygous calls. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #' @param comparison_strings a vector of variant strings against which the
-#'   target is compared.
+#'   target is compared. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @import dplyr
 #'
@@ -929,12 +900,10 @@ drop_read_counts <- function(x) {
 compare_variant_string <- function(target_string, comparison_strings) {
 
   # checks
-  check_variant_string(target_string)
   stopifnot(length(target_string) == 1)
   if ((count_unphased_hets(target_string) > 0) || (count_phased_hets(target_string) > 0)) {
     stop("target string cannot contain any heterozygous loci")
   }
-  check_variant_string(comparison_strings)
 
   # get target in long form
   df_target <- variant_to_long(target_string)[[1]] |>
@@ -1026,18 +995,16 @@ compare_variant_string <- function(target_string, comparison_strings) {
 #' match is found if every codon position in every gene of the target is also
 #' found within the comparison (irrespective of the observed amino acids).
 #'
-#' @param target_string a single position string that we want to compare.
+#' @param target_string a single position string that we want to compare. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).
 #' @param comparison_strings a vector of variant strings against which the
-#'   target is compared.
+#'   target is compared. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @export
 
 compare_position_string <- function(target_string, comparison_strings) {
 
   # checks
-  check_position_string(target_string)
   stopifnot(length(target_string) == 1)
-  check_variant_string(comparison_strings)
 
   # get target in long form
   df_target <- position_to_long(target_string)[[1]]
@@ -1071,19 +1038,15 @@ compare_position_string <- function(target_string, comparison_strings) {
 #' unique single-locus variants within the input. For example, crt:72_73:C_N/V
 #' can be extracted to crt:72:C, crt:73:N, and crt:73:V.
 #'
-#' @param x a vector of variant strings.
+#' @param x a vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @export
 
 extract_single_locus_variants <- function(x) {
 
-  # checks
-  check_variant_string(x)
-
   mapply(function(y) {
     sprintf("%s:%s:%s", y$gene, y$pos, y$aa)
   }, variant_to_long(x), SIMPLIFY = FALSE)
-
 }
 
 #------------------------------------------------
@@ -1094,7 +1057,7 @@ extract_single_locus_variants <- function(x) {
 #' define the genotypes that are present in this mixture. This function returns
 #' all such component genotypes.
 #'
-#' @param x a vector of variant strings.
+#' @param x a vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).
 #'
 #' @importFrom tidyr pivot_longer
 #' @import dplyr
@@ -1103,9 +1066,6 @@ extract_single_locus_variants <- function(x) {
 
 get_component_variants <- function(x) {
 
-  # check
-  check_variant_string(x)
-
   # focus on unambiguous
   unphased_hets <- count_unphased_hets(x)
   phased_hets <- count_phased_hets(x)
diff --git a/README.Rmd b/README.Rmd
index 30760cc..26b57a5 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -17,7 +17,7 @@ library(variantstring)
 
 # variantstring
 
-This R package that defines *variant string format*, a convenient format for encoding multi-locus genotypes. Functionalities include converting into and out of variant string format, subsetting based on genomic position, and comparing two variant strings to see if one is a subset of the other.
+Tools for working with *variant string format*, a convenient format for encoding multi-locus genotypes. Functionalities include converting into and out of variant string format, subsetting based on genomic position, and comparing two variant strings to see if one is a subset of the other.
 
 
 ## Installation
@@ -25,10 +25,10 @@ This R package that defines *variant string format*, a convenient format for enc
 You can install directly from Github:
 
 ```{r, eval=FALSE}
-devtools::install_github(repo = "mrc-ide/variantstring@1.8.2")
+devtools::install_github(repo = "mrc-ide/variantstring@1.8.3")
 ```
 
-Note the use of the @ symbol to reference a specific tagged version. This is highly recommended as the package is still in development and backwards compatibility is not guaranteed. See the end of this page for the most recent version number.
+Note the use of the @ symbol to reference a specific tagged version. This is highly recommended as the package is still in development and backwards compatibility is not guaranteed.
 
 ## Variant string format
 
@@ -44,9 +44,9 @@ An optional fourth element is the read count, for example `pfcrt:72:C:50` specif
 
 #### 2. Loci are separated by _
 
-For example, `pfcrt:72_73:C_V` specifies that at codon 72 a C (Cysteine) was observed, and at codon 73 a V (Valine) was observed. Codon positions must be in increasing numerical order. There is no limit to the number of loci that are allowed.
+For example, `pfcrt:72_73:C_V` specifies that at codon 72 a C (Cysteine) was observed, and at codon 73 a V (Valine) was observed. Codon positions must be in increasing numerical order. There is no limit to the number of loci that are allowed, although please keep in mind that this format was designed for short variants (100 positions or less) and larger haplotypes may become very slow. For larger haplotypes we recommend other existing file formats.
 
-Underscores can be omitted between amino acids to give a more concise notation, for example `pfcrt:72_73:CV` is equivalent to `pfcrt:72_73:C_V`. However, underscores cannot be omitted between codon positions.
+Underscores can be omitted between amino acids to give a more concise notation, for example `pfcrt:72_73:CV` is equivalent to `pfcrt:72_73:C_V`. However, underscores cannot be omitted between codon positions because this would make them impossible to resolve.
 
 If read counts are present then these must correspond to each of the codon positions, for example `pfcrt:72_73:CV:50_55`.
 
@@ -58,7 +58,7 @@ Any number of alleles can be present in a heterozygous locus, for example `pfcrt
 
 Heterozygous calls can be found over multiple loci, for example `pfcrt:72_73:C/S/A_V/A`. The number of alleles does not need to be consistent over loci, for example here there are three alleles at position 72 and two alleles at position 73.
 
-When read counts are present, the counts for each allele must mirror the format of the amino acids. For example, `pfcrt:72_73:C/S/A_V/A:60/30/10_45/55` is a valid string that specifies read counts of {C=60, S=30, A=10} at codon 72 and {V=45, A=55} at codon 73.
+When read counts are present, the counts for each allele must mirror the format of the amino acids. For example, `pfcrt:72_73:C/S/A_V/A:60/30/10_45/55` is a valid string that specifies read counts of {C=60, S=30, A=10} at codon 72 and {V=45, A=55} at codon 73. These counts do not need to add to the same number, i.e. coverage may differ between loci.
 
 #### 4. Phased mixed calls are indicated by |
 
@@ -70,13 +70,13 @@ Phased and unphased loci can be combined. For example, `pfcrt:72_73_74:C|S_V|A_M
 
 Although the number of alleles must be consistent over all phased loci, it does not have to be consistent between phased and unphased loci. For example `pfcrt:72_73_74:C|S_V|A_M/I/A` is a valid string despite having two phased and three unphased alleles.
 
-Phased and unphased alleles cannot be combined within a single locus. For example, `pfcrt:72:C|S/A` is **not** a valid string. This type of partial phasing can arise in real data, but unfortunately cannot be encoded in variant string format.
+Phased and unphased alleles cannot be combined within a single locus. For example, `pfcrt:72:C|S/A` is **not** a valid string. This type of partial phasing can arise in real data, but unfortunately is beyond the scope of variant string format.
 
 Finally, if read counts are present they must mirror the format of the amino acids. For example `pfcrt:72_73_74:C|S_V|A_M/I/A:40|60_40|60_20/30/50` is a valid string.
 
 #### 5. Genes are separated by ;
 
-For example, `pfcrt:72:C;pfmdr-1:86:Y` specifies that in the *pfcrt* gene at codon 72 a C was observed, and in the *pfmdr-1* gene at codon 86 a Y was observed. In this way, multi-locus genotypes can be encoded spanning different parts of the genome, including over different chromosomes.
+For example, `pfcrt:72:C;pfmdr-1:86:Y` specifies that in the *pfcrt* gene at codon 72 a C was observed, and in the *pfmdr-1* gene at codon 86 a Y was observed. In this way, multi-locus genotypes can be encoded spanning different parts of the genome, including over different chromosomes. This is mainly useful if encoding phased information between genes is important; otherwise you might be better off splitting this information into two strings.
 
 There is no limit to the number of genes that can be encoded. The order of gene names does not matter as genes are first sorted alphabetically before applying any manipulation functions. However, the same gene name cannot be repeated multiple times.
 
@@ -98,13 +98,13 @@ data_string <- c("dhps:437:G",
                  "dhps:437_540:A/G_K/E")
 ```
 
-All of these samples were successfully sequenced at *dhps* locus 437, and some were also sequenced at locus 540. Some contain heterozygous calls while others contain only homozygous calls.
+All of these samples were successfully sequenced at *dhps* locus 437, and most were also sequenced at locus 540. Some contain heterozygous calls while others contain only homozygous calls.
 
 Our first question might be; **what unique genotypes are present within this dataset?**
 
 - The first two samples have no heterozygous sites, meaning we can be certain that both `dhps:437:G` and `dhps:437_540:G_K` are present in the data.
 - The third sample is interesting because there is a heterozygous site (540 K/E), but we can still work out the two genotypes that make up this sample. These are `dhps:437_540:G_K` and `dhps:437_540:G_E`.
-- The fourth sample contains two heterozygous sites, meaning **it is no longer possible to establish which genotypes make up this sample**. It could be composed of `A_K` coming together with `G_E`, or by `A_E` coming together with `G_K`, or by some combination of these.
+- The fourth sample contains two heterozygous sites, meaning **it is no longer possible to establish which genotypes make up this sample**. It could be composed of `A_K` coming together with `G_E`, or by `A_E` coming together with `G_K`, or by some combination of these. Whenever there are two or more unphased heterozygous loci we lose the ability to unambiguously resolve the component genotypes.
 
 In code, we can use the `get_component_variants()` function:
 
@@ -177,7 +177,7 @@ compare_position_string(target_string = "dhps:437_540",
 This is only present in three samples, meaning the first sample should be excluded from any prevalence calculation. Combining this with the results above, we can say that:
 
 - The `dhps:437_540:G_K` variant is present at between 2/3 and 3/3 samples. The prevalence is in the range 67%-100%.
-- The `dhps:437_540:G_E` variant is present at between 1/3 and 2/3 samples. The prevalence is in the range 50%-67%.
+- The `dhps:437_540:G_E` variant is present at between 1/3 and 2/3 samples. The prevalence is in the range 33%-67%.
 
 We can see that prevalence calculation is not always straightforward due to ambiguous matches, and requires a judgement call on how best to use the data. For example, we could report a range of prevalence as above, or alternatively we could exclude all samples with heterozygous calls (aka focussing on monoclonals) to produce an unbiased prevalence estimate but from a smaller sample. What is **not** valid is to exclude ambiguous matches, as this risks biasing prevalence estimates downwards.
 
@@ -193,6 +193,11 @@ variant_to_long("pfcrt:72_73_74_75_76:CVIE_K/T:54_34_64_29_54/64;pfmdr-1:86_184:
 
 This contains the same information, but may be easier to work with for some operations. We can always convert back using `long_to_variant()`.
 
-## Release history
+### Summary and word of caution
+
+variantstring is a narrowly focused package that provides utility functions for working with a specific class of genetic data (strictly speaking, amino-acid data). By combining these building blocks, you can perform more advanced tasks, such as estimating the prevalence of multi-locus haplotypes, even when samples contain mixed infections.
+
+That said, variantstring is not designed for efficiency, either in file size or computational speed. Its purpose is to act as a convenience format: something lightweight and human-interpretable, useful when manually extracting genotype information from papers, or when storing aggregate counts at a small number of loci rather than full individual-level data.
+
+Some efficiency considerations depend on how you compose the functions. For example, functions such as `get_component_variants()` assume that the input is already a valid variant string; they do not re-validate the format because `check_variant_string()` can be expensive and is better run once at the start of your workflow. Likewise, when constructing longer analysis pipelines, it is often best to unpack variant strings early using `variant_to_long()`, work in long format using tidyverse-style operations, and then reassemble the strings at the end with `long_to_variant()`.
 
-The current version is 1.8.2, released 16 April 2025.
diff --git a/README.md b/README.md
index a980e5e..3f90e61 100644
--- a/README.md
+++ b/README.md
@@ -3,24 +3,23 @@
 
 # variantstring
 
-This R package that defines *variant string format*, a convenient format
-for encoding multi-locus genotypes. Functionalities include converting
-into and out of variant string format, subsetting based on genomic
-position, and comparing two variant strings to see if one is a subset of
-the other.
+Tools for working with *variant string format*, a convenient format for
+encoding multi-locus genotypes. Functionalities include converting into
+and out of variant string format, subsetting based on genomic position,
+and comparing two variant strings to see if one is a subset of the
+other.
 
 ## Installation
 
 You can install directly from Github:
 
 ``` r
-devtools::install_github(repo = "mrc-ide/variantstring@1.8.2")
+devtools::install_github(repo = "mrc-ide/variantstring@1.8.3")
 ```
 
 Note the use of the @ symbol to reference a specific tagged version.
 This is highly recommended as the package is still in development and
-backwards compatibility is not guaranteed. See the end of this page for
-the most recent version number.
+backwards compatibility is not guaranteed.
 
 ## Variant string format
 
@@ -49,12 +48,15 @@ be left out.
 For example, `pfcrt:72_73:C_V` specifies that at codon 72 a C (Cysteine)
 was observed, and at codon 73 a V (Valine) was observed. Codon positions
 must be in increasing numerical order. There is no limit to the number
-of loci that are allowed.
+of loci that are allowed, although please keep in mind that this format
+was designed for short variants (100 positions or less) and larger
+haplotypes may become very slow. For larger haplotypes we recommend
+other existing file formats.
 
 Underscores can be omitted between amino acids to give a more concise
 notation, for example `pfcrt:72_73:CV` is equivalent to
 `pfcrt:72_73:C_V`. However, underscores cannot be omitted between codon
-positions.
+positions because this would make them impossible to resolve.
 
 If read counts are present then these must correspond to each of the
 codon positions, for example `pfcrt:72_73:CV:50_55`.
@@ -77,7 +79,8 @@ When read counts are present, the counts for each allele must mirror the
 format of the amino acids. For example,
 `pfcrt:72_73:C/S/A_V/A:60/30/10_45/55` is a valid string that specifies
 read counts of {C=60, S=30, A=10} at codon 72 and {V=45, A=55} at codon
-73.
+73. These counts do not need to add to the same number, i.e. coverage
+may differ between loci.
 
 #### 4. Phased mixed calls are indicated by \|
 
@@ -107,8 +110,8 @@ two phased and three unphased alleles.
 
 Phased and unphased alleles cannot be combined within a single locus.
 For example, `pfcrt:72:C|S/A` is **not** a valid string. This type of
-partial phasing can arise in real data, but unfortunately cannot be
-encoded in variant string format.
+partial phasing can arise in real data, but unfortunately is beyond the
+scope of variant string format.
 
 Finally, if read counts are present they must mirror the format of the
 amino acids. For example
@@ -120,7 +123,9 @@ For example, `pfcrt:72:C;pfmdr-1:86:Y` specifies that in the *pfcrt*
 gene at codon 72 a C was observed, and in the *pfmdr-1* gene at codon 86
 a Y was observed. In this way, multi-locus genotypes can be encoded
 spanning different parts of the genome, including over different
-chromosomes.
+chromosomes. This is mainly useful if encoding phased information
+between genes is important; otherwise you might be better off splitting
+this information into two strings.
 
 There is no limit to the number of genes that can be encoded. The order
 of gene names does not matter as genes are first sorted alphabetically
@@ -149,7 +154,7 @@ data_string <- c("dhps:437:G",
 ```
 
 All of these samples were successfully sequenced at *dhps* locus 437,
-and some were also sequenced at locus 540. Some contain heterozygous
+and most were also sequenced at locus 540. Some contain heterozygous
 calls while others contain only homozygous calls.
 
 Our first question might be; **what unique genotypes are present within
@@ -164,7 +169,9 @@ this dataset?**
 - The fourth sample contains two heterozygous sites, meaning **it is no
   longer possible to establish which genotypes make up this sample**. It
   could be composed of `A_K` coming together with `G_E`, or by `A_E`
-  coming together with `G_K`, or by some combination of these.
+  coming together with `G_K`, or by some combination of these. Whenever
+  there are two or more unphased heterozygous loci we lose the ability
+  to unambiguously resolve the component genotypes.
 
 In code, we can use the `get_component_variants()` function:
 
@@ -299,7 +306,7 @@ results above, we can say that:
 - The `dhps:437_540:G_K` variant is present at between 2/3 and 3/3
   samples. The prevalence is in the range 67%-100%.
 - The `dhps:437_540:G_E` variant is present at between 1/3 and 2/3
-  samples. The prevalence is in the range 50%-67%.
+  samples. The prevalence is in the range 33%-67%.
 
 We can see that prevalence calculation is not always straightforward due
 to ambiguous matches, and requires a judgement call on how best to use
@@ -342,6 +349,26 @@ variant_to_long("pfcrt:72_73_74_75_76:CVIE_K/T:54_34_64_29_54/64;pfmdr-1:86_184:
 This contains the same information, but may be easier to work with for
 some operations. We can always convert back using `long_to_variant()`.
 
-## Release history
-
-The current version is 1.8.2, released 16 April 2025.
+### Summary and word of caution
+
+variantstring is a narrowly focused package that provides utility
+functions for working with a specific class of genetic data (strictly
+speaking, amino-acid data). By combining these building blocks, you
+can perform more advanced tasks, such as estimating the prevalence of
+multi-locus haplotypes, even when samples contain mixed infections.
+
+That said, variantstring is not designed for efficiency, either in file
+size or computational speed. Its purpose is to act as a convenience
+format: something lightweight and human-interpretable, useful when
+manually extracting genotype information from papers, or when storing
+aggregate counts at a small number of loci rather than full
+individual-level data.
+
+Some efficiency considerations depend on how you compose the functions.
+For example, functions such as `get_component_variants()` assume that the input is already a valid
+variant string; they do not re-validate the format because `check_variant_string()` can be
+expensive and is better run once at the start of your workflow.
+Likewise, when constructing longer analysis pipelines, it is often best
+to unpack variant strings early using `variant_to_long()`, work in long format using
+tidyverse-style operations, and then reassemble the strings at the end
+with `long_to_variant()`.

From 2d66554de2d8bc3e939e2f62b2744e9068d348d8 Mon Sep 17 00:00:00 2001
From: Bob Verity <bobverity1@gmail.com>
Date: Fri, 28 Nov 2025 15:01:09 +0000
Subject: [PATCH 2/4] re-rendered help files

---
 man/compare_position_string.Rd       | 4 ++--
 man/compare_variant_string.Rd        | 4 ++--
 man/count_phased_hets.Rd             | 2 +-
 man/count_unphased_hets.Rd           | 2 +-
 man/drop_read_counts.Rd              | 2 +-
 man/extract_single_locus_variants.Rd | 2 +-
 man/get_component_variants.Rd        | 2 +-
 man/long_to_position.Rd              | 4 ++--
 man/long_to_variant.Rd               | 4 ++--
 man/order_position_string.Rd         | 2 +-
 man/order_variant_string.Rd          | 2 +-
 man/position_from_variant_string.Rd  | 2 +-
 man/position_to_long.Rd              | 2 +-
 man/subset_position.Rd               | 4 ++--
 man/variant_to_long.Rd               | 2 +-
 15 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/man/compare_position_string.Rd b/man/compare_position_string.Rd
index db73341..721c1e4 100644
--- a/man/compare_position_string.Rd
+++ b/man/compare_position_string.Rd
@@ -7,10 +7,10 @@
 compare_position_string(target_string, comparison_strings)
 }
 \arguments{
-\item{target_string}{a single position string that we want to compare.}
+\item{target_string}{a single position string that we want to compare. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).}
 
 \item{comparison_strings}{a vector of variant strings against which the
-target is compared.}
+target is compared. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Compares a target position string against a vector of comparison strings. A
diff --git a/man/compare_variant_string.Rd b/man/compare_variant_string.Rd
index 9103086..4b063f4 100644
--- a/man/compare_variant_string.Rd
+++ b/man/compare_variant_string.Rd
@@ -8,10 +8,10 @@ compare_variant_string(target_string, comparison_strings)
 }
 \arguments{
 \item{target_string}{a single variant string that we want to compare. Cannot
-contain any heterozygous calls.}
+contain any heterozygous calls. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 
 \item{comparison_strings}{a vector of variant strings against which the
-target is compared.}
+target is compared. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Compares a target variant string against a vector of comparison strings. A
diff --git a/man/count_phased_hets.Rd b/man/count_phased_hets.Rd
index e9316a0..9313c9d 100644
--- a/man/count_phased_hets.Rd
+++ b/man/count_phased_hets.Rd
@@ -7,7 +7,7 @@
 count_phased_hets(x)
 }
 \arguments{
-\item{x}{a variant string or vector of variant strings.}
+\item{x}{a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Count the number of phased heterozygous loci in each variant string. Return
diff --git a/man/count_unphased_hets.Rd b/man/count_unphased_hets.Rd
index fca883a..a0665c0 100644
--- a/man/count_unphased_hets.Rd
+++ b/man/count_unphased_hets.Rd
@@ -7,7 +7,7 @@
 count_unphased_hets(x)
 }
 \arguments{
-\item{x}{a variant string or vector of variant strings.}
+\item{x}{a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Count the number of unphased heterozygous loci in each variant string. Return
diff --git a/man/drop_read_counts.Rd b/man/drop_read_counts.Rd
index 4b37ad2..eb8a7b3 100644
--- a/man/drop_read_counts.Rd
+++ b/man/drop_read_counts.Rd
@@ -7,7 +7,7 @@
 drop_read_counts(x)
 }
 \arguments{
-\item{x}{a variant string or vector of variant strings.}
+\item{x}{a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Takes a vector of variant strings and strips and information on read counts.
diff --git a/man/extract_single_locus_variants.Rd b/man/extract_single_locus_variants.Rd
index e1ab02d..809cd42 100644
--- a/man/extract_single_locus_variants.Rd
+++ b/man/extract_single_locus_variants.Rd
@@ -7,7 +7,7 @@
 extract_single_locus_variants(x)
 }
 \arguments{
-\item{x}{a vector of variant strings.}
+\item{x}{a vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Takes a vector of variant strings, potentially with information at multiple
diff --git a/man/get_component_variants.Rd b/man/get_component_variants.Rd
index c3b294e..f8bcbb0 100644
--- a/man/get_component_variants.Rd
+++ b/man/get_component_variants.Rd
@@ -7,7 +7,7 @@
 get_component_variants(x)
 }
 \arguments{
-\item{x}{a vector of variant strings.}
+\item{x}{a vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 For a variant string with at most one heterozygous locus we can unambiguously
diff --git a/man/long_to_position.Rd b/man/long_to_position.Rd
index b15e6c3..c90a575 100644
--- a/man/long_to_position.Rd
+++ b/man/long_to_position.Rd
@@ -7,9 +7,9 @@
 long_to_position(x)
 }
 \arguments{
-\item{x}{a list of data.frames.}
+\item{x}{a list of data frames corresponding to position strings.}
 }
 \description{
-Takes a list of data.frames in long form and converts each to position string
+Takes a list of data frames in long form and converts each to position string
 format.
 }
diff --git a/man/long_to_variant.Rd b/man/long_to_variant.Rd
index a9aa2c3..33607b4 100644
--- a/man/long_to_variant.Rd
+++ b/man/long_to_variant.Rd
@@ -7,9 +7,9 @@
 long_to_variant(x)
 }
 \arguments{
-\item{x}{a list of data.frames.}
+\item{x}{a list of data frames corresponding to variant strings.}
 }
 \description{
-Takes a list of data.frames in long form and converts each to variant string
+Takes a list of data frames in long form and converts each to variant string
 format.
 }
diff --git a/man/order_position_string.Rd b/man/order_position_string.Rd
index da188e5..22875ec 100644
--- a/man/order_position_string.Rd
+++ b/man/order_position_string.Rd
@@ -7,7 +7,7 @@
 order_position_string(x)
 }
 \arguments{
-\item{x}{a position string or vector of position strings.}
+\item{x}{a position string or vector of position strings. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).}
 }
 \description{
 Reorders a position string in alphabetical order of genes. This can be useful
diff --git a/man/order_variant_string.Rd b/man/order_variant_string.Rd
index 67b2c7f..1a56b38 100644
--- a/man/order_variant_string.Rd
+++ b/man/order_variant_string.Rd
@@ -7,7 +7,7 @@
 order_variant_string(x)
 }
 \arguments{
-\item{x}{a variant string or vector of variant strings.}
+\item{x}{a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Reorders a variant string in alphabetical order of genes, and then
diff --git a/man/position_from_variant_string.Rd b/man/position_from_variant_string.Rd
index 5bcc2d4..6945b3a 100644
--- a/man/position_from_variant_string.Rd
+++ b/man/position_from_variant_string.Rd
@@ -7,7 +7,7 @@
 position_from_variant_string(x)
 }
 \arguments{
-\item{x}{a character string or vector of character strings.}
+\item{x}{a character string or vector of character strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Extract a position string from a variant string by stripping the amino acids.
diff --git a/man/position_to_long.Rd b/man/position_to_long.Rd
index 4dcce89..c6765c6 100644
--- a/man/position_to_long.Rd
+++ b/man/position_to_long.Rd
@@ -7,7 +7,7 @@
 position_to_long(x)
 }
 \arguments{
-\item{x}{a vector of position strings.}
+\item{x}{a vector of position strings. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).}
 }
 \description{
 Takes a vector of position strings and expands into a list of data.frames
diff --git a/man/subset_position.Rd b/man/subset_position.Rd
index 192ecb7..6e1519c 100644
--- a/man/subset_position.Rd
+++ b/man/subset_position.Rd
@@ -7,9 +7,9 @@
 subset_position(position_string, variant_strings)
 }
 \arguments{
-\item{position_string}{a single position string.}
+\item{position_string}{a single position string. Note, these are not internally checked for being valid position strings, it is up to the user to ensure this (see \code{?check_position_string}).}
 
-\item{variant_strings}{a variant string or vector of variant strings.}
+\item{variant_strings}{a variant string or vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Given a vector of variant strings and a single position string, subsets all
diff --git a/man/variant_to_long.Rd b/man/variant_to_long.Rd
index de632b4..d70b6da 100644
--- a/man/variant_to_long.Rd
+++ b/man/variant_to_long.Rd
@@ -7,7 +7,7 @@
 variant_to_long(x)
 }
 \arguments{
-\item{x}{a vector of variant strings.}
+\item{x}{a vector of variant strings. Note, these are not internally checked for being valid variant strings, it is up to the user to ensure this (see \code{?check_variant_string}).}
 }
 \description{
 Takes a vector of variant strings and expands into a list of data.frames

From d54388f42063b5efaa0eaffe8920336f7d9a9b28 Mon Sep 17 00:00:00 2001
From: Bob Verity <bobverity1@gmail.com>
Date: Fri, 28 Nov 2025 15:04:39 +0000
Subject: [PATCH 3/4] added CI

---
 .github/.gitignore                    |  1 +
 .github/workflows/checks_develop.yaml | 46 +++++++++++++++++++++++++++
 .github/workflows/checks_main.yaml    | 46 +++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 .github/.gitignore
 create mode 100644 .github/workflows/checks_develop.yaml
 create mode 100644 .github/workflows/checks_main.yaml

diff --git a/.github/.gitignore b/.github/.gitignore
new file mode 100644
index 0000000..2d19fc7
--- /dev/null
+++ b/.github/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/.github/workflows/checks_develop.yaml b/.github/workflows/checks_develop.yaml
new file mode 100644
index 0000000..551e1bc
--- /dev/null
+++ b/.github/workflows/checks_develop.yaml
@@ -0,0 +1,46 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches:
+      - develop
+  pull_request:
+    branches:
+      - develop
+
+name: checks_develop
+
+jobs:
+  R-CMD-check:
+
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: windows-latest, r: 'release'}
+          - {os: macOS-latest, r: 'release'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v3
+
+      - name: Set up R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+
+      - name: Install Dependencies
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - name: Build Package
+        uses: r-lib/actions/check-r-package@v2
+        with:
+          args: 'c("--no-manual", "--ignore-vignettes")'
+          build_args: 'c("--no-build-vignettes")'
diff --git a/.github/workflows/checks_main.yaml b/.github/workflows/checks_main.yaml
new file mode 100644
index 0000000..5414b7d
--- /dev/null
+++ b/.github/workflows/checks_main.yaml
@@ -0,0 +1,46 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+name: checks_main
+
+jobs:
+  R-CMD-check:
+
+    runs-on: ${{ matrix.config.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: windows-latest, r: 'release'}
+          - {os: macOS-latest, r: 'release'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v3
+
+      - name: Set up R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+
+      - name: Install Dependencies
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - name: Build Package
+        uses: r-lib/actions/check-r-package@v2
+        with:
+          args: 'c("--no-manual", "--ignore-vignettes")'
+          build_args: 'c("--no-build-vignettes")'

From 93e853c8f2017501acb8ecc9ba6711bda9be07b5 Mon Sep 17 00:00:00 2001
From: Bob Verity <bobverity1@gmail.com>
Date: Fri, 28 Nov 2025 15:09:46 +0000
Subject: [PATCH 4/4] added badges to readme

---
 README.Rmd | 4 +++-
 README.md  | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.Rmd b/README.Rmd
index 26b57a5..d30ac72 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -17,8 +17,10 @@ library(variantstring)
 
 # variantstring
 
-Tools for working with *variant string format*, a convenient format for encoding multi-locus genotypes. Functionalities include converting into and out of variant string format, subsetting based on genomic position, and comparing two variant strings to see if one is a subset of the other.
+[![main checks](https://github.com/mrc-ide/variantstring/workflows/checks_main/badge.svg)](https://github.com/mrc-ide/variantstring/actions)
+[![develop checks](https://github.com/mrc-ide/variantstring/workflows/checks_develop/badge.svg)](https://github.com/mrc-ide/variantstring/actions)
 
+Tools for working with *variant string format*, a convenient format for encoding multi-locus genotypes. Functionalities include converting into and out of variant string format, subsetting based on genomic position, and comparing two variant strings to see if one is a subset of the other.
 
 ## Installation
 
diff --git a/README.md b/README.md
index 3f90e61..635fc19 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,11 @@
 
 # variantstring
 
+[![main
+checks](https://github.com/mrc-ide/variantstring/workflows/checks_main/badge.svg)](https://github.com/mrc-ide/variantstring/actions)
+[![develop
+checks](https://github.com/mrc-ide/variantstring/workflows/checks_develop/badge.svg)](https://github.com/mrc-ide/variantstring/actions)
+
 Tools for working with *variant string format*, a convenient format for
 encoding multi-locus genotypes. Functionalities include converting into
 and out of variant string format, subsetting based on genomic position,