LimaRAF · LimaRAF · Apr 2, 2026 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: plantR
-Version: 0.2.0
+Version: 0.2.1
 Title: Managing Species Records from Biological Collections
 Type: Package
 Authors@R: c(person("Renato A.", "Ferreira de Lima", role = c("aut", "cre"),

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,19 @@
 
 <br/>
 
+## version 0.2.1
+
+* Improvements in `validateTax()`, which now takes into account auxiliary names of identifiers for the validation of taxonomic identifications (issues #209 and #213)
+
+* Minor improvements in `validateTax()` and accessory functions `prepDup()`, `mergeDup()` and `rmDup()` and the creation of a new internal function `getMergeCat()` to better deal with the merge and removal of duplicates (issues #211 and #212)
+
+* Changes in the internal object `fieldNames` and in the example datasets to due the changes in the names of columns in the CRIA API (issue #210)
+
+* Improvements in the function `checkList()` to better account for taxon authorships and some format improvements (issue #215)
+
+* Updates on the plantR internal dictionaries and SysData
+
+
 ## version 0.2.0
 
 * Version consolidating the changes from previous versions (0.1.8 to 0.1.10), that should require users to adapt their previous codes. Novelties and changes implemented in those versions were double-checked and some final and minor enhancements were done.

diff --git a/R/accessory.R b/R/accessory.R
@@ -151,3 +151,99 @@ checkColNames <- function(x = NULL, group = NULL) {
     return(x)
   }
 }
+
+#'
+#' @title Get Duplicate Merge Categories
+#'
+#' @param x a data frame or a data table
+#' @param dup.name character. The name of column in the input data
+#'   frame with the duplicate group ID. Default to the __plantR__
+#'   output 'dup.ID'.
+#' @param prop.name character. The name of column in the input data
+#'   frame with the proportion of duplicates found within the group
+#'   ID. Default to the __plantR__ output 'dup.prop'.
+#' @param prop numerical. The threshold value of proportion of
+#'   duplicated values retrieved (i.e. dup.prop) to enter the merging
+#'   routine. Should be between zero and one. Default to 0.75.
+#' @param rec.ID character. The name of the columns containing the
+#'   unique record identifier (see function `getTombo()`). Default to
+#'   'numTombo'.
+#'
+#' @returns a data table with an extra column called 'dup.merge'
+#'   containing the merge categories
+#'
+#' @details
+#' The merge category is a logical vector in which TRUE means that the
+#' records has a value of duplicated proportion (given in `prop.name`)
+#' equal or above the threshold defined in `prop` or records with
+#' duplicated proportion below the threshold but that have duplicated
+#' catalog numbers within the duplicated IDs defined in the argument
+#' `dup.name`.
+#'
+#' @keywords internal
+#'
+#' @importFrom data.table as.data.table setnames
+#'
+#' @noRd
+#'
+#' @examples
+#' df <- data.frame(numTombo = c("a1","b2","c3","c3","d5","d5","e7","f4","g9"),
+#'                   dup.ID = c("a1|b2","a1|b2","c3|c3","c3|c3","d5|d5|e7",
+#'                              "d5|d5|e7","d5|d5|e7","f4",NA),
+#'                   dup.prop = c(1, 1, 1, 1, 0.5, 1, 1, 1, NA))
+#' getMergeCat(df)
+#'
+getMergeCat <- function(x = NULL,
+                        dup.name = "dup.ID",
+                        prop.name = "dup.prop",
+                        prop = 0.75,
+                        rec.ID = "numTombo")
+  {
+
+  if (!inherits(x, c("data.frame", "data.table")))
+    stop("Input object needs to be a data frame or a data table!")
+
+  if (inherits(x, "data.frame")) {
+    dtb <- data.table::as.data.table(x)
+  } else {
+    dtb <- x
+  }
+
+  #Checking essential columns
+  if (!dup.name %in% names(dtb))
+    stop("Classification requires a column with the duplicate group ID")
+
+  if (!prop.name %in% names(dtb)) {
+    warning("Classification requires a column with the proportion of duplicates. Assuming to be 1")
+    dtb[, c(prop.name) := 1]
+  }
+
+  if (!rec.ID %in% names(dtb)) {
+    warning("Classification requires a column with the unique record identifier. Creating one")
+    dtb[, c(rec.ID) := .I]
+  }
+
+  # if (any(dtb[[prop.name]] > 1 | dtb[[prop.name]] < 0, na.rm = TRUE))
+  #     stop("Values provided in 'prop.name' must be between 0 and 1")
+
+  # Creating the temporary columns
+  dup.merge <- dup.rec.ID.wk <- temp.num.tombo.wk <- NULL
+  wk.cols <- c("temp.dup.ID.wk", "temp.dup.prop.wk", "temp.num.tombo.wk")
+  data.table::setnames(dtb, c(dup.name, prop.name, rec.ID), wk.cols)
+
+  # Creating the duplicate categories
+  dtb[, dup.merge := .SD >= prop, .SDcols = "temp.dup.prop.wk"][]
+
+  # Making sure that duplicated catalog numbers stay in their dup.ID
+  dtb[, dup.rec.ID.wk := duplicated(temp.num.tombo.wk) |
+                        duplicated(temp.num.tombo.wk, fromLast = TRUE),
+     by = "temp.dup.ID.wk"][]
+  dtb[dup.rec.ID.wk & !dup.merge, dup.merge := TRUE, by = "temp.dup.ID.wk"][]
+  dtb[, dup.rec.ID.wk := NULL][]
+
+  data.table::setnames(dtb, c(wk.cols, "dup.merge"),
+                       c(dup.name, prop.name, rec.ID, "dup.merge"))
+  return(dtb)
+
+}
+
diff --git a/R/checklist.R b/R/checklist.R
@@ -106,6 +106,7 @@ checkList <- function(x,
   typeStatus <- temp.rec.numb <- ordem <- temp.pais <- NULL
   coletores <- temp.accession <- numTombo <- vchrs <- NULL
   datas <- datas.tipo <- lista.vouchs <- . <- NULL
+  tmp.rec.by <- tmp.tax.name <- NULL
 
   ## PREPARING THE TABLE ##
   # Select which co-variables will be used (priority to edited columns)
@@ -155,77 +156,90 @@ checkList <- function(x,
   if (length(changeCols) > 0)
     dt[,(changeCols):= lapply(.SD, as.character), .SDcols = changeCols]
 
+  # getting the unique taxon + author combinations
+  tax.cols <- covs.final[covs.final %in% c(covs.present[["species"]],
+                                 covs.present[["authors"]])]
+  if (length(tax.cols) > 1) {
+    dt[, tmp.tax.name := do.call(paste, .SD), .SDcols = c(tax.cols)]
+    rm.na <- paste0(rep(NA, length(tax.cols)), collapse = " ")
+    dt[tmp.tax.name == rm.na, tmp.tax.name := NA_character_]
+    dt[, tmp.tax.name := gsub("NA\\s+", "", tmp.tax.name, perl = TRUE),]
+    dt[, tmp.tax.name := gsub("\\sNA+", "", tmp.tax.name, perl = TRUE),]
+  } else {
+    dt[, tmp.tax.name := .SD, .SDcols = tax.cols[1]]
+  }
+
   # getting the list of taxa and the selected columns
-  data.table::setindexv(dt, covs.present[["species"]])
-  checklist <- data.frame(unique(dt, by= c(covs.present[["species"]],
-                                           covs.present[["authors"]])))
+  data.table::setindexv(dt, "tmp.tax.name")
+  checklist <- data.frame(unique(dt, by = c("tmp.tax.name")))
   cols <-
     c(unlist(covs.present[names(covs.present) %in% c("families", "species", "authors")]),
-            "scientific.name")
+            "scientific.name", "tmp.tax.name")
   checklist <- checklist[, names(checklist) %in% cols]
   checklist$records <- NA
   checklist$tax.CL <- NA
   checklist$geo.CL <- NA
   checklist$vouchers <- NA
 
   ## NUMBER OF RECORDS PER SPECIES ##
-  records <- dt[, .N , by = c(covs.present[["species"]])]
+  records <- dt[, .N , by = c("tmp.tax.name")]
 
   if ("dup.ID" %in% names(dt)) {
 
     unicatas <- dt[is.na(dup.ID), .N ,
-                   by = c(covs.present[["species"]])]
+                   by = c("tmp.tax.name")]
     unicatas <- merge(records, unicatas,
-                      by = c(covs.present[["species"]]),
+                      by = c("tmp.tax.name"),
                       all.x = TRUE,
                       suffixes = c("", ".unis"))
     unicatas[, N := NULL]
 
     duplicatas <-
       dt[!is.na(dup.ID), .N,
-         by = c(covs.present[["species"]], "dup.ID")]
-    duplicatas <- duplicatas[, .N , by = c(covs.present[["species"]])]
+         by = c("tmp.tax.name", "dup.ID")]
+    duplicatas <- duplicatas[, .N , by = c("tmp.tax.name")]
     duplicatas <-
       data.table::merge.data.table(records, duplicatas,
-                                   by = c(covs.present[["species"]]),
+                                   by = c("tmp.tax.name"),
                                    all.x = TRUE,
                                    suffixes = c("", ".dups"))
     duplicatas[, N:= NULL]
 
-    records <- records[unicatas, on = c(covs.present[["species"]])]
-    records <- records[duplicatas, on = c(covs.present[["species"]])]
+    records <- records[unicatas, on = c("tmp.tax.name")]
+    records <- records[duplicatas, on = c("tmp.tax.name")]
 
     checklist$records <-
-      records$N[match(checklist[, covs.present[["species"]]],
+      records$N[match(checklist[, "tmp.tax.name"],
                       data.frame(records)[,1])]
   } else {
     checklist$records <-
-      records$N[match(checklist[, covs.present[["species"]]],
+      records$N[match(checklist[, "tmp.tax.name"],
                       data.frame(records)[,1])]
   }
 
 
   ## TAXONOMIC CONFIDENCE LEVEL  ##
   if (!is.na(covs.present[["taxonomy"]])) {
     # Proportion of validate identifications per species
-    colunas <- c(covs.present[["taxonomy"]], covs.present[["species"]])
+    colunas <- c(covs.present[["taxonomy"]], "tmp.tax.name")
     data.table::setkeyv(dt, cols = colunas)
     taxs <- dt[, .N,
-               by = c(covs.present[["species"]], covs.present[["taxonomy"]])]
+               by = c("tmp.tax.name", covs.present[["taxonomy"]])]
     #dt[data.table::CJ(tax.check1, scientificName.new, unique = TRUE), .N, by = .EACHI]
 
     vals <- c("unknown", "low", "medium", "high")
     all.taxs <- data.table::CJ(unique(data.frame(taxs)[,1]),
                                vals, 0, unique = TRUE)
     names(all.taxs) <- names(taxs)
-    all.taxs <- merge(all.taxs, taxs, by = c(covs.present[["species"]], covs.present[["taxonomy"]]),
+    all.taxs <- merge(all.taxs, taxs, by = c("tmp.tax.name", covs.present[["taxonomy"]]),
                       all.x = TRUE, suffixes = c(".all", ""))
     all.taxs <- all.taxs[, N.all := NULL]
     all.taxs <- all.taxs[data.frame(all.taxs)[,2] %in% "high", ]
     all.taxs[is.na(N), N := 0]
 
     # Saving the result
-    checklist$tax.CL <- round(100 * all.taxs$N[match(checklist[,1], data.frame(all.taxs)[,1])]/
+    checklist$tax.CL <- round(100 * all.taxs$N[match(checklist[["tmp.tax.name"]],
+                                                     all.taxs[["tmp.tax.name"]])]/
                                 checklist$records, 2)
   }
 
@@ -237,7 +251,7 @@ checkList <- function(x,
     dt[temp.geo.check %in% c("ok_county", "ok_locality"), temp.geo.check := "1"]
     dt[!temp.geo.check %in% "1", temp.geo.check := "0"]
 
-    colunas <- c("temp.geo.check", covs.present[["species"]])
+    colunas <- c("temp.geo.check", "tmp.tax.name")
     data.table::setkeyv(dt, cols = colunas)
     coords <- dt[, .N, by = colunas]
 
@@ -246,7 +260,8 @@ checkList <- function(x,
     coords <- coords[, temp.geo.check := NULL]
     dt[ , temp.geo.check := NULL]
 
-    checklist$geo.CL <- round(100 * coords$N[match(checklist[,1], data.frame(coords)[,1])]/
+    checklist$geo.CL <- round(100 * coords$N[match(checklist[["tmp.tax.name"]],
+                                                   coords[["tmp.tax.name"]])]/
                                 checklist$records, 2)
   }
 
@@ -268,7 +283,8 @@ checkList <- function(x,
         dt[c("s.n."), priority := priority - 3, nomatch = NULL])
     }
     temp <- data.frame(dt[, lapply(.SD, nchar),
-                          by = c(covs.present[["collectors"]]), .SDcols = c(covs.present[["collectors"]])])
+                          by = c(covs.present[["collectors"]]),
+                          .SDcols = c(covs.present[["collectors"]])])
     dt[ temp[,2] < 4, priority := priority - 3]
   }
 
@@ -299,7 +315,7 @@ checkList <- function(x,
     dt[, ordem := 1:dim(dt)[1],]
     data.table::setnames(dt, covs.present[["countries"]], "temp.pais")
     temp <- dt[, .(ordem, temp.pais, !duplicated(temp.pais, incomparables = NA_character_)),
-               by = c(covs.present[["species"]])]
+               by = c("tmp.tax.name")]
     temp$V3 <- !unlist(temp$V3)
     # temp$V4 <- !unlist(dt[, .(ordem, temp.pais, !duplicated(temp.pais, incomparables = NA_character_, fromLast = TRUE)),
     #            by = c(covs.present[["species"]])]$V3)
@@ -339,15 +355,17 @@ checkList <- function(x,
   #priority from the same collector or same county?
 
   # Organizing and filtering records based on the ranks by species
-  data.table::setorderv(dt, c(covs.present[["species"]], "priority"), c(1,-1))
+  data.table::setorderv(dt, c("tmp.tax.name", "priority"), c(1,-1))
   dt1 <- dt[dt[, .I[1:n.vouch],
-               by = c(covs.present[["species"]])]$V1]
+               by = c("tmp.tax.name")]$V1]
   dt1 <- dt1[rowSums(is.na(dt1)) < dim(dt1)[2],]
 
   ## GENERATING THE LIST OF VOUCHERS ##
   # Collector name and number
-  dt1[ , coletores := do.call(paste, c(.SD, sep=", ")),
-       .SDcols = c(covs.present[["collectors"]], covs.present[["recordNumber"]])]
+  dt1[ , tmp.rec.by := lapply(.SD, prepName, format = "init_last"),
+       .SDcols = covs.present[["collectors"]]]
+  dt1[ , coletores := do.call(paste, c(.SD, sep=" ")),
+       .SDcols = c("tmp.rec.by", covs.present[["recordNumber"]])]
 
   if (type == "short") { # Inspired in the Flora do Brail format
 
@@ -357,7 +375,7 @@ checkList <- function(x,
 
     ## Accession numbers
     if (!"temp.accession" %in% names(dt1))
-      dt1[ , temp.accession := do.call(paste, c(.SD, sep=", ")),
+      dt1[ , temp.accession := do.call(paste, c(.SD, sep=" ")),
            .SDcols = c(covs.present[["collections"]], covs.present[["catalog"]])]
 
     #correcting accessions numbers for duplicates across herbaria
@@ -385,14 +403,14 @@ checkList <- function(x,
     }
 
     #combining all voucher into a single string
-    data.table::setorderv(dt1, c(covs.present[["species"]], "vchrs"), c(1,1))
+    data.table::setorderv(dt1, c("tmp.tax.name", "vchrs"), c(1,1))
     dt2 <- dt1[ , do.call(paste, c(.SD, collapse= "; ",sep="")),
-                by = c(covs.present[["species"]]),
+                by = c("tmp.tax.name"),
                 .SDcols = "vchrs"]
 
     checklist$vouchers <-
-      as.character(dt2$V1[match(checklist[,covs.present[["species"]]],
-                                data.frame(dt2)[,1])])
+      as.character(dt2$V1[match(checklist[["tmp.tax.name"]],
+                                dt2[["tmp.tax.name"]])])
   }
 
   if (type == "selected") { # From 'species examined' in Flora Neotropica
@@ -407,8 +425,9 @@ checkList <- function(x,
 
     #collectionCode
     if (!"temp.accession" %in% names(dt1))
-      dt1[ , temp.accession := .SD,
-           .SDcols = c(covs.present[["collections"]])]
+      dt1[ , temp.accession := do.call(paste, c(.SD, sep=" ")),
+         .SDcols = c(covs.present[["collections"]], covs.present[["catalog"]])]
+
 
     #correcting accessions numbers for duplicates across herbaria
     if ("dup.ID" %in% names(dt1)) {
@@ -512,22 +531,22 @@ checkList <- function(x,
          .SDcols = c("locais", "datas", "vchrs")]
 
     #combining all voucher into a single string
-    data.table::setorderv(dt1, c(covs.present[["species"]], "lista.vouchs"), c(1,1))
+    data.table::setorderv(dt1, c("tmp.tax.name", "lista.vouchs"), c(1,1))
     dt2 <- dt1[ , do.call(paste, c(.SD, collapse= "; ",sep="")),
-                by = c(covs.present[["species"]]),
+                by = c("tmp.tax.name"),
                 .SDcols = "lista.vouchs"]
     #Saving
     checklist$vouchers <-
-      as.character(dt2$V1[match(checklist[, covs.present[["species"]]],
-                                data.frame(dt2)[,1])])
+      as.character(dt2$V1[match(checklist[["tmp.tax.name"]],
+                                dt2[["tmp.tax.name"]])])
   }
 
   # Organizing and ordering the output
   cols <- as.character(
-    c(unlist(covs.present[names(covs.present) %in% c("families", "species")]),
+    c(unlist(covs.present[names(covs.present) %in% c("families", "species", "authors")]),
       "scientific.name", "records", "tax.CL", "geo.CL","vouchers"))
-  if (all(c(covs.present[["species"]], "scientific.name") %in% names(checklist)))
-    cols <- cols[!cols %in% covs.present[["species"]]]
+  if (all(c("tmp.tax.name", "scientific.name") %in% names(checklist)))
+    cols <- cols[!cols %in% "tmp.tax.name"]
 
   cols <- cols[cols %in% names(checklist)]
   checklist <- checklist[, cols]