Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: plantR
Version: 0.2.0
Version: 0.2.1
Title: Managing Species Records from Biological Collections
Type: Package
Authors@R: c(person("Renato A.", "Ferreira de Lima", role = c("aut", "cre"),
Expand Down
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@

<br/>

## version 0.2.1

* Improvements in `validateTax()`, which now takes into account auxiliary names of identifiers for the validation of taxonomic identifications (issues #209 and #213)

* Minor improvements in `validateTax()` and accessory functions `prepDup()`, `mergeDup()` and `rmDup()` and the creation of a new internal function `getMergeCat()` to better deal with the merge and removal of duplicates (issues #211 and #212)

* Changes in the internal object `fieldNames` and in the example datasets to due the changes in the names of columns in the CRIA API (issue #210)

* Improvements in the function `checkList()` to better account for taxon authorships and some format improvements (issue #215)

* Updates on the plantR internal dictionaries and SysData


## version 0.2.0

* Version consolidating the changes from previous versions (0.1.8 to 0.1.10), that should require users to adapt their previous codes. Novelties and changes implemented in those versions were double-checked and some final and minor enhancements were done.
Expand Down
96 changes: 96 additions & 0 deletions R/accessory.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,99 @@ checkColNames <- function(x = NULL, group = NULL) {
return(x)
}
}

#'
#' @title Get Duplicate Merge Categories
#'
#' @param x a data frame or a data table
#' @param dup.name character. The name of column in the input data
#' frame with the duplicate group ID. Default to the __plantR__
#' output 'dup.ID'.
#' @param prop.name character. The name of column in the input data
#' frame with the proportion of duplicates found within the group
#' ID. Default to the __plantR__ output 'dup.prop'.
#' @param prop numerical. The threshold value of proportion of
#' duplicated values retrieved (i.e. dup.prop) to enter the merging
#' routine. Should be between zero and one. Default to 0.75.
#' @param rec.ID character. The name of the columns containing the
#' unique record identifier (see function `getTombo()`). Default to
#' 'numTombo'.
#'
#' @returns a data table with an extra column called 'dup.merge'
#' containing the merge categories
#'
#' @details
#' The merge category is a logical vector in which TRUE means that the
#' records has a value of duplicated proportion (given in `prop.name`)
#' equal or above the threshold defined in `prop` or records with
#' duplicated proportion below the threshold but that have duplicated
#' catalog numbers within the duplicated IDs defined in the argument
#' `dup.name`.
#'
#' @keywords internal
#'
#' @importFrom data.table as.data.table setnames
#'
#' @noRd
#'
#' @examples
#' df <- data.frame(numTombo = c("a1","b2","c3","c3","d5","d5","e7","f4","g9"),
#' dup.ID = c("a1|b2","a1|b2","c3|c3","c3|c3","d5|d5|e7",
#' "d5|d5|e7","d5|d5|e7","f4",NA),
#' dup.prop = c(1, 1, 1, 1, 0.5, 1, 1, 1, NA))
#' getMergeCat(df)
#'
getMergeCat <- function(x = NULL,
dup.name = "dup.ID",
prop.name = "dup.prop",
prop = 0.75,
rec.ID = "numTombo")
{

if (!inherits(x, c("data.frame", "data.table")))
stop("Input object needs to be a data frame or a data table!")

if (inherits(x, "data.frame")) {
dtb <- data.table::as.data.table(x)
} else {
dtb <- x
}

#Checking essential columns
if (!dup.name %in% names(dtb))
stop("Classification requires a column with the duplicate group ID")

if (!prop.name %in% names(dtb)) {
warning("Classification requires a column with the proportion of duplicates. Assuming to be 1")
dtb[, c(prop.name) := 1]
}

if (!rec.ID %in% names(dtb)) {
warning("Classification requires a column with the unique record identifier. Creating one")
dtb[, c(rec.ID) := .I]
}

# if (any(dtb[[prop.name]] > 1 | dtb[[prop.name]] < 0, na.rm = TRUE))
# stop("Values provided in 'prop.name' must be between 0 and 1")

# Creating the temporary columns
dup.merge <- dup.rec.ID.wk <- temp.num.tombo.wk <- NULL
wk.cols <- c("temp.dup.ID.wk", "temp.dup.prop.wk", "temp.num.tombo.wk")
data.table::setnames(dtb, c(dup.name, prop.name, rec.ID), wk.cols)

# Creating the duplicate categories
dtb[, dup.merge := .SD >= prop, .SDcols = "temp.dup.prop.wk"][]

# Making sure that duplicated catalog numbers stay in their dup.ID
dtb[, dup.rec.ID.wk := duplicated(temp.num.tombo.wk) |
duplicated(temp.num.tombo.wk, fromLast = TRUE),
by = "temp.dup.ID.wk"][]
dtb[dup.rec.ID.wk & !dup.merge, dup.merge := TRUE, by = "temp.dup.ID.wk"][]
dtb[, dup.rec.ID.wk := NULL][]

data.table::setnames(dtb, c(wk.cols, "dup.merge"),
c(dup.name, prop.name, rec.ID, "dup.merge"))
return(dtb)

}

99 changes: 59 additions & 40 deletions R/checklist.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ checkList <- function(x,
typeStatus <- temp.rec.numb <- ordem <- temp.pais <- NULL
coletores <- temp.accession <- numTombo <- vchrs <- NULL
datas <- datas.tipo <- lista.vouchs <- . <- NULL
tmp.rec.by <- tmp.tax.name <- NULL

## PREPARING THE TABLE ##
# Select which co-variables will be used (priority to edited columns)
Expand Down Expand Up @@ -155,77 +156,90 @@ checkList <- function(x,
if (length(changeCols) > 0)
dt[,(changeCols):= lapply(.SD, as.character), .SDcols = changeCols]

# getting the unique taxon + author combinations
tax.cols <- covs.final[covs.final %in% c(covs.present[["species"]],
covs.present[["authors"]])]
if (length(tax.cols) > 1) {
dt[, tmp.tax.name := do.call(paste, .SD), .SDcols = c(tax.cols)]
rm.na <- paste0(rep(NA, length(tax.cols)), collapse = " ")
dt[tmp.tax.name == rm.na, tmp.tax.name := NA_character_]
dt[, tmp.tax.name := gsub("NA\\s+", "", tmp.tax.name, perl = TRUE),]
dt[, tmp.tax.name := gsub("\\sNA+", "", tmp.tax.name, perl = TRUE),]
} else {
dt[, tmp.tax.name := .SD, .SDcols = tax.cols[1]]
}

# getting the list of taxa and the selected columns
data.table::setindexv(dt, covs.present[["species"]])
checklist <- data.frame(unique(dt, by= c(covs.present[["species"]],
covs.present[["authors"]])))
data.table::setindexv(dt, "tmp.tax.name")
checklist <- data.frame(unique(dt, by = c("tmp.tax.name")))
cols <-
c(unlist(covs.present[names(covs.present) %in% c("families", "species", "authors")]),
"scientific.name")
"scientific.name", "tmp.tax.name")
checklist <- checklist[, names(checklist) %in% cols]
checklist$records <- NA
checklist$tax.CL <- NA
checklist$geo.CL <- NA
checklist$vouchers <- NA

## NUMBER OF RECORDS PER SPECIES ##
records <- dt[, .N , by = c(covs.present[["species"]])]
records <- dt[, .N , by = c("tmp.tax.name")]

if ("dup.ID" %in% names(dt)) {

unicatas <- dt[is.na(dup.ID), .N ,
by = c(covs.present[["species"]])]
by = c("tmp.tax.name")]
unicatas <- merge(records, unicatas,
by = c(covs.present[["species"]]),
by = c("tmp.tax.name"),
all.x = TRUE,
suffixes = c("", ".unis"))
unicatas[, N := NULL]

duplicatas <-
dt[!is.na(dup.ID), .N,
by = c(covs.present[["species"]], "dup.ID")]
duplicatas <- duplicatas[, .N , by = c(covs.present[["species"]])]
by = c("tmp.tax.name", "dup.ID")]
duplicatas <- duplicatas[, .N , by = c("tmp.tax.name")]
duplicatas <-
data.table::merge.data.table(records, duplicatas,
by = c(covs.present[["species"]]),
by = c("tmp.tax.name"),
all.x = TRUE,
suffixes = c("", ".dups"))
duplicatas[, N:= NULL]

records <- records[unicatas, on = c(covs.present[["species"]])]
records <- records[duplicatas, on = c(covs.present[["species"]])]
records <- records[unicatas, on = c("tmp.tax.name")]
records <- records[duplicatas, on = c("tmp.tax.name")]

checklist$records <-
records$N[match(checklist[, covs.present[["species"]]],
records$N[match(checklist[, "tmp.tax.name"],
data.frame(records)[,1])]
} else {
checklist$records <-
records$N[match(checklist[, covs.present[["species"]]],
records$N[match(checklist[, "tmp.tax.name"],
data.frame(records)[,1])]
}


## TAXONOMIC CONFIDENCE LEVEL ##
if (!is.na(covs.present[["taxonomy"]])) {
# Proportion of validate identifications per species
colunas <- c(covs.present[["taxonomy"]], covs.present[["species"]])
colunas <- c(covs.present[["taxonomy"]], "tmp.tax.name")
data.table::setkeyv(dt, cols = colunas)
taxs <- dt[, .N,
by = c(covs.present[["species"]], covs.present[["taxonomy"]])]
by = c("tmp.tax.name", covs.present[["taxonomy"]])]
#dt[data.table::CJ(tax.check1, scientificName.new, unique = TRUE), .N, by = .EACHI]

vals <- c("unknown", "low", "medium", "high")
all.taxs <- data.table::CJ(unique(data.frame(taxs)[,1]),
vals, 0, unique = TRUE)
names(all.taxs) <- names(taxs)
all.taxs <- merge(all.taxs, taxs, by = c(covs.present[["species"]], covs.present[["taxonomy"]]),
all.taxs <- merge(all.taxs, taxs, by = c("tmp.tax.name", covs.present[["taxonomy"]]),
all.x = TRUE, suffixes = c(".all", ""))
all.taxs <- all.taxs[, N.all := NULL]
all.taxs <- all.taxs[data.frame(all.taxs)[,2] %in% "high", ]
all.taxs[is.na(N), N := 0]

# Saving the result
checklist$tax.CL <- round(100 * all.taxs$N[match(checklist[,1], data.frame(all.taxs)[,1])]/
checklist$tax.CL <- round(100 * all.taxs$N[match(checklist[["tmp.tax.name"]],
all.taxs[["tmp.tax.name"]])]/
checklist$records, 2)
}

Expand All @@ -237,7 +251,7 @@ checkList <- function(x,
dt[temp.geo.check %in% c("ok_county", "ok_locality"), temp.geo.check := "1"]
dt[!temp.geo.check %in% "1", temp.geo.check := "0"]

colunas <- c("temp.geo.check", covs.present[["species"]])
colunas <- c("temp.geo.check", "tmp.tax.name")
data.table::setkeyv(dt, cols = colunas)
coords <- dt[, .N, by = colunas]

Expand All @@ -246,7 +260,8 @@ checkList <- function(x,
coords <- coords[, temp.geo.check := NULL]
dt[ , temp.geo.check := NULL]

checklist$geo.CL <- round(100 * coords$N[match(checklist[,1], data.frame(coords)[,1])]/
checklist$geo.CL <- round(100 * coords$N[match(checklist[["tmp.tax.name"]],
coords[["tmp.tax.name"]])]/
checklist$records, 2)
}

Expand All @@ -268,7 +283,8 @@ checkList <- function(x,
dt[c("s.n."), priority := priority - 3, nomatch = NULL])
}
temp <- data.frame(dt[, lapply(.SD, nchar),
by = c(covs.present[["collectors"]]), .SDcols = c(covs.present[["collectors"]])])
by = c(covs.present[["collectors"]]),
.SDcols = c(covs.present[["collectors"]])])
dt[ temp[,2] < 4, priority := priority - 3]
}

Expand Down Expand Up @@ -299,7 +315,7 @@ checkList <- function(x,
dt[, ordem := 1:dim(dt)[1],]
data.table::setnames(dt, covs.present[["countries"]], "temp.pais")
temp <- dt[, .(ordem, temp.pais, !duplicated(temp.pais, incomparables = NA_character_)),
by = c(covs.present[["species"]])]
by = c("tmp.tax.name")]
temp$V3 <- !unlist(temp$V3)
# temp$V4 <- !unlist(dt[, .(ordem, temp.pais, !duplicated(temp.pais, incomparables = NA_character_, fromLast = TRUE)),
# by = c(covs.present[["species"]])]$V3)
Expand Down Expand Up @@ -339,15 +355,17 @@ checkList <- function(x,
#priority from the same collector or same county?

# Organizing and filtering records based on the ranks by species
data.table::setorderv(dt, c(covs.present[["species"]], "priority"), c(1,-1))
data.table::setorderv(dt, c("tmp.tax.name", "priority"), c(1,-1))
dt1 <- dt[dt[, .I[1:n.vouch],
by = c(covs.present[["species"]])]$V1]
by = c("tmp.tax.name")]$V1]
dt1 <- dt1[rowSums(is.na(dt1)) < dim(dt1)[2],]

## GENERATING THE LIST OF VOUCHERS ##
# Collector name and number
dt1[ , coletores := do.call(paste, c(.SD, sep=", ")),
.SDcols = c(covs.present[["collectors"]], covs.present[["recordNumber"]])]
dt1[ , tmp.rec.by := lapply(.SD, prepName, format = "init_last"),
.SDcols = covs.present[["collectors"]]]
dt1[ , coletores := do.call(paste, c(.SD, sep=" ")),
.SDcols = c("tmp.rec.by", covs.present[["recordNumber"]])]

if (type == "short") { # Inspired in the Flora do Brail format

Expand All @@ -357,7 +375,7 @@ checkList <- function(x,

## Accession numbers
if (!"temp.accession" %in% names(dt1))
dt1[ , temp.accession := do.call(paste, c(.SD, sep=", ")),
dt1[ , temp.accession := do.call(paste, c(.SD, sep=" ")),
.SDcols = c(covs.present[["collections"]], covs.present[["catalog"]])]

#correcting accessions numbers for duplicates across herbaria
Expand Down Expand Up @@ -385,14 +403,14 @@ checkList <- function(x,
}

#combining all voucher into a single string
data.table::setorderv(dt1, c(covs.present[["species"]], "vchrs"), c(1,1))
data.table::setorderv(dt1, c("tmp.tax.name", "vchrs"), c(1,1))
dt2 <- dt1[ , do.call(paste, c(.SD, collapse= "; ",sep="")),
by = c(covs.present[["species"]]),
by = c("tmp.tax.name"),
.SDcols = "vchrs"]

checklist$vouchers <-
as.character(dt2$V1[match(checklist[,covs.present[["species"]]],
data.frame(dt2)[,1])])
as.character(dt2$V1[match(checklist[["tmp.tax.name"]],
dt2[["tmp.tax.name"]])])
}

if (type == "selected") { # From 'species examined' in Flora Neotropica
Expand All @@ -407,8 +425,9 @@ checkList <- function(x,

#collectionCode
if (!"temp.accession" %in% names(dt1))
dt1[ , temp.accession := .SD,
.SDcols = c(covs.present[["collections"]])]
dt1[ , temp.accession := do.call(paste, c(.SD, sep=" ")),
.SDcols = c(covs.present[["collections"]], covs.present[["catalog"]])]


#correcting accessions numbers for duplicates across herbaria
if ("dup.ID" %in% names(dt1)) {
Expand Down Expand Up @@ -512,22 +531,22 @@ checkList <- function(x,
.SDcols = c("locais", "datas", "vchrs")]

#combining all voucher into a single string
data.table::setorderv(dt1, c(covs.present[["species"]], "lista.vouchs"), c(1,1))
data.table::setorderv(dt1, c("tmp.tax.name", "lista.vouchs"), c(1,1))
dt2 <- dt1[ , do.call(paste, c(.SD, collapse= "; ",sep="")),
by = c(covs.present[["species"]]),
by = c("tmp.tax.name"),
.SDcols = "lista.vouchs"]
#Saving
checklist$vouchers <-
as.character(dt2$V1[match(checklist[, covs.present[["species"]]],
data.frame(dt2)[,1])])
as.character(dt2$V1[match(checklist[["tmp.tax.name"]],
dt2[["tmp.tax.name"]])])
}

# Organizing and ordering the output
cols <- as.character(
c(unlist(covs.present[names(covs.present) %in% c("families", "species")]),
c(unlist(covs.present[names(covs.present) %in% c("families", "species", "authors")]),
"scientific.name", "records", "tax.CL", "geo.CL","vouchers"))
if (all(c(covs.present[["species"]], "scientific.name") %in% names(checklist)))
cols <- cols[!cols %in% covs.present[["species"]]]
if (all(c("tmp.tax.name", "scientific.name") %in% names(checklist)))
cols <- cols[!cols %in% "tmp.tax.name"]

cols <- cols[cols %in% names(checklist)]
checklist <- checklist[, cols]
Expand Down
Loading
Loading