From 30e6c68bdaab5a67bf6f78ef46f67813fea3829e Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Wed, 19 Jun 2019 14:10:10 -0400 Subject: [PATCH 01/13] Define two different function options for constant extrapolation --- NAMESPACE | 1 + R/extrapolators.R | 85 +++++++++++++++++++++++++++++ man/extrapolate_constant.Rd | 28 ++++++++++ man/extrapolate_constant_optionB.Rd | 29 ++++++++++ man/last_n.Rd | 26 +++++++++ 5 files changed, 169 insertions(+) create mode 100644 R/extrapolators.R create mode 100644 man/extrapolate_constant.Rd create mode 100644 man/extrapolate_constant_optionB.Rd create mode 100644 man/last_n.Rd diff --git a/NAMESPACE b/NAMESPACE index 8d8b27837..fb409a051 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ export(run_xml_tests) export(standardize_iso) export(unprotect_integer_cols) importFrom(assertthat,assert_that) +importFrom(assertthat,is.scalar) importFrom(data.table,data.table) importFrom(dplyr,anti_join) importFrom(dplyr,arrange) diff --git a/R/extrapolators.R b/R/extrapolators.R new file mode 100644 index 000000000..18b35aeef --- /dev/null +++ b/R/extrapolators.R @@ -0,0 +1,85 @@ +# extrapolators.R +# helper extrapolator functions for use in base year changes + +#' extrapolate_constant +#' +#' computes the mean of last n values and uses this constant value to fill in +#' missing years at the end of the time series +#' +#' @param x Vector of values with NA's to be filled in via constant extrapolation +#' of the mean of last n non-NA values. +#' @param n Number of non-NA values to be averaged to provide the filler value. +#' @details Computes the mean of last n non-NA values of input vector x +#' and uses this constant value to fill in NA values in x. +#' @return Vector with all NA values replaced with the specified mean. +#' @importFrom assertthat assert_that is.scalar +#' @author ACS June 2019 +extrapolate_constant <- function(x, n=1){ + + assert_that(is.numeric(x)) + assert_that(is.scalar(n)) + + meanval <- mean(last_n(x,n)) + + x[is.na(x)] <- meanval + + return(x) +} + + +#' extrapolate_constant_optionB +#' +#' computes the mean of last n values and uses this constant value to fill in +#' missing years at the end of the time series +#' +#' @param x Tibble with column name 'value' that contains NA values to be filled in +#' via constant extrapolation of the mean of last n non-NA values. +#' @param n Number of non-NA values to be averaged to provide the filler value. +#' @details Computes the mean of last n non-NA values of column 'value' in an input tibble +#' (may be grouped) and uses this constant value to fill in NA values in column 'value'. +#' @return Tibble with column name 'value' in which all NA values have been filled +#' in with the specified mean value. +#' @importFrom assertthat assert_that is.scalar +#' @author ACS June 2019 +extrapolate_constant_optionB <- function(x, n=1){ + + assert_that(is.tibble(x)) + assert_that(is.scalar(n)) + + + # can't use the faster replace_na function because + # if x is grouped, e.g. on iso, each group will have + # its own, specific mean value to replace NAs with. + x %>% + mutate(meanval = mean(last_n(value, n)), + value = if_else(is.na(value), + meanval, + value)) %>% + select(-meanval) +} + + +#' last_n +#' +#' finds the last n non-NA values in an input vector. +#' +#' @param x Vector with some NA values +#' @param n The number of non-NA values sought. +#' @details finds the last n non-NA values in an input vector. +#' @return A vector with the last n non-NA values from input +#' vector x. +#' @importFrom assertthat assert_that is.scalar +#' @author ACS June 2019 +last_n <- function(x, n){ + + assert_that(is.scalar(n)) + + if(n > length(x[!is.na(x)])){ + stop('asking for more nonNA years than you have.') + } + + + return(tail(x[!is.na(x)], n)) +} + + diff --git a/man/extrapolate_constant.Rd b/man/extrapolate_constant.Rd new file mode 100644 index 000000000..6b4ab327a --- /dev/null +++ b/man/extrapolate_constant.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extrapolators.R +\name{extrapolate_constant} +\alias{extrapolate_constant} +\title{extrapolate_constant} +\usage{ +extrapolate_constant(x, n = 1) +} +\arguments{ +\item{x}{Vector of values with NA's to be filled in via constant extrapolation +of the mean of last n non-NA values.} + +\item{n}{Number of non-NA values to be averaged to provide the filler value.} +} +\value{ +Vector with all NA values replaced with the specified mean. +} +\description{ +computes the mean of last n values and uses this constant value to fill in + missing years at the end of the time series +} +\details{ +Computes the mean of last n non-NA values of input vector x +and uses this constant value to fill in NA values in x. +} +\author{ +ACS June 2019 +} diff --git a/man/extrapolate_constant_optionB.Rd b/man/extrapolate_constant_optionB.Rd new file mode 100644 index 000000000..c2209ac93 --- /dev/null +++ b/man/extrapolate_constant_optionB.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extrapolators.R +\name{extrapolate_constant_optionB} +\alias{extrapolate_constant_optionB} +\title{extrapolate_constant_optionB} +\usage{ +extrapolate_constant_optionB(x, n = 1) +} +\arguments{ +\item{x}{Tibble with column name 'value' that contains NA values to be filled in +via constant extrapolation of the mean of last n non-NA values.} + +\item{n}{Number of non-NA values to be averaged to provide the filler value.} +} +\value{ +Tibble with column name 'value' in which all NA values have been filled +in with the specified mean value. +} +\description{ +computes the mean of last n values and uses this constant value to fill in + missing years at the end of the time series +} +\details{ +Computes the mean of last n non-NA values of column 'value' in an input tibble +(may be grouped) and uses this constant value to fill in NA values in column 'value'. +} +\author{ +ACS June 2019 +} diff --git a/man/last_n.Rd b/man/last_n.Rd new file mode 100644 index 000000000..53c9ae579 --- /dev/null +++ b/man/last_n.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extrapolators.R +\name{last_n} +\alias{last_n} +\title{last_n} +\usage{ +last_n(x, n) +} +\arguments{ +\item{x}{Vector with some NA values} + +\item{n}{The number of non-NA values sought.} +} +\value{ +A vector with the last n non-NA values from input +vector x. +} +\description{ +finds the last n non-NA values in an input vector. +} +\details{ +finds the last n non-NA values in an input vector. +} +\author{ +ACS June 2019 +} From df42c51ca9780c14b8ec0a8bad9f51cf2e971f4c Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Wed, 19 Jun 2019 14:10:34 -0400 Subject: [PATCH 02/13] add a temporary constant to use while putting the extrapolation skeleton into all chunks --- R/constants.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/constants.R b/R/constants.R index d4584bfe8..c8192278a 100644 --- a/R/constants.R +++ b/R/constants.R @@ -25,6 +25,11 @@ MODEL_BASE_YEARS <- c(1975, 1990, 2005, 2010) # calibrated per MODEL_FUTURE_YEARS <- seq(2015, 2100, 5) # future (i.e., not calibrated) time periods in the model MODEL_YEARS <- c(MODEL_BASE_YEARS, MODEL_FUTURE_YEARS) +# intermediate BYU year constant. Need for early dev where not all data sets get extended. +# Goal would be to have it handled automatically with a change to HISTORICAL_YEARS. +# Should be able to just find and replace BYU_YEAR with max(HISTORICAL_YEARS) once the +# BYU skeleton is in place for all chunks. +BYU_YEAR <- 2015 # GCAM constants ====================================================================== From 0520fa9be373dc86e97f76d9136f9cb0fd628178 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Wed, 19 Jun 2019 14:14:31 -0400 Subject: [PATCH 03/13] wip 2 different options for constant extrapolation --- R/extrapolators.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/extrapolators.R b/R/extrapolators.R index 18b35aeef..4259e4158 100644 --- a/R/extrapolators.R +++ b/R/extrapolators.R @@ -55,7 +55,8 @@ extrapolate_constant_optionB <- function(x, n=1){ value = if_else(is.na(value), meanval, value)) %>% - select(-meanval) + select(-meanval) %>% + distinct } From dcf4df58c3b71f154e2f0c4f0bb16ca8ba30c87e Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Wed, 19 Jun 2019 14:20:10 -0400 Subject: [PATCH 04/13] two options for constant extrapolation in the identified test chunk --- R/zchunk_L100.GDP_hist.R | 54 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/R/zchunk_L100.GDP_hist.R b/R/zchunk_L100.GDP_hist.R index fb944f2a1..53eea1924 100644 --- a/R/zchunk_L100.GDP_hist.R +++ b/R/zchunk_L100.GDP_hist.R @@ -32,13 +32,63 @@ module_socioeconomics_L100.GDP_hist <- function(command, ...) { usda_gdp_mer <- get_data(all_data, "socioeconomics/USDA_GDP_MER") assert_that(tibble::is.tibble(usda_gdp_mer)) - # Convert to long form, filter to historical years, convert units + # Convert to long form, convert units usda_gdp_mer %>% select(-Country) %>% gather_years %>% filter(!is.na(value), !is.na(iso)) %>% mutate(value = value * CONV_BIL_MIL * gdp_deflator(1990, base_year = 2010), - year = as.integer(year)) %>% + year = as.integer(year)) -> + long_iso_year_gdp + + + # Perform BYU + # Skeleton - constant extrapolation + # BYUcompliant + # + # BYU NOTE - must make sure the units are all 1990 USD so averaging + # behaves. + # BYU NOTE - need to think about labeling that the output has BYU + # update done, and a note about the method? + if(max(long_iso_year_gdp$year) < BYU_YEAR){ + + missingyears <- tibble(year = (max(long_iso_year_gdp$year) + 1):BYU_YEAR) + + # Constant extrapolation operating only on + # numeric vector. Can operate on any numeric vector, regardless of grouping + # or column name. + # What gets output in this chunk + long_iso_year_gdp %>% + select(iso) %>% + distinct %>% + repeat_add_columns(missingyears) %>% + bind_rows(long_iso_year_gdp, .) %>% + group_by(iso) %>% + mutate(value = extrapolate_constant(value, n = 1)) %>% + ungroup -> + long_iso_year_gdp + + + # Constant extrapolation operating on a (grouped) tibble + # with a column named 'value'. + long_iso_year_gdp %>% + select(iso) %>% + distinct %>% + repeat_add_columns(missingyears) %>% + bind_rows(long_iso_year_gdp, .) %>% + group_by(iso) %>% + extrapolate_constant_optionB(n = 1) %>% + ungroup -> + long_iso_year_gdp2 + + # Double check the two methods give the same result, + # Will remove when we have decided on the method we want. + assert_that(!any(long_iso_year_gdp != long_iso_year_gdp2)) + } + + + # filter to historical years, convert units + long_iso_year_gdp %>% add_title("Historical GDP downscaled to country (iso)") %>% add_comments("Units converted to constant 1990 USD") %>% add_precursors("socioeconomics/USDA_GDP_MER") %>% From f2672a41d06aa9422b5396f744be0af8de4df802 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Thu, 18 Jul 2019 10:05:52 -0400 Subject: [PATCH 05/13] Drop the grouped tibble option and improve comments/doc on vector option --- R/extrapolators.R | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/R/extrapolators.R b/R/extrapolators.R index 4259e4158..7eb74446e 100644 --- a/R/extrapolators.R +++ b/R/extrapolators.R @@ -9,6 +9,8 @@ #' @param x Vector of values with NA's to be filled in via constant extrapolation #' of the mean of last n non-NA values. #' @param n Number of non-NA values to be averaged to provide the filler value. +#' Defaults to n=1: using the last recorded year's value to constantly fill in +#' the tail of vector missing values. #' @details Computes the mean of last n non-NA values of input vector x #' and uses this constant value to fill in NA values in x. #' @return Vector with all NA values replaced with the specified mean. @@ -16,9 +18,12 @@ #' @author ACS June 2019 extrapolate_constant <- function(x, n=1){ + # Some assertion tests to make sure working on right data types assert_that(is.numeric(x)) assert_that(is.scalar(n)) + # The constant value to fill in all tail of vector NA's with. + # = mean of the last n nonNA values in the meanval <- mean(last_n(x,n)) x[is.na(x)] <- meanval @@ -27,38 +32,8 @@ extrapolate_constant <- function(x, n=1){ } -#' extrapolate_constant_optionB -#' -#' computes the mean of last n values and uses this constant value to fill in -#' missing years at the end of the time series -#' -#' @param x Tibble with column name 'value' that contains NA values to be filled in -#' via constant extrapolation of the mean of last n non-NA values. -#' @param n Number of non-NA values to be averaged to provide the filler value. -#' @details Computes the mean of last n non-NA values of column 'value' in an input tibble -#' (may be grouped) and uses this constant value to fill in NA values in column 'value'. -#' @return Tibble with column name 'value' in which all NA values have been filled -#' in with the specified mean value. -#' @importFrom assertthat assert_that is.scalar -#' @author ACS June 2019 -extrapolate_constant_optionB <- function(x, n=1){ - - assert_that(is.tibble(x)) - assert_that(is.scalar(n)) - # can't use the faster replace_na function because - # if x is grouped, e.g. on iso, each group will have - # its own, specific mean value to replace NAs with. - x %>% - mutate(meanval = mean(last_n(value, n)), - value = if_else(is.na(value), - meanval, - value)) %>% - select(-meanval) %>% - distinct -} - #' last_n #' From d7f5a59b34c11e9638922a0be6488c81e3d703d0 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Thu, 18 Jul 2019 12:59:04 -0400 Subject: [PATCH 06/13] keep only vector form extrapolation, correct filling in all NA values error --- R/extrapolators.R | 15 ++++++++++----- man/extrapolate_constant.Rd | 12 +++++++++--- man/extrapolate_constant_optionB.Rd | 29 ----------------------------- 3 files changed, 19 insertions(+), 37 deletions(-) delete mode 100644 man/extrapolate_constant_optionB.Rd diff --git a/R/extrapolators.R b/R/extrapolators.R index 7eb74446e..b8e3e00c8 100644 --- a/R/extrapolators.R +++ b/R/extrapolators.R @@ -4,37 +4,42 @@ #' extrapolate_constant #' #' computes the mean of last n values and uses this constant value to fill in -#' missing years at the end of the time series +#' NA values corresponding to missing years at the end of the time series #' #' @param x Vector of values with NA's to be filled in via constant extrapolation #' of the mean of last n non-NA values. #' @param n Number of non-NA values to be averaged to provide the filler value. #' Defaults to n=1: using the last recorded year's value to constantly fill in #' the tail of vector missing values. +#' @param numMissing The number of NA values at the tail end of each +#' vector to be filled in. This will always be known for each data set in each +#' chunk. #' @details Computes the mean of last n non-NA values of input vector x #' and uses this constant value to fill in NA values in x. #' @return Vector with all NA values replaced with the specified mean. #' @importFrom assertthat assert_that is.scalar #' @author ACS June 2019 -extrapolate_constant <- function(x, n=1){ +extrapolate_constant <- function(x, n=1, numMissing){ # Some assertion tests to make sure working on right data types assert_that(is.numeric(x)) assert_that(is.scalar(n)) + assert_that(is.integer(numMissing)) + # The constant value to fill in all tail of vector NA's with. # = mean of the last n nonNA values in the meanval <- mean(last_n(x,n)) - x[is.na(x)] <- meanval + + # fill in only the tail end NA values with this constant. + x[(length(x) - numMissing + 1):length(x)] <- meanval return(x) } - - #' last_n #' #' finds the last n non-NA values in an input vector. diff --git a/man/extrapolate_constant.Rd b/man/extrapolate_constant.Rd index 6b4ab327a..bc1ef4579 100644 --- a/man/extrapolate_constant.Rd +++ b/man/extrapolate_constant.Rd @@ -4,20 +4,26 @@ \alias{extrapolate_constant} \title{extrapolate_constant} \usage{ -extrapolate_constant(x, n = 1) +extrapolate_constant(x, n = 1, numMissing) } \arguments{ \item{x}{Vector of values with NA's to be filled in via constant extrapolation of the mean of last n non-NA values.} -\item{n}{Number of non-NA values to be averaged to provide the filler value.} +\item{n}{Number of non-NA values to be averaged to provide the filler value. +Defaults to n=1: using the last recorded year's value to constantly fill in +the tail of vector missing values.} + +\item{numMissing}{The number of NA values at the tail end of each +vector to be filled in. This will always be known for each data set in each +chunk.} } \value{ Vector with all NA values replaced with the specified mean. } \description{ computes the mean of last n values and uses this constant value to fill in - missing years at the end of the time series + NA values corresponding to missing years at the end of the time series } \details{ Computes the mean of last n non-NA values of input vector x diff --git a/man/extrapolate_constant_optionB.Rd b/man/extrapolate_constant_optionB.Rd deleted file mode 100644 index c2209ac93..000000000 --- a/man/extrapolate_constant_optionB.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/extrapolators.R -\name{extrapolate_constant_optionB} -\alias{extrapolate_constant_optionB} -\title{extrapolate_constant_optionB} -\usage{ -extrapolate_constant_optionB(x, n = 1) -} -\arguments{ -\item{x}{Tibble with column name 'value' that contains NA values to be filled in -via constant extrapolation of the mean of last n non-NA values.} - -\item{n}{Number of non-NA values to be averaged to provide the filler value.} -} -\value{ -Tibble with column name 'value' in which all NA values have been filled -in with the specified mean value. -} -\description{ -computes the mean of last n values and uses this constant value to fill in - missing years at the end of the time series -} -\details{ -Computes the mean of last n non-NA values of column 'value' in an input tibble -(may be grouped) and uses this constant value to fill in NA values in column 'value'. -} -\author{ -ACS June 2019 -} From 42d744070e25f35cae1802fb138bd38ab5b460c9 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Thu, 18 Jul 2019 12:59:18 -0400 Subject: [PATCH 07/13] simplify pipeline with call to extrapolate --- R/zchunk_L100.GDP_hist.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/zchunk_L100.GDP_hist.R b/R/zchunk_L100.GDP_hist.R index 53eea1924..856e30076 100644 --- a/R/zchunk_L100.GDP_hist.R +++ b/R/zchunk_L100.GDP_hist.R @@ -46,25 +46,25 @@ module_socioeconomics_L100.GDP_hist <- function(command, ...) { # Skeleton - constant extrapolation # BYUcompliant # - # BYU NOTE - must make sure the units are all 1990 USD so averaging - # behaves. + # BYU NOTE - must make sure the units are all 1990 USD (or at least the + # same year basis) so averaging behaves. Fine here, but worth noting + # across other chunk where money comes up, especially since we drop + # unit information pretty early on in most chunks. # BYU NOTE - need to think about labeling that the output has BYU # update done, and a note about the method? if(max(long_iso_year_gdp$year) < BYU_YEAR){ - missingyears <- tibble(year = (max(long_iso_year_gdp$year) + 1):BYU_YEAR) + missingyears <- (max(long_iso_year_gdp$year) + 1):BYU_YEAR # Constant extrapolation operating only on # numeric vector. Can operate on any numeric vector, regardless of grouping # or column name. # What gets output in this chunk long_iso_year_gdp %>% - select(iso) %>% - distinct %>% - repeat_add_columns(missingyears) %>% - bind_rows(long_iso_year_gdp, .) %>% + complete(year = c(year, missingyears), iso) %>% group_by(iso) %>% - mutate(value = extrapolate_constant(value, n = 1)) %>% + mutate(value = extrapolate_constant(value, n = 1, + numMissing = length(missingyears))) %>% ungroup -> long_iso_year_gdp From ee8c56e339ce8ac5394b44671d057551582efd82 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Thu, 18 Jul 2019 13:00:46 -0400 Subject: [PATCH 08/13] add the import of complete from tidyr to chunk --- R/zchunk_L100.GDP_hist.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/zchunk_L100.GDP_hist.R b/R/zchunk_L100.GDP_hist.R index 856e30076..5f0603835 100644 --- a/R/zchunk_L100.GDP_hist.R +++ b/R/zchunk_L100.GDP_hist.R @@ -15,7 +15,7 @@ #' @importFrom assertthat assert_that #' @importFrom tibble tibble #' @importFrom dplyr filter mutate select -#' @importFrom tidyr gather spread +#' @importFrom tidyr gather spread complete #' @author BBL February 2017 module_socioeconomics_L100.GDP_hist <- function(command, ...) { if(command == driver.DECLARE_INPUTS) { From 685834ff84a4d3ae45cadd41bae0ae70344d7d64 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Thu, 18 Jul 2019 13:14:35 -0400 Subject: [PATCH 09/13] Drop call to now non-existent tibble option --- R/zchunk_L100.GDP_hist.R | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/R/zchunk_L100.GDP_hist.R b/R/zchunk_L100.GDP_hist.R index 5f0603835..343a06167 100644 --- a/R/zchunk_L100.GDP_hist.R +++ b/R/zchunk_L100.GDP_hist.R @@ -67,23 +67,6 @@ module_socioeconomics_L100.GDP_hist <- function(command, ...) { numMissing = length(missingyears))) %>% ungroup -> long_iso_year_gdp - - - # Constant extrapolation operating on a (grouped) tibble - # with a column named 'value'. - long_iso_year_gdp %>% - select(iso) %>% - distinct %>% - repeat_add_columns(missingyears) %>% - bind_rows(long_iso_year_gdp, .) %>% - group_by(iso) %>% - extrapolate_constant_optionB(n = 1) %>% - ungroup -> - long_iso_year_gdp2 - - # Double check the two methods give the same result, - # Will remove when we have decided on the method we want. - assert_that(!any(long_iso_year_gdp != long_iso_year_gdp2)) } From 3492f43fc27de8949b40fd60ed993dac168c4b95 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Thu, 18 Jul 2019 13:27:38 -0400 Subject: [PATCH 10/13] add import of utils to silence package check note --- R/extrapolators.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/extrapolators.R b/R/extrapolators.R index b8e3e00c8..5c5153ea2 100644 --- a/R/extrapolators.R +++ b/R/extrapolators.R @@ -18,6 +18,7 @@ #' and uses this constant value to fill in NA values in x. #' @return Vector with all NA values replaced with the specified mean. #' @importFrom assertthat assert_that is.scalar +#' @importFrom utils tail #' @author ACS June 2019 extrapolate_constant <- function(x, n=1, numMissing){ From 36283c87c3f592326b033979f1660276ac2a7977 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Mon, 12 Aug 2019 14:59:24 -0400 Subject: [PATCH 11/13] update constant extrap and rename last_n helper fcn -Doesn't touch original data at all. -now the fillin mean value comes from average of last n original data years with na.rm=TRUE -update documentation to clarify -rename one of the input arguments for clarit --- NAMESPACE | 1 + R/extrapolators.R | 58 +++++++++++++++++++----------- man/extrapolate_constant.Rd | 34 +++++++++++------- man/{last_n.Rd => last_n_nonNA.Rd} | 12 ++++--- 4 files changed, 67 insertions(+), 38 deletions(-) rename man/{last_n.Rd => last_n_nonNA.Rd} (62%) diff --git a/NAMESPACE b/NAMESPACE index 2d1a39b11..e4bd5635d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -93,3 +93,4 @@ importFrom(tidyr,separate) importFrom(tidyr,spread) importFrom(tidyr,unite) importFrom(utils,capture.output) +importFrom(utils,tail) diff --git a/R/extrapolators.R b/R/extrapolators.R index 5c5153ea2..564827c8e 100644 --- a/R/extrapolators.R +++ b/R/extrapolators.R @@ -3,47 +3,63 @@ #' extrapolate_constant #' -#' computes the mean of last n values and uses this constant value to fill in -#' NA values corresponding to missing years at the end of the time series +#' computes the mean of last n original year values (with +#' \code{mean(., na.rm = TRUE)}) and uses this constant value to fill in NA +#' values corresponding to the extrapolation years at the end of the time series. +#' NOTE that this extrapolator does not touch any of the original data. It ONLY +#' fills in data corresponding to the extrapolation years. It is the user's +#' responsibility to account for this behavior in preparing raw data to be +#' extrapolated. #' -#' @param x Vector of values with NA's to be filled in via constant extrapolation -#' of the mean of last n non-NA values. -#' @param n Number of non-NA values to be averaged to provide the filler value. -#' Defaults to n=1: using the last recorded year's value to constantly fill in -#' the tail of vector missing values. -#' @param numMissing The number of NA values at the tail end of each -#' vector to be filled in. This will always be known for each data set in each -#' chunk. -#' @details Computes the mean of last n non-NA values of input vector x -#' and uses this constant value to fill in NA values in x. +#' @param x Vector of values with NA's corresponding to the extrapolation years +#' to be filled in via constant extrapolation of the mean of last n original +#' year values. +#' @param n Number of final original year values to be averaged to provide the +#' filler value. Averaging is done with \code{na.rm = TRUE}. +#' Defaults to n = 1: using the last recorded year's value to constantly fill in +#' the tail of vector missing values corresponding to extrapolation years. +#' @param numExtrapYrs The number of NA values at the tail end of each vector that +#' correspond to the extrapolation years and will be filled in. This will always +#' be known for each data set in each chunk. +#' @details Computes the mean of last n original year values of input vector x +#' and uses this constant value to fill in NA values in x that correspond to the +#' added extrapolation years. #' @return Vector with all NA values replaced with the specified mean. #' @importFrom assertthat assert_that is.scalar #' @importFrom utils tail #' @author ACS June 2019 -extrapolate_constant <- function(x, n=1, numMissing){ +extrapolate_constant <- function(x, n=1, numExtrapYrs){ # Some assertion tests to make sure working on right data types assert_that(is.numeric(x)) assert_that(is.scalar(n)) - assert_that(is.integer(numMissing)) + assert_that(is.integer(numExtrapYrs)) - # The constant value to fill in all tail of vector NA's with. - # = mean of the last n nonNA values in the - meanval <- mean(last_n(x,n)) + # The constant value to fill in all extrapolation year NA's with. + # = mean(. , na.rm = TRUE) of the last n values in the original + # data. + index_last_n_orig_yrs <- (length(x) - numExtrapYrs - n + 1):(length(x) - numExtrapYrs) + meanval <- mean(x[index_last_n_orig_yrs], na.rm = TRUE) - # fill in only the tail end NA values with this constant. - x[(length(x) - numMissing + 1):length(x)] <- meanval + # fill in only the tail end, extrapolation years + # NA values with this constant. + index_extrap_yrs <- (length(x) - numExtrapYrs + 1):length(x) + x[index_extrap_yrs] <- meanval return(x) } -#' last_n +#' last_n_nonNA #' #' finds the last n non-NA values in an input vector. +#' A convenience functions for users who wish to customize +#' their extrapolations beyond the default or who wish to +#' identify NA values in their original (unextrapolated) +#' data. #' #' @param x Vector with some NA values #' @param n The number of non-NA values sought. @@ -52,7 +68,7 @@ extrapolate_constant <- function(x, n=1, numMissing){ #' vector x. #' @importFrom assertthat assert_that is.scalar #' @author ACS June 2019 -last_n <- function(x, n){ +last_n_nonNA <- function(x, n){ assert_that(is.scalar(n)) diff --git a/man/extrapolate_constant.Rd b/man/extrapolate_constant.Rd index bc1ef4579..f862775bf 100644 --- a/man/extrapolate_constant.Rd +++ b/man/extrapolate_constant.Rd @@ -4,30 +4,38 @@ \alias{extrapolate_constant} \title{extrapolate_constant} \usage{ -extrapolate_constant(x, n = 1, numMissing) +extrapolate_constant(x, n = 1, numExtrapYrs) } \arguments{ -\item{x}{Vector of values with NA's to be filled in via constant extrapolation -of the mean of last n non-NA values.} +\item{x}{Vector of values with NA's corresponding to the extrapolation years +to be filled in via constant extrapolation of the mean of last n original +year values.} -\item{n}{Number of non-NA values to be averaged to provide the filler value. -Defaults to n=1: using the last recorded year's value to constantly fill in -the tail of vector missing values.} +\item{n}{Number of final original year values to be averaged to provide the +filler value. Averaging is done with \code{na.rm = TRUE}. +Defaults to n = 1: using the last recorded year's value to constantly fill in +the tail of vector missing values corresponding to extrapolation years.} -\item{numMissing}{The number of NA values at the tail end of each -vector to be filled in. This will always be known for each data set in each -chunk.} +\item{numExtrapYrs}{The number of NA values at the tail end of each vector that +correspond to the extrapolation years and will be filled in. This will always +be known for each data set in each chunk.} } \value{ Vector with all NA values replaced with the specified mean. } \description{ -computes the mean of last n values and uses this constant value to fill in - NA values corresponding to missing years at the end of the time series +computes the mean of last n original year values (with + \code{mean(., na.rm = TRUE)}) and uses this constant value to fill in NA + values corresponding to the extrapolation years at the end of the time series. + NOTE that this extrapolator does not touch any of the original data. It ONLY + fills in data corresponding to the extrapolation years. It is the user's + responsibility to account for this behavior in preparing raw data to be + extrapolated. } \details{ -Computes the mean of last n non-NA values of input vector x -and uses this constant value to fill in NA values in x. +Computes the mean of last n original year values of input vector x +and uses this constant value to fill in NA values in x that correspond to the +added extrapolation years. } \author{ ACS June 2019 diff --git a/man/last_n.Rd b/man/last_n_nonNA.Rd similarity index 62% rename from man/last_n.Rd rename to man/last_n_nonNA.Rd index 53c9ae579..45a3bc33a 100644 --- a/man/last_n.Rd +++ b/man/last_n_nonNA.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/extrapolators.R -\name{last_n} -\alias{last_n} -\title{last_n} +\name{last_n_nonNA} +\alias{last_n_nonNA} +\title{last_n_nonNA} \usage{ -last_n(x, n) +last_n_nonNA(x, n) } \arguments{ \item{x}{Vector with some NA values} @@ -17,6 +17,10 @@ vector x. } \description{ finds the last n non-NA values in an input vector. + A convenience functions for users who wish to customize + their extrapolations beyond the default or who wish to + identify NA values in their original (unextrapolated) + data. } \details{ finds the last n non-NA values in an input vector. From bde8436116da8ca3bc6aa92d47030bba7c7bdde5 Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Mon, 12 Aug 2019 14:59:53 -0400 Subject: [PATCH 12/13] update extrap call chunk for clarity --- R/zchunk_L100.GDP_hist.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/zchunk_L100.GDP_hist.R b/R/zchunk_L100.GDP_hist.R index e2bf421c6..be07560c7 100644 --- a/R/zchunk_L100.GDP_hist.R +++ b/R/zchunk_L100.GDP_hist.R @@ -56,17 +56,17 @@ module_socioeconomics_L100.GDP_hist <- function(command, ...) { # update done, and a note about the method? if(max(long_iso_year_gdp$year) < BYU_YEAR){ - missingyears <- (max(long_iso_year_gdp$year) + 1):BYU_YEAR + extrapyears <- (max(long_iso_year_gdp$year) + 1):BYU_YEAR # Constant extrapolation operating only on # numeric vector. Can operate on any numeric vector, regardless of grouping # or column name. # What gets output in this chunk long_iso_year_gdp %>% - complete(year = c(year, missingyears), iso) %>% + complete(year = c(year, extrapyears), iso) %>% group_by(iso) %>% mutate(value = extrapolate_constant(value, n = 1, - numMissing = length(missingyears))) %>% + numExtrapYrs = length(extrapyears))) %>% ungroup -> long_iso_year_gdp } From c66dc3998f5839d2b2596aca3e5024ead62fffeb Mon Sep 17 00:00:00 2001 From: "Snyder, Abigail C" Date: Mon, 12 Aug 2019 15:46:33 -0400 Subject: [PATCH 13/13] update oldnew test to drop extrap years --- tests/testthat/test_oldnew.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/testthat/test_oldnew.R b/tests/testthat/test_oldnew.R index 8ff23950c..29612bf76 100644 --- a/tests/testthat/test_oldnew.R +++ b/tests/testthat/test_oldnew.R @@ -89,6 +89,18 @@ test_that("matches old data system output", { olddata <- COMPDATA[[oldf]] expect_is(olddata, "data.frame", info = paste("No comparison data found for", oldf)) + # During the base year update development process, some outputs from the data + # system will be extended and therefore have different dimensions from the + # old comparison data. This causes the old-new test to fail. + # The extended extrapolation years will be dropped from the newdata + # so that comparison can be made to the old data. This also serves to check + # that the extrapolation procedure does not touch original data. + if(max(newdata$year) == BYU_YEAR){ # one way to check that it's a BYU without flags. + newdata %>% + filter(year <= max(HISTORICAL_YEARS)) -> + newdata + } + # Finally, test (NB rounding numeric columns to a sensible number of # digits; otherwise spurious mismatches occur) # Also first converts integer columns to numeric (otherwise test will