rsample/0000755000176200001440000000000013512203633011713 5ustar liggesusersrsample/inst/0000755000176200001440000000000013512177703012700 5ustar liggesusersrsample/inst/doc/0000755000176200001440000000000013512177703013445 5ustar liggesusersrsample/inst/doc/Working_with_rsets.R0000644000176200001440000001021613512177703017463 0ustar liggesusers## ----ex_setup, include=FALSE------------------------------------------------------------ knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3, width = 90) library(ggplot2) theme_set(theme_bw()) ## ----attrition, message=FALSE----------------------------------------------------------- library(rsample) data("attrition") names(attrition) table(attrition$Attrition) ## ----form, message=FALSE---------------------------------------------------------------- mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ## ----model_vfold, message=FALSE--------------------------------------------------------- library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ## ----lm_func---------------------------------------------------------------------------- ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) { # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ## ----onefold, warning = FALSE----------------------------------------------------------- example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ## ----model_purrr, warning=FALSE--------------------------------------------------------- library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ## ----model_acc-------------------------------------------------------------------------- rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ## ----type_plot-------------------------------------------------------------------------- ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ## ----mean_diff-------------------------------------------------------------------------- median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ## ----boot_mean_diff--------------------------------------------------------------------- set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ## ----stats------------------------------------------------------------------------------ bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ## ----stats_plot------------------------------------------------------------------------- ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ## ----ci--------------------------------------------------------------------------------- quantile(bt_resamples$wage_diff, probs = c(0.025, 0.500, 0.975)) ## ----coefs------------------------------------------------------------------------------ glm_coefs <- function(splits, ...) { ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ## ----tidy_rsplit------------------------------------------------------------------------ first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ## ----tidy_rset-------------------------------------------------------------------------- class(bt_resamples) tidy(bt_resamples) rsample/inst/doc/Basics.R0000644000176200001440000000122413512177675015003 0ustar liggesusers## ----ex_setup, include=FALSE--------------------------------------------- knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3) ## ----mtcars_bt, message=FALSE-------------------------------------------- library(rsample) set.seed(8584) bt_resamples <- bootstraps(mtcars, times = 3) bt_resamples ## ----rsplit-------------------------------------------------------------- first_resample <- bt_resamples$splits[[1]] first_resample ## ----rsplit_df----------------------------------------------------------- head(as.data.frame(first_resample)) as.data.frame(first_resample, data = "assessment") rsample/inst/doc/Working_with_rsets.Rmd0000644000176200001440000001655313356266567020033 0ustar liggesusers--- title: "Working with rsets" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Working with rsets} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3, width = 90) library(ggplot2) theme_set(theme_bw()) ``` ## Introduction `rsample` can be used to create objects containing resamples of the original data. This page contains examples of how those objects can be used for data analysis. For illustration, the `attrition` data is used. From the help file: > These data are from the IBM Watson Analytics Lab. The website describes the data with "Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or 'compare average monthly income by education and attrition'. This is a fictional data set created by IBM data scientists." There are 1470 rows. The data can be accessed using ```{r attrition, message=FALSE} library(rsample) data("attrition") names(attrition) table(attrition$Attrition) ``` ## Model Assessment Let's fit a logistic regression model to the data with model terms for the job satisfaction, gender, and monthly income. If we were fitting the model to the entire data set, we might model attrition using ```r glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome, data = attrition, family = binomial) ``` For convenience, we'll create a formula object that will be used later: ```{r form, message=FALSE} mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ``` To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model. First, let's make the splits of the data: ```{r model_vfold, message=FALSE} library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ``` Now let's write a function that will, for each resample: 1. obtain the analysis data set (i.e. the 90% used for modeling) 1. fit a logistic regression model 1. predict the assessment data (the other 10% not used for the model) using the `broom` package 1. determine if each sample was predicted correctly. Here is our function: ```{r lm_func} ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) { # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ``` For example: ```{r onefold, warning = FALSE} example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ``` For this model, the `.fitted` value is the linear predictor in log-odds units. To compute this data set for each of the 100 resamples, we'll use the `map` function from the `purrr` package: ```{r model_purrr, warning=FALSE} library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ``` Now we can compute the accuracy values for all of the assessment data sets: ```{r model_acc} rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ``` Keep in mind that the baseline accuracy to beat is the rate of non-attrition, which is `r round(mean(attrition$Attrition == "No"), 3)`. Not a great model so far. ## Using the Bootstrap to Make Comparisons Traditionally, the bootstrap has been primarily used to empirically determine the sampling distribution of a test statistic. Given a set of samples with replacement, a statistic can be calculated on each analysis set and the results can be used to make inferences (such as confidence intervals). For example, are there differences in the median monthly income between genders? ```{r type_plot} ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ``` If we wanted to compare the genders, we could conduct a _t_-test or rank-based test. Instead, let's use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample: ```{r mean_diff} median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ``` Now we would create a large number of bootstrap samples (say 2000+). For illustration, we'll only do 500 in this document. ```{r boot_mean_diff} set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ``` This function is then computed across each resample: ```{r stats} bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ``` The bootstrap distribution of this statistic has a slightly bimodal and skewed distribution: ```{r stats_plot} ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ``` The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the means would be: ```{r ci} quantile(bt_resamples$wage_diff, probs = c(0.025, 0.500, 0.975)) ``` _On average_, there is no evidence for a difference in the genders. ## Bootstrap Estimates of Model Coefficients Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The [`broom` package](https://cran.r-project.org/package=broom) package has a `tidy` function that will save the coefficients in a data frame. Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, `purrr::map` can be used to estimate and save these values for each split. ```{r coefs} glm_coefs <- function(splits, ...) { ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ``` ## Keeping Tidy As previously mentioned, the [`broom` package](https://cran.r-project.org/package=broom) contains a class called `tidy` that created representations of objects that can be easily used for analysis, plotting, etc. `rsample` contains `tidy` methods for `rset` and `rsplit` objects. For example: ```{r tidy_rsplit} first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ``` and ```{r tidy_rset} class(bt_resamples) tidy(bt_resamples) ``` rsample/inst/doc/Basics.Rmd0000644000176200001440000000621213323650050015305 0ustar liggesusers--- title: "Basics" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Basics} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3) ``` ## Terminology We define a _resample_ as the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. Cross-validation is another type of resampling. ## `rset` Objects Contain Many Resamples The main class in the package (`rset`) is for a _set_ or _collection_ of resamples. In 10-fold cross-validation, the set would consist of the 10 different resamples of the original data. Like [`modelr`](https://cran.r-project.org/package=modelr), the resamples are stored in data-frame-like `tibble` object. As a simple example, here is a small set of bootstraps of the `mtcars` data: ```{r mtcars_bt, message=FALSE} library(rsample) set.seed(8584) bt_resamples <- bootstraps(mtcars, times = 3) bt_resamples ``` ## Individual Resamples are `rsplit` Objects The resamples are stored in the `splits` column in an object that has class `rsplit`. In this package we use the following terminology for the two partitions that comprise a resample: * The _analysis_ data are those that we selected in the resample. For a bootstrap, this is the sample with replacement. For 10-fold cross-validation, this is the 90% of the data. These data are often used to fit a model or calculate a statistic in traditional bootstrapping. * The _assessment_ data are usually the section of the original data not covered by the analysis set. Again, in 10-fold CV, this is the 10% held out. These data are often used to evaluate the performance of a model that was fit to the analysis data. (Aside: While some might use the term "training" and "testing" for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done _before_ resampling. The training/test split can be conducted using the `initial_split` function in this package.) Let's look at one of the `rsplit` objects ```{r rsplit} first_resample <- bt_resamples$splits[[1]] first_resample ``` This indicates that there were `r dim(bt_resamples$splits[[1]])["analysis"]` data points in the analysis set, `r dim(bt_resamples$splits[[1]])["assessment"]` instances were in the assessment set, and that the original data contained `r dim(bt_resamples$splits[[1]])["n"]` data points. These results can also be determined using the `dim` function on an `rsplit` object. To obtain either of these data sets from an `rsplit`, the `as.data.frame` function can be used. By default, the analysis set is returned but the `data` option can be used to return the assessment data: ```{r rsplit_df} head(as.data.frame(first_resample)) as.data.frame(first_resample, data = "assessment") ``` Alternatively, you can use the shortcuts `analysis(first_resample)` and `assessment(first_resample)`.rsample/inst/doc/Working_with_rsets.html0000644000176200001440000063704013512177703020240 0ustar liggesusers Working with rsets

Working with rsets

Introduction

rsample can be used to create objects containing resamples of the original data. This page contains examples of how those objects can be used for data analysis.

For illustration, the attrition data is used. From the help file:

These data are from the IBM Watson Analytics Lab. The website describes the data with “Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’. This is a fictional data set created by IBM data scientists.” There are 1470 rows.

The data can be accessed using

library(rsample)
data("attrition")
names(attrition)
#>  [1] "Age"                      "Attrition"                "BusinessTravel"          
#>  [4] "DailyRate"                "Department"               "DistanceFromHome"        
#>  [7] "Education"                "EducationField"           "EnvironmentSatisfaction" 
#> [10] "Gender"                   "HourlyRate"               "JobInvolvement"          
#> [13] "JobLevel"                 "JobRole"                  "JobSatisfaction"         
#> [16] "MaritalStatus"            "MonthlyIncome"            "MonthlyRate"             
#> [19] "NumCompaniesWorked"       "OverTime"                 "PercentSalaryHike"       
#> [22] "PerformanceRating"        "RelationshipSatisfaction" "StockOptionLevel"        
#> [25] "TotalWorkingYears"        "TrainingTimesLastYear"    "WorkLifeBalance"         
#> [28] "YearsAtCompany"           "YearsInCurrentRole"       "YearsSinceLastPromotion" 
#> [31] "YearsWithCurrManager"
table(attrition$Attrition)
#> 
#>   No  Yes 
#> 1233  237

Model Assessment

Let’s fit a logistic regression model to the data with model terms for the job satisfaction, gender, and monthly income.

If we were fitting the model to the entire data set, we might model attrition using

glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome, data = attrition, family = binomial)

For convenience, we’ll create a formula object that will be used later:

mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome)

To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model.

First, let’s make the splits of the data:

library(rsample)
set.seed(4622)
rs_obj <- vfold_cv(attrition, v = 10, repeats = 10)
rs_obj
#> #  10-fold cross-validation repeated 10 times 
#> # A tibble: 100 x 3
#>    splits             id       id2   
#>    <named list>       <chr>    <chr> 
#>  1 <split [1.3K/147]> Repeat01 Fold01
#>  2 <split [1.3K/147]> Repeat01 Fold02
#>  3 <split [1.3K/147]> Repeat01 Fold03
#>  4 <split [1.3K/147]> Repeat01 Fold04
#>  5 <split [1.3K/147]> Repeat01 Fold05
#>  6 <split [1.3K/147]> Repeat01 Fold06
#>  7 <split [1.3K/147]> Repeat01 Fold07
#>  8 <split [1.3K/147]> Repeat01 Fold08
#>  9 <split [1.3K/147]> Repeat01 Fold09
#> 10 <split [1.3K/147]> Repeat01 Fold10
#> # … with 90 more rows

Now let’s write a function that will, for each resample:

  1. obtain the analysis data set (i.e. the 90% used for modeling)
  2. fit a logistic regression model
  3. predict the assessment data (the other 10% not used for the model) using the broom package
  4. determine if each sample was predicted correctly.

Here is our function:

## splits will be the `rsplit` object with the 90/10 partition
holdout_results <- function(splits, ...) {
  # Fit the model to the 90%
  mod <- glm(..., data = analysis(splits), family = binomial)
  # Save the 10%
  holdout <- assessment(splits)
  # `augment` will save the predictions with the holdout data set
  res <- broom::augment(mod, newdata = holdout)
  # Class predictions on the assessment set from class probs
  lvls <- levels(holdout$Attrition)
  predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]),
                        levels = lvls)
  # Calculate whether the prediction was correct
  res$correct <- predictions == holdout$Attrition
  # Return the assessment data set with the additional columns
  res
}

For example:

example <- holdout_results(rs_obj$splits[[1]],  mod_form)
dim(example)
#> [1] 147  35
dim(assessment(rs_obj$splits[[1]]))
#> [1] 147  31
## newly added columns:
example[1:10, setdiff(names(example), names(attrition))]
#> # A tibble: 10 x 4
#>    .rownames .fitted .se.fit correct
#>    <chr>       <dbl>   <dbl> <lgl>  
#>  1 11          -1.20   0.155 TRUE   
#>  2 24          -1.78   0.166 TRUE   
#>  3 30          -1.45   0.183 TRUE   
#>  4 39          -1.60   0.172 TRUE   
#>  5 53          -1.54   0.176 TRUE   
#>  6 72          -1.93   0.190 TRUE   
#>  7 73          -3.06   0.266 TRUE   
#>  8 80          -3.28   0.346 TRUE   
#>  9 83          -2.23   0.199 TRUE   
#> 10 90          -1.28   0.151 FALSE

For this model, the .fitted value is the linear predictor in log-odds units.

To compute this data set for each of the 100 resamples, we’ll use the map function from the purrr package:

library(purrr)
rs_obj$results <- map(rs_obj$splits,
                      holdout_results,
                      mod_form)
rs_obj
#> #  10-fold cross-validation repeated 10 times 
#> # A tibble: 100 x 4
#>    splits             id       id2    results            
#>    <named list>       <chr>    <chr>  <named list>       
#>  1 <split [1.3K/147]> Repeat01 Fold01 <tibble [147 × 35]>
#>  2 <split [1.3K/147]> Repeat01 Fold02 <tibble [147 × 35]>
#>  3 <split [1.3K/147]> Repeat01 Fold03 <tibble [147 × 35]>
#>  4 <split [1.3K/147]> Repeat01 Fold04 <tibble [147 × 35]>
#>  5 <split [1.3K/147]> Repeat01 Fold05 <tibble [147 × 35]>
#>  6 <split [1.3K/147]> Repeat01 Fold06 <tibble [147 × 35]>
#>  7 <split [1.3K/147]> Repeat01 Fold07 <tibble [147 × 35]>
#>  8 <split [1.3K/147]> Repeat01 Fold08 <tibble [147 × 35]>
#>  9 <split [1.3K/147]> Repeat01 Fold09 <tibble [147 × 35]>
#> 10 <split [1.3K/147]> Repeat01 Fold10 <tibble [147 × 35]>
#> # … with 90 more rows

Now we can compute the accuracy values for all of the assessment data sets:

rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct))
summary(rs_obj$accuracy)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   0.776   0.821   0.840   0.839   0.859   0.905

Keep in mind that the baseline accuracy to beat is the rate of non-attrition, which is 0.839. Not a great model so far.

Using the Bootstrap to Make Comparisons

Traditionally, the bootstrap has been primarily used to empirically determine the sampling distribution of a test statistic. Given a set of samples with replacement, a statistic can be calculated on each analysis set and the results can be used to make inferences (such as confidence intervals).

For example, are there differences in the median monthly income between genders?

ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + 
  geom_boxplot() + 
  scale_y_log10()

If we wanted to compare the genders, we could conduct a t-test or rank-based test. Instead, let’s use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample:

median_diff <- function(splits) {
  x <- analysis(splits)
  median(x$MonthlyIncome[x$Gender == "Female"]) - 
      median(x$MonthlyIncome[x$Gender == "Male"])     
}

Now we would create a large number of bootstrap samples (say 2000+). For illustration, we’ll only do 500 in this document.

set.seed(353)
bt_resamples <- bootstraps(attrition, times = 500)

This function is then computed across each resample:

bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff)

The bootstrap distribution of this statistic has a slightly bimodal and skewed distribution:

ggplot(bt_resamples, aes(x = wage_diff)) + 
  geom_line(stat = "density", adjust = 1.25) + 
  xlab("Difference in Median Monthly Income (Female - Male)")

The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the means would be:

quantile(bt_resamples$wage_diff, 
         probs = c(0.025, 0.500, 0.975))
#>  2.5%   50% 97.5% 
#>  -189   262   615

On average, there is no evidence for a difference in the genders.

Bootstrap Estimates of Model Coefficients

Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The broom package package has a tidy function that will save the coefficients in a data frame. Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, purrr::map can be used to estimate and save these values for each split.

glm_coefs <- function(splits, ...) {
  ## use `analysis` or `as.data.frame` to get the analysis data
  mod <- glm(..., data = analysis(splits), family = binomial)
  as.data.frame(t(coef(mod)))
}
bt_resamples$betas <- map(.x = bt_resamples$splits, 
                          .f = glm_coefs, 
                          mod_form)
bt_resamples
#> # Bootstrap sampling 
#> # A tibble: 500 x 4
#>    splits             id           wage_diff betas           
#>    <list>             <chr>            <dbl> <list>          
#>  1 <split [1.5K/558]> Bootstrap001      136  <df[,6] [1 × 6]>
#>  2 <split [1.5K/528]> Bootstrap002      282. <df[,6] [1 × 6]>
#>  3 <split [1.5K/541]> Bootstrap003      470  <df[,6] [1 × 6]>
#>  4 <split [1.5K/561]> Bootstrap004     -213  <df[,6] [1 × 6]>
#>  5 <split [1.5K/518]> Bootstrap005      453  <df[,6] [1 × 6]>
#>  6 <split [1.5K/539]> Bootstrap006      684  <df[,6] [1 × 6]>
#>  7 <split [1.5K/542]> Bootstrap007       60  <df[,6] [1 × 6]>
#>  8 <split [1.5K/536]> Bootstrap008      286  <df[,6] [1 × 6]>
#>  9 <split [1.5K/552]> Bootstrap009      -30  <df[,6] [1 × 6]>
#> 10 <split [1.5K/517]> Bootstrap010      410  <df[,6] [1 × 6]>
#> # … with 490 more rows
bt_resamples$betas[[1]]
#>   (Intercept) JobSatisfaction.L JobSatisfaction.Q JobSatisfaction.C GenderMale
#> 1      -0.939            -0.501            -0.272            0.0842     0.0989
#>   MonthlyIncome
#> 1     -0.000129

Keeping Tidy

As previously mentioned, the broom package contains a class called tidy that created representations of objects that can be easily used for analysis, plotting, etc. rsample contains tidy methods for rset and rsplit objects. For example:

first_resample <- bt_resamples$splits[[1]]
class(first_resample)
#> [1] "rsplit"     "boot_split"
tidy(first_resample)
#> # A tibble: 1,470 x 2
#>      Row Data    
#>    <int> <chr>   
#>  1     2 Analysis
#>  2     3 Analysis
#>  3     4 Analysis
#>  4     7 Analysis
#>  5     9 Analysis
#>  6    10 Analysis
#>  7    11 Analysis
#>  8    13 Analysis
#>  9    18 Analysis
#> 10    19 Analysis
#> # … with 1,460 more rows

and

class(bt_resamples)
#> [1] "bootstraps" "rset"       "tbl_df"     "tbl"        "data.frame"
tidy(bt_resamples)
#> # A tibble: 735,000 x 3
#>      Row Data     Resample    
#>    <int> <chr>    <chr>       
#>  1     1 Analysis Bootstrap002
#>  2     1 Analysis Bootstrap004
#>  3     1 Analysis Bootstrap007
#>  4     1 Analysis Bootstrap008
#>  5     1 Analysis Bootstrap009
#>  6     1 Analysis Bootstrap010
#>  7     1 Analysis Bootstrap011
#>  8     1 Analysis Bootstrap013
#>  9     1 Analysis Bootstrap015
#> 10     1 Analysis Bootstrap016
#> # … with 734,990 more rows
rsample/inst/doc/Basics.html0000644000176200001440000004340113512177675015551 0ustar liggesusers Basics

Basics

Terminology

We define a resample as the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. Cross-validation is another type of resampling.

rset Objects Contain Many Resamples

The main class in the package (rset) is for a set or collection of resamples. In 10-fold cross-validation, the set would consist of the 10 different resamples of the original data.

Like modelr, the resamples are stored in data-frame-like tibble object. As a simple example, here is a small set of bootstraps of the mtcars data:

library(rsample)
set.seed(8584)
bt_resamples <- bootstraps(mtcars, times = 3)
bt_resamples
#> # Bootstrap sampling 
#> # A tibble: 3 x 2
#>   splits          id        
#>   <list>          <chr>     
#> 1 <split [32/14]> Bootstrap1
#> 2 <split [32/12]> Bootstrap2
#> 3 <split [32/14]> Bootstrap3

Individual Resamples are rsplit Objects

The resamples are stored in the splits column in an object that has class rsplit.

In this package we use the following terminology for the two partitions that comprise a resample:

(Aside: While some might use the term “training” and “testing” for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done before resampling. The training/test split can be conducted using the initial_split function in this package.)

Let’s look at one of the rsplit objects

first_resample <- bt_resamples$splits[[1]]
first_resample
#> <32/14/32>

This indicates that there were 32 data points in the analysis set, 14 instances were in the assessment set, and that the original data contained 32 data points. These results can also be determined using the dim function on an rsplit object.

To obtain either of these data sets from an rsplit, the as.data.frame function can be used. By default, the analysis set is returned but the data option can be used to return the assessment data:

head(as.data.frame(first_resample))
#>                   mpg cyl  disp  hp drat   wt qsec vs am gear carb
#> Fiat 128         32.4   4  78.7  66 4.08 2.20 19.5  1  1    4    1
#> Toyota Corolla   33.9   4  71.1  65 4.22 1.83 19.9  1  1    4    1
#> Toyota Corolla.1 33.9   4  71.1  65 4.22 1.83 19.9  1  1    4    1
#> AMC Javelin      15.2   8 304.0 150 3.15 3.44 17.3  0  0    3    2
#> Valiant          18.1   6 225.0 105 2.76 3.46 20.2  1  0    3    1
#> Merc 450SLC      15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3
as.data.frame(first_resample, data = "assessment")
#>                     mpg cyl  disp  hp drat   wt qsec vs am gear carb
#> Mazda RX4 Wag      21.0   6 160.0 110 3.90 2.88 17.0  0  1    4    4
#> Hornet 4 Drive     21.4   6 258.0 110 3.08 3.21 19.4  1  0    3    1
#> Merc 240D          24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2
#> Merc 230           22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
#> Merc 280           19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
#> Merc 280C          17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
#> Merc 450SE         16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3
#> Merc 450SL         17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
#> Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.25 18.0  0  0    3    4
#> Chrysler Imperial  14.7   8 440.0 230 3.23 5.34 17.4  0  0    3    4
#> Honda Civic        30.4   4  75.7  52 4.93 1.61 18.5  1  1    4    2
#> Fiat X1-9          27.3   4  79.0  66 4.08 1.94 18.9  1  1    4    1
#> Lotus Europa       30.4   4  95.1 113 3.77 1.51 16.9  1  1    5    2
#> Volvo 142E         21.4   4 121.0 109 4.11 2.78 18.6  1  1    4    2

Alternatively, you can use the shortcuts analysis(first_resample) and assessment(first_resample).

rsample/tests/0000755000176200001440000000000013512177703013065 5ustar liggesusersrsample/tests/testthat.R0000644000176200001440000000011513323650050015034 0ustar liggesuserslibrary(testthat) library(rsample) test_check(package = "rsample") q("no") rsample/tests/testthat/0000755000176200001440000000000013512203633014715 5ustar liggesusersrsample/tests/testthat/test_vfold.R0000644000176200001440000000453313512132573017222 0ustar liggesuserscontext("V-fold CV") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- vfold_cv(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 18)) expect_true(all(sizes1$assessment == 2)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('repeated', { set.seed(11) rs2 <- vfold_cv(dat1, repeats = 4) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == 18)) expect_true(all(sizes2$assessment == 2)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('strata', { iris2 <- iris[1:130, ] set.seed(11) rs3 <- vfold_cv(iris2, repeats = 2, strata = "Species") sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 117)) expect_true(all(sizes3$assessment == 13)) rate <- map_dbl(rs3$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) expect_true(length(unique(rate)) == 1) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('bad args', { expect_error(vfold_cv(iris, strata = iris$Species)) expect_error(vfold_cv(iris, strata = c("Species", "Sepal.Width"))) }) test_that('printing', { expect_output(print(vfold_cv(mtcars))) }) test_that('rsplit labels', { rs <- vfold_cv(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) rs2 <- vfold_cv(mtcars, repeats = 4) all_labs2 <- map_df(rs2$splits, labels) original_id2 <- rs2[, grepl("^id", names(rs2))] expect_equal(all_labs2, original_id2) }) rsample/tests/testthat/test_rsplit.R0000644000176200001440000000230013414310573017412 0ustar liggesuserscontext("Rsplit constructor") library(testthat) library(rsample) dat1 <- data.frame(a = 1:100, b = 101:200) size1 <- object.size(dat1) dat2 <- as.matrix(dat1) test_that('simple rsplit', { rs1 <- rsplit(dat1, 1:2, 4:5) expect_equal(rs1$data, dat1) expect_equal(rs1$in_id, 1:2) expect_equal(rs1$out_id, 4:5) }) test_that('simple rsplit with matrices', { rs2 <- rsplit(dat2, 1:2, 4:5) expect_equal(rs2$data, dat2) expect_equal(rs2$in_id, 1:2) expect_equal(rs2$out_id, 4:5) }) test_that('bad inputs', { expect_error(rsplit(as.list(dat1), 1:2, 4:5)) expect_error(rsplit(dat1, letters[1:2], 4:5)) expect_error(rsplit(as.list(dat1), 1:2, letters[4:5])) expect_error(rsplit(as.list(dat1), -1:2, 4:5)) expect_error(rsplit(as.list(dat1), 1:2, -4:5)) expect_error(rsplit(as.list(dat1), integer(0), 4:5)) }) test_that('as.data.frame', { rs3 <- rsplit(dat1, 1:2, 4:5) expect_equal(as.data.frame(rs3), dat1[1:2,]) expect_equal(as.data.frame(rs3, data = "assessment"), dat1[4:5,]) rs4 <- rsplit(dat1, rep(1:2, each = 3), rep(4:5, c(2, 1))) expect_equal(as.data.frame(rs4), dat1[c(1, 1, 1, 2, 2, 2),]) expect_equal(as.data.frame(rs4, data = "assessment"), dat1[c(4, 4, 5),]) }) rsample/tests/testthat/test_loo.R0000644000176200001440000000145413352536641016706 0ustar liggesuserscontext("Leave-one-out CV") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:10, b = letters[1:10]) test_that('Loooooo', { loo1 <- loo_cv(dat1) expect_equal(nrow(loo1), nrow(dat1)) same_data <- map_lgl(loo1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) holdouts <- map_lgl(loo1$splits, function(x) length(x$out_id) == 1) expect_true(all(holdouts)) retained <- map_lgl(loo1$splits, function(x) length(x$in_id) == (nrow(dat1) - 1)) expect_true(all(retained)) }) test_that('printing', { expect_output(print(loo_cv(dat1))) }) test_that('rsplit labels', { rs <- loo_cv(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_dplyr.R0000644000176200001440000000545713414310573017247 0ustar liggesuserscontext("Compatibility with dplyr") library(rsample) library(testthat) library(dplyr) ################################################################### obj_1 <- vfold_cv(mtcars) obj_2 <- bootstraps(mtcars) obj_3 <- rolling_origin(mtcars) obj_4 <- nested_cv(mtcars, obj_1, inside = bootstraps(times = 5)) obj_5 <- mc_cv(mtcars) obj_6 <- loo_cv(mtcars) obj_7 <- group_vfold_cv(mtcars, group = "am") ################################################################### check_att <- function(x, y) length(setdiff(names(attributes(x)), names(attributes(x)))) == 0 ################################################################### test_that('object types', { expect_true(is_rset(obj_1)) expect_false(is_rset(obj_1[, -1])) }) ################################################################### test_that('dplyr ops', { expect_true( is_rset( obj_2 %>% filter(id == "Bootstrap02") ) ) expect_true( is_rset( obj_3 %>% mutate(blah = substr(id, 1, 3)) ) ) expect_true( is_rset( obj_4 %>% select(splits, id) ) ) expect_true( is_rset( obj_1 %>% arrange(id) ) ) expect_true( is_rset( obj_1 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_1 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_1 ) ) expect_true( is_rset( obj_2 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_2 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_2 ) ) expect_true( is_rset( obj_3 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_3 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_3 ) ) expect_true( is_rset( obj_4 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_4 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_4 ) ) expect_true( is_rset( obj_5 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_5 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_5 ) ) expect_true( is_rset( obj_6 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_6 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_6 ) ) expect_true( is_rset( obj_7 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah) ) ) expect_true( check_att( obj_7 %>% mutate(blah = substr(id, 1, 3)) %>% rename(newer = blah), obj_7 ) ) expect_true( is_rset( obj_3 %>% slice(1L) ) ) }) rsample/tests/testthat/test_mc.R0000644000176200001440000000423113512132573016502 0ustar liggesuserscontext("Monte Carlo CV") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- mc_cv(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 15)) expect_true(all(sizes1$assessment == 5)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('different percent', { set.seed(11) rs2 <- mc_cv(dat1, prop = .5) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == 10)) expect_true(all(sizes2$assessment == 10)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('strata', { iris2 <- iris[1:130, ] set.seed(11) rs3 <- mc_cv(iris2, strata = "Species") sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 99)) expect_true(all(sizes3$assessment == 31)) rate <- map_dbl(rs3$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) expect_true(length(unique(rate)) == 1) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('bad args', { expect_error(mc_cv(iris, strata = iris$Species)) expect_error(mc_cv(iris, strata = c("Species", "Sepal.Length"))) }) test_that('printing', { expect_output(print(mc_cv(iris))) }) test_that('rsplit labels', { rs <- mc_cv(mtcars) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_initial.R0000644000176200001440000000130413352536641017540 0ustar liggesuserscontext("Initial splitting") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- initial_split(dat1) expect_equal(class(rs1), c("rsplit", "mc_split")) tr1 <- training(rs1) ts1 <- testing(rs1) expect_equal(nrow(tr1), nrow(dat1)*3/4) expect_equal(nrow(ts1), nrow(dat1)/4) }) test_that('default time param', { rs1 <- initial_time_split(dat1) expect_equal(class(rs1), "rsplit") tr1 <- training(rs1) ts1 <- testing(rs1) expect_equal(nrow(tr1), floor(nrow(dat1) * 3/4)) expect_equal(nrow(ts1), ceiling(nrow(dat1) / 4)) expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3/4))) }) rsample/tests/testthat/test_rolling.R0000644000176200001440000000435213414310573017554 0ustar liggesuserscontext("Rolling window resampling") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { rs1 <- rolling_origin(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$assessment == 1)) expect_true(all(sizes1$analysis == 5:19)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) for (i in 1:nrow(rs1)) { expect_equal(rs1$splits[[i]]$in_id, 1:(i + attr(rs1, "initial") - 1)) expect_equal(rs1$splits[[i]]$out_id, i + attr(rs1, "initial")) } }) test_that('larger holdout', { rs2 <- rolling_origin(dat1, assess = 3) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$assessment == 3)) expect_true(all(sizes2$analysis == 5:17)) for (i in 1:nrow(rs2)) { expect_equal(rs2$splits[[i]]$in_id, 1:(i + attr(rs2, "initial") - 1)) expect_equal(rs2$splits[[i]]$out_id, (i + attr(rs2, "initial")): (i + attr(rs2, "initial") + attr(rs2, "assess") - 1)) } }) test_that('fixed analysis size', { rs3 <- rolling_origin(dat1, cumulative = FALSE) sizes3 <- dim_rset(rs3) expect_true(all(sizes3$assessment == 1)) expect_true(all(sizes3$analysis == 5)) for (i in 1:nrow(rs3)) { expect_equal(rs3$splits[[i]]$in_id, i:(i + attr(rs3, "initial") - 1)) expect_equal(rs3$splits[[i]]$out_id, i + attr(rs3, "initial")) } }) test_that('skipping', { rs4 <- rolling_origin(dat1, cumulative = FALSE, skip = 2) sizes4 <- dim_rset(rs4) expect_true(all(sizes4$assessment == 1)) expect_true(all(sizes4$analysis == 5)) for (i in 1:nrow(rs4)) { expect_equal(rs4$splits[[i]]$in_id, (i + attr(rs4, "skip")*(i-1)): (i + attr(rs4, "skip")*(i-1) + attr(rs4, "initial") -1)) expect_equal(rs4$splits[[i]]$out_id, i + attr(rs4, "skip")*(i-1) + attr(rs4, "initial")) } }) test_that('printing', { expect_output(print(rolling_origin(dat1))) }) test_that('rsplit labels', { rs <- rolling_origin(dat1) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_boot.R0000644000176200001440000000455713512132573017061 0ustar liggesuserscontext("Bootstrapping") library(testthat) library(rsample) library(purrr) dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('default param', { set.seed(11) rs1 <- bootstraps(dat1) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == nrow(dat1))) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, dat1)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) }) test_that('apparent', { rs2 <- bootstraps(dat1, apparent = TRUE) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == nrow(dat1))) expect_true(all(sizes2$assessment[nrow(sizes2)] == nrow(dat1))) expect_equal(sizes2$assessment[sizes2$id == "Apparent"], nrow(dat1)) res2 <- as.data.frame(rs2$splits[[nrow(sizes2)]], data = "assessment") expect_equal(res2, dat1) }) test_that('strata', { iris2 <- iris[1:130, ] set.seed(11) rs4 <- bootstraps(iris2, strata = "Species") sizes4 <- dim_rset(rs4) expect_true(all(sizes4$analysis == nrow(iris2))) rate <- map_dbl(rs4$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) expect_true(length(unique(rate)) == 1) good_holdout <- map_lgl(rs4$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) rs5 <- bootstraps(iris2, apparent = TRUE, strata = "Species") sizes5 <- dim_rset(rs5) expect_true(all(sizes5$analysis == nrow(iris2))) expect_true(all(sizes5$assessment[nrow(sizes5)] == nrow(iris2))) expect_equal(sizes5$assessment[sizes5$id == "Apparent"], nrow(iris2)) res5 <- as.data.frame(rs5$splits[[nrow(sizes5)]], data = "assessment") expect_equal(res5, iris2) }) test_that('bad args', { expect_error(bootstraps(iris, strata = iris$Species)) expect_error(bootstraps(iris, strata = c("Species", "Sepal.Length"))) }) test_that('printing', { expect_output(print(bootstraps(iris))) }) test_that('rsplit labels', { rs <- bootstraps(iris) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_bootci.R0000644000176200001440000001401313512145132017354 0ustar liggesuserslibrary(rsample) library(testthat) library(purrr) library(tibble) library(dplyr) library(broom) data("attrition") context("Bootstrap intervals") # ------------------------------------------------------------------------------ get_stats <- function(split, ...) { dat <- analysis(split) x <- dat[[1]] tibble( term = "mean", estimate = mean(x, na.rm = TRUE), std.error = sqrt(var(x, na.rm = TRUE)/sum(!is.na(x))) ) } # ------------------------------------------------------------------------------ n <- 1000 mu <- 10 sigma <- 1 set.seed(888) rand_nums <- rnorm(n, mu, sigma) ttest <- tidy(t.test(rand_nums)) dat <- data.frame(x = rand_nums) set.seed(456765) bt_norm <- bootstraps(dat, times = 1000, apparent = TRUE) %>% dplyr::mutate( stats = map(splits, ~ get_stats(.x)) ) test_that('Bootstrap estimate of mean is close to estimate of mean from normal distribution',{ single_pct_res <- int_pctl(bt_norm, stats) single_t_res <- int_t(bt_norm, stats) single_bca_res <- int_bca(bt_norm, stats, .fn = get_stats) expect_equal(ttest$conf.low, single_pct_res$.lower, tolerance = 0.01) expect_equal(ttest$estimate, single_pct_res$.estimate, tolerance = 0.01) expect_equal(ttest$conf.high, single_pct_res$.upper, tolerance = 0.01) expect_equal(ttest$conf.low, single_t_res$.lower, tolerance = 0.01) expect_equal(ttest$estimate, single_t_res$.estimate, tolerance = 0.01) expect_equal(ttest$conf.high, single_pct_res$.upper, tolerance = 0.01) expect_equal(ttest$conf.low, single_bca_res$.lower, tolerance = 0.01) expect_equal(ttest$estimate, single_bca_res$.estimate, tolerance = 0.01) expect_equal(ttest$conf.high, single_bca_res$.upper, tolerance = 0.01) }) # ------------------------------------------------------------------------------ context("Wrapper Functions") test_that("Wrappers -- selection of multiple variables works", { func <- function(split, ...) { lm(Age ~ HourlyRate + DistanceFromHome, data = analysis(split)) %>% tidy() } # generate boostrap resamples set.seed(888) bt_resamples <- bootstraps(attrition, times = 1000, apparent = TRUE) %>% mutate(res = map(splits, func)) attrit_tidy <- lm(Age ~ HourlyRate + DistanceFromHome, data = attrition) %>% tidy(conf.int = TRUE) %>% dplyr::arrange(term) pct_res <- int_pctl(bt_resamples, res) %>% inner_join(attrit_tidy, by = "term") expect_equal(pct_res$conf.low, pct_res$.lower, tolerance = .01) expect_equal(pct_res$conf.high, pct_res$.upper, tolerance = .01) t_res <- int_t(bt_resamples, res) %>% inner_join(attrit_tidy, by = "term") expect_equal(t_res$conf.low, t_res$.lower, tolerance = .01) expect_equal(t_res$conf.high, t_res$.upper, tolerance = .01) bca_res <- int_bca(bt_resamples, res, .fn = func) %>% inner_join(attrit_tidy, by = "term") expect_equal(bca_res$conf.low, bca_res$.lower, tolerance = .01) expect_equal(bca_res$conf.high, bca_res$.upper, tolerance = .01) }) # ------------------------------------------------------------------------------ context("boot_ci() Prompt Errors: Too Many NAs") test_that('Upper & lower confidence interval does not contain NA', { bad_stats <- function(split, ...) { tibble( term = "mean", estimate = NA_real_, std.error = runif(1) ) } set.seed(888) bt_resamples <- bootstraps(data.frame(x = 1:100), times = 1000, apparent = TRUE) %>% mutate(res = map(splits, bad_stats)) expect_error( int_pctl(bt_resamples, res), "missing values" ) expect_error( int_t(bt_resamples, res), "missing values" ) expect_error( int_bca(bt_resamples, res, .fn = bad_stats), "missing values" ) }) # ------------------------------------------------------------------------------ context("boot_ci() Insufficient Number of Bootstrap Resamples") set.seed(456765) bt_small <- bootstraps(dat, times = 10, apparent = TRUE) %>% dplyr::mutate( stats = map(splits, ~ get_stats(.x)), junk = 1:11 ) test_that( "Sufficient replications needed to sufficiently reduce Monte Carlo sampling Error for BCa method", { expect_warning(int_pctl(bt_small, stats)) expect_warning(int_t(bt_small, stats)) expect_warning(int_bca(bt_small, stats, .fn = get_stats)) } ) context("boot_ci() Input Validation") test_that("bad input", { expect_error(int_pctl(bt_small, id)) expect_error(int_pctl(bt_small, junk)) bad_bt_norm <- bt_norm %>% mutate(stats = map(stats, ~ .x[, 1:2])) expect_error(int_t(bad_bt_norm, stats)) expect_error(int_bca(bad_bt_norm, stats)) no_dots <- function(split) { dat <- analysis(split) x <- dat[[1]] tibble( term = "mean", estimate = mean(x, na.rm = TRUE), std.error = sqrt(var(x, na.rm = TRUE)/sum(!is.na(x))) ) } expect_error( int_bca(bt_norm, stats, .fn = no_dots), "must have an argument" ) expect_error(int_pctl(as.data.frame(bt_norm), stats)) expect_error(int_t(as.data.frame(bt_norm), stats)) expect_error(int_bca(as.data.frame(bt_norm), stats, .fn = get_stats)) expect_error( int_t(bt_norm %>% dplyr::filter(id != "Apparent"), stats) ) expect_error( int_bca(bt_norm %>% dplyr::filter(id != "Apparent"), stats, .fn = get_stats) ) poo <- function(x) { x$estimate <- "a" x } badder_bt_norm <- bt_norm %>% mutate( bad_term = map(stats, ~ .x %>% setNames(c("a", "estimate", "std.err"))), bad_est = map(stats, ~ .x %>% setNames(c("term", "b", "std.err"))), bad_err = map(stats, ~ .x %>% setNames(c("term", "estimate", "c"))), bad_num = map(stats, ~ poo(.x)) ) expect_error(int_pctl(badder_bt_norm, bad_term)) expect_error(int_t(badder_bt_norm, bad_err)) expect_error(int_bca(badder_bt_norm, bad_est, .fn = get_stats)) expect_error(int_pctl(badder_bt_norm, bad_num)) }) rsample/tests/testthat/test_for_pred.R0000644000176200001440000000242413352536641017713 0ustar liggesuserscontext("Predictor extraction") library(testthat) test_that('no dots', { expect_equal(form_pred(y ~ x + z), c("x", "z")) expect_equal(form_pred(terms(y ~ x + z)), c("x", "z")) expect_equal(form_pred(y ~ x + log(z)), c("x", "z")) expect_equal(form_pred(terms(y ~ x + log(z))), c("x", "z")) expect_equal(form_pred(log(y) ~ x + z), c("x", "z")) expect_equal(form_pred(terms(log(y) ~ x + z)), c("x", "z")) expect_equal(form_pred(y1 + y2 ~ x + z), c("x", "z")) expect_equal(form_pred(terms(y1 + y2 ~ x + z)), c("x", "z")) expect_equal(form_pred(log(y1) + y2 ~ x + z), c("x", "z")) expect_equal(form_pred(terms(log(y1) + y2 ~ x + z)), c("x", "z")) expect_equal(form_pred(~ x + z), c("x", "z")) expect_equal(form_pred(terms(~ x + z)), c("x", "z")) expect_equal(form_pred(~ x), "x") expect_equal(form_pred(terms(~ x)), "x") expect_equal(form_pred(y ~ x), "x") expect_equal(form_pred(terms(y ~ x)), "x") }) test_that('dots', { expect_error(form_pred(y ~ .)) expect_error(form_pred(terms(y ~ .))) expect_error(form_pred(y ~ (.)^2)) expect_error(form_pred(terms(y ~ (.)^2))) expect_equal(form_pred(terms(Species ~ (.)^2, data = iris)), names(iris)[1:4]) expect_equal(form_pred(terms(~ (.)^2, data = iris)), names(iris)) }) rsample/tests/testthat/test_rset.R0000644000176200001440000000335213414310573017062 0ustar liggesuserscontext("Rset constructor") library(testthat) library(rsample) cars_10fold <- vfold_cv(mtcars) test_that('bad args', { expect_error( new_rset(cars_10fold$splits[1:2], cars_10fold$id) ) expect_error( new_rset(cars_10fold$splits, cars_10fold[ "splits"]) ) expect_error( new_rset(cars_10fold$splits, cars_10fold$splits) ) args <- list(a = 1, b = 2, 3) expect_error( new_rset( cars_10fold$splits, cars_10fold$id, attrib = args ) ) }) test_that('simple rset', { res1 <- new_rset( cars_10fold$splits, cars_10fold$id ) expect_equal(names(res1), c("splits", "id")) expect_equal(class(res1), c("tbl_df", "tbl", "data.frame")) expect_equal(sort(names(attributes(res1))), c("class", "names", "row.names")) res2 <- new_rset( cars_10fold[, "splits"], cars_10fold[, "id"] ) expect_equal(names(res2), c("splits", "id")) expect_equal(class(res2), c("rset", "tbl_df", "tbl", "data.frame")) expect_equal(sort(names(attributes(res2))), sort(names(attributes(cars_10fold)))) }) test_that('rset with attributes', { args <- list(value = "potato") res3 <- new_rset( cars_10fold$splits, cars_10fold$id, attrib = args ) expect_equal(sort(names(attributes(res3))), c("class", "names", "row.names", "value")) expect_equal(attr(res3, "value"), "potato") }) test_that('rset with additional classes', { res4 <- new_rset( cars_10fold$splits, cars_10fold$id, subclass = "potato" ) expect_equal(class(res4), c("potato", "tbl_df", "tbl", "data.frame")) }) test_that('not an rsplit', { folds <- vfold_cv(mtcars) expect_error(analysis(folds$splits[1])) expect_error(assessment(folds$splits[1])) }) rsample/tests/testthat/test_labels.R0000644000176200001440000000330613507237102017345 0ustar liggesuserslibrary(testthat) library(rsample) context("Labels") test_that('basic cv', { cv_obj <- vfold_cv(mtcars) expect_equal(cv_obj$id, labels(cv_obj)) expect_is(labels(cv_obj), "character") expect_s3_class(labels(cv_obj, TRUE), "factor") }) test_that('repeated cv', { rcv_obj <- vfold_cv(mtcars, repeats = 3) expect_equal(paste(rcv_obj$id, rcv_obj$id2, sep = "."), labels(rcv_obj)) expect_is(labels(rcv_obj), "character") expect_s3_class(labels(rcv_obj, TRUE), "factor") }) test_that('nested cv', { expect_error( labels( nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5) ) ) ) }) test_that('adding labels', { set.seed(363) car_folds <- vfold_cv(mtcars, repeats = 3) res <- analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits[[1]]) expect_equal(colnames(res), c(colnames(mtcars), "id", "id2")) car_bt <- bootstraps(mtcars) res <- analysis(car_bt$splits[[1]]) %>% add_resample_id(car_bt$splits[[1]]) expect_equal(colnames(res), c(colnames(mtcars), "id")) res <- analysis(car_bt$splits[[1]]) %>% add_resample_id(car_bt$splits[[1]], TRUE) expect_equal(colnames(res), c(colnames(mtcars), ".id")) expect_error( analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits[[1]], 7) ) expect_error( analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits[[1]], c(TRUE, TRUE)) ) expect_error( analysis(car_folds$splits[[1]]) %>% add_resample_id(car_folds$splits) ) expect_error( analysis(car_folds$splits[[1]]) %>% as.matrix() %>% add_resample_id(car_folds$splits[[1]]) ) }) rsample/tests/testthat/test_names.R0000644000176200001440000000041613414310573017206 0ustar liggesuserscontext("Naming functions") library(testthat) library(rsample) test_that('basic naming sequences', { expect_equal(names0(2), c("x1", "x2")) expect_equal(names0(2, "y"), c("y1", "y2")) expect_equal(names0(10), c(paste0("x0", 1:9), "x10")) }) rsample/tests/testthat/test_group.R0000644000176200001440000000515013414310573017237 0ustar liggesuserscontext("Group resampling") library(testthat) library(rsample) library(purrr) library(tibble) iris2 <- as_tibble(iris) get_id_left_out <- function(x) unique(as.character(assessment(x)$Species)) test_that('bad args', { expect_error(group_vfold_cv(iris, group = iris$Species)) expect_error(group_vfold_cv(iris, group = c("Species", "Sepal.Width"))) expect_error(group_vfold_cv(iris, group = "Specie")) expect_error(group_vfold_cv(iris)) expect_error(group_vfold_cv(iris, group = "Species", v = 10)) }) test_that('default param', { set.seed(11) rs1 <- group_vfold_cv(iris, "Species") sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 100)) expect_true(all(sizes1$assessment == 50)) same_data <- map_lgl(rs1$splits, function(x) all.equal(x$data, iris)) expect_true(all(same_data)) good_holdout <- map_lgl(rs1$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) sp_out <- map_chr(rs1$splits, get_id_left_out) expect_true(all(table(sp_out) == 1)) }) test_that('v < max v', { set.seed(11) rs2 <- group_vfold_cv(iris, "Species", v = 2) sizes2 <- dim_rset(rs2) expect_true(!all(sizes2$analysis == 100)) expect_true(!all(sizes2$assessment == 50)) same_data <- map_lgl(rs2$splits, function(x) all.equal(x$data, iris)) expect_true(all(same_data)) good_holdout <- map_lgl(rs2$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) sp_out <- map(rs2$splits, get_id_left_out) expect_true(all(table(unlist(sp_out)) == 1)) }) test_that('tibble input', { set.seed(11) rs3 <- group_vfold_cv(iris2, "Species") sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 100)) expect_true(all(sizes3$assessment == 50)) same_data <- map_lgl(rs3$splits, function(x) all.equal(x$data, iris2)) expect_true(all(same_data)) good_holdout <- map_lgl(rs3$splits, function(x) { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) sp_out <- map_chr(rs3$splits, get_id_left_out) expect_true(all(table(sp_out) == 1)) }) test_that('printing', { expect_output(print(group_vfold_cv(iris, "Species"))) }) test_that('rsplit labels', { rs <- group_vfold_cv(iris, "Species") all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_strata.R0000644000176200001440000000125313512132573017402 0ustar liggesuserscontext("Strata constructor") library(testthat) library(rsample) library(purrr) test_that('simple numerics', { set.seed(13333) x1 <- rnorm(1000) str1a <- make_strata(x1) tab1a <- table(str1a) expect_equal(as.vector(tab1a), rep(250, 4)) str1b <- make_strata(x1, depth = 500) tab1b <- table(str1b) expect_equal(as.vector(tab1b), rep(500, 2)) }) test_that('simple character', { x2 <- factor(rep(LETTERS[1:5], each = 50)) str2a <- make_strata(x2) expect_equal(table(str2a, dnn = ""), table(x2, dnn = "")) }) test_that('bad data', { x3 <- factor(rep(LETTERS[1:10], each = 50)) expect_warning(make_strata(x3)) expect_warning(make_strata(mtcars$mpg)) }) rsample/tests/testthat/test_caret.R0000644000176200001440000002237713352536641017222 0ustar liggesuserscontext("Conversions for caret") library(testthat) library(rsample) ################################################################### ## Test cases for caret -> rsample that mimic `trainControl` dat <- data.frame(y = 1:15, x = 15:1) lgo1 <- structure( list( method = "LGOCV", index = structure( list( Resample1 = c(1L, 4L, 5L, 6L, 7L, 9L, 10L, 14L), Resample2 = c(2L, 4L, 5L, 6L, 9L, 10L, 14L, 15L), Resample3 = c(1L, 2L, 3L, 5L, 6L, 7L, 8L, 9L) ), .Names = c("Resample1", "Resample2", "Resample3") ), indexOut = structure( list( Resample1 = c(2L, 3L, 8L, 11L, 12L, 13L, 15L), Resample2 = c(1L, 3L, 7L, 8L, 11L, 12L, 13L), Resample3 = c(4L, 10L, 11L, 12L, 13L, 14L, 15L) ), .Names = c("Resample1", "Resample2", "Resample3") ), number = 3, p = 0.5 ), .Names = c("method", "index", "indexOut", "number", "p") ) cv_1 <- structure( list( method = "cv", index = structure( list( Fold1 = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 13L), Fold2 = c(1L, 4L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold3 = c(1L, 2L, 3L, 5L, 7L, 9L, 12L, 14L, 15L) ), .Names = c("Fold1", "Fold2", "Fold3") ), indexOut = structure( list( Resample1 = c(1L, 9L, 12L, 14L, 15L), Resample2 = c(2L, 3L, 5L, 7L), Resample3 = c(4L, 6L, 8L, 10L, 11L, 13L) ), .Names = c("Resample1", "Resample2", "Resample3") ), number = 3, repeats = NA ), .Names = c("method", "index", "indexOut", "number", "repeats") ) cv_2 <- structure( list( method = "repeatedcv", index = structure( list( Fold1.Rep1 = c(1L, 3L, 4L, 6L, 9L, 10L, 12L, 13L, 14L, 15L), Fold2.Rep1 = c(2L, 5L, 7L, 8L, 10L, 11L, 13L, 14L, 15L), Fold3.Rep1 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L), Fold1.Rep2 = c(1L, 2L, 3L, 5L, 6L, 7L, 10L, 11L, 12L, 14L), Fold2.Rep2 = c(2L, 4L, 6L, 8L, 9L, 11L, 13L, 14L, 15L), Fold3.Rep2 = c(1L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 12L, 13L, 15L) ), .Names = c( "Fold1.Rep1", "Fold2.Rep1", "Fold3.Rep1", "Fold1.Rep2", "Fold2.Rep2", "Fold3.Rep2" ) ), indexOut = structure( list( Resample1 = c(2L, 5L, 7L, 8L, 11L), Resample2 = c(1L, 3L, 4L, 6L, 9L, 12L), Resample3 = c(10L, 13L, 14L, 15L), Resample4 = c(4L, 8L, 9L, 13L, 15L), Resample5 = c(1L, 3L, 5L, 7L, 10L, 12L), Resample6 = c(2L, 6L, 11L, 14L) ), .Names = c( "Resample1", "Resample2", "Resample3", "Resample4", "Resample5", "Resample6" ) ), number = 3, repeats = 2 ), .Names = c("method", "index", "indexOut", "number", "repeats") ) cv_3 <- cv_2 cv_3$method <- "adaptive_cv" bt_1 <- structure( list( method = "boot", index = structure( list( Resample1 = c(1L, 1L, 4L, 4L, 5L, 7L, 8L, 10L, 11L, 11L, 12L, 13L, 15L, 15L, 15L), Resample2 = c(1L, 2L, 3L, 5L, 5L, 5L, 6L, 7L, 8L, 9L, 9L, 9L, 10L, 10L, 12L) ), .Names = c("Resample1", "Resample2") ), indexOut = structure( list( Resample1 = c(2L, 3L, 6L, 9L, 14L), Resample2 = c(4L, 11L, 13L, 14L, 15L) ), .Names = c("Resample1", "Resample2") ), number = 2 ), .Names = c("method", "index", "indexOut", "number") ) bt_2 <- bt_1 bt_2$method <- "boot632" bt_3 <- bt_1 bt_3$method <- "optimism_boot" bt_4 <- bt_1 bt_4$method <- "boot_all" bt_5 <- bt_1 bt_5$method <- "adaptive_boot" loo_1 <- structure( list( method = "LOOCV", index = structure( list( Fold01 = 2:15, Fold02 = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold03 = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold04 = c(1L, 2L, 3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold05 = c(1L, 2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold06 = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold07 = c(1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold08 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), Fold09 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 10L, 11L, 12L, 13L, 14L, 15L), Fold10 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L), Fold11 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L, 13L, 14L, 15L), Fold12 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L), Fold13 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 14L, 15L), Fold14 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L), Fold15 = 1:14 ), .Names = c( "Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fold06", "Fold07", "Fold08", "Fold09", "Fold10", "Fold11", "Fold12", "Fold13", "Fold14", "Fold15" ) ), indexOut = structure( list( Resample01 = 1L, Resample02 = 2L, Resample03 = 3L, Resample04 = 4L, Resample05 = 5L, Resample06 = 6L, Resample07 = 7L, Resample08 = 8L, Resample09 = 9L, Resample10 = 10L, Resample11 = 11L, Resample12 = 12L, Resample13 = 13L, Resample14 = 14L, Resample15 = 15L ), .Names = c( "Resample01", "Resample02", "Resample03", "Resample04", "Resample05", "Resample06", "Resample07", "Resample08", "Resample09", "Resample10", "Resample11", "Resample12", "Resample13", "Resample14", "Resample15" ) ) ), .Names = c("method", "index", "indexOut") ) rof_1 <- structure( list( method = "timeSlice", index = structure( list( Training04 = 1:4, Training05 = 2:5, Training06 = 3:6, Training07 = 4:7, Training08 = 5:8, Training09 = 6:9, Training10 = 7:10 ), .Names = c( "Training04", "Training05", "Training06", "Training07", "Training08", "Training09", "Training10" ) ), indexOut = structure( list( Testing04 = 5:9, Testing05 = 6:10, Testing06 = 7:11, Testing07 = 8:12, Testing08 = 9:13, Testing09 = 10:14, Testing10 = 11:15 ), .Names = c( "Testing04", "Testing05", "Testing06", "Testing07", "Testing08", "Testing09", "Testing10" ) ), initialWindow = 4, horizon = 5, fixedWindow = TRUE, skip = 0 ), .Names = c( "method", "index", "indexOut", "initialWindow", "horizon", "fixedWindow", "skip" ) ) ################################################################### ## check_indices <- function(newer, orig) { for (i in seq_along(newer$splits)) { expect_equal(as.integer(newer$splits[[i]]), orig$index[[i]]) expect_equal(as.integer(newer$splits[[i]], "assessment"), orig$indexOut[[i]]) } invisible(NULL) } ################################################################### ## Tests test_that('basic v-fold', { vfold_obj_1 <- caret2rsample(cv_1, data = dat) check_indices(vfold_obj_1, cv_1) for (i in seq_along(vfold_obj_1$splits)) expect_equal(vfold_obj_1$id[[i]], names(cv_1$index)[i]) }) test_that('repeated v-fold', { vfold_obj_2 <- caret2rsample(cv_2, data = dat) check_indices(vfold_obj_2, cv_2) for (i in seq_along(vfold_obj_2$splits)) expect_equal(paste(vfold_obj_2$id2[[i]], vfold_obj_2$id[[i]], sep = "."), names(cv_2$index)[i]) }) test_that('basic boot', { bt_obj_1 <- caret2rsample(bt_1, data = dat) check_indices(bt_obj_1, bt_1) for (i in seq_along(bt_obj_1$splits)) expect_equal(bt_obj_1$id[[i]], names(bt_1$index)[i]) }) test_that('boot 632', { bt_obj_2 <- caret2rsample(bt_2, data = dat) check_indices(bt_obj_2, bt_2) for (i in seq_along(bt_obj_2$splits)) expect_equal(bt_obj_2$id[[i]], names(bt_2$index)[i]) }) test_that('boot optim', { bt_obj_3 <- caret2rsample(bt_3, data = dat) check_indices(bt_obj_3, bt_3) for (i in seq_along(bt_obj_3$splits)) expect_equal(bt_obj_3$id[[i]], names(bt_3$index)[i]) }) test_that('boot all', { bt_obj_4 <- caret2rsample(bt_4, data = dat) check_indices(bt_obj_4, bt_4) for (i in seq_along(bt_obj_4$splits)) expect_equal(bt_obj_4$id[[i]], names(bt_4$index)[i]) }) test_that('adaptive boot', { bt_obj_5 <- caret2rsample(bt_5, data = dat) check_indices(bt_obj_5, bt_5) for (i in seq_along(bt_obj_5$splits)) expect_equal(bt_obj_5$id[[i]], names(bt_5$index)[i]) }) test_that('loo', { loo_obj <- caret2rsample(loo_1, data = dat) check_indices(loo_obj, loo_1) for (i in seq_along(loo_obj$splits)) expect_equal(loo_obj$id[[i]], names(loo_1$index)[i]) }) test_that('mcv', { mcv_obj <- caret2rsample(lgo1, data = dat) check_indices(mcv_obj, lgo1) for (i in seq_along(mcv_obj$splits)) expect_equal(mcv_obj$id[[i]], names(lgo1$index)[i]) }) test_that('rolling origin', { rof_obj <- caret2rsample(rof_1, data = dat) check_indices(rof_obj, rof_1) for (i in seq_along(rof_obj$splits)) expect_equal(rof_obj$id[[i]], names(rof_1$index)[i]) }) rsample/tests/testthat/test_gather.R0000644000176200001440000000124713352536641017367 0ustar liggesuserscontext("Gather from tidyr") library(testthat) library(rsample) library(tidyr) cvs <- vfold_cv(mtcars) cvs$one <- 1 cvs$two <- 2 expt <- cvs %>% gather(model, statistic, -id) test_that('basics', { res_1 <- gather(cvs) expect_equal(res_1, expt) }) test_that('extra args ignored', { res_2 <- gather(cvs, contains("o")) expect_equal(res_2, expt) res_2 <- gather(cvs, key = ignored) expect_equal(res_2, expt) res_3 <- gather(cvs, ignored) expect_equal(res_2, expt) }) test_that('no extra cols', { expect_error(gather(vfold_cv(mtcars))) }) test_that('no splits', { cvs2 <- cvs cvs2$splits <- NULL res_4 <- gather(cvs2) expect_equal(res_4, expt) }) rsample/tests/testthat/test_nesting.R0000644000176200001440000000414213414310573017552 0ustar liggesuserscontext("Nested CV") library(testthat) library(rsample) library(purrr) test_that('default param', { set.seed(11) rs1 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = vfold_cv(v = 3)) sizes1 <- dim_rset(rs1) expect_true(all(sizes1$analysis == 27)) expect_true(all(sizes1$assessment == 3)) subsizes1 <- map(rs1$inner_resamples, dim_rset) subsizes1 <- do.call("rbind", subsizes1) expect_true(all(subsizes1$analysis == 18)) expect_true(all(subsizes1$assessment == 9)) set.seed(11) rs2 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = bootstraps(times = 3)) sizes2 <- dim_rset(rs2) expect_true(all(sizes2$analysis == 27)) expect_true(all(sizes2$assessment == 3)) subsizes2 <- map(rs2$inner_resamples, dim_rset) subsizes2 <- do.call("rbind", subsizes2) expect_true(all(subsizes2$analysis == 27)) set.seed(11) rs3 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = mc_cv(prop = 2/3, times = 3)) sizes3 <- dim_rset(rs3) expect_true(all(sizes3$analysis == 27)) expect_true(all(sizes3$assessment == 3)) subsizes3 <- map(rs3$inner_resamples, dim_rset) subsizes3 <- do.call("rbind", subsizes3) expect_true(all(subsizes3$analysis == 18)) expect_true(all(subsizes3$assessment == 9)) }) test_that('bad args', { expect_warning( nested_cv(mtcars, outside = bootstraps(times = 5), inside = vfold_cv(V = 3)) ) folds <- vfold_cv(mtcars) expect_error( nested_cv(mtcars, outside = vfold_cv(), inside = folds) ) }) test_that('printing', { rs1 <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = vfold_cv(v = 3)) expect_output(print(rs1)) }) test_that('rsplit labels', { rs <- nested_cv(mtcars[1:30,], outside = vfold_cv(v = 10), inside = vfold_cv(v = 3)) all_labs <- map_df(rs$splits, labels) original_id <- rs[, grepl("^id", names(rs))] expect_equal(all_labs, original_id) }) rsample/tests/testthat/test_tidy.R0000644000176200001440000000232213414310573017052 0ustar liggesuserscontext("Tidy methods") library(testthat) library(rsample) library(purrr) check_ind <- function(x, tdat) { in_dat <- subset(tdat, Data == "Analysis") in_check <- all(sort(in_dat$Row) == x$in_ind) out_dat <- subset(tdat, Data == "Analysis") out_check <- all(sort(out_dat$Row) == x$out_ind) in_check & out_check } dat1 <- data.frame(a = 1:20, b = letters[1:20]) test_that('simple boot', { set.seed(11) rs1 <- bootstraps(dat1) td1 <- tidy(rs1, unique_ind = FALSE) name_vals <- names0(nrow(rs1), "Bootstrap") for(i in 1:nrow(rs1)) { expect_true( check_ind(rs1$splits[[i]], subset(td1, Resample == name_vals[i]) ) ) } }) test_that('vfold', { set.seed(11) rs2 <- vfold_cv(dat1) td2 <- tidy(rs2, unique_ind = FALSE) for(i in 1:nrow(rs2)) { expect_true( check_ind(rs2$splits[[i]], subset(td2, Fold == rs2$id[i]) ) ) } }) test_that('vfold with repeats', { set.seed(11) rs3 <- vfold_cv(dat1, repeats = 2) td3 <- tidy(rs3, unique_ind = FALSE) for(i in 1:nrow(rs3)) { expect_true( check_ind(rs3$splits[[i]], subset(td3, Fold == rs3$id2[i] & Repeat == rs3$id[i]) ) ) } }) rsample/NAMESPACE0000644000176200001440000000635113512177670013152 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method("[",rset) S3method(as.data.frame,rsplit) S3method(as.integer,rsplit) S3method(complement,apparent_split) S3method(complement,boot_split) S3method(complement,group_vfold_split) S3method(complement,loo_split) S3method(complement,mc_split) S3method(complement,rof_split) S3method(complement,vfold_split) S3method(dim,rsplit) S3method(gather,rset) S3method(labels,rset) S3method(labels,rsplit) S3method(labels,vfold_cv) S3method(obj_sum,rsplit) S3method(populate,rset) S3method(populate,rsplit) S3method(pretty,apparent) S3method(pretty,bootstraps) S3method(pretty,group_vfold_cv) S3method(pretty,loo_cv) S3method(pretty,mc_cv) S3method(pretty,nested_cv) S3method(pretty,rolling_origin) S3method(pretty,vfold_cv) S3method(print,apparent) S3method(print,bootstraps) S3method(print,group_vfold_cv) S3method(print,loo_cv) S3method(print,mc_cv) S3method(print,nested_cv) S3method(print,rolling_origin) S3method(print,rsplit) S3method(print,vfold_cv) S3method(tidy,nested_cv) S3method(tidy,rset) S3method(tidy,rsplit) S3method(tidy,vfold_cv) S3method(type_sum,rsplit) export(add_resample_id) export(analysis) export(apparent) export(assessment) export(bootstraps) export(caret2rsample) export(complement) export(form_pred) export(gather.rset) export(group_vfold_cv) export(initial_split) export(initial_time_split) export(int_bca) export(int_pctl) export(int_t) export(loo_cv) export(make_strata) export(mc_cv) export(nested_cv) export(populate) export(pretty.apparent) export(pretty.bootstraps) export(pretty.group_vfold_cv) export(pretty.loo_cv) export(pretty.mc_cv) export(pretty.nested_cv) export(pretty.rolling_origin) export(pretty.vfold_cv) export(rolling_origin) export(rsample2caret) export(testing) export(tidy) export(training) export(vfold_cv) exportMethods(gather.rset) exportMethods(pretty.apparent) exportMethods(pretty.bootstraps) exportMethods(pretty.group_vfold_cv) exportMethods(pretty.loo_cv) exportMethods(pretty.mc_cv) exportMethods(pretty.nested_cv) exportMethods(pretty.rolling_origin) exportMethods(pretty.vfold_cv) importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,arrange_) importFrom(dplyr,as_tibble) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) importFrom(dplyr,do) importFrom(dplyr,full_join) importFrom(dplyr,group_by) importFrom(dplyr,inner_join) importFrom(dplyr,last) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(furrr,future_map_dfr) importFrom(generics,tidy) importFrom(methods,formalArgs) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map_dbl) importFrom(purrr,map_df) importFrom(purrr,map_dfr) importFrom(purrr,pluck) importFrom(rlang,"!!") importFrom(rlang,enquo) importFrom(rlang,exec) importFrom(rlang,is_call) importFrom(rlang,is_string) importFrom(rlang,quos) importFrom(stats,pnorm) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,setNames) importFrom(stats,terms) importFrom(tibble,as_tibble) importFrom(tibble,is_tibble) importFrom(tibble,obj_sum) importFrom(tibble,tibble) importFrom(tibble,type_sum) importFrom(tidyr,gather) importFrom(tidyr,unnest) importFrom(tidyselect,one_of) importFrom(tidyselect,vars_select) importFrom(utils,globalVariables) rsample/NEWS.md0000644000176200001440000000440313512145132013011 0ustar liggesusers# `rsample` 0.0.5 * Added three functions to compute different bootstrap confidence intervals. * A new function (`add_resample_id`) augments a data frame with columns for the resampling identifier. * Updated `initial_split`, `mc_cv`, `vfold_cv`, `bootstraps`, and `group_vfold_cv` to use tidyselect on the stratification variable. * Updated `initial_split`, `mc_cv`, `vfold_cv`, `bootstraps` with new `breaks` parameter that specifies the number of bins to stratify by for a numeric stratification variable. # `rsample` 0.0.4 Small maintenence release. ## Minor improvements and fixes * `fill()` was removed per the deprecation warning. * Small changes were made for the new version of `tibble`. # `rsample` 0.0.3 ## New features * Added function `initial_time_split` for ordered initial sampling appropriate for time series data. ## Minor improvements and fixes * `fill()` has been renamed `populate()` to avoid a conflict with `tidyr::fill()`. * Changed the R version requirement to be R >= 3.1 instead of 3.3.3. * The `recipes`-related `prepper` function was [moved to the `recipes` package](https://github.com/tidymodels/rsample/issues/48). This makes the `rsample` install footprint much smaller. * `rsplit` objects are shown differently inside of a tibble. * Moved from the `broom` package to the `generics` package. # `rsample` 0.0.2 * `initial_split`, `training`, and `testing` were added to do training/testing splits prior to resampling. * Another resampling method, `group_vfold_cv`, was added. * `caret2rsample` and `rsample2caret` can convert `rset` objects to those used by `caret::trainControl` and vice-versa. * A function called `form_pred` can be used to determine the original names of the predictors in a formula or `terms` object. * A vignette and a function (`prepper`) were included to facilitate using the `recipes` with `rsample`. * A `gather` method was added for `rset` objects. * A `labels` method was added for `rsplit` objects. This can help identify which resample is being used even when the whole `rset` object is not available. * A variety of `dplyr` methods were added (e.g. `filter`, `mutate`, etc) that work without dropping classes or attributes of the `rsample` objects. # `rsample` 0.0.1 (2017-07-08) Initial public version on CRAN rsample/data/0000755000176200001440000000000013331130352012620 5ustar liggesusersrsample/data/attrition.RData0000644000176200001440000012267613323650050015573 0ustar liggesusersŽ]v:w;~{9o˖eے%˖&ұ--Mn $!H BZ % <{Ys1G k9(?Ƙk]9{:ru9WtrK8,O0oܿw8Wx;W~<c=xc3Kxc{؍.<;^x40ޏ ύ0 8:£ :m ۂ;GØA&+'7s" GhdR8=\?Bko ZA׫pݏۯ^{`8K6n k'ɐ;??Ü7~ k|=wSQ d-mAFY k·ztq5-3AmAf;6|E|raAz+.1Y{Ø0o85;xn %b "'.zזc_x>t9dpb۾ `xuQy=?"g^DZwm >?oENւa`np~žec-!^"w_{,b(xb9υ10g[`p֙s7G]'|Z>|Hk {;cc#~j-z=Hp?t?b"Wc&;e1svŜ,`s>t :o :8ɋ\c\ qX ] ,8 : 9~\x=] 2aehsXGjBMqҧd.w.ɥa!f]떰ޠ1. z[].}gB+H\7X;;m.֧AbQuvsOX?Υ KjAZk9'eW'ؒZr8&1g`1 7 -ׂK+yK' $7.]cX)cǂ>Ŀ;\ۻ^k,@6-{E: {]AK%#.A#.bW8p4SxFELͺ{<<O?ҟz$_IW 0Hub+9SvsZ/Zt~\;\Cͅ'۝R7$mq;Dso?;]w7\8ʞ@(b2P{%=lGž(ɔxb>I }>P0^]pmkx-} {.|[!ubS"N>"f $oH,^ ma]O>fyb=~^.Q~IN)9Ldq*闤~9}{k v{Xg1vx-5@MS%_J/-no#EtQ_6㰋}uZ-Ϧh\ZP1^F]'>-)y\O{"K/كnrq)ԢApMx,>^ %ˮWsq/$)^YI/u]3]O gğKApX8>b*c!iOHo)G¹c.7G"/lE/m.MC .gKӤȽ6 ~E|'EpFj 2!H?#s8e/ Fz4M}Ɂro Vr읏Xm\IߕܸZpO\"nu=`C; yɼ|!>a_J-o_7@(}Cɋ7l 2d 1v v\%{>Ygܶž`8,\ġre= d"wv<>_4{5Β%aA{)Zqq;W뷇e^=yzj?nYz(ÌG{~rϣ?U=q`_ֱw_ÌyT)/e\oQ{^x_o8Ա_^< ؏Qc> >V>GҵOՔ͏o|*3|pp t4qQH)ל"+-{~){`g; fS-1֜~}Տ]Z~|د=b}l̷di--,b g)o2Ͽֵka~֋=˿0 gzrk{^xalf3GyI] 7V_Y?~Ct_y˭Voy67?X"o^ӵzsJc\KY\ i<z^d >^XvX%W4^rg_V^G^p8Xuju=ol?ڏܼRvWO0xk5}=A[-.ca<My9 kU8j<`3ƕW3{x:\¹?p~ !qvU~~y|#=gf c"9Ïx50Vx=OwC (Tq <+wb rO/pU$Me> ; _97<~U'Kpgxs0^`r#_o@2 ޟ?'p,W}^B(50νs=}.Ws`Geyo*|k3T8KRMߔ»x9|p>ŧ<|>1_x>^ yGyC}sFK FN.tƛ2>{߯ɯ0}_ F5ru*Oʹ,o~2B\Tf*>˕}`Oc ?da_0u/[a>gsEr7Y]&y!<|CM+x]gyRXT\/3CK-8F>@Vz\A*bx,q%/pWq<+ 3៲Ǻ#0+~ojxl==euQ ,_2r@bܟbTB/VzU|/㐷zZ֖Qc S> qg睏?j'xOY^s,߅,›.c cƢ)N` jN yk8PY_P Eo3 S,gOdd+y\x ܕaSH{ [׊ߟ0e[J*F_ʈS5x)>KS3\V|2.d9糒߆ߛ~Kep /X?A-|_* [!wvO|2\?~. *ХXQ7.x <Qo гe|=,>G|U<>~Gk;sYkIYU01{`hv0O-w9e6`{[3\RKg<<=|=^iKV Džg-_YORy=>|X,{@_ >,ޅFJ5CrV93VybVM Ġ eul0*>˾v օjV!`c߳P7>g`gc+_|&z<%eP=HӁ#?seOcYۙϊ˿43*ߓžEA\[B3L<Ǽ|)/!_zf9>G(lz·e}s~O|ݞy$;Ww"o`:Q5,>w8}c7)W:V|}CPWO\޼r2%_/{2nuPME_o}l}]֮{|C2F!*L'<׬[FrbYد?TKػ״2lVW"~].J-P|XD>|sy-`ټR9Ouxu|GVA6g*yӅol}ߛoz>b{?s=`_jje\}Pfy7fsɷgewh~>k g^ ]0ޫ?z? u?7~r,}2lhO+ll"|O}ps0k'={{?~x2} oA"|Yo=}rdC8/~֬G#!às[ 9ġߗxSoz/rSWY*W~>y1>W2Eԟ8G)ql~I]O/Oj ƚZX:5[/ũ^ף>,)_[Ocˣ GzXa`ߵ/zWۣ9ma%O~~9yqO+&)N3^L]Gy <+){kpgũ_T_Nk~rK[~o,,dG7}7/i?Z7~<^h#|kaNapHF=@/lj-yy8/yGԼTnɋmܞ~|oMb=Yy)u_lL>yo?ƝWV>ٽ+ӳ8r͏rܹ/ʍ_%C oɯo3ݙp"|odQ䈿Gi;b/yDH:Th=\ 3bθnJ. r3M6{˙ Ipp2˞wa4b9|$~`I܏ݥq.,]I'|6s&K-{E]y%8'\w=s1b.9^oAu߱\],bĪL$2Gvc*1~I&ؑV{Sɉ㤋Jb)$%KoC]M'y-q+="o]e4BI=f[3΁ !9/,qWmt';T/qOG]Css%OGq'ȸ #ϒޅ*XE)fUd>?S֗}I)Gy$Cj.&xc>'|+$f^.R/7L]de[鳦]J(͸QoE>Jꗬ,x.1J&\+$}`%&`-8s{\IﲯI(D \+eMZ 3Jr']wrNK i_OsO=17!Qx$]͑јmlu$.b㲕(X$w.=Ŭ9OxݏHk1|֒)xʽQ!萋N6ʴ܃JΜr1J<6VɅω/\܇R.֠ɖ};E`]ҫ:^-&9b'^$s]KUEL|oNu{6U X~ԙ ׭uGuQRr}E\pi< A/J#Sqګ~.yl1)Z==V7?Kqkfu{v9ϖ-z)\cR33FX+ֵdh+\Q2u:oYQ>^x`1g,|s1OŌsۦmZyrӾ:q,{,Y9~+ζ*Ύ^WK+*rXwUlYK7%{K>暕RG3[+|aZ^JWk+Լҹ3%_JվS o+oEͷz;z\uۛ½o/ZZ8Ja*.9[\~X8MaZ+?V)t|uKO+:V^9ƬTQ2ju}+5SkwZWCon|zeٻg̻>./{Vry.F:Vrn 9QRyAN8e[%h %Kmכ;,hJH>W$zulh<|}"&/7u^' 3iin06S)' /{<ߥhy?8vm9/K!*%gЊX^YNa.-u5nuޖn._wnm>,3xj^2N$?Y9]Ժk[6X>[x=:/|jmSw\{ mT޲xE[7I؟Igɱ6ZG;[SKV8օ׬!/un_ذ4*S:ky)blN|h59Vz~_olk%ڷc1D ?aV~#}eS^OgaZ^y8Ժ5Z[K0?mqDSHC\5?4 WMJ‚5¯qֿb\qϓ+| Cs1zM˟} Eymwe+_xѩ@Ǻ9TVf#~Yqa٬}kݣaIX RyZ4Oe&Ճh;47dwjM*'zlTOߔl ]7\O_Wn^O_qs+o0rߕV.h>+7?^?ὕotݢg..\r[r~λr4%4̊%5OUPrOQb}tzZuնXu,?˿M<*i}-??{caLۭqg}lL._]Ö/4::\7:fm9]::ϝd,8[q.3Rc5o|3_Y+LKدVj>Mx+/[5MDUʧVrX>@q+Z5_V|,u򲥃M-K^5T\%0d[~5ɚ{)̤cxsM gqreeu49ֹ!0V,Fz剼\r k ן kz kz[Y'šs=c/<l{>1y7V_ kl}5X8l)zW rxgP3;_ \\kan-v{o򮇇A槂<. ?j [lW>/Bl^ zAc!oz_s?[AAj`Ar\/."w?oa{aܞOt00p)཰ޫas'X[aRI˅Îp|x~5y̫x+-9ϭZÕ.FrG0_l>lz)_ ??zxkI{9q3^0+A s)w96{/ ;a.S;_z:b_ZGB\vżz5tx~ż}!"Oz ] v;;fع z#T8 Z: ;]̗φbbTlLj{B  s:A:{bu*k.և=.+ǃ߄^־!V{'}o v 9z2Tre'ו:*ǹ/o+.bCGh9B -xz&\{+}=̯{v^ma]AgG=}%pg£|~=v'>./ G-s<eq9y;\b}q1oYy2Šïda)ɠO=ز|}q1g A\ܻϯ/et-劋y|n <tj~{c^  >䶂ON?!\; :JN|#=>toWd?u6v-|3̗^xx>b{^=lz)bwGQ~Mϓ[s3e37]g Ճ[]ZX#$ya܅SA]]7n zR%7sA.>?cju=tN.ICgúo/.b/~}`Ko޵?>bNXkA꟏q@z0̽?lly?\{=.m`'q)y#+x8= c^r&bNX$o˻!vo tZ(>W\ ׎_s~pk9"pz^ 2^ ~;H?9 [Ag\_taSgӎǛ\w.ez=|uTpV8q;g\oׂ_΄XH%z k\q/x`wrX|/8{?Z{\x//[o}I{8w0žZUz<"_=z9B,Rпnۤ~y>0֯'RE{eo+{]֧]ܲM<l ;AvX juXgo/}|[ޚ+Yr(_([/kEfQ󭇥w?E5&[JbXGek{Oq׮X ;Ec|3=l!;mSza#O/yhʆ, W)GSǸ~)XZ9zo¾ŏ^1O,vʟVSk^+k!/S6+wD?IK^<ھT'.<ڤ gnyZX䚥hnW9TNt˻^WS6 Ru05>^|ﵞ5Κӏ<˷燐=);tO)5G񁅅1crL/? yrxhqcK6א~C%5czWZGG8GL/=k/Ӌ z +smdZ2Dzx=NJۘpj&ZgQ<97}5ʵ]޹o0gVLn`/$di,y|jas\޼=|K*~)iở˟zqy=k}_eR9,~|w!<}zj=c5m*) 8Ρx_.k;gGu@_mkŠZZIk:$WvQɰQRuNU\t/VuMq O+&,òUFƀ55K=蓂{Pa:3aHcSDŽIݲcӿ1da沶k2:b9ubXQ2Oa˒ %2A9jPc//^VgSmea&Oǰna:Nj,yyxҹҊ޻krXO'xi n[^Z,CiKWk][3R8dI4R{[z5沮j· SpϭmzH"HokL!<!-$_| -5FO#6[ѡLJFt8yHsޣ5Yc5Up0̱rݺić̱Auw9Sq戾CAr)_i =7  f97J?U&}`?0Ya+3@wbh :C$\ph]q\X^kɑi#Aιyxc?͌S 'kWS!Ig!yB\KsE'xxpg_܃\Dkːx2?%' CpPäLQ8wuNJ39ù@ly_t!uMߊ:tm}oE{wyy?<>/>#1a?L%gqpMqX{PWPįg8^y8 s:aeWߜ'%Fspo1ս!?ח J6ub}RԼW-Y:qesYELc{x+%:̵g2Fbs^XrPRc]s,c,S-Nq_9L{ _@!5Fp=6o]:sD8|/OXmԽ cs:DL݃I|26Y7ާּ<ýcy3E(%38w/I9&l}Kckz]>{'VW$MA=k$"[#}2Ǒ\ y{$12ox:p̹u.q@ug1\{ײp̹AcL::N]cx}F5<&= E'c}?ST4'?`@u·] ϯG7o[~eV>]s__5xOnO~vʧ߽w{_͏ܾyƝpqҍ;7>\^gݻ}go[YTzvu Wnܻ0vᕛONxbglvZ3Hմw( sX{ZW5/Ǻi54Y۩mL5EgY=Js)9Ϻc?YyMǫGc[,ό9 lc~yyS˳: s/,RP\q~xҾlӱIk 'VGX9Hִ|ipS}]F+[c!mƆ_yCT.ײjAsʩiY|?5ڡs*sokYV`+kuc٧kP^/V^jm&Xq|>9k)ܣ9A5g-Vϧc-sV,,S絔OR|;3OOw~unIqAϳp[:g>˿ѪIVƻUXSۡ,J&WJO=_[>zTܱbmJw+okp5C+[*:<c3guj>ֺ랢!Շ<ixʢ.}VzG*uuԥۚgACaJmCjn5-KI<,ڧ5bZےcc[ c -)Xc46`̷tMV2xak}54/ KV[šTH`٪}jmSblIaʉ,ޥl|hOZT+)]Dc>O?+?pnÊ5ϲ#,yy9s*ǦpcTc@4i~bT*<ҲQձcVV=ҜǷrV ySONN[7dxq}ʇy|I0/7Y\ֲ)շ~hª9͛~5W%3|% 0VWKI+oZ'nֺV_JˋR| #VγֵjjZw+Xu0])XLj~g.Y>j #W-ËGK;/ tv/;Cgokw/ݸwv;~קn;7;}޹;x|s{s_ a7p:c^8=yƹ#qٹ? -}sqnw:a8؋/ -漅cdwg/^snCqns`ΗE;Ss>3[!םy:܄gsƟ:0cE\[>97;\;9vnۘudrtː {g?X U59!aY6]Gтq/:wO@ϟn N?k 4/x t;8_ :@un_9o:" dLC_]B`qs_ X{0d#Ɠ?Y?17wy g!g~?13~8C#C/b bY6>db?xν p)w[Op#S?<@{8?CN-`IAu6غc xa 0||_.`hC/|21n|.9{ }Aex7^þ Ώ=`o~35 2oD Xzc: ɍx؅s,"po:w} rr1~O#^Ð[GL+Wa+p7x'ː9mF,|>OqnƼ원~~(d¾p=09SV{1w? =Ⱥs쯆Mg!}o[ỡYqٌ#?,!olGȃɯLmk VO^9{vmB.N= ۡ?Uߕqok={fXA6¾ ,|y~ðgq{1D/ƶa<CC@*J?~CXocS߅|:5p|zְF%{ߌ)멿ᗋ>S87+1 p/b}nh\A.[Z={ܙsɓ vBԘ]yp0"m{;djl ~>GmAzI4pgc~{K[% w'lCΞF̆/Lp1B|̓|mO!vdB_SO >$l\F,G^5baOa>p5>|<}v#sNv+AqUGQ_rvf86?a)998ނq/T|9N/~O@? ,7~$<~(˵ E9<67|17ga(πC#̢m>g,,‡/"M^$9 s$1]Pl:c- nocg |>##>_#ðgezo 7lB 8\05,ԫ*}4nQ#4!^9>ca<Dw7ܿQ_{AI FnDl}qgaԨѣQ21lg=NzAFΛƜǍ7N`0j0:bq[br\ޘj[ szoڃ:17p6 lB=ُyOV_!̛O# q瞀[yԿm6r ڎrvwϷC jbjD<6 Meaz5eB & `칯@MFϼPVkFC,g7'=ѯ2eq^&G'A|;Y߷=3!h֟UPay'<)rb?g?0={?،>rK67|.`=Ө3(x 8?ܺ1@ Νx[C]Y nC~1;>^g9f 薬D WM\ ÆI-O]Y0h koD~&;ܰu@}4Fe=\y~? , GPk0~e==ǀ o!7G͔ϖՒ[Yg#o lBޜAFk!k2@ 1nO!?6!>%W7|qyP9s} ua>(#WQGyࣁx OަKXB^Bvyv aq|l<E,uqh),*3?M xތ7noA,!WEµ"0;kYp\;;i&1f|<6 CFNFmf ~@֑?1gyiFP2lmih8Z@]:x͡oE/9mAma =x>x }uo CIafLvFͨ\jȩ:e6[ MsYbw@7[>d=Cs1ՑSطmu_F 8F)Cz?8snval?gvVwx ԟ rl2 cFE>{x?l1 7 bƶaԁi_O`t|eYb:>O mo}πqey| ߗ'z}Ӱkۆ(vWd{!䔽$&Og {^^gvOe2<=2B Cd8B=qie8C&Y`n:ankYg}nv<5ii[|~CI?y? cػM_cطn# 8?kWA,Gqm3i) p2U{4_qulD_8fdN܃MOeǍ8{di_B Xw3QȞSwwx\3fQCFcaysytk&f=μsYVGm's?RV&o9ixn mo7>Z5-!`fz=k`+?YpfLJ_LjpX.{</|PCo4|/v^D`4o]#(!c?o_C<:<ϲ 0:5K瀣)̯}mv&a49^lؘq/}'ԗs/3l6_A+AQYsh/擃Y> NA,`~e؃}Vpp['Cu 59n?;7o;B֟q~Y{ˈ b8?eUIׇpge|_07YسMC0jeḾk;6ίMFt#}NAG#}7@σ(z79:U/7|-?_N!{:Dp )er&x1|}+ٽ%`m3Ws=mm}klrAwXE?> Wf]gW^.6\8$/6!c yX<?_C{<8~ ~ 5'G<> lV?^C|^XvGD= z1Ngv®no'KG!)5c/wy6t{5jQKb|҆Ns? 7x9d ~ _A yxO ;<>洁8nB>p}bW]mCwٌy0Ο Ɠo >Mza&-'g#`/ ~|{߆:*t MO1u'їԁz7#`U=o0|8 ~n7A~Z_/'' Y>|yqk # >B| \7wg=U`$tYg `M8;#<L}gf䖧}|c{6>X}\ub9`2b -fȱPa{Wf!ż&ֺ̞4YĜ经f{?>7#o!G'naI4w|oΞFmZ@n|94j.tdˋ Sٽo,I\ۃ6ch+~cE~=^F~=z"Mn?xvSv:pCncU%+_/SrtN' ǰx1%Խd?t{=~*pt.X5`|{ B{y|^-Wݖt}6yϛj59ya ]ߙ{:ѻ^Fuefi̿]E#uIM+7wPva:,=^gygOwom|~ko3(x5b53=g!7P'F<ȏǀUƫq[׭{G}MMsu 9 v@S6yzyN>A߃/ ~8/VN䃣ߚ{> N!I6j>7e6q8<~9-YԺZ` {?g}58}~`y~>i7&r(dQ'bw/!-tq\ދod{ak&`Z!U-m`G?e< ?Ĝ Qp1^D;-빗[_gS}/^F]9!Ƴ?k9i0c_B^{ב. ]G{akB9m-w kgWg"c[*_瀓iԇ)gsYeV.Q'>;`^d ufu O~}svk7l8qk!ׁ7Q=9{kwX=7zomA5aMx/b{{;Z un~Ka A/'7zd.cػ]}aSlgss-ps| Q _7{ ALa)ḍZS/#A/ %`c>x!Ԃc~8[By چb@*4^7G_޿<>kg;?F̮#O9>} cۿ޻/7PO_G~|gdybkafp7g:6ݎ<\v|g1w־k%QbS}٬NCX@}جc.Eyێ|:7`6P+ѿ ONmO="jϭ=ɟs>_ {ѧWmyca >|>88z9 =5j=|$f=#S&=0N7ԟn7>ds!6o3)yC|wǠ$rέ}W!'8*Wv{{}\@)}.j2Y4n,ަ34jy;T3ȱ+} u}k~ u qϠ޽ ̯:l؉\:-pCt3d@C1p/,m;vAQԖMye#\_@v}eԒǼYpsބ/8.]A~Gy Գ//7㨛´W/f[ȗ)g p}|<1G ً3|9 yν,]q lߣFL O'{X> <\Vu~x@o]gp&#B\'Y3,̟>gqy =Q=Lx>D9d ǰߝI}ȿ'Q.`}v!Cr{kD-3/!Gnxcwv&jp4Ɯ ]ClwδCW0S/6#?B.O1ɢ=`$!d+8]}{/{콄%{g+^ m~^A{-w}3$Ͱ{v."G!/8+u}'͈8 5|\[u}_y 7ـoξRƿ ȃO ?,{a7mc~z#>y9f3LO^3 ߇[lx<,ppg{KF!Nw=n|q ~Y/N?an 'Crg BgYy B|K3."M'قjeg+"z!pg ==y|\; MB?>0$|@ּx96^/#P>?@9{+`bwSZs=gøJƅ27 'b+,7b <7څk'*t쟋TYV)< 1lX\y~]چuuWN>a_[Æ.~}+˹ hz2t߇{D!+sƖ\E_r $WA%s8W!oėCd,CdY"NVGp7+es$S1s|瘲{Um=ąb؈!;B 9pyN΋|]T~ r,]`s:ʜA珒C"L',k}]Ďqz nre>.Xa^ >D5>(VVk87G9X55JbR˹_#1<+!y<>$P֖?SIe>< ]ɲ8&Xy`I=#Eƀ_b ˑ$k16,_sEWuXgHZX!dawMlNN=,x]˽֐{0[k0O5z(5/W9Wq\-:;'O`R叱ϺiW"S|0qzq-:1SY_@e=^_ X;yqc99:/k1/\S܃.:#}#y]k+Ӽw9e܃9b|u!`Pg=c ~>^A%+_ȱDW8-/G׵u[Wy:Snt5{sZ7gʮ;?+wTHGdoW_2'kƖ^Ajs|EaˆK7J>&rڠy׈8H K)XKI|1캹)6|o 1|1眭I\+J-y~Q;w!u|.]%}>? /vOY9#,KF1赎V3=uR߼d~0k}XWX_olra˿)ޤِ0W,.zd;v|Sv闯y8z8䁼ۯ^qǾ^/~i='fb_Oz<ȳ|+gOz_[vz\ys=gM?yy>LO5{ZخpT[ψQ<\]<]sx̻췼_ 6tcF,97E0,˹?t\dzA0Nd2w>̛9E◩ 7p~ A0VƋƂd|8FD1̑uxɳf#l~C̒-YGO&-adz`Hͱh$][ ٓ0o|s']8&ѵ9:'?DwM9J㧃\+F1glqu]|ux֘ >&úӴUlsݜd/݃H}es/Mp5a2<?i׭N':sm+ܒȚgŌ+\9i5麱5HHb DGۺ> 81~giYzcZ3"> $C0!~5%wHc~HyWqELZ.!Rcƹn2H><΄XbƹhCα$/Һl+Y[+k΅q".y3$Nd2^F]ܯ, "E|p]wr\'6pJnwG{=<*^{>uqͲKɱR??yed=J[o~1^O?GWq~=J_j/Eo\8G|>qźчeQ觇ڣ*o G_LOG`acyم_ݻ𹯸+w?wnݾ|"}[+VDDgNE䞯u>'uYӹ֑`)qs֣h/E_CjZq}0kk׬ cY[~1ck}t/n|1־,:۾\c^%c)i9=NcY'g/:Xk܃8|g 9q ZG1 ZֳҶu+WiZ4t+z[r~*Y|zX:Y5?,?kLkjLk->g}]قZKۤei;zǘ9/9F4tPl%'<6+kj;ueҩ'~? 7{T~6:xjkɐco}cqo\cJcZ !k^*'Ʊ [}E,{^CgaŒXyEs[*+Z[ces,nYܫ1ndkh':߳4ά5%g됖gaox\qfT+Gjq:VdӾlјu8w7,i,XvYY9;O(S`]cKZ]X}M}R!SeX57S>tԜ8l u^T2y=[?$*?Ï+x}o;JNHu,f8Wȹ.\1gNꏩ_O™^z"k,[K#7gbn^}PL῔=V~|cB9%2b(Yzּ͔?nCԏo45^z}ƿScS6O11VSkTO]^Mdz<ˋ)'\˳dZ~r>cGj]e^[dv8gcZA/_`ߒgCvXkmmcٚ•sZCQisKeKv*^!>tdXk_3[뺕'[9곬YcYCyb?/S'O^L-[ӺI܃yשz=~f]ƱZO۳O*OcmgR5Z'5אTia~^.9~K^[z]k޳i{R\OՃnVLRX8r<> kkrFm~\qցuvXk~nozTղy~WzmBO̊%IsySsJe~%k^[Qr^nOֶHG\sr p>@\%xY9?c3̈́|}ş 7k azL?95şimt uzy\S.H'? v9.Q9,km h.SvbFӠ?cSr?Zcdδ C̑.`.'`q o7-Yco^?"C| 3K?0Z=. ?adx n^ 3M"{E,y;\_ kmtvS.bZ \>89[h\~mɐ)yE 3#.bo*NW#Ao_\Wvg3Zcam}1%?&%yb/s\0"8姽I.r'H|*9^rPeqc|ޝ&lsޖy\dO.QNź+1+!VRe][֌|.y]zUi`kD"9u`,;ivC'Xk $_s_*vϾk3!:JNPj\X9?oA'b^] !Fo$?V\A]0tZ f6 gkyOZ 駷O]HXdr'|K(ᶆQoəI}Ma j8K/0Ds'\Ğ]j+zeɱM4_.֙E+R'INa%Cz w&P~zjYs\O[])dɹ팋W|+u`IzAF1%=gy s.u{mknw"/{3*<I.^$sU|C&8G,x/qq(uRYk i-( K.%\bo.c'J~K^N_[zNg?}L;ɩܓҚ"K;Lvf]e ǹM80b `&uKjAѴ?W#yP?DRqײ}EzcG\ĬHs/S|*j˒\w@|~*.9z]P:ϕk9|WP*JW+ϢIlXZ :zZ:&hƞsb[Ʋ A3'>W!y+qPR`Zg#}XqXh_TT$S2ʴ0e )E5:3 7:e- ˟KƭrSjJJ~bOlMs -+?.=GKLZ:n''wMZo+ñbPpXy΁,CqY9⦅A^csw*Gs̘VS8?/RxgyC:r)yCG8i14Wm[_܃9an0%k9ם/oO,_2=NR;8:&[:1'y_}eq܃rEuƛ#x/A,]9Z =^d/i}-t}LĂ5ViReX&C8%'g=h`9z:c56xZ\U_j1bk#4%u>w}3w8'Z9?>W63دy1rA>Zk˜O>Y~?S;l}lc uOG+sKaMޛY~L[`}g'V6--?)hl8YϻG'\bc{0?kro=xgkz6=Tzt'^/4f$^ЊuuQ{ʳ߱ys$[xrwo#;DV%{/Y2ߺgU9~hom? yXNXVќpm闪,'ֱ'_kًoR-mjL?sя?󵅡~ja^x>@V`{~uV.ΛMO-)y,bUmݏC䦔/R1vjգX5cvryBl_?ɟ^6´))n^~IaOJ]jlpoً_yM?>;sV=Ki|/)baٛ.u‚㔝o~uh@/{)R~c5?gz<-zz[5?,dkq'O^ٯ|֏'ϩ^nXsv 8wiA?+7K^qJ'-<3^~y)V.y:j_uH09>UuE^Ltg[{ʉz.?;~X8W_'WN|z=/˷PЈC=Sq[I!f|;ƌ|Jʌo݃ӟP.3#U'>G)SS.kL닎!;!q%/7Yqݨ)-߸( yM%IL[,/ |N#Uh\d\A\7 .~Mv!\v~'P~q;-y_oȷۿcC~Boӈ\Yc!ጋL*].Wd}ggtT]8[/$+_xMvk 6Hd6lX[YvWA*X"B&o_5%Nw'qdssK_\N/%s10G8os@s3 j!>⫢߲8͑95!y&׵_v}C[C?^ kh;]ċ( bNuo[߄_Zfq-]x%9z}d/"8t1/p3Z9ދY5KfȯaOyi_oK!uM|*~"dnr'[|}urb[;"Gt1H/J.˴Sx A8ⲇyO?d+yߑΒd-/"v/-s{[ YG{2ɐQ+L8c-69pP*kQ$ӍnH_"5I~]»yҙk̤=';G b f\CjȘ'q+\yB%=rM?.sWp*Y9Qz T9Ks'ȓ{y#-Yy\-stݸ'8$չXDm*?/֑Jt,ؑG.N1qb1.`Y[]J4kZ\_[DgMn:{^̑r$z8/r,'R[{4E\iܐn_8Zmq"O3l/Hx"yk^kJ)=)aHe~2yo&kH y@lиs1' XoyoLʹn_oóxEt~{oC0Wtbtx^7؏so1dcX52wD{o#Rrt?JY:/vO/9nƹ;I2 KYށ랅+Y`L^`?Th̠^cyGвXůy/ۥsu󥵏s aHN^G2FW\^c_{J D'/lj}%j^N=*ϕzh=s5IGaLcPƷS2&i>>Csy݂:܇r3b{1{?_NJ.,ɼ%:\lžtP"R/D||Y^΍roLYK_\w%rMx3ב~AnYbMr'ǞBb},pޑXH/ٷ ~A|)˵I!]xcC|ɯyu{0dȟȩ)s{2хX==jG9oJrdZ 8b8[{3k~/x=Tnus!r s!<+9pZ=!wT8=Ҽw+y4c}e~Cyu'fܱoyƍD?Ÿ%㹬g8/Sx O~c_-t8)cb#˓\Y|_ekZ5֥z$]䏪9}++ mKz=c45Xb>ƸY.{9Χrơ|c"Ux+K-n|Ec?ư`2G}Ms:Aǘ>%=:s~xXry?Ob _ڿh_j:h?z_y\ж *JF\u@6uɽ@" ro`9sV=s}Nk\wߪ1:9ysΕ{:օ9s|S9Q:-IOSVOׂ ZNӸ1V ֶ^^ ]={~[Zw7F=e3܃9Tqy>n ,Y9$dž7uO5ֺ_ıҼ}_uǐ{'H{:+Ἧy&>R~S*y5sX=rF&y$SS /~\g&^{E5wX=skKL99q϶?yt`? '6Ls'EO})}0\|Dw?s=xr%L||۹݃qֳ\.]|ǽ|ud߳_,J:{qʽx 2WX$kZ 7~6sƵL䌨9Y l|&Nbym\sk8Z_̱>>urSREfucJ_,kYx+?FcLHg xȒϜ^GA, 76%)cH8|=k1ڟU\w,'s{3|Iq9@|Ο;Es,?sR:pcZkބsݜ+XyIӟ |sPgw\:vJq". ++IcRjы{Aqu8w&YuMp+m8pA9E=O -\7޾!=嘱\~҇ t}`ߓҟMq_lz=q5PfZRcM0bˀƮgr{u-@3l̹Rr&$r ]#d/uylڔ:w++r ~#ctN~=y_ku :ǟ}*5Z{''nHgCǣ|8cB^K`\G˿%)}>\]#יwztto ,a_J,E&w}9U}]r-🎛uX+::F,k^P-e7^f85D{iy3wGޟsGL 派7Qt Cg˿k{`O92x΍OV>uY𷇓g>\ ܿwN;+~_XXq/^q_捜]{?Ys?<{7\yO n}vv+O/_OOIϭܹ#^>.}/ +^\f ˤ 4t.+oӵw/s36N"/~ə|Ɲ+q~}Vz{mK+nZ{_|w} ɷ|.,^Yxq~t⓯ܿ{s?OSޅ9^Ⱥr5 ߾^O/*.`4*Uόz3ݻ W^ ~7nOt΍n߻G ȕ rP4%pP\>vvT];׎kG͵(#G555555yњښښښښښ~5y(eMV5)5]k5555ki\\\\\\ZZZZZZӪU+[&_{Mr{Mr{Mr{Mr{M^gMJgMJgmngmngmngMH)-?a5a=6a3(l9 [ӖiZljVJV:ըN5ʭFa(%ԢfY-ʭEjQX- GQn=ʭG(%qFۈrQn#mD(A¢(6fیrQn3یr[Qn+mE (#ԊZQVێf(ı8Cc0p5yQF2T# H%Qr%D^T#/ȀZaPF Z-PEuj4-z'¾a_FW#zT22P"Ȁjd@FT#Ȁjd@5¾^XFW#֫N}5¾^mijx jh[' FT;QB}ClE"k"kZ}-¾v<.PPdE2b9E"j"kXjȀZ,HZ,ȋZ-jD-RlE"[j-ȋZE-PyQrP E"k1"jͨN$C-֊"jkk1ѶHZH-֎E"Ej"v\"֋Z$N-SKG-ҩTzQĩG#q8Hz$N=QHz$N=RE=z$N%DcG:cAG:#NHzP=rSĩG#q8Hz$N=֖z-ȡzP=֖zS=ҩTtG:cGfcũNJSeV|G#ѪG#MZ"ZKR=z$d=KR=rYܬGn#!두Xꑛzf=rY,G# 둅zda=!k7bQkDn6b}kDB6" Fda#f|5bjDn6"7kDn6"7Ff#rوUوlDn6bkD6"MF,THF[#QiQȷF[#2ՈjDf5"NȡFP#r9hY$N#ӈiD4"[-X8HFdK#҈liĚՈliD4"["IXӬЌdhF24cjF^4cjF^4#$5#/ȋfE3ЌdhF24# Hf$C3¾aߌoF7c[׌Xo:ԌŧЌ hӌ hF4#Xf4#F{f-Hf-ȋfE3yьdhF24#͈fz3+C3bތMY3ތoƼތPnE("[ʭVr+VܖboE("~[V뭈Vo+ۊmE$խVDu+&Vx+܊}T+Q݊yQ݊PnEboEbnŴ݊HmE"R[Vd+:Z1W"R[iExb*nEx"<[1$!AۊHmEb*nSq+ێm=C;ێ܎YQݎ Qݎ QݎnGTNގPnG(#1#vDu;B܎PnǾv3#vLvz;fvLظ#ڑ1#/1coG^#1mcn)ҎhƥyюJ;fvH;Ry;Ў]|;Ў{;%ȋvE;yщy)҉Dt"E:; HN$C'2N}'&Nd@'2Љ Dt";띈Nz'bNx'&Nz'މ މDTw";՝Nr'NDu'fNx'S|'܉[NDu'Q݉پ[Nr'B'&Nx'&Nz'(;1w";띈NE|| J5:qtܢcFLk-Z˴2Lk-Z˴g1[%UY%UY%UY%5Fk֪Z5ZFkhUV֪ZuZNki:UVjZ Mۤ6INǓMҡErZ$E:l-&m&$Mdot>ҧsn=pLU:qtܢ6ZĻe2qmL\[&-זkrLkwĻe2.c'ĵe2kLZ&N-ąeB|Z ZArċeE|&E܉o8"ė?;2bL\X&.1CcC|Y&TUExQ%^T6UUEjPPT%W eCz*J*aJΫTS*Zc9a>mvBjJjJjJR:R%TJNNܩ~N5NS_ė:N5NVR'ԉ/uKR'ԉ/uKR:R'ԉ;u#uQxTN5N5N\ĩ:qNĩ:qAjĩՎՎAjОAuAjP 5; N 4.4 ExѠA\hk5  Ex ^4 EjJ՗qA=XNN ~5S ; ; WsIgK 4h 4/ H8 4# Ex ^4jP jP j_ėA|i_}$4;MKQ$465GMMMMMW$~5f5jRjRjĻ&լ&qIlRjRjRj7fnT&qI|l&I|l&Il&qIlTTך&qI5I5I5I--c">-cآ">--f">-`8آ"!ZĵqE\kQlZĻE\kZ/mZ&kmZc&~Sm}mWj_&SmT]զ~mW8զz&GmQx&GmqmQx&;memN&/mKҦ&/mKҦZ&;mNjYx&;m_mQӦ&Gm9ģ6mNӦ&t#B!wE!w|p!lw<֥ѡ!w|0!w;TG:uh/!^tEqCP_:đ՗C\P/:N璞C;TG:ąՔCPM/:ċC\:TS:TS:ąqC5C{r5N5GUC>π6eS3T UP3T Ui/ ed'EƕtD+κttc@)̉o^#ḫA,C qX @/Z Z %o2P7^ɳ '$tL-¹Q@>6`]Kkny,cuq\’$7z$$R%ТZbrVTO@ pdh5,(d1覚4e0ng9 8YqG6*B*RiB[wq*Դ!+lIl"((, ¢*"0(0# 0  0(, "000ˆ"(*"""""Š(,_;̪RtRI|Pߊa({f8}lÒBM bgDL5`SDz4F;aqUFU(#  otEF.,1 T*K08;WG(%C4T!L#[ !@~@ lY BD \ ͖ ;5T,L6} eqZI  DDx%7H v &$;# pѣd9j9,^g49m *pW.Ʊ^`24*>BI3tSOH1%!2毒%d2'SĞ8b[dhO[RM`8U|Z'LJFM M駧xu8lٺ:SFww|)l|57T1Q252iigfݜ-Oj~}3]z{65AƠĚbe`ODf p%C (A#@FP!vQ*N!",`-zE 0)M>!+ZB)"KjcRUV:r@\jv;Η[ڴi S+FI4$ө U1%UJ`mi"+1lZBՐ(ք@CcrWXE*Tf@lOiF=sA8ǐ mX/} BJFzADD֒!~ T~LCz8w]BBtrsample/data/datalist0000644000176200001440000000010113327471215014353 0ustar liggesusersattrition: attrition two_class_dat: two_class_dat drinks: drinks rsample/data/two_class_dat.RData0000644000176200001440000002730413323650050016374 0ustar liggesusers{ 8cRYZT*9RVR!d EdI$dVBEi%QD$$2cfe<}{zz޿}99߹nMEL!Hd)$G$P‚Oq'gǬ8:6=D ' cC W[rFbuo`bH$E*>ΏusQ{rŻ><'Lw'xLB:zV?\ҷeOMjhp2R[S9߸6'"^Fg:VܿwVJLXf `D_JWDdΨ[VF֜NJ]APPJN`BOŕ0 rL%ӾІvDzn{ &|޹<^9( J~?r7$9qk>vUi,O%Q&Qd0_SCAܬ E?Xj%jNނs/WB#(OҲjfwW}A!oNIob!M[p`I_8Ly)v )qW<fo74⫨gK|q? LB~Gnn7_3X4 x=="KdDx IYwG|AnB{H J \ XVp;Z0Գ$~z;OA] bPXmEvޓFG46_EJy]H{:k0#Z4GW~J%>*6!&/+ Z堘nՒ0yܜک`:Oޅ$-G \1PXP+0T}C kwka`^Zq Q6iHo #/ a]9ԞA/ʑf(,"(YɿDj|Q_ׁfknUsf:(0)_O5Zt' ɟ#*}V`O*\âU_B?GTxk\ޜ J f{Zb\2@gϧ&HYA }wδsFMDhuyA9PΒ[NeWo- Eeq1c$=J,+2[KqoBb3J3PCۡ(5syZ(zc8/CKל\H$ /y)k.@Ac J6-Z/X06#<>:y"0,{4G,vJ޾D 'W^9̪RJ~M;)Ⅴw&,(Ҡ]W{kMMH,[9JͧF zHh\6ݻJB+$h(2{&U'LSˏ'JSpwfnJ\Cٞz(t QEnF"}Ossj?ÐvsS!rQE6vc7UFTFzGo.ȋ)y_"1nn"-]d^ƄS˵ˠIF/ o\ ߚLgAwae150FFb]x?g(Ze/ĶN1`}Q샮Ovo:4ӭB QI b|&a@ nKE7t|mZ6%H`30[w&CǣNK>d#ӆ$oGt=Hi` My/j6,CIw[ O2fM^8Ϳw%.8? S6Fa,Sگ s_EJs[OC(śSsI $e"*>L$_&^,~s`xQn(~i}Y$lnC >)-RHrXw /QB5z#3HhԺ,-ln6H|HEZIX? +Eak&Slc}Yaxof3j]f[2rR[ 8dɹs_A@o.׸ᛞH 2H͔rܼ,gx0/p 7&^Uk8O%b̋[I{Y| OgФ ?ĉ(~">BZ'N'w|$s Aa'z1#hbpIyB0ZSIQ؊"=QI2P!"y]%0в˻+XuB0XJ uN~-H[uz1|t `nB懿Cb/f: |Lksf]aSݕHz ?i~| `YяƪwN ǎ@%{JnlS-f k>,hcG3Rm taKb2,z8 MTzugM qbQ]Cp< s B(6SYN: L=DIk Vϐfu$P)hSuwyyiN-kk#[LCi")3o\O (&z&wJQuSt5Kw@?:Ɇܶq!W>JVݓZ2/F%vՉjYYYٓ}]~lvBBA[K~rl0Qek>ЎD˧`0ߏ` ;6$Kľ#}KoE\GX޲xKZGmEzpuX2QBsFT,j8ܿ\ -utIDJ.]YܔvHv118\4*! ̃W\ngܜ.]{|-P@{jHoOZX2>e|BK/hMOsYvvQ]O训ZQ I_OAu-lUy'6}[]ez`$yuH78gJ9AS!w 03^}b*f^\d:P`jC)0y$iYС½N<l߷h}% , !`bSן^|κ)=R ÓLY[6fkA̖mC!0f5`7\IO''3f\*Jj;FSOt^kma~Ë.YP_BO€b$nq.sZ 7ѓ!r]+= y]/-ˎ\w;γ~$#!gÏȩ =4[cu_D4}<]=7y{s"ʴCf-M)ߒXL"}X3= X˅F-@(-ggw?lJB^^PK(]J~8ߢ^V4L[| .=";@y^YaĤWWT;!oAL{oA"d*T6LSˀYn㖰1ӻcHHO9WmrD4yg/u(W̷nrE(2Msip7x pK|, S"ﵬF9g)xZg_NW*9#˃5hUuDwd_"Л呟,~eI7F5DCvEtN8 9i\$;|ȶTЇef ?:BқlכtSֽ:::n;f0{oU~6| $ULLk >aUmg`8i5Jnܤfaz䀧^qk߄͚bOȫMDͻ ڟ+MʟW:Tv.# BѷQ"WDSk'ک" 75"{Zه' VAeOu(?YjK$Ed_lEH|oWxbFn;O;!pҤ%O[*CႾ{HZ)K%R m#2Ã.7Yv[mBRH28l]Ke&E,[rsTs2za8{b}+`@')^lhzꗥ0ܬxj D"/<_ ~jLMIԸcE`4MwyzMZ?Bб:c4:l}^ߛ %.н?pg?`}D> Z-_$mc8 6\cE[^t)W.nFC ( ym;eC,;?#醗\lV#k/#\G8' +uN|4I?ƕ5iW u!$ï_9[J/eC[ryչ:hS{ƒќo-YH^'0[Z|HKK3ük`:r>opVD` "Bqz߲lY8 Z)d `:; S^lnwZ(F/w*ޅ\$܄}u|7_Go5CYObrDFr(Rھf+#ep`r9|ٝ?( I%Fshe/z_t5%}nIe'H)$x^0JP:M=}#R̮&&ì4RAϑJOu4~>_ X C~H\j`ջh`ԍ׺3G=hJ)a9n`(;fj^Rg~xBwN\Rg>^& ,$cbŚd볜Mn]Љ8(9$)iv#^ =vZ:HzNAuCi{{)VP8iw=/۟׸j-LrzXB]LHI>RĐ쥆?X~A`穪ICWPoiXQ$=7iܢY H?b_ o>ePZ$'"V00ߕwۗ7V/`pѥ)KS:7˳ҿxBG" >g,aDc ,O)"`W# ?K$̰̀|;L_t:8n.nY}T3k]5|z3rIZL̇Fym?#aH8ݔ<lз픅 itGj6Y: >Nzߞ ?l.'"֧J-0:U,2?Fv_>>IN ";W-}ufa*Iz7 ng}ӚapB=3qS[p3{:#CFY}uSNGi7kb{zu`&y04CO-y%-(;]H|eҌE+x 1o=aKUq:V?]≟xHh ?c<l J*ظ}ATz=t!pCw?3h˾qA]:? fw}2n+u^vYl_I.zrew]7:|;Mo CӡӟVeTAk_^[EaozjH2p6@p ^Zl#wYeGd /YtZ8WO]sIWAU}5$~nS3NBu |]5>yX0^$#iwHr?%Ir*aieAd`SY !fHl3ݿ, =wxns^k 9tyC"HnlԽhw=/zCqz 0~{Չg<}tǎPg~rLfd|`zϟSb=r=Rw .؜ev!5i<'UZg,mHQDAnӾg/zd3{P:N:mX)&ofsJ*\_"D.\h䤡S/]]J$Q'?;x6VyvY)Qf&̃_IiCw7Yz5)֥nu{]-3zWyO\#MkJ C=YLA]=ܾdxXnp7)<THgQݖҧ.Oo-+[k!Wq1x񷯁WBzjƱz۞]tE??hk^F|%JVAr@]A=|uO &{%%);. B+|}捲&$-| \>B0]K; =7Hi^/ }'|˛:3EVg`$9"d|)EƇL@o EW~|N/4Ysݜ".1V߶-"\M0ѣ57Z2F-:4*TK*apt5Uj4|g>l\oV[R3]Y˳8HJp zoԵ)v$A2ٙ H}@\ew , ʭ}C =@ - L *T%wf5Lj9v ]HiR.}9>L4"qcn@dYrJofEhL~fc NN5cv%ye|FfJyuCwUŪS>PܭWn|jwv]IO=>{ KПp./AҬTTuI HLuV2VQܔ}b׳Faꔤ ˍh2'Ie;>oO_ă|=iK`G|E^,-u:(`DIh4$z:n훪P>>۷,{hfCurXߏq>Srs?c02㫬X^ѓ=5x~F.+DzԠpVgwuyAO !# >C~l_=)18_?ɍmM8B@rȢ,g,o\D%0$O|ܑ;`:l[x I'%<à/g; LmH4IrV-e#I9NyϞYs8F̚ L}w z+ˡ)>ewLd?~׋u<1~Ajsqd߭wݳ~gN3_w_=lӵ..~'UNj/+]N*W߿ߍ;^͵;87~+W.~~7~'~߉_rW'x]?9ӯ;yWx'x>_?ɡ˿ZwOc;Un~~9;"r:j}+S#NGT}Jc↱c47M-?rsample/R/0000755000176200001440000000000013512177703012124 5ustar liggesusersrsample/R/aaa.R0000644000176200001440000000064213323650050012762 0ustar liggesusers.onLoad <- function(libname, pkgname) { if (requireNamespace("dplyr", quietly = TRUE)) { register_s3_method("dplyr", "arrange", "rset") register_s3_method("dplyr", "filter", "rset") register_s3_method("dplyr", "mutate", "rset") register_s3_method("dplyr", "rename", "rset") register_s3_method("dplyr", "select", "rset") register_s3_method("dplyr", "slice", "rset") } invisible() } rsample/R/lables.R0000644000176200001440000001415513507237102013511 0ustar liggesusers#' Find Labels from rset Object #' #' Produce a vector of resampling labels (e.g. "Fold1") from #' an `rset` object. Currently, `nested_cv` #' is not supported. #' #' @param object An `rset` object #' @param make_factor A logical for whether the results should be #' character or a factor. #' @param ... Not currently used. #' @return A single character or factor vector. #' @export #' @examples #' labels(vfold_cv(mtcars)) labels.rset <- function(object, make_factor = FALSE, ...) { if (inherits(object, "nested_cv")) stop("`labels` not implemented for nested resampling", call. = FALSE) if (make_factor) as.factor(object$id) else as.character(object$id) } #' @rdname labels.rset #' @export labels.vfold_cv <- function(object, make_factor = FALSE, ...) { if (inherits(object, "nested_cv")) stop("`labels` not implemented for nested resampling", call. = FALSE) is_repeated <- attr(object, "repeats") > 1 if (is_repeated) { out <- as.character(paste(object$id, object$id2, sep = ".")) } else out <- as.character(object$id) if (make_factor) out <- as.factor(out) out } #' Find Labels from rsplit Object #' #' Produce a tibble of identification variables so that single #' splits can be linked to a particular resample. #' #' @param object An `rsplit` object #' @param ... Not currently used. #' @return A tibble. #' @seealso add_resample_id #' @export #' @examples #' cv_splits <- vfold_cv(mtcars) #' labels(cv_splits$splits[[1]]) labels.rsplit <- function(object, ...) { out <- if ("id" %in% names(object)) object$id else tibble() out } ## The `pretty` methods below are good for when you need to ## textually describe the resampling procedure. Note that they ## can have more than one element (in the case of nesting) #' Short Decriptions of rsets #' #' Produce a chracter vector of describing the resampling method. #' #' @param x An `rset` object #' @param ... Not currently used. #' @return A character vector. #' @exportMethod pretty.vfold_cv #' @export pretty.vfold_cv #' @export #' @method pretty vfold_cv #' @keywords internal pretty.vfold_cv <- function(x, ...) { details <- attributes(x) res <- paste0(details$v, "-fold cross-validation") if (details$repeats > 1) res <- paste(res, "repeated", details$repeats, "times") if (details$strata) res <- paste(res, "using stratification") res } #' @exportMethod pretty.loo_cv #' @export pretty.loo_cv #' @export #' @method pretty loo_cv #' @rdname pretty.vfold_cv pretty.loo_cv <- function(x, ...) "Leave-one-out cross-validation" #' @exportMethod pretty.apparent #' @export pretty.apparent #' @export #' @method pretty apparent #' @rdname pretty.vfold_cv pretty.apparent <- function(x, ...) "Apparent sampling" #' @exportMethod pretty.rolling_origin #' @export pretty.rolling_origin #' @export #' @method pretty rolling_origin #' @rdname pretty.vfold_cv pretty.rolling_origin <- function(x, ...) "Rolling origin forecast resampling" #' @exportMethod pretty.mc_cv #' @export pretty.mc_cv #' @export #' @method pretty mc_cv #' @rdname pretty.vfold_cv pretty.mc_cv <- function(x, ...) { details <- attributes(x) res <- paste0( "# Monte Carlo cross-validation (", signif(details$prop, 2), "/", signif(1 - details$prop, 2), ") with ", details$times, " resamples " ) if (details$strata) res <- paste(res, "using stratification") res } #' @exportMethod pretty.nested_cv #' @export pretty.nested_cv #' @export #' @method pretty nested_cv #' @rdname pretty.vfold_cv pretty.nested_cv <- function(x, ...) { print(class(x)) details <- attributes(x) if (is_call(details$outside)) { class(x) <- class(x)[!(class(x) == "nested_cv")] outer_label <- pretty(x) } else { outer_label <- paste0("`", deparse(details$outside), "`") } inner_label <- if (is_call(details$inside)) pretty(x$inner_resamples[[1]]) else paste0("`", deparse(details$inside), "`") res <- c("Nested resampling:", paste(" outer:", outer_label), paste(" inner:", inner_label)) res } #' @exportMethod pretty.bootstraps #' @export pretty.bootstraps #' @export #' @method pretty bootstraps #' @rdname pretty.vfold_cv pretty.bootstraps <- function(x, ...) { details <- attributes(x) res <- "Bootstrap sampling" if (details$strata) res <- paste(res, "using stratification") if (details$apparent) res <- paste(res, "with apparent sample") res } #' @exportMethod pretty.group_vfold_cv #' @export pretty.group_vfold_cv #' @export #' @method pretty group_vfold_cv #' @rdname pretty.vfold_cv pretty.group_vfold_cv <- function(x, ...) { details <- attributes(x) paste0("Group ", details$v, "-fold cross-validation") } #' Augment a data set with resampling identifiers #' #' For a data set, `add_resample_id()` will add at least one new column that #' identifies which resample that the data came from. In most cases, a single #' column is added but for some resampling methods two or more are added. #' @param .data A data frame #' @param split A single `rset` object. #' @param dots A single logical: should the id columns be prefixed with a "." #' to avoid name conflicts with `.data`? #' @return An updated data frame. #' @examples #' #' set.seed(363) #' car_folds <- vfold_cv(mtcars, repeats = 3) #' #' analysis(car_folds$splits[[1]]) %>% #' add_resample_id(car_folds$splits[[1]]) %>% #' head() #' #' car_bt <- bootstraps(mtcars) #' #' analysis(car_bt$splits[[1]]) %>% #' add_resample_id(car_bt$splits[[1]]) %>% #' head() #' @seealso labels.rsplit #' @export add_resample_id <- function(.data, split, dots = FALSE) { if (!inherits(dots, "logical") || length(dots) > 1) { stop("`dots` should be a single logical.", call. = FALSE) } if (!inherits(.data, "data.frame")) { stop("`.data` should be a data frame.", call. = FALSE) } if (!inherits(split, "rsplit")) { stop("`split` should be a single 'rset' object.", call. = FALSE) } labs <- labels(split) if (!tibble::is_tibble(labs) && nrow(labs) == 1) { stop("`split` should be a single 'rset' object.", call. = FALSE) } if (dots) { colnames(labs) <- paste0(".", colnames(labs)) } cbind(.data, labs) } rsample/R/dplyr-compat.R0000644000176200001440000000770113323650050014656 0ustar liggesusers## adapted from ## https://github.com/hadley/dtplyr/blob/2308ff25e88bb81fe84f9051e37ddd9d572189ee/R/compat-dplyr-0.6.0.R ## and based on ## https://github.com/tidyverse/googledrive/commit/95455812d2e0d6bdf92b5f6728e3265bf65d8467#diff-ba61d4f2ccd992868e27305a9ab68a3c ## function is called in .onLoad() #' @importFrom rlang is_string register_s3_method <- function(pkg, generic, class, fun = NULL) { # nocov start stopifnot(is_string(pkg)) envir <- asNamespace(pkg) stopifnot(is_string(generic)) stopifnot(is_string(class)) if (is.null(fun)) { fun <- get(paste0(generic, ".", class), envir = parent.frame()) } stopifnot(is.function(fun)) if (pkg %in% loadedNamespaces()) { registerS3method(generic, class, fun, envir = envir) } # Always register hook in case package is later unloaded & reloaded setHook( packageEvent(pkg, "onLoad"), function(...) { registerS3method(generic, class, fun, envir = envir) } ) } # nocov end reset_rset <- function(x) { stopifnot(inherits(x, "data.frame")) structure(x, class = c("rset", "tbl_df", "tbl", "data.frame")) } #' @importFrom tibble is_tibble is_rset <- function(x) { is_tibble(x) && all("splits" %in% names(x)) && length(grepl("^id", names(x))) > 0 } #' @export `[.rset` <- function(x, i, j, drop = FALSE) { maybe_rset(NextMethod()) } # A list of attribute names in the various resampling functions. These # could get stripped off by dplyr operations rsample_att <- c("times", "apparent", "strata", "v", "repeats", "group", "prop", "outside", "inside", "initial", "assess", "cumulative", "skip" ) #' @importFrom dplyr as_tibble maybe_rset <- function(x, extras = NULL, att = NULL) { if (is_rset(x)) { x <- reset_rset(x) ## possibly reset attributes that dplyr methods removed att <- att[names(att) %in% rsample_att] if (length(att) > 0) { missing_att <- setdiff(names(att), attributes(x)) if (length(missing_att) > 0) { for (i in missing_att) attr(x, i) <- att[[i]] } } ## Add an missing classes if(length(extras) > 0) class(x) <- c(extras, class(x)) } else { x <- as_tibble(x) } x } ## rsample does not import any generics from dplyr, ## but if dplyr is loaded and main verbs are used on a ## `rset` object generated from the various resampling ## functions, we want to retain the `rset`` class (and ## any others) if it is proper to do so therefore these ## S3 methods are registered manually in .onLoad() base_classes <- c("rset", class(tibble())) arrange.rset <- function(.data, ...) { # Find out if there are extra classes beyond `rset` such # as `vfold_cv` or `bootstraps` and add them in after # the `dplyr` method is executed. extra_classes <- setdiff(class(.data), base_classes) orig_att <- attributes(.data) maybe_rset(NextMethod(), extras = extra_classes, att = orig_att) } filter.rset <- function(.data, ...) { extra_classes <- setdiff(class(.data), base_classes) orig_att <- attributes(.data) maybe_rset(NextMethod(), extras = extra_classes, att = orig_att) } # `mutate` appears to add rownames but remove other attributes. We'll # add them back in. mutate.rset <- function(.data, ...) { extra_classes <- setdiff(class(.data), base_classes) orig_att <- attributes(.data) maybe_rset(NextMethod(), extras = extra_classes, att = orig_att) } rename.rset <- function(.data, ...) { extra_classes <- setdiff(class(.data), base_classes) orig_att <- attributes(.data) maybe_rset(NextMethod(), extras = extra_classes, att = orig_att) } select.rset <- function(.data, ...) { extra_classes <- setdiff(class(.data), base_classes) orig_att <- attributes(.data) maybe_rset(NextMethod(), extras = extra_classes, att = orig_att) } slice.rset <- function(.data, ...) { extra_classes <- setdiff(class(.data), base_classes) orig_att <- attributes(.data) maybe_rset(NextMethod(), extras = extra_classes, att = orig_att) } rsample/R/rolling_origin.R0000644000176200001440000000732713512170471015270 0ustar liggesusers#' Rolling Origin Forecast Resampling #' #' This resampling method is useful when the data set has a strong time #' component. The resamples are not random and contain data points that are #' consecutive values. The function assumes that the original data set are #' sorted in time order. #' @details The main options, `initial` and `assess`, control the number of #' data points from the original data that are in the analysis and assessment #' set, respectively. When `cumulative = TRUE`, the analysis set will grow as #' resampling continues while the assessment set size will always remain #' static. #' `skip` enables the function to not use every data point in the resamples. #' When `skip = 0`, the resampling data sets will increment by one position. #' Suppose that the rows of a data set are consecutive days. Using `skip = 6` #' will make the analysis data set operate on *weeks* instead of days. The #' assessment set size is not affected by this option. #' @inheritParams vfold_cv #' @param initial The number of samples used for analysis/modeling in the #' initial resample. #' @param assess The number of samples used for each assessment resample. #' @param cumulative A logical. Should the analysis resample grow beyond the #' size specified by `initial` at each resample?. #' @param skip A integer indicating how many (if any) _additional_ resamples #' to skip to thin the total amount of data points in the analysis resample. #' See the example below. #' @export #' @return An tibble with classes `rolling_origin`, `rset`, `tbl_df`, `tbl`, #' and `data.frame`. The results include a column for the data split objects #' and a column called `id` that has a character string with the resample #' identifier. #' @examples #' set.seed(1131) #' ex_data <- data.frame(row = 1:20, some_var = rnorm(20)) #' dim(rolling_origin(ex_data)) #' dim(rolling_origin(ex_data, skip = 2)) #' dim(rolling_origin(ex_data, skip = 2, cumulative = FALSE)) #' #' # You can also roll over calendar periods by first nesting by that period, #' # which is especially useful for irregular series where a fixed window #' # is not useful. This example slides over 5 years at a time. #' library(dplyr) #' data(drinks) #' #' drinks_annual <- drinks %>% #' mutate(year = as.POSIXlt(date)$year + 1900) %>% #' nest(-year) #' #' multi_year_roll <- rolling_origin(drinks_annual, cumulative = FALSE) #' #' analysis(multi_year_roll$splits[[1]]) #' assessment(multi_year_roll$splits[[1]]) #' #' @export rolling_origin <- function(data, initial = 5, assess = 1, cumulative = TRUE, skip = 0, ...) { n <- nrow(data) if (n < initial + assess) stop("There should be at least ", initial + assess, " nrows in `data`", call. = FALSE) stops <- seq(initial, (n - assess), by = skip + 1) starts <- if (!cumulative) stops - initial + 1 else starts <- rep(1, length(stops)) in_ind <- mapply(seq, starts, stops, SIMPLIFY = FALSE) out_ind <- mapply(seq, stops + 1, stops + assess, SIMPLIFY = FALSE) indices <- mapply(merge_lists, in_ind, out_ind, SIMPLIFY = FALSE) split_objs <- purrr::map(indices, make_splits, data = data, class = "rof_split") split_objs <- list(splits = split_objs, id = names0(length(split_objs), "Slice")) roll_att <- list(initial = initial, assess = assess, cumulative = cumulative, skip = skip) new_rset(splits = split_objs$splits, ids = split_objs$id, attrib = roll_att, subclass = c("rolling_origin", "rset")) } #' @export print.rolling_origin <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("rolling_origin", "rset"))] print(x) } rsample/R/complement.R0000644000176200001440000000461313414230452014406 0ustar liggesusers#' Determine the Assessment Samples #' #' Given an `rsplit` object, `complement` will determine which #' of the data rows are contained in the assessment set. To save space, #' many of the `rset` objects will not contain indicies for the #' assessment split. #' #' @param x An `rsplit` object #' @param ... Not currently used #' @return A integer vector. #' @seealso [populate()] #' @examples #' set.seed(28432) #' fold_rs <- vfold_cv(mtcars) #' head(fold_rs$splits[[1]]$in_id) #' fold_rs$splits[[1]]$out_id #' complement(fold_rs$splits[[1]]) #' @export complement <- function (x, ...) UseMethod("complement") #' @export complement.vfold_split <- function(x, ...) { if (!all(is.na(x$out_id))) { return(x$out_id) } else { setdiff(1:nrow(x$data), x$in_id) } } #' @export complement.mc_split <- complement.vfold_split #' @export complement.loo_split <- complement.vfold_split #' @export complement.group_vfold_split <- complement.vfold_split #' @export complement.boot_split <- function(x, ...) { if (!all(is.na(x$out_id))) { return(x$out_id) } else { (1:nrow(x$data))[-unique(x$in_id)] } } #' @export complement.rof_split <- function(x, ...) { if (!all(is.na(x$out_id))) { return(x$out_id) } else { stop("Cannot derive the assessment set for this type of resampling.", call. = FALSE) } } #' @export complement.apparent_split <- function(x, ...) { if (!all(is.na(x$out_id))) { return(x$out_id) } else { 1:nrow(x$data) } } #' Add Assessment Indicies #' #' Many `rsplit` and `rset` objects do not contain indicators for #' the assessment samples. `populate()` can be used to fill the slot #' for the appropriate indices. #' @param x A `rsplit` and `rset` object. #' @param ... Not currently used #' @return An object of the same kind with the integer indicies. #' @examples #' set.seed(28432) #' fold_rs <- vfold_cv(mtcars) #' #' fold_rs$splits[[1]]$out_id #' complement(fold_rs$splits[[1]]) #' #' populate(fold_rs$splits[[1]])$out_id #' #' fold_rs_all <- populate(fold_rs) #' fold_rs_all$splits[[1]]$out_id #' @export populate <- function (x, ...) UseMethod("populate") #' @export populate.rsplit <- function(x, ...) { x$out_id <- complement(x, ...) x } #' @export populate.rset <- function(x, ...) { x$splits <- map(x$splits, populate) x } ## This will remove the assessment indices from an rsplit object rm_out <- function(x) { x$out_id <- NA x } rsample/R/loo.R0000644000176200001440000000250013512170471013030 0ustar liggesusers#' Leave-One-Out Cross-Validation #' #' Leave-one-out (LOO) cross-validation uses one data point in the original #' set as the assessment data and all other data points as the analysis set. A #' LOO resampling set has as many resamples as rows in the original data set. #' @inheritParams vfold_cv #' @return An tibble with classes `loo_cv`, `rset`, `tbl_df`, `tbl`, and #' `data.frame`. The results include a column for the data split objects and #' one column called `id` that has a character string with the resample #' identifier. #' @examples #' loo_cv(mtcars) #' @importFrom purrr map #' @export loo_cv <- function(data, ...) { split_objs <- vfold_splits(data = data, v = nrow(data)) split_objs <- list(splits = map(split_objs$splits, change_class), id = paste0("Resample", seq_along(split_objs$id))) ## We remove the holdout indicies since it will save space and we can ## derive them later when they are needed. split_objs$splits <- map(split_objs$splits, rm_out) new_rset(splits = split_objs$splits, ids = split_objs$id, subclass = c("loo_cv", "rset")) } #' @export print.loo_cv <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("loo_cv", "rset"))] print(x) } change_class <- function(x) { class(x) <- c("rsplit", "loo_split") x } rsample/R/pkg.R0000644000176200001440000000262113323650050013020 0ustar liggesusers#' rsample: General Resampling Infrastructure for R #' #'\pkg{rsample} has functions to create variations of a data set #' that can be used to evaluate models or to estimate the #' sampling distribution of some statistic. #' #' @section Terminology: #'\itemize{ #' \item A **resample** is the result of a two-way split of a #' data set. For example, when bootstrapping, one part of the #' resample is a sample with replacement of the original data. #' The other part of the split contains the instances that were #' not contained in the bootstrap sample. The data structure #' `rsplit` is used to store a single resample. #' \item When the data are split in two, the portion that are #' used to estimate the model or calculate the statistic is #' called the **analysis** set here. In machine learning this #' is sometimes called the "training set" but this would be #' poorly named since it might conflict with any initial split #' of the original data. #' \item Conversely, the other data in the split are called the #' **assessment** data. In bootstrapping, these data are #' often called the "out-of-bag" samples. #' \item A collection of resamples is contained in an #' `rset` object. #'} #' #' @section Basic Functions: #' The main resampling functions are: [vfold_cv()], #' [bootstraps()], [mc_cv()], #' [rolling_origin()], and [nested_cv()]. #' @docType package #' @name rsample NULL rsample/R/boot.R0000644000176200001440000001143713512170471013213 0ustar liggesusers#' Bootstrap Sampling #' #' A bootstrap sample is a sample that is the same size as the original data #' set that is made using replacement. This results in analysis samples that #' have multiple replicates of some of the original rows of the data. The #' assessment set is defined as the rows of the original data that were not #' included in the bootstrap sample. This is often referred to as the #' "out-of-bag" (OOB) sample. #' @details The argument `apparent` enables the option of an additional #' "resample" where the analysis and assessment data sets are the same as the #' original data set. This can be required for some types of analysis of the #' bootstrap results. #' The `strata` argument is based on a similar argument in the random forest #' package were the bootstrap samples are conducted *within the stratification #' variable*. The can help ensure that the number of data points in the #' bootstrap sample is equivalent to the proportions in the original data set. #' @inheritParams vfold_cv #' @param times The number of bootstrap samples. #' @param strata A variable that is used to conduct stratified sampling. When #' not `NULL`, each bootstrap sample is created within the stratification #' variable. This could be a single character value or a variable name that #' corresponds to a variable that exists in the data frame. #' @param breaks A single number giving the number of bins desired to stratify #' a numeric stratification variable. #' @param apparent A logical. Should an extra resample be added where the #' analysis and holdout subset are the entire data set. This is required for #' some estimators used by the `summary` function that require the apparent #' error rate. #' @export #' @return An tibble with classes `bootstraps`, `rset`, `tbl_df`, `tbl`, and #' `data.frame`. The results include a column for the data split objects and a #' column called `id` that has a character string with the resample identifier. #' @examples #' bootstraps(mtcars, times = 2) #' bootstraps(mtcars, times = 2, apparent = TRUE) #' #' library(purrr) #' iris2 <- iris[1:130, ] #' #' set.seed(13) #' resample1 <- bootstraps(iris2, times = 3) #' map_dbl(resample1$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' #' set.seed(13) #' resample2 <- bootstraps(iris2, strata = "Species", times = 3) #' map_dbl(resample2$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' #' set.seed(13) #' resample3 <- bootstraps(iris2, strata = "Sepal.Length", breaks = 6, times = 3) #' map_dbl(resample3$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' @export bootstraps <- function(data, times = 25, strata = NULL, breaks = 4, apparent = FALSE, ...) { if(!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) if(length(strata) == 0) strata <- NULL } strata_check(strata, names(data)) split_objs <- boot_splits( data = data, times = times, strata = strata, breaks = breaks ) if(apparent) split_objs <- bind_rows(split_objs, apparent(data)) boot_att <- list(times = times, apparent = apparent, strata = !is.null(strata)) new_rset(splits = split_objs$splits, ids = split_objs$id, attrib = boot_att, subclass = c("bootstraps", "rset")) } # Get the indices of the analysis set from the analysis set (= bootstrap sample) boot_complement <- function(ind, n) { list(analysis = ind, assessment = NA) } #' @importFrom purrr map map_df #' @importFrom tibble tibble boot_splits <- function(data, times = 25, strata = NULL, breaks = 4) { n <- nrow(data) if (is.null(strata)) { indices <- purrr::map(rep(n, times), sample, replace = TRUE) } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), breaks = breaks)) stratas <- split(stratas, stratas$strata) stratas <- purrr::map_df( stratas, strat_sample, prop = 1, times = times, replace = TRUE ) indices <- split(stratas$idx, stratas$rs_id) } indices <- lapply(indices, boot_complement, n = n) split_objs <- purrr::map(indices, make_splits, data = data, class = "boot_split") list(splits = split_objs, id = names0(length(split_objs), "Bootstrap")) } #' @export print.bootstraps <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("bootstraps", "rset"))] print(x) } rsample/R/rsplit.R0000644000176200001440000001000613356221017013553 0ustar liggesusersrsplit <- function(data, in_id, out_id) { if (!is.data.frame(data) & !is.matrix(data)) stop("`data` must be a data frame.", call. = FALSE) if (!is.integer(in_id) | any(in_id < 1)) stop("`in_id` must be a positive integer vector.", call. = FALSE) if(!all(is.na(out_id))) { if (!is.integer(out_id) | any(out_id < 1)) stop("`out_id` must be a positive integer vector.", call. = FALSE) } if (length(in_id) == 0) stop("At least one row should be selected for the analysis set.", call. = FALSE) structure( list( data = data, in_id = in_id, out_id = out_id ), class = "rsplit" ) } #' @export print.rsplit <- function(x, ...) { out_char <- if (all(is.na(x$out_id))) paste(length(complement(x))) else paste(length(x$out_id)) cat("<", length(x$in_id), "/", out_char, "/", nrow(x$data), ">\n", sep = "") } #' @export as.integer.rsplit <- function(x, data = c("analysis", "assessment"), ...) { data <- match.arg(data) if (data == "analysis") out <- x$in_id else { out <- if (all(is.na(x$out_id))) complement(x) else x$out_id } out } #' Convert an `rsplit` object to a data frame #' #' The analysis or assessment code can be returned as a data #' frame (as dictated by the `data` argument) using #' `as.data.frame.rsplit`. `analysis` and #' `assessment` are shortcuts. #' @param x An `rsplit` object. #' @param row.names `NULL` or a character vector giving the row names for the data frame. Missing values are not allowed. #' @param optional A logical: should the column names of the data be checked for legality? #' @param data Either "analysis" or "assessment" to specify which data are returned. #' @param ... Additional arguments to be passed to or from methods. Not currently used. #' @examples #' library(dplyr) #' set.seed(104) #' folds <- vfold_cv(mtcars) #' #' model_data_1 <- folds$splits[[1]] %>% analysis() #' holdout_data_1 <- folds$splits[[1]] %>% assessment() #' @export as.data.frame.rsplit <- function(x, row.names = NULL, optional = FALSE, data = "analysis", ...) { if (!is.null(row.names)) warning( "`row.names` is kept for consistency with the ", "underlying class but non-NULL values will be ", "ignored.", call. = FALSE) if (optional) warning( "`optional` is kept for consistency with the ", "underlying class but TRUE values will be ", "ignored.", call. = FALSE) x$data[as.integer(x, data = data, ...), , drop = FALSE] } #' @rdname as.data.frame.rsplit #' @export analysis <- function(x, ...) { if (!inherits(x, "rsplit")) stop("`x` should be an `rsplit` object", call. = FALSE) as.data.frame(x, data = "analysis", ...) } #' @rdname as.data.frame.rsplit #' @export assessment <- function(x, ...){ if (!inherits(x, "rsplit")) stop("`x` should be an `rsplit` object", call. = FALSE) as.data.frame(x, data = "assessment", ...) } #' @export dim.rsplit <- function(x, ...) { c( analysis = length(x$in_id), assessment = length(complement(x)), n = nrow(x$data), p = ncol(x$data) ) } #' @importFrom tibble obj_sum #' @method obj_sum rsplit #' @export obj_sum.rsplit <- function(x, ...) { out_char <- if (all(is.na(x$out_id))) paste(length(complement(x))) else paste(length(x$out_id)) paste0("rsplit [", length(x$in_id), "/", out_char, "]") } #' @importFrom tibble type_sum #' @importFrom dplyr case_when #' @method type_sum rsplit #' @export type_sum.rsplit <- function(x, ...) { out_char <- if (all(is.na(x$out_id))) format_n(length(complement(x))) else format_n(length(x$out_id)) paste0( "split [", format_n(length(x$in_id)), "/", out_char, "]" ) } format_n <- function(x, digits = 1) { case_when( log10(x) < 3 ~ paste(x), log10(x) >= 3 & log10(x) < 6 ~ paste0(round(x/1000, digits = digits), "K"), TRUE ~ paste0(round(x/1000000, digits = digits), "M"), ) } rsample/R/nest.R0000644000176200001440000000720713507237102013220 0ustar liggesusers#' Nested or Double Resampling #' #' `nested_cv` can be used to take the results of one resampling procedure #' and conduct further resamples within each split. Any type of resampling #' used in `rsample` can be used. #' #' @details #' It is a bad idea to use bootstrapping as the outer resampling procedure (see #' the example below) #' #' @inheritParams vfold_cv #' @param data A data frame. #' @param outside The initial resampling specification. This can be an already #' created object or an expression of a new object (see the examples below). #' If the latter is used, the `data` argument does not need to be #' specified and, if it is given, will be ignored. #' @param inside An expression for the type of resampling to be conducted #' within the initial procedure. #' @return An tibble with classe `nested_cv` and any other classes that #' outer resampling process normally contains. The results include a #' column for the outer data split objects, one or more `id` columns, #' and a column of nested tibbles called `inner_resamples` with the #' additional resamples. #' @examples #' ## Using expressions for the resampling procedures: #' nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5)) #' #' ## Using an existing object: #' folds <- vfold_cv(mtcars) #' nested_cv(mtcars, folds, inside = bootstraps(times = 5)) #' #' ## The dangers of outer bootstraps: #' set.seed(2222) #' bad_idea <- nested_cv(mtcars, #' outside = bootstraps(times = 5), #' inside = vfold_cv(v = 3)) #' #' first_outer_split <- bad_idea$splits[[1]] #' outer_analysis <- as.data.frame(first_outer_split) #' sum(grepl("Volvo 142E", rownames(outer_analysis))) #' #' ## For the 3-fold CV used inside of each bootstrap, how are the replicated #' ## `Volvo 142E` data partitioned? #' first_inner_split <- bad_idea$inner_resamples[[1]]$splits[[1]] #' inner_analysis <- as.data.frame(first_inner_split) #' inner_assess <- as.data.frame(first_inner_split, data = "assessment") #' #' sum(grepl("Volvo 142E", rownames(inner_analysis))) #' sum(grepl("Volvo 142E", rownames(inner_assess))) #' @importFrom rlang is_call #' @importFrom purrr map #' @importFrom dplyr bind_cols #' @importFrom methods formalArgs #' @export nested_cv <- function(data, outside, inside) { nest_args <- formalArgs(nested_cv) cl <- match.call() boot_msg <- paste0( "Using bootstrapping as the outer resample is dangerous ", "since the inner resample might have the same data ", "point in both the analysis and assessment set." ) outer_cl <- cl[["outside"]] if (is_call(outer_cl)) { if (grepl("^bootstraps", deparse(outer_cl))) warning(boot_msg, call. = FALSE) outer_cl$data <- quote(data) outside <- eval(outer_cl) } else { if (inherits(outside, "bootstraps")) warning(boot_msg, call. = FALSE) } inner_cl <- cl[["inside"]] if (!is_call(inner_cl)) stop( "`inside` should be a expression such as `vfold()` or ", "bootstraps(times = 10)` instead of a existing object.", call. = FALSE ) inside <- map(outside$splits, inside_resample, cl = inner_cl) inside <- tibble(inner_resamples = inside) out <- dplyr::bind_cols(outside, inside) out <- add_class(out, cls = "nested_cv", at_end = FALSE) attr(out, "outside") <- cl$outside attr(out, "inside") <- cl$inside out } inside_resample <- function(src, cl) { cl$data <- quote(as.data.frame(src)) eval(cl) } #' @importFrom tibble tibble #' @importFrom rlang is_call #' @export print.nested_cv <- function(x, ...) { char_x <- paste("#", pretty(x)) cat(char_x, sep = "\n") class(x) <- class(tibble()) print(x) } rsample/R/groups.R0000644000176200001440000000727613512132573013576 0ustar liggesusers#' Group V-Fold Cross-Validation #' #' Group V-fold cross-validation creates splits of the data based #' on some grouping variable (which may have more than a single row #' associated with it). The function can create as many splits as #' there are unique values of the grouping variable or it can #' create a smaller set of splits where more than one value is left #' out at a time. #' #' @param data A data frame. #' @param group This could be a single character value or a variable #' name that corresponds to a variable that exists in the data frame. #' @param v The number of partitions of the data set. If let #' `NULL`, `v` will be set to the number of unique values #' in the group. #' @param ... Not currently used. #' @export #' @return An tibble with classes `group_vfold_cv`, #' `rset`, `tbl_df`, `tbl`, and `data.frame`. #' The results include a column for the data split objects and an #' identification variable. #' @examples #' set.seed(3527) #' test_data <- data.frame(id = sort(sample(1:20, size = 80, replace = TRUE))) #' test_data$dat <- runif(nrow(test_data)) #' #' set.seed(5144) #' split_by_id <- group_vfold_cv(test_data, group = "id") #' #' get_id_left_out <- function(x) #' unique(assessment(x)$id) #' #' library(purrr) #' table(map_int(split_by_id$splits, get_id_left_out)) #' #' set.seed(5144) #' split_by_some_id <- group_vfold_cv(test_data, group = "id", v = 7) #' held_out <- map(split_by_some_id$splits, get_id_left_out) #' table(unlist(held_out)) #' # number held out per resample: #' map_int(held_out, length) #' @export group_vfold_cv <- function(data, group = NULL, v = NULL, ...) { if(!missing(group)) { group <- tidyselect::vars_select(names(data), !!enquo(group)) if(length(group) == 0) { group <- NULL } } if (is.null(group) || !is.character(group) || length(group) != 1) stop( "`group` should be a single character value for the column ", "that will be used for splitting.", call. = FALSE ) if (!any(names(data) == group)) stop("`group` should be a column in `data`.", call. = FALSE) split_objs <- group_vfold_splits(data = data, group = group, v = v) ## We remove the holdout indicies since it will save space and we can ## derive them later when they are needed. split_objs$splits <- map(split_objs$splits, rm_out) ## Save some overall information cv_att <- list(v = v, group = group) new_rset(splits = split_objs$splits, ids = split_objs[, grepl("^id", names(split_objs))], attrib = cv_att, subclass = c("group_vfold_cv", "rset")) } #' @importFrom dplyr %>% group_vfold_splits <- function(data, group, v = NULL) { uni_groups <- unique(getElement(data, group)) max_v <- length(uni_groups) if (is.null(v)) { v <- max_v } else { if (v > max_v) stop("`v` should be less than ", max_v, call. = FALSE) } data_ind <- data.frame(..index = 1:nrow(data), ..group = getElement(data, group)) keys <- data.frame(..group = uni_groups) n <- nrow(keys) keys$..folds <- sample(rep(1:v, length.out = n)) data_ind <- data_ind %>% full_join(keys, by = "..group") %>% arrange(..index) indices <- split(data_ind$..index, data_ind$..folds) indices <- lapply(indices, vfold_complement, n = nrow(data)) split_objs <- purrr::map(indices, make_splits, data = data, class = "group_vfold_split") tibble::tibble(splits = split_objs, id = names0(length(split_objs), "Resample")) } #' @export print.group_vfold_cv <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("group_vfold_cv", "rset"))] print(x) } #' @importFrom utils globalVariables utils::globalVariables("..index") rsample/R/tidy.R0000644000176200001440000001025513512170471013216 0ustar liggesusers#' Tidy Resampling Object #' #' The `tidy` function from the \pkg{broom} package can be used on `rset` and #' `rsplit` objects to generate tibbles with which rows are in the analysis and #' assessment sets. #' @param x A `rset` or `rsplit` object #' @param unique_ind Should unique row identifiers be returned? For example, #' if `FALSE` then bootstrapping results will include multiple rows in the #' sample for the same row in the original data. #' @param ... Not currently used. #' @return A tibble with columns `Row` and `Data`. The latter has possible #' values "Analysis" or "Assessment". For `rset` inputs, identification columns #' are also returned but their names and values depend on the type of #' resampling. `vfold_cv` contains a column "Fold" and, if repeats are used, #' another called "Repeats". `bootstraps` and `mc_cv` use the column #' "Resample". #' @details Note that for nested resampling, the rows of the inner resample, #' named `inner_Row`, are *relative* row indices and do not correspond to the #' rows in the original data set. #' @examples #' library(ggplot2) #' theme_set(theme_bw()) #' #' set.seed(4121) #' cv <- tidy(vfold_cv(mtcars, v = 5)) #' ggplot(cv, aes(x = Fold, y = Row, fill = Data)) + #' geom_tile() + scale_fill_brewer() #' #' set.seed(4121) #' rcv <- tidy(vfold_cv(mtcars, v = 5, repeats = 2)) #' ggplot(rcv, aes(x = Fold, y = Row, fill = Data)) + #' geom_tile() + facet_wrap(~Repeat) + scale_fill_brewer() #' #' set.seed(4121) #' mccv <- tidy(mc_cv(mtcars, times = 5)) #' ggplot(mccv, aes(x = Resample, y = Row, fill = Data)) + #' geom_tile() + scale_fill_brewer() #' #' set.seed(4121) #' bt <- tidy(bootstraps(mtcars, time = 5)) #' ggplot(bt, aes(x = Resample, y = Row, fill = Data)) + #' geom_tile() + scale_fill_brewer() #' #' dat <- data.frame(day = 1:30) #' # Resample by week instead of day #' ts_cv <- rolling_origin(dat, initial = 7, assess = 7, #' skip = 6, cumulative = FALSE) #' ts_cv <- tidy(ts_cv) #' ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) + #' geom_tile() + scale_fill_brewer() #' @importFrom dplyr bind_rows arrange_ #' @importFrom tibble tibble #' @export tidy.rsplit <- function(x, unique_ind = TRUE, ...) { if (unique_ind) x$in_id <- unique(x$in_id) out <- tibble(Row = c(x$in_id, complement(x)), Data = rep(c("Analysis", "Assessment"), c(length(x$in_id), length(complement(x))))) out <- dplyr::arrange(.data = out, Data, Row) out } #' @rdname tidy.rsplit #' @export #' @inheritParams tidy.rsplit #' @importFrom dplyr arrange tidy.rset <- function(x, ...) { stacked <- purrr::map(x$splits, tidy) for (i in seq(along = stacked)) stacked[[i]]$Resample <- x$id[i] stacked <- dplyr::bind_rows(stacked) stacked <- dplyr::arrange(.data = stacked, Data, Row) stacked } #' @rdname tidy.rsplit #' @export #' @inheritParams tidy.rsplit #' @importFrom dplyr arrange tidy.vfold_cv <- function(x, ...) { stacked <- purrr::map(x$splits, tidy) for (i in seq(along = stacked)) { if (attr(x, "repeats") > 1) { stacked[[i]]$Repeat <- x$id[i] stacked[[i]]$Fold <- x$id2[i] } else stacked[[i]]$Fold <- x$id[i] } stacked <- dplyr::bind_rows(stacked) stacked <- dplyr::arrange(.data = stacked, Data, Row) stacked } #' @rdname tidy.rsplit #' @export #' @inheritParams tidy.rsplit #' @importFrom dplyr arrange full_join #' @importFrom tidyr unnest #' @importFrom purrr map tidy.nested_cv <- function(x, ...) { x$inner_tidy <- purrr::map(x$inner_resamples, tidy_wrap) inner_tidy <- tidyr::unnest(x, inner_tidy) class(x) <- class(x)[class(x) != "nested_cv"] outer_tidy <- tidy(x) id_cols <- names(outer_tidy) id_cols <- id_cols[!(id_cols %in% c("Row", "Data"))] inner_id <- grep("^id", names(inner_tidy)) if(length(inner_id) != length(id_cols)) stop("Cannot merge tidt data sets", call. = FALSE) names(inner_tidy)[inner_id] <- id_cols full_join(outer_tidy, inner_tidy, by = id_cols) } tidy_wrap <- function(x) { x <- tidy(x) names(x) <- paste0("inner_", names(x)) x } # ---------------------------------------------------------------- #' @importFrom utils globalVariables utils::globalVariables( c("Data", "Row") ) rsample/R/apparent.R0000644000176200001440000000256313450421255014062 0ustar liggesusers#' Sampling for the Apparent Error Rate #' #' When building a model on a data set and re-predicting the same data, the #' performance estimate from those predictions is often call the #' "apparent" performance of the model. This estimate can be wildly #' optimistic. "Apparent sampling" here means that the analysis and #' assessment samples are the same. These resamples are sometimes used in #' the analysis of bootstrap samples and should otherwise be #' avoided like old sushi. #' #' @inheritParams vfold_cv #' @return A tibble with a single row and classes `apparent`, #' `rset`, `tbl_df`, `tbl`, and `data.frame`. The #' results include a column for the data split objects and one column #' called `id` that has a character string with the resample identifier. #' @examples #' apparent(mtcars) #' @importFrom purrr map #' @export apparent <- function(data, ...) { splits <- rsplit(data, in_id = 1:nrow(data), out_id = 1:nrow(data)) # splits <- rm_out(splits) class(splits) <- c("rsplit", "apparent_split") split_objs <- tibble::tibble(splits = list(splits), id = "Apparent") split_objs <- add_class(split_objs, cls = c("apparent", "rset"), at_end = FALSE) split_objs } #' @export print.apparent <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("apparent", "rset"))] print(x) } rsample/R/bootci.R0000644000176200001440000003127213507027211013523 0ustar liggesusers# Bootstrap confidence interval code # ------------------------------------------------------------------------------ # helpers check_rset <- function(x, app = TRUE) { if (!inherits(x, "bootstraps")) stop("`.data` should be an `rset` object generated from `bootstraps()`", call. = FALSE) if (app) { if(x %>% dplyr::filter(id == "Apparent") %>% nrow() != 1) stop("Please set `apparent = TRUE` in `bootstraps()` function", call. = FALSE) } invisible(NULL) } stat_fmt_err <- paste("`statistics` should select a list column of tidy results.") stat_nm_err <- paste("The tibble in `statistics` should have columns for", "'estimate' and 'term`") std_exp <- c("std.error", "robust.se") check_tidy_names <- function(x, std_col) { # check for proper columns if (sum(colnames(x) == "estimate") != 1) { stop(stat_nm_err, call. = FALSE) } if (sum(colnames(x) == "term") != 1) { stop(stat_nm_err, call. = FALSE) } if (std_col) { std_candidates <- colnames(x) %in% std_exp if (sum(std_candidates) != 1) { stop("`statistics` should select a single column for the standard ", "error.", call. = FALSE) } } invisible(TRUE) } #' @importFrom stats setNames check_tidy <- function(x, std_col = FALSE) { if (!is.list(x)) { stop(stat_fmt_err, call. = FALSE) } # convert to data frame from list has_id <- any(names(x) == "id") if (has_id) { x <- try(unnest(x), silent = TRUE) } else { x <- try(map_dfr(x, ~ .x), silent = TRUE) } if (inherits(x, "try-error")) { stop(stat_fmt_err, call. = FALSE) } check_tidy_names(x, std_col) if (std_col) { std_candidates <- colnames(x) %in% std_exp std_candidates <- colnames(x)[std_candidates] if (has_id) { x <- dplyr::select(x, term, estimate, id, tidyselect::one_of(std_candidates)) %>% mutate(id = (id == "Apparent")) %>% setNames(c("term", "estimate", "orig", "std_err")) } else { x <- dplyr::select(x, term, estimate, tidyselect::one_of(std_candidates)) %>% setNames(c("term", "estimate", "std_err")) } } else { if (has_id) { x <- dplyr::select(x, term, estimate, id) %>% mutate(orig = (id == "Apparent")) %>% dplyr::select(-id) } else { x <- dplyr::select(x, term, estimate) } } x } get_p0 <- function(x, alpha = 0.05) { orig <- x %>% group_by(term) %>% dplyr::filter(orig) %>% dplyr::select(term, theta_0 = estimate) %>% ungroup() x %>% dplyr::filter(!orig) %>% inner_join(orig, by = "term") %>% group_by(term) %>% summarize(p0 = mean(estimate <= theta_0, na.rm = TRUE)) %>% mutate(Z0 = stats::qnorm(p0), Za = stats::qnorm(1 - alpha / 2)) } new_stats <- function(x, lo, hi) { res <- as.numeric(quantile(x, probs = c(lo, hi), na.rm = TRUE)) tibble(.lower = min(res), .estimate = mean(x, na.rm = TRUE), .upper = max(res)) } has_dots <- function(x) { nms <- names(formals(x)) if (!any(nms == "...")) { stop("`.fn` must have an argument `...`.", call. = FALSE) } invisible(NULL) } check_num_resamples <- function(x, B = 1000) { x <- x %>% dplyr::group_by(term) %>% dplyr::summarize(n = sum(!is.na(estimate))) %>% dplyr::filter(n < B) if (nrow(x) > 0) { terms <- paste0("`", x$term, "`") msg <- paste0( "Recommend at least ", B, " non-missing bootstrap resamples for ", ifelse(length(terms) > 1, "terms: ", "term "), paste0(terms, collapse = ", "), "." ) warning(msg, call. = FALSE) } invisible(NULL) } # ------------------------------------------------------------------------------ # percentile code pctl_single <- function(stats, alpha = 0.05) { if (all(is.na(stats))) stop("All statistics have missing values..", call. = FALSE) if (!is.numeric(stats)) stop("`stats` must be a numeric vector.", call. = FALSE) # stats is a numeric vector of values ci <- stats %>% quantile(probs = c(alpha / 2, 1 - alpha / 2), na.rm = TRUE) # return a tibble with .lower, .estimate, .upper res <- tibble( .lower = min(ci), .estimate = mean(stats, na.rm = TRUE), .upper = max(ci), .alpha = alpha, .method = "percentile" ) res } #' Bootstrap confidence intervals #' @description #' Calculate bootstrap confidence intervals using various methods. #' @param .data A data frame containing the bootstrap resamples created using #' `bootstraps()`. For t- and BCa-intervals, the `apparent` argument #' should be set to `TRUE`. #' @param statistics An unquoted column name or `dplyr` selector that identifies #' a single column in the data set that contains the indiviual bootstrap #' estimates. This can be a list column of tidy tibbles (that contains columns #' `term` and `estimate`) or a simple numeric column. For t-intervals, a #' standard tidy column (usually called `std.err`) is required. #' See the examples below. #' @param alpha Level of significance #' @return Each function returns a tibble with columns `.lower`, #' `.estimate`, `.upper`, `.alpha`, `.method`, and `term`. #' `.method` is the type of interval (eg. "percentile", #' "student-t", or "BCa"). `term` is the name of the estimate. #' @details Percentile intervals are the standard method of #' obtaining confidence intervals but require thousands of #' resamples to be accurate. t-intervals may need fewer #' resamples but require a corresponding variance estimate. #' Bias-corrected and accelerated intervals require the original function #' that was used to create the statistics of interest and are #' computationally taxing. #' #' @references Davison, A., & Hinkley, D. (1997). _Bootstrap Methods and their #' Application_. Cambridge: Cambridge University Press. #' doi:10.1017/CBO9780511802843 #' #' @examples #' library(broom) #' library(dplyr) #' library(purrr) #' library(tibble) #' #' lm_est <- function(split, ...) { #' lm(mpg ~ disp + hp, data = analysis(split)) %>% #' tidy() #' } #' #' set.seed(52156) #' car_rs <- #' bootstraps(mtcars, 1000, apparent = TRUE) %>% #' mutate(results = map(splits, lm_est)) #' #' int_pctl(car_rs, results) #' int_t(car_rs, results) #' int_bca(car_rs, results, .fn = lm_est) #' #' # putting results into a tidy format #' rank_corr <- function(split) { #' dat <- analysis(split) #' tibble( #' term = "corr", #' estimate = cor(dat$Sepal.Length, dat$Sepal.Width, method = "spearman"), #' # don't know the analytical std.err so no t-intervals #' std.err = NA_real_ #' ) #' } #' #' set.seed(69325) #' bootstraps(iris, 1000, apparent = TRUE) %>% #' mutate(correlations = map(splits, rank_corr)) %>% #' int_pctl(correlations) #' @importFrom purrr map map_dfr #' @importFrom rlang enquo #' @importFrom dplyr mutate last ungroup group_by inner_join summarize do #' @importFrom tidyselect vars_select one_of #' @importFrom furrr future_map_dfr #' @export int_pctl <- function(.data, statistics, alpha = 0.05) { check_rset(.data) .data <- .data %>% dplyr::filter(id != "Apparent") column_name <- tidyselect::vars_select(names(.data), !!rlang::enquo(statistics)) if (length(column_name) != 1) { stop(stat_fmt_err, call. = FALSE) } stats <- .data[[column_name]] stats <- check_tidy(stats, std_col = FALSE) check_num_resamples(stats, B = 1000) vals <- stats %>% dplyr::group_by(term) %>% dplyr::do(pctl_single(.$estimate, alpha = alpha)) %>% dplyr::ungroup() vals } # ------------------------------------------------------------------------------ # t interval code #' @importFrom tibble tibble t_single <- function(stats, std_err, is_orig, alpha = 0.05) { # stats is a numeric vector of values # vars is a numeric vector of variances # return a tibble with .lower, .estimate, .upper # which_orig is the index of stats and std_err that has the original result if (all(is.na(stats))) stop("All statistics have missing values.", call. = FALSE) if (!is.logical(is_orig) || any(is.na(is_orig))) { stop("`is_orig` should be a logical column the same length as `stats` ", "with no missing values.", call. = FALSE) } if (length(stats) != length(std_err) && length(stats) != length(is_orig)) { stop("`stats`, `std_err`, and `is_orig` should have the same length.", call. = FALSE) } if (sum(is_orig) != 1) { stop("The original statistic must be in a single row.", call. = FALSE) } theta_obs <- stats[is_orig] std_err_obs <- std_err[is_orig] stats <- stats[!is_orig] std_err <- std_err[!is_orig] z_dist <- (stats - theta_obs) / std_err z_pntl <- quantile(z_dist, probs = c(alpha / 2, 1 - (alpha) / 2), na.rm = TRUE) ci <- theta_obs - z_pntl * std_err_obs tibble( .lower = min(ci), .estimate = mean(stats, na.rm = TRUE), .upper = max(ci), .alpha = alpha, .method = "student-t" ) } #' @rdname int_pctl #' @inheritParams int_pctl #' @importFrom dplyr as_tibble mutate #' @importFrom rlang quos #' @importFrom purrr map2 map_dfr #' @export int_t <- function(.data, statistics, alpha = 0.05) { check_rset(.data) column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics)) if (length(column_name) != 1) { stop(stat_fmt_err, call. = FALSE) } stats <- .data %>% dplyr::select(!!column_name, id) stats <- check_tidy(stats, std_col = TRUE) check_num_resamples(stats, B = 500) vals <- stats %>% dplyr::group_by(term) %>% dplyr::do(t_single(.$estimate, .$std_err, .$orig, alpha = alpha)) %>% dplyr::ungroup() vals } # ---------------------------------------------------------------- #' @importFrom dplyr last #' @importFrom rlang exec #' @importFrom purrr pluck map_dbl map_dfr #' @importFrom stats qnorm pnorm bca_calc <- function(stats, orig_data, alpha = 0.05, .fn, ...) { # TODO check per term if (all(is.na(stats$estimate))) { stop("All statistics have missing values.", call. = FALSE) } ### Estimating Z0 bias-correction bias_corr_stats <- get_p0(stats) # need the original data frame here loo_rs <- loo_cv(orig_data) # We can't be sure what we will get back from the analysis function. # To test, we run on the first LOO data set and see if it is a vector or df loo_test <- try(rlang::exec(.fn, loo_rs$splits[[1]], ...), silent = TRUE) if (inherits(loo_test, "try-error")) { cat("Running `.fn` on the LOO resamples produced an error:\n") print(loo_test) stop("`.fn` failed.", call. = FALSE) } loo_res <- furrr::future_map_dfr(loo_rs$splits, .fn, ...) loo_estimate <- loo_res %>% dplyr::group_by(term) %>% dplyr::summarize(loo = mean(estimate, na.rm = TRUE)) %>% dplyr::inner_join(loo_res, by = "term") %>% dplyr::group_by(term) %>% dplyr::summarize( cubed = sum((loo - estimate)^3), squared = sum((loo - estimate)^2) ) %>% dplyr::ungroup() %>% dplyr::inner_join(bias_corr_stats, by = "term") %>% dplyr::mutate( a = cubed/(6 * (squared^(3 / 2))), Zu = (Z0 + Za) / ( 1 - a * (Z0 + Za)) + Z0, Zl = (Z0 - Za) / (1 - a * (Z0 - Za)) + Z0, lo = stats::pnorm(Zl, lower.tail = TRUE), hi = stats::pnorm(Zu, lower.tail = TRUE) ) terms <- loo_estimate$term stats <- stats %>% dplyr::filter(!orig) for (i in seq_along(terms)) { tmp <- new_stats(stats$estimate[ stats$term == terms[i] ], lo = loo_estimate$lo[i], hi = loo_estimate$hi[i]) tmp$term <- terms[i] if (i == 1) { ci_bca <- tmp } else { ci_bca <- bind_rows(ci_bca, tmp) } } ci_bca <- ci_bca %>% dplyr::select(term, .lower, .estimate, .upper) %>% dplyr::mutate( .alpha = alpha, .method = "BCa" ) } #' @rdname int_pctl #' @inheritParams int_pctl #' @param .fn A function to calculate statistic of interest. The #' function should take an `rsplit` as the first argument and the `...` are #' required. #' @param ... Arguments to pass to `.fn`. #' @references \url{https://tidymodels.github.io/rsample/articles/Applications/Intervals.html} #' @importFrom purrr map_dfr #' @export int_bca <- function(.data, statistics, alpha = 0.05, .fn, ...) { check_rset(.data) has_dots(.fn) column_name <- tidyselect::vars_select(names(.data), !!enquo(statistics)) if (length(column_name) != 1) { stop(stat_fmt_err, call. = FALSE) } stats <- .data %>% dplyr::select(!!column_name, id) stats <- check_tidy(stats) check_num_resamples(stats, B = 1000) vals <- bca_calc(stats, .data$splits[[1]]$data, alpha = alpha, .fn = .fn, ...) vals } # ---------------------------------------------------------------- #' @importFrom utils globalVariables utils::globalVariables( c("id", ".", ".estimate", ".lower", ".upper", "Z0", "Za", "Zl", "Zu", "a", "cubed", "estimate", "orig", "p0", "squared", "term", "theta_0", "loo", "n") ) rsample/R/data.R0000644000176200001440000000321513356221017013153 0ustar liggesusers#' Job Attrition #' #' @details These data are from the IBM Watson Analytics Lab. #' The website describes the data with \dQuote{Uncover the #' factors that lead to employee attrition and explore important #' questions such as \sQuote{show me a breakdown of distance #' from home by job role and attrition} or \sQuote{compare #' average monthly income by education and attrition}. This is a #' fictional data set created by IBM data scientists.}. There #' are 1470 rows. #' #' @name attrition #' @aliases attrition #' @docType data #' @return \item{attrition}{a data frame} #' #' @source The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ #' #' #' @keywords datasets #' @examples #' data(attrition) #' str(attrition) NULL #' Two Class Data #' #' @details There are artifical data with two predictors (`A` and `B`) and #' a factor outcome variable (`Class`). #' #' @name two_class_dat #' @aliases two_class_dat #' @docType data #' @return \item{two_class_dat}{a data frame} #' #' @keywords datasets #' @examples #' data(two_class_dat) #' str(two_class_dat) NULL #' Sample Time Series Data #' #' @details Drink sales. The exact name of the series from FRED is: #' "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices #' Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales" #' #' @name drinks #' @aliases drinks #' @docType data #' @return \item{drinks}{a data frame} #' #' @source The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN #' #' @keywords datasets #' @examples #' data(drinks) #' str(drinks) NULL rsample/R/reexports.R0000644000176200001440000000006713356176010014301 0ustar liggesusers#' @importFrom generics tidy #' @export generics::tidy rsample/R/vfold.R0000644000176200001440000001211613512170471013355 0ustar liggesusers#' V-Fold Cross-Validation #' #' V-fold cross-validation randomly splits the data into V groups of roughly #' equal size (called "folds"). A resample of the analysis data consisted of #' V-1 of the folds while the assessment set contains the final fold. In basic #' V-fold cross-validation (i.e. no repeats), the number of resamples is equal #' to V. #' @details #' The `strata` argument causes the random sampling to be conducted *within #' the stratification variable*. The can help ensure that the number of data #' points in the analysis data is equivalent to the proportions in the original #' data set. #' When more than one repeat is requested, the basic V-fold cross-validation #' is conducted each time. For example, if three repeats are used with `v = #' 10`, there are a total of 30 splits which as three groups of 10 that are #' generated separately. #' @param data A data frame. #' @param v The number of partitions of the data set. #' @param repeats The number of times to repeat the V-fold partitioning. #' @param strata A variable that is used to conduct stratified sampling to #' create the folds. This could be a single character value or a variable name #' that corresponds to a variable that exists in the data frame. #' @param breaks A single number giving the number of bins desired to stratify #' a numeric stratification variable. #' @param ... Not currently used. #' @export #' @return A tibble with classes `vfold_cv`, `rset`, `tbl_df`, `tbl`, and #' `data.frame`. The results include a column for the data split objects and #' one or more identification variables. For a single repeats, there will be #' one column called `id` that has a character string with the fold identifier. #' For repeats, `id` is the repeat number and an additional column called `id2` #' that contains the fold information (within repeat). #' @examples #' vfold_cv(mtcars, v = 10) #' vfold_cv(mtcars, v = 10, repeats = 2) #' #' library(purrr) #' iris2 <- iris[1:130, ] #' #' set.seed(13) #' folds1 <- vfold_cv(iris2, v = 5) #' map_dbl(folds1$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' #' set.seed(13) #' folds2 <- vfold_cv(iris2, strata = "Species", v = 5) #' map_dbl(folds2$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' #' set.seed(13) #' folds3 <- vfold_cv(iris2, strata = "Petal.Length", breaks = 6, v = 5) #' map_dbl(folds3$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' @export vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) { if(!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) if(length(strata) == 0) strata <- NULL } strata_check(strata, names(data)) if (repeats == 1) { split_objs <- vfold_splits(data = data, v = v, strata = strata, breaks = breaks) } else { for (i in 1:repeats) { tmp <- vfold_splits(data = data, v = v, strata = strata) tmp$id2 <- tmp$id tmp$id <- names0(repeats, "Repeat")[i] split_objs <- if (i == 1) tmp else rbind(split_objs, tmp) } } ## We remove the holdout indicies since it will save space and we can ## derive them later when they are needed. split_objs$splits <- map(split_objs$splits, rm_out) ## Save some overall information cv_att <- list(v = v, repeats = repeats, strata = !is.null(strata)) new_rset(splits = split_objs$splits, ids = split_objs[, grepl("^id", names(split_objs))], attrib = cv_att, subclass = c("vfold_cv", "rset")) } # Get the indices of the analysis set from the assessment set vfold_complement <- function(ind, n) { list(analysis = setdiff(1:n, ind), assessment = ind) } #' @importFrom tibble tibble #' @importFrom purrr map #' @importFrom dplyr bind_rows vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4) { if (!is.numeric(v) || length(v) != 1) stop("`v` must be a single integer.", call. = FALSE) n <- nrow(data) if (is.null(strata)) { folds <- sample(rep(1:v, length.out = n)) idx <- seq_len(n) indices <- split(idx, folds) } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), breaks = breaks)) stratas <- split(stratas, stratas$strata) stratas <- purrr::map(stratas, add_vfolds, v = v) stratas <- dplyr::bind_rows(stratas) indices <- split(stratas$idx, stratas$folds) } indices <- lapply(indices, vfold_complement, n = n) split_objs <- purrr::map(indices, make_splits, data = data, class = "vfold_split") tibble::tibble(splits = split_objs, id = names0(length(split_objs), "Fold")) } add_vfolds <- function(x, v) { x$folds <- sample(rep(1:v, length.out = nrow(x))) x } #' @export print.vfold_cv <- function(x, ...) { cat("# ", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("vfold_cv", "rset"))] print(x) } rsample/R/mc.R0000644000176200001440000001067013512170471012645 0ustar liggesusers#' Monte Carlo Cross-Validation #' #' One resample of Monte Carlo cross-validation takes a random sample (without #' replacement) of the original data set to be used for analysis. All other #' data points are added to the assessment set. #' @details The `strata` argument causes the random sampling to be conducted #' *within the stratification variable*. The can help ensure that the number of #' data points in the analysis data is equivalent to the proportions in the #' original data set. #' @inheritParams vfold_cv #' @param prop The proportion of data to be retained for modeling/analysis. #' @param times The number of times to repeat the sampling. #' @param strata A variable that is used to conduct stratified sampling to #' create the resamples. This could be a single character value or a variable #' name that corresponds to a variable that exists in the data frame. #' @param breaks A single number giving the number of bins desired to stratify #' a numeric stratification variable. #' @export #' @return An tibble with classes `mc_cv`, `rset`, `tbl_df`, `tbl`, and #' `data.frame`. The results include a column for the data split objects and a #' column called `id` that has a character string with the resample identifier. #' @examples #' mc_cv(mtcars, times = 2) #' mc_cv(mtcars, prop = .5, times = 2) #' #' library(purrr) #' iris2 <- iris[1:130, ] #' #' set.seed(13) #' resample1 <- mc_cv(iris2, times = 3, prop = .5) #' map_dbl(resample1$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' #' set.seed(13) #' resample2 <- mc_cv(iris2, strata = "Species", times = 3, prop = .5) #' map_dbl(resample2$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' #' set.seed(13) #' resample3 <- mc_cv(iris2, strata = "Sepal.Length", breaks = 6, times = 3, prop = .5) #' map_dbl(resample3$splits, #' function(x) { #' dat <- as.data.frame(x)$Species #' mean(dat == "virginica") #' }) #' @export mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) { if(!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) if(length(strata) == 0) strata <- NULL } strata_check(strata, names(data)) split_objs <- mc_splits(data = data, prop = 1 - prop, times = times, strata = strata, breaks = breaks) ## We remove the holdout indicies since it will save space and we can ## derive them later when they are needed. split_objs$splits <- map(split_objs$splits, rm_out) mc_att <- list(prop = prop, times = times, strata = !is.null(strata)) new_rset(splits = split_objs$splits, ids = split_objs$id, attrib = mc_att, subclass = c("mc_cv", "rset")) } # Get the indices of the analysis set from the assessment set mc_complement <- function(ind, n) { list(analysis = setdiff(1:n, ind), assessment = ind) } #' @importFrom purrr map map_df #' @importFrom tibble tibble mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) { if (!is.numeric(prop) | prop >= 1 | prop <= 0) stop("`prop` must be a number on (0, 1).", call. = FALSE) n <- nrow(data) if (is.null(strata)) { indices <- purrr::map(rep(n, times), sample, size = floor(n * prop)) } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), breaks = breaks)) stratas <- split(stratas, stratas$strata) stratas <- purrr::map_df(stratas, strat_sample, prop = prop, times = times) indices <- split(stratas$idx, stratas$rs_id) } indices <- lapply(indices, mc_complement, n = n) split_objs <- purrr::map(indices, make_splits, data = data, class = "mc_split") list(splits = split_objs, id = names0(length(split_objs), "Resample")) } #' @importFrom purrr map map_df strat_sample <- function(x, prop, times, ...) { n <- nrow(x) idx <- purrr::map(rep(n, times), sample, size = floor(n*prop), ...) out <- purrr::map_df(idx, function(ind, x) x[sort(ind), "idx"], x = x) out$rs_id <- rep(1:times, each = floor(n*prop)) out } #' @export print.mc_cv <- function(x, ...) { cat("#", pretty(x), "\n") class(x) <- class(x)[!(class(x) %in% c("mc_cv", "rset"))] print(x) } rsample/R/form_pred.R0000644000176200001440000000216413323650050014216 0ustar liggesusers#' Extract Predictor Names from Formula or Terms #' #' `all.vars` returns all variables used in a formula. This #' function only returns the variables explicitly used on the #' right-hand side (i.e., it will not resolve dots unless the #' object is terms with a data set specified). #' @param object A model formula or [stats::terms()] #' object. #' @param ... Arguments to pass to [all.vars()] #' @return A character vector of names #' @export #' @examples #' form_pred(y ~ x + z) #' form_pred(terms(y ~ x + z)) #' #' form_pred(y ~ x + log(z)) #' form_pred(log(y) ~ x + z) #' #' form_pred(y1 + y2 ~ x + z) #' form_pred(log(y1) + y2 ~ x + z) #' #' # will fail: #' # form_pred(y ~ .) #' #' form_pred(terms(Species ~ (.)^2, data = iris)) #' form_pred(terms( ~ (.)^2, data = iris)) #' @importFrom stats terms form_pred <- function(object, ...) { if(inherits(object, "formula")) { object <- terms(object) } y_index <- attr(object, "response") ## If there is something on the lhs of the formula, ## remove it and get vars if(y_index != 0) { object[[2]] <- NULL object <- terms(object) } all.vars(object, ...) } rsample/R/misc.R0000644000176200001440000000577613323650050013210 0ustar liggesusersmake_splits <- function(ind, data, class = NULL) { res <- rsplit(data, ind$analysis, ind$assessment) if (!is.null(class)) res <- add_class(res, class) res } merge_lists <- function(a, b) list(analysis = a, assessment = b) dim_rset <- function(x, ...) { dims <- purrr::map(x$splits, dim) dims <- do.call("rbind", dims) dims <- tibble::as_tibble(dims) id_cols <- grep("^id", colnames(x), value = TRUE) for (i in seq_along(id_cols)) dims[id_cols[i]] <- getElement(x, id_cols[i]) dims } names0 <- function (num, prefix = "x") { if (num < 1) stop("`num` should be > 0", call. = FALSE) ind <- format(1:num) ind <- gsub(" ", "0", ind) paste0(prefix, ind) } add_class <- function(x, cls, at_end = TRUE) { class(x) <- if (at_end) c(class(x), cls) else c(cls, class(x)) x } strata_check <- function(strata, vars) { if (!is.null(strata)) { if (!is.character(strata) | length(strata) != 1) stop("`strata` should be a single character value", call. = FALSE) if (!(strata %in% vars)) stop(strata, " is not in `data`") } invisible(NULL) } #' @importFrom tibble is_tibble as_tibble tibble #' @importFrom dplyr bind_cols # `splits`` should be either a list or a tibble with a single column # called "splits" # `ids`` should be either a character vector or a tibble with # one or more columns that begin with "id" new_rset <- function(splits, ids, attrib = NULL, subclass = character()) { stopifnot(is.list(splits)) if (!is_tibble(ids)) { ids <- tibble(id = ids) } else { if (!all(grepl("^id", names(ids)))) stop("The `ids` tibble column names should start with 'id'", call. = FALSE) } either_type <- function(x) is.character(x) | is.factor(x) ch_check <- vapply(ids, either_type, c(logical = TRUE)) if(!all(ch_check)) stop("All ID columns should be character or factor ", "vectors.", call. = FALSE) if (!is_tibble(splits)) { splits <- tibble(splits = splits) } else { if(ncol(splits) > 1 | names(splits)[1] != "splits") stop("The `splits` tibble should have a single column ", "named `splits`.", call. = FALSE) } if (nrow(ids) != nrow(splits)) stop("Split and ID vectors have different lengths.", call. = FALSE) # Create another element to the splits that is a tibble containing # an identifer for each id column so that, in isolation, the resample # id can be known just based on the `rsplit` object. This can then be # accessed using the `labels` methof for `rsplits` splits$splits <- map2(splits$splits, split(ids, 1:nrow(ids)), add_id) res <- bind_cols(splits, ids) if (!is.null(attrib)) { if (any(names(attrib) == "")) stop("`attrib` should be a fully named list.", call. = FALSE) for (i in names(attrib)) attr(res, i) <- attrib[[i]] } if (length(subclass) > 0) res <- add_class(res, cls = subclass, at_end = FALSE) res } add_id <- function(split, id) { split$id <- id split } rsample/R/make_strata.R0000644000176200001440000001002213512132573014531 0ustar liggesusers#' Create or Modify Stratification Variables #' #' For stratified resampling, this function can create strata from numeric data #' and also make non-numeric data more conducive to be used for #' stratification. #' @details #' For numeric data, if the number of unique levels is less than #' `nunique`, the data are treated as categorical data. #' #' For categorical inputs, the function will find levels of `x` than #' occur in the data with percentage less than `pool`. The values from #' these groups will be randomly assigned to the remaining strata (as will #' data points that have missing values in `x`). #' #' For numeric data with more unique values than `nunique`, the data #' will be converted to being categorical based on percentiles of the data. #' The percentile groups will have no more than 20 percent of the data in #' each group. Again, missing values in `x` are randomly assigned #' to groups. #' #' @param x An input vector. #' @param breaks A single number giving the number of bins desired to stratify a #' numeric stratification variable. #' @param nunique An integer for the number of unique value threshold in the #' algorithm. #' @param pool A proportion of data used to determine if a particular group is #' too small and should be pooled into another group. #' @param depth An integer that is used to determine the best number of #' percentiles that should be used. The number of bins are based on #' `min(5, floor(n / depth))` where `n = length(x)`. #' If `x` is numeric, there must be at least 40 rows in the data set #' (when `depth = 20`) to conduct stratified sampling. #' #' @export #' @return A factor vector. #' @examples #' set.seed(61) #' x1 <- rpois(100, lambda = 5) #' table(x1) #' table(make_strata(x1)) #' #' set.seed(554) #' x2 <- rpois(100, lambda = 1) #' table(x2) #' table(make_strata(x2)) #' #' # small groups are randomly assigned #' x3 <- factor(x2) #' table(x3) #' table(make_strata(x3)) #' #' # `oilType` data from `caret` #' x4 <- rep(LETTERS[1:7], c(37, 26, 3, 7, 11, 10, 2)) #' table(x4) #' table(make_strata(x4)) #' table(make_strata(x4, pool = 0.1)) #' table(make_strata(x4, pool = 0.0)) #' #' # not enough data to stratify #' x5 <- rnorm(20) #' table(make_strata(x5)) #' #' set.seed(483) #' x6 <- rnorm(200) #' quantile(x6, probs = (0:10)/10) #' table(make_strata(x6, breaks = 10)) #' @export #' @importFrom stats quantile make_strata <- function(x, breaks = 4, nunique = 5, pool = .15, depth = 20) { num_vals <- unique(x) n <- length(x) num_miss <- sum(is.na(x)) if (length(num_vals) <= nunique | is.character(x) | is.factor(x)) { x <- factor(x) xtab <- sort(table(x)) pcts <- xtab / n ## This should really be based on some combo of rate and number. if (all(pcts < pool)) { warning("Too little data to stratify. Unstratified resampling ", "will be used.", call. = FALSE) return(factor(rep("strata1", n))) } ## Small groups will be randomly allocated to stratas at end ## These should probably go into adjacent groups but this works for now if (any(pcts < pool)) x[x %in% names(pcts)[pcts < pool]] <- NA ## The next line will also relevel the data if `x` was a factor out <- factor(as.character(x)) } else { if (floor(n / breaks) < depth) { warning(paste0("The number of observations in each quantile is ", "below the recommended threshold of ", depth, ". Stratification ", "will be done with ", floor(n/depth), " breaks instead."), call. = FALSE) } breaks <- min(breaks, floor(n/depth)) if (breaks < 2) { warning("Too little data to stratify. Unstratified resampling ", "will be used.", call. = FALSE) return(factor(rep("strata1", n))) } pctls <- quantile(x, probs = (0:breaks) / breaks) pctls <- unique(pctls) out <- cut(x, breaks = pctls, include.lowest = TRUE) } num_miss <- sum(is.na(x)) if (num_miss > 0) out[is.na(x)] <- sample(levels(out), size = num_miss, replace = TRUE) out } rsample/R/gather.R0000644000176200001440000000436513323650050013520 0ustar liggesusers#' Gather an `rset` Object #' #' This method uses `gather` on an `rset` object to stack all of #' the non-ID or split columns in the data and is useful for #' stacking model evaluation statistics. The resulting data frame #' has a column based on the column names of `data` and another for #' the values. #' #' @inheritParams gather #' @param data An `rset` object. #' @param key,value,... Not specified in this method and will be #' ignored. Note that this means that selectors are ignored if #' they are passed to the function. #' @param na.rm If `TRUE`, will remove rows from output where the #' value column in `NA`. #' @param convert If `TRUE` will automatically run #' `type.convert()` on the key column. This is useful if the column #' names are actually numeric, integer, or logical. #' @param factor_key If FALSE, the default, the key values will be #' stored as a character vector. If `TRUE`, will be stored as a #' factor, which preserves the original ordering of the columns. #' @return A data frame with the ID columns, a column called #' `model` (with the previous column names), and a column called #' `statistic` (with the values). #' @examples #' library(rsample) #' cv_obj <- vfold_cv(mtcars, v = 10) #' cv_obj$lm_rmse <- rnorm(10, mean = 2) #' cv_obj$nnet_rmse <- rnorm(10, mean = 1) #' gather(cv_obj) #' @exportMethod gather.rset #' @export gather.rset #' @export #' @method gather rset #' @importFrom tidyr gather #' @importFrom dplyr select %>% #' @importFrom rlang !! gather.rset <- function(data, key = NULL, value = NULL, ..., na.rm = TRUE, convert = FALSE, factor_key = TRUE) { if(any(names(data) == "splits")) data <- data %>% select(-splits) data <- as.data.frame(data) id_vars <- grep("^id", names(data), value = TRUE) other_vars <- names(data)[!(names(data) %in% id_vars)] if(length(other_vars) < 2) stop("There should be at least two other columns ", "(besides `id` variables) in the data set to ", "use `gather`.") # check types? gather( data, key = model, value = statistic, - !!id_vars, na.rm = na.rm, convert = convert, factor_key = factor_key ) } #' @importFrom utils globalVariables utils::globalVariables(c("model", "splits", "statistic")) rsample/R/caret.R0000644000176200001440000001140313323650050013333 0ustar liggesusers#' Convert Resampling Objects to Other Formats #' #' These functions can convert resampling objects between #' \pkg{rsample} and \pkg{caret}. #' #' @param object An `rset` object. Currently, #' `nested_cv` is not supported. #' @return `rsample2caret` returns a list that mimics the #' `index` and `indexOut` elements of a #' `trainControl` object. `caret2rsample` returns an #' `rset` object of the appropriate class. #' @export #' @importFrom purrr map rsample2caret <- function(object, data = c("analysis", "assessment")) { if(!inherits(object, "rset")) stop("`object` must be an `rset`", call. = FALSE) data <- match.arg(data) in_ind <- purrr::map(object$splits, as.integer, data = "analysis") names(in_ind) <- labels(object) out_ind <- purrr::map(object$splits, as.integer, data = "assessment") names(out_ind) <- names(in_ind) list(index = in_ind, indexOut = out_ind) } #' @rdname rsample2caret #' @param ctrl An object produced by `trainControl` that has #' had the `index` and `indexOut` elements populated by #' integers. One method of getting this is to extract the #' `control` objects from an object produced by `train`. #' @param data The data that was originally used to produce the #' `ctrl` object. #' @importFrom purrr map map2 #' @importFrom tibble tibble #' @importFrom dplyr bind_cols #' @export caret2rsample <- function(ctrl, data = NULL) { if (is.null(data)) stop("Must supply original data", call. = FALSE) if (!any(names(ctrl) == "index")) stop("`ctrl` should have an element `index`", call. = FALSE) if (!any(names(ctrl) == "indexOut")) stop("`ctrl` should have an element `indexOut`", call. = FALSE) if (is.null(ctrl$index)) stop("`ctrl$index` should be populated with integers", call. = FALSE) if (is.null(ctrl$indexOut)) stop("`ctrl$indexOut` should be populated with integers", call. = FALSE) indices <- purrr::map2(ctrl$index, ctrl$indexOut, extract_int) id_data <- names(indices) indices <- unname(indices) indices <- purrr::map(indices, add_data, y = data) indices <- map(indices, add_rsplit_class, cl = map_rsplit_method(ctrl$method)) indices <- tibble::tibble(splits = indices) if (ctrl$method %in% c("repeatedcv", "adaptive_cv")) { id_data <- strsplit(id_data, split = ".", fixed = TRUE) id_data <- tibble::tibble( id = vapply(id_data, function(x) x[2], character(1)), id2 = vapply(id_data, function(x) x[1], character(1)) ) } else { id_data <- tibble::tibble(id = id_data) } out <- dplyr::bind_cols(indices, id_data) attrib <- map_attr(ctrl) for (i in names(attrib)) attr(out, i) <- attrib[[i]] out <- add_rset_class(out, map_rset_method(ctrl$method)) out } extract_int <- function(x, y) list(in_id = x, out_id = y) add_data <- function(x, y) c(list(data = y), x) add_rsplit_class <- function(x, cl) { class(x) <- c("rsplit", cl) x } add_rset_class <- function(x, cl) { class(x) <- c(cl, "rset", "tbl_df", "tbl", "data.frame") x } map_rsplit_method <- function(method) { out <- switch( method, cv = , repeatedcv = , adaptive_cv = "vfold_split", boot = , boot_all =, boot632 = , optimism_boot = , adaptive_boot = "boot_split", LOOCV = "loo_split", LGOCV = , adaptive_LGOCV = "mc_split", timeSlice = "rof_split", "error" ) if (out == "error") stop("Resampling method `", method, "` cannot be converted into an `rsplit` object", call. = FALSE) out } map_rset_method <- function(method) { out <- switch( method, cv = , repeatedcv = , adaptive_cv = "vfold_cv", boot = , boot_all =, boot632 = , optimism_boot = , adaptive_boot = "bootstraps", LOOCV = "loo_cv", LGOCV = , adaptive_LGOCV = "mc_cv", timeSlice = "rolling_origin", "error" ) if (out == "error") stop("Resampling method `", method, "` cannot be converted into an `rset` object", call. = FALSE) out } map_attr <- function(object) { if (grepl("cv$", object$method)) { out <- list(v = object$number, repeats = ifelse(!is.na(object$repeats), object$repeats, 1), strata = TRUE) } else if (grepl("boot", object$method)) { out <- list(times = object$number, apparent = FALSE, strata = FALSE) } else if (grepl("LGOCV$", object$method)) { out <- list(times = object$number, prop = object$p, strata = FALSE) } else if (object$method == "LOOCV") { out <- list() } else if (object$method == "timeSlice") { out <- list( initial = object$initialWindow, assess = object$horizon, cumulative = !object$fixedWindow, skip = object$skip ) } else { stop("Method", object$method, "cannot be converted") } out } rsample/R/initial_split.R0000644000176200001440000000454513512170471015116 0ustar liggesusers#' Simple Training/Test Set Splitting #' #' `initial_split` creates a single binary split of the data into a training #' set and testing set. `initial_time_split` does the same, but takes the #' _first_ `prop` samples for training, instead of a random selection. #' `training` and `testing` are used to extract the resulting data. #' @details The `strata` argument causes the random sampling to be conducted #' *within the stratification variable*. The can help ensure that the number of #' data points in the training data is equivalent to the proportions in the #' original data set. #' @inheritParams vfold_cv #' @param prop The proportion of data to be retained for modeling/analysis. #' @param strata A variable that is used to conduct stratified sampling to #' create the resamples. This could be a single character value or a variable #' name that corresponds to a variable that exists in the data frame. #' @param breaks A single number giving the number of bins desired to stratify #' a numeric stratification variable. #' @export #' @return An `rset` object that can be used with the `training` and `testing` #' functions to extract the data in each split. #' @examples #' set.seed(1353) #' car_split <- initial_split(mtcars) #' train_data <- training(car_split) #' test_data <- testing(car_split) #' #' drinks_split <- initial_time_split(drinks) #' train_data <- training(drinks_split) #' test_data <- testing(car_split) #' c(max(train_data$date), min(test_data$date)) # no overlap #' @export #' initial_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) { if(!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) if(length(strata) == 0) strata <- NULL } res <- mc_cv( data = data, prop = prop, strata = strata, breaks = breaks, times = 1, ... ) res$splits[[1]] } #' @rdname initial_split #' @export initial_time_split <- function(data, prop = 3/4, ...) { if (!is.numeric(prop) | prop >= 1 | prop <= 0) { stop("`prop` must be a number on (0, 1).", call. = FALSE) } n_train <- floor(nrow(data) * prop) rsplit(data, 1:n_train, (n_train + 1):nrow(data)) } #' @rdname initial_split #' @export #' @param x An `rsplit` object produced by `initial_split` training <- function(x) analysis(x) #' @rdname initial_split #' @export testing <- function(x) assessment(x) rsample/vignettes/0000755000176200001440000000000013512177703013733 5ustar liggesusersrsample/vignettes/Working_with_rsets.Rmd0000644000176200001440000001655313356266567020321 0ustar liggesusers--- title: "Working with rsets" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Working with rsets} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3, width = 90) library(ggplot2) theme_set(theme_bw()) ``` ## Introduction `rsample` can be used to create objects containing resamples of the original data. This page contains examples of how those objects can be used for data analysis. For illustration, the `attrition` data is used. From the help file: > These data are from the IBM Watson Analytics Lab. The website describes the data with "Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or 'compare average monthly income by education and attrition'. This is a fictional data set created by IBM data scientists." There are 1470 rows. The data can be accessed using ```{r attrition, message=FALSE} library(rsample) data("attrition") names(attrition) table(attrition$Attrition) ``` ## Model Assessment Let's fit a logistic regression model to the data with model terms for the job satisfaction, gender, and monthly income. If we were fitting the model to the entire data set, we might model attrition using ```r glm(Attrition ~ JobSatisfaction + Gender + MonthlyIncome, data = attrition, family = binomial) ``` For convenience, we'll create a formula object that will be used later: ```{r form, message=FALSE} mod_form <- as.formula(Attrition ~ JobSatisfaction + Gender + MonthlyIncome) ``` To evaluate this model, we will use 10 repeats of 10-fold cross-validation and use the 100 holdout samples to evaluate the overall accuracy of the model. First, let's make the splits of the data: ```{r model_vfold, message=FALSE} library(rsample) set.seed(4622) rs_obj <- vfold_cv(attrition, v = 10, repeats = 10) rs_obj ``` Now let's write a function that will, for each resample: 1. obtain the analysis data set (i.e. the 90% used for modeling) 1. fit a logistic regression model 1. predict the assessment data (the other 10% not used for the model) using the `broom` package 1. determine if each sample was predicted correctly. Here is our function: ```{r lm_func} ## splits will be the `rsplit` object with the 90/10 partition holdout_results <- function(splits, ...) { # Fit the model to the 90% mod <- glm(..., data = analysis(splits), family = binomial) # Save the 10% holdout <- assessment(splits) # `augment` will save the predictions with the holdout data set res <- broom::augment(mod, newdata = holdout) # Class predictions on the assessment set from class probs lvls <- levels(holdout$Attrition) predictions <- factor(ifelse(res$.fitted > 0, lvls[2], lvls[1]), levels = lvls) # Calculate whether the prediction was correct res$correct <- predictions == holdout$Attrition # Return the assessment data set with the additional columns res } ``` For example: ```{r onefold, warning = FALSE} example <- holdout_results(rs_obj$splits[[1]], mod_form) dim(example) dim(assessment(rs_obj$splits[[1]])) ## newly added columns: example[1:10, setdiff(names(example), names(attrition))] ``` For this model, the `.fitted` value is the linear predictor in log-odds units. To compute this data set for each of the 100 resamples, we'll use the `map` function from the `purrr` package: ```{r model_purrr, warning=FALSE} library(purrr) rs_obj$results <- map(rs_obj$splits, holdout_results, mod_form) rs_obj ``` Now we can compute the accuracy values for all of the assessment data sets: ```{r model_acc} rs_obj$accuracy <- map_dbl(rs_obj$results, function(x) mean(x$correct)) summary(rs_obj$accuracy) ``` Keep in mind that the baseline accuracy to beat is the rate of non-attrition, which is `r round(mean(attrition$Attrition == "No"), 3)`. Not a great model so far. ## Using the Bootstrap to Make Comparisons Traditionally, the bootstrap has been primarily used to empirically determine the sampling distribution of a test statistic. Given a set of samples with replacement, a statistic can be calculated on each analysis set and the results can be used to make inferences (such as confidence intervals). For example, are there differences in the median monthly income between genders? ```{r type_plot} ggplot(attrition, aes(x = Gender, y = MonthlyIncome)) + geom_boxplot() + scale_y_log10() ``` If we wanted to compare the genders, we could conduct a _t_-test or rank-based test. Instead, let's use the bootstrap to see if there is a difference in the median incomes for the two groups. We need a simple function to compute this statistic on the resample: ```{r mean_diff} median_diff <- function(splits) { x <- analysis(splits) median(x$MonthlyIncome[x$Gender == "Female"]) - median(x$MonthlyIncome[x$Gender == "Male"]) } ``` Now we would create a large number of bootstrap samples (say 2000+). For illustration, we'll only do 500 in this document. ```{r boot_mean_diff} set.seed(353) bt_resamples <- bootstraps(attrition, times = 500) ``` This function is then computed across each resample: ```{r stats} bt_resamples$wage_diff <- map_dbl(bt_resamples$splits, median_diff) ``` The bootstrap distribution of this statistic has a slightly bimodal and skewed distribution: ```{r stats_plot} ggplot(bt_resamples, aes(x = wage_diff)) + geom_line(stat = "density", adjust = 1.25) + xlab("Difference in Median Monthly Income (Female - Male)") ``` The variation is considerable in this statistic. One method of computing a confidence interval is to take the percentiles of the bootstrap distribution. A 95% confidence interval for the difference in the means would be: ```{r ci} quantile(bt_resamples$wage_diff, probs = c(0.025, 0.500, 0.975)) ``` _On average_, there is no evidence for a difference in the genders. ## Bootstrap Estimates of Model Coefficients Unless there is already a column in the resample object that contains the fitted model, a function can be used to fit the model and save all of the model coefficients. The [`broom` package](https://cran.r-project.org/package=broom) package has a `tidy` function that will save the coefficients in a data frame. Instead of returning a data frame with a row for each model term, we will save a data frame with a single row and columns for each model term. As before, `purrr::map` can be used to estimate and save these values for each split. ```{r coefs} glm_coefs <- function(splits, ...) { ## use `analysis` or `as.data.frame` to get the analysis data mod <- glm(..., data = analysis(splits), family = binomial) as.data.frame(t(coef(mod))) } bt_resamples$betas <- map(.x = bt_resamples$splits, .f = glm_coefs, mod_form) bt_resamples bt_resamples$betas[[1]] ``` ## Keeping Tidy As previously mentioned, the [`broom` package](https://cran.r-project.org/package=broom) contains a class called `tidy` that created representations of objects that can be easily used for analysis, plotting, etc. `rsample` contains `tidy` methods for `rset` and `rsplit` objects. For example: ```{r tidy_rsplit} first_resample <- bt_resamples$splits[[1]] class(first_resample) tidy(first_resample) ``` and ```{r tidy_rset} class(bt_resamples) tidy(bt_resamples) ``` rsample/vignettes/Basics.Rmd0000644000176200001440000000621213323650050015573 0ustar liggesusers--- title: "Basics" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Basics} output: knitr:::html_vignette: toc: yes --- ```{r ex_setup, include=FALSE} knitr::opts_chunk$set( message = FALSE, digits = 3, collapse = TRUE, comment = "#>" ) options(digits = 3) ``` ## Terminology We define a _resample_ as the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. Cross-validation is another type of resampling. ## `rset` Objects Contain Many Resamples The main class in the package (`rset`) is for a _set_ or _collection_ of resamples. In 10-fold cross-validation, the set would consist of the 10 different resamples of the original data. Like [`modelr`](https://cran.r-project.org/package=modelr), the resamples are stored in data-frame-like `tibble` object. As a simple example, here is a small set of bootstraps of the `mtcars` data: ```{r mtcars_bt, message=FALSE} library(rsample) set.seed(8584) bt_resamples <- bootstraps(mtcars, times = 3) bt_resamples ``` ## Individual Resamples are `rsplit` Objects The resamples are stored in the `splits` column in an object that has class `rsplit`. In this package we use the following terminology for the two partitions that comprise a resample: * The _analysis_ data are those that we selected in the resample. For a bootstrap, this is the sample with replacement. For 10-fold cross-validation, this is the 90% of the data. These data are often used to fit a model or calculate a statistic in traditional bootstrapping. * The _assessment_ data are usually the section of the original data not covered by the analysis set. Again, in 10-fold CV, this is the 10% held out. These data are often used to evaluate the performance of a model that was fit to the analysis data. (Aside: While some might use the term "training" and "testing" for these data sets, we avoid them since those labels often conflict with the data that result from an initial partition of the data that is typically done _before_ resampling. The training/test split can be conducted using the `initial_split` function in this package.) Let's look at one of the `rsplit` objects ```{r rsplit} first_resample <- bt_resamples$splits[[1]] first_resample ``` This indicates that there were `r dim(bt_resamples$splits[[1]])["analysis"]` data points in the analysis set, `r dim(bt_resamples$splits[[1]])["assessment"]` instances were in the assessment set, and that the original data contained `r dim(bt_resamples$splits[[1]])["n"]` data points. These results can also be determined using the `dim` function on an `rsplit` object. To obtain either of these data sets from an `rsplit`, the `as.data.frame` function can be used. By default, the analysis set is returned but the `data` option can be used to return the assessment data: ```{r rsplit_df} head(as.data.frame(first_resample)) as.data.frame(first_resample, data = "assessment") ``` Alternatively, you can use the shortcuts `analysis(first_resample)` and `assessment(first_resample)`.rsample/vignettes/Applications/0000755000176200001440000000000013512177703016361 5ustar liggesusersrsample/vignettes/Applications/Keras.Rmd0000644000176200001440000001707313356240330020073 0ustar liggesusers--- title: "Grid Search Tuning of Keras Models" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Grid Search Tuning of Keras Models} output: knitr:::html_vignette: toc: yes --- ```{r setup, include=FALSE} library(AmesHousing) library(rsample) library(dplyr) library(keras) library(yardstick) library(purrr) library(ggplot2) ``` Here we demonstrate a simple grid search to optimize a tuning parameter of a [`keras`](https://keras.rstudio.com/index.html) neural network. The Ames housing data is used to demonstrate. There are a number of predictors for these data but, for simplicity, we'll see how far we can get by just using the geocodes for the properties as predictors of price. The outcome will be modeled on the log10 scale. ```{r ames-import} library(AmesHousing) library(dplyr) ames <- make_ames() %>% select(Sale_Price, Longitude, Latitude) ``` To be consistent with other analyses of these data, a training/test split is made. However, this article focuses on the training set. Normally, feature preprocessing should be estimated within the resampling process to get generalizable estimates of performance. Here, the two predictors are simply centered and scaled beforehand to avoid complexity in this analysis. However, this is generally a bad idea and the article on [`recipes`](https://topepo.github.io/rsample/articles/Applications/Recipes_and_rsample.html) describes a proper methodology for preprocessing the data. ```{r ame-split} library(rsample) set.seed(4595) data_split <- initial_split(ames, strata = "Sale_Price") ames_train <- training(data_split) %>% mutate(Sale_Price = log10(Sale_Price), Longitude = scale(Longitude, center = TRUE), Latitude = scale(Latitude, center = TRUE)) ``` The resample the model, simple 10-fold cross-validation is done such that the splits use the outcome as a stratification variable. On average, there should be `r floor(nrow(ames_train)*.1)` properties in the assessment set and this should be enough to obtain good estimates of the model RMSE. ```{r splits} set.seed(2453) cv_splits <- vfold_cv(ames_train, v = 10, strata = "Sale_Price") ``` A single layer feed-forward neural network with 10 hidden units will be used to model these data. There are a great many tuning parameters for these models including those for structural aspects (e.g. number of hidden units, activation type, number of layers), the optimization (momentum, dropout rate, etc.), and so on. For simplicity, this article will only optimize the number of training epochs (i.e. iterations); basically this is testing for early stopping. A function is needed to compute the model on the analysis set, predict the assessment set, and compute the holdout root mean squared error (in log10 units). The function below constructs the model sequentially and takes the number of epochs as a parameter. The argument `split` will be used to pass a single element of `cv_splits$splits`. This object will contain the two splits of the data for a single resample. The ellipses (`...`) will be used to pass arbitrary arguments to `keras::fit`. In this function, the seed is set. A few of the model components, such as `initializer_glorot_uniform` and `layer_dropout`, use random numbers and their specific seeds are set from the session's seed. This helps with reproducibility. ```{r model-func} library(keras) library(yardstick) library(purrr) mlp_rmse <- function(epoch, split, ...) { # Set the seed to get reproducible starting values and dropouts set.seed(4109) # Clearing the session after the computations have finished # clears memory used by the last trial in preparation for # the next iteration. on.exit(keras::backend()$clear_session()) # Define a single layer MLP with dropout and ReLUs model <- keras_model_sequential() model %>% layer_dense( units = 10, activation = 'relu', input_shape = 2, kernel_initializer = initializer_glorot_uniform() ) %>% layer_dropout(rate = 0.4) %>% layer_dense(units = 1, activation = "linear") model %>% compile( loss = 'mean_squared_error', optimizer = optimizer_rmsprop(), metrics = 'mean_squared_error' ) # The data used for modeling (aka the "analysis" set) geocode <- analysis(split) %>% select(-Sale_Price) %>% as.matrix() model %>% fit( x = geocode, y = analysis(split)[["Sale_Price"]], epochs = epoch, ... ) # Now obtain the holdout set for prediction holdout <- assessment(split) pred_geocode <- holdout %>% select(-Sale_Price) %>% as.matrix() # Get predicted values and compute RMSE holdout %>% mutate(predicted = predict(model, pred_geocode)[,1]) %>% rmse(truth = Sale_Price, estimate = predicted) %>% pull(.estimate) } ``` Let's execute the function on the first fold of the data using a batch size of 128 and disable the print/plotting of the optimization: ```{r model-ex, message = FALSE, warning = FALSE} cv_splits$splits[[1]] mlp_rmse( epoch = 100, cv_splits$splits[[1]], # now options to keras::fit batch_size = 128, verbose = 0 ) ``` This works for a single resample and a single epoch setting. To tune over epochs, another function uses `map` to work over the tuning parameter for a specific resample. This is more advantageous than fixing the tuning parameter and then iterating over the data. If there was a complex feature engineering process being used, it only needs to be called once if functions are configured so that the _inner_ loop is over the tuning parameters. The result value is a tibble with the tuning parameter values, the performance estimates, and an indicator for the fold. ```{r grid-part} across_grid <- function(split, ...) { # Create grid epoch_values <- tibble(epoch = seq(50, 600, by = 50)) # Execute grid for this resample epoch_values$rmse <- map_dbl( epoch_values$epoch, mlp_rmse, split = split, # extra options for `fit` ... ) # Now attach the resample indicators using `labels` cbind(epoch_values, labels(split)) } ``` Note that the grid could easily have a been multidimensional so that many parameters could be optimized using a regular grid or via [random search](http://www.jmlr.org/papers/v13/bergstra12a.html). If that were the case, the candidate tuning parameter values could be in rows and the parameters in columns and `mlp_rmse` would then map the columns in the tibble to their respective arguments. `map_df` is used to operate over the folds. This will row-bind the resulting data frames together so that a single data set is assembled with the individual resampling results. ```{r resample-grid, message = FALSE, warning = FALSE} tune_results <- map_df( cv_splits$splits, across_grid, batch_size = 128, verbose = 0 ) ``` ```{r resample-gshow} head(tune_results) ``` The mean RMSE per epoch is computed and plotted along with the individual curves for each resample. ```{r resample-means} mean_values <- tune_results %>% group_by(epoch) %>% summarize(rmse = mean(rmse)) mean_values library(ggplot2) ggplot(mean_values, aes(x = epoch, y = rmse)) + geom_point() + geom_line() + geom_line(data = tune_results, aes(group = id), alpha = 0.1) + scale_y_continuous(trans = "log2") + theme_bw() ``` For this analysis, there doesn't seem to be any overfitting issues with a large number of iterations (since the RMSE does not increase). `caret` includes some [pre-defined `keras` models](https://topepo.github.io/caret/train-models-by-tag.html#Neural_Network) for single layer networks that can be used to optimize the model across a number of parameters. rsample/vignettes/Applications/Time_Series.Rmd0000644000176200001440000001324413356221017021233 0ustar liggesusers--- title: "Time Series Analysis Example" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Time Series Analysis Example} output: knitr:::html_vignette: toc: yes --- ```{r setup, include = FALSE} options(digits = 3) library(timetk) library(forecast) library(rsample) library(purrr) library(tidyr) library(sweep) library(dplyr) library(ggplot2) library(zoo) ``` "[Demo Week: Tidy Forecasting with `sweep`](http://www.business-science.io/code-tools/2017/10/25/demo_week_sweep.html)" is an excellent article that uses tidy methods with time series. This article uses their analysis with `rsample` to get performance estimates for future observations using [rolling forecast origin resampling](https://robjhyndman.com/hyndsight/crossvalidation/). The data are sales of alcoholic beverages originally from [the Federal Reserve Bank of St. Louis website](https://fred.stlouisfed.org/series/S4248SM144NCEN). ```{r read-data} library(tidymodels) data("drinks") str(drinks, give.att = FALSE) ``` Each row is a month of sales (in millions of US dollars). Suppose that predictions for one year ahead were needed and the model should use the most recent data from the last 20 years. To setup this resampling scheme: ```{r rof} roll_rs <- rolling_origin( drinks, initial = 12 * 20, assess = 12, cumulative = FALSE ) nrow(roll_rs) roll_rs ``` Each `split` element contains the information about that resample: ```{r split} roll_rs$splits[[1]] ``` For plotting, let's index each split by the first day of the assessment set: ```{r labels} get_date <- function(x) min(assessment(x)$date) start_date <- map(roll_rs$splits, get_date) roll_rs$start_date <- do.call("c", start_date) head(roll_rs$start_date) ``` This resampling scheme has `r nrow(roll_rs)` splits of the data so that there will be `r nrow(roll_rs)` ARIMA models that are fit. To create the models, the `auto.arima` function from the `forecast` package is used. The functions `analysis` and `assessment` return the data frame, so another step converts the data in to a `ts` object called `mod_dat` using a function in the `timetk` package. ```{r model-fun} library(forecast) # for `auto.arima` library(timetk) # for `tk_ts` library(zoo) # for `as.yearmon` fit_model <- function(x, ...) { # suggested by Matt Dancho: x %>% analysis() %>% # Since the first day changes over resamples, adjust it # based on the first date value in the data frame tk_ts(start = .$date[[1]] %>% as.yearmon(), freq = 12, silent = TRUE) %>% auto.arima(...) } ``` Each model is saved in a new column: ```{r model-fit, warning = FALSE, message = FALSE} roll_rs$arima <- map(roll_rs$splits, fit_model) # For example: roll_rs$arima[[1]] ``` (There are some warnings produced by these first regarding extra columns in the data that can be ignored) Using the model fits, performance will be measured in two ways: * _interpolation_ error will measure how well the model fits to the data that were used to create the model. This is most likely optimistic since no holdout method is used. * _extrapolation_ or _forecast_ error evaluates the efficacy of the model on the data from the following year (that were not used in the model fit). In each case, the mean absolute percent error (MAPE) is the statistic used to characterize the model fits. The interpolation error can be computed from the `Arima` object. to make things easy, the `sweep` package's `sw_glance` function is used: ```{r interp} library(sweep) roll_rs$interpolation <- map_dbl( roll_rs$arima, function(x) sw_glance(x)[["MAPE"]] ) summary(roll_rs$interpolation) ``` For the extrapolation error, the model and split objects are required. Using these: ```{r extrap} get_extrap <- function(split, mod) { n <- nrow(assessment(split)) # Get assessment data pred_dat <- assessment(split) %>% mutate( pred = as.vector(forecast(mod, h = n)$mean), pct_error = ( S4248SM144NCEN - pred ) / S4248SM144NCEN * 100 ) mean(abs(pred_dat$pct_error)) } roll_rs$extrapolation <- map2_dbl(roll_rs$splits, roll_rs$arima, get_extrap) summary(roll_rs$extrapolation) ``` What do these error estimates look like over time? ```{r plot} roll_rs %>% select(interpolation, extrapolation, start_date) %>% as.data.frame %>% gather(error, MAPE, -start_date) %>% ggplot(aes(x = start_date, y = MAPE, col = error)) + geom_point() + geom_line() + theme_bw() + theme(legend.position = "top") ``` It is likely that the interpolation error is an underestimate to some degree. It is also worth noting that `rolling_origin()` can be used over calendar periods, rather than just over a fixed window size. This is especially useful for irregular series where a fixed window size might not make sense because of missing data points, or because of calendar features like different months having a different number of days. The example below demonstrates this idea by splitting `drinks` into a nested set of 26 years, and rolling over years rather than months. Note that the end result accomplishes a different task than the original example, in this case, each slice moves forward an entire year, rather than just one month. ```{r rof-annual} # The idea is to nest by the period to roll over, # which in this case is the year. roll_rs_annual <- drinks %>% mutate(year = as.POSIXlt(date)$year + 1900) %>% nest(-year) %>% rolling_origin( initial = 20, assess = 1, cumulative = FALSE ) analysis(roll_rs_annual$splits[[1]]) ``` The workflow to access these calendar slices is to use `bind_rows()` to join each analysis set together. ```{r} mutate( roll_rs_annual, extracted_slice = map(splits, ~ bind_rows(analysis(.x)$data)) ) ``` rsample/vignettes/Applications/Survival_Analysis.Rmd0000644000176200001440000001253513507027211022501 0ustar liggesusers--- title: "Survival Analysis Example" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Survival Analysis Example} output: knitr:::html_vignette: toc: yes --- ```{r setup, include = FALSE} options(digits = 3) library(survival) library(purrr) library(rsample) library(dplyr) library(tidyposterior) library(ggplot2) library(tidyr) ``` In this article, a parametric analysis of censored data is conducted and `rsample` is used to measure the importance of predictors in the model. The data that will be used is the NCCTG lung cancer data contained in the `survival` package: ```{r lung} library(survival) str(lung) ``` `status` is an indicator for which patients are censored (`status = 1`) or an actual event (`status = 2`). The help file `?survreg` has the following model fit: ```{r example-model} lung_mod <- survreg(Surv(time, status) ~ ph.ecog + age + strata(sex), data = lung) summary(lung_mod) ``` Note that the stratification on gender only affects the scale parameter; the estimates above are from a log-linear model for the scale parameter even though they are listed with the regression variables for the other parameter. `coef` gives results that are more clear: ```{r coef} coef(lung_mod) ``` To resample these data, it would be a good idea to try to maintain the same censoring rate across the splits. To do this, stratified resampling can be used where each analysis/assessment split is conducted within each value of the status indicator. To demonstrate, Monte Carlo resampling is used where 75% of the data are in the analysis set. A total of 100 splits are created. ```{r splits} library(rsample) set.seed(9666) mc_samp <- mc_cv(lung, strata = "status", times = 100) library(purrr) cens_rate <- function(x) mean(analysis(x)$status == 1) summary(map_dbl(mc_samp$splits, cens_rate)) ``` To demonstrate the use of resampling with censored data, the parametric model shown above will be fit with different variable sets to characterize how important each predictor is to the outcome. To do this, a set of formulas are created for the different variable sets: ```{r forms} three_fact <- as.formula(Surv(time, status) ~ ph.ecog + age + strata(sex)) rm_ph.ecog <- as.formula(Surv(time, status) ~ age + strata(sex)) rm_age <- as.formula(Surv(time, status) ~ ph.ecog + strata(sex)) rm_sex <- as.formula(Surv(time, status) ~ ph.ecog + age ) ``` The model fitting function will take the formula as an argument: ```{r fit-func} mod_fit <- function(x, form, ...) survreg(form, data = analysis(x), ...) ``` To calculate the efficacy of the model, the concordance statistic is used (see `?survConcordance`): ```{r concord} get_concord <- function(split, mod, ...) { pred_dat <- assessment(split) pred_dat$pred <- predict(mod, newdata = pred_dat) concordance(Surv(time, status) ~ pred, pred_dat, ...)$concordance } ``` With these functions, a series of models are created for each variable set. ```{r models} mc_samp$mod_full <- map(mc_samp$splits, mod_fit, form = three_fact) mc_samp$mod_ph.ecog <- map(mc_samp$splits, mod_fit, form = rm_ph.ecog) mc_samp$mod_age <- map(mc_samp$splits, mod_fit, form = rm_age) mc_samp$mod_sex <- map(mc_samp$splits, mod_fit, form = rm_sex) ``` Similarly, the concordance values are computed for each model: ```{r concord-est} mc_samp$full <- map2_dbl(mc_samp$splits, mc_samp$mod_full, get_concord) mc_samp$ph.ecog <- map2_dbl(mc_samp$splits, mc_samp$mod_ph.ecog, get_concord) mc_samp$age <- map2_dbl(mc_samp$splits, mc_samp$mod_age, get_concord) mc_samp$sex <- map2_dbl(mc_samp$splits, mc_samp$mod_sex, get_concord) ``` The distributions of the resampling estimates ```{r concord-df} library(dplyr) concord_est <- mc_samp %>% dplyr::select(-matches("^mod")) library(tidyr) library(ggplot2) concord_est %>% gather() %>% ggplot(aes(x = statistic, col = model)) + geom_line(stat = "density") + theme_bw() + theme(legend.position = "top") ``` It looks as though the model missing `ph.ecog` has larger concordance values than the other models. As one might expect, the full model and the model absent `sex` are very similar; the difference in these models should only be the scale parameters estimates. To more formally test this, the `tidyposterior` package is used to create a Bayesian model for the concordance statistics. ```{r perf-mod, warning = FALSE, message = FALSE} library(tidyposterior) concord_est <- perf_mod(concord_est, seed = 6507, iter = 5000) concord_est$stan ``` To summarize the posteriors for each model: ```{r post} ggplot(tidy(concord_est)) + theme_bw() ``` While this seems clear-cut, let's assume that a difference in the concordance statistic of 0.1 is a real effect. To compute the posteriors for the difference in models, the full model will be contrasted with the others: ```{r diffs} comparisons <- contrast_models( concord_est, list_1 = rep("full", 3), list_2 = c("ph.ecog", "age", "sex"), seed = 4654 ) ``` The posterior distributions show that, statistically, `ph.ecog` has real importance ot the model. However, since these distributions are mostly with +/- 0.05, they are unlikely to be real differences. ```{r diff-post} ggplot(comparisons, size = 0.05) + theme_bw() ``` The ROPE statistics quantify the practical effects: ```{r diff-sum} summary(comparisons, size = 0.05) %>% dplyr::select(contrast, starts_with("pract")) ``` rsample/vignettes/Applications/Recipes_and_rsample.Rmd0000644000176200001440000001370313374565725023003 0ustar liggesusers--- title: "Recipes with rsample" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Recipes with rsample} output: knitr:::html_vignette: toc: yes --- ```{r setup, include = FALSE} options(digits = 3) library(rsample) library(recipes) library(purrr) ``` The [`recipes`](https://topepo.github.io/recipes/) package contains a data preprocessor that can be used to avoid the potentially expensive formula methods as well as providing a richer set of data manipulation tools than base R can provide. This document uses version `r packageDescription("recipes")$Version` of `recipes`. In many cases, the preprocessing steps might contain quantities that require statistical estimation of parameters, such as * signal extraction using principal component analysis * imputation of missing values * transformations of individual variables (e.g. Box-Cox transformations) It is critical that any complex preprocessing steps be contained _inside_ of resampling so that the model performance estimates take into account the variability of these steps. Before discussing how `rsample` can use recipes, let's look at an example recipe for the Ames housing data. ## An Example Recipe For illustration, the Ames housing data will be used. There are sale prices of homes along with various other descriptors for the property: ```{r ames-data, message=FALSE} library(AmesHousing) ames <- make_ames() names(ames) ``` Suppose that we will again fit a simple regression model with the formula: ```{r form, eval = FALSE} log10(Sale_Price) ~ Neighborhood + House_Style + Year_Sold + Lot_Area ``` The distribution of the lot size is right-skewed: ```{r build} library(ggplot2) theme_set(theme_bw()) ggplot(ames, aes(x = Lot_Area)) + geom_histogram(binwidth = 5000, col = "red", fill ="red", alpha = .5) ``` It might benefit the model if we estimate a transformation of the data using the Box-Cox procedure. Also, note that the frequencies of the neighborhoods can vary: ```{r hood} ggplot(ames, aes(x = Neighborhood)) + geom_bar() + coord_flip() + xlab("") ``` When these are resampled, some neighborhoods will not be included in the test set and this will result in a column of dummy variables with zero entires. The same is true for the `House_Style` variable. We might want to collapse rarely occurring values into "other" categories. To define the design matrix, an initial recipe is created: ```{r rec_setup, message=FALSE, warning=FALSE} library(recipes) rec <- recipe(Sale_Price ~ Neighborhood + House_Style + Year_Sold + Lot_Area, data = ames) %>% # Log the outcome step_log(Sale_Price, base = 10) %>% # Collapse rarely occurring jobs into "other" step_other(Neighborhood, House_Style, threshold = 0.05) %>% # Dummy variables on the qualitative predictors step_dummy(all_nominal()) %>% # Unskew a predictor step_BoxCox(Lot_Area) %>% # Normalize step_center(all_predictors()) %>% step_scale(all_predictors()) rec ``` This recreates the work that the formula method traditionally uses with the additional steps. While the original data object `ames` is used in the call, it is only used to define the variables and their characteristics so a single recipe is valid across all resampled versions of the data. The recipe can be estimated on the analysis component of the resample. If we execute the recipe on the entire data set: ```{r recipe-all} rec_training_set <- prep(rec, training = ames) rec_training_set ``` To get the values of the data, the `bake` function can be used: ```{r baked} # By default, the selector `everything()` is used to # return all the variables. Other selectors can be used too. bake(rec_training_set, new_data = head(ames)) ``` Note that there are fewer dummy variables for `Neighborhood` and `House_Style` than in the data. Also, the above code using `prep` benefits from the default argument of `retain = TRUE`, which keeps the processed version of the data set so that we don't have to reapply the steps to extract the processed values. For the data used to train the recipe, we would have used: ```{r juiced} juice(rec_training_set) %>% head ``` The next section will explore recipes and bootstrap resampling for modeling: ```{r boot} library(rsample) set.seed(7712) bt_samples <- bootstraps(ames) bt_samples bt_samples$splits[[1]] ``` ## Working with Resamples We can add a recipe column to the tibble. `recipes` has a connivence function called `prepper` that can be used to call `prep` but has the split object as the first argument (for easier purrring): ```{r col-pred} library(purrr) bt_samples$recipes <- map(bt_samples$splits, prepper, recipe = rec) bt_samples bt_samples$recipes[[1]] ``` Now, to fit the model, the fit function only needs the recipe as input. This is because the above code implicitly used the `retain = TRUE` option in `prep`. Otherwise, the split objects would also be needed to `bake` the recipe (as it will in the prediction function below). ```{r cols-fit} fit_lm <- function(rec_obj, ...) lm(..., data = juice(rec_obj, everything())) bt_samples$lm_mod <- map( bt_samples$recipes, fit_lm, Sale_Price ~ . ) bt_samples ``` To get predictions, the function needs three arguments: the splits (to get the assessment data), the recipe (to process them), and the model. To iterate over these, the function `purrr::pmap` is used: ```{r cols-pred} pred_lm <- function(split_obj, rec_obj, model_obj, ...) { mod_data <- bake( rec_obj, new_data = assessment(split_obj), all_predictors(), all_outcomes() ) out <- mod_data %>% select(Sale_Price) out$predicted <- predict(model_obj, newdata = mod_data %>% select(-Sale_Price)) out } bt_samples$pred <- pmap( lst( split_obj = bt_samples$splits, rec_obj = bt_samples$recipes, model_obj = bt_samples$lm_mod ), pred_lm ) bt_samples ``` Calculating the RMSE: ```{r cols-rmse} library(yardstick) results <- map_dfr(bt_samples$pred, rmse, Sale_Price, predicted) results mean(results$.estimate) ``` rsample/vignettes/Applications/diagram.png0000755000176200001440000266010313323650050020475 0ustar liggesusersPNG  IHDRoNse pHYs+ IDATx||D IDAT p IDATw IDATZP| IDAT||!X9 IDAT|I IDAT/ IDATS IDAT|| J IDATk IDAT39t IDATL IDAT||auq IDAT' IDAT IDAT%%%$$$???>>>6665 IDAT%%%>>>>>>RRR^^^ttt??? ÍMMMjjj||CCCKKK DDD;;;]]] چddd,,,fff ```FFFVVV IDATJJJ***]]]```^^^QQQDDDsssvvvbbb| IDATZZZxxxZZZJJJmmmlll 000!!!***\j IDATlllݛDcڙqڙqkkk111ݱrLLL...׬ ggj[111>>>Ѩ-Z6DV999((( ˦9X1@Y=== JA IDAT>>>Š:k-6 ===BBB(((羢I)))||㸣MN$ 1V:ȡ???CCC ? IDAT  " IDATv/7641/3D^|Ǘ\ɬ}:~Hqſ %Is=tGk!l oQ𾗙jO|x9ۇl oQ )))*** 8,$<|A˂VD:Cki\d!|A˂VD:+++555FFFү&: ү&:333q IDAT>>>xxx1 yuVq XtyuVq Xthhh===صq;rQ+' ï +' ï>>>===V<$ 4IXz;JcRQZ>%74\@fIG-bJcRQZ>%7???EEE   >>>2FH IDATwww___OOO___llllll||>>> NNNTTTNNNAb* IDATJJJSSSWWWcccsssmmmfffyyy!!!DDDCCCnnn!!!?h> IDATbbbWWWGGG UUU^^^ͬ 唔ooo+++הlll$$$xxx>>>>>>222_j IDATlllMMM>>>>>>"""RRR(((OOO 555```VVV000KKK^^^```ZZZ))) IDAT^^^```^^^"""OOO```||^^^)))  IDAT999```^^^000###```^^^777(((vgx IDAT___^^^???222___^^^FFF===___* IDAT^^^MMMHHH```^^^TTTSSS```riH IDAT ^^^>>>:::ccciii>>>Ӷ444||HHHʮGGG >>>A( IDATOOO***JJJ```:::444666UUU   000666UUU$  " "!Vϒ IDAT """...OOO333  $"$###   ...,,,v{v868 OOO &)-)868 ɕ-5-"&" 868 j IDAT 770 **+335!(.(*(*! !77/889))*HTHƽ202))#HHJ=F=EBEDD;RRT -4-USU33-DDF¿ $ ZXZN@ IDAT33*!!''+$$%"!%!a_a??5))- (-(QNQ<<2EE<(-( ' ,,#GG>6>6"'"||%~ * IDAT $$ RRHEPE18155.FF<=G=xzGqewif_-1@n8njbw<  3[  Ԁ8S#Pss:܌l hLnTfz-fh-/@n5knwifh-/@n5ǒbw< 3[  ϘxFI̍W7&Y^`o[j}+p шW[5L ՘^TZ?3 B<u>΂a@9pԈW[2MpԈW[2L ֘^TZ?3 ¿7.>TjX>8863#6F3,F^8K8  ?E.կ&9 2F3,2F3,B^4K8 08>?;AN  gI˭ijP 컁~Qj a|ˡpC IDAT >: $''%''#-ARa}7 4=س +% Ư0=ط  3lDcG)1=kmg()n& '[UpUFRR%a^zAA"+9FFJ J]RZc<%2#aXzKA!+9AFJ  CC;   s\Z5 QQTDMD s\Z~5   0/0 縂%׶d-   縊%׶d-  IDATEwx*IRp Epx*IRj   ' %     `b IDAT   !!! !  ++-   IDAT334)() ||668/./979668767668&%&CACbbb335%%' 868 IDAT+)+:::(((``` --0334545OOO$$$ZZZ OOQECEƼŻDCDQQQ XXX001QQRGFGSSSTTTBBD447'#'/./UUUPPP1 IDATSSU##( !*)*B?BWWWLLLNNQ !TQTYYYJJJXXZ++0))*pppvuv_]_ZZZDDDFFIMMQ;;;ZZZjjjFDF濳VTV\\\ BBB޹ք IDAT++#MMOӓ9A9IFI```ddd##$448ٺ؞ MMOeee535%%IFI```:::66/##$557|||+++𼿼!(!$)$%!%%$%dddaaa%% 557 ~~~/6/ 757ddd___  $$%~~~GGG/6/868ddd]]]Sg IDAT!!     !~~~  656#"#eee[[[||  666 %#%&&&dddYYYށ444mmm "      dddXXX%%%   eeeUUUq] IDAToooRRRXXX"""dddTTT@@@RRR   qqqJJJMMM溺cccrrrLLLZZZΠppptttMMMAi IDATЖiii SSS훛eeeLLLܗnnn,,,xxx뛛dddKKK閖iii999zzz霜dddIIIhhh<<<999{{{TTT݂^^^ EEE}}}WWW" IDATͥ[[[HHHSSS㜜dddDDD [[[888SSS✜dddAAA...[[['''SSSߜddd@@@PPP[[[SSSޜddd>>>1 IDAT夤[[["""ܜddd<<<ԥ\\\OOOSSSڜddd:::||[[[???SSS؜eee888ᰰAAASSS+++SSSכeee666PPPPPPSSSY IDAT՛ddd555䰰;;;PPP111SSSӜeee222PPPPPP ֊(((GGGVVVћddd111簰555QQQ333]]]lllϜeee///®PPPPPP !!!222Λddd---### J IDATָ444PPP666hhh,,,˜ eee+++666 縸 HHH ZZZXXX555555ʐ CCCHHHHHH###)))eeeFFFyyyJJJjjjeeeSSS)))SSSǗH IDATooo Ҹ666HHH???GGGnnn@@@RRR򉉉333PPPGGGqqqHHHGGG JJJ mmm (((HHHxxxMMMZZZKKKMMM~~~___AAABBBnnn ...```'''DDD DDD@@@䌌/// -p IDAT+++!!!|| [ IDATR'574.%+Ir꼂ΘmO3:ڑ:+574,%1S~ڙqj9%;XlkT-)Uv/772)%8]ڙqy>7QerteD$9~ȹe4O[J^KxTjxXC8893#-s??AN gIǭiyjIg,p>0<>??AN1Pisa`hTC'5=?)?DE.I+ryuVq Xt'5=?)?D߮vb9V$ ;XmuKoYYc:}0+8=???G  W<ʼ|BJ^muw׈I'5=?)?DmfTg.+= &'''''%0ARh}<"+ ϱܽ7(>= &'''''%Iv\ى*D($'''''&# (8I\r~j +' ïع̳/9*D($'''''&# yb0])  %V{B4?&'''''&!  ,%7%(ka+9RyoIL6&hOԑZQɈR< 0dlTLcboFQ5,zz)~W&-Vo XfUM^5!),%(ka+9RyoIL6&hO]S*'JOڐHbEEEHHHfCzzzIIIfC                   Tz 1v% 1v%He! IDAT '##(#(||P!o IDAT LLLQQQϵKKK111XXX---!!! 666www@@@444"""ѳKKK111LLL& IDAT˔!!!lll̢kkkZZZrrrVVV888픔lll666ܔ|||%%%|||XXXܔlll%%%픔lll666ǽ|||fff$$$˄ lll```888666؛,,,___uuu555<<>>GGG/g IDATyyy;;;BBBEEEBBBIIIYYYVVVIII&&&VVVFFFZZZШjjjJJJWWW gggXXXkkk{{{VVV fff쮮 xxxiii蔔lll111ggg}}}ttt~~~DDD IDATѐllllllBBBxxx[[[---ppp666///nnn777lll HHH BBB999>>>@@@$$$LLLCCCSSSOOO{{{QQQAAA||TTTܴddd```^^^bbbRRR IDATeeeuuu qqqGGGsssbbb╕ppp+++ΰKKKؔlll!!!kkk===ގYYY(((|||***kkkRRR***===mmmzzzggg999000ZZZ>>>NNNKKKt IDAT---]]]vvvKKK;;;rrr222(((JJJ>>>Ծ...bbb$$$̔lll999 :::///[[[!!!nnn\\\JJJ)))lll777ݔlll&&&nnn!!!KKKXXXlllbbb222000'''nnnݔlll&&&큁\\\sssoooBBBggg|||___~~~RI IDATKKKGGG QQQjjjeee???NNN:::vvv<<<HHH^^^ 555___򯯯///___EEEDDDכ???III+++OOOYYY]]]SSS<<@???++(../../...(+(')''*'!#!@>@???**'../../---||Ƕ141:?::>:HFHRRR..,..*..*.., ../778>>> ʴ.1.:?::>:IGIPPP ..,..*--*..+U5 IDAT//0556???ͳ*-*9>9;?; GFGQPQ **(..*..+..+../667>>>"%"6<6;?;УPOP^]^ռ@@;**(,,)ம++,DDEEEE;@;;?;ФKJK]\]ػBB<((&..+ᮮ(()CCDDDD =C=;>; ХEDE]\]ڼCC=((&--*ᮮ##%EEFCCC .3.X^X!646qpq﫫DD>DD@̼778JJK???*.*V]V"$"0.0pop򩩩 EE?DD@ͽ¼223KKL@@@$($U\U&(&*(*nmn DD>EEAл../KKLBBBqsq-3-IOIljlHGHZZSBB>hhj999rtr&,&HNH`^`LKL]]V@@<hhj===rsr & GMG¾§VSVPOP__X??;hhj@@@-0-hhaUUOݘhhj000y{y252hhaYYSᘘggi444npn696􊉊hha\\V䘘ggi777?y IDATϑ/&"-! Žőuus@@9__Y՘--1iijԋ/&"-! rro;;5bb\ؙ%%(mmo׈/&"-! ƪool550ee^ڙ rrtáxDܙyլGKGNWNjmjbb___YYM缐..+ ɗxDܙyլ@D@[f[egeXXUbb\YM缐++(}ЍxDܙyլ:>:jtj^a^LLIee^YM缐((&%%zzw鋍ȪW{$DZ4744;4興n}߼]]W00(ȪW{$DZ-/-?G?||yn}߼UUP<<3xxvstsȪW{$DZ&(<Lppmn}߼MMIFF>oolwwŤK{{1d-:fh-/@n5ϔq[_( atY`l{b|c/, 0~p$knwi﹈k"2tx8 SXSvkv%%%hhfᵮ_gb`u) ߱uc/,0~p#a%2t8 Ěo{ /9q<֝lu0%BSa%2t6iR AOzwW Ěo{ /9q<ۘNp@#]e& r(Xr;KKG❝ccdxxŤK{{1d-:fh-/@n5ϔq[_( atY`l{b|c/, 0~p$knwi﹈k"2tx8 MRM{q{+++aa`ᵮ_gb`u) ߱uc/,0~p#a%2t8 㽈m*Pc- ֝lu0%BVuc/,0~niR AOzwW Ěo{ /9q<ۘNp@#]e& r(Xr;EEA嗗iijwwŤK{{1d-:fh-/@n5ϔq[_( atY`l{b|c/, 0~p$knwi﹈k"2tx8 GKG111[[Yᵮ_gb`u) ߱uc/,0~p#a%2t8 㽈m*Pc- ֝lu0%BVuc/,0~p#f]*1bf(DĚo{ /9q<ۘNp@#]e& r(Xr;@@<撒nnp8:8KMuWʍQ]l7vڦ_FݑJR͑F]ڰXH֟YOȋDWi@vڑFc֨YOnFspGwq@>k% IDATKMuWʍQ]l7vڦ_FݑJR͑F]ڰXH֟YOȋDWi@vڑFc֨YO֩[F֬^GqWʍQ]l7vڦ_FݑJR͑F]ڰXH֟YO𒛒ȋDWi@vڑFc֨YO֩[F֬^Gqj262"" ֯!V<2X0æ98wt}.>jὶͪ>7ul.>j%@=  *19A2 #ὶͪ =C1 ( &)&ᴭWE *OA֧2F3,޻sbghg:a&ܬ_C98wt+>j252"" ֯!V<2X0æ98wt}.>jٹì>7ul98wt}%@=  *19A2 #ὶͪ =C1 (  ᴭWE *OA֧2F3,޻sbghg:a&ܬ_C98wt+>j252!!֯!V<2X0æ98wt}.>jٹì>7ul98wt}6E1 )4;B)(ὶͪ =C1 ( ݲU>.~F[·t5"dd4W9M)\e5qȞa 8 &gI˱ijQЦAz:B5qŞaz ;)gI˭ij"0.~F[·t5"dd4W9M)\e5qȞa 8 &gI˱ijQЦAz:B5qŞaz ;)gI˭ij>&5=;)?@A+I)n15qŞaz +"0.~F[·t5"dd4W9M)\e5qȞa 8 &gI˱ijQЦAz:B5qŞaz ;)gI˭ij>&5=;)?@A+I)n15qŞaz 7"0K !4E!0-\6-A,3C=A>S4(8D(2#&-A,3CO A>S4 ***((( ծ,ўHޠ\mU\$_ʹ%w 'QMpz?t!1E!.-\8 -=,/NQ434778&&$ͤQނ:ӅI>K !4E!0-\6-A,3CM(D(M/(8D(2#*!4E!0-\+A>S4 ((("""ծ,ўHޠ\mU\$_ʹ%w 'QMpz?t!1E!.-\8 -=,/NQ434667,,)ͤQނ:ӅI>K !4E!0-\6-A,3CM(D(M/(8D(2#*!4E!0-\7 A>S4 )))>>>.0.Ҧ%XhCPiSS8a5V_D2 NVi;) dk/$ lmg('h&YQYbbc__\BB>è?e  $QKz'~g2$ l kg()n& sUR0) MZӔUt08vlg()n& >sUR0) M[>]"o[G9lA==@=<=252Ҧ%XhCPiSS8a5V_D2 NVi;) dk/$ lmg('h&MDMbbcWWUGGCè?e  $QKz'~g2$ l kg()n&  RqoTL2&hBUt08vdc2$ l >sUR0) M[>]"o[G9lA225=;=8<8Ҧ%XhCPiSS8a5V_D2 NVi;) M IDATdk/$ lmg('h&BZBbbcPPNMMHè?e  $QKz'~g2$ l kg()n&  RqoTL2&hBUt08vdc2$ l >sUR0) M[>]"o[G9lA((,"""SVS ۟_z V  fefhhe         ssuGJG ۟_z V  mkm__\       ||<><ķ ۟_z V  sqsVVS       Ά̜Ӱ |v9½__]qqsҐӰ |v9ȴUUT vvy؄Ӱ |v9ЦKKJ{{~؂ " ۅppm11,gg`˙??B``a~~ " Όmmj,,'gg`Ι99>:))'ggi555V]V+.+NMNqpqEE?CC? EEFJJK222U\U.1.IHIonoDD?EE@!!¼DDEJJK555  U\U141BABnmn DD?DD?$$"ļDDEIIJ888š 7<7;?;ѫ^]^]\]⻻AA<))&.., ⫫DDEDDE>>>Ǣ9>9<@<ҧ^]^]\] 似DD?((%.., ⬬@@ADDE@@@ʢ8;! !HFHQQQ..+//+..,../556???&)&(*(&)&%#%@?@??? ...//0//0... Կ'*'&(&&)&@?@@@@ IDAT)))..///0/// %)%')'&)&@?@???"""//0..////!#!373  *)*)())))///%%"$$$ 474   *)**)****...""%%%585 *)**)*)))...!!&&&)+)((( )+) )))!!  *,* )))""       \ IDAT\ IDAT@ IDAT||- IDAT1 IDAT{-U IDAT, G IDAT|| | IDAT IDAT IDAT9k IDAT||@ IDATLio IDATr- IDATf{IDATc%G'oIENDB`rsample/vignettes/Applications/Intervals.Rmd0000644000176200001440000001704313512145132020770 0ustar liggesusers--- title: "Bootstrap Confidence Intervals" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Bootstrap Confidence Intervals} output: knitr:::html_vignette: toc: yes --- ```{r setup, include=FALSE} library(tidymodels) library(nlstools) library(GGally) theme_set(theme_bw()) ``` The bootstrap was originally intended for estimating confidence intervals for complex statistics whose variance properties are difficult to analytically derive. Davison and Hinkley's [_Bootstrap Methods and Their Application_](https://www.cambridge.org/core/books/bootstrap-methods-and-their-application/ED2FD043579F27952363566DC09CBD6A) is a great resource for these methods. `rsample` contains a few function to compute the most common types of intervals. To demonstrate the computations for the different types of intervals, we'll use a nonlinear regression example from [Baty _et al_ (2015)](https://www.jstatsoft.org/article/view/v066i05). They showed data that monitored oxygen uptake in a patient with rest and exercise phases (in the data frame `O2K`). ```{r O2K-dat} library(tidymodels) library(nlstools) library(GGally) data(O2K) ggplot(O2K, aes(x = t, y = VO2)) + geom_point() ``` The authors fit a segmented regression model where the transition point was known (this is the time when exercise commenced). Their model was: ```{r O2K-fit} nonlin_form <- as.formula( VO2 ~ (t <= 5.883) * VO2rest + (t > 5.883) * (VO2rest + (VO2peak - VO2rest) * (1 - exp(-(t - 5.883) / mu))) ) # Starting values from visual inspection start_vals <- list(VO2rest = 400, VO2peak = 1600, mu = 1) res <- nls(nonlin_form, start = start_vals, data = O2K) tidy(res) ``` `broom::tidy()` returns our analysis object in a standardized way. The column names shown here are used for most types of objects and this allows us to use the results more easily. For `rsample`, we'll rely on the `tidy()` method to work with bootstrap estimates when we need confidence intervals. There's an example at the end of a univariate statistic that isn't automatically formatted with `tidy()`. To run our model over different bootstraps, we'll write a function that uses the `split` object as input and produces a tidy data frame: ```{r model-info} # Will be used to fit the models to different bootstrap data sets: fit_fun <- function(split, ...) { # We could check for convergence, make new parameters, etc. nls(nonlin_form, data = analysis(split), ...) %>% tidy() } ``` First, let's create a set of resamples and fit separate models to each. The options `apparent = TRUE` will be set. This creates a final resample that is a copy of the original (unsampled) data set. This is required for some of the interval methods. ```{r resample} set.seed(462) nlin_bt <- bootstraps(O2K, times = 2000, apparent = TRUE) %>% mutate(models = map(splits, ~ fit_fun(.x, start = start_vals))) nlin_bt nlin_bt$models[[1]] ``` Let's look at the data and see if there any outliers or aberrant results: ```{r extract} nls_coef <- nlin_bt %>% dplyr::select(-splits) %>% # Turn it into a tibble by stacking the `models` col unnest() %>% # Get rid of unneeded columns dplyr::select(id, term, estimate) head(nls_coef) ``` Now let's create a scatterplot matrix: ```{r splom} nls_coef %>% # Put different parameters in columns tidyr::spread(term, estimate) %>% # Keep only numeric columns dplyr::select(-id) %>% ggscatmat(alpha = .25) ``` One potential outlier on the right for `VO2peak` but we'll leave it in. The univariate distributions are: ```{r hists} nls_coef %>% ggplot(aes(x = estimate)) + geom_histogram(bins = 20, col = "white") + facet_wrap(~ term, scales = "free_x") ``` ## Percentile intervals The most basic type of interval uses _percentiles_ of the resampling distribution. To get the percentile intervals, the `rset` object is passed as the first argument and the second argument is the list column of tidy results: ```{r pctl} p_ints <- int_pctl(nlin_bt, models) p_ints ``` When overlaid with the univariate distributions: ```{r pctl-plot} nls_coef %>% ggplot(aes(x = estimate)) + geom_histogram(bins = 20, col = "white") + facet_wrap(~ term, scales = "free_x") + geom_vline(data = p_ints, aes(xintercept = .lower), col = "red") + geom_vline(data = p_ints, aes(xintercept = .upper), col = "red") ``` How do these intervals compare to the parametric asymptotic values? ```{r int-compare} parametric <- tidy(res, conf.int = TRUE) %>% dplyr::select( term, .lower = conf.low, .estimate = estimate, .upper = conf.high ) %>% mutate( .alpha = 0.05, .method = "parametric" ) intervals <- bind_rows(parametric, p_ints) %>% arrange(term, .method) intervals %>% split(intervals$term) ``` The percentile intervals are wider than the parametric intervals (which assume asymptotic normality). Do the estimates appear to be normally distributed? We can look at quantile-quantile plots: ```{r qqplot} nls_coef %>% ggplot(aes(sample = estimate)) + stat_qq() + stat_qq_line(alpha = .25) + facet_wrap(~ term, scales = "free") ``` ## t-intervals Bootstrap _t_-intervals are estimated by computing intermediate statistics that are _t_-like in structure. To use these, we require the estimated variance _for each individual resampled estimate_. In our example, this comes along with the fitted model object. We can extract the standard errors of the parameters. Luckily, most `tidy()` provide this in a column named `std.error`. The arguments for these intervals are the same: ```{r t-ints} t_stats <- int_t(nlin_bt, models) intervals <- bind_rows(intervals, t_stats) %>% arrange(term, .method) intervals %>% split(intervals$term) ``` ## Bias-corrected and accelerated intervals For bias-corrected and accelerated (BCa) intervals, an additional argument is required. The `.fn` argument is a function that computes the statistic of interest. The first argument should be for the `rsplit` object and other arguments can be passed in using the ellipses. These intervals use an internal leave-one-out resample to compute the Jackknife statistic and will recompute the statistic for _every bootstrap resample_. If the statistic is expensive to compute, this may take some time. For those calculations, we use the `furrr` package so these can be computed in parallel if you have set up a parallel processing plan (see `?future::plan`). The user-facing function takes an argument for the function and the ellipses. ```{r bca-comp} bias_corr <- int_bca(nlin_bt, models, .fn = fit_fun, start = start_vals) intervals <- bind_rows(intervals, bias_corr) %>% arrange(term, .method) intervals %>% split(intervals$term) ``` ## No existing tidy method In this case, your function can emulate the minimum results: * a character column called `term`, * a numeric column called `estimate`, and, optionally, * a numeric column called `std.error`. The last column is only needed for `int_t`. Suppose we just want to estimate the fold-increase in the outcome between the 90th and 10th percentiles over the course of the experiment. Our function might look like: ```{r fold-foo} fold_incr <- function(split, ...) { dat <- analysis(split) quants <- quantile(dat$VO2, probs = c(.1, .9)) tibble( term = "fold increase", estimate = unname(quants[2]/quants[1]), # We don't know the analytical formula for this std.error = NA_real_ ) } ``` Everything else works the same as before: ```{r fold-ci} nlin_bt <- nlin_bt %>% mutate(folds = map(splits, fold_incr)) int_pctl(nlin_bt, folds) int_bca(nlin_bt, folds, .fn = fold_incr) ``` rsample/vignettes/Applications/Nested_Resampling.Rmd0000644000176200001440000002470313323650050022425 0ustar liggesusers--- title: "Nested Resampling" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Nested Resampling} output: knitr:::html_vignette: toc: yes --- ```{r setup, include=FALSE} library(rsample) library(purrr) library(dplyr) library(ggplot2) library(scales) library(mlbench) library(kernlab) theme_set(theme_bw()) ``` (A version of this article was originally published in the [_Applied Predictive Modeling_ blog](http://appliedpredictivemodeling.com/blog/2017/9/2/njdc83d01pzysvvlgik02t5qnaljnd)) A typical scheme for splitting the data when developing a predictive model is to create an initial split of the data into a training and test set. If resampling is used, it is executed on the training set. A series of binary splits is created. In `rsample`, we use the term _analysis set_ for the data that are used to fit the model and the _assessment set_ is used to compute performance: ![](diagram.png) A common method for tuning models is grid search where a candidate set of tuning parameters is created. The full set of models for every combination of the tuning parameter grid and the resamples is created. Each time, the assessment data are used to measure performance and the average value is determined for each tuning parameter. The potential problem is, once we pick the tuning parameter associated with the best performance, this performance value is usually quoted as the performance of the model. There is serious potential for _optimization bias_ since we uses the same data to tune the model and quote performance. This would result in an optimistic estimate of performance. Nested resampling does an additional layer of resampling that separates the tuning activities from the process used to estimate the efficacy of the model. An _outer_ resampling scheme is used and, for every split in the outer resample, another full set of resampling splits are created on the original analysis set. For example, if 10-fold cross-validation is used on the outside and 5-fold cross-validation on the inside, a total of 500 models will be fit. The parameter tuning will be conducted 10 times and the best parameters are determined from the average of the 5 assessment sets. This process occurs 10 times. Once the tuning results are complete, a model is fit to each of the outer resampling splits using the best parameter associated with that resample. The average of the outer method's assessment sets are a unbiased estimate of the model. We will simulate some regression data to illustrate the methods. The `mlbench` function `mlbench::mlbench.friedman1` can simulate a complex regression data structure from the [original MARS publication](https://scholar.google.com/scholar?hl=en&q=%22Multivariate+adaptive+regression+splines%22&btnG=&as_sdt=1%2C7&as_sdtp=). A training set size of 100 data points are generated as well as a large set that will be used to characterize how well the resampling procedure performed. ```{r sim-data} library(mlbench) sim_data <- function(n) { tmp <- mlbench.friedman1(n, sd=1) tmp <- cbind(tmp$x, tmp$y) tmp <- as.data.frame(tmp) names(tmp)[ncol(tmp)] <- "y" tmp } set.seed(9815) train_dat <- sim_data(100) large_dat <- sim_data(10^5) ``` To get started, the types of resampling methods need to be specified. This isn't a large data set, so 5 repeats of 10-fold cross validation will be used as the _outer_ resampling method that will be used to generate the estimate of overall performance. To tune the model, it would be good to have precise estimates for each of the values of the tuning parameter so 25 iterations of the bootstrap will be used. This means that there will eventually be `5 * 10 * 25 = 1250` models that are fit to the data _per tuning parameter_. These will be discarded once the performance of the model has been quantified. To create the tibble with the resampling specifications: ```{r tibble-gen} library(rsample) results <- nested_cv(train_dat, outside = vfold_cv(repeats = 5), inside = bootstraps(times = 25)) results ``` The splitting information for each resample is contained in the `split` objects. Focusing on the second fold of the first repeat: ```{r split-example} results$splits[[2]] ``` `<90/10/100>` indicates the number of data in the analysis set, assessment set, and the original data. Each element of `inner_resamples` has its own tibble with the bootstrapping splits. ```{r inner-splits} results$inner_resamples[[5]] ``` These are self-contained, meaning that the bootstrap sample is aware that it is a sample of a specific 90% of the data: ```{r inner-boot-split} results$inner_resamples[[5]]$splits[[1]] ``` To start, we need to define how the model will be created and measured. For our example, a radial basis support vector machine model will be created using the function `kernlab::ksvm`. This model is generally thought of as having _two_ tuning parameters: the SVM cost value and the kernel parameter `sigma`. For illustration, only the cost value will be tuned and the function `kernlab::sigest` will be used to estimate `sigma` during each model fit. This is automatically done by `ksvm`. After the model is fit to the analysis set, the root-mean squared error (RMSE) is computed on the assessment set. One important note: for this model, it is critical to center and scale the predictors before computing dot products. We don't do this operation here because `mlbench.friedman1` simulates all of the predictors to be standard uniform random variables. Our function to fit the model and compute the RMSE is: ```{r rmse-func} library(kernlab) # `object` will be an `rsplit` object from our `results` tibble # `cost` is the tuning parameter svm_rmse <- function(object, cost = 1) { y_col <- ncol(object$data) mod <- ksvm(y ~ ., data = analysis(object), C = cost) holdout_pred <- predict(mod, assessment(object)[-y_col]) rmse <- sqrt(mean((assessment(object)$y - holdout_pred) ^ 2, na.rm = TRUE)) rmse } # In some case, we want to parameterize the function over the tuning parameter: rmse_wrapper <- function(cost, object) svm_rmse(object, cost) ``` For the nested resampling, a model needs to be fit for each tuning parameter and each bootstrap split. To do this, a wrapper can be created: ```{r inner-tune-func} library(purrr) library(dplyr) # `object` will be an `rsplit` object for the bootstrap samples tune_over_cost <- function(object) { results <- tibble(cost = 2 ^ seq(-2, 8, by = 1)) results$RMSE <- map_dbl(results$cost, rmse_wrapper, object = object) results } ``` Since this will be called across the set of outer cross-validation splits, another wrapper is required: ```{r inner-func} # `object` is an `rsplit` object in `results$inner_resamples` summarize_tune_results <- function(object) { # Return row-bound tibble that has the 25 bootstrap results map_df(object$splits, tune_over_cost) %>% # For each value of the tuning parameter, compute the # average RMSE which is the inner bootstrap estimate. group_by(cost) %>% summarize(mean_RMSE = mean(RMSE, na.rm = TRUE), n = length(RMSE)) } ``` Now that those functions are defined, we can execute all the inner resampling loops: ```{r inner-runs} tuning_results <- map(results$inner_resamples, summarize_tune_results) ``` `tuning_results` is a list of data frames for each of the 50 outer resamples. Let's make a plot of the averaged results to see what the relationship is between the RMSE and the tuning parameters for each of the inner bootstrapping operations: ```{r rmse-plot, fig.height=4} library(ggplot2) library(scales) pooled_inner <- tuning_results %>% bind_rows best_cost <- function(dat) dat[which.min(dat$mean_RMSE),] p <- ggplot(pooled_inner, aes(x = cost, y = mean_RMSE)) + scale_x_continuous(trans = 'log2') + xlab("SVM Cost") + ylab("Inner RMSE") for (i in 1:length(tuning_results)) p <- p + geom_line(data = tuning_results[[i]], alpha = .2) + geom_point(data = best_cost(tuning_results[[i]]), pch = 16) p <- p + geom_smooth(data = pooled_inner, se = FALSE) p ``` Each grey line is a separate bootstrap resampling curve created from a different 90% of the data. The blue line is a loess smooth of all the results pooled together. To determine the best parameter estimate for each of the outer resampling iterations: ```{r choose, fig.height=4} cost_vals <- tuning_results %>% map_df(best_cost) %>% select(cost) results <- bind_cols(results, cost_vals) results$cost <- factor(results$cost, levels = paste(2 ^ seq(-2, 8, by = 1))) ggplot(results, aes(x = cost)) + geom_bar() + xlab("SVM Cost") + scale_x_discrete(drop = FALSE) ``` Most of the resamples produced and optimal cost values of 2.0 but the distribution is right-skewed due to the flat trend in the resampling profile once the cost value becomes 10 or larger. Now that we have these estimates, we can compute the outer resampling results for each of the `r nrow(results)` splits using the corresponding tuning parameter value: ```{r run-outer} results$RMSE <- map2_dbl(results$splits, results$cost, svm_rmse) summary(results$RMSE) ``` The estimated RMSE for the model tuning process is `r round(mean(results$RMSE), 2)`. What is the RMSE estimate for the non-nested procedure when only the outer resampling method is used? For each cost value in the tuning grid, `r nrow(results)` SVM models are fit and their RMSE values are averaged. The table of cost values and mean RMSE estimates is used to determine the best cost value. The associated RMSE is the biased estimate. ```{r not-nested, fig.height=4} not_nested <- map(results$splits, tune_over_cost) %>% bind_rows outer_summary <- not_nested %>% group_by(cost) %>% summarize(outer_RMSE = mean(RMSE), n = length(RMSE)) outer_summary ggplot(outer_summary, aes(x = cost, y = outer_RMSE)) + geom_point() + geom_line() + scale_x_continuous(trans = 'log2') + xlab("SVM Cost") + ylab("RMSE") ``` The non-nested procedure estimates the RMSE to be `r round(min(outer_summary$outer_RMSE), 2)`. Both estimates are fairly close. The approximately true RMSE for an SVM model with a cost value of 2.0 and be approximated with the large sample that was simulated at the beginning. ```{r large-sample-estimate} finalModel <- ksvm(y ~ ., data = train_dat, C = 2) large_pred <- predict(finalModel, large_dat[, -ncol(large_dat)]) sqrt(mean((large_dat$y - large_pred) ^ 2, na.rm = TRUE)) ``` The nested procedure produces a closer estimate to the approximate truth but the non-nested estimate is very similar. rsample/MD50000644000176200001440000001211713512203633012225 0ustar liggesusersb628168485f81a9dfd65a149effca335 *DESCRIPTION 3052462b24dbf3f8e60f0f0c6fb1143c *NAMESPACE 32d13c38443c2bd0c9c5f12e4b59b6bf *NEWS.md ac5edd1264c8caa83d6dc10dc0dc4e4d *R/aaa.R c2c141736c42e76263d90814f4ef1909 *R/apparent.R 0949271e47c85325a8b704ce402a5d7c *R/boot.R a74d46417ff23fa89078885f309da5b2 *R/bootci.R b2e5f5fb57549fae2a1fd68e60b9f602 *R/caret.R d773d61e1d103c712640a5d2f4b91276 *R/complement.R 65399092a57518f45b04b69ff44a14ae *R/data.R ad4a31e8e39ef38f7a144a5c70531fe1 *R/dplyr-compat.R ca3a7ad57b27b5a8a3bdd2e5ab40dba2 *R/form_pred.R 12ee2aaf93683ad700145c2673376751 *R/gather.R 05d72f200fd345eb13e1a8b25b1f0cb5 *R/groups.R 42eca1ebad56d30419bfa4b7ab2c8f95 *R/initial_split.R ba1be94c9df5ce1a56181aeab156bf52 *R/lables.R b914d9548f54f4234531e6b0a1ba849d *R/loo.R ccb68f428969d41db326659874c36558 *R/make_strata.R a1150ede350c8c9a64ec2ea3042b9360 *R/mc.R ab304f9c4ac75e150f2936e0c0ac88ef *R/misc.R 45d132e002b112067bf3a41d3a2e5452 *R/nest.R 4a63ec642f83ddfacc85b24f5c517466 *R/pkg.R bcd9913a1775f03d3c6321548f461aa6 *R/reexports.R e7c8d9f9c955c273263b110edd41656d *R/rolling_origin.R fc573033b201eb36add0a51b5a73a046 *R/rsplit.R 8c2553e57540b93caae9af3fa7c0b7f1 *R/tidy.R 3189f48c6a2d06d087d92561ece161b8 *R/vfold.R 11389583bd0de4efe15ef48475ada186 *build/vignette.rds 9e92477dce7c15dbeb8e8aed2b0e843a *data/attrition.RData 45dacc327c0ae4cb08faf44336ed1dbc *data/datalist 21a652bd9b0c72c2340917933206329c *data/drinks.rda 608c7dde9dce1b59be2c07b747e5baf3 *data/two_class_dat.RData 5d8c6f031f6fb898497d81dedb5dbd08 *inst/doc/Basics.R 4d07ed748161ff53aa250f7bcfdefc77 *inst/doc/Basics.Rmd 84045b94ed95b0eb08e3d7528dbb4eed *inst/doc/Basics.html ee8bac9d441b19eed54daf9416d11086 *inst/doc/Working_with_rsets.R 2aaab27e4714cab416083e626167a8a6 *inst/doc/Working_with_rsets.Rmd afb680e761bf843e06eef9c749d1478f *inst/doc/Working_with_rsets.html e240490eeffed1de6ff40e19d40ac2da *man/add_resample_id.Rd f92699fdd0cf4e6758319220d40590ee *man/apparent.Rd 9fa5e4bcd663cdc23f045b13cd2be661 *man/as.data.frame.rsplit.Rd 70d6d4280b523a2adaa1d8fe381e8ab2 *man/attrition.Rd 5c7d634f2be6ac48c82c9eb568cb3428 *man/bootstraps.Rd aab37a8a68058ca1f787cfbfd0387a33 *man/complement.Rd 304b924a181fe300f2b5fb837f53b841 *man/drinks.Rd 8c05929a19a5dbe8ad9adf09216aea58 *man/figures/logo.png d34b62ab6dd5afa68b5bad266683be87 *man/form_pred.Rd a2ffd46e86e34dc3f2e9f70b95ed8543 *man/gather.rset.Rd a3448766c9fe7e849c01b7407b69e55e *man/group_vfold_cv.Rd eefb027a119af54a6a9436b83ed0f3be *man/initial_split.Rd 9fe6aa440bf09d15c74866b4aa6c4131 *man/int_pctl.Rd 16663c40d1712ef8bfc718bc4ff11410 *man/labels.rset.Rd 6cfabf6b033634945e6f20c002f41d70 *man/labels.rsplit.Rd a25a18c3e7d20ec24335bc3ee9997ad3 *man/loo_cv.Rd bb007972952c6c227280e788d767ecf2 *man/make_strata.Rd 2cb52ae25e66c26910bff649f5f41824 *man/mc_cv.Rd 2ef072da610fdedd050a978703e3d2ae *man/nested_cv.Rd 5fb86cc49c30379acf17d63c13bebe34 *man/populate.Rd 954f5d29493614f1dd4a9e37af65ed50 *man/pretty.vfold_cv.Rd c29e44e700e69aefc5db3019cf39057a *man/reexports.Rd 7187eab8af4799b84ff0d8acc401636c *man/rolling_origin.Rd ff5c81f62fac8bcc8e147a4ffacfdfae *man/rsample.Rd d7f0332a6cc5b2c4648ffdc34541a407 *man/rsample2caret.Rd cdde013830f957f1ca360716b6d507ca *man/tidy.rsplit.Rd 11528b0332fa57e5975379da29e24f5f *man/two_class_dat.Rd 2ea6e9270f65e5ae9c87c34fe112fa14 *man/vfold_cv.Rd f4559eacae7dad0922c02a6e2b1a4cbd *tests/testthat.R d80b1360dfee5b9a0fca291eadd7ebb1 *tests/testthat/test_boot.R 333981ca043ce007c98855bdcaa1dd39 *tests/testthat/test_bootci.R afbd0a2af303e40aec311d87ee5456b2 *tests/testthat/test_caret.R 9052cb10ce5432b37db86b985fc147e1 *tests/testthat/test_dplyr.R d3ec6ffbf293869b0a35b044f26f2e59 *tests/testthat/test_for_pred.R af09f3032a7ce3d353553c463a6de6d3 *tests/testthat/test_gather.R 525bbfa0cf19d8dfa8a26bff29865cc9 *tests/testthat/test_group.R 4d4766a9dbdcb9e4a2e3673790ed0fb7 *tests/testthat/test_initial.R b2e67f2b59b4a2c498a09070aea3c5a4 *tests/testthat/test_labels.R c4e2dfa172ba4f9a83dfde9c58a4c787 *tests/testthat/test_loo.R 0aba3e1b429df04b99e0b8dd0f119ff4 *tests/testthat/test_mc.R 0dfd9da9127f2fd1a20ef3df34bd33da *tests/testthat/test_names.R 6886497dba1db48fddabab672a727d10 *tests/testthat/test_nesting.R 9c1f1d4c8eb514258a4f2860cbbe5eed *tests/testthat/test_rolling.R d42426d2fdb720f208a1fa78ba02649f *tests/testthat/test_rset.R d578dc9c699ff5204049c501288e8398 *tests/testthat/test_rsplit.R 520bf6410f8b2ef4e09ec87e95aa9309 *tests/testthat/test_strata.R 38c664815bd30cc28ecd4e3c0b069821 *tests/testthat/test_tidy.R 67f0c93838c6849ae969b76411627e77 *tests/testthat/test_vfold.R 62b665887b7ffb7ad99e5c96565350ff *vignettes/Applications/Intervals.Rmd ff3eecebd05ef6e1d6879022eac2bcb5 *vignettes/Applications/Keras.Rmd 343aaa30ca110d6946f9767085906ac8 *vignettes/Applications/Nested_Resampling.Rmd 4152ee52f573f508ec99c8152d9f05b0 *vignettes/Applications/Recipes_and_rsample.Rmd 0599118ac2d15bb8d30d4ca3c915a0f4 *vignettes/Applications/Survival_Analysis.Rmd 7fcc58a669eee70ef7855870b70a36d9 *vignettes/Applications/Time_Series.Rmd df1ca90e64750f622cc2fef6a66a0fa4 *vignettes/Applications/diagram.png 4d07ed748161ff53aa250f7bcfdefc77 *vignettes/Basics.Rmd 2aaab27e4714cab416083e626167a8a6 *vignettes/Working_with_rsets.Rmd rsample/build/0000755000176200001440000000000013512177703013022 5ustar liggesusersrsample/build/vignette.rds0000644000176200001440000000034513512177703015363 0ustar liggesusersuM 0۞ƍ; 6`$ wuNĊn&^giYqᛃfDvд#23-:¶gR |a Description: Classes and functions to create and summarize different types of resampling objects (e.g. bootstrap, cross-validation). Imports: dplyr, purrr, tibble, rlang (>= 0.4.0), methods, generics, utils, tidyselect, furrr Depends: R (>= 3.1), tidyr Suggests: ggplot2, testthat, rmarkdown, knitr, AmesHousing, recipes (>= 0.1.4), broom URL: https://tidymodels.github.io/rsample BugReports: https://github.com/tidymodels/rsample/issues License: GPL-2 Encoding: UTF-8 VignetteBuilder: knitr LazyData: true RoxygenNote: 6.1.1 NeedsCompilation: no Packaged: 2019-07-12 21:46:43 UTC; max Author: Max Kuhn [aut, cre], Fanny Chow [aut], Hadley Wickham [aut], RStudio [cph] Repository: CRAN Date/Publication: 2019-07-12 22:20:11 UTC rsample/man/0000755000176200001440000000000013512177703012476 5ustar liggesusersrsample/man/figures/0000755000176200001440000000000013507027211014132 5ustar liggesusersrsample/man/figures/logo.png0000644000176200001440000007111113507027211015601 0ustar liggesusersPNG  IHDRagAMA a cHRMz&u0`:pQ< pHYs  iTXtXML:com.adobe.xmp 1 1 2 ؀@IDATx}u{. JĂY5[Q1vF͗/!&N""EDإzw{{텥 }3Ϝ9šiWo ~j3:'tNyr=}gO:~x;.i֞f!غx2;3-kopJy[b>;@?jKMsz>Sk mI^ }q}ʹ5:xٗX8lX-))XVawk#["7cNÐ.phf8 ښ+l}+{bJXVzr/?7Wxnu"iW m @nê/K*-.ꝶgOIxzmoqO0R".,oxYf1N)pQ_Ri3r'Ju51XGohG1mY'e#/duȜnho+#pUaNoC_Xs2KuݽgO:NuIYԃ/|V Y, -BAtxp]: wvaVf..g6 4N6D~O]Z8E9:@CH-6Ul6H *ϢD Vpj/m֦sVY)֡mڴ8*D[inVE5`q:{XKF^LvDGFID ar,V|Q*Vn'.x-ti\ iY9wOJR$?9jgT).S:&ޢ8+A--%22R\.degIVn{^2e @u Et<Ώ1`QşS4lq \ *bZPn X$6&V%@@2$ 1F+JdBEfH'2ko᪼A^18^ mc$\Т[Bi9$1.^0) W!//O%E Vȼ!.X$2xF3=y;.},hhHr=@NjTFQ.su5Sܥn7C0hH,tcn>\55<"B.C2-El9_La׀CdY>MEd^"sجکZOZJO~jү!aS|MJiJg#"`V`"7 +**,p\tDNPpk^xX {<buBNh񔀩y١y6KсsN$Cná+/E9Od/fJ~Q!g:xg] ORjH>:]Z98oM(ir]+3Y<Ӭ8/E@^Rך5^ ' /InndBl(B2̰:y#1oΈy1es y3ͰhOFJ0^["@rV?J%{R5"r+/tRMfW"녏{j}(3O{`JfA%ZJ~JK#l.B |r+]9H\wws$'?Owu,(,n[8+pe!^7ӽy꿠8q%vc85݅gr fS+L%1@ɭ9U:֗(z"x ]7D HK~6ζ]eg@`"Ccu*`(.t"BTf'5D9 11 線rV UھcLOɻϗ)$vK iڨ󖹿Ej=>w =Uح-5L.i@4,ai<4j!g `E Pp$Lu asJjFauWޱ\qr\))~Vʞr>JPw!NόUvOtW ?{7FCLRn( %Mb` BPUs/塚])R˼5+tCw`ijgo>+A~8XˁA'TdL0T ,aZf ٬OM<^3&]kr0P+,dNlV)HOCmm|,%*[픲ԎG#Ɨo+%E7Ew9XKGcŜ"`  Qyo lPe;.A39OL@wx ͩK[-u( 2x[)RJNJcZS 4..] V8=,G!L}O.9F0ˆRkJdj27:5X+*)V]ʼ!W4rR5 {?zL@Rl\k l29o-^;\R(+=c%Z0"c>jV,tG|<]5Q~kn|?: @Ǧ1Ы]&MnwsUd˖l=&aw`nRhfVdڃLU0SCBʭ$GA! v[#|rB !"H sE؝ἷ>Z"-f"ilq|ąayM:uI,Du}ZxɁ nA˙=w+q$*2@B(.;n9{_4֡:+)rm_-'G}d U J&˩WrVCJ1r {?iݶG{;we]'x!bg"\j8M{w"{!#l)l`Wu?aqmP @T^,FdS`*2+UXJnEM,]e~2 \{ׁreޓI`5(yݳk>tX'u6 F/ U nP "Rw=hϖijlSڴXSQ"ԥR73&У^GfU@@Qr%w5~$B k>~Iy7Uq2mV4뇮:663#I.Ud2L@ Pu+nwݩ/\uja곹MlF.M5Rk+"w,41Ѩ79Qrpy[A |(Z|WpV.kb&+8C-E:/ߔSݬ,_rW.QF Fep_4ecFO[L]{brG[vLƹV^ "A@Le2}+uYp")U93;HGF~ȱV2`UV#`>6laRopեSS?*F4È ?V~7&؃P`C͘À&7]hv>Lyj"OmD{laZoCA жlBz b̀QK/ \xʬS _NEcECJs p̪qڃ:=gӕicْ}L.'ڷSE?qCzHjc$WW_>)D{a:Uxc^l@uan>&(W}0 0 QV&6?w'g\{J?5(h 8VJyjnW v7e~A :H.1q1JINfP5I]5^+{9)[X|4g=+χHʅRĆ1@ p͞BCҍ##\`2l{USDPzsu$4:݇w/M~r2{雋!2RI ZqWtC׻Q#ve2Ii<%(!r+Eq!@5`_ T+G Q@-.Q57``mFeDnJYpck!vʱO "z~B )g@x߃ţFʾx}a1G3>x['0)?6Q5;M Պy)_퐣DȼE[cz=;;޹hZgp Z! x0v{'=t"r0zh'@ Օ"LdNzD80VLlLrg{Ɩ [T@ʅ8)%HգXnlC}_띘ׯ70oN EZB"\\NzЫ AAN 5?ϬXଉe*LF֠eKY,=kTI,v{Cf=4`8 UXз+2*a4Ѥ5tH4PV+e$;X/]vh( 4*385~bО#RXP$a'9,+nwnw^UВR𡿳4z:{m3l(? d U6 gKm0pJTEXOId2S]Dw\Ң@@-2 VrC֘(f Qo+!g`_ t $6:Fq$G̔wȊͫe^'"Sd܀Q0]UәJ[q  c.K yVYtQTlX^zɼ Sml3˩ oM6!e+/bb.?Ҫ%xBH|80KNw!tAOn>*@ ]=hx4]򳗵۝:r\V*,ĂrEQ>0r"|cxˉ0ﻯ䱸?>dTv̒۶ŋȘArѰQPؔnqXQKZ\s*uko!?\)9Y.3DorhRTLٱX/W阚* q!cws>41 0>H=db pki|Ob+;=bY\gq{_9WbF +uQ,EXVVdJe^=T SG0  d<8h" Et9ع$Œ.L'Gw i4cэ|Cz7po^ywʵN-Ctrǫ yp)Tة[.q 7}/{{EM 1#RmJ̚49y"@V~@HW+3iV'hoP+{*l/bìԼ_%Aco]!Γ+RzOLzu6İ~ځr˝䳾eʙKal( n%Hjr$%9E1 P;[մS{\mZ.=˨l JȤ\RTqp3_%2X" ,[VV?}ě-*ἁ K܇\AVM:V`E F9h' 837c$t.{5Yx9sTL0y;~%22oavi$[^l۶U&}\6uco*\v43[VSu^+7\{ 2Xq| 2e3nЮ`\B.v:B/ʻ\9AkuPgȠܞoi&V:߹\QXWA_DTU.9(i9粵+2s1YɛzUA/rFƢV`1rM1rÔkߒsѼ5eK:%ͻW?7n{]&OUWo#W/F~usr(yn>t ?+7tހFM|K ڈtuNO]2fw_qg8 ~vxȭA{3S/|{|㐺j-{໫daۑ #;Y˾  i7HCztjI|l.=Z+5AL>r?`/l1gi<0];v}<$HD]0'=?H/ZDTD7EJN Hj9oj z;Z_5=E(jS7PYGӨ?dVB; VT Ab#1sl2 sȱu"6ȦI~Ҷ][ АDGNwtYo䊫m:W+}mi02lL9y2`hJ4Ջa7Yn2zph-8408x(<B-@{-!x}2"X,{wb qV9:uFJLيoLF3$w%XQV:93:L"ZDC7#6Z>kFK9;_YM;Kŧk.$ʆA ޒޚ*n;DbOiwHeksN'sd6)&C=? |Rt+Y!^aE~`Crrh0R:gP0&U쟟Bp}+] ܤ?Fޘ>AjŶ"+نw\]2W(Li&2֖"g'˹+幧zXDW5ϫ_z^p!' peG{;Tgf^J]RJr iZ"5@HAW,ƶ\cݔjQlA|TH/ p1(=/5a F4.C3{/3#*{~/nANs,WƗ[ۅ K(2vL8Bu\&B<' rRg.J9_' [r Cbc <9AX\+wT!:3F &(*𩾔R!K!&/Aq^!3#Vx|5n9xL6yŗg}Xr4pl8<&##Cv%+K&#oNZ< o@WsIt'˕K \(nq6t qi"MK +Hh(++fsϏQ!*x/2Eȅy,~)}?&2h8aY $þ fl?DF)GEd L*/K*r"RsmJCqE00xB!^F +VixB ,PA~V~~^>>= :lnO豣e{U[ʂ.{a8\('|Dc;PlDWWHaXXfX⬤uƕ绒@L6 kY%ظuffAWq>}9,E nϒ1үe;iݹUYNAx$q0u+{__MեU] ?C.60#{mT\]{* QLE"efE=E|75Wt)m!i XIH]FM8+YWߵ\K8dLDI  k8=иNMbdƀ"T{DȮDڟ9Y_圑En!!$*4z-RsZI78˸ǔߚ(!k5#8 蟠T#U 292' v%+ҥsu8JAsk]Z紌UΤ @r&"&J \),fc1imjޜl9I!L<$Ή-% h&"rgIQ$@[ 4~ʰ `27coU3~X&SP&@6QYpj1 UUdf 6KIo΋Qv!b d\itY&ܽS^,jD>W`, UaET85f yOv.Wl E0aPthR㫦f:L"hjwVr\И)˗ދO߮Ӷo٦Nܥ ֲ%w)8p˾W_k_-I," $(|fI(F`?8lp0kP& .wlQz&$)Y)rtPtw:CpE.xNL ̉k'"0ɀ)Oe<:!Goc9vvX^Kf}&={8}{Ɏ%==]vۍC{KKKa1^v+m[x, eylW8W`E9zr{Ph$G =Wq&`V#7k'\FF6Q,_ĶURK Z#㉕:uPc BuYbGjF4-@1~Grg匉.+=xOG?{N3CY"9*:cl61|ߦԜYD!ץWykiqӻ?א1 ?f(X{U4\a{'䡂Qgc 2p|Աl[M +WU6ZWd2(12K4 iqǟ&4*ҡC+`kO䕪3mOohHѰ%6k2VQZ"X9X6k7j@6jRB"ņhl#FNv?L U=v ~"io]VfWu6w6"ʮzISj#hUX]>ųrU7ZϬ \(Ǟ~\>bdT`kU#=i}<{ o: ,0hh[5mO=nd)1HDCd09z܈d'lp6ܶ|35col/XE)K*@+Lv|oA|^V"CAlξTcru eRtK%M~q5}}r`3x`> 4QUS<Z*)% cFb+teZ=6 "q00ӑq!9{A[[SLľNhK}&6:TˮfH/ߑ)c&[nvع^DU2_s^}# m-Isf/_b8 s0V nkKYkx L_-%Pu_F1BduaV lHUKWu=2 VB!wjE9ЁjAN)o,|WrY'1tY#Eʈm?L'7HOY*$dk[C {wp~$pBcQ)2E/"H#AɜďObr8SҪlaL2z"a8(qS$iU iFF]8Z.i?xH8ҥ@kȿLT NЖSwO حS_3ڹKܝה0Q^v"T<0fnFugVp,{ 9x0C`oKZ "s@Vp\ܦ3c|yG@gY} LΪu^.MƤKM-CTkىj"l;/Y"].AFjzUVKYf,9jxȻ1RPR \e&حpض ;܎4v/꘠N> BsY#灀)jO)aFN 46iАA2`9z!(pO[x!ٴi,YLY9+ u:hśKS߯sM',Nav ?8M 2red[\tFH8y+2P<8yYa$ 60FSs`:Չ*6i8seȈ2!r믐(mNHUs[xD5L!(⶛G1[U?ea;oQݠ6! O6iyIUvVn'y3lOʍ@yS2 Yva5VŠ 4&[zψXR__ ^6(sՕW#y87?\EОl%<\ϥ8- w:5٣g 6 wDʓJwgV-D>2"oȐEP'5{䴁m\=k *L:>1l8%EPW38j¬Ҫe+bz}%¨znNy*yh7锕i|/(nGLl?R U|g~x<\PfT^e\M㞄 a+;"S/qܨ/HtmS4=tVS:'9Y;UiAҦXejG3X&e)8Ѓq3=58322d-J ~xkS9^5"ƩXU X$lN)4>߶,+ O=tW?#kSeLYh2r\tDlB*qȴ;+n L;sPFP*6P+ `/f e v1|b)M2:S E }5a>u`z6iή@%R@83]>'Ʋ"q,|o$hU QNl̜Q0`>Uźp2|HP_ڜKgpsDryщT?AO(68-v9.X[\rJꦢ\勵5 PvU>haZ~i8P:;6J7|sT?}wP@CYʸq/rWwΜ}y*ʿ; (͛rfY8p!47;~E7|_#hiO !?g̃a|fVCeWe a'8{o3L_eI4q㣫,._显+"ty5;^;<3e#U>iU k.gV,wGѲ¹s!$|<`oShaF`3 ޱT,hdc@HXTȓPAϰ|FhLyM&sw u>YRU\z5'cYJarCݥbשy3ӧ2,Ӊ8U:mP3qT1]άJAZ>if+GAVEH$YqɲRLSzp:78[ 7?Dg %,BlfYb6 ,6q L&T;#!PH})_ɰۡۄTo+LkKYAp$(q+B˱ӡx07=AuI48؈7r% 鈼? ae,w(NyJǂ{\_1D#Kf^E3_#2'qOddRe 5ȜW BoMS9 3WS @\- d`BXݗɂ#ke+[ma*%K6ا5Z;\)]g C--/eҺ8j+m[`B8[Ȇ\LovP@~ȩte3-qRxUb/.9"h@7"b:H3FrJ 笭^ntGڡJeHRczK߸. qan+f"2BL##?N9bYWzT KTUߑ_(102̯0UN^/IDATw|:L!;J&hD;ȘV}tUE -Z}0~)o2mo$-xR9wȐuL+)ʓe7&+z](=Zu8e> %#L=.o/H<?<a$'V1-$/^Hdus?ըrT/u:ʤTʤ55 >mzKB$ecdV?7/"({]&#; AȳaD}f9Y^LPTeo\c䦁SEd<|[hNV]C1sI~&~\6YZǶ´2sϒo6ϗ׳Qh@jxV@6"` ,XAvI8սege!]Q AFuGâ$ ݐ/uYkiep=zМ@JT@{\t@6[Gݪ*ވlI3 F9KҽEgyh3wT2;DŽIT-嶤[sbs8 "*xʀut+AG68IUXngHvb]| P/|Iժ+7t @;U')pxu'N/mzJud g+qck=9'`)_SMS 76kRZ#\$ÒTe1?`oj+9T"=Y'y%EErK~k`<\eȹ-nkNv!Y9\xL"mo޲/|.-walWr0S 63D4b?A(GsVQ2gܬ ˬH.**W%ak:\Ⱦ+!:x;w:Wg㬔j|hu]]-_/g` &EeŰA(؈rvp_A~t~ш[b'x|˥ !*wd"v}WrXrTLcޟe_!e#DX&EuU$Dظm?is\EMIB.HN 1pԾdّ$T2 ^~ .*Sa Iqn5?>+ sע|xxr}#om)eRXfkjadYLH(հg; t4uUl~#bK<I G/eC:& $#|IUӠȞ6*ſO ˫iI Qm3@.*O̫vCfA<5ynrF0/NكAɝI2ug,w}&gʀܝ$apsߨ[w&{oD)DQƑ9]Ҡ[}_]FGtTp|OorQ/eѫ/tQZ}^)4 s'EǕraPSjH왽HN2팫82}_?P^ 0?`=|cF/AEW~.HvHdZk e{@L.! l I8LQd4_R<b `.}${}L9A5Uw,q#`aHXWvj@ ۖS$ceB#c8c,Ugb[K,zN<ư(75,20hCD4t^[i*Ao^rnT 6`d?FhX֥VWyeRl_,F @ - r3#⁡G.RcX!#Uqt>.RC}hyP'/GM*'|VJ$0[*r*3 e d\wx&hTfҴ,$!j5rDԩ`+*- [ڮDpɁx\R"8 ,1Ip2C2&db12O;(WϤ*n рݫvƲ=PV1aɏ 냰 3Naq=dA,F::n}=.{fDb0G;`|◢$EJٟstLht"/Bp%m/*"!hpwM]͕?Be_=^>ܱ@w(ڨ=ROPU==J/i91Y\EqntE$&'F|fg3*@ p,) *G+rvl a\)O5W> =zK@TeWl{UuЋs"IJ`\vfRM&ɇᘏn0RB|0Ι8\nV`_ 8<ɌہHoSӵ5fCj9B|ZK5|= iIװe悉aʰ8R1[r A{+ n)PY޷ОmVs]?kkr< h;ʨ-/k櫑Y~i"* jw l[qX_Buۊ xbp%sV.v _Z;> E¢  0O^MgT"WG=Z5a @6dA"@r50x51|slVxT>)\W2}?)6-rs@YSJGFwkz^$RQa5^Ø 3HH'1kA1ˀp4]'7s yP>-mj?;~G|)gGtݘg6AC`?k'2m_.YM ƾ`rY]# Z組'jUN F#C_2Sܐ;J rgi\Kv!%AYܛoW̓L{wsp,sSԠ94,`.bE *7OF3<~teL[e_̾yoGV3'#j6ڮ&F[Pir@[ɆŅWIW;VF0.4^\> Wd晷KO `DZ!ǘg8E+Y榼1n_+Vx3gnTHȶ=(gv%,־=v` NwQElھ-RTH~HbOpUDo%^Fj'-ҏl˞, . TGuiWQxxd;_`ǚ_\UXƂ4ةݴ@\ T.b!k}RS\/qs'P<#ichL_dޡfl0E3s?IVMDYrA3E{pH2>AriCOPt]% 6ح;QVemVs{ ,:0"h޾:)̎RhoK[)>8+ȅ%qc76g@J" n7weHAQuMK%>$[~Ş/eG-8kTEJ ] }y'pZp8T $chV0.hё}bu⼒`\"7B kŇa5/ZŊcXyn_aNT߶d7(o1Ų0k4T.qoDNDd}1; 0s`x̷u  rZ@֋UIvH)e( vɽC0Tnseaarx{X% Wv@czVX{P.d:Ҟ؝#oFST\PnV` @~0𦶦 kYj쐞Vy6cV_fƞ@u%4 LQB< pF]>hw 9"QJ%TAXvQ 㱜3``@3K.9,kGSn8=a:xBT,\G Jöbe\DW48uLG`|>Ɗ1 rQwþ꙽E(wpt:].dBTDz +O"SL:\uldeB}-uϚyޘh?c> pȮj6,F@%7j.>r4Fe^ 4 ΞApoPt$f;t|Q$X6V!Я(?r.:V/@}0N7@.pwN+.Rw#6\%[k_yȉIK}4{k4TҁDAR(W:}Nݛ@׌* {Nqe. =lwYVbU Z2^!VF8p[<f|U>HUrl=X/fSMp(qx x7]&[ 3Z =2ӮV(`ĉLLp"ر N(Z%p<<npH`QSk@`@- d{\bXs{罴oƜ"9M %55~'t[Ý^8ݐg MȢ"<wB̡Oߛ)&yWs;a`TIƔyh{U`lW.rk4>}mr)j]@/r@aT<. =r"ыe\(ӌlO!ixCVfDerH.Qͧ Ēda"|-dː h܈bd1}BK(HR#:Na*%yrgVdnB=uƘ/ȧOKz rEޭWI* ړ,b-Yvb>R±x%uVC(|1-?..n&2R臙ƻ܀p^ѨąJe>ƙ <]-dY[PsI;k+Y],[BPˋyA{+jLsE]7Ur@8Q)IV=J){ v)IXkfE"fl3/'J PNheZL?e{a22:Ej'?gѲ᳝_A|\/QVa;3a?GnP` @oֶ1{j_{`lPXRpN) 4GSH E؏Đl=%rS M}eJH)+1eXAFe4\>)yT׏CvLv䤅y ʥ{XPO)f+ B;\)foH`^zؔ.FNHh=9/!엇M{GLUF@#K'dwA9~E-(.b. @ߧ`?^ c}m;Hyr4)ʵ)FAsr33@MdЯ% m"#{5ofQ )$0Cw?diC' #4}ZTS3ƽѕ&1!D~/Zr  >XN"GYa2c6::#tYFJ'Y›_m"jxBK A%hXyp>27(GXGbDp9ru|#]Z`&ݩ>66VP ׭+Wx_ҫgEWw4Vz32fsB!yԺnTHgWRbvA[C&͐#>;2$We a&Z'P0+ IFbRF_1Z6fMU)edxp.sClLE> 1L<%DKXb*#D@Q %̐f=w湞mw=+j0c 4ó)g=b!DAy!ʆ DqIܤح90aceڬp+I-l~9_;Fvq{wAYY-]V|˔҉w!?n+w"j2|(UǶ" J(@aj fݸBhOBQ4gpoy踋`3Kෲf%ߪЇoqӇgN֩ N ~2!i$MJxi蜈z|Ļ, ^mZ%Nu!B] q!fks-al8 IPPe0Z=q[)Ԯ4 nP~K ނdZVMmlZ{Zh]<)pլ#aJoAx[|~cd@99V~wYT2>rd`=kՑmo}O@V~V],+>0_UcAMFöiێA_nkwUvlE|q35m7ꎼ kwwn{u).be9Owrl*d=ocDDH,P  gD~{0ꒆկv52qie8H[97>u9n қat2YfʺEo[D}j}uB:,]R iIP5=  ҪTzXS&ˏ>}~+ٱPAےڸOX 31U,Ľ[$lZ(~[l[v9ݺuNjBAdp,~|phJXauͰu[sh[SYYL'O7=7=h a@!)'šYuT4hiȉЌHIENDB`rsample/man/labels.rsplit.Rd0000644000176200001440000000104213507237102015532 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/lables.R \name{labels.rsplit} \alias{labels.rsplit} \title{Find Labels from rsplit Object} \usage{ \method{labels}{rsplit}(object, ...) } \arguments{ \item{object}{An \code{rsplit} object} \item{...}{Not currently used.} } \value{ A tibble. } \description{ Produce a tibble of identification variables so that single splits can be linked to a particular resample. } \examples{ cv_splits <- vfold_cv(mtcars) labels(cv_splits$splits[[1]]) } \seealso{ add_resample_id } rsample/man/labels.rset.Rd0000644000176200001440000000131213352536641015202 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/lables.R \name{labels.rset} \alias{labels.rset} \alias{labels.vfold_cv} \title{Find Labels from rset Object} \usage{ \method{labels}{rset}(object, make_factor = FALSE, ...) \method{labels}{vfold_cv}(object, make_factor = FALSE, ...) } \arguments{ \item{object}{An \code{rset} object} \item{make_factor}{A logical for whether the results should be character or a factor.} \item{...}{Not currently used.} } \value{ A single character or factor vector. } \description{ Produce a vector of resampling labels (e.g. "Fold1") from an \code{rset} object. Currently, \code{nested_cv} is not supported. } \examples{ labels(vfold_cv(mtcars)) } rsample/man/as.data.frame.rsplit.Rd0000644000176200001440000000226113352536641016710 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rsplit.R \name{as.data.frame.rsplit} \alias{as.data.frame.rsplit} \alias{analysis} \alias{assessment} \title{Convert an \code{rsplit} object to a data frame} \usage{ \method{as.data.frame}{rsplit}(x, row.names = NULL, optional = FALSE, data = "analysis", ...) analysis(x, ...) assessment(x, ...) } \arguments{ \item{x}{An \code{rsplit} object.} \item{row.names}{\code{NULL} or a character vector giving the row names for the data frame. Missing values are not allowed.} \item{optional}{A logical: should the column names of the data be checked for legality?} \item{data}{Either "analysis" or "assessment" to specify which data are returned.} \item{...}{Additional arguments to be passed to or from methods. Not currently used.} } \description{ The analysis or assessment code can be returned as a data frame (as dictated by the \code{data} argument) using \code{as.data.frame.rsplit}. \code{analysis} and \code{assessment} are shortcuts. } \examples{ library(dplyr) set.seed(104) folds <- vfold_cv(mtcars) model_data_1 <- folds$splits[[1]] \%>\% analysis() holdout_data_1 <- folds$splits[[1]] \%>\% assessment() } rsample/man/int_pctl.Rd0000644000176200001440000000561513507027211014600 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/bootci.R \name{int_pctl} \alias{int_pctl} \alias{int_t} \alias{int_bca} \title{Bootstrap confidence intervals} \usage{ int_pctl(.data, statistics, alpha = 0.05) int_t(.data, statistics, alpha = 0.05) int_bca(.data, statistics, alpha = 0.05, .fn, ...) } \arguments{ \item{.data}{A data frame containing the bootstrap resamples created using \code{bootstraps()}. For t- and BCa-intervals, the \code{apparent} argument should be set to \code{TRUE}.} \item{statistics}{An unquoted column name or \code{dplyr} selector that identifies a single column in the data set that contains the indiviual bootstrap estimates. This can be a list column of tidy tibbles (that contains columns \code{term} and \code{estimate}) or a simple numeric column. For t-intervals, a standard tidy column (usually called \code{std.err}) is required. See the examples below.} \item{alpha}{Level of significance} \item{.fn}{A function to calculate statistic of interest. The function should take an \code{rsplit} as the first argument and the \code{...} are required.} \item{...}{Arguments to pass to \code{.fn}.} } \value{ Each function returns a tibble with columns \code{.lower}, \code{.estimate}, \code{.upper}, \code{.alpha}, \code{.method}, and \code{term}. \code{.method} is the type of interval (eg. "percentile", "student-t", or "BCa"). \code{term} is the name of the estimate. } \description{ Calculate bootstrap confidence intervals using various methods. } \details{ Percentile intervals are the standard method of obtaining confidence intervals but require thousands of resamples to be accurate. t-intervals may need fewer resamples but require a corresponding variance estimate. Bias-corrected and accelerated intervals require the original function that was used to create the statistics of interest and are computationally taxing. } \examples{ library(broom) library(dplyr) library(purrr) library(tibble) lm_est <- function(split, ...) { lm(mpg ~ disp + hp, data = analysis(split)) \%>\% tidy() } set.seed(52156) car_rs <- bootstraps(mtcars, 1000, apparent = TRUE) \%>\% mutate(results = map(splits, lm_est)) int_pctl(car_rs, results) int_t(car_rs, results) int_bca(car_rs, results, .fn = lm_est) # putting results into a tidy format rank_corr <- function(split) { dat <- analysis(split) tibble( term = "corr", estimate = cor(dat$Sepal.Length, dat$Sepal.Width, method = "spearman"), # don't know the analytical std.err so no t-intervals std.err = NA_real_ ) } set.seed(69325) bootstraps(iris, 1000, apparent = TRUE) \%>\% mutate(correlations = map(splits, rank_corr)) \%>\% int_pctl(correlations) } \references{ Davison, A., & Hinkley, D. (1997). \emph{Bootstrap Methods and their Application}. Cambridge: Cambridge University Press. doi:10.1017/CBO9780511802843 \url{https://tidymodels.github.io/rsample/articles/Applications/Intervals.html} } rsample/man/complement.Rd0000644000176200001440000000134513352536641015135 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/complement.R \name{complement} \alias{complement} \title{Determine the Assessment Samples} \usage{ complement(x, ...) } \arguments{ \item{x}{An \code{rsplit} object} \item{...}{Not currently used} } \value{ A integer vector. } \description{ Given an \code{rsplit} object, \code{complement} will determine which of the data rows are contained in the assessment set. To save space, many of the \code{rset} objects will not contain indicies for the assessment split. } \examples{ set.seed(28432) fold_rs <- vfold_cv(mtcars) head(fold_rs$splits[[1]]$in_id) fold_rs$splits[[1]]$out_id complement(fold_rs$splits[[1]]) } \seealso{ \code{\link[=populate]{populate()}} } rsample/man/gather.rset.Rd0000644000176200001440000000304213352536641015214 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/gather.R \name{gather.rset} \alias{gather.rset} \title{Gather an \code{rset} Object} \usage{ \method{gather}{rset}(data, key = NULL, value = NULL, ..., na.rm = TRUE, convert = FALSE, factor_key = TRUE) } \arguments{ \item{data}{An \code{rset} object.} \item{key, value, ...}{Not specified in this method and will be ignored. Note that this means that selectors are ignored if they are passed to the function.} \item{na.rm}{If \code{TRUE}, will remove rows from output where the value column in \code{NA}.} \item{convert}{If \code{TRUE} will automatically run \code{type.convert()} on the key column. This is useful if the column names are actually numeric, integer, or logical.} \item{factor_key}{If FALSE, the default, the key values will be stored as a character vector. If \code{TRUE}, will be stored as a factor, which preserves the original ordering of the columns.} } \value{ A data frame with the ID columns, a column called \code{model} (with the previous column names), and a column called \code{statistic} (with the values). } \description{ This method uses \code{gather} on an \code{rset} object to stack all of the non-ID or split columns in the data and is useful for stacking model evaluation statistics. The resulting data frame has a column based on the column names of \code{data} and another for the values. } \examples{ library(rsample) cv_obj <- vfold_cv(mtcars, v = 10) cv_obj$lm_rmse <- rnorm(10, mean = 2) cv_obj$nnet_rmse <- rnorm(10, mean = 1) gather(cv_obj) } rsample/man/rsample.Rd0000644000176200001440000000307413352536641014436 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/pkg.R \docType{package} \name{rsample} \alias{rsample} \alias{rsample-package} \title{rsample: General Resampling Infrastructure for R} \description{ \pkg{rsample} has functions to create variations of a data set that can be used to evaluate models or to estimate the sampling distribution of some statistic. } \section{Terminology}{ \itemize{ \item A \strong{resample} is the result of a two-way split of a data set. For example, when bootstrapping, one part of the resample is a sample with replacement of the original data. The other part of the split contains the instances that were not contained in the bootstrap sample. The data structure \code{rsplit} is used to store a single resample. \item When the data are split in two, the portion that are used to estimate the model or calculate the statistic is called the \strong{analysis} set here. In machine learning this is sometimes called the "training set" but this would be poorly named since it might conflict with any initial split of the original data. \item Conversely, the other data in the split are called the \strong{assessment} data. In bootstrapping, these data are often called the "out-of-bag" samples. \item A collection of resamples is contained in an \code{rset} object. } } \section{Basic Functions}{ The main resampling functions are: \code{\link[=vfold_cv]{vfold_cv()}}, \code{\link[=bootstraps]{bootstraps()}}, \code{\link[=mc_cv]{mc_cv()}}, \code{\link[=rolling_origin]{rolling_origin()}}, and \code{\link[=nested_cv]{nested_cv()}}. } rsample/man/rsample2caret.Rd0000644000176200001440000000203113352536641015527 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/caret.R \name{rsample2caret} \alias{rsample2caret} \alias{caret2rsample} \title{Convert Resampling Objects to Other Formats} \usage{ rsample2caret(object, data = c("analysis", "assessment")) caret2rsample(ctrl, data = NULL) } \arguments{ \item{object}{An \code{rset} object. Currently, \code{nested_cv} is not supported.} \item{data}{The data that was originally used to produce the \code{ctrl} object.} \item{ctrl}{An object produced by \code{trainControl} that has had the \code{index} and \code{indexOut} elements populated by integers. One method of getting this is to extract the \code{control} objects from an object produced by \code{train}.} } \value{ \code{rsample2caret} returns a list that mimics the \code{index} and \code{indexOut} elements of a \code{trainControl} object. \code{caret2rsample} returns an \code{rset} object of the appropriate class. } \description{ These functions can convert resampling objects between \pkg{rsample} and \pkg{caret}. } rsample/man/vfold_cv.Rd0000644000176200001440000000530513512170471014565 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/vfold.R \name{vfold_cv} \alias{vfold_cv} \title{V-Fold Cross-Validation} \usage{ vfold_cv(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) } \arguments{ \item{data}{A data frame.} \item{v}{The number of partitions of the data set.} \item{repeats}{The number of times to repeat the V-fold partitioning.} \item{strata}{A variable that is used to conduct stratified sampling to create the folds. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{...}{Not currently used.} } \value{ A tibble with classes \code{vfold_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and one or more identification variables. For a single repeats, there will be one column called \code{id} that has a character string with the fold identifier. For repeats, \code{id} is the repeat number and an additional column called \code{id2} that contains the fold information (within repeat). } \description{ V-fold cross-validation randomly splits the data into V groups of roughly equal size (called "folds"). A resample of the analysis data consisted of V-1 of the folds while the assessment set contains the final fold. In basic V-fold cross-validation (i.e. no repeats), the number of resamples is equal to V. } \details{ The \code{strata} argument causes the random sampling to be conducted \emph{within the stratification variable}. The can help ensure that the number of data points in the analysis data is equivalent to the proportions in the original data set. When more than one repeat is requested, the basic V-fold cross-validation is conducted each time. For example, if three repeats are used with \code{v = 10}, there are a total of 30 splits which as three groups of 10 that are generated separately. } \examples{ vfold_cv(mtcars, v = 10) vfold_cv(mtcars, v = 10, repeats = 2) library(purrr) iris2 <- iris[1:130, ] set.seed(13) folds1 <- vfold_cv(iris2, v = 5) map_dbl(folds1$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) set.seed(13) folds2 <- vfold_cv(iris2, strata = "Species", v = 5) map_dbl(folds2$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) set.seed(13) folds3 <- vfold_cv(iris2, strata = "Petal.Length", breaks = 6, v = 5) map_dbl(folds3$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) } rsample/man/mc_cv.Rd0000644000176200001440000000430613512170471014052 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mc.R \name{mc_cv} \alias{mc_cv} \title{Monte Carlo Cross-Validation} \usage{ mc_cv(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) } \arguments{ \item{data}{A data frame.} \item{prop}{The proportion of data to be retained for modeling/analysis.} \item{times}{The number of times to repeat the sampling.} \item{strata}{A variable that is used to conduct stratified sampling to create the resamples. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{mc_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ One resample of Monte Carlo cross-validation takes a random sample (without replacement) of the original data set to be used for analysis. All other data points are added to the assessment set. } \details{ The \code{strata} argument causes the random sampling to be conducted \emph{within the stratification variable}. The can help ensure that the number of data points in the analysis data is equivalent to the proportions in the original data set. } \examples{ mc_cv(mtcars, times = 2) mc_cv(mtcars, prop = .5, times = 2) library(purrr) iris2 <- iris[1:130, ] set.seed(13) resample1 <- mc_cv(iris2, times = 3, prop = .5) map_dbl(resample1$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) set.seed(13) resample2 <- mc_cv(iris2, strata = "Species", times = 3, prop = .5) map_dbl(resample2$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) set.seed(13) resample3 <- mc_cv(iris2, strata = "Sepal.Length", breaks = 6, times = 3, prop = .5) map_dbl(resample3$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) } rsample/man/reexports.Rd0000644000176200001440000000061413356221300015006 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexports.R \docType{import} \name{reexports} \alias{reexports} \alias{tidy} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ \item{generics}{\code{\link[generics]{tidy}}} }} rsample/man/drinks.Rd0000644000176200001440000000115513356221017014253 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{drinks} \alias{drinks} \title{Sample Time Series Data} \source{ The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN } \value{ \item{drinks}{a data frame} } \description{ Sample Time Series Data } \details{ Drink sales. The exact name of the series from FRED is: "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales" } \examples{ data(drinks) str(drinks) } \keyword{datasets} rsample/man/two_class_dat.Rd0000644000176200001440000000067013352536641015620 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{two_class_dat} \alias{two_class_dat} \title{Two Class Data} \value{ \item{two_class_dat}{a data frame} } \description{ Two Class Data } \details{ There are artifical data with two predictors (\code{A} and \code{B}) and a factor outcome variable (\code{Class}). } \examples{ data(two_class_dat) str(two_class_dat) } \keyword{datasets} rsample/man/bootstraps.Rd0000644000176200001440000000563013512170471015164 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/boot.R \name{bootstraps} \alias{bootstraps} \title{Bootstrap Sampling} \usage{ bootstraps(data, times = 25, strata = NULL, breaks = 4, apparent = FALSE, ...) } \arguments{ \item{data}{A data frame.} \item{times}{The number of bootstrap samples.} \item{strata}{A variable that is used to conduct stratified sampling. When not \code{NULL}, each bootstrap sample is created within the stratification variable. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{apparent}{A logical. Should an extra resample be added where the analysis and holdout subset are the entire data set. This is required for some estimators used by the \code{summary} function that require the apparent error rate.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{bootstraps}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ A bootstrap sample is a sample that is the same size as the original data set that is made using replacement. This results in analysis samples that have multiple replicates of some of the original rows of the data. The assessment set is defined as the rows of the original data that were not included in the bootstrap sample. This is often referred to as the "out-of-bag" (OOB) sample. } \details{ The argument \code{apparent} enables the option of an additional "resample" where the analysis and assessment data sets are the same as the original data set. This can be required for some types of analysis of the bootstrap results. The \code{strata} argument is based on a similar argument in the random forest package were the bootstrap samples are conducted \emph{within the stratification variable}. The can help ensure that the number of data points in the bootstrap sample is equivalent to the proportions in the original data set. } \examples{ bootstraps(mtcars, times = 2) bootstraps(mtcars, times = 2, apparent = TRUE) library(purrr) iris2 <- iris[1:130, ] set.seed(13) resample1 <- bootstraps(iris2, times = 3) map_dbl(resample1$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) set.seed(13) resample2 <- bootstraps(iris2, strata = "Species", times = 3) map_dbl(resample2$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) set.seed(13) resample3 <- bootstraps(iris2, strata = "Sepal.Length", breaks = 6, times = 3) map_dbl(resample3$splits, function(x) { dat <- as.data.frame(x)$Species mean(dat == "virginica") }) } rsample/man/tidy.rsplit.Rd0000644000176200001440000000456013512170471015252 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tidy.R \name{tidy.rsplit} \alias{tidy.rsplit} \alias{tidy.rset} \alias{tidy.vfold_cv} \alias{tidy.nested_cv} \title{Tidy Resampling Object} \usage{ \method{tidy}{rsplit}(x, unique_ind = TRUE, ...) \method{tidy}{rset}(x, ...) \method{tidy}{vfold_cv}(x, ...) \method{tidy}{nested_cv}(x, ...) } \arguments{ \item{x}{A \code{rset} or \code{rsplit} object} \item{unique_ind}{Should unique row identifiers be returned? For example, if \code{FALSE} then bootstrapping results will include multiple rows in the sample for the same row in the original data.} \item{...}{Not currently used.} } \value{ A tibble with columns \code{Row} and \code{Data}. The latter has possible values "Analysis" or "Assessment". For \code{rset} inputs, identification columns are also returned but their names and values depend on the type of resampling. \code{vfold_cv} contains a column "Fold" and, if repeats are used, another called "Repeats". \code{bootstraps} and \code{mc_cv} use the column "Resample". } \description{ The \code{tidy} function from the \pkg{broom} package can be used on \code{rset} and \code{rsplit} objects to generate tibbles with which rows are in the analysis and assessment sets. } \details{ Note that for nested resampling, the rows of the inner resample, named \code{inner_Row}, are \emph{relative} row indices and do not correspond to the rows in the original data set. } \examples{ library(ggplot2) theme_set(theme_bw()) set.seed(4121) cv <- tidy(vfold_cv(mtcars, v = 5)) ggplot(cv, aes(x = Fold, y = Row, fill = Data)) + geom_tile() + scale_fill_brewer() set.seed(4121) rcv <- tidy(vfold_cv(mtcars, v = 5, repeats = 2)) ggplot(rcv, aes(x = Fold, y = Row, fill = Data)) + geom_tile() + facet_wrap(~Repeat) + scale_fill_brewer() set.seed(4121) mccv <- tidy(mc_cv(mtcars, times = 5)) ggplot(mccv, aes(x = Resample, y = Row, fill = Data)) + geom_tile() + scale_fill_brewer() set.seed(4121) bt <- tidy(bootstraps(mtcars, time = 5)) ggplot(bt, aes(x = Resample, y = Row, fill = Data)) + geom_tile() + scale_fill_brewer() dat <- data.frame(day = 1:30) # Resample by week instead of day ts_cv <- rolling_origin(dat, initial = 7, assess = 7, skip = 6, cumulative = FALSE) ts_cv <- tidy(ts_cv) ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) + geom_tile() + scale_fill_brewer() } rsample/man/pretty.vfold_cv.Rd0000644000176200001440000000152513352536641016122 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/lables.R \name{pretty.vfold_cv} \alias{pretty.vfold_cv} \alias{pretty.loo_cv} \alias{pretty.apparent} \alias{pretty.rolling_origin} \alias{pretty.mc_cv} \alias{pretty.nested_cv} \alias{pretty.bootstraps} \alias{pretty.group_vfold_cv} \title{Short Decriptions of rsets} \usage{ \method{pretty}{vfold_cv}(x, ...) \method{pretty}{loo_cv}(x, ...) \method{pretty}{apparent}(x, ...) \method{pretty}{rolling_origin}(x, ...) \method{pretty}{mc_cv}(x, ...) \method{pretty}{nested_cv}(x, ...) \method{pretty}{bootstraps}(x, ...) \method{pretty}{group_vfold_cv}(x, ...) } \arguments{ \item{x}{An \code{rset} object} \item{...}{Not currently used.} } \value{ A character vector. } \description{ Produce a chracter vector of describing the resampling method. } \keyword{internal} rsample/man/group_vfold_cv.Rd0000644000176200001440000000324013512132573015776 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/groups.R \name{group_vfold_cv} \alias{group_vfold_cv} \title{Group V-Fold Cross-Validation} \usage{ group_vfold_cv(data, group = NULL, v = NULL, ...) } \arguments{ \item{data}{A data frame.} \item{group}{This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} \item{v}{The number of partitions of the data set. If let \code{NULL}, \code{v} will be set to the number of unique values in the group.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{group_vfold_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and an identification variable. } \description{ Group V-fold cross-validation creates splits of the data based on some grouping variable (which may have more than a single row associated with it). The function can create as many splits as there are unique values of the grouping variable or it can create a smaller set of splits where more than one value is left out at a time. } \examples{ set.seed(3527) test_data <- data.frame(id = sort(sample(1:20, size = 80, replace = TRUE))) test_data$dat <- runif(nrow(test_data)) set.seed(5144) split_by_id <- group_vfold_cv(test_data, group = "id") get_id_left_out <- function(x) unique(assessment(x)$id) library(purrr) table(map_int(split_by_id$splits, get_id_left_out)) set.seed(5144) split_by_some_id <- group_vfold_cv(test_data, group = "id", v = 7) held_out <- map(split_by_some_id$splits, get_id_left_out) table(unlist(held_out)) # number held out per resample: map_int(held_out, length) } rsample/man/add_resample_id.Rd0000644000176200001440000000201113507237102016045 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/lables.R \name{add_resample_id} \alias{add_resample_id} \title{Augment a data set with resampling identifiers} \usage{ add_resample_id(.data, split, dots = FALSE) } \arguments{ \item{.data}{A data frame} \item{split}{A single \code{rset} object.} \item{dots}{A single logical: should the id columns be prefixed with a "." to avoid name conflicts with \code{.data}?} } \value{ An updated data frame. } \description{ For a data set, \code{add_resample_id()} will add at least one new column that identifies which resample that the data came from. In most cases, a single column is added but for some resampling methods two or more are added. } \examples{ set.seed(363) car_folds <- vfold_cv(mtcars, repeats = 3) analysis(car_folds$splits[[1]]) \%>\% add_resample_id(car_folds$splits[[1]]) \%>\% head() car_bt <- bootstraps(mtcars) analysis(car_bt$splits[[1]]) \%>\% add_resample_id(car_bt$splits[[1]]) \%>\% head() } \seealso{ labels.rsplit } rsample/man/initial_split.Rd0000644000176200001440000000363313512170471015631 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/initial_split.R \name{initial_split} \alias{initial_split} \alias{initial_time_split} \alias{training} \alias{testing} \title{Simple Training/Test Set Splitting} \usage{ initial_split(data, prop = 3/4, strata = NULL, breaks = 4, ...) initial_time_split(data, prop = 3/4, ...) training(x) testing(x) } \arguments{ \item{data}{A data frame.} \item{prop}{The proportion of data to be retained for modeling/analysis.} \item{strata}{A variable that is used to conduct stratified sampling to create the resamples. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{...}{Not currently used.} \item{x}{An \code{rsplit} object produced by \code{initial_split}} } \value{ An \code{rset} object that can be used with the \code{training} and \code{testing} functions to extract the data in each split. } \description{ \code{initial_split} creates a single binary split of the data into a training set and testing set. \code{initial_time_split} does the same, but takes the \emph{first} \code{prop} samples for training, instead of a random selection. \code{training} and \code{testing} are used to extract the resulting data. } \details{ The \code{strata} argument causes the random sampling to be conducted \emph{within the stratification variable}. The can help ensure that the number of data points in the training data is equivalent to the proportions in the original data set. } \examples{ set.seed(1353) car_split <- initial_split(mtcars) train_data <- training(car_split) test_data <- testing(car_split) drinks_split <- initial_time_split(drinks) train_data <- training(drinks_split) test_data <- testing(car_split) c(max(train_data$date), min(test_data$date)) # no overlap } rsample/man/rolling_origin.Rd0000644000176200001440000000523213512170471015777 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rolling_origin.R \name{rolling_origin} \alias{rolling_origin} \title{Rolling Origin Forecast Resampling} \usage{ rolling_origin(data, initial = 5, assess = 1, cumulative = TRUE, skip = 0, ...) } \arguments{ \item{data}{A data frame.} \item{initial}{The number of samples used for analysis/modeling in the initial resample.} \item{assess}{The number of samples used for each assessment resample.} \item{cumulative}{A logical. Should the analysis resample grow beyond the size specified by \code{initial} at each resample?.} \item{skip}{A integer indicating how many (if any) \emph{additional} resamples to skip to thin the total amount of data points in the analysis resample. See the example below.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{rolling_origin}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and a column called \code{id} that has a character string with the resample identifier. } \description{ This resampling method is useful when the data set has a strong time component. The resamples are not random and contain data points that are consecutive values. The function assumes that the original data set are sorted in time order. } \details{ The main options, \code{initial} and \code{assess}, control the number of data points from the original data that are in the analysis and assessment set, respectively. When \code{cumulative = TRUE}, the analysis set will grow as resampling continues while the assessment set size will always remain static. \code{skip} enables the function to not use every data point in the resamples. When \code{skip = 0}, the resampling data sets will increment by one position. Suppose that the rows of a data set are consecutive days. Using \code{skip = 6} will make the analysis data set operate on \emph{weeks} instead of days. The assessment set size is not affected by this option. } \examples{ set.seed(1131) ex_data <- data.frame(row = 1:20, some_var = rnorm(20)) dim(rolling_origin(ex_data)) dim(rolling_origin(ex_data, skip = 2)) dim(rolling_origin(ex_data, skip = 2, cumulative = FALSE)) # You can also roll over calendar periods by first nesting by that period, # which is especially useful for irregular series where a fixed window # is not useful. This example slides over 5 years at a time. library(dplyr) data(drinks) drinks_annual <- drinks \%>\% mutate(year = as.POSIXlt(date)$year + 1900) \%>\% nest(-year) multi_year_roll <- rolling_origin(drinks_annual, cumulative = FALSE) analysis(multi_year_roll$splits[[1]]) assessment(multi_year_roll$splits[[1]]) } rsample/man/nested_cv.Rd0000644000176200001440000000424013352536641014741 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nest.R \name{nested_cv} \alias{nested_cv} \title{Nested or Double Resampling} \usage{ nested_cv(data, outside, inside) } \arguments{ \item{data}{A data frame.} \item{outside}{The initial resampling specification. This can be an already created object or an expression of a new object (see the examples below). If the latter is used, the \code{data} argument does not need to be specified and, if it is given, will be ignored.} \item{inside}{An expression for the type of resampling to be conducted within the initial procedure.} } \value{ An tibble with classe \code{nested_cv} and any other classes that outer resampling process normally contains. The results include a column for the outer data split objects, one or more \code{id} columns, and a column of nested tibbles called \code{inner_resamples} with the additional resamples. } \description{ \code{nested_cv} can be used to take the results of one resampling procedure and conduct further resamples within each split. Any type of resampling used in \code{rsample} can be used. } \details{ It is a bad idea to use bootstrapping as the outer resampling procedure (see the example below) } \examples{ ## Using expressions for the resampling procedures: nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5)) ## Using an existing object: folds <- vfold_cv(mtcars) nested_cv(mtcars, folds, inside = bootstraps(times = 5)) ## The dangers of outer bootstraps: set.seed(2222) bad_idea <- nested_cv(mtcars, outside = bootstraps(times = 5), inside = vfold_cv(v = 3)) first_outer_split <- bad_idea$splits[[1]] outer_analysis <- as.data.frame(first_outer_split) sum(grepl("Volvo 142E", rownames(outer_analysis))) ## For the 3-fold CV used inside of each bootstrap, how are the replicated ## `Volvo 142E` data partitioned? first_inner_split <- bad_idea$inner_resamples[[1]]$splits[[1]] inner_analysis <- as.data.frame(first_inner_split) inner_assess <- as.data.frame(first_inner_split, data = "assessment") sum(grepl("Volvo 142E", rownames(inner_analysis))) sum(grepl("Volvo 142E", rownames(inner_assess))) } rsample/man/form_pred.Rd0000644000176200001440000000167213352536641014752 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/form_pred.R \name{form_pred} \alias{form_pred} \title{Extract Predictor Names from Formula or Terms} \usage{ form_pred(object, ...) } \arguments{ \item{object}{A model formula or \code{\link[stats:terms]{stats::terms()}} object.} \item{...}{Arguments to pass to \code{\link[=all.vars]{all.vars()}}} } \value{ A character vector of names } \description{ \code{all.vars} returns all variables used in a formula. This function only returns the variables explicitly used on the right-hand side (i.e., it will not resolve dots unless the object is terms with a data set specified). } \examples{ form_pred(y ~ x + z) form_pred(terms(y ~ x + z)) form_pred(y ~ x + log(z)) form_pred(log(y) ~ x + z) form_pred(y1 + y2 ~ x + z) form_pred(log(y1) + y2 ~ x + z) # will fail: # form_pred(y ~ .) form_pred(terms(Species ~ (.)^2, data = iris)) form_pred(terms( ~ (.)^2, data = iris)) } rsample/man/apparent.Rd0000644000176200001440000000176213450421255014600 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/apparent.R \name{apparent} \alias{apparent} \title{Sampling for the Apparent Error Rate} \usage{ apparent(data, ...) } \arguments{ \item{data}{A data frame.} \item{...}{Not currently used.} } \value{ A tibble with a single row and classes \code{apparent}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and one column called \code{id} that has a character string with the resample identifier. } \description{ When building a model on a data set and re-predicting the same data, the performance estimate from those predictions is often call the "apparent" performance of the model. This estimate can be wildly optimistic. "Apparent sampling" here means that the analysis and assessment samples are the same. These resamples are sometimes used in the analysis of bootstrap samples and should otherwise be avoided like old sushi. } \examples{ apparent(mtcars) } rsample/man/populate.Rd0000644000176200001440000000136013352536641014620 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/complement.R \name{populate} \alias{populate} \title{Add Assessment Indicies} \usage{ populate(x, ...) } \arguments{ \item{x}{A \code{rsplit} and \code{rset} object.} \item{...}{Not currently used} } \value{ An object of the same kind with the integer indicies. } \description{ Many \code{rsplit} and \code{rset} objects do not contain indicators for the assessment samples. \code{populate()} can be used to fill the slot for the appropriate indices. } \examples{ set.seed(28432) fold_rs <- vfold_cv(mtcars) fold_rs$splits[[1]]$out_id complement(fold_rs$splits[[1]]) populate(fold_rs$splits[[1]])$out_id fold_rs_all <- populate(fold_rs) fold_rs_all$splits[[1]]$out_id } rsample/man/attrition.Rd0000644000176200001440000000153013352536641015003 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{attrition} \alias{attrition} \title{Job Attrition} \source{ The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ } \value{ \item{attrition}{a data frame} } \description{ Job Attrition } \details{ These data are from the IBM Watson Analytics Lab. The website describes the data with \dQuote{Uncover the factors that lead to employee attrition and explore important questions such as \sQuote{show me a breakdown of distance from home by job role and attrition} or \sQuote{compare average monthly income by education and attrition}. This is a fictional data set created by IBM data scientists.}. There are 1470 rows. } \examples{ data(attrition) str(attrition) } \keyword{datasets} rsample/man/make_strata.Rd0000644000176200001440000000462313512132573015261 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/make_strata.R \name{make_strata} \alias{make_strata} \title{Create or Modify Stratification Variables} \usage{ make_strata(x, breaks = 4, nunique = 5, pool = 0.15, depth = 20) } \arguments{ \item{x}{An input vector.} \item{breaks}{A single number giving the number of bins desired to stratify a numeric stratification variable.} \item{nunique}{An integer for the number of unique value threshold in the algorithm.} \item{pool}{A proportion of data used to determine if a particular group is too small and should be pooled into another group.} \item{depth}{An integer that is used to determine the best number of percentiles that should be used. The number of bins are based on \code{min(5, floor(n / depth))} where \code{n = length(x)}. If \code{x} is numeric, there must be at least 40 rows in the data set (when \code{depth = 20}) to conduct stratified sampling.} } \value{ A factor vector. } \description{ For stratified resampling, this function can create strata from numeric data and also make non-numeric data more conducive to be used for stratification. } \details{ For numeric data, if the number of unique levels is less than \code{nunique}, the data are treated as categorical data. For categorical inputs, the function will find levels of \code{x} than occur in the data with percentage less than \code{pool}. The values from these groups will be randomly assigned to the remaining strata (as will data points that have missing values in \code{x}). For numeric data with more unique values than \code{nunique}, the data will be converted to being categorical based on percentiles of the data. The percentile groups will have no more than 20 percent of the data in each group. Again, missing values in \code{x} are randomly assigned to groups. } \examples{ set.seed(61) x1 <- rpois(100, lambda = 5) table(x1) table(make_strata(x1)) set.seed(554) x2 <- rpois(100, lambda = 1) table(x2) table(make_strata(x2)) # small groups are randomly assigned x3 <- factor(x2) table(x3) table(make_strata(x3)) # `oilType` data from `caret` x4 <- rep(LETTERS[1:7], c(37, 26, 3, 7, 11, 10, 2)) table(x4) table(make_strata(x4)) table(make_strata(x4, pool = 0.1)) table(make_strata(x4, pool = 0.0)) # not enough data to stratify x5 <- rnorm(20) table(make_strata(x5)) set.seed(483) x6 <- rnorm(200) quantile(x6, probs = (0:10)/10) table(make_strata(x6, breaks = 10)) } rsample/man/loo_cv.Rd0000644000176200001440000000141313512170471014240 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/loo.R \name{loo_cv} \alias{loo_cv} \title{Leave-One-Out Cross-Validation} \usage{ loo_cv(data, ...) } \arguments{ \item{data}{A data frame.} \item{...}{Not currently used.} } \value{ An tibble with classes \code{loo_cv}, \code{rset}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include a column for the data split objects and one column called \code{id} that has a character string with the resample identifier. } \description{ Leave-one-out (LOO) cross-validation uses one data point in the original set as the assessment data and all other data points as the analysis set. A LOO resampling set has as many resamples as rows in the original data set. } \examples{ loo_cv(mtcars) }