Skip to contents

This function implements model-consistent Lasso estimation through the bootstrap. It supports parallel processing by way of the future package, allowing the user to flexibly specify many parallelization methods. This method was developed as a variable-selection algorithm, but this package also supports making ensemble predictions on new data using the bagged Lasso models.

Usage

bolasso(
  formula,
  data,
  n.boot = 100,
  progress = TRUE,
  implement = c("glmnet", "gamlr"),
  x = NULL,
  y = NULL,
  fast = FALSE,
  ...
)

Arguments

formula

An optional object of class formula (or one that can be coerced to that class): a symbolic description of the model to be fitted. Can be omitted when x and y are non-missing.

data

An optional object of class data.frame that contains the modeling variables referenced in formula. Can be omitted when x and y are non-missing.

n.boot

An integer specifying the number of bootstrap replicates.

progress

A boolean indicating whether to display progress across bootstrap replicates.

implement

A character; either 'glmnet' or 'gamlr', specifying which Lasso implementation to utilize. For specific modeling details, see glmnet::cv.glmnet or gamlr::cv.gamlr.

x

An optional predictor matrix in lieu of formula and data.

y

An optional response vector in lieu of formula and data.

fast

A boolean. Whether or not to fit a "fast" bootstrap procedure. If fast == TRUE, bolasso will fit glmnet::cv.glmnet on the entire dataset. It will then fit all bootstrapped models with the value of lambda (regularization parameter) that minimized cross-validation loss in the full model. If fast == FALSE (the default), bolasso will use cross-validation to find the optimal lambda for each bootstrap model.

...

Additional parameters to pass to either glmnet::cv.glmnet or gamlr::cv.gamlr.

Value

An object of class bolasso. This object is a list of length n.boot of cv.glmnet or cv.gamlr objects.

See also

glmnet::cv.glmnet and gamlr::cv.gamlr for full details on the respective implementations and on the additional arguments that can be passed through the ... argument.

Examples

mtcars[, c(2, 10:11)] <- lapply(mtcars[, c(2, 10:11)], as.factor)
idx <- sample(nrow(mtcars), 22)
mtcars_train <- mtcars[idx, ]
mtcars_test <- mtcars[-idx, ]

## Formula Interface

# Train model
set.seed(123)
bolasso_form <- bolasso(
  form = mpg ~ .,
  data = mtcars_train,
  n.boot = 20,
  nfolds = 5
)

# Retrieve a tidy tibble of bootstrap coefficients for each covariate
tidy(bolasso_form)
#> # A tibble: 20 × 19
#>    id     Intercept   cyl4   cyl6   cyl8     disp       hp   drat     wt   qsec
#>    <chr>      <dbl>  <dbl>  <dbl>  <dbl>    <dbl>    <dbl>  <dbl>  <dbl>  <dbl>
#>  1 boot1      33.6  0.342  -0.541  0      0       -0.0251   0     -3.62  0.0901
#>  2 boot2      29.3  2.05    0     -1.07   0       -0.0155   0     -2.78  0     
#>  3 boot3      29.6  5.55    0      0      0       -0.0173   0     -2.53  0     
#>  4 boot4      44.2  0      -1.89   3.54   0       -0.0602   0     -4.11  0     
#>  5 boot5      22.4  2.29    0      0      0       -0.00303  0     -3.98  0.577 
#>  6 boot6      40.2  0.0442 -3.29   0      0       -0.0239   0     -5.21  0.133 
#>  7 boot7       9.96 7.54    0     -3.86   0.0400   0        2.49  -2.16  0     
#>  8 boot8      25.9  5.42    0      0     -0.0211   0        0     -0.841 0     
#>  9 boot9      34.3  4.36    0      0      0.00763  0        0     -5.42  0.109 
#> 10 boot10     36.0  5.00   -0.867  0      0        0        0     -4.79  0     
#> 11 boot11     34.0  5.57    0      0      0       -0.0315  -0.671 -2.43  0     
#> 12 boot12     30.4  4.20    0      0      0       -0.00637  0     -3.39  0     
#> 13 boot13     33.6  2.02    0      0      0       -0.0122   0     -3.89  0     
#> 14 boot14     34.1  2.95    0      0      0       -0.0267   0     -3.18  0     
#> 15 boot15     31.6  2.07    0      0      0       -0.0170   0     -3.13  0     
#> 16 boot16     38.5  1.89   -4.23   0      0.0158   0        0.581 -7.68  0.248 
#> 17 boot17     33.7  3.84    0      0      0       -0.0223   0     -3.50  0     
#> 18 boot18     43.9  0.395  -3.74   0      0       -0.0384   0     -4.97  0     
#> 19 boot19     39.1  2.90   -1.74   0      0       -0.0294   0     -4.01  0     
#> 20 boot20     22.0  3.85    0     -0.308  0       -0.00228  0     -4.36  0.648 
#> # ℹ 9 more variables: vs <dbl>, am <dbl>, gear4 <dbl>, gear5 <dbl>,
#> #   carb2 <dbl>, carb3 <dbl>, carb4 <dbl>, carb6 <dbl>, carb8 <dbl>

# Extract selected variables
selected_variables(bolasso_form, threshold = 0.9, select = "lambda.min")
#> # A tibble: 20 × 3
#>    id       cyl4     wt
#>    <chr>   <dbl>  <dbl>
#>  1 boot1  0.342  -3.62 
#>  2 boot2  2.05   -2.78 
#>  3 boot3  5.55   -2.53 
#>  4 boot4  0      -4.11 
#>  5 boot5  2.29   -3.98 
#>  6 boot6  0.0442 -5.21 
#>  7 boot7  7.54   -2.16 
#>  8 boot8  5.42   -0.841
#>  9 boot9  4.36   -5.42 
#> 10 boot10 5.00   -4.79 
#> 11 boot11 5.57   -2.43 
#> 12 boot12 4.20   -3.39 
#> 13 boot13 2.02   -3.89 
#> 14 boot14 2.95   -3.18 
#> 15 boot15 2.07   -3.13 
#> 16 boot16 1.89   -7.68 
#> 17 boot17 3.84   -3.50 
#> 18 boot18 0.395  -4.97 
#> 19 boot19 2.90   -4.01 
#> 20 boot20 3.85   -4.36 

# Bagged ensemble prediction on test data
predict(bolasso_form,
        new.data = mtcars_test,
        select = "lambda.min")
#>                       boot1     boot2    boot3     boot4    boot5     boot6
#> Mazda RX4         22.136000 21.355411 21.65471 21.967038 21.12178 22.780565
#> Datsun 710        24.928046 27.496720 28.58999 28.930573 25.90059 29.181294
#> Duster 360        15.793256 13.717993 15.98303 15.357355 16.57274 17.812836
#> Merc 240D         22.679659 23.689025 25.98446 23.622992 23.33223 22.625793
#> Merc 280          19.005454 19.172257 18.41674 17.702542 18.87890 19.248354
#> Merc 450SLC       17.060371 16.829704 16.90419 17.391665 17.17946 17.918092
#> Chrysler Imperial  9.882457  9.014427 11.74785  8.959564 10.46209  9.129185
#> Toyota Corona     24.428651 25.191090 27.21624 28.093210 26.11878 28.516030
#> Camaro Z28        14.776491 12.967204 15.29944 14.246742 15.24961 16.348295
#> Fiat X1-9         27.026104 28.985211 30.03050 32.140456 27.68260 31.871866
#>                      boot7    boot8    boot9   boot10    boot11   boot12
#> Mazda RX4         19.24550 20.98732 22.61251 22.54044 21.980552 24.63474
#> Datsun 710        33.14551 28.60832 29.34080 30.45956 31.539419 30.81072
#> Duster 360        12.82984 15.34008 18.92005 18.85523 13.509988 16.71592
#> Merc 240D         29.75726 27.06058 22.34043 23.28057 24.354958 28.06333
#> Merc 280          17.82649 20.98565 18.80218 19.22500 17.507895 22.62863
#> Merc 450SLC       16.93580 16.93810 17.90399 17.84905 17.115417 16.41933
#> Chrysler Imperial 12.24004 12.16036 10.07738 10.35062  9.651028 10.80312
#> Toyota Corona     26.16696 27.59887 28.79879 29.76482 28.791991 26.45173
#> Camaro Z28        13.13997 15.32367 17.33309 17.56157 12.504033 15.80196
#> Fiat X1-9         33.39080 29.54351 31.23857 32.30423 33.172279 32.28603
#>                     boot13   boot14   boot15   boot16   boot17    boot18
#> Mazda RX4         22.07862 22.83754 21.49454 23.03257 22.09494 22.240705
#> Datsun 710        25.92231 27.20084 24.79358 31.42579 28.56655 29.236127
#> Duster 360        16.73828 16.20564 16.22302 22.57289 15.75047 16.072484
#> Merc 240D         22.91239 25.26060 22.02643 19.00373 24.06170 23.749863
#> Merc 280          19.18101 19.88035 18.70379 17.60249 20.13764 17.666961
#> Merc 450SLC       16.71130 17.27470 16.66956 18.84311 16.46770 18.243150
#> Chrysler Imperial 10.01055 10.95750 10.91523 10.60293  9.86899  7.830309
#> Toyota Corona     25.30916 26.63244 24.27119 30.76295 27.96931 28.361961
#> Camaro Z28        15.68715 15.34634 15.37686 20.53564 14.80482 14.731036
#> Fiat X1-9         27.74952 29.14782 26.45900 34.13086 30.51841 32.186810
#>                      boot19    boot20
#> Mazda RX4         21.946519 21.005058
#> Datsun 710        30.002143 27.594649
#> Duster 360        15.903332 15.847325
#> Merc 240D         23.546456 24.300253
#> Merc 280          18.273216 18.593329
#> Merc 450SLC       18.679698 16.479113
#> Chrysler Imperial  9.220408  9.169328
#> Toyota Corona     29.302575 27.860143
#> Camaro Z28        14.819687 14.392258
#> Fiat X1-9         32.341210 29.521835

## Alternate Matrix Interface

# Train model
set.seed(123)
bolasso_mat <- bolasso(
  x = model.matrix(mpg ~ . - 1, mtcars_train),
  y = mtcars_train[, 1],
  data = mtcars_train,
  n.boot = 20,
  nfolds = 5
)

# Bagged ensemble prediction on test data
predict(bolasso_mat,
        new.data = model.matrix(mpg ~ . - 1, mtcars_test),
        select = "lambda.min")
#>                       boot1     boot2    boot3     boot4    boot5     boot6
#> Mazda RX4         22.136000 21.355411 21.65471 21.967038 21.12178 22.780565
#> Datsun 710        24.928046 27.496720 28.58999 28.930573 25.90059 29.181294
#> Duster 360        15.793256 13.717993 15.98303 15.357355 16.57274 17.812836
#> Merc 240D         22.679659 23.689025 25.98446 23.622992 23.33223 22.625793
#> Merc 280          19.005454 19.172257 18.41674 17.702542 18.87890 19.248354
#> Merc 450SLC       17.060371 16.829704 16.90419 17.391665 17.17946 17.918092
#> Chrysler Imperial  9.882457  9.014427 11.74785  8.959564 10.46209  9.129185
#> Toyota Corona     24.428651 25.191090 27.21624 28.093210 26.11878 28.516030
#> Camaro Z28        14.776491 12.967204 15.29944 14.246742 15.24961 16.348295
#> Fiat X1-9         27.026104 28.985211 30.03050 32.140456 27.68260 31.871866
#>                      boot7    boot8    boot9   boot10    boot11   boot12
#> Mazda RX4         19.24550 20.98732 22.61251 22.54044 21.980552 24.63474
#> Datsun 710        33.14551 28.60832 29.34080 30.45956 31.539419 30.81072
#> Duster 360        12.82984 15.34008 18.92005 18.85523 13.509988 16.71592
#> Merc 240D         29.75726 27.06058 22.34043 23.28057 24.354958 28.06333
#> Merc 280          17.82649 20.98565 18.80218 19.22500 17.507895 22.62863
#> Merc 450SLC       16.93580 16.93810 17.90399 17.84905 17.115417 16.41933
#> Chrysler Imperial 12.24004 12.16036 10.07738 10.35062  9.651028 10.80312
#> Toyota Corona     26.16696 27.59887 28.79879 29.76482 28.791991 26.45173
#> Camaro Z28        13.13997 15.32367 17.33309 17.56157 12.504033 15.80196
#> Fiat X1-9         33.39080 29.54351 31.23857 32.30423 33.172279 32.28603
#>                     boot13   boot14   boot15   boot16   boot17    boot18
#> Mazda RX4         22.07862 22.83754 21.49454 23.03257 22.09494 22.240705
#> Datsun 710        25.92231 27.20084 24.79358 31.42579 28.56655 29.236127
#> Duster 360        16.73828 16.20564 16.22302 22.57289 15.75047 16.072484
#> Merc 240D         22.91239 25.26060 22.02643 19.00373 24.06170 23.749863
#> Merc 280          19.18101 19.88035 18.70379 17.60249 20.13764 17.666961
#> Merc 450SLC       16.71130 17.27470 16.66956 18.84311 16.46770 18.243150
#> Chrysler Imperial 10.01055 10.95750 10.91523 10.60293  9.86899  7.830309
#> Toyota Corona     25.30916 26.63244 24.27119 30.76295 27.96931 28.361961
#> Camaro Z28        15.68715 15.34634 15.37686 20.53564 14.80482 14.731036
#> Fiat X1-9         27.74952 29.14782 26.45900 34.13086 30.51841 32.186810
#>                      boot19    boot20
#> Mazda RX4         21.946519 21.005058
#> Datsun 710        30.002143 27.594649
#> Duster 360        15.903332 15.847325
#> Merc 240D         23.546456 24.300253
#> Merc 280          18.273216 18.593329
#> Merc 450SLC       18.679698 16.479113
#> Chrysler Imperial  9.220408  9.169328
#> Toyota Corona     29.302575 27.860143
#> Camaro Z28        14.819687 14.392258
#> Fiat X1-9         32.341210 29.521835

# Extract the indices of the bootstrap replicates
bootstrap_samples(bolasso_mat)
#> $boot1
#>  [1]  3  3  4  5  5  7  8  9  9 10 10 11 14 14 15 18 19 19 19 20 22 22
#> 
#> $boot2
#>  [1]  2  5  6  7  7  7  8  9  9 10 10 11 12 12 13 13 14 15 17 21 21 21
#> 
#> $boot3
#>  [1]  1  2  4  6  6  7  8  9 11 15 15 16 16 17 17 18 18 20 21 22 22 22
#> 
#> $boot4
#>  [1]  3  3  3  5  5  7  8  8 12 13 14 14 14 15 16 19 19 20 21 22 22 22
#> 
#> $boot5
#>  [1]  2  3  7  8 10 10 10 11 12 12 14 14 14 15 15 17 17 18 19 20 22 22
#> 
#> $boot6
#>  [1]  2  4  5  6  6  7  7  7  9 10 11 12 13 14 14 16 16 19 19 20 21 22
#> 
#> $boot7
#>  [1]  1  3  4  4  7  8  8  8 10 11 11 12 15 16 16 17 20 20 20 20 22 22
#> 
#> $boot8
#>  [1]  1  2  4  6  8  8  8  8 10 11 12 13 13 13 14 14 14 16 18 21 21 21
#> 
#> $boot9
#>  [1]  1  5  7  7  7  9  9  9 10 10 11 11 11 13 14 14 19 20 20 21 22 22
#> 
#> $boot10
#>  [1]  1  1  1  2  3  4  5  6  6  7  9 10 10 14 17 17 17 18 20 20 21 21
#> 
#> $boot11
#>  [1]  1  5  6  7  7  8  8  9 10 13 13 16 17 17 18 18 20 21 21 21 21 21
#> 
#> $boot12
#>  [1]  2  2  2  3  3  3  3  4  5  6  6 10 10 10 12 13 13 16 16 17 17 18
#> 
#> $boot13
#>  [1]  1  2  2  2  3  4  5  7  7  9  9  9 11 11 12 12 13 15 16 19 21 22
#> 
#> $boot14
#>  [1]  3  3  6  6  8  8  9 10 10 12 13 15 16 17 17 18 18 19 20 20 20 22
#> 
#> $boot15
#>  [1]  1  2  4  5  7  8 10 10 10 11 12 13 13 16 17 17 18 18 19 19 19 22
#> 
#> $boot16
#>  [1]  1  1  2  3  4  6 11 12 12 12 14 15 15 16 16 16 17 18 19 19 21 21
#> 
#> $boot17
#>  [1]  5  6  6  7  8  9  9 10 10 10 11 11 12 12 13 17 18 18 18 19 20 21
#> 
#> $boot18
#>  [1]  3  4  4  4  4  6  6  7  8  9 10 10 14 15 16 16 17 19 20 20 21 22
#> 
#> $boot19
#>  [1]  2  5  5  6  7  8  9 10 11 11 14 15 15 15 16 16 16 19 19 20 20 22
#> 
#> $boot20
#>  [1]  2  3  5  6  6  6  7  8  8  8 10 11 14 14 15 15 17 17 18 19 20 21
#>