This function implements model-consistent Lasso estimation through the bootstrap. It supports parallel processing by way of the future package, allowing the user to flexibly specify many parallelization methods. This method was developed as a variable-selection algorithm, but this package also supports making ensemble predictions on new data using the bagged Lasso models.
Usage
bolasso(
formula,
data,
n.boot = 100,
progress = TRUE,
implement = c("glmnet", "gamlr"),
x = NULL,
y = NULL,
fast = FALSE,
...
)
Arguments
- formula
An optional object of class formula (or one that can be coerced to that class): a symbolic description of the model to be fitted. Can be omitted when `x` and `y` are non-missing.
- data
An optional object of class data.frame that contains the modeling variables referenced in `formula`. Can be omitted when `x` and `y` are non-missing.
- n.boot
An integer specifying the number of bootstrap replicates.
- progress
A boolean indicating whether to display progress across bootstrap folds.
- implement
A character; either 'glmnet' or 'gamlr', specifying which Lasso implementation to utilize. For specific modeling details, see `glmnet::cv.glmnet` or `gamlr::cv.gamlr`.
- x
An optional predictor matrix in lieu of `formula` and `data`.
- y
An optional response vector in lieu of `formula` and `data`.
- fast
A boolean. Whether or not to fit a "fast" bootstrap procedure. If `fast == TRUE`, `bolasso` will fit `glmnet::cv.glmnet` on the entire dataset. It will then fit all bootstrapped models with the value of lambda (regularization parameter) that minimized cross-validation loss in the full model. If `fast == FALSE` (the default), `bolasso` will use cross-validation to find the optimal lambda for each bootstrap model.
- ...
Additional parameters to pass to either `glmnet::cv.glmnet` or `gamlr::cv.gamlr`.
Value
An object of class `bolasso`. This object is a list of length `n.boot` of `cv.glmnet` or `cv.gamlr` objects.
See also
`glmnet::cv.glmnet` and `gamlr::cv.gamlr` for full details on the respective implementations and arguments that can be passed to `...`.
Examples
mtcars[, c(2, 10:11)] <- lapply(mtcars[, c(2, 10:11)], as.factor)
idx <- sample(nrow(mtcars), 22)
mtcars_train <- mtcars[idx, ]
mtcars_test <- mtcars[-idx, ]
## Formula Interface
# Train model
set.seed(123)
bolasso_form <- bolasso(
form = mpg ~ .,
data = mtcars_train,
n.boot = 20,
nfolds = 5
)
# Retrieve a tidy tibble of bootstrap coefficients for each covariate
tidy(bolasso_form)
#> # A tibble: 20 × 19
#> id Intercept cyl4 cyl6 cyl8 disp hp drat wt qsec
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 boot1 33.6 0.342 -0.541 0 0 -0.0251 0 -3.62 0.0901
#> 2 boot2 29.3 2.05 0 -1.07 0 -0.0155 0 -2.78 0
#> 3 boot3 29.6 5.55 0 0 0 -0.0173 0 -2.53 0
#> 4 boot4 44.2 0 -1.89 3.54 0 -0.0602 0 -4.11 0
#> 5 boot5 22.4 2.29 0 0 0 -0.00303 0 -3.98 0.577
#> 6 boot6 40.2 0.0442 -3.29 0 0 -0.0239 0 -5.21 0.133
#> 7 boot7 9.96 7.54 0 -3.86 0.0400 0 2.49 -2.16 0
#> 8 boot8 25.9 5.42 0 0 -0.0211 0 0 -0.841 0
#> 9 boot9 34.3 4.36 0 0 0.00763 0 0 -5.42 0.109
#> 10 boot10 36.0 5.00 -0.867 0 0 0 0 -4.79 0
#> 11 boot11 34.0 5.57 0 0 0 -0.0315 -0.671 -2.43 0
#> 12 boot12 30.4 4.20 0 0 0 -0.00637 0 -3.39 0
#> 13 boot13 33.6 2.02 0 0 0 -0.0122 0 -3.89 0
#> 14 boot14 34.1 2.95 0 0 0 -0.0267 0 -3.18 0
#> 15 boot15 31.6 2.07 0 0 0 -0.0170 0 -3.13 0
#> 16 boot16 38.5 1.89 -4.23 0 0.0158 0 0.581 -7.68 0.248
#> 17 boot17 33.7 3.84 0 0 0 -0.0223 0 -3.50 0
#> 18 boot18 43.9 0.395 -3.74 0 0 -0.0384 0 -4.97 0
#> 19 boot19 39.1 2.90 -1.74 0 0 -0.0294 0 -4.01 0
#> 20 boot20 22.0 3.85 0 -0.308 0 -0.00228 0 -4.36 0.648
#> # ℹ 9 more variables: vs <dbl>, am <dbl>, gear4 <dbl>, gear5 <dbl>,
#> # carb2 <dbl>, carb3 <dbl>, carb4 <dbl>, carb6 <dbl>, carb8 <dbl>
# Extract selected variables
selected_variables(bolasso_form, threshold = 0.9, select = "lambda.min")
#> # A tibble: 20 × 3
#> id cyl4 wt
#> <chr> <dbl> <dbl>
#> 1 boot1 0.342 -3.62
#> 2 boot2 2.05 -2.78
#> 3 boot3 5.55 -2.53
#> 4 boot4 0 -4.11
#> 5 boot5 2.29 -3.98
#> 6 boot6 0.0442 -5.21
#> 7 boot7 7.54 -2.16
#> 8 boot8 5.42 -0.841
#> 9 boot9 4.36 -5.42
#> 10 boot10 5.00 -4.79
#> 11 boot11 5.57 -2.43
#> 12 boot12 4.20 -3.39
#> 13 boot13 2.02 -3.89
#> 14 boot14 2.95 -3.18
#> 15 boot15 2.07 -3.13
#> 16 boot16 1.89 -7.68
#> 17 boot17 3.84 -3.50
#> 18 boot18 0.395 -4.97
#> 19 boot19 2.90 -4.01
#> 20 boot20 3.85 -4.36
# Bagged ensemble prediction on test data
predict(bolasso_form,
new.data = mtcars_test,
select = "lambda.min")
#> boot1 boot2 boot3 boot4 boot5 boot6
#> Mazda RX4 22.136000 21.355411 21.65471 21.967038 21.12178 22.780565
#> Datsun 710 24.928046 27.496720 28.58999 28.930573 25.90059 29.181294
#> Duster 360 15.793256 13.717993 15.98303 15.357355 16.57274 17.812836
#> Merc 240D 22.679659 23.689025 25.98446 23.622992 23.33223 22.625793
#> Merc 280 19.005454 19.172257 18.41674 17.702542 18.87890 19.248354
#> Merc 450SLC 17.060371 16.829704 16.90419 17.391665 17.17946 17.918092
#> Chrysler Imperial 9.882457 9.014427 11.74785 8.959564 10.46209 9.129185
#> Toyota Corona 24.428651 25.191090 27.21624 28.093210 26.11878 28.516030
#> Camaro Z28 14.776491 12.967204 15.29944 14.246742 15.24961 16.348295
#> Fiat X1-9 27.026104 28.985211 30.03050 32.140456 27.68260 31.871866
#> boot7 boot8 boot9 boot10 boot11 boot12
#> Mazda RX4 19.24550 20.98732 22.61251 22.54044 21.980552 24.63474
#> Datsun 710 33.14551 28.60832 29.34080 30.45956 31.539419 30.81072
#> Duster 360 12.82984 15.34008 18.92005 18.85523 13.509988 16.71592
#> Merc 240D 29.75726 27.06058 22.34043 23.28057 24.354958 28.06333
#> Merc 280 17.82649 20.98565 18.80218 19.22500 17.507895 22.62863
#> Merc 450SLC 16.93580 16.93810 17.90399 17.84905 17.115417 16.41933
#> Chrysler Imperial 12.24004 12.16036 10.07738 10.35062 9.651028 10.80312
#> Toyota Corona 26.16696 27.59887 28.79879 29.76482 28.791991 26.45173
#> Camaro Z28 13.13997 15.32367 17.33309 17.56157 12.504033 15.80196
#> Fiat X1-9 33.39080 29.54351 31.23857 32.30423 33.172279 32.28603
#> boot13 boot14 boot15 boot16 boot17 boot18
#> Mazda RX4 22.07862 22.83754 21.49454 23.03257 22.09494 22.240705
#> Datsun 710 25.92231 27.20084 24.79358 31.42579 28.56655 29.236127
#> Duster 360 16.73828 16.20564 16.22302 22.57289 15.75047 16.072484
#> Merc 240D 22.91239 25.26060 22.02643 19.00373 24.06170 23.749863
#> Merc 280 19.18101 19.88035 18.70379 17.60249 20.13764 17.666961
#> Merc 450SLC 16.71130 17.27470 16.66956 18.84311 16.46770 18.243150
#> Chrysler Imperial 10.01055 10.95750 10.91523 10.60293 9.86899 7.830309
#> Toyota Corona 25.30916 26.63244 24.27119 30.76295 27.96931 28.361961
#> Camaro Z28 15.68715 15.34634 15.37686 20.53564 14.80482 14.731036
#> Fiat X1-9 27.74952 29.14782 26.45900 34.13086 30.51841 32.186810
#> boot19 boot20
#> Mazda RX4 21.946519 21.005058
#> Datsun 710 30.002143 27.594649
#> Duster 360 15.903332 15.847325
#> Merc 240D 23.546456 24.300253
#> Merc 280 18.273216 18.593329
#> Merc 450SLC 18.679698 16.479113
#> Chrysler Imperial 9.220408 9.169328
#> Toyota Corona 29.302575 27.860143
#> Camaro Z28 14.819687 14.392258
#> Fiat X1-9 32.341210 29.521835
## Alternate Matrix Interface
# Train model
set.seed(123)
bolasso_mat <- bolasso(
x = model.matrix(mpg ~ . - 1, mtcars_train),
y = mtcars_train[, 1],
data = mtcars_train,
n.boot = 20,
nfolds = 5
)
# Bagged ensemble prediction on test data
predict(bolasso_mat,
new.data = model.matrix(mpg ~ . - 1, mtcars_test),
select = "lambda.min")
#> boot1 boot2 boot3 boot4 boot5 boot6
#> Mazda RX4 22.136000 21.355411 21.65471 21.967038 21.12178 22.780565
#> Datsun 710 24.928046 27.496720 28.58999 28.930573 25.90059 29.181294
#> Duster 360 15.793256 13.717993 15.98303 15.357355 16.57274 17.812836
#> Merc 240D 22.679659 23.689025 25.98446 23.622992 23.33223 22.625793
#> Merc 280 19.005454 19.172257 18.41674 17.702542 18.87890 19.248354
#> Merc 450SLC 17.060371 16.829704 16.90419 17.391665 17.17946 17.918092
#> Chrysler Imperial 9.882457 9.014427 11.74785 8.959564 10.46209 9.129185
#> Toyota Corona 24.428651 25.191090 27.21624 28.093210 26.11878 28.516030
#> Camaro Z28 14.776491 12.967204 15.29944 14.246742 15.24961 16.348295
#> Fiat X1-9 27.026104 28.985211 30.03050 32.140456 27.68260 31.871866
#> boot7 boot8 boot9 boot10 boot11 boot12
#> Mazda RX4 19.24550 20.98732 22.61251 22.54044 21.980552 24.63474
#> Datsun 710 33.14551 28.60832 29.34080 30.45956 31.539419 30.81072
#> Duster 360 12.82984 15.34008 18.92005 18.85523 13.509988 16.71592
#> Merc 240D 29.75726 27.06058 22.34043 23.28057 24.354958 28.06333
#> Merc 280 17.82649 20.98565 18.80218 19.22500 17.507895 22.62863
#> Merc 450SLC 16.93580 16.93810 17.90399 17.84905 17.115417 16.41933
#> Chrysler Imperial 12.24004 12.16036 10.07738 10.35062 9.651028 10.80312
#> Toyota Corona 26.16696 27.59887 28.79879 29.76482 28.791991 26.45173
#> Camaro Z28 13.13997 15.32367 17.33309 17.56157 12.504033 15.80196
#> Fiat X1-9 33.39080 29.54351 31.23857 32.30423 33.172279 32.28603
#> boot13 boot14 boot15 boot16 boot17 boot18
#> Mazda RX4 22.07862 22.83754 21.49454 23.03257 22.09494 22.240705
#> Datsun 710 25.92231 27.20084 24.79358 31.42579 28.56655 29.236127
#> Duster 360 16.73828 16.20564 16.22302 22.57289 15.75047 16.072484
#> Merc 240D 22.91239 25.26060 22.02643 19.00373 24.06170 23.749863
#> Merc 280 19.18101 19.88035 18.70379 17.60249 20.13764 17.666961
#> Merc 450SLC 16.71130 17.27470 16.66956 18.84311 16.46770 18.243150
#> Chrysler Imperial 10.01055 10.95750 10.91523 10.60293 9.86899 7.830309
#> Toyota Corona 25.30916 26.63244 24.27119 30.76295 27.96931 28.361961
#> Camaro Z28 15.68715 15.34634 15.37686 20.53564 14.80482 14.731036
#> Fiat X1-9 27.74952 29.14782 26.45900 34.13086 30.51841 32.186810
#> boot19 boot20
#> Mazda RX4 21.946519 21.005058
#> Datsun 710 30.002143 27.594649
#> Duster 360 15.903332 15.847325
#> Merc 240D 23.546456 24.300253
#> Merc 280 18.273216 18.593329
#> Merc 450SLC 18.679698 16.479113
#> Chrysler Imperial 9.220408 9.169328
#> Toyota Corona 29.302575 27.860143
#> Camaro Z28 14.819687 14.392258
#> Fiat X1-9 32.341210 29.521835
# Extract the indices of the bootstrap replicates
bootstrap_samples(bolasso_mat)
#> $boot1
#> [1] 3 3 4 5 5 7 8 9 9 10 10 11 14 14 15 18 19 19 19 20 22 22
#>
#> $boot2
#> [1] 2 5 6 7 7 7 8 9 9 10 10 11 12 12 13 13 14 15 17 21 21 21
#>
#> $boot3
#> [1] 1 2 4 6 6 7 8 9 11 15 15 16 16 17 17 18 18 20 21 22 22 22
#>
#> $boot4
#> [1] 3 3 3 5 5 7 8 8 12 13 14 14 14 15 16 19 19 20 21 22 22 22
#>
#> $boot5
#> [1] 2 3 7 8 10 10 10 11 12 12 14 14 14 15 15 17 17 18 19 20 22 22
#>
#> $boot6
#> [1] 2 4 5 6 6 7 7 7 9 10 11 12 13 14 14 16 16 19 19 20 21 22
#>
#> $boot7
#> [1] 1 3 4 4 7 8 8 8 10 11 11 12 15 16 16 17 20 20 20 20 22 22
#>
#> $boot8
#> [1] 1 2 4 6 8 8 8 8 10 11 12 13 13 13 14 14 14 16 18 21 21 21
#>
#> $boot9
#> [1] 1 5 7 7 7 9 9 9 10 10 11 11 11 13 14 14 19 20 20 21 22 22
#>
#> $boot10
#> [1] 1 1 1 2 3 4 5 6 6 7 9 10 10 14 17 17 17 18 20 20 21 21
#>
#> $boot11
#> [1] 1 5 6 7 7 8 8 9 10 13 13 16 17 17 18 18 20 21 21 21 21 21
#>
#> $boot12
#> [1] 2 2 2 3 3 3 3 4 5 6 6 10 10 10 12 13 13 16 16 17 17 18
#>
#> $boot13
#> [1] 1 2 2 2 3 4 5 7 7 9 9 9 11 11 12 12 13 15 16 19 21 22
#>
#> $boot14
#> [1] 3 3 6 6 8 8 9 10 10 12 13 15 16 17 17 18 18 19 20 20 20 22
#>
#> $boot15
#> [1] 1 2 4 5 7 8 10 10 10 11 12 13 13 16 17 17 18 18 19 19 19 22
#>
#> $boot16
#> [1] 1 1 2 3 4 6 11 12 12 12 14 15 15 16 16 16 17 18 19 19 21 21
#>
#> $boot17
#> [1] 5 6 6 7 8 9 9 10 10 10 11 11 12 12 13 17 18 18 18 19 20 21
#>
#> $boot18
#> [1] 3 4 4 4 4 6 6 7 8 9 10 10 14 15 16 16 17 19 20 20 21 22
#>
#> $boot19
#> [1] 2 5 5 6 7 8 9 10 11 11 14 15 15 15 16 16 16 19 19 20 20 22
#>
#> $boot20
#> [1] 2 3 5 6 6 6 7 8 8 8 10 11 14 14 15 15 17 17 18 19 20 21
#>