A convenience wrapper for greedy and exhaustive feature selection algorithms that extracts the most valuable attributes according to a user-supplied evaluation function (the evaluator). This function is a reimplementation of FSelector's exhaustive.search and greedy.search.
A character vector of attribute names from which the most valuable features are to be selected.
A function (the evaluator) used to score a feature subset at each iteration of the algorithm selected via mode. See Examples.
A data set passed to the fun function (the evaluator).
A character string that determines which search algorithm to perform. Default is "greedy".
Used when mode = "greedy": whether to use the backward or the forward multiple-way search. Default is "forward".
Used when mode = "exhaustive": a vector of attribute subset sizes to check.
Allow parallelization.
Other arguments passed to the foreach function.
A list with the following components:
best - a data.frame with the best subset and its score (1 - feature used, 0 - feature not used); see the sketch after this list,
all - a data.frame with all checked feature subsets and their scores (1 - feature used, 0 - feature not used),
data - the data used in the feature selection,
fun - the evaluator used to compute the importance score of feature subsets,
call - the original call to feature_search,
mode - the mode used in the call.
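For instance, the names of the selected attributes can be recovered from the 0/1 indicator row in best (a small helper sketch, not part of the package; the score column is named values, as in the examples below):

selected <- function(res) {
  # drop the score column, then keep the names of the 1-flagged attributes
  ind <- res$best[, setdiff(names(res$best), "values"), drop = FALSE]
  names(ind)[unlist(ind[1, ]) == 1]
}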
The evaluator function passed via fun is used to determine the importance score of the current feature subset. In the multiple-way (backward or forward) greedy algorithm the score serves as the stopping criterion, while in the exhaustive search, which checks all possible attribute subset combinations (of the sizes passed in sizes), it serves as the selection criterion. Note that the score depends on the evaluator you provide in the fun parameter.
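In other words, fun only needs to map a character vector of attribute names plus the data to a single numeric score, and, as the examples below suggest, a higher score is treated as better. A minimal sketch with a stand-in correlation-based evaluator:

ev <- function(attrs, data) {
  # higher is better: mean absolute correlation with Sepal.Length
  mean(abs(cor(data[attrs], data$Sepal.Length)))
}
ev(c("Petal.Length", "Petal.Width"), iris)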
# Enable parallelization in examples
if (FALSE) { # \dontrun{
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
} # }
# Close at the end
# stopCluster(cl) #nolint
# registerDoSEQ() #nolint
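# Arguments in `...` are forwarded to the foreach() call, so worker-side
# needs can be declared there (an assumption based on the `...` argument
# described above), e.g.:
# feature_search(attributes = names(iris)[-5], fun = evaluator,
#                data = iris, .packages = "rpart") #nolint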
# 1) Evaluator from FSelector package.
evaluator <- function(subset, data, dependent = names(iris)[5]) {
  library(rpart)
  # 5-fold cross-validated accuracy of an rpart tree on the given subset
  k <- 5
  splits <- runif(nrow(data))
  results <- sapply(1:k, function(i) {
    test.idx <- (splits >= (i - 1) / k) & (splits < i / k)
    train.idx <- !test.idx
    test <- data[test.idx, , drop = FALSE]
    train <- data[train.idx, , drop = FALSE]
    tree <- rpart(to_formula(subset, dependent), train)
    error.rate <- sum(test[[dependent]] != predict(tree, test, type = "c")) /
      nrow(test)
    return(1 - error.rate)
  })
  return(mean(results))
}
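# The evaluator can be checked on its own before the search; it should
# return a single cross-validated accuracy in [0, 1] (uses to_formula()
# from FSelectorRcpp):
set.seed(42)
evaluator(c("Petal.Length", "Petal.Width"), iris)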
set.seed(123)
# Default greedy search.
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris)
)
#> user system elapsed
#> 0.108 0.000 0.108
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris,
                 parallel = FALSE)
)
#> user system elapsed
#> 0.096 0.000 0.095
# Optional exhaustive search.
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris,
                 mode = "exhaustive")
)
#> user system elapsed
#> 0.132 0.000 0.132
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris,
                 mode = "exhaustive",
                 parallel = FALSE)
)
#> user system elapsed
#> 0.166 0.000 0.165
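# The backward variant of the greedy search starts from the full attribute
# set and drops attributes instead (a sketch of the `type` argument):
set.seed(123)
feature_search(attributes = names(iris)[-5],
               fun = evaluator,
               data = iris,
               type = "backward")$best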
# 2) Maximize the R^2 statistic in a linear regression problem.
evaluator_R2_lm <- function(attributes, data, dependent = names(iris)[1]) {
  # R^2 of a linear model predicting the dependent from the subset
  summary(
    lm(to_formula(attributes, dependent), data = data)
  )$r.squared
}
feature_search(attributes = names(iris)[-1],
               fun = evaluator_R2_lm, data = iris,
               mode = "exhaustive")
#> $best
#> Sepal.Width Petal.Length Petal.Width Species values
#> 15 1 1 1 1 0.8673123
#>
#> $all
#> Sepal.Width Petal.Length Petal.Width Species values
#> 1 1 0 0 0 0.01382265
#> 2 0 1 0 0 0.7599546
#> 3 0 0 1 0 0.6690277
#> 4 0 0 0 1 0.6187057
#> 5 1 1 0 0 0.8401778
#> 6 1 0 1 0 0.7072371
#> 7 1 0 0 1 0.7259066
#> 8 0 1 1 0 0.7662613
#> 9 0 1 0 1 0.8367238
#> 10 0 0 1 1 0.6693664
#> 11 1 1 1 0 0.8586117
#> 12 1 1 0 1 0.8633088
#> 13 1 0 1 1 0.7323845
#> 14 0 1 1 1 0.8367254
#> 15 1 1 1 1 0.8673123
#>
#> $fun
#> function (attributes, data, dependent = names(iris)[1])
#> {
#> summary(lm(to_formula(attributes, dependent), data = data))$r.squared
#> }
#> <environment: 0x55b23e8c31f0>
#>
#> $call
#> feature_search(attributes = names(iris)[-1], fun = evaluator_R2_lm,
#> data = iris, mode = "exhaustive")
#>
#> $mode
#> [1] "exhaustive"
#>
#> $data
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> ... (rows 7-150 of the iris data omitted)
#>
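# The `sizes` argument narrows the exhaustive search: restricting it to
# two-attribute subsets re-checks only the combinations shown in rows 5-10
# of $all above (a sketch):
feature_search(attributes = names(iris)[-1],
               fun = evaluator_R2_lm, data = iris,
               mode = "exhaustive", sizes = 2)$all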
# 3) Optimize the BIC criterion in a generalized linear model.
# The aim of the Bayesian approach is to identify the model with the
# highest probability of being the true model. - Kuha 2004
utils::data(anorexia, package = "MASS")
evaluator_BIC_glm <- function(attributes, data, dependent = "Postwt") {
  # BIC of a Gaussian GLM: extractAIC() with penalty k = log(n)
  extractAIC(
    fit = glm(to_formula(attributes, dependent), family = gaussian,
              data = data),
    k = log(nrow(data))
  )[2]
}
feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"),
               fun = evaluator_BIC_glm,
               data = anorexia,
               mode = "exhaustive")
#> $best
#> Prewt Treat offset(Prewt) values
#> 3 0 0 1 508.7408
#>
#> $all
#> Prewt Treat offset(Prewt) values
#> 1 1 0 0 505.5144
#> 2 0 1 0 502.1123
#> 3 0 0 1 508.7408
#> 4 1 1 0 499.08
#> 5 1 0 1 505.5144
#> 6 0 1 1 506.7839
#> 7 1 1 1 499.08
#>
#> $fun
#> function (attributes, data, dependent = "Postwt")
#> {
#> extractAIC(fit = glm(to_formula(attributes, dependent), family = gaussian,
#> data = data), k = log(nrow(data)))[2]
#> }
#> <environment: 0x55b23e8c31f0>
#>
#> $call
#> feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"),
#> fun = evaluator_BIC_glm, data = anorexia, mode = "exhaustive")
#>
#> $mode
#> [1] "exhaustive"
#>
#> $data
#> Treat Prewt Postwt
#> 1 Cont 80.7 80.2
#> 2 Cont 89.4 80.1
#> 3 Cont 91.8 86.4
#> 4 Cont 74.0 86.3
#> 5 Cont 78.1 76.1
#> 6 Cont 88.3 78.1
#> ... (rows 7-72 of the anorexia data omitted)
#>
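# Caveat: feature_search keeps the subset with the highest score, while a
# lower BIC indicates a better model - note that $best above holds the
# largest of the seven BIC values. To select by minimum BIC, negate the
# criterion (a sketch):
evaluator_negBIC_glm <- function(attributes, data, dependent = "Postwt") {
  -extractAIC(
    fit = glm(to_formula(attributes, dependent), family = gaussian,
              data = data),
    k = log(nrow(data))
  )[2]
}
feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"),
               fun = evaluator_negBIC_glm,
               data = anorexia,
               mode = "exhaustive")$best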
# Close parallelization
if (FALSE) { # \dontrun{
stopCluster(cl)
registerDoSEQ()
} # }