A convenience wrapper for greedy and exhaustive feature selection algorithms that selects the most valuable attributes according to a user-supplied evaluation function (the evaluator). This function is a reimplementation of FSelector's exhaustive.search and greedy.search.

feature_search(
  attributes,
  fun,
  data,
  mode = c("greedy", "exhaustive"),
  type = c("forward", "backward"),
  sizes = 1:length(attributes),
  parallel = TRUE,
  ...
)
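
For instance, a minimal hedged sketch of a default call (greedy forward search over the iris predictors; to_formula is the package helper used in the Examples below, and the R^2 scoring is an illustrative choice, not a prescribed one):

# Minimal usage sketch: score a subset by the R^2 of a linear model
# predicting Sepal.Length (hypothetical evaluator for illustration).
evaluator_R2 <- function(attributes, data, dependent = "Sepal.Length") {
  summary(lm(to_formula(attributes, dependent), data = data))$r.squared
}
fs <- feature_search(attributes = names(iris)[-1], fun = evaluator_R2,
                     data = iris)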

Arguments

attributes

A character vector of attribute names from which the most valuable features are to be selected.

fun

A function (the evaluator) used to score feature subsets at each iteration of the algorithm selected via mode. See Examples and the sketches after this argument list.

data

A data set passed to the fun function (evaluator).

mode

A character string that determines which search algorithm to perform. Default is "greedy".

type

Used when mode = "greedy" - whether to use the backward or the forward multiple-way search. Default is "forward".

sizes

Used when mode = "exhaustive" - a vector of sizes of attributes subsets.

parallel

Logical; whether to allow parallelization. Default is TRUE.

...

Other arguments passed to the foreach function.
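
For illustration, reusing the hypothetical evaluator_R2 from the sketch above, the type and sizes arguments select the search variant (judging by the exhaustive results in the Examples, higher scores are treated as better):

# Backward greedy search instead of the default forward search.
feature_search(attributes = names(iris)[-1], fun = evaluator_R2, data = iris,
               type = "backward")

# Exhaustive search restricted to subsets of exactly two or three attributes.
feature_search(attributes = names(iris)[-1], fun = evaluator_R2, data = iris,
               mode = "exhaustive", sizes = 2:3)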

Value

A list with the following components:

  • best - a data.frame with the best subset and its score (1 - feature used, 0 - feature not used),

  • all - a data.frame with all checked feature subsets and their scores (1 - feature used, 0 - feature not used),

  • data - the data used in the feature selection,

  • fun - the evaluator used to compute the importance scores for feature subsets,

  • call - the original call to feature_search,

  • mode - the mode used in the call.
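
For example, continuing the evaluator_R2 sketch above, the components can be inspected directly:

fs <- feature_search(attributes = names(iris)[-1], fun = evaluator_R2,
                     data = iris, mode = "exhaustive")
fs$best  # 0/1 indicator row for the winning subset plus its score
fs$all   # every evaluated subset with its score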

Details

The evaluator function passed via fun determines the importance score of the current feature subset. In the multiple-way (backward or forward) greedy algorithm the score serves as the stopping criterion, while in the exhaustive search, which checks all possible combinations of attribute subsets (of the sizes passed in sizes), it serves as the selection criterion.
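
For intuition only, here is a minimal sketch of the forward greedy idea under the maximization assumption noted above; this is an illustration, not the package's internal implementation:

# Sketch: forward greedy search. Start from the empty set; at each step add
# the attribute that most improves the score; stop when nothing improves it.
greedy_forward_sketch <- function(attributes, fun, data) {
  selected <- character(0)
  best_score <- -Inf
  repeat {
    candidates <- setdiff(attributes, selected)
    if (length(candidates) == 0) break
    # Score every one-attribute extension of the current subset.
    scores <- sapply(candidates, function(a) fun(c(selected, a), data))
    if (max(scores) <= best_score) break  # no improvement: stop
    best_score <- max(scores)
    selected <- c(selected, candidates[which.max(scores)])
  }
  list(subset = selected, score = best_score)
}

The exhaustive mode instead evaluates fun on every subset of each size in sizes and returns the highest-scoring one.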

Note

Note that the score depends on the evaluator provided via the fun parameter.

Author

Zygmunt Zawadzki zygmunt@zstat.pl

Krzysztof Slomczynski krzysztofslomczynski@gmail.com

Examples

# Enable parallelization in examples
if (FALSE) {
  library(doParallel)
  cl <- makeCluster(2)
  registerDoParallel(cl)
}
# Close at the end
# stopCluster(cl) #nolint
# registerDoSEQ() #nolint

# 1) Evaluator from FSelector package.
evaluator <- function(subset, data, dependent = names(iris)[5]) {
  library(rpart)
  k <- 5
  splits <- runif(nrow(data))
  results <- sapply(1:k, function(i) {
    test.idx <- (splits >= (i - 1) / k) & (splits < i / k)
    train.idx <- !test.idx
    test <- data[test.idx, , drop = FALSE]
    train <- data[train.idx, , drop = FALSE]
    tree <- rpart(to_formula(subset, dependent), train)
    error.rate <- sum(test[[dependent]] != predict(tree, test, type = "c")) /
      nrow(test)
    return(1 - error.rate)
  })
  return(mean(results))
}

set.seed(123)

# Default greedy search.
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris)
)
#>    user  system elapsed 
#>   0.275   0.000   0.281 
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris,
                 parallel = FALSE)
)
#>    user  system elapsed 
#>   0.242   0.000   0.242 

# Optional exhaustive search.
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris,
                 mode = "exhaustive")
)
#>    user  system elapsed 
#>   0.349   0.000   0.350 
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris,
                 mode = "exhaustive", parallel = FALSE)
)
#>    user  system elapsed 
#>   0.355   0.000   0.356 
# 2) Maximize the R^2 statistic in a linear regression problem.
evaluator_R2_lm <- function(attributes, data, dependent = names(iris)[1]) {
  summary(
    lm(to_formula(attributes, dependent), data = data)
  )$r.squared
}

feature_search(attributes = names(iris)[-1], fun = evaluator_R2_lm,
               data = iris, mode = "exhaustive")
#> $best
#>    Sepal.Width Petal.Length Petal.Width Species    values
#> 15           1            1           1       1 0.8673123
#> 
#> $all
#>    Sepal.Width Petal.Length Petal.Width Species     values
#> 1            1            0           0       0 0.01382265
#> 2            0            1           0       0  0.7599546
#> 3            0            0           1       0  0.6690277
#> 4            0            0           0       1  0.6187057
#> 5            1            1           0       0  0.8401778
#> 6            1            0           1       0  0.7072371
#> 7            1            0           0       1  0.7259066
#> 8            0            1           1       0  0.7662613
#> 9            0            1           0       1  0.8367238
#> 10           0            0           1       1  0.6693664
#> 11           1            1           1       0  0.8586117
#> 12           1            1           0       1  0.8633088
#> 13           1            0           1       1  0.7323845
#> 14           0            1           1       1  0.8367254
#> 15           1            1           1       1  0.8673123
#> 
#> $fun
#> function(attributes, data, dependent = names(iris)[1]) {
#>   summary(
#>     lm(to_formula(attributes, dependent), data = data)
#>   )$r.squared
#> }
#> <environment: 0x55f4ba661620>
#> 
#> $call
#> feature_search(attributes = names(iris)[-1], fun = evaluator_R2_lm, 
#>     data = iris, mode = "exhaustive")
#> 
#> $mode
#> [1] "exhaustive"
#> 
#> $data
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1            5.1         3.5          1.4         0.2  setosa
#> 2            4.9         3.0          1.4         0.2  setosa
#> 3            4.7         3.2          1.3         0.2  setosa
#> 4            4.6         3.1          1.5         0.2  setosa
#> 5            5.0         3.6          1.4         0.2  setosa
#> 6            5.4         3.9          1.7         0.4  setosa
#> ... (remaining 144 rows of the iris data set omitted)
# 3) Optimize the BIC criterion in a generalized linear model.
# "The aim of the Bayesian approach is to identify the model with the highest
# probability of being the true model." - Kuha 2004
utils::data(anorexia, package = "MASS")

evaluator_BIC_glm <- function(attributes, data, dependent = "Postwt") {
  extractAIC(
    fit = glm(to_formula(attributes, dependent), family = gaussian,
              data = data),
    k = log(nrow(data))
  )[2]
}

feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"),
               fun = evaluator_BIC_glm, data = anorexia, mode = "exhaustive")
#> $best
#>   Prewt Treat offset(Prewt)   values
#> 3     0     0             1 508.7408
#> 
#> $all
#>   Prewt Treat offset(Prewt)   values
#> 1     1     0             0 505.5144
#> 2     0     1             0 502.1123
#> 3     0     0             1 508.7408
#> 4     1     1             0   499.08
#> 5     1     0             1 505.5144
#> 6     0     1             1 506.7839
#> 7     1     1             1   499.08
#> 
#> $fun
#> function(attributes, data, dependent = "Postwt") {
#>   extractAIC(
#>     fit = glm(to_formula(attributes, dependent), family = gaussian,
#>               data = data),
#>     k = log(nrow(data))
#>   )[2]
#> }
#> <environment: 0x55f4ba661620>
#> 
#> $call
#> feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"), 
#>     fun = evaluator_BIC_glm, data = anorexia, mode = "exhaustive")
#> 
#> $mode
#> [1] "exhaustive"
#> 
#> $data
#>    Treat Prewt Postwt
#> 1   Cont  80.7   80.2
#> 2   Cont  89.4   80.1
#> 3   Cont  91.8   86.4
#> 4   Cont  74.0   86.3
#> 5   Cont  78.1   76.1
#> 6   Cont  88.3   78.1
#> ... (remaining 66 rows of the anorexia data set omitted)
# Close parallelization
if (FALSE) {
  stopCluster(cl)
  registerDoSEQ()
}