A convenience wrapper for greedy and exhaustive feature selection algorithms that selects the most valuable attributes according to a user-supplied evaluation function (the evaluator). This function is a reimplementation of FSelector's exhaustive.search and greedy.search.

feature_search(
  attributes,
  fun,
  data,
  mode = c("greedy", "exhaustive"),
  type = c("forward", "backward"),
  sizes = 1:length(attributes),
  parallel = TRUE,
  ...
)
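
For instance, a minimal hedged sketch of a default call (greedy forward search over the iris predictors; to_formula is the package helper used in the Examples below, and the R^2 scoring is an illustrative choice, not a prescribed one):

# Minimal usage sketch: score a subset by the R^2 of a linear model
# predicting Sepal.Length (hypothetical evaluator for illustration).
evaluator_R2 <- function(attributes, data, dependent = "Sepal.Length") {
  summary(lm(to_formula(attributes, dependent), data = data))$r.squared
}
fs <- feature_search(attributes = names(iris)[-1], fun = evaluator_R2,
                     data = iris)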

Arguments

attributes

A character vector of attribute names from which the most valuable features are to be selected.

fun

A function (the evaluator) used to score feature subsets at each iteration of the algorithm selected via mode. See Examples and the sketches after this argument list.

data

A data set passed to the fun function (evaluator).

mode

A character string that determines which search algorithm to perform. Default is "greedy".

type

Used when mode = "greedy" - whether to use the backward or the forward multiple-way search. Default is "forward".

sizes

Used when mode = "exhaustive" - a vector of sizes of attributes subsets.

parallel

Logical; whether to allow parallelization. Default is TRUE.

...

Other arguments passed to the foreach function.
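
For illustration, reusing the hypothetical evaluator_R2 from the sketch above, the type and sizes arguments select the search variant (judging by the exhaustive results in the Examples, higher scores are treated as better):

# Backward greedy search instead of the default forward search.
feature_search(attributes = names(iris)[-1], fun = evaluator_R2, data = iris,
               type = "backward")

# Exhaustive search restricted to subsets of exactly two or three attributes.
feature_search(attributes = names(iris)[-1], fun = evaluator_R2, data = iris,
               mode = "exhaustive", sizes = 2:3)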

Value

A list with the following components:

  • best - a data.frame with the best subset and its score (1 - feature used, 0 - feature not used),

  • all - a data.frame with all checked feature subsets and their scores (1 - feature used, 0 - feature not used),

  • data - the data used in the feature selection,

  • fun - the evaluator used to compute the importance scores for feature subsets,

  • call - the original call to feature_search,

  • mode - the mode used in the call.
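
For example, continuing the evaluator_R2 sketch above, the components can be inspected directly:

fs <- feature_search(attributes = names(iris)[-1], fun = evaluator_R2,
                     data = iris, mode = "exhaustive")
fs$best  # 0/1 indicator row for the winning subset plus its score
fs$all   # every evaluated subset with its score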

Details

The evaluator function passed via fun determines the importance score of the current feature subset. In the multiple-way (backward or forward) greedy algorithm the score serves as the stopping criterion, while in the exhaustive search, which checks all possible combinations of attribute subsets (of the sizes passed in sizes), it serves as the selection criterion.
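
For intuition only, here is a minimal sketch of the forward greedy idea under the maximization assumption noted above; this is an illustration, not the package's internal implementation:

# Sketch: forward greedy search. Start from the empty set; at each step add
# the attribute that most improves the score; stop when nothing improves it.
greedy_forward_sketch <- function(attributes, fun, data) {
  selected <- character(0)
  best_score <- -Inf
  repeat {
    candidates <- setdiff(attributes, selected)
    if (length(candidates) == 0) break
    # Score every one-attribute extension of the current subset.
    scores <- sapply(candidates, function(a) fun(c(selected, a), data))
    if (max(scores) <= best_score) break  # no improvement: stop
    best_score <- max(scores)
    selected <- c(selected, candidates[which.max(scores)])
  }
  list(subset = selected, score = best_score)
}

The exhaustive mode instead evaluates fun on every subset of each size in sizes and returns the highest-scoring one.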

Note

Note that the score depends on the evaluator provided via the fun parameter.

Author

Zygmunt Zawadzki zygmunt@zstat.pl

Krzysztof Slomczynski krzysztofslomczynski@gmail.com

Examples

# Enable parallelization in examples
if (FALSE) {
  library(doParallel)
  cl <- makeCluster(2)
  registerDoParallel(cl)
}
# Close at the end
# stopCluster(cl) #nolint
# registerDoSEQ() #nolint

# 1) Evaluator from FSelector package.
evaluator <- function(subset, data, dependent = names(iris)[5]) {
  library(rpart)
  k <- 5
  splits <- runif(nrow(data))
  results <- sapply(1:k, function(i) {
    test.idx <- (splits >= (i - 1) / k) & (splits < i / k)
    train.idx <- !test.idx
    test <- data[test.idx, , drop = FALSE]
    train <- data[train.idx, , drop = FALSE]
    tree <- rpart(to_formula(subset, dependent), train)
    error.rate <- sum(test[[dependent]] != predict(tree, test, type = "c")) /
      nrow(test)
    return(1 - error.rate)
  })
  return(mean(results))
}

set.seed(123)

# Default greedy search.
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris)
)
#>    user  system elapsed 
#>   0.275   0.000   0.281 
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris,
                 parallel = FALSE)
)
#>    user  system elapsed 
#>   0.242   0.000   0.242 

# Optional exhaustive search.
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris,
                 mode = "exhaustive")
)
#>    user  system elapsed 
#>   0.349   0.000   0.350 
system.time(
  feature_search(attributes = names(iris)[-5], fun = evaluator, data = iris,
                 mode = "exhaustive", parallel = FALSE)
)
#>    user  system elapsed 
#>   0.355   0.000   0.356 
# 2) Maximize the R^2 statistic in a linear regression problem.
evaluator_R2_lm <- function(attributes, data, dependent = names(iris)[1]) {
  summary(
    lm(to_formula(attributes, dependent), data = data)
  )$r.squared
}

feature_search(attributes = names(iris)[-1], fun = evaluator_R2_lm,
               data = iris, mode = "exhaustive")
#> $best
#>    Sepal.Width Petal.Length Petal.Width Species    values
#> 15           1            1           1       1 0.8673123
#> 
#> $all
#>    Sepal.Width Petal.Length Petal.Width Species     values
#> 1            1            0           0       0 0.01382265
#> 2            0            1           0       0  0.7599546
#> 3            0            0           1       0  0.6690277
#> 4            0            0           0       1  0.6187057
#> 5            1            1           0       0  0.8401778
#> 6            1            0           1       0  0.7072371
#> 7            1            0           0       1  0.7259066
#> 8            0            1           1       0  0.7662613
#> 9            0            1           0       1  0.8367238
#> 10           0            0           1       1  0.6693664
#> 11           1            1           1       0  0.8586117
#> 12           1            1           0       1  0.8633088
#> 13           1            0           1       1  0.7323845
#> 14           0            1           1       1  0.8367254
#> 15           1            1           1       1  0.8673123
#> 
#> $fun
#> function(attributes, data, dependent = names(iris)[1]) {
#>   summary(
#>     lm(to_formula(attributes, dependent), data = data)
#>   )$r.squared
#> }
#> <environment: 0x55f4ba661620>
#> 
#> $call
#> feature_search(attributes = names(iris)[-1], fun = evaluator_R2_lm, 
#>     data = iris, mode = "exhaustive")
#> 
#> $mode
#> [1] "exhaustive"
#> 
#> $data
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1            5.1         3.5          1.4         0.2  setosa
#> 2            4.9         3.0          1.4         0.2  setosa
#> 3            4.7         3.2          1.3         0.2  setosa
#> 4            4.6         3.1          1.5         0.2  setosa
#> 5            5.0         3.6          1.4         0.2  setosa
#> 6            5.4         3.9          1.7         0.4  setosa
#> ... (remaining 144 rows of the iris data set omitted)
# 3) Optimize the BIC criterion in a generalized linear model.
# "The aim of the Bayesian approach is to identify the model with the highest
# probability of being the true model." - Kuha 2004
utils::data(anorexia, package = "MASS")

evaluator_BIC_glm <- function(attributes, data, dependent = "Postwt") {
  extractAIC(
    fit = glm(to_formula(attributes, dependent), family = gaussian,
              data = data),
    k = log(nrow(data))
  )[2]
}

feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"),
               fun = evaluator_BIC_glm, data = anorexia, mode = "exhaustive")
#> $best
#>   Prewt Treat offset(Prewt)   values
#> 3     0     0             1 508.7408
#> 
#> $all
#>   Prewt Treat offset(Prewt)   values
#> 1     1     0             0 505.5144
#> 2     0     1             0 502.1123
#> 3     0     0             1 508.7408
#> 4     1     1             0   499.08
#> 5     1     0             1 505.5144
#> 6     0     1             1 506.7839
#> 7     1     1             1   499.08
#> 
#> $fun
#> function(attributes, data, dependent = "Postwt") {
#>   extractAIC(
#>     fit = glm(to_formula(attributes, dependent), family = gaussian,
#>               data = data),
#>     k = log(nrow(data))
#>   )[2]
#> }
#> <environment: 0x55f4ba661620>
#> 
#> $call
#> feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"), 
#>     fun = evaluator_BIC_glm, data = anorexia, mode = "exhaustive")
#> 
#> $mode
#> [1] "exhaustive"
#> 
#> $data
#>    Treat Prewt Postwt
#> 1   Cont  80.7   80.2
#> 2   Cont  89.4   80.1
#> 3   Cont  91.8   86.4
#> 4   Cont  74.0   86.3
#> 5   Cont  78.1   76.1
#> 6   Cont  88.3   78.1
#> ... (remaining 66 rows of the anorexia data set omitted)
# Close parallelization
if (FALSE) {
  stopCluster(cl)
  registerDoSEQ()
}