Discretize a range of numeric attributes in the dataset into nominal
attributes. The Minimum Description Length
(MDL) method is set as the default
control. The equalsizeControl and customBreaksControl
methods are also available.
discretize(
x,
y,
control = list(mdlControl(), equalsizeControl()),
all = TRUE,
discIntegers = TRUE,
call = NULL
)
mdlControl()
equalsizeControl(k = 10)
customBreaksControl(breaks)
Explanatory continuous variables to be discretized or a formula.
Dependent variable for supervised discretization or a data.frame when x
is a formula.
discretizationControl
object containing the parameters for the
discretization algorithm. Possible inputs so far are mdlControl,
equalsizeControl, or customBreaksControl.
If passed as a list, the first element is used.
Logical indicating if a returned data.frame should contain other features that were not discretized.
(Example: should Sepal.Width
be returned when you pass iris
and discretize Sepal.Length, Petal.Length, and Petal.Width?)
Logical value. If TRUE (default), integers are treated as numeric vectors and are discretized. If FALSE, integers are treated as factors and are left as is.
Keep as NULL. Inner method parameter for consistency.
Number of partitions.
custom breaks used for partitioning.
U. M. Fayyad and K. B. Irani. Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning. In 13th International Joint Conference on Artificial Intelligence (IJCAI93), pages 1022-1029, 1993.
# vectors
discretize(x = iris[[1]], y = iris[[5]])
#> Species Sepal.Length
#> 1 setosa (-Inf,5.55]
#> 2 setosa (-Inf,5.55]
#> 3 setosa (-Inf,5.55]
#> 4 setosa (-Inf,5.55]
#> 5 setosa (-Inf,5.55]
#> 6 setosa (-Inf,5.55]
#> 7 setosa (-Inf,5.55]
#> 8 setosa (-Inf,5.55]
#> 9 setosa (-Inf,5.55]
#> 10 setosa (-Inf,5.55]
#> 11 setosa (-Inf,5.55]
#> 12 setosa (-Inf,5.55]
#> 13 setosa (-Inf,5.55]
#> 14 setosa (-Inf,5.55]
#> 15 setosa (5.55,6.15]
#> 16 setosa (5.55,6.15]
#> 17 setosa (-Inf,5.55]
#> 18 setosa (-Inf,5.55]
#> 19 setosa (5.55,6.15]
#> 20 setosa (-Inf,5.55]
#> 21 setosa (-Inf,5.55]
#> 22 setosa (-Inf,5.55]
#> 23 setosa (-Inf,5.55]
#> 24 setosa (-Inf,5.55]
#> 25 setosa (-Inf,5.55]
#> 26 setosa (-Inf,5.55]
#> 27 setosa (-Inf,5.55]
#> 28 setosa (-Inf,5.55]
#> 29 setosa (-Inf,5.55]
#> 30 setosa (-Inf,5.55]
#> 31 setosa (-Inf,5.55]
#> 32 setosa (-Inf,5.55]
#> 33 setosa (-Inf,5.55]
#> 34 setosa (-Inf,5.55]
#> 35 setosa (-Inf,5.55]
#> 36 setosa (-Inf,5.55]
#> 37 setosa (-Inf,5.55]
#> 38 setosa (-Inf,5.55]
#> 39 setosa (-Inf,5.55]
#> 40 setosa (-Inf,5.55]
#> 41 setosa (-Inf,5.55]
#> 42 setosa (-Inf,5.55]
#> 43 setosa (-Inf,5.55]
#> 44 setosa (-Inf,5.55]
#> 45 setosa (-Inf,5.55]
#> 46 setosa (-Inf,5.55]
#> 47 setosa (-Inf,5.55]
#> 48 setosa (-Inf,5.55]
#> 49 setosa (-Inf,5.55]
#> 50 setosa (-Inf,5.55]
#> 51 versicolor (6.15, Inf]
#> 52 versicolor (6.15, Inf]
#> 53 versicolor (6.15, Inf]
#> 54 versicolor (-Inf,5.55]
#> 55 versicolor (6.15, Inf]
#> 56 versicolor (5.55,6.15]
#> 57 versicolor (6.15, Inf]
#> 58 versicolor (-Inf,5.55]
#> 59 versicolor (6.15, Inf]
#> 60 versicolor (-Inf,5.55]
#> 61 versicolor (-Inf,5.55]
#> 62 versicolor (5.55,6.15]
#> 63 versicolor (5.55,6.15]
#> 64 versicolor (5.55,6.15]
#> 65 versicolor (5.55,6.15]
#> 66 versicolor (6.15, Inf]
#> 67 versicolor (5.55,6.15]
#> 68 versicolor (5.55,6.15]
#> 69 versicolor (6.15, Inf]
#> 70 versicolor (5.55,6.15]
#> 71 versicolor (5.55,6.15]
#> 72 versicolor (5.55,6.15]
#> 73 versicolor (6.15, Inf]
#> 74 versicolor (5.55,6.15]
#> 75 versicolor (6.15, Inf]
#> 76 versicolor (6.15, Inf]
#> 77 versicolor (6.15, Inf]
#> 78 versicolor (6.15, Inf]
#> 79 versicolor (5.55,6.15]
#> 80 versicolor (5.55,6.15]
#> 81 versicolor (-Inf,5.55]
#> 82 versicolor (-Inf,5.55]
#> 83 versicolor (5.55,6.15]
#> 84 versicolor (5.55,6.15]
#> 85 versicolor (-Inf,5.55]
#> 86 versicolor (5.55,6.15]
#> 87 versicolor (6.15, Inf]
#> 88 versicolor (6.15, Inf]
#> 89 versicolor (5.55,6.15]
#> 90 versicolor (-Inf,5.55]
#> 91 versicolor (-Inf,5.55]
#> 92 versicolor (5.55,6.15]
#> 93 versicolor (5.55,6.15]
#> 94 versicolor (-Inf,5.55]
#> 95 versicolor (5.55,6.15]
#> 96 versicolor (5.55,6.15]
#> 97 versicolor (5.55,6.15]
#> 98 versicolor (6.15, Inf]
#> 99 versicolor (-Inf,5.55]
#> 100 versicolor (5.55,6.15]
#> 101 virginica (6.15, Inf]
#> 102 virginica (5.55,6.15]
#> 103 virginica (6.15, Inf]
#> 104 virginica (6.15, Inf]
#> 105 virginica (6.15, Inf]
#> 106 virginica (6.15, Inf]
#> 107 virginica (-Inf,5.55]
#> 108 virginica (6.15, Inf]
#> 109 virginica (6.15, Inf]
#> 110 virginica (6.15, Inf]
#> 111 virginica (6.15, Inf]
#> 112 virginica (6.15, Inf]
#> 113 virginica (6.15, Inf]
#> 114 virginica (5.55,6.15]
#> 115 virginica (5.55,6.15]
#> 116 virginica (6.15, Inf]
#> 117 virginica (6.15, Inf]
#> 118 virginica (6.15, Inf]
#> 119 virginica (6.15, Inf]
#> 120 virginica (5.55,6.15]
#> 121 virginica (6.15, Inf]
#> 122 virginica (5.55,6.15]
#> 123 virginica (6.15, Inf]
#> 124 virginica (6.15, Inf]
#> 125 virginica (6.15, Inf]
#> 126 virginica (6.15, Inf]
#> 127 virginica (6.15, Inf]
#> 128 virginica (5.55,6.15]
#> 129 virginica (6.15, Inf]
#> 130 virginica (6.15, Inf]
#> 131 virginica (6.15, Inf]
#> 132 virginica (6.15, Inf]
#> 133 virginica (6.15, Inf]
#> 134 virginica (6.15, Inf]
#> 135 virginica (5.55,6.15]
#> 136 virginica (6.15, Inf]
#> 137 virginica (6.15, Inf]
#> 138 virginica (6.15, Inf]
#> 139 virginica (5.55,6.15]
#> 140 virginica (6.15, Inf]
#> 141 virginica (6.15, Inf]
#> 142 virginica (6.15, Inf]
#> 143 virginica (5.55,6.15]
#> 144 virginica (6.15, Inf]
#> 145 virginica (6.15, Inf]
#> 146 virginica (6.15, Inf]
#> 147 virginica (6.15, Inf]
#> 148 virginica (6.15, Inf]
#> 149 virginica (6.15, Inf]
#> 150 virginica (5.55,6.15]
# list and vector
head(discretize(x = list(iris[[1]], iris$Sepal.Width), y = iris$Species))
#> Species Sepal.Length Sepal.Width
#> 1 setosa (-Inf,5.55] (3.35, Inf]
#> 2 setosa (-Inf,5.55] (2.95,3.35]
#> 3 setosa (-Inf,5.55] (2.95,3.35]
#> 4 setosa (-Inf,5.55] (2.95,3.35]
#> 5 setosa (-Inf,5.55] (3.35, Inf]
#> 6 setosa (-Inf,5.55] (3.35, Inf]
# formula input
head(discretize(x = Species ~ ., y = iris))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 (-Inf,5.55] (3.35, Inf] (-Inf,2.45] (-Inf,0.8] setosa
#> 2 (-Inf,5.55] (2.95,3.35] (-Inf,2.45] (-Inf,0.8] setosa
#> 3 (-Inf,5.55] (2.95,3.35] (-Inf,2.45] (-Inf,0.8] setosa
#> 4 (-Inf,5.55] (2.95,3.35] (-Inf,2.45] (-Inf,0.8] setosa
#> 5 (-Inf,5.55] (3.35, Inf] (-Inf,2.45] (-Inf,0.8] setosa
#> 6 (-Inf,5.55] (3.35, Inf] (-Inf,2.45] (-Inf,0.8] setosa
head(discretize(Species ~ ., iris))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 (-Inf,5.55] (3.35, Inf] (-Inf,2.45] (-Inf,0.8] setosa
#> 2 (-Inf,5.55] (2.95,3.35] (-Inf,2.45] (-Inf,0.8] setosa
#> 3 (-Inf,5.55] (2.95,3.35] (-Inf,2.45] (-Inf,0.8] setosa
#> 4 (-Inf,5.55] (2.95,3.35] (-Inf,2.45] (-Inf,0.8] setosa
#> 5 (-Inf,5.55] (3.35, Inf] (-Inf,2.45] (-Inf,0.8] setosa
#> 6 (-Inf,5.55] (3.35, Inf] (-Inf,2.45] (-Inf,0.8] setosa
# use different methods for specific columns
ir1 <- discretize(Species ~ Sepal.Length, iris)
ir2 <- discretize(Species ~ Sepal.Width, ir1, control = equalsizeControl(3))
ir3 <- discretize(Species ~ Petal.Length, ir2, control = equalsizeControl(5))
head(ir3)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 (-Inf,5.55] (3.2, Inf] (-Inf,1.5] 0.2 setosa
#> 2 (-Inf,5.55] (2.9,3.2] (-Inf,1.5] 0.2 setosa
#> 3 (-Inf,5.55] (3.2, Inf] (-Inf,1.5] 0.2 setosa
#> 4 (-Inf,5.55] (2.9,3.2] (-Inf,1.5] 0.2 setosa
#> 5 (-Inf,5.55] (3.2, Inf] (-Inf,1.5] 0.2 setosa
#> 6 (-Inf,5.55] (3.2, Inf] (1.5,3.9] 0.4 setosa
# custom breaks
ir <- discretize(Species ~ Sepal.Length, iris,
control = customBreaksControl(breaks = c(0, 2, 5, 7.5, 10)))
head(ir)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 (5,7.5] 3.5 1.4 0.2 setosa
#> 2 (2,5] 3.0 1.4 0.2 setosa
#> 3 (2,5] 3.2 1.3 0.2 setosa
#> 4 (2,5] 3.1 1.5 0.2 setosa
#> 5 (2,5] 3.6 1.4 0.2 setosa
#> 6 (5,7.5] 3.9 1.7 0.4 setosa
if (FALSE) { # \dontrun{
# Same results
library(RWeka)
Rweka_disc_out <- RWeka::Discretize(Species ~ Sepal.Length, iris)[, 1]
FSelectorRcpp_disc_out <- FSelectorRcpp::discretize(Species ~ Sepal.Length,
iris)[, 1]
table(Rweka_disc_out, FSelectorRcpp_disc_out)
# But a faster method
library(microbenchmark)
microbenchmark(FSelectorRcpp::discretize(Species ~ Sepal.Length, iris),
RWeka::Discretize(Species ~ Sepal.Length, iris))
} # }