Discretize a range of numeric attributes in the dataset into nominal attributes. The Minimum Description Length (MDL) method is used as the default control. An equal-size method (equalsizeControl) is also available.

discretize(
  x,
  y,
  control = list(mdlControl(), equalsizeControl()),
  all = TRUE,
  discIntegers = TRUE,
  call = NULL
)

mdlControl()

equalsizeControl(k = 10)

customBreaksControl(breaks)

Arguments

x

Explanatory continuous variables to be discretized or a formula.

y

Dependent variable for supervised discretization or a data.frame when x is a formula.

control

discretizationControl object containing the parameters for the discretization algorithm. Possible inputs are currently mdlControl, equalsizeControl or customBreaksControl. If passed as a list, the first element is used.

all

Logical indicating if a returned data.frame should contain other features that were not discretized. (Example: should Sepal.Width be returned, when you pass iris and discretize Sepal.Length, Petal.Length, Petal.Width.)

discIntegers

Logical value. If TRUE (default), integers are treated as numeric vectors and are discretized. If FALSE, integers are treated as factors and are left as is.

call

Keep as NULL. Inner method parameter for consistency.

k

Number of partitions.

breaks

Custom breaks used for partitioning.

References

U. M. Fayyad and K. B. Irani. Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning. In 13th International Joint Conference on Artificial Intelligence (IJCAI-93), pages 1022-1029, 1993.

Author

Zygmunt Zawadzki zygmunt@zstat.pl

Examples


# vectors
discretize(x = iris[[1]], y = iris[[5]])
#>        Species Sepal.Length
#> 1       setosa  (-Inf,5.55]
#> 2       setosa  (-Inf,5.55]
#> 3       setosa  (-Inf,5.55]
#> 4       setosa  (-Inf,5.55]
#> 5       setosa  (-Inf,5.55]
#> 6       setosa  (-Inf,5.55]
#> 7       setosa  (-Inf,5.55]
#> 8       setosa  (-Inf,5.55]
#> 9       setosa  (-Inf,5.55]
#> 10      setosa  (-Inf,5.55]
#> 11      setosa  (-Inf,5.55]
#> 12      setosa  (-Inf,5.55]
#> 13      setosa  (-Inf,5.55]
#> 14      setosa  (-Inf,5.55]
#> 15      setosa  (5.55,6.15]
#> 16      setosa  (5.55,6.15]
#> 17      setosa  (-Inf,5.55]
#> 18      setosa  (-Inf,5.55]
#> 19      setosa  (5.55,6.15]
#> 20      setosa  (-Inf,5.55]
#> 21      setosa  (-Inf,5.55]
#> 22      setosa  (-Inf,5.55]
#> 23      setosa  (-Inf,5.55]
#> 24      setosa  (-Inf,5.55]
#> 25      setosa  (-Inf,5.55]
#> 26      setosa  (-Inf,5.55]
#> 27      setosa  (-Inf,5.55]
#> 28      setosa  (-Inf,5.55]
#> 29      setosa  (-Inf,5.55]
#> 30      setosa  (-Inf,5.55]
#> 31      setosa  (-Inf,5.55]
#> 32      setosa  (-Inf,5.55]
#> 33      setosa  (-Inf,5.55]
#> 34      setosa  (-Inf,5.55]
#> 35      setosa  (-Inf,5.55]
#> 36      setosa  (-Inf,5.55]
#> 37      setosa  (-Inf,5.55]
#> 38      setosa  (-Inf,5.55]
#> 39      setosa  (-Inf,5.55]
#> 40      setosa  (-Inf,5.55]
#> 41      setosa  (-Inf,5.55]
#> 42      setosa  (-Inf,5.55]
#> 43      setosa  (-Inf,5.55]
#> 44      setosa  (-Inf,5.55]
#> 45      setosa  (-Inf,5.55]
#> 46      setosa  (-Inf,5.55]
#> 47      setosa  (-Inf,5.55]
#> 48      setosa  (-Inf,5.55]
#> 49      setosa  (-Inf,5.55]
#> 50      setosa  (-Inf,5.55]
#> 51  versicolor  (6.15, Inf]
#> 52  versicolor  (6.15, Inf]
#> 53  versicolor  (6.15, Inf]
#> 54  versicolor  (-Inf,5.55]
#> 55  versicolor  (6.15, Inf]
#> 56  versicolor  (5.55,6.15]
#> 57  versicolor  (6.15, Inf]
#> 58  versicolor  (-Inf,5.55]
#> 59  versicolor  (6.15, Inf]
#> 60  versicolor  (-Inf,5.55]
#> 61  versicolor  (-Inf,5.55]
#> 62  versicolor  (5.55,6.15]
#> 63  versicolor  (5.55,6.15]
#> 64  versicolor  (5.55,6.15]
#> 65  versicolor  (5.55,6.15]
#> 66  versicolor  (6.15, Inf]
#> 67  versicolor  (5.55,6.15]
#> 68  versicolor  (5.55,6.15]
#> 69  versicolor  (6.15, Inf]
#> 70  versicolor  (5.55,6.15]
#> 71  versicolor  (5.55,6.15]
#> 72  versicolor  (5.55,6.15]
#> 73  versicolor  (6.15, Inf]
#> 74  versicolor  (5.55,6.15]
#> 75  versicolor  (6.15, Inf]
#> 76  versicolor  (6.15, Inf]
#> 77  versicolor  (6.15, Inf]
#> 78  versicolor  (6.15, Inf]
#> 79  versicolor  (5.55,6.15]
#> 80  versicolor  (5.55,6.15]
#> 81  versicolor  (-Inf,5.55]
#> 82  versicolor  (-Inf,5.55]
#> 83  versicolor  (5.55,6.15]
#> 84  versicolor  (5.55,6.15]
#> 85  versicolor  (-Inf,5.55]
#> 86  versicolor  (5.55,6.15]
#> 87  versicolor  (6.15, Inf]
#> 88  versicolor  (6.15, Inf]
#> 89  versicolor  (5.55,6.15]
#> 90  versicolor  (-Inf,5.55]
#> 91  versicolor  (-Inf,5.55]
#> 92  versicolor  (5.55,6.15]
#> 93  versicolor  (5.55,6.15]
#> 94  versicolor  (-Inf,5.55]
#> 95  versicolor  (5.55,6.15]
#> 96  versicolor  (5.55,6.15]
#> 97  versicolor  (5.55,6.15]
#> 98  versicolor  (6.15, Inf]
#> 99  versicolor  (-Inf,5.55]
#> 100 versicolor  (5.55,6.15]
#> 101  virginica  (6.15, Inf]
#> 102  virginica  (5.55,6.15]
#> 103  virginica  (6.15, Inf]
#> 104  virginica  (6.15, Inf]
#> 105  virginica  (6.15, Inf]
#> 106  virginica  (6.15, Inf]
#> 107  virginica  (-Inf,5.55]
#> 108  virginica  (6.15, Inf]
#> 109  virginica  (6.15, Inf]
#> 110  virginica  (6.15, Inf]
#> 111  virginica  (6.15, Inf]
#> 112  virginica  (6.15, Inf]
#> 113  virginica  (6.15, Inf]
#> 114  virginica  (5.55,6.15]
#> 115  virginica  (5.55,6.15]
#> 116  virginica  (6.15, Inf]
#> 117  virginica  (6.15, Inf]
#> 118  virginica  (6.15, Inf]
#> 119  virginica  (6.15, Inf]
#> 120  virginica  (5.55,6.15]
#> 121  virginica  (6.15, Inf]
#> 122  virginica  (5.55,6.15]
#> 123  virginica  (6.15, Inf]
#> 124  virginica  (6.15, Inf]
#> 125  virginica  (6.15, Inf]
#> 126  virginica  (6.15, Inf]
#> 127  virginica  (6.15, Inf]
#> 128  virginica  (5.55,6.15]
#> 129  virginica  (6.15, Inf]
#> 130  virginica  (6.15, Inf]
#> 131  virginica  (6.15, Inf]
#> 132  virginica  (6.15, Inf]
#> 133  virginica  (6.15, Inf]
#> 134  virginica  (6.15, Inf]
#> 135  virginica  (5.55,6.15]
#> 136  virginica  (6.15, Inf]
#> 137  virginica  (6.15, Inf]
#> 138  virginica  (6.15, Inf]
#> 139  virginica  (5.55,6.15]
#> 140  virginica  (6.15, Inf]
#> 141  virginica  (6.15, Inf]
#> 142  virginica  (6.15, Inf]
#> 143  virginica  (5.55,6.15]
#> 144  virginica  (6.15, Inf]
#> 145  virginica  (6.15, Inf]
#> 146  virginica  (6.15, Inf]
#> 147  virginica  (6.15, Inf]
#> 148  virginica  (6.15, Inf]
#> 149  virginica  (6.15, Inf]
#> 150  virginica  (5.55,6.15]

# list and vector
head(discretize(x = list(iris[[1]], iris$Sepal.Width), y = iris$Species))
#>   Species Sepal.Length Sepal.Width
#> 1  setosa  (-Inf,5.55] (3.35, Inf]
#> 2  setosa  (-Inf,5.55] (2.95,3.35]
#> 3  setosa  (-Inf,5.55] (2.95,3.35]
#> 4  setosa  (-Inf,5.55] (2.95,3.35]
#> 5  setosa  (-Inf,5.55] (3.35, Inf]
#> 6  setosa  (-Inf,5.55] (3.35, Inf]

# formula input
head(discretize(x = Species ~ ., y = iris))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1  (-Inf,5.55] (3.35, Inf]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 2  (-Inf,5.55] (2.95,3.35]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 3  (-Inf,5.55] (2.95,3.35]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 4  (-Inf,5.55] (2.95,3.35]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 5  (-Inf,5.55] (3.35, Inf]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 6  (-Inf,5.55] (3.35, Inf]  (-Inf,2.45]  (-Inf,0.8]  setosa
head(discretize(Species ~ ., iris))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1  (-Inf,5.55] (3.35, Inf]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 2  (-Inf,5.55] (2.95,3.35]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 3  (-Inf,5.55] (2.95,3.35]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 4  (-Inf,5.55] (2.95,3.35]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 5  (-Inf,5.55] (3.35, Inf]  (-Inf,2.45]  (-Inf,0.8]  setosa
#> 6  (-Inf,5.55] (3.35, Inf]  (-Inf,2.45]  (-Inf,0.8]  setosa

# use different methods for specific columns
ir1 <- discretize(Species ~ Sepal.Length, iris)
ir2 <- discretize(Species ~ Sepal.Width, ir1, control = equalsizeControl(3))
ir3 <- discretize(Species ~ Petal.Length, ir2, control = equalsizeControl(5))
head(ir3)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1  (-Inf,5.55]  (3.2, Inf]   (-Inf,1.5]         0.2  setosa
#> 2  (-Inf,5.55]   (2.9,3.2]   (-Inf,1.5]         0.2  setosa
#> 3  (-Inf,5.55]  (3.2, Inf]   (-Inf,1.5]         0.2  setosa
#> 4  (-Inf,5.55]   (2.9,3.2]   (-Inf,1.5]         0.2  setosa
#> 5  (-Inf,5.55]  (3.2, Inf]   (-Inf,1.5]         0.2  setosa
#> 6  (-Inf,5.55]  (3.2, Inf]    (1.5,3.9]         0.4  setosa

# custom breaks
ir <- discretize(Species ~ Sepal.Length, iris,
  control = customBreaksControl(breaks = c(0, 2, 5, 7.5, 10)))
head(ir)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1      (5,7.5]         3.5          1.4         0.2  setosa
#> 2        (2,5]         3.0          1.4         0.2  setosa
#> 3        (2,5]         3.2          1.3         0.2  setosa
#> 4        (2,5]         3.1          1.5         0.2  setosa
#> 5        (2,5]         3.6          1.4         0.2  setosa
#> 6      (5,7.5]         3.9          1.7         0.4  setosa

if (FALSE) { # \dontrun{
# Same results
library(RWeka)
Rweka_disc_out <- RWeka::Discretize(Species ~ Sepal.Length, iris)[, 1]
FSelectorRcpp_disc_out <- FSelectorRcpp::discretize(Species ~ Sepal.Length,
                                                    iris)[, 1]
table(Rweka_disc_out, FSelectorRcpp_disc_out)
# But faster method
library(microbenchmark)
microbenchmark(FSelectorRcpp::discretize(Species ~ Sepal.Length, iris),
               RWeka::Discretize(Species ~ Sepal.Length, iris))

} # }