3 Commits 5cdaee29c3 ... dace1c01bc

Author SHA1 Message Date
  Miron B. Kursa dace1c01bc Merge branch 'newEx' into devel 2 years ago
  Miron B. Kursa 84e779e47d Examples and tests use srx 2 years ago
  Miron B. Kursa c07ff0f50e Srx dataset 2 years ago
8 changed files with 118 additions and 25 deletions
  1. 37 5
      R/Boruta.R
  2. 1 1
      R/importance.R
  3. 16 0
      data/srx.R
  4. 28 7
      man/Boruta.Rd
  5. 3 3
      man/plotImpHistory.Rd
  6. 22 0
      man/srx.Rd
  7. 6 0
      tests/testthat/testBasic.R
  8. 5 9
      tests/testthat/testSources.R

+ 37 - 5
R/Boruta.R

@@ -50,17 +50,37 @@ Boruta<-function(x,...)
 #' @export
 #' @examples
 #' set.seed(777)
-#' #Add some nonsense attributes to iris dataset by shuffling original attributes
+#'
+#' #Boruta on the "small redundant XOR" problem; read ?srx for details
+#' data(srx)
+#' Boruta(Y~.,data=srx)->Boruta.srx
+#'
+#' #Results summary
+#' print(Boruta.srx)
+#'
+#' #Result plot
+#' plot(Boruta.srx)
+#'
+#' #Attribute statistics
+#' attStats(Boruta.srx)
+#'
+#' #Using alternative importance source, rFerns
+#' Boruta(Y~.,data=srx,getImp=getImpFerns)->Boruta.srx.ferns
+#' print(Boruta.srx.ferns)
+#' 
+#' #Verbose
+#' Boruta(Y~.,data=srx,doTrace=2)->Boruta.srx
+#'
+#' \dontrun{
+#' #Boruta on the iris problem extended with artificial irrelevant features
+#' #Generate said features
 #' iris.extended<-data.frame(iris,apply(iris[,-5],2,sample))
 #' names(iris.extended)[6:9]<-paste("Nonsense",1:4,sep="")
 #' #Run Boruta on this data
 #' Boruta(Species~.,data=iris.extended,doTrace=2)->Boruta.iris.extended
 #' #Nonsense attributes should be rejected
 #' print(Boruta.iris.extended)
-#'
-#' #Boruta using rFerns' importance
-#' Boruta(Species~.,data=iris.extended,getImp=getImpFerns)->Boruta.ferns.irisE
-#' print(Boruta.ferns.irisE)
+#' }
 #'
 #' \dontrun{
 #' #Boruta on the HouseVotes84 data from mlbench
@@ -295,3 +315,15 @@ print.Boruta<-function(x,...){
  }
  invisible(x)
 }
+
+#' Small redundant XOR data
+#'
+#' A synthetic data set with 32 rows corresponding to all combinations of values of five logical features, A, B, N1, N2 and N3.
+#' The decision Y is equal to A xor B, hence N1--N3 are irrelevant attributes.
+#' The set also contains 3 additional features, A or B (AoB), A and B (AnB) and not A (nA), which provide a redundant, but still relevant way to reconstruct Y.
+#'
+#' This set is an easy way to demonstrate the difference between all relevant feature selection methods, which should select all features except N1--N3, and minimal optimal ones, which will probably ignore most of them.
+#' @format A data frame with 8 predictors, 5 relevant: A, B, AoB, AnB and nA, as well as 3 irrelevant N1, N2 and N3, and decision attribute Y.
+#' @source \url{https://mbq.me/blog/relevance-and-redundancy}
+"srx"
+

+ 1 - 1
R/importance.R

@@ -201,7 +201,7 @@ getImpXgboost<-function(x,y,nrounds=5,verbose=0,...){
    verbose=verbose
   )
  )->imp
- setNames(rep(0,ncol(x)),colnames(x))->ans
+ stats::setNames(rep(0,ncol(x)),colnames(x))->ans
  ans[imp$Feature]<-imp$Gain
  ans
 }

+ 16 - 0
data/srx.R

@@ -0,0 +1,16 @@
+# Small XOR problem data
+# documentation in R/Boruta.R
+
+stats::setNames(
+ do.call(expand.grid,rep(list(c(TRUE,FALSE)),5)),
+ c("A","B","N1","N2","N3")
+)->srx
+data.frame(
+ srx,
+ AoB=with(srx,A|B),
+ AnB=with(srx,A&B),
+ nA=!srx$A
+)->srx
+srx$Y<-with(srx,A!=B)
+srx<-as.data.frame(lapply(srx,factor))
+

+ 28 - 7
man/Boruta.Rd

@@ -8,8 +8,9 @@
 \usage{
 Boruta(x, ...)
 
-\method{Boruta}{default}(x, y, pValue = 0.01, mcAdj = TRUE, maxRuns = 100,
-  doTrace = 0, holdHistory = TRUE, getImp = getImpRfZ, ...)
+\method{Boruta}{default}(x, y, pValue = 0.01, mcAdj = TRUE,
+  maxRuns = 100, doTrace = 0, holdHistory = TRUE,
+  getImp = getImpRfZ, ...)
 
 \method{Boruta}{formula}(formula, data = .GlobalEnv, ...)
 }
@@ -70,17 +71,37 @@ Instead, you can use \code{\link{TentativeRoughFix}} function, which will perfor
 }
 \examples{
 set.seed(777)
-#Add some nonsense attributes to iris dataset by shuffling original attributes
+
+#Boruta on the "small redundant XOR" problem; read ?srx for details
+data(srx)
+Boruta(Y~.,data=srx)->Boruta.srx
+
+#Results summary
+print(Boruta.srx)
+
+#Result plot
+plot(Boruta.srx)
+
+#Attribute statistics
+attStats(Boruta.srx)
+
+#Using alternative importance source, rFerns
+Boruta(Y~.,data=srx,getImp=getImpFerns)->Boruta.srx.ferns
+print(Boruta.srx.ferns)
+
+#Verbose
+Boruta(Y~.,data=srx,doTrace=2)->Boruta.srx
+
+\dontrun{
+#Boruta on the iris problem extended with artificial irrelevant features
+#Generate said features
 iris.extended<-data.frame(iris,apply(iris[,-5],2,sample))
 names(iris.extended)[6:9]<-paste("Nonsense",1:4,sep="")
 #Run Boruta on this data
 Boruta(Species~.,data=iris.extended,doTrace=2)->Boruta.iris.extended
 #Nonsense attributes should be rejected
 print(Boruta.iris.extended)
-
-#Boruta using rFerns' importance
-Boruta(Species~.,data=iris.extended,getImp=getImpFerns)->Boruta.ferns.irisE
-print(Boruta.ferns.irisE)
+}
 
 \dontrun{
 #Boruta on the HouseVotes84 data from mlbench

+ 3 - 3
man/plotImpHistory.Rd

@@ -4,9 +4,9 @@
 \alias{plotImpHistory}
 \title{Plot Boruta object as importance history}
 \usage{
-plotImpHistory(x, colCode = c("green", "yellow", "red", "blue"), col = NULL,
-  type = "l", lty = 1, pch = 0, xlab = "Classifier run",
-  ylab = "Importance", ...)
+plotImpHistory(x, colCode = c("green", "yellow", "red", "blue"),
+  col = NULL, type = "l", lty = 1, pch = 0,
+  xlab = "Classifier run", ylab = "Importance", ...)
 }
 \arguments{
 \item{x}{an object of a class Boruta.}

+ 22 - 0
man/srx.Rd

@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Boruta.R
+\docType{data}
+\name{srx}
+\alias{srx}
+\title{Small redundant XOR data}
+\format{A data frame with 8 predictors, 5 relevant: A, B, AoB, AnB and nA, as well as 3 irrelevant N1, N2 and N3, and decision attribute Y.}
+\source{
+\url{https://mbq.me/blog/relevance-and-redundancy}
+}
+\usage{
+srx
+}
+\description{
+A synthetic data set with 32 rows corresponding to all combinations of values of five logical features, A, B, N1, N2 and N3.
+The decision Y is equal to A xor B, hence N1--N3 are irrelevant attributes.
+The set also contains 3 additional features, A or B (AoB), A and B (AnB) and not A (nA), which provide a redundant, but still relevant way to reconstruct Y.
+}
+\details{
+This set is an easy way to demonstrate the difference between all relevant feature selection methods, which should select all features except N1--N3, and minimal optimal ones, which will probably ignore most of them.
+}
+\keyword{datasets}

+ 6 - 0
tests/testthat/testBasic.R

@@ -12,4 +12,10 @@ test_that("Selection works on an extended iris",{
   getSelectedAttributes(Boruta.iris.extended),
   names(iris)[-5]
  )
+
+ attStats(Boruta.iris.extended)->a
+ expect_equal(sort(rownames(a)),sort(names(iris.extended)[-5]))
+ 
+ getSelectedAttributes(Boruta.iris.extended)->a
+ expect_equal(sort(a),sort(names(iris)[-5]))
 })

+ 5 - 9
tests/testthat/testSources.R

@@ -1,15 +1,11 @@
 context("Importance source tests")
 
-#Set, as in https://mbq.me/blog/relevance-and-redundancy
-setNames(
- do.call(expand.grid,rep(list(c(T,F)),5)),
- c("A","B","N1","N2","N3")
-)->X
-cbind(X,AoB=with(X,A|B),AnB=with(X,A&B),nA=!X$A)->X
-factor(with(X,A!=B))->Y
-data.frame(lapply(X,factor))->X
-#Numeric value, for more fun
 set.seed(777)
+
+data(srx)
+X<-srx[,-ncol(srx)]
+Y<-srx$Y
+#Also a numeric nonsense feature, for more fun
 X$N4<-runif(nrow(X))
 
 impSources<-c(