naivewrapper.R 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. # Naive wrapper for shadow VIM
  2. #
  3. # Copyright 2011-2018 Miron B. Kursa
  4. #
  5. # This file is part of rFerns R package.
  6. #
  7. #rFerns is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  8. #rFerns is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  9. #You should have received a copy of the GNU General Public License along with rFerns. If not, see http://www.gnu.org/licenses/.
  10. #' Naive feature selection method utilising the rFerns shadow imporance
  11. #'
  12. #' Proof-of-concept ensemble of rFerns models, built to stabilise and improve selection based on shadow importance.
  13. #' It employs a super-ensemble of \code{iterations} small rFerns forests, each built on a subspace of \code{size} attributes, which is selected randomly, but with a higher selection probability for attributes claimed important by previous sub-models.
  14. #' Final selection is a group of attributes which hold a substantial weight at the end of the procedure.
  15. #' @param x Data frame containing attributes; must have unique names and contain only numeric, integer or (ordered) factor columns.
  16. #' Factors must have less than 31 levels. No \code{NA} values are permitted.
  17. #' @param y A decision vector. Must a factor of the same length as \code{nrow(X)} for ordinary many-label classification, or a logical matrix with each column corresponding to a class for multi-label classification.
  18. #' @param iterations Number of iterations i.e., the number of sub-models built.
  19. #' @param depth The depth of the ferns; must be in 1--16 range. Note that time and memory requirements scale with \code{2^depth}.
  20. #' @param threads Number of parallel threads, copied to the underlying \code{rFerns} call.
  21. #' @param ferns Number of ferns to be build in each sub-model. This should be a small number, around 3-5 times \code{size}.
  22. #' @param size Number of attributes considered by each sub-model.
  23. #' @param lambda Lambda parameter driving the re-weighting step of the method.
  24. #' @param saveHistory Should weight history be stored.
  25. #' @return An object of class \code{naiveWrapper}, which is a list with the following components:
  26. #' \item{found}{Names of all selected attributes.}
  27. #' \item{weights}{Vector of weights indicating the confidence that certain feature is relevant.}
  28. #' \item{timeTaken}{Time of computation.}
  29. #' \item{weightHistory}{History of weights over all iterations, present if \code{saveHistory} was \code{TRUE}.}
  30. #' \item{params}{Copies of algorithm parameters, \code{iterations}, \code{depth}, \code{ferns} and \code{size}, as a named vector.}
  31. #' @references Kursa MB (2017). \emph{Efficient all relevant feature selection with random ferns}. In: Kryszkiewicz M., Appice A., Slezak D., Rybinski H., Skowron A., Ras Z. (eds) Foundations of Intelligent Systems. ISMIS 2017. Lecture Notes in Computer Science, vol 10352. Springer, Cham.
  32. #' @examples
  33. #' set.seed(77)
  34. #' #Fetch Iris data
  35. #' data(iris)
  36. #' #Extend with random noise
  37. #' noisyIris<-cbind(iris[,-5],apply(iris[,-5],2,sample))
  38. #' names(noisyIris)[5:8]<-sprintf("Nonsense%d",1:4)
  39. #' #Execute selection
  40. #' naiveWrapper(noisyIris,iris$Species,iterations=50,ferns=20,size=8)
  41. #' @export
  42. naiveWrapper<-function(x,y,iterations=1000,depth=5,ferns=100,size=30,lambda=5,threads=0,saveHistory=FALSE){
  43. if(size>ncol(x)){
  44. size<-ncol(x)
  45. warning(sprintf("size parameter limited to ncol(X)=%d",ncol(x)))
  46. }
  47. if(any(duplicated(names(x)))) stop("Cannot accept duplicated column names in x.")
  48. Sys.time()->b
  49. wh<-NULL
  50. weight<-rep(1,ncol(x)); names(weight)<-names(x)
  51. for(iter in 1:iterations){
  52. weight[weight<.1]<-.1
  53. if(saveHistory) cbind(wh,weight)->wh
  54. use<-sample(names(x),size,prob=weight/sum(weight))
  55. rFerns(x[,use,drop=FALSE],y,
  56. ferns=ferns,
  57. depth=depth,
  58. threads=threads,
  59. saveForest=FALSE,
  60. imp="sha")$importance->ii
  61. found<-rownames(ii)[ii$MeanScoreLoss>max(ii$Shadow)]
  62. weight[found]<-weight[found]+lambda
  63. weight[use[!(use%in%found)]]<-weight[use[!(use%in%found)]]-2*lambda
  64. }
  65. weight[weight<.1]<-.1
  66. found<-names(weight)[weight>max(c(weight,2+6*lambda))/2]
  67. Sys.time()->a
  68. ans<-list(
  69. found=found,
  70. weight=weight,
  71. timeTaken=a-b,
  72. weightHistory=wh,
  73. parameters=c(
  74. iterations=iterations,
  75. depth=depth,
  76. ferns=ferns,
  77. size=size
  78. )
  79. )
  80. class(ans)<-"naiveWrapper"
  81. ans
  82. }
  83. #' @method print naiveWrapper
  84. #' @export
  85. print.naiveWrapper<-function(x,...){
  86. cat("Naive shadow importance feature selection\n")
  87. if(length(x$found)==0){
  88. cat(" No attributes selected.\n")
  89. }else{
  90. cat(strwrap(sprintf(" %d attributes selected: %s.\n",
  91. length(x$found),
  92. paste(x$found,collapse=", ")
  93. )))
  94. }
  95. cat("\n")
  96. }