importance.R 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. # Importance sources
  2. #' randomForest importance adapters
  3. #'
  4. #' These functions are intended to be given to the \code{getImp} argument of the \code{\link{Boruta}} function, to be called by the Boruta algorithm as an importance source.
  5. #' \code{getImpLegacyRfZ} generates default, normalized permutation importance, \code{getImpLegacyRfRaw} raw permutation importance, finally \code{getImpLegacyRfGini} generates Gini index importance, all using \code{\link[randomForest]{randomForest}} as a Random Forest algorithm implementation.
  6. #' @name getImpLegacyRf
  7. #' @rdname getImpLegacyRf
  8. #' @aliases getImpLegacyRfZ getImpLegacyRfGini getImpLegacyRfRaw
  9. #' @note The \code{getImpLegacyRfZ} function was a default importance source in Boruta versions prior to 5.0; since then \code{\link{ranger}} Random Forest implementation is used instead of \code{\link[randomForest]{randomForest}}, for speed, memory conservation and an ability to utilise multithreading.
  10. #' Both importance sources should generally lead to the same results, yet there are differences.
  11. #'
  12. #' Most notably, ranger by default treats factor attributes as ordered (and works very slowly if instructed otherwise with \code{respect.unordered.factors=TRUE}); on the other hand it lifts the 32 levels limit specific to \code{\link[randomForest]{randomForest}}.
  13. #' To this end, Boruta decision for factor attributes may be different.
  14. #'
  15. #' Random Forest methods have two main parameters, the number of attributes tried at each split and the number of trees in the forest; the first one is called \code{mtry} in both implementations, but the second is called \code{ntree} in \code{\link[randomForest]{randomForest}} and \code{num.trees} in \code{\link{ranger}}.
  16. #' To this end, to maintain compatibility, \code{getImpRf*} functions still accept \code{ntree} parameter relaying it into \code{num.trees}.
  17. #' Still, both parameters take the same defaults in both implementations (square root of the number of all attributes and 500, respectively).
  18. #'
  19. #' Moreover, \code{\link{ranger}} brings some additional capabilities to Boruta, like analysis of survival problems or sticky variables which are always considered on splits.
  20. #'
  21. #' Finally, the results for the same PRNG seed will be different.
  22. #' @param x data frame of predictors including shadows.
  23. #' @param y response vector.
  24. #' @param ... parameters passed to the underlying \code{\link[randomForest]{randomForest}} call; they are relayed from \code{...} of \code{\link{Boruta}}.
  25. #' @examples
  26. #' set.seed(777)
  27. #' #Add some nonsense attributes to iris dataset by shuffling original attributes
  28. #' iris.extended<-data.frame(iris,apply(iris[,-5],2,sample))
  29. #' names(iris.extended)[6:9]<-paste("Nonsense",1:4,sep="")
  30. #' #Run Boruta on this data
  31. #' Boruta(Species~.,getImp=getImpLegacyRfZ,
  32. #' data=iris.extended,doTrace=2)->Boruta.iris.extended
  33. #' #Nonsense attributes should be rejected
  34. #' print(Boruta.iris.extended)
  35. #' @export
  getImpLegacyRfZ <- function(x, y, ...) {
    # Fit the forest with permutation importance switched on; the trees are
    # not kept because only the importance scores are read afterwards.
    forest <- randomForest::randomForest(
      x, y,
      importance = TRUE, keep.forest = FALSE, ...
    )
    # Importance measure 1, scaled; the first column of the result is returned.
    scores <- randomForest::importance(forest, type = 1, scale = TRUE)
    scores[, 1]
  }
  # Label attached to the function via comment().
  comment(getImpLegacyRfZ) <- "randomForest normalized permutation importance"
  42. #' @rdname getImpLegacyRf
  43. #' @export
  getImpLegacyRfRaw <- function(x, y, ...) {
    # Fit the forest with permutation importance switched on; the trees are
    # not kept because only the importance scores are read afterwards.
    forest <- randomForest::randomForest(
      x, y,
      importance = TRUE, keep.forest = FALSE, ...
    )
    # Importance measure 1, unscaled; the first column of the result is returned.
    scores <- randomForest::importance(forest, type = 1, scale = FALSE)
    scores[, 1]
  }
  # Label attached to the function via comment().
  comment(getImpLegacyRfRaw) <- "randomForest raw permutation importance"
  50. #' @rdname getImpLegacyRf
  51. #' @export
  getImpLegacyRfGini <- function(x, y, ...) {
    # No importance = TRUE here: the call relies on randomForest's defaults
    # for the importance measure requested below. Trees are not kept.
    forest <- randomForest::randomForest(
      x, y,
      keep.forest = FALSE, ...
    )
    # Importance measure 2, unscaled; the first column of the result is returned.
    scores <- randomForest::importance(forest, type = 2, scale = FALSE)
    scores[, 1]
  }
  # Label attached to the function via comment().
  comment(getImpLegacyRfGini) <- "randomForest Gini index importance"
  58. #' ranger Random Forest importance adapters
  59. #'
  60. #' These functions are intended to be given to the \code{getImp} argument of the \code{\link{Boruta}} function, to be called by the Boruta algorithm as an importance source.
  61. #' \code{getImpRfZ} generates default, normalized permutation importance, \code{getImpRfRaw} raw permutation importance, finally \code{getImpRfGini} generates Gini index importance.
  62. #' @name getImpRf
  63. #' @rdname getImpRf
  64. #' @aliases getImpRfZ getImpRfGini getImpRfRaw
  65. #' @param x data frame of predictors including shadows.
  66. #' @param y response vector.
  67. #' @param ntree Number of trees in the forest; copied into \code{\link{ranger}}'s native num.trees, put to retain transparent compatibility with randomForest.
  68. #' @param num.trees Number of trees in the forest, as according to \code{\link{ranger}}'s nomenclature. If not given, set to \code{ntree} value. If both are given, \code{num.trees} takes precedence.
  69. #' @param ... parameters passed to the underlying \code{\link{ranger}} call; they are relayed from \code{...} of \code{\link{Boruta}}.
  70. #' @note Prior to Boruta 5.0, \code{getImpLegacyRfZ} function was a default importance source in Boruta; see \link{getImpLegacyRf} for more details.
  71. #' @import ranger
  72. #' @export
  getImpRfZ <- function(x, y, ntree = 500, num.trees = ntree, ...) {
    # The response is appended to the predictor frame under a "shadow.*" name;
    # this abuses the fact that Boruta disallows attributes with names
    # starting from "shadow".
    if (inherits(y, "Surv")) {
      # Survival response: time and status become two separate columns.
      x$shadow.Boruta.time <- y[, "time"]
      x$shadow.Boruta.status <- y[, "status"]
      fit <- ranger::ranger(
        data = x,
        dependent.variable.name = "shadow.Boruta.time",
        status.variable.name = "shadow.Boruta.status",
        num.trees = num.trees, importance = "permutation",
        scale.permutation.importance = TRUE,
        write.forest = FALSE, ...
      )
    } else {
      x$shadow.Boruta.decision <- y
      fit <- ranger::ranger(
        data = x, dependent.variable.name = "shadow.Boruta.decision",
        num.trees = num.trees, importance = "permutation",
        scale.permutation.importance = TRUE,
        write.forest = FALSE, ...
      )
    }
    fit$variable.importance
  }
  # Label attached to the function via comment().
  comment(getImpRfZ) <- "ranger normalized permutation importance"
  93. #' @rdname getImpRf
  94. #' @export
  getImpRfGini <- function(x, y, ntree = 500, num.trees = ntree, ...) {
    # Guard: survival responses are rejected up front.
    if (inherits(y, "Surv")) {
      stop("Ranger cannot produce Gini importance for survival problems.")
    }
    # The response travels inside the data frame under a "shadow.*" name.
    x$shadow.Boruta.decision <- y
    fit <- ranger::ranger(
      data = x, dependent.variable.name = "shadow.Boruta.decision",
      num.trees = num.trees, importance = "impurity",
      scale.permutation.importance = FALSE,
      write.forest = FALSE, ...
    )
    fit$variable.importance
  }
  # Label attached to the function via comment().
  comment(getImpRfGini) <- "ranger Gini index importance"
  105. #' @rdname getImpRf
  106. #' @export
  getImpRfRaw <- function(x, y, ntree = 500, num.trees = ntree, ...) {
    # The response is appended to the predictor frame under a "shadow.*" name
    # and ranger is pointed at it by column name.
    if (inherits(y, "Surv")) {
      # Survival response: time and status become two separate columns.
      # scale.permutation.importance is not set in this branch, so ranger's
      # own default applies.
      x$shadow.Boruta.time <- y[, "time"]
      x$shadow.Boruta.status <- y[, "status"]
      fit <- ranger::ranger(
        data = x,
        dependent.variable.name = "shadow.Boruta.time",
        status.variable.name = "shadow.Boruta.status",
        num.trees = num.trees, importance = "permutation",
        write.forest = FALSE, ...
      )
    } else {
      x$shadow.Boruta.decision <- y
      fit <- ranger::ranger(
        data = x, dependent.variable.name = "shadow.Boruta.decision",
        num.trees = num.trees, importance = "permutation",
        scale.permutation.importance = FALSE,
        write.forest = FALSE, ...
      )
    }
    fit$variable.importance
  }
  # Label attached to the function via comment().
  comment(getImpRfRaw) <- "ranger raw permutation importance"
  124. #' ranger Extra-trees importance adapters
  125. #'
  126. #' These functions are intended to be given to the \code{getImp} argument of the \code{\link{Boruta}} function, to be called by the Boruta algorithm as an importance source.
  127. #' \code{getImpExtraZ} generates default, normalized permutation importance, \code{getImpExtraRaw} raw permutation importance, finally \code{getImpExtraGini} generates Gini impurity importance.
  128. #' @name getImpExtra
  129. #' @rdname getImpExtra
  130. #' @aliases getImpExtraZ getImpExtraGini getImpExtraRaw
  131. #' @param x data frame of predictors including shadows.
  132. #' @param y response vector.
  133. #' @param ntree Number of trees in the forest; copied into \code{\link{ranger}}'s native num.trees, put to retain transparent compatibility with randomForest.
  134. #' @param num.trees Number of trees in the forest, as according to \code{\link{ranger}}'s nomenclature. If not given, set to \code{ntree} value. If both are given, \code{num.trees} takes precedence.
  135. #' @param ... parameters passed to the underlying \code{\link{ranger}} call; they are relayed from \code{...} of \code{\link{Boruta}}. Note that these functions work just by setting \code{splitrule} to \code{"extratrees"}.
  136. #' @export
  getImpExtraZ <- function(x, y, ntree = 500, num.trees = ntree, ...) {
    # Thin wrapper: delegates to getImpRfZ with splitrule fixed to "extratrees".
    # num.trees is forwarded explicitly; previously it was accepted but
    # dropped, so a caller-supplied num.trees had no effect.
    getImpRfZ(
      x, y,
      ntree = ntree, num.trees = num.trees,
      splitrule = "extratrees", ...
    )
  }
  # Label attached via comment(); names the extra-trees variant so it is
  # distinguishable from the plain ranger adapter.
  comment(getImpExtraZ) <- "ranger extra-trees normalized permutation importance"
  140. #' @rdname getImpExtra
  141. #' @export
  getImpExtraGini <- function(x, y, ntree = 500, num.trees = ntree, ...) {
    # Thin wrapper: delegates to getImpRfGini with splitrule fixed to
    # "extratrees". num.trees is forwarded explicitly; previously it was
    # accepted but dropped, so a caller-supplied num.trees had no effect.
    getImpRfGini(
      x, y,
      ntree = ntree, num.trees = num.trees,
      splitrule = "extratrees", ...
    )
  }
  # Label attached to the function via comment().
  comment(getImpExtraGini) <- "ranger extra-trees Gini index importance"
  145. #' @rdname getImpExtra
  146. #' @export
  getImpExtraRaw <- function(x, y, ntree = 500, num.trees = ntree, ...) {
    # Thin wrapper: delegates to getImpRfRaw with splitrule fixed to
    # "extratrees". num.trees is forwarded explicitly; previously it was
    # accepted but dropped, so a caller-supplied num.trees had no effect.
    getImpRfRaw(
      x, y,
      ntree = ntree, num.trees = num.trees,
      splitrule = "extratrees", ...
    )
  }
  # Label attached to the function via comment().
  comment(getImpExtraRaw) <- "ranger extra-trees raw permutation importance"
  150. #' Random Ferns importance
  151. #'
  152. #' This function is intended to be given to a \code{getImp} argument of \code{\link{Boruta}} function to be called by the Boruta algorithm as an importance source.
  153. #' @param x data frame of predictors including shadows.
  154. #' @param y response vector.
  155. #' @param ... parameters passed to the underlying \code{\link[rFerns]{rFerns}} call; they are relayed from \code{...} of \code{\link{Boruta}}.
  156. #' @export
  157. #' @note Random Ferns importance calculation should be much faster than using Random Forest; however, one must first optimize the value of the \code{depth} parameter and
  158. #' it is quite likely that the number of ferns in the ensemble required for the importance to converge will be higher than the number of trees in case of Random Forest.
  getImpFerns <- function(x, y, ...) {
    # Train the fern ensemble with importance enabled; the model itself is
    # not saved since only the importance table is read.
    ferns <- rFerns::rFerns(
      x, y,
      saveForest = FALSE, importance = TRUE, ...
    )
    # First column of the importance table is the score handed back.
    ferns$importance[, 1]
  }
  # Label attached to the function via comment().
  comment(getImpFerns) <- "rFerns importance"
  165. #' Xgboost importance
  166. #'
  167. #' This function is intended to be given to a \code{getImp} argument of \code{\link{Boruta}} function to be called by the Boruta algorithm as an importance source.
  168. #' @param x data frame of predictors including shadows.
  169. #' @param y response vector.
  170. #' @param nrounds Number of rounds; passed to the underlying \code{\link[xgboost]{xgboost}} call.
  171. #' @param verbose Verbosity level of xgboost; either 0 (silent) or 1 (progress reports). Passed to the underlying \code{\link[xgboost]{xgboost}} call.
  172. #' @param ... other parameters passed to the underlying \code{\link[xgboost]{xgboost}} call.
  173. #' Similarly as \code{nrounds} and \code{verbose}, they are relayed from \code{...} of \code{\link{Boruta}}.
  174. #' For convenience, this function sets \code{nrounds} to 5 and verbose to 0, but this can be overridden.
  175. #' @note Only the dense matrix interface is supported; all predictors given to the \code{\link{Boruta}} call have to be numeric (not integer).
  176. #' Categorical features should be split into indicator attributes.
  177. #' This functionality is inspired by the Python package BoostARoota by Chase DeHan.
  178. #' I have some doubts whether boosting importance can be used for all relevant selection without hitting substantial false negative rates; please consider this functionality experimental.
  179. #' @references \url{https://github.com/chasedehan/BoostARoota}
  180. #' @export
  getImpXgboost <- function(x, y, nrounds = 5, verbose = 0, ...) {
    # Fit the booster on a dense matrix view of the predictors. Extra
    # arguments are relayed to xgboost::xgboost, as the documentation
    # promises; previously they were accepted but silently discarded.
    model <- xgboost::xgboost(
      data = as.matrix(x),
      label = y,
      nrounds = nrounds,
      verbose = verbose,
      ...
    )
    imp <- xgboost::xgb.importance(model = model)
    # Start every attribute at zero, then fill in the Gain of each feature
    # listed in the importance table; unlisted features keep a score of zero.
    ans <- stats::setNames(rep(0, ncol(x)), colnames(x))
    ans[imp$Feature] <- imp$Gain
    ans
  }
  # Label attached to the function via comment().
  comment(getImpXgboost) <- "xgboost gain importance"