forest.h 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. /* Code handling fern ensembles -- creation, prediction, OOB, accuracy...
  2. Copyright 2011-2018 Miron B. Kursa
  3. This file is part of rFerns R package.
  4. rFerns is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
  5. rFerns is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  6. You should have received a copy of the GNU General Public License along with rFerns. If not, see http://www.gnu.org/licenses/.
  7. */
  8. void killModel(model *x);
  9. model *makeModel(DATASET_,ferns *ferns,params *P,R_){
  10. uint scalarExecution=(P->threads==1),nt=P->threads;
  11. uint numC=P->numClasses;
  12. uint D=P->D;
  13. assert(D<=MAX_D);
  14. uint twoToD=P->twoToD;
  15. uint multi=P->multilabel;
  16. //=Allocations=//
  17. //Internal objects
  18. ALLOCN(_curPreds,score_t,numC*N*nt);
  19. ALLOCN(_bag,uint,N*nt);
  20. ALLOCN(_idx,uint,N*nt);
  21. //Output objects
  22. ALLOCN(ans,model,1);
  23. //OOB prediction stuff
  24. ALLOCZ(ans->oobPreds,score_t,numC*N*nt);
  25. score_t *_oobPredsAcc=ans->oobPreds;
  26. ALLOCZ(ans->oobOutOfBagC,uint,N*nt);
  27. uint *_oobPredsC=ans->oobOutOfBagC;
  28. //Stuff for importance
  29. uint *_buf_idxPerm=NULL;
  30. //Actual importance result
  31. ans->imp=NULL;
  32. ans->shimp=NULL;
  33. ans->try=NULL;
  34. //Allocate if needed only
  35. if(P->calcImp){
  36. ALLOCZ(ans->imp,double,nX);
  37. ALLOCZ(ans->shimp,double,nX);
  38. ALLOCZ(ans->try,double,nX);
  39. if(P->calcImp==1){
  40. ALLOC(_buf_idxPerm,uint,N*nt);
  41. }else if(P->calcImp==2){
  42. ALLOC(_buf_idxPerm,uint,N*nt*2);
  43. }else error("Somehow invalid importance flag; several internal logic breach. Please report.");
  44. }
  45. ans->forest=ferns;
  46. uint modelSeed=RINTEGER;
  47. //=Building model=//
  48. #pragma omp parallel for num_threads(nt)
  49. for(uint e=0;e<(P->numFerns);e++){
  50. uint tn=omp_get_thread_num(); //Number of the thread we're in
  51. uint *bag=_bag+(N*tn),*idx=_idx+(N*tn);
  52. score_t *curPreds=_curPreds+(N*numC*tn);
  53. score_t *oobPredsAcc=_oobPredsAcc+(N*numC*tn);
  54. uint *oobPredsC=_oobPredsC+(N*tn);
  55. int fernLoc=(P->holdForest)?e:tn;
  56. if(scalarExecution){
  57. CHECK_INTERRUPT; //Place to go though event loop, if such is present
  58. }
  59. rng_t _curFernRng,*curFernRng=&_curFernRng;
  60. SETSEEDEX(curFernRng,e+1,modelSeed);
  61. makeBagMask(bag,N,curFernRng);
  62. makeFern(_DATASET,_thFERN(fernLoc),bag,curPreds,idx,_SIMP,curFernRng);
  63. //Accumulating OOB errors, independently per thread
  64. for(uint ee=0;ee<N;ee++){
  65. oobPredsC[ee]+=!(bag[ee]);
  66. for(uint eee=0;eee<numC;eee++)
  67. oobPredsAcc[eee+numC*ee]+=((double)(!(bag[ee])))*curPreds[eee+numC*ee];
  68. }
  69. //Importance
  70. if(P->calcImp){
  71. /*
  72. For importance, we want to know which unique attributes were used to build it.
  73. Their number will be placed in numAC, and attC[0..(numAC-1)] will contain their indices.
  74. */
  75. uint attC[MAX_D];
  76. attC[0]=(ferns->splitAtts)[fernLoc*D];
  77. uint numAC=1;
  78. for(uint ee=1;ee<D;ee++){
  79. for(uint eee=0;eee<numAC;eee++)
  80. if((ferns->splitAtts)[fernLoc*D+ee]==attC[eee]) goto isDuplicate;
  81. attC[numAC]=(ferns->splitAtts)[fernLoc*D+ee]; numAC++;
  82. isDuplicate:
  83. continue;
  84. }
  85. if(P->calcImp==1){
  86. uint *buf_idxPermA=_buf_idxPerm+(tn*N);
  87. for(uint ee=0;ee<numAC;ee++){
  88. accLoss loss=calcAccLoss(_DATASET,attC[ee],_thFERN(fernLoc),bag,idx,curPreds,numC,D,curFernRng,buf_idxPermA);
  89. #pragma omp critical
  90. {
  91. ans->imp[attC[ee]]+=loss.direct;
  92. ans->try[attC[ee]]++;
  93. }
  94. }
  95. }else{
  96. uint *buf_idxPermA=_buf_idxPerm+(tn*N*2);
  97. uint *buf_idxPermB=_buf_idxPerm+(tn*N*2+N);
  98. for(uint ee=0;ee<numAC;ee++){
  99. accLoss loss=calcAccLossConsistent(_DATASET,attC[ee],_thFERN(fernLoc),bag,idx,curPreds,numC,D,curFernRng,P->consSeed,buf_idxPermA,buf_idxPermB);
  100. #pragma omp critical
  101. {
  102. ans->imp[attC[ee]]+=loss.direct;
  103. ans->shimp[attC[ee]]+=loss.shadow;
  104. ans->try[attC[ee]]++;
  105. }
  106. }
  107. }
  108. }
  109. }
  110. //=Finishing up=//
  111. //Finishing importance
  112. if(P->calcImp) for(uint e=0;e<nX;e++){
  113. if(ans->try[e]==0){
  114. ans->imp[e]=0.;
  115. ans->try[e]=0.;
  116. ans->shimp[e]=0.;
  117. }else{
  118. ans->imp[e]/=ans->try[e];
  119. if(P->calcImp==2){
  120. ans->shimp[e]/=ans->try[e];
  121. }else{
  122. //This is probably redundant
  123. ans->shimp[e]=0.;
  124. }
  125. }
  126. }
  127. //Collecting OOB in parallel case
  128. if(nt!=1) for(int e=0;e<N;e++){
  129. //Loop over threads; we accumulate to tn 0, so from 1
  130. for(int tn=1;tn<nt;tn++){
  131. //Rprintf("%d/%d %d/%d\n",e,N,tn,nt);
  132. for(int ee=0;ee<numC;ee++)
  133. _oobPredsAcc[e*numC+ee]+=_oobPredsAcc[tn*N*numC+e*numC+ee];
  134. _oobPredsC[e]+=_oobPredsC[tn*N+e];
  135. }
  136. }
  137. //Releasing memory
  138. FREE(_bag); FREE(_curPreds); FREE(_idx);
  139. FREE(_buf_idxPerm);
  140. return(ans);
  141. #ifndef IN_R
  142. allocFailed:
  143. killModel(ans);
  144. IFFREE(_bag); IFFREE(_curPreds); IFFREE(_idx);
  145. IFFREE(_buf_idxPerm);
  146. return(NULL);
  147. #endif
  148. }
  149. void predictWithModelSimple(PREDSET_,ferns *x,uint *ans,SIMPP_,double *sans,R_){
  150. ferns *ferns=x;
  151. for(uint e=0;e<numC*N;e++)
  152. sans[e]=0.;
  153. //Use ans memory as idx buffer
  154. uint *idx=ans;
  155. for(uint e=0;e<numFerns;e++){
  156. predictFernAdd(
  157. _PREDSET,
  158. _thFERN(e),
  159. sans,
  160. idx,
  161. _SIMP);
  162. }
  163. if(!multi){
  164. for(uint e=0;e<N;e++)
  165. ans[e]=whichMaxTieAware(&(sans[e*numC]),numC,e);
  166. }else{
  167. for(uint e=0;e<numC;e++)
  168. for(uint ee=0;ee<N;ee++)
  169. ans[e*N+ee]=sans[ee*numC+e]>0.;
  170. }
  171. }
  172. void predictWithModelScores(PREDSET_,ferns *x,double *ans,SIMPP_,uint *idx){
  173. ferns *ferns=x;
  174. for(uint e=0;e<numC*N;e++)
  175. ans[e]=0.;
  176. for(uint e=0;e<numFerns;e++)
  177. predictFernAdd(
  178. _PREDSET,
  179. _thFERN(e),
  180. ans,
  181. idx,
  182. _SIMP);
  183. }
  184. void killModel(model *x){
  185. if(x){
  186. IFFREE(x->oobPreds);
  187. IFFREE(x->oobOutOfBagC);
  188. IFFREE(x->oobErr);
  189. IFFREE(x->imp);
  190. IFFREE(x->shimp);
  191. IFFREE(x->try);
  192. FREE(x);
  193. }
  194. }