b3RadixSort32CL.cpp
#include "b3RadixSort32CL.h"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"

#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"

#include "kernels/RadixSort32KernelsCL.h"
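
// b3RadixSort32CL sorts 32-bit keys (optionally paired with 32-bit values as b3SortData)
// on the GPU using a least-significant-digit radix sort with 4-bit passes.
// Each pass launches three OpenCL kernels: a per-work-group histogram ("stream count"),
// a prefix scan of those histograms, and a local sort-and-scatter.
// On CPU OpenCL devices the serial variants of the sort-and-scatter kernels are used.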
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
    : m_commandQueue(queue)
{
    b3OpenCLDeviceInfo info;
    b3OpenCLUtils::getDeviceInfo(device, &info);
    m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;

    m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
    m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
    m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
    m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
    m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
    m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);

    if (initialCapacity > 0)
    {
        m_workBuffer1->resize(initialCapacity);
        m_workBuffer3->resize(initialCapacity);
        m_workBuffer3a->resize(initialCapacity);
        m_workBuffer4->resize(initialCapacity);
        m_workBuffer4a->resize(initialCapacity);
    }

    m_scan = new b3PrefixScanCL(ctx, device, queue);
    m_fill = new b3FillCL(ctx, device, queue);

    const char* additionalMacros = "";
    cl_int pErrNum;
    const char* kernelSource = radixSort32KernelsCL;

    cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
    b3Assert(sortProg);

    m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
    b3Assert(m_streamCountSortDataKernel);

    m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
    b3Assert(m_streamCountKernel);

    if (m_deviceCPU)
    {
        m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
        b3Assert(m_sortAndScatterSortDataKernel);
        m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
        b3Assert(m_sortAndScatterKernel);
    }
    else
    {
        m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
        b3Assert(m_sortAndScatterSortDataKernel);
        m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
        b3Assert(m_sortAndScatterKernel);
    }

    m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
    b3Assert(m_prefixScanKernel);
}

b3RadixSort32CL::~b3RadixSort32CL()
{
    delete m_scan;
    delete m_fill;
    delete m_workBuffer1;
    delete m_workBuffer2;
    delete m_workBuffer3;
    delete m_workBuffer3a;
    delete m_workBuffer4;
    delete m_workBuffer4a;

    clReleaseKernel(m_streamCountSortDataKernel);
    clReleaseKernel(m_streamCountKernel);
    clReleaseKernel(m_sortAndScatterSortDataKernel);
    clReleaseKernel(m_sortAndScatterKernel);
    clReleaseKernel(m_prefixScanKernel);
}
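
// executeHost is a CPU reference implementation: a stable LSD radix sort that processes
// 8 bits (256 buckets) per pass, ping-ponging between the caller's array and a temporary
// work buffer. Each pass builds a bucket histogram, converts it into exclusive prefix sums,
// and scatters the elements in order.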
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
    int n = inout.size();
    const int BITS_PER_PASS = 8;
    const int NUM_TABLES = (1 << BITS_PER_PASS);

    int tables[NUM_TABLES];
    int counter[NUM_TABLES];

    b3SortData* src = &inout[0];
    b3AlignedObjectArray<b3SortData> workbuffer;
    workbuffer.resize(inout.size());
    b3SortData* dst = &workbuffer[0];

    int count = 0;
    for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
    {
        for (int i = 0; i < NUM_TABLES; i++)
        {
            tables[i] = 0;
        }

        for (int i = 0; i < n; i++)
        {
            int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
            tables[tableIdx]++;
        }
//#define TEST
#ifdef TEST
        printf("histogram size=%d\n", NUM_TABLES);
        for (int i = 0; i < NUM_TABLES; i++)
        {
            if (tables[i] != 0)
            {
                printf("tables[%d]=%d\n", i, tables[i]);
            }
        }
#endif  //TEST

        // prefix scan
        int sum = 0;
        for (int i = 0; i < NUM_TABLES; i++)
        {
            int iData = tables[i];
            tables[i] = sum;
            sum += iData;
            counter[i] = 0;
        }

        // distribute
        for (int i = 0; i < n; i++)
        {
            int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
            dst[tables[tableIdx] + counter[tableIdx]] = src[i];
            counter[tableIdx]++;
        }

        b3Swap(src, dst);
        count++;
    }

    if (count & 1)
    {
        b3Assert(0);  //need to copy
    }
}

void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
    b3AlignedObjectArray<b3SortData> inout;
    keyValuesInOut.copyToHost(inout);

    executeHost(inout, sortBits);

    keyValuesInOut.copyFromHost(inout);
}
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
                              b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}

//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
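
// GPU key/value sort. sortBits must be a multiple of 4; each 4-bit pass (16 buckets)
// launches the stream-count (histogram), prefix-scan, and sort-and-scatter kernels,
// ping-ponging between the source buffer and m_workBuffer3.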
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
    int originalSize = keyValuesInOut.size();
    int workingSize = originalSize;

    int dataAlignment = DATA_ALIGNMENT;

#ifdef DEBUG_RADIXSORT2
    b3AlignedObjectArray<b3SortData> test2;
    keyValuesInOut.copyToHost(test2);
    printf("numElem = %d\n", test2.size());
    for (int i = 0; i < test2.size(); i++)
    {
        printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
        printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
    }
#endif  //DEBUG_RADIXSORT2
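
    // Pad the working size up to a multiple of DATA_ALIGNMENT. Padding elements are filled
    // with key/value 0xffffffff so they sort to the tail and can be dropped when the result
    // is copied back at the end of the function.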
    b3OpenCLArray<b3SortData>* src = 0;

    if (workingSize % dataAlignment)
    {
        workingSize += dataAlignment - (workingSize % dataAlignment);
        m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
        m_workBuffer4->resize(workingSize);
        b3SortData fillValue;
        fillValue.m_key = 0xffffffff;
        fillValue.m_value = 0xffffffff;

#define USE_BTFILL
#ifdef USE_BTFILL
        m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
#else
        //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
        for (int i = originalSize; i < workingSize; i++)
        {
            m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
        }
#endif  //USE_BTFILL

        src = m_workBuffer4;
    }
    else
    {
        src = &keyValuesInOut;
        m_workBuffer4->resize(0);
    }

    b3Assert(workingSize % DATA_ALIGNMENT == 0);
    int minCap = NUM_BUCKET * NUM_WGS;

    int n = workingSize;

    m_workBuffer1->resize(minCap);
    m_workBuffer3->resize(workingSize);

    //	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
    b3Assert(BITS_PER_PASS == 4);
    b3Assert(WG_SIZE == 64);
    b3Assert((sortBits & 0x3) == 0);

    b3OpenCLArray<b3SortData>* dst = m_workBuffer3;

    b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
    b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;

    int nWGs = NUM_WGS;
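
    // Work distribution constants passed to the kernels: each work item handles
    // ELEMENTS_PER_WORK_ITEM elements, so a block is ELEMENTS_PER_WORK_ITEM*WG_SIZE elements,
    // and the blocks are divided evenly across the available work groups.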
    b3ConstData cdata;
    {
        int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
        int nBlocks = (n + blockSize - 1) / (blockSize);
        cdata.m_n = n;
        cdata.m_nWGs = NUM_WGS;
        cdata.m_startBit = 0;
        cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
        if (nBlocks < NUM_WGS)
        {
            cdata.m_nBlocksPerWG = 1;
            nWGs = nBlocks;
        }
    }
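
    // One iteration per 4-bit digit: build per-work-group histograms, prefix-scan them into
    // global scatter offsets, then sort locally and scatter. src/dst and the histogram buffers
    // are swapped at the end of every pass.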
    int count = 0;
    for (int ib = 0; ib < sortBits; ib += 4)
    {
#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n", test2.size());
        for (int i = 0; i < test2.size(); i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
                printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
            }
        }
#endif  //DEBUG_RADIXSORT2

        cdata.m_startBit = ib;

        if (src->size())
        {
            b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
            b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");

            launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
            launcher.setConst(cdata);

            int num = NUM_WGS * WG_SIZE;
            launcher.launch1D(num, WG_SIZE);
        }

#ifdef DEBUG_RADIXSORT
        b3AlignedObjectArray<unsigned int> testHist;
        srcHisto->copyToHost(testHist);
        printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
        for (int i = 0; i < testHist.size(); i++)
        {
            if (testHist[i] != 0)
                printf("testHist[%d]=%d\n", i, testHist[i]);
        }
#endif  //DEBUG_RADIXSORT

        //fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
        bool fastScan = false;
#else
        bool fastScan = !m_deviceCPU;  //only use fast scan on GPU
#endif

        if (fastScan)
        {
            // prefix scan group histogram
            b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
            b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
            launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
            launcher.setConst(cdata);
            launcher.launch1D(128, 128);
            destHisto = srcHisto;
        }
        else
        {
            //unsigned int sum; //for debugging
            m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
        }

#ifdef DEBUG_RADIXSORT
        destHisto->copyToHost(testHist);
        printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
        for (int i = 0; i < testHist.size(); i++)
        {
            if (testHist[i] != 0)
                printf("testHist[%d]=%d\n", i, testHist[i]);
        }

        for (int i = 0; i < testHist.size(); i += NUM_WGS)
        {
            printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
        }
#endif  //DEBUG_RADIXSORT

#define USE_GPU
#ifdef USE_GPU
        if (src->size())
        {
            // local sort and distribute
            b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
            b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
            launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
            launcher.setConst(cdata);
            launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
        }
#else
        {
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
            int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
            int tables[NUM_TABLES];
            int startBit = ib;

            destHisto->copyToHost(testHist);
            b3AlignedObjectArray<b3SortData> srcHost;
            b3AlignedObjectArray<b3SortData> dstHost;
            dstHost.resize(src->size());

            src->copyToHost(srcHost);

            for (int i = 0; i < NUM_TABLES; i++)
            {
                tables[i] = testHist[i * NUM_WGS];
            }

            // distribute
            for (int i = 0; i < n; i++)
            {
                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);

                dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                counter2[tableIdx]++;
            }
#else
            int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
            int tables[NUM_TABLES];
            b3AlignedObjectArray<b3SortData> dstHostOK;
            dstHostOK.resize(src->size());

            destHisto->copyToHost(testHist);
            b3AlignedObjectArray<b3SortData> srcHost;
            src->copyToHost(srcHost);

            int blockSize = 256;
            int nBlocksPerWG = cdata.m_nBlocksPerWG;
            int startBit = ib;

            {
                for (int i = 0; i < NUM_TABLES; i++)
                {
                    tables[i] = testHist[i * NUM_WGS];
                }

                // distribute
                for (int i = 0; i < n; i++)
                {
                    int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);

                    dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
                    counter2[tableIdx]++;
                }
            }

            b3AlignedObjectArray<b3SortData> dstHost;
            dstHost.resize(src->size());

            int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

            for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
            {
                int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

                int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;

                for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
                {
                    for (int lIdx = 0; lIdx < 64; lIdx++)
                    {
                        int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;

                        //	MY_HISTOGRAM( localKeys.x ) ++ is much more expensive than atomic add as it requires read and write while atomics can just add on AMD
                        //	Using registers didn't perform well. It seems like using localKeys to address requires a lot of alu ops
                        //	AMD: AtomInc performs better while NV prefers ++
                        for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
                        {
                            if (addr + j < n)
                            {
                                //	printf ("addr+j=%d\n", addr+j);
                                int i = addr + j;
                                int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);

                                int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];

                                b3SortData ok = dstHostOK[destIndex];

                                if (ok.m_key != srcHost[i].m_key)
                                {
                                    printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
                                    printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
                                }

                                if (ok.m_value != srcHost[i].m_value)
                                {
                                    printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
                                    printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
                                }

                                dstHost[destIndex] = srcHost[i];
                                counter[tableIdx]++;
                            }
                        }
                    }
                }
            }
#endif  //SEQUENTIAL

            dst->copyFromHost(dstHost);
        }
#endif  //USE_GPU

#ifdef DEBUG_RADIXSORT
        destHisto->copyToHost(testHist);
        printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
        for (int i = 0; i < testHist.size(); i++)
        {
            if (testHist[i] != 0)
                printf("testHist[%d]=%d\n", i, testHist[i]);
        }
#endif  //DEBUG_RADIXSORT

        b3Swap(src, dst);
        b3Swap(srcHisto, destHisto);

#ifdef DEBUG_RADIXSORT2
        keyValuesInOut.copyToHost(test2);
        printf("numElem = %d\n", test2.size());
        for (int i = 0; i < test2.size(); i++)
        {
            if (test2[i].m_key != test2[i].m_value)
            {
                printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
                printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
            }
        }
#endif  //DEBUG_RADIXSORT2

        count++;
    }

    if (count & 1)
    {
        b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
    }

    if (m_workBuffer4->size())
    {
        m_workBuffer4->resize(originalSize);
        keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
    }

#ifdef DEBUG_RADIXSORT
    keyValuesInOut.copyToHost(test2);
    printf("numElem = %d\n", test2.size());
    for (int i = 0; i < test2.size(); i++)
    {
        printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
        printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
    }
#endif
}
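
// Keys-only variant of the GPU sort. Same pipeline as the key/value version above, but it
// pads into m_workBuffer4a, ping-pongs with m_workBuffer3a, and uses the StreamCountKernel
// and SortAndScatterKernel kernels that operate on plain unsigned ints.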
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
    int originalSize = keysInOut.size();
    int workingSize = originalSize;

    int dataAlignment = DATA_ALIGNMENT;

    b3OpenCLArray<unsigned int>* src = 0;

    if (workingSize % dataAlignment)
    {
        workingSize += dataAlignment - (workingSize % dataAlignment);
        m_workBuffer4a->copyFromOpenCLArray(keysInOut);
        m_workBuffer4a->resize(workingSize);
        unsigned int fillValue = 0xffffffff;

        m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);

        src = m_workBuffer4a;
    }
    else
    {
        src = &keysInOut;
        m_workBuffer4a->resize(0);
    }

    b3Assert(workingSize % DATA_ALIGNMENT == 0);
    int minCap = NUM_BUCKET * NUM_WGS;

    int n = workingSize;

    m_workBuffer1->resize(minCap);
    m_workBuffer3->resize(workingSize);
    m_workBuffer3a->resize(workingSize);

    //	ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
    b3Assert(BITS_PER_PASS == 4);
    b3Assert(WG_SIZE == 64);
    b3Assert((sortBits & 0x3) == 0);

    b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;

    b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
    b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;

    int nWGs = NUM_WGS;
    b3ConstData cdata;

    {
        int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
        int nBlocks = (n + blockSize - 1) / (blockSize);
        cdata.m_n = n;
        cdata.m_nWGs = NUM_WGS;
        cdata.m_startBit = 0;
        cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
        if (nBlocks < NUM_WGS)
        {
            cdata.m_nBlocksPerWG = 1;
            nWGs = nBlocks;
        }
    }

    int count = 0;
    for (int ib = 0; ib < sortBits; ib += 4)
    {
        cdata.m_startBit = ib;

        if (src->size())
        {
            b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
            b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");

            launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
            launcher.setConst(cdata);

            int num = NUM_WGS * WG_SIZE;
            launcher.launch1D(num, WG_SIZE);
        }

        //fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
        bool fastScan = false;
#else
        bool fastScan = !m_deviceCPU;
#endif

        if (fastScan)
        {
            // prefix scan group histogram
            b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
            b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
            launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
            launcher.setConst(cdata);
            launcher.launch1D(128, 128);
            destHisto = srcHisto;
        }
        else
        {
            //unsigned int sum; //for debugging
            m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
        }

        if (src->size())
        {
            // local sort and distribute
            b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
            b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
            launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
            launcher.setConst(cdata);
            launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
        }

        b3Swap(src, dst);
        b3Swap(srcHisto, destHisto);

        count++;
    }

    if (count & 1)
    {
        b3Assert(0);  //need to copy from workbuffer to keysInOut
    }

    if (m_workBuffer4a->size())
    {
        m_workBuffer4a->resize(originalSize);
        keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
    }
}