123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711 |
- #include "b3RadixSort32CL.h"
- #include "b3LauncherCL.h"
- #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
- #include "b3PrefixScanCL.h"
- #include "b3FillCL.h"
- #define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
- #include "kernels/RadixSort32KernelsCL.h"
- b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
- :m_commandQueue(queue)
- {
- b3OpenCLDeviceInfo info;
- b3OpenCLUtils::getDeviceInfo(device,&info);
- m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0;
- m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx,queue);
- m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx,queue);
- m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx,queue);
- m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx,queue);
- m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx,queue);
- m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx,queue);
- if (initialCapacity>0)
- {
- m_workBuffer1->resize(initialCapacity);
- m_workBuffer3->resize(initialCapacity);
- m_workBuffer3a->resize(initialCapacity);
- m_workBuffer4->resize(initialCapacity);
- m_workBuffer4a->resize(initialCapacity);
- }
- m_scan = new b3PrefixScanCL(ctx,device,queue);
- m_fill = new b3FillCL(ctx,device,queue);
-
- const char* additionalMacros = "";
- cl_int pErrNum;
- const char* kernelSource = radixSort32KernelsCL;
-
- cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH);
- b3Assert(sortProg);
- m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_streamCountSortDataKernel );
-
- m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_streamCountKernel);
-
- if (m_deviceCPU)
- {
-
- m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_sortAndScatterSortDataKernel);
- m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_sortAndScatterKernel);
- } else
- {
- m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_sortAndScatterSortDataKernel);
- m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_sortAndScatterKernel);
- }
-
- m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros );
- b3Assert(m_prefixScanKernel);
-
- }
- b3RadixSort32CL::~b3RadixSort32CL()
- {
- delete m_scan;
- delete m_fill;
- delete m_workBuffer1;
- delete m_workBuffer2;
- delete m_workBuffer3;
- delete m_workBuffer3a;
- delete m_workBuffer4;
- delete m_workBuffer4a;
- clReleaseKernel(m_streamCountSortDataKernel);
- clReleaseKernel(m_streamCountKernel);
- clReleaseKernel(m_sortAndScatterSortDataKernel);
- clReleaseKernel(m_sortAndScatterKernel);
- clReleaseKernel(m_prefixScanKernel);
- }
- void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
- {
- int n = inout.size();
- const int BITS_PER_PASS = 8;
- const int NUM_TABLES = (1<<BITS_PER_PASS);
- int tables[NUM_TABLES];
- int counter[NUM_TABLES];
- b3SortData* src = &inout[0];
- b3AlignedObjectArray<b3SortData> workbuffer;
- workbuffer.resize(inout.size());
- b3SortData* dst = &workbuffer[0];
- int count=0;
- for(int startBit=0; startBit<sortBits; startBit+=BITS_PER_PASS)
- {
- for(int i=0; i<NUM_TABLES; i++)
- {
- tables[i] = 0;
- }
- for(int i=0; i<n; i++)
- {
- int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
- tables[tableIdx]++;
- }
- //#define TEST
- #ifdef TEST
- printf("histogram size=%d\n",NUM_TABLES);
- for (int i=0;i<NUM_TABLES;i++)
- {
- if (tables[i]!=0)
- {
- printf("tables[%d]=%d]\n",i,tables[i]);
- }
- }
- #endif //TEST
- // prefix scan
- int sum = 0;
- for(int i=0; i<NUM_TABLES; i++)
- {
- int iData = tables[i];
- tables[i] = sum;
- sum += iData;
- counter[i] = 0;
- }
- // distribute
- for(int i=0; i<n; i++)
- {
- int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES-1);
-
- dst[tables[tableIdx] + counter[tableIdx]] = src[i];
- counter[tableIdx] ++;
- }
- b3Swap( src, dst );
- count++;
- }
- if (count&1)
- {
- b3Assert(0);//need to copy
- }
- }
- void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
- {
- b3AlignedObjectArray<b3SortData> inout;
- keyValuesInOut.copyToHost(inout);
- executeHost(inout,sortBits);
- keyValuesInOut.copyFromHost(inout);
- }
- void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
- b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
- {
- }
- //#define DEBUG_RADIXSORT
- //#define DEBUG_RADIXSORT2
- void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
- {
-
- int originalSize = keyValuesInOut.size();
- int workingSize = originalSize;
-
-
- int dataAlignment = DATA_ALIGNMENT;
- #ifdef DEBUG_RADIXSORT2
- b3AlignedObjectArray<b3SortData> test2;
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
- #endif //DEBUG_RADIXSORT2
-
- b3OpenCLArray<b3SortData>* src = 0;
- if (workingSize%dataAlignment)
- {
- workingSize += dataAlignment-(workingSize%dataAlignment);
- m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
- m_workBuffer4->resize(workingSize);
- b3SortData fillValue;
- fillValue.m_key = 0xffffffff;
- fillValue.m_value = 0xffffffff;
- #define USE_BTFILL
- #ifdef USE_BTFILL
- m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4,(b3Int2&)fillValue,workingSize-originalSize,originalSize);
- #else
- //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
-
- for (int i=originalSize; i<workingSize;i++)
- {
- m_workBuffer4->copyFromHostPointer(&fillValue,1,i);
- }
- #endif//USE_BTFILL
- src = m_workBuffer4;
- } else
- {
- src = &keyValuesInOut;
- m_workBuffer4->resize(0);
- }
-
- b3Assert( workingSize%DATA_ALIGNMENT == 0 );
- int minCap = NUM_BUCKET*NUM_WGS;
- int n = workingSize;
- m_workBuffer1->resize(minCap);
- m_workBuffer3->resize(workingSize);
-
- // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
- b3Assert( BITS_PER_PASS == 4 );
- b3Assert( WG_SIZE == 64 );
- b3Assert( (sortBits&0x3) == 0 );
-
-
- b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
- b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
- b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
- int nWGs = NUM_WGS;
- b3ConstData cdata;
- {
- int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
- int nBlocks = (n+blockSize-1)/(blockSize);
- cdata.m_n = n;
- cdata.m_nWGs = NUM_WGS;
- cdata.m_startBit = 0;
- cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
- if( nBlocks < NUM_WGS )
- {
- cdata.m_nBlocksPerWG = 1;
- nWGs = nBlocks;
- }
- }
- int count=0;
- for(int ib=0; ib<sortBits; ib+=4)
- {
- #ifdef DEBUG_RADIXSORT2
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- if (test2[i].m_key != test2[i].m_value)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
- }
- #endif //DEBUG_RADIXSORT2
-
- cdata.m_startBit = ib;
-
- if (src->size())
- {
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel,"m_streamCountSortDataKernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
-
- int num = NUM_WGS*WG_SIZE;
- launcher.launch1D( num, WG_SIZE );
- }
-
-
- #ifdef DEBUG_RADIXSORT
- b3AlignedObjectArray<unsigned int> testHist;
- srcHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
- for (int i=0;i<testHist.size();i++)
- {
- if (testHist[i]!=0)
- printf("testHist[%d]=%d\n",i,testHist[i]);
- }
- #endif //DEBUG_RADIXSORT
-
-
- //fast prefix scan is not working properly on Mac OSX yet
- #ifdef __APPLE__
- bool fastScan=false;
- #else
- bool fastScan=!m_deviceCPU;//only use fast scan on GPU
- #endif
- if (fastScan)
- {// prefix scan group histogram
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( 128, 128 );
- destHisto = srcHisto;
- }else
- {
- //unsigned int sum; //for debugging
- m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
- }
- #ifdef DEBUG_RADIXSORT
- destHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
- for (int i=0;i<testHist.size();i++)
- {
- if (testHist[i]!=0)
- printf("testHist[%d]=%d\n",i,testHist[i]);
- }
-
- for (int i=0;i<testHist.size();i+=NUM_WGS)
- {
- printf("testHist[%d]=%d\n",i/NUM_WGS,testHist[i]);
- }
- #endif //DEBUG_RADIXSORT
- #define USE_GPU
- #ifdef USE_GPU
-
- if (src->size())
- {// local sort and distribute
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
- b3LauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel,"m_sortAndScatterSortDataKernel" );
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
-
- }
- #else
- {
- #define NUM_TABLES 16
- //#define SEQUENTIAL
- #ifdef SEQUENTIAL
- int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
- int tables[NUM_TABLES];
- int startBit = ib;
-
- destHisto->copyToHost(testHist);
- b3AlignedObjectArray<b3SortData> srcHost;
- b3AlignedObjectArray<b3SortData> dstHost;
- dstHost.resize(src->size());
-
- src->copyToHost(srcHost);
-
- for (int i=0;i<NUM_TABLES;i++)
- {
- tables[i] = testHist[i*NUM_WGS];
- }
-
- // distribute
- for(int i=0; i<n; i++)
- {
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-
- dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
- counter2[tableIdx] ++;
- }
-
-
- #else
-
- int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
- int tables[NUM_TABLES];
- b3AlignedObjectArray<b3SortData> dstHostOK;
- dstHostOK.resize(src->size());
- destHisto->copyToHost(testHist);
- b3AlignedObjectArray<b3SortData> srcHost;
- src->copyToHost(srcHost);
-
- int blockSize = 256;
- int nBlocksPerWG = cdata.m_nBlocksPerWG;
- int startBit = ib;
- {
- for (int i=0;i<NUM_TABLES;i++)
- {
- tables[i] = testHist[i*NUM_WGS];
- }
-
- // distribute
- for(int i=0; i<n; i++)
- {
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-
- dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
- counter2[tableIdx] ++;
- }
-
- }
-
-
- b3AlignedObjectArray<b3SortData> dstHost;
- dstHost.resize(src->size());
-
-
- int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-
-
- for (int wgIdx=0;wgIdx<NUM_WGS;wgIdx++)
- {
- int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
- int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
-
- for(int iblock=0; iblock<b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
- {
- for (int lIdx = 0;lIdx < 64;lIdx++)
- {
- int addr = iblock*blockSize + blockSize*cdata.m_nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
-
- // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
- // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
- // AMD: AtomInc performs better while NV prefers ++
- for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
- {
- if( addr+j < n )
- {
- // printf ("addr+j=%d\n", addr+j);
-
- int i = addr+j;
-
- int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES-1);
-
- int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx];
-
- b3SortData ok = dstHostOK[destIndex];
-
- if (ok.m_key != srcHost[i].m_key)
- {
- printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key );
- printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value );
- }
- if (ok.m_value != srcHost[i].m_value)
- {
-
- printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value );
- printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key );
- }
-
- dstHost[destIndex] = srcHost[i];
- counter[tableIdx] ++;
-
- }
- }
- }
- }
- }
-
-
- #endif //SEQUENTIAL
-
- dst->copyFromHost(dstHost);
- }
- #endif//USE_GPU
-
-
-
- #ifdef DEBUG_RADIXSORT
- destHisto->copyToHost(testHist);
- printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size());
- for (int i=0;i<testHist.size();i++)
- {
- if (testHist[i]!=0)
- printf("testHist[%d]=%d\n",i,testHist[i]);
- }
- #endif //DEBUG_RADIXSORT
- b3Swap(src, dst );
- b3Swap(srcHisto,destHisto);
- #ifdef DEBUG_RADIXSORT2
- keyValuesInOut.copyToHost(test2);
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- if (test2[i].m_key != test2[i].m_value)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
- }
- #endif //DEBUG_RADIXSORT2
-
- count++;
-
-
- }
-
-
-
- if (count&1)
- {
- b3Assert(0);//need to copy from workbuffer to keyValuesInOut
- }
- if (m_workBuffer4->size())
- {
- m_workBuffer4->resize(originalSize);
- keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
- }
- #ifdef DEBUG_RADIXSORT
- keyValuesInOut.copyToHost(test2);
-
- printf("numElem = %d\n",test2.size());
- for (int i=0;i<test2.size();i++)
- {
- printf("test2[%d].m_key=%d\n",i,test2[i].m_key);
- printf("test2[%d].m_value=%d\n",i,test2[i].m_value);
- }
- #endif
-
- }
- void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
- {
- int originalSize = keysInOut.size();
- int workingSize = originalSize;
-
-
- int dataAlignment = DATA_ALIGNMENT;
- b3OpenCLArray<unsigned int>* src = 0;
- if (workingSize%dataAlignment)
- {
- workingSize += dataAlignment-(workingSize%dataAlignment);
- m_workBuffer4a->copyFromOpenCLArray(keysInOut);
- m_workBuffer4a->resize(workingSize);
- unsigned int fillValue = 0xffffffff;
-
- m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize);
- src = m_workBuffer4a;
- } else
- {
- src = &keysInOut;
- m_workBuffer4a->resize(0);
- }
-
-
- b3Assert( workingSize%DATA_ALIGNMENT == 0 );
- int minCap = NUM_BUCKET*NUM_WGS;
- int n = workingSize;
-
- m_workBuffer1->resize(minCap);
- m_workBuffer3->resize(workingSize);
- m_workBuffer3a->resize(workingSize);
- // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
- b3Assert( BITS_PER_PASS == 4 );
- b3Assert( WG_SIZE == 64 );
- b3Assert( (sortBits&0x3) == 0 );
-
-
- b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
- b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
- b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
- int nWGs = NUM_WGS;
- b3ConstData cdata;
- {
- int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256
- int nBlocks = (n+blockSize-1)/(blockSize);
- cdata.m_n = n;
- cdata.m_nWGs = NUM_WGS;
- cdata.m_startBit = 0;
- cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs;
- if( nBlocks < NUM_WGS )
- {
- cdata.m_nBlocksPerWG = 1;
- nWGs = nBlocks;
- }
- }
- int count=0;
- for(int ib=0; ib<sortBits; ib+=4)
- {
- cdata.m_startBit = ib;
-
- if (src->size())
- {
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher(m_commandQueue, m_streamCountKernel,"m_streamCountKernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
-
- int num = NUM_WGS*WG_SIZE;
- launcher.launch1D( num, WG_SIZE );
- }
-
- //fast prefix scan is not working properly on Mac OSX yet
- #ifdef __APPLE__
- bool fastScan=false;
- #else
- bool fastScan=!m_deviceCPU;
- #endif
- if (fastScan)
- {// prefix scan group histogram
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( srcHisto->getBufferCL() ) };
- b3LauncherCL launcher( m_commandQueue, m_prefixScanKernel,"m_prefixScanKernel" );
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( 128, 128 );
- destHisto = srcHisto;
- }else
- {
- //unsigned int sum; //for debugging
- m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum);
- }
- if (src->size())
- {// local sort and distribute
- b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src->getBufferCL(), true ), b3BufferInfoCL( destHisto->getBufferCL(), true ), b3BufferInfoCL( dst->getBufferCL() )};
- b3LauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ,"m_sortAndScatterKernel");
- launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
- launcher.setConst( cdata );
- launcher.launch1D( nWGs*WG_SIZE, WG_SIZE );
-
- }
-
- b3Swap(src, dst );
- b3Swap(srcHisto,destHisto);
- count++;
- }
-
- if (count&1)
- {
- b3Assert(0);//need to copy from workbuffer to keyValuesInOut
- }
- if (m_workBuffer4a->size())
- {
- m_workBuffer4a->resize(originalSize);
- keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
- }
-
- }
|