/*
===========================================================================

Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").

Doom 3 BFG Edition Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 BFG Edition Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 BFG Edition Source Code. If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 BFG Edition Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 BFG Edition Source Code. If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
#ifndef __SOFTWARECACHE_H__
#define __SOFTWARECACHE_H__

#pragma warning( disable : 4324 )	// structure was padded due to __declspec(align())
/*
================================================================================================

	On-Demand Streamed Objects and Arrays

	idODSObject					// DMA in a single object
	idODSCachedObject			// DMA in a single object through a software cache
	idODSArray					// DMA in an array of objects
	idODSIndexedArray			// DMA gather from an array of objects
	idODSStreamedArray			// overlapped DMA streaming of an array of objects
	idODSStreamedIndexedArray	// overlapped DMA gather from an array of objects

	On the SPU the 'idODSObject' streams the data into temporary memory using the DMA controller
	and the object constructor immediately waits for the DMA transfer to complete. In other
	words, there is no caching and every random memory access incurs the full memory latency.
	This should be used to stream in objects that are only used once at unpredictable times.
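
	For instance, a single unpredictable fetch can be wrapped as follows (a minimal sketch;
	idMyType, myGlobalPtr and DoSomething() are hypothetical names used only for illustration):

		const idMyType * myGlobalPtr;					// pointer into main memory
		idODSObject< idMyType > myODS( myGlobalPtr );	// on the SPU: DMA in the object and wait
		DoSomething( myODS.Get() );						// access the streamed-in copy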

	The 'idODSCachedObject' uses an object-based software cache on the SPU, which is useful for
	streaming in objects that may be used repeatedly or whose usage can be predicted, allowing
	the objects to be prefetched.

		class idMyType {};
		class idMyCache : public idSoftwareCache< idMyType, 8, 4 > {};

		idMyCache myCache;
		idMyType * myPtr;
		idODSCachedObject< idMyType, idMyCache > myODS( myPtr, myCache );

	The 'idSoftwareCache' implements a Prefetch() function that can be used to prefetch whole
	objects into the cache well before they are needed. However, any idODSObject, idODSArray,
	idODSIndexedArray etc. constructed after calling the Prefetch() function will have to wait
	for the prefetch to complete. In other words, make sure there is enough "work" done in
	between a Prefetch() call and the next idODS* object.
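
	For example, reusing the hypothetical myCache declared above, issue the prefetch early and
	do enough unrelated work before constructing the idODSCachedObject (DoUnrelatedWork() is a
	hypothetical placeholder):

		myCache.Prefetch( myPtr );			// start streaming the object into the cache
		DoUnrelatedWork();					// enough work here hides the prefetch latency
		idODSCachedObject< idMyType, idMyCache > myODS( myPtr, myCache );	// should no longer stall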

	The 'idODSArray' streams in a block of objects that are tightly packed in memory.

	The 'idODSIndexedArray' is used to gather a number of objects that are not necessarily
	contiguous in memory. On the SPU a DMA-list is used in the 'idODSIndexedArray' constructor
	to efficiently gather all the objects.
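
	For instance, gathering joints referenced by an index list might look like this (a sketch;
	joints, jointIndices, numJointIndices and UseJoint() are hypothetical):

		idODSIndexedArray< idJointMat, short, 16 > gatheredJoints( joints, jointIndices, numJointIndices );
		for ( int i = 0; i < numJointIndices; i++ ) {
			UseJoint( gatheredJoints[i] );	// hypothetical per-element work
		}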

	The 'idODSStreamedArray' is used for sequentially reading a large input array. Overlapped
	streaming is used where one batch of array elements can be accessed while the next batch
	is being streamed in.
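
	A typical consume loop double-buffers through FetchNextBatch() (a minimal sketch; verts,
	numVerts and UseVert() are hypothetical):

		idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE > vertsODS( verts, numVerts );
		for ( int i = 0; i < numVerts; ) {
			const int nextBatch = vertsODS.FetchNextBatch();
			for ( ; i < nextBatch; i++ ) {
				UseVert( vertsODS[i] );		// work on this batch while the next one streams in
			}
		}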

	The 'idODSStreamedIndexedArray' is used for gathering elements from an array using a
	sequentially read index. Overlapped streaming is used for both the index and the array
	elements where one batch of array elements can be accessed while the next batch of
	indices/array elements is being streamed in.
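
	The access pattern mirrors 'idODSStreamedArray' except that both the element array and the
	index are passed in (a sketch; verts, numVerts, indexes, numIndexes and UseVert() are
	hypothetical):

		idODSStreamedIndexedArray< idDrawVert, int, 32, SBT_DOUBLE > indexedVertsODS( verts, numVerts, indexes, numIndexes );
		for ( int i = 0; i < numIndexes; ) {
			const int nextBatch = indexedVertsODS.FetchNextBatch();
			for ( ; i < nextBatch; i++ ) {
				UseVert( indexedVertsODS[i] );	// reads inArray[inIndex[i]]
			}
		}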

	Outside the SPU, data is never copied to temporary memory because this would cause
	significant load-hit-store penalties. Instead, the object constructor issues prefetch
	instructions where appropriate and only maintains pointers to the actual data. In the
	case of 'idODSObject' or 'idODSCachedObject' the class is no more than a simple wrapper
	of a pointer and should completely compile away with zero overhead.

	COMMON MISTAKES:

	1. When using ODS objects do not forget to set the "globalDmaTag" that is used to issue
	   and wait for DMAs.

			void cellSpursJobMain2( CellSpursJobContext2 * stInfo, CellSpursJob256 * job ) {
				globalDmaTag = stInfo->dmaTag;	// for ODS objects
			}

	2. ODS objects can consume quite a bit of stack space. You may have to increase the SPU job
	   stack size. For instance:

			job->header.sizeStack = SPURS_QUADWORDS( 16 * 1024 );	// the ODS objects get pretty large

	   Make sure you measure the size of each ODS object, and if there are recursive functions
	   using ODS objects, make sure the recursion is bounded. When the stack overflows, the
	   scratch and output memory may get overwritten and the results will be undefined. Finding
	   stack overflows is painful.

	3. While you can set up a regular DMA-list entry with a NULL pointer and zero size, do not
	   use a NULL pointer for a cache DMA-list entry. This confuses SPURS and can cause your SPU
	   binary to get corrupted.

================================================================================================
*/

extern uint32 globalDmaTag;

#define MAX_DMA_SIZE			( 1 << 14 )
#define ODS_ROUND16( x )		( ( x + 15 ) & ~15 )

enum streamBufferType_t {
	SBT_DOUBLE = 2,		// double buffering: one batch accessible while the next streams in
	SBT_QUAD = 4		// quad buffering: the previous batch stays accessible as well
};

/*
================================================================================================

	non-SPU code

================================================================================================
*/

/*
================================================
idSoftwareCache
================================================
*/
template< typename _type_, int _entries_ = 8, int _associativity_ = 4, bool aligned = false >
class ALIGNTYPE128 idSoftwareCache {
public:
	// Outside the SPU there is no actual cache; Prefetch() just issues a CPU prefetch hint.
	void Prefetch( const _type_ * obj ) {
		::Prefetch( obj, 0 );
	}
};

/*
================================================
idODSObject
================================================
*/
template< typename _type_ >
class idODSObject {
public:
	idODSObject( const _type_ * obj ) : objectPtr( obj ) {}

	operator const _type_ & () const { return *objectPtr; }
	const _type_ * operator->() const { return objectPtr; }
	const _type_ & Get() const { return *objectPtr; }
	const _type_ * Ptr() const { return objectPtr; }
	const _type_ * OriginalPtr() const { return objectPtr; }

private:
	const _type_ * objectPtr;
};

/*
================================================
idODSCachedObject
================================================
*/
template< typename _type_, typename _cache_ >
class idODSCachedObject {
public:
	// Outside the SPU the cache is not used and this is a plain pointer wrapper.
	idODSCachedObject( const _type_ * obj, _cache_ & cache ) : objectPtr( obj ) {}

	operator const _type_ & () const { return *objectPtr; }
	const _type_ * operator->() const { return objectPtr; }
	const _type_ & Get() const { return *objectPtr; }
	const _type_ * Ptr() const { return objectPtr; }
	const _type_ * OriginalPtr() const { return objectPtr; }

private:
	const _type_ * objectPtr;
};

/*
================================================
idODSArray
================================================
*/
template< typename _type_, int max >
class idODSArray {
public:
	idODSArray( const _type_ * array, int num ) : arrayPtr( array ), arrayNum( num ) {
		assert( num <= max );
		Prefetch( array, 0 );
	}
	const _type_ & operator[]( int index ) const {
		assert( index >= 0 && index < arrayNum );
		return arrayPtr[index];
	}
	const _type_ * Ptr() const { return arrayPtr; }
	int Num() const { return arrayNum; }

private:
	const _type_ * arrayPtr;
	int arrayNum;
};

/*
================================================
idODSIndexedArray
================================================
*/
template< typename _elemType_, typename _indexType_, int max >
class idODSIndexedArray {
public:
	idODSIndexedArray( const _elemType_ * array, const _indexType_ * index, int num ) : arrayNum( num ) {
		assert( num <= max );
		for ( int i = 0; i < num; i++ ) {
			// Prefetch the referenced element from the source array.
			Prefetch( array, abs( index[i] ) * sizeof( _elemType_ ) );
			arrayPtr[i] = array + abs( index[i] );
		}
	}

	const _elemType_ & operator[]( int index ) const {
		assert( index >= 0 && index < arrayNum );
		return * arrayPtr[index];
	}

	void ReplicateUpToMultipleOfFour() {
		assert( ( max & 3 ) == 0 );
		while ( ( arrayNum & 3 ) != 0 ) {
			arrayPtr[arrayNum++] = arrayPtr[0];
		}
	}

private:
	const _elemType_ * arrayPtr[max];
	int arrayNum;
};

/*
================================================
idODSStreamedOutputArray
================================================
*/
template< typename _type_, int _bufferSize_ >
class ALIGNTYPE16 idODSStreamedOutputArray {
public:
	idODSStreamedOutputArray( _type_ * array, int * numElements, int maxElements ) :
			localNum( 0 ),
			outArray( array ),
			outNum( numElements ),
			outMax( maxElements ) {
		compile_time_assert( CONST_ISPOWEROFTWO( _bufferSize_ ) );
		compile_time_assert( ( ( _bufferSize_ * sizeof( _type_ ) ) & 15 ) == 0 );
		compile_time_assert( _bufferSize_ * sizeof( _type_ ) < MAX_DMA_SIZE );
		assert_16_byte_aligned( array );
	}
	~idODSStreamedOutputArray() {
		*outNum = localNum;
	}

	int Num() const { return localNum; }
	void Append( _type_ element ) { assert( localNum < outMax ); outArray[localNum++] = element; }
	_type_ & Alloc() { assert( localNum < outMax ); return outArray[localNum++]; }

private:
	int localNum;
	_type_ * outArray;
	int * outNum;
	int outMax;
};
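
/*
A minimal usage sketch for idODSStreamedOutputArray (outVerts, numOutVerts, maxOutVerts and
MakeVert() are hypothetical names): the destructor writes the final element count back through
the numElements pointer, so the wrapper is typically scoped.

	int numOutVerts = 0;
	{
		idODSStreamedOutputArray< idDrawVert, 32 > outODS( outVerts, &numOutVerts, maxOutVerts );
		outODS.Append( MakeVert() );
	}
	// numOutVerts now holds the number of appended elements
*/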

/*
================================================
idODSStreamedArray
================================================
*/
template< typename _type_, int _bufferSize_, streamBufferType_t _sbt_ = SBT_DOUBLE, int _roundUpToMultiple_ = 1 >
class ALIGNTYPE16 idODSStreamedArray {
public:
	idODSStreamedArray( const _type_ * array, const int numElements ) :
			cachedArrayStart( 0 ),
			cachedArrayEnd( 0 ),
			streamArrayEnd( 0 ),
			inArray( array ),
			inArrayNum( numElements ),
			inArrayNumRoundedUp( numElements ) {
		compile_time_assert( CONST_ISPOWEROFTWO( _bufferSize_ ) );
		compile_time_assert( ( ( _bufferSize_ * sizeof( _type_ ) ) & 15 ) == 0 );
		compile_time_assert( _bufferSize_ * sizeof( _type_ ) < MAX_DMA_SIZE );
		compile_time_assert( _roundUpToMultiple_ >= 1 );
		assert_16_byte_aligned( array );
		assert( (uintptr_t)array > _bufferSize_ * sizeof( _type_ ) );
		// Fetch the first batch of elements.
		FetchNextBatch();
		// Calculate the rounded-up size here, making the mod effectively free: we have to wait
		// for memory access anyway, and the FetchNextBatch() above does not need the rounded-up
		// size yet.
		inArrayNumRoundedUp += _roundUpToMultiple_ - 1;
		inArrayNumRoundedUp -= inArrayNumRoundedUp % ( ( _roundUpToMultiple_ > 1 ) ? _roundUpToMultiple_ : 1 );
	}
	~idODSStreamedArray() {
		// Flush the accessible part of the array.
		FlushArray( inArray, cachedArrayStart * sizeof( _type_ ), cachedArrayEnd * sizeof( _type_ ) );
	}

	// Fetches a new batch of array elements and returns the first index after this new batch.
	// After calling this, the elements starting at the index returned by the previous call to
	// FetchNextBatch() (or zero if not yet called) up to (excluding) the index returned by
	// this call to FetchNextBatch() can be accessed through the [] operator. When quad-buffering,
	// the elements starting at the index returned by the second-from-last call to FetchNextBatch()
	// can still be accessed. This is useful when the algorithm needs to successively access
	// an odd number of elements at the same time that may cross a single buffer boundary.
	int FetchNextBatch() {
		// If not everything has been streamed already.
		if ( cachedArrayEnd < inArrayNum ) {
			cachedArrayEnd = streamArrayEnd;
			cachedArrayStart = Max( cachedArrayEnd - _bufferSize_ * ( _sbt_ - 1 ), 0 );

			// Flush the last batch of elements that is no longer accessible.
			FlushArray( inArray, ( cachedArrayStart - _bufferSize_ ) * sizeof( _type_ ), cachedArrayStart * sizeof( _type_ ) );

			// Prefetch the next batch of elements.
			if ( streamArrayEnd < inArrayNum ) {
				streamArrayEnd = Min( streamArrayEnd + _bufferSize_, inArrayNum );
				for ( unsigned int offset = cachedArrayEnd * sizeof( _type_ ); offset < streamArrayEnd * sizeof( _type_ ); offset += CACHE_LINE_SIZE ) {
					Prefetch( inArray, offset );
				}
			}
		}
		return ( cachedArrayEnd == inArrayNum ) ? inArrayNumRoundedUp : cachedArrayEnd;
	}

	// Provides access to the elements starting at the index returned by the next-to-last call
	// to FetchNextBatch() (or zero if only called once so far) up to (excluding) the index
	// returned by the last call to FetchNextBatch(). When quad-buffering, the elements starting
	// at the index returned by the second-from-last call to FetchNextBatch() can still be accessed.
	// This is useful when the algorithm needs to successively access an odd number of elements
	// at the same time that may cross a single buffer boundary.
	const _type_ & operator[]( int index ) const {
		assert( ( index >= cachedArrayStart && index < cachedArrayEnd ) || ( cachedArrayEnd == inArrayNum && index >= inArrayNum && index < inArrayNumRoundedUp ) );
		if ( _roundUpToMultiple_ > 1 ) {
			// Indices in the rounded-up tail past the end of the array are redirected to element zero.
			index &= ( index - inArrayNum ) >> 31;
		}
		return inArray[index];
	}

private:
	int cachedArrayStart;
	int cachedArrayEnd;
	int streamArrayEnd;
	const _type_ * inArray;
	int inArrayNum;
	int inArrayNumRoundedUp;

	static void FlushArray( const void * flushArray, int flushStart, int flushEnd ) {
#if 0
		// arrayFlushBase is rounded up so we do not flush anything before the array.
		// arrayFlushStart is rounded down so we start right after the last cache line that was previously flushed.
		// arrayFlushEnd is rounded down so we do not flush a cache line that holds data that may still be partially
		// accessible or a cache line that stretches beyond the end of the array.
		const uintptr_t arrayAddress = (uintptr_t)flushArray;
		const uintptr_t arrayFlushBase = ( arrayAddress + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 );
		const uintptr_t arrayFlushStart = ( arrayAddress + flushStart ) & ~( CACHE_LINE_SIZE - 1 );
		const uintptr_t arrayFlushEnd = ( arrayAddress + flushEnd ) & ~( CACHE_LINE_SIZE - 1 );
		for ( uintptr_t offset = Max( arrayFlushBase, arrayFlushStart ); offset < arrayFlushEnd; offset += CACHE_LINE_SIZE ) {
			FlushCacheLine( flushArray, offset - arrayAddress );
		}
#endif
	}
};

/*
================================================
idODSStreamedIndexedArray

For gathering elements from an array using a sequentially read index.
This uses overlapped streaming for both the index and the array elements
where one batch of indices and/or array elements can be accessed while
the next batch is being streamed in.

NOTE: currently the size of the array elements must be a multiple of 16 bytes.
An index with offsets and more complex logic is needed to support other sizes.
================================================
*/
template< typename _elemType_, typename _indexType_, int _bufferSize_, streamBufferType_t _sbt_ = SBT_DOUBLE, int _roundUpToMultiple_ = 1 >
class ALIGNTYPE16 idODSStreamedIndexedArray {
public:
	idODSStreamedIndexedArray( const _elemType_ * array, const int numElements, const _indexType_ * index, const int numIndices ) :
			cachedArrayStart( 0 ),
			cachedArrayEnd( 0 ),
			streamArrayEnd( 0 ),
			cachedIndexStart( 0 ),
			cachedIndexEnd( 0 ),
			streamIndexEnd( 0 ),
			inArray( array ),
			inArrayNum( numElements ),
			inIndex( index ),
			inIndexNum( numIndices ),
			inIndexNumRoundedUp( numIndices ) {
		compile_time_assert( CONST_ISPOWEROFTWO( _bufferSize_ ) );
		compile_time_assert( ( ( _bufferSize_ * sizeof( _indexType_ ) ) & 15 ) == 0 );
		compile_time_assert( _bufferSize_ * sizeof( _indexType_ ) < MAX_DMA_SIZE );
		compile_time_assert( _bufferSize_ * sizeof( _elemType_ ) < MAX_DMA_SIZE );
		compile_time_assert( ( sizeof( _elemType_ ) & 15 ) == 0 );	// to avoid complexity due to cellDmaListGet
		compile_time_assert( _roundUpToMultiple_ >= 1 );
		assert_16_byte_aligned( index );
		assert_16_byte_aligned( array );
		assert( (uintptr_t)index > _bufferSize_ * sizeof( _indexType_ ) );
		assert( (uintptr_t)array > _bufferSize_ * sizeof( _elemType_ ) );
		// Fetch the first batch of indices.
		FetchNextBatch();
		// Fetch the first batch of elements and the next batch of indices.
		FetchNextBatch();
		// Calculate the rounded-up size here, making the mod effectively free: we have to wait
		// for memory access anyway, and the calls to FetchNextBatch() above do not need the
		// rounded-up size yet.
		inIndexNumRoundedUp += _roundUpToMultiple_ - 1;
		inIndexNumRoundedUp -= inIndexNumRoundedUp % ( ( _roundUpToMultiple_ > 1 ) ? _roundUpToMultiple_ : 1 );
	}
	~idODSStreamedIndexedArray() {
		// Flush the accessible part of the index.
		FlushArray( inIndex, cachedIndexStart * sizeof( _indexType_ ), cachedIndexEnd * sizeof( _indexType_ ) );
		// Flush the accessible part of the array.
		FlushArray( inArray, cachedArrayStart * sizeof( _elemType_ ), cachedArrayEnd * sizeof( _elemType_ ) );
	}

	// Fetches a new batch of array elements and returns the first index after this new batch.
	// After calling this, the elements starting at the index returned by the previous call to
	// FetchNextBatch() (or zero if not yet called) up to (excluding) the index returned by
	// this call to FetchNextBatch() can be accessed through the [] operator. When quad-buffering,
	// the elements starting at the index returned by the second-from-last call to FetchNextBatch()
	// can still be accessed. This is useful when the algorithm needs to successively access
	// an odd number of elements at the same time that may cross a single buffer boundary.
	int FetchNextBatch() {
		// If not everything has been streamed already.
		if ( cachedArrayEnd < inIndexNum ) {
			if ( streamIndexEnd > 0 ) {
				cachedArrayEnd = streamArrayEnd;
				cachedArrayStart = Max( cachedArrayEnd - _bufferSize_ * ( _sbt_ - 1 ), 0 );
				cachedIndexEnd = streamIndexEnd;
				cachedIndexStart = Max( cachedIndexEnd - _bufferSize_ * ( _sbt_ - 1 ), 0 );

				// Flush the last batch of indices that is no longer accessible.
				FlushArray( inIndex, ( cachedIndexStart - _bufferSize_ ) * sizeof( _indexType_ ), cachedIndexStart * sizeof( _indexType_ ) );
				// Flush the last batch of elements that is no longer accessible.
				FlushArray( inArray, ( cachedArrayStart - _bufferSize_ ) * sizeof( _elemType_ ), cachedArrayStart * sizeof( _elemType_ ) );

				// Prefetch the next batch of elements.
				if ( streamArrayEnd < inIndexNum ) {
					streamArrayEnd = cachedIndexEnd;
					for ( int i = cachedArrayEnd; i < streamArrayEnd; i++ ) {
						assert( i >= cachedIndexStart && i < cachedIndexEnd );
						assert( inIndex[i] >= 0 && inIndex[i] < inArrayNum );

						Prefetch( inArray, inIndex[i] * sizeof( _elemType_ ) );
					}
				}
			}

			// Prefetch the next batch of indices.
			if ( streamIndexEnd < inIndexNum ) {
				streamIndexEnd = Min( streamIndexEnd + _bufferSize_, inIndexNum );
				for ( unsigned int offset = cachedIndexEnd * sizeof( _indexType_ ); offset < streamIndexEnd * sizeof( _indexType_ ); offset += CACHE_LINE_SIZE ) {
					Prefetch( inIndex, offset );
				}
			}
		}
		return ( cachedArrayEnd == inIndexNum ) ? inIndexNumRoundedUp : cachedArrayEnd;
	}

	// Provides access to the elements starting at the index returned by the next-to-last call
	// to FetchNextBatch() (or zero if only called once so far) up to (excluding) the index
	// returned by the last call to FetchNextBatch(). When quad-buffering, the elements starting
	// at the index returned by the second-from-last call to FetchNextBatch() can still be accessed.
	// This is useful when the algorithm needs to successively access an odd number of elements
	// at the same time that may cross a single buffer boundary.
	const _elemType_ & operator[]( int index ) const {
		assert( ( index >= cachedArrayStart && index < cachedArrayEnd ) || ( cachedArrayEnd == inIndexNum && index >= inIndexNum && index < inIndexNumRoundedUp ) );
		if ( _roundUpToMultiple_ > 1 ) {
			// Indices in the rounded-up tail past the end of the index are redirected to element zero.
			index &= ( index - inIndexNum ) >> 31;
		}
		return inArray[inIndex[index]];
	}

private:
	int cachedArrayStart;
	int cachedArrayEnd;
	int streamArrayEnd;
	int cachedIndexStart;
	int cachedIndexEnd;
	int streamIndexEnd;
	const _elemType_ * inArray;
	int inArrayNum;
	const _indexType_ * inIndex;
	int inIndexNum;
	int inIndexNumRoundedUp;

	static void FlushArray( const void * flushArray, int flushStart, int flushEnd ) {
#if 0
		// arrayFlushBase is rounded up so we do not flush anything before the array.
		// arrayFlushStart is rounded down so we start right after the last cache line that was previously flushed.
		// arrayFlushEnd is rounded down so we do not flush a cache line that holds data that may still be partially
		// accessible or a cache line that stretches beyond the end of the array.
		const uintptr_t arrayAddress = (uintptr_t)flushArray;
		const uintptr_t arrayFlushBase = ( arrayAddress + CACHE_LINE_SIZE - 1 ) & ~( CACHE_LINE_SIZE - 1 );
		const uintptr_t arrayFlushStart = ( arrayAddress + flushStart ) & ~( CACHE_LINE_SIZE - 1 );
		const uintptr_t arrayFlushEnd = ( arrayAddress + flushEnd ) & ~( CACHE_LINE_SIZE - 1 );
		for ( uintptr_t offset = Max( arrayFlushBase, arrayFlushStart ); offset < arrayFlushEnd; offset += CACHE_LINE_SIZE ) {
			FlushCacheLine( flushArray, offset - arrayAddress );
		}
#endif
	}
};

#endif // !__SOFTWARECACHE_H__