// ethash_cl_miner_kernel.cl

// author Tim Hughes <tim@twistedfury.com>
// Tested on Radeon HD 7850
// Hashrate: 15940347 hashes/s
// Bandwidth: 124533 MB/s
// search kernel should fit in <= 84 VGPRs (3 wavefronts)

#ifdef cl_clang_storage_class_specifiers
#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
#endif

#define THREADS_PER_HASH (128 / 16)
#define HASHES_PER_LOOP (GROUP_SIZE / THREADS_PER_HASH)
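// Worked example (GROUP_SIZE is supplied by the host at build time; 128
// here is only illustrative): THREADS_PER_HASH = 128/16 = 8 work items
// cooperate on each hash, and HASHES_PER_LOOP = 128/8 = 16 hashes are in
// flight per work group. DAG_SIZE, ACCESSES and MAX_OUTPUTS are likewise
// host-defined at build time.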
#define FNV_PRIME 0x01000193
__constant uint2 const Keccak_f1600_RC[24] = {
    (uint2)(0x00000001, 0x00000000),
    (uint2)(0x00008082, 0x00000000),
    (uint2)(0x0000808a, 0x80000000),
    (uint2)(0x80008000, 0x80000000),
    (uint2)(0x0000808b, 0x00000000),
    (uint2)(0x80000001, 0x00000000),
    (uint2)(0x80008081, 0x80000000),
    (uint2)(0x00008009, 0x80000000),
    (uint2)(0x0000008a, 0x00000000),
    (uint2)(0x00000088, 0x00000000),
    (uint2)(0x80008009, 0x00000000),
    (uint2)(0x8000000a, 0x00000000),
    (uint2)(0x8000808b, 0x00000000),
    (uint2)(0x0000008b, 0x80000000),
    (uint2)(0x00008089, 0x80000000),
    (uint2)(0x00008003, 0x80000000),
    (uint2)(0x00008002, 0x80000000),
    (uint2)(0x00000080, 0x80000000),
    (uint2)(0x0000800a, 0x00000000),
    (uint2)(0x8000000a, 0x80000000),
    (uint2)(0x80008081, 0x80000000),
    (uint2)(0x00008080, 0x80000000),
    (uint2)(0x80000001, 0x00000000),
    (uint2)(0x80008008, 0x80000000),
};
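// Each uint2 above packs one 64-bit Keccak round constant in (low word,
// high word) order, matching the split-register representation used for
// all 64-bit lanes in this kernel; e.g. RC[2] = 0x800000000000808a is
// stored as (0x0000808a, 0x80000000).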
static void keccak_f1600_round(uint2* a, uint r, uint out_size)
{
#if !__ENDIAN_LITTLE__
    for (uint i = 0; i != 25; ++i)
        a[i] = a[i].yx;
#endif

    uint2 b[25];
    uint2 t;

    // Theta
    b[0] = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20];
    b[1] = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21];
    b[2] = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22];
    b[3] = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23];
    b[4] = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24];
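    // Each t below is C[x-1] ^ rotl64(C[x+1], 1) written out on the split
    // (lo, hi) pairs: a 64-bit rotate-left by 1 shifts each half left and
    // feeds the top bit of the opposite half into the vacated bit.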
    t = b[4] ^ (uint2)(b[1].x << 1 | b[1].y >> 31, b[1].y << 1 | b[1].x >> 31);
    a[0] ^= t;
    a[5] ^= t;
    a[10] ^= t;
    a[15] ^= t;
    a[20] ^= t;
    t = b[0] ^ (uint2)(b[2].x << 1 | b[2].y >> 31, b[2].y << 1 | b[2].x >> 31);
    a[1] ^= t;
    a[6] ^= t;
    a[11] ^= t;
    a[16] ^= t;
    a[21] ^= t;
    t = b[1] ^ (uint2)(b[3].x << 1 | b[3].y >> 31, b[3].y << 1 | b[3].x >> 31);
    a[2] ^= t;
    a[7] ^= t;
    a[12] ^= t;
    a[17] ^= t;
    a[22] ^= t;
    t = b[2] ^ (uint2)(b[4].x << 1 | b[4].y >> 31, b[4].y << 1 | b[4].x >> 31);
    a[3] ^= t;
    a[8] ^= t;
    a[13] ^= t;
    a[18] ^= t;
    a[23] ^= t;
    t = b[3] ^ (uint2)(b[0].x << 1 | b[0].y >> 31, b[0].y << 1 | b[0].x >> 31);
    a[4] ^= t;
    a[9] ^= t;
    a[14] ^= t;
    a[19] ^= t;
    a[24] ^= t;

    // Rho Pi
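    // Rotation amounts < 32 keep the (lo, hi) component order, e.g.
    // b[10] = rotl64(a[1], 1); amounts >= 32 swap the two halves and
    // rotate by (amount - 32), e.g. b[16] = rotl64(a[5], 36) appears
    // below as a swap followed by a rotate of 4.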
    b[0] = a[0];
    b[10] = (uint2)(a[1].x << 1 | a[1].y >> 31, a[1].y << 1 | a[1].x >> 31);
    b[7] = (uint2)(a[10].x << 3 | a[10].y >> 29, a[10].y << 3 | a[10].x >> 29);
    b[11] = (uint2)(a[7].x << 6 | a[7].y >> 26, a[7].y << 6 | a[7].x >> 26);
    b[17] = (uint2)(a[11].x << 10 | a[11].y >> 22, a[11].y << 10 | a[11].x >> 22);
    b[18] = (uint2)(a[17].x << 15 | a[17].y >> 17, a[17].y << 15 | a[17].x >> 17);
    b[3] = (uint2)(a[18].x << 21 | a[18].y >> 11, a[18].y << 21 | a[18].x >> 11);
    b[5] = (uint2)(a[3].x << 28 | a[3].y >> 4, a[3].y << 28 | a[3].x >> 4);
    b[16] = (uint2)(a[5].y << 4 | a[5].x >> 28, a[5].x << 4 | a[5].y >> 28);
    b[8] = (uint2)(a[16].y << 13 | a[16].x >> 19, a[16].x << 13 | a[16].y >> 19);
    b[21] = (uint2)(a[8].y << 23 | a[8].x >> 9, a[8].x << 23 | a[8].y >> 9);
    b[24] = (uint2)(a[21].x << 2 | a[21].y >> 30, a[21].y << 2 | a[21].x >> 30);
    b[4] = (uint2)(a[24].x << 14 | a[24].y >> 18, a[24].y << 14 | a[24].x >> 18);
    b[15] = (uint2)(a[4].x << 27 | a[4].y >> 5, a[4].y << 27 | a[4].x >> 5);
    b[23] = (uint2)(a[15].y << 9 | a[15].x >> 23, a[15].x << 9 | a[15].y >> 23);
    b[19] = (uint2)(a[23].y << 24 | a[23].x >> 8, a[23].x << 24 | a[23].y >> 8);
    b[13] = (uint2)(a[19].x << 8 | a[19].y >> 24, a[19].y << 8 | a[19].x >> 24);
    b[12] = (uint2)(a[13].x << 25 | a[13].y >> 7, a[13].y << 25 | a[13].x >> 7);
    b[2] = (uint2)(a[12].y << 11 | a[12].x >> 21, a[12].x << 11 | a[12].y >> 21);
    b[20] = (uint2)(a[2].y << 30 | a[2].x >> 2, a[2].x << 30 | a[2].y >> 2);
    b[14] = (uint2)(a[20].x << 18 | a[20].y >> 14, a[20].y << 18 | a[20].x >> 14);
    b[22] = (uint2)(a[14].y << 7 | a[14].x >> 25, a[14].x << 7 | a[14].y >> 25);
    b[9] = (uint2)(a[22].y << 29 | a[22].x >> 3, a[22].x << 29 | a[22].y >> 3);
    b[6] = (uint2)(a[9].x << 20 | a[9].y >> 12, a[9].y << 20 | a[9].x >> 12);
    b[1] = (uint2)(a[6].y << 12 | a[6].x >> 20, a[6].x << 12 | a[6].y >> 20);

    // Chi
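    // Chi computes a[x] = b[x] ^ (~b[x+1] & b[x+2]) per row.
    // bitselect(p, q, s) = (p & ~s) | (q & s), so
    // bitselect(b[x] ^ b[x+2], b[x], b[x+1]) is the same function and
    // typically maps to a single bitfield instruction on GCN hardware.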
    a[0] = bitselect(b[0] ^ b[2], b[0], b[1]);
    a[1] = bitselect(b[1] ^ b[3], b[1], b[2]);
    a[2] = bitselect(b[2] ^ b[4], b[2], b[3]);
    a[3] = bitselect(b[3] ^ b[0], b[3], b[4]);
    if (out_size >= 4)
    {
        a[4] = bitselect(b[4] ^ b[1], b[4], b[0]);
        a[5] = bitselect(b[5] ^ b[7], b[5], b[6]);
        a[6] = bitselect(b[6] ^ b[8], b[6], b[7]);
        a[7] = bitselect(b[7] ^ b[9], b[7], b[8]);
        a[8] = bitselect(b[8] ^ b[5], b[8], b[9]);
        if (out_size >= 8)
        {
            a[9] = bitselect(b[9] ^ b[6], b[9], b[5]);
            a[10] = bitselect(b[10] ^ b[12], b[10], b[11]);
            a[11] = bitselect(b[11] ^ b[13], b[11], b[12]);
            a[12] = bitselect(b[12] ^ b[14], b[12], b[13]);
            a[13] = bitselect(b[13] ^ b[10], b[13], b[14]);
            a[14] = bitselect(b[14] ^ b[11], b[14], b[10]);
            a[15] = bitselect(b[15] ^ b[17], b[15], b[16]);
            a[16] = bitselect(b[16] ^ b[18], b[16], b[17]);
            a[17] = bitselect(b[17] ^ b[19], b[17], b[18]);
            a[18] = bitselect(b[18] ^ b[15], b[18], b[19]);
            a[19] = bitselect(b[19] ^ b[16], b[19], b[15]);
            a[20] = bitselect(b[20] ^ b[22], b[20], b[21]);
            a[21] = bitselect(b[21] ^ b[23], b[21], b[22]);
            a[22] = bitselect(b[22] ^ b[24], b[22], b[23]);
            a[23] = bitselect(b[23] ^ b[20], b[23], b[24]);
            a[24] = bitselect(b[24] ^ b[21], b[24], b[20]);
        }
    }

    // Iota
    a[0] ^= Keccak_f1600_RC[r];

#if !__ENDIAN_LITTLE__
    for (uint i = 0; i != 25; ++i)
        a[i] = a[i].yx;
#endif
}
static void keccak_f1600_no_absorb(ulong* a, uint in_size, uint out_size, uint isolate)
{
    for (uint i = in_size; i != 25; ++i)
    {
        a[i] = 0;
    }
#if __ENDIAN_LITTLE__
    a[in_size] ^= 0x0000000000000001;
    a[24 - out_size * 2] ^= 0x8000000000000000;
#else
    a[in_size] ^= 0x0100000000000000;
    a[24 - out_size * 2] ^= 0x0000000000000080;
#endif
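    // The two xors above implement Keccak's 10*1 padding: the 0x01 bit
    // goes into the first word past the input and the 0x80 byte into the
    // last word of the rate block. With capacity = 2 * digest size, the
    // rate is (25 - out_size*2) words, so its final word is a[24 - out_size*2].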
    // Originally I unrolled the first and last rounds to interface
    // better with surrounding code, however I haven't found a way to do
    // that without the AMD compiler blowing up the VGPR usage.
    uint r = 0;
    do
    {
        // This dynamic branch stops the AMD compiler unrolling the loop
        // and additionally saves about 33% of the VGPRs, enough to gain
        // another wavefront. Ideally we'd get 4 in flight, but 3 is the
        // best I can massage out of the compiler. It doesn't really seem
        // to matter how much we try and help the compiler save VGPRs
        // because it seems to throw that information away, hence the
        // implementation of keccak here doesn't bother.
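        // (A host-side note, inferred from how isolate is used: the mix
        // loops below mask their trip counts with it, e.g.
        // (ACCESSES & isolate), so the host is expected to pass
        // isolate = ~0u; the value is opaque only to the compiler,
        // never to the hardware.)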
        if (isolate)
        {
            keccak_f1600_round((uint2*)a, r++, 25);
        }
    }
    while (r < 23);

    // final round optimised for digest size
    keccak_f1600_round((uint2*)a, r++, out_size);
}
#define copy(dst, src, count) for (uint i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
#define countof(x) (sizeof(x) / sizeof(x[0]))

static uint fnv(uint x, uint y)
{
    return x * FNV_PRIME ^ y;
}

static uint4 fnv4(uint4 x, uint4 y)
{
    return x * FNV_PRIME ^ y;
}

static uint fnv_reduce(uint4 v)
{
    return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
}
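// fnv is ethash's stripped-down FNV-1 round (multiply then xor, with no
// offset basis): e.g. fnv(0, y) == y and fnv(1, 0) == FNV_PRIME ==
// 0x01000193. fnv_reduce folds a uint4 down to one word by chaining
// three such rounds.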
typedef union
{
    ulong ulongs[32 / sizeof(ulong)];
    uint uints[32 / sizeof(uint)];
} hash32_t;

typedef union
{
    ulong ulongs[64 / sizeof(ulong)];
    uint4 uint4s[64 / sizeof(uint4)];
} hash64_t;

typedef union
{
    uint uints[128 / sizeof(uint)];
    uint4 uint4s[128 / sizeof(uint4)];
} hash128_t;
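// The unions above are 32-, 64- and 128-byte blobs viewed at whichever
// lane width is convenient: hash32_t holds a 256-bit digest, hash64_t a
// 512-bit digest, and hash128_t one full DAG item.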
static hash64_t init_hash(__constant hash32_t const* header, ulong nonce, uint isolate)
{
    hash64_t init;
    uint const init_size = countof(init.ulongs);
    uint const hash_size = countof(header->ulongs);

    // keccak_512(header .. nonce)
    ulong state[25];
    copy(state, header->ulongs, hash_size);
    state[hash_size] = nonce;
    keccak_f1600_no_absorb(state, hash_size + 1, init_size, isolate);

    copy(init.ulongs, state, init_size);
    return init;
}
static uint inner_loop_chunks(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, __global hash128_t const* g_dag1, __global hash128_t const* g_dag2, __global hash128_t const* g_dag3, uint isolate)
{
    uint4 mix = init;

    // share init0
    if (thread_id == 0)
        *share = mix.x;
    barrier(CLK_LOCAL_MEM_FENCE);
    uint init0 = *share;

    uint a = 0;
    do
    {
        bool update_share = thread_id == (a / 4) % THREADS_PER_HASH;

        #pragma unroll
        for (uint i = 0; i != 4; ++i)
        {
            if (update_share)
            {
                uint m[4] = { mix.x, mix.y, mix.z, mix.w };
                *share = fnv(init0 ^ (a + i), m[i]) % DAG_SIZE;
            }
            barrier(CLK_LOCAL_MEM_FENCE);

            mix = fnv4(mix,
                *share >= 3 * DAG_SIZE / 4 ? g_dag3[*share - 3 * DAG_SIZE / 4].uint4s[thread_id] :
                *share >= DAG_SIZE / 2     ? g_dag2[*share - DAG_SIZE / 2].uint4s[thread_id] :
                *share >= DAG_SIZE / 4     ? g_dag1[*share - DAG_SIZE / 4].uint4s[thread_id] :
                                             g_dag[*share].uint4s[thread_id]);
        }
    } while ((a += 4) != (ACCESSES & isolate));

    return fnv_reduce(mix);
}
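// The chunked variant reads the DAG out of four quarter-sized buffers,
// presumably to stay under per-allocation limits
// (CL_DEVICE_MAX_MEM_ALLOC_SIZE) on devices that cannot map the whole
// DAG as a single cl_mem object.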
static uint inner_loop(uint4 init, uint thread_id, __local uint* share, __global hash128_t const* g_dag, uint isolate)
{
    uint4 mix = init;

    // share init0
    if (thread_id == 0)
        *share = mix.x;
    barrier(CLK_LOCAL_MEM_FENCE);
    uint init0 = *share;

    uint a = 0;
    do
    {
        bool update_share = thread_id == (a / 4) % THREADS_PER_HASH;

        #pragma unroll
        for (uint i = 0; i != 4; ++i)
        {
            if (update_share)
            {
                uint m[4] = { mix.x, mix.y, mix.z, mix.w };
                *share = fnv(init0 ^ (a + i), m[i]) % DAG_SIZE;
            }
            barrier(CLK_LOCAL_MEM_FENCE);

            mix = fnv4(mix, g_dag[*share].uint4s[thread_id]);
        }
    }
    while ((a += 4) != (ACCESSES & isolate));

    return fnv_reduce(mix);
}
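// Access pattern recap: THREADS_PER_HASH (8) work items fetch one
// 128-byte DAG item together, 16 bytes (one uint4) each; the item index
// is produced by whichever thread currently owns the mix word and is
// broadcast to the others through LDS.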
static hash32_t final_hash(hash64_t const* init, hash32_t const* mix, uint isolate)
{
    ulong state[25];

    hash32_t hash;
    uint const hash_size = countof(hash.ulongs);
    uint const init_size = countof(init->ulongs);
    uint const mix_size = countof(mix->ulongs);

    // keccak_256(keccak_512(header..nonce) .. mix);
    copy(state, init->ulongs, init_size);
    copy(state + init_size, mix->ulongs, mix_size);
    keccak_f1600_no_absorb(state, init_size + mix_size, hash_size, isolate);

    // copy out
    copy(hash.ulongs, state, hash_size);
    return hash;
}
static hash32_t compute_hash_simple(
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    ulong nonce,
    uint isolate
    )
{
    hash64_t init = init_hash(g_header, nonce, isolate);

    hash128_t mix;
    for (uint i = 0; i != countof(mix.uint4s); ++i)
    {
        mix.uint4s[i] = init.uint4s[i % countof(init.uint4s)];
    }

    uint mix_val = mix.uints[0];
    uint init0 = mix.uints[0];
    uint a = 0;
    do
    {
        uint pi = fnv(init0 ^ a, mix_val) % DAG_SIZE;
        uint n = (a + 1) % countof(mix.uints);

        #pragma unroll
        for (uint i = 0; i != countof(mix.uints); ++i)
        {
            mix.uints[i] = fnv(mix.uints[i], g_dag[pi].uints[i]);
            mix_val = i == n ? mix.uints[i] : mix_val;
        }
    }
    while (++a != (ACCESSES & isolate));

    // reduce to output
    hash32_t fnv_mix;
    for (uint i = 0; i != countof(fnv_mix.uints); ++i)
    {
        fnv_mix.uints[i] = fnv_reduce(mix.uint4s[i]);
    }

    return final_hash(&init, &fnv_mix, isolate);
}
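// compute_hash_simple keeps the whole 128-byte mix in a single work item
// with no LDS traffic; it is slower than the cooperative path below and
// presumably serves as a straightforward reference implementation.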
typedef union
{
    struct
    {
        hash64_t init;
        uint pad; // avoid lds bank conflicts
    };
    hash32_t mix;
} compute_hash_share;
static hash32_t compute_hash(
    __local compute_hash_share* share,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    ulong nonce,
    uint isolate
    )
{
    uint const gid = get_global_id(0);

    // Compute one init hash per work item.
    hash64_t init = init_hash(g_header, nonce, isolate);

    // Threads work together in this phase in groups of 8.
    uint const thread_id = gid % THREADS_PER_HASH;
    uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;

    hash32_t mix;
    uint i = 0;
    do
    {
        // share init with other threads
        if (i == thread_id)
            share[hash_id].init = init;
        barrier(CLK_LOCAL_MEM_FENCE);

        uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
        barrier(CLK_LOCAL_MEM_FENCE);

        uint thread_mix = inner_loop(thread_init, thread_id, share[hash_id].mix.uints, g_dag, isolate);

        share[hash_id].mix.uints[thread_id] = thread_mix;
        barrier(CLK_LOCAL_MEM_FENCE);

        if (i == thread_id)
            mix = share[hash_id].mix;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    while (++i != (THREADS_PER_HASH & isolate));
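    // Each pass of the loop above completes one of the 8 hashes owned by
    // this group of threads; a work item keeps the final mix only on the
    // pass where i matches its own thread_id.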
    return final_hash(&init, &mix, isolate);
}
static hash32_t compute_hash_chunks(
    __local compute_hash_share* share,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    __global hash128_t const* g_dag1,
    __global hash128_t const* g_dag2,
    __global hash128_t const* g_dag3,
    ulong nonce,
    uint isolate
    )
{
    uint const gid = get_global_id(0);

    // Compute one init hash per work item.
    hash64_t init = init_hash(g_header, nonce, isolate);

    // Threads work together in this phase in groups of 8.
    uint const thread_id = gid % THREADS_PER_HASH;
    uint const hash_id = (gid % GROUP_SIZE) / THREADS_PER_HASH;

    hash32_t mix;
    uint i = 0;
    do
    {
        // share init with other threads
        if (i == thread_id)
            share[hash_id].init = init;
        barrier(CLK_LOCAL_MEM_FENCE);

        uint4 thread_init = share[hash_id].init.uint4s[thread_id % (64 / sizeof(uint4))];
        barrier(CLK_LOCAL_MEM_FENCE);

        uint thread_mix = inner_loop_chunks(thread_init, thread_id, share[hash_id].mix.uints, g_dag, g_dag1, g_dag2, g_dag3, isolate);

        share[hash_id].mix.uints[thread_id] = thread_mix;
        barrier(CLK_LOCAL_MEM_FENCE);

        if (i == thread_id)
            mix = share[hash_id].mix;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    while (++i != (THREADS_PER_HASH & isolate));

    return final_hash(&init, &mix, isolate);
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_hash_simple(
    __global hash32_t* g_hashes,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    ulong start_nonce,
    uint isolate
    )
{
    uint const gid = get_global_id(0);
    g_hashes[gid] = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_search_simple(
    __global volatile uint* restrict g_output,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    ulong start_nonce,
    ulong target,
    uint isolate
    )
{
    uint const gid = get_global_id(0);
    hash32_t hash = compute_hash_simple(g_header, g_dag, start_nonce + gid, isolate);

    if (hash.ulongs[countof(hash.ulongs) - 1] < target)
    {
        uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
        g_output[slot] = gid;
    }
}
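// Note: unlike ethash_search below, which byte-swaps hash.ulongs[0]
// before comparing, the simple variant tests the last ulong of the hash
// directly, so the two kernels do not apply the same target criterion.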
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_hash(
    __global hash32_t* g_hashes,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    ulong start_nonce,
    uint isolate
    )
{
    __local compute_hash_share share[HASHES_PER_LOOP];

    uint const gid = get_global_id(0);
    g_hashes[gid] = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_search(
    __global volatile uint* restrict g_output,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    ulong start_nonce,
    ulong target,
    uint isolate
    )
{
    __local compute_hash_share share[HASHES_PER_LOOP];

    uint const gid = get_global_id(0);
    hash32_t hash = compute_hash(share, g_header, g_dag, start_nonce + gid, isolate);

    if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
    {
        uint slot = min(MAX_OUTPUTS, atomic_inc(&g_output[0]) + 1);
        g_output[slot] = gid;
    }
}
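// Output convention: g_output[0] is an atomic hit counter and slots
// 1..MAX_OUTPUTS receive the winning gids; once the counter passes
// MAX_OUTPUTS, further hits clamp to the last slot and overwrite it.
// The as_uchar8(...).s76543210 shuffle reverses the first 8 bytes of the
// hash so its most significant (big-endian) end can be compared to the
// target as a plain ulong.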
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_hash_chunks(
    __global hash32_t* g_hashes,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    __global hash128_t const* g_dag1,
    __global hash128_t const* g_dag2,
    __global hash128_t const* g_dag3,
    ulong start_nonce,
    uint isolate
    )
{
    __local compute_hash_share share[HASHES_PER_LOOP];

    uint const gid = get_global_id(0);
    g_hashes[gid] = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);
}
__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
__kernel void ethash_search_chunks(
    __global volatile uint* restrict g_output,
    __constant hash32_t const* g_header,
    __global hash128_t const* g_dag,
    __global hash128_t const* g_dag1,
    __global hash128_t const* g_dag2,
    __global hash128_t const* g_dag3,
    ulong start_nonce,
    ulong target,
    uint isolate
    )
{
    __local compute_hash_share share[HASHES_PER_LOOP];

    uint const gid = get_global_id(0);
    hash32_t hash = compute_hash_chunks(share, g_header, g_dag, g_dag1, g_dag2, g_dag3, start_nonce + gid, isolate);

    if (as_ulong(as_uchar8(hash.ulongs[0]).s76543210) < target)
    {
        uint slot = min(convert_uint(MAX_OUTPUTS), convert_uint(atomic_inc(&g_output[0]) + 1));
        g_output[slot] = gid;
    }
}