main.cu 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791
  #include <chrono>
  #include <cinttypes>
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  #include <iostream>
  #include <cuda.h>
  5. #define THREAD_COUNT 128
  6. #define TASK_WORK (1ULL << 30)
  7. #define FAST_NEXT_INT
  8. #ifdef BOINC
  9. #include "boinc_api.h"
  10. #if defined _WIN32 || defined _WIN64
  11. #include "boinc_win.h"
  12. #endif
  13. #endif
  14. #ifndef BOINC
  15. #define boinc_fopen(file, mode) fopen(file, mode)
  16. #define boinc_delete_file(file) remove(file)
  17. #define boinc_begin_critical_section()
  18. #define boinc_end_critical_section()
  19. #define boinc_fraction_done(frac)
  20. #define boinc_finish(s) exit(s)
  21. #define boinc_time_to_checkpoint() true
  22. #define boinc_checkpoint_completed()
  23. #endif
  /**
   * 48-bit linear congruential generator (multiplier 25214903917, addend 11,
   * 48-bit mask -- the same constants java.util.Random uses).
   *
   * Every function takes the generator state by reference and advances it, so
   * the ORDER in which draws happen is part of the contract.
   */
  namespace Random {
  #define RANDOM__MULTIPLIER 25214903917ULL
  #define RANDOM__MULTIPLIER_INVERSE 246154705703781ULL
  #define RANDOM__ADDEND 11ULL
  #define RANDOM__ADDEND_INVERSE 107048004364969ULL
  #define RANDOM__MASK ((1ULL << 48) - 1)

      /** Scrambles a user seed into an initial 48-bit generator state. */
      __device__ uint64_t setSeed(uint64_t seed) {
          return (seed ^ RANDOM__MULTIPLIER) & RANDOM__MASK;
      }

      /** Steps the generator once and returns the top `bits` bits of the new state. */
      __device__ int32_t next(uint64_t &seed, int bits) {
          seed = (seed * RANDOM__MULTIPLIER + RANDOM__ADDEND) & RANDOM__MASK;
          return (int32_t)(seed >> (48 - bits));
      }

      /** One step; returns a full 32-bit signed value. */
      __device__ int32_t nextInt(uint64_t &seed) {
          return next(seed, 32);
      }

      /**
       * Returns a value in [0, bound).
       *
       * Power-of-two bounds use the multiply/shift path. Other bounds use
       * `bits % bound`; the rejection loop that removes modulo bias is compiled
       * in only when FAST_NEXT_INT is NOT defined.
       */
      __device__ int32_t nextInt(uint64_t &seed, int bound) {
          if ((bound & -bound) == bound) {
              seed = (seed * RANDOM__MULTIPLIER + RANDOM__ADDEND) & RANDOM__MASK;
              return (int32_t)((bound * (seed >> 17)) >> 31);
          }
          int32_t bits, value;
  #ifndef FAST_NEXT_INT
          do {
  #endif
              seed = (seed * RANDOM__MULTIPLIER + RANDOM__ADDEND) & RANDOM__MASK;
              bits = seed >> 17;
              value = bits % bound;
  #ifndef FAST_NEXT_INT
              // The test relies on 32-bit wrap-around. Do the sum in unsigned
              // arithmetic (well defined) and only then reinterpret the sign bit;
              // overflowing a signed int here would be undefined behaviour.
          } while ((int32_t)((uint32_t)(bits - value) + (uint32_t)(bound - 1)) < 0);
  #endif
          return value;
      }

      /**
       * Two 32-bit draws combined into 64 bits: first draw is the high word,
       * second draw is added sign-extended.
       */
      __device__ uint64_t nextLong(uint64_t &seed) {
          // Draw into named locals: both calls mutate `seed`, and the operands of
          // a single `+` expression have no guaranteed evaluation order.
          const uint64_t high = (uint64_t)next(seed, 32) << 32;
          const int32_t low = next(seed, 32);
          return high + low;
      }

      /** One 24-bit draw scaled into [0, 1). */
      __device__ float nextFloat(uint64_t &seed) {
          return next(seed, 24) / ((float)(1 << 24));
      }

      /** A 26-bit draw followed by a 27-bit draw, scaled into [0, 1). */
      __device__ double nextDouble(uint64_t &seed) {
          // Sequenced explicitly: the two draws use different bit counts, so
          // swapping their order would change the result.
          const uint64_t high = (uint64_t)next(seed, 26) << 27;
          const int32_t low = next(seed, 27);
          return (high + low) / (double)(1ULL << 53);
      }

      /**
       * Advances the state by `n` generator steps at once by composing the
       * affine step n times into a single multiplier/addend pair.
       */
      template <int n>
      __device__ constexpr void advance(uint64_t &seed) {
          uint64_t m = 1;
          uint64_t a = 0;
          for (int i = 0; i < n; i++) {
              a = (a * RANDOM__MULTIPLIER + RANDOM__ADDEND) & RANDOM__MASK;
              m = (m * RANDOM__MULTIPLIER) & RANDOM__MASK;
          }
          seed = (seed * m + a) & RANDOM__MASK;
      }
  }
  // Per-block scratch space, 256 bytes per thread. Element n of thread t lives
  // at index (n << 7) | t, i.e. the per-thread data is interleaved with a stride
  // of 128 entries.
  //
  // The "<< 7" hard-codes a stride of 128, so the layout is only valid for
  // THREAD_COUNT == 128; fail the build instead of silently overlapping columns.
  static_assert(THREAD_COUNT == 128, "shared-memory access macros assume THREAD_COUNT == 128 (stride 1 << 7)");
  // The buffer is also read and written as double through
  // CASTED_SHARED_MEMORY_ACCESS, so it must be 8-byte aligned; a plain uint8_t
  // array carries no such guarantee.
  __shared__ __align__(8) uint8_t sharedMemory[256 * THREAD_COUNT];
  // Byte view. The argument is parenthesized so that expressions containing
  // operators of lower precedence than << index the intended slot.
  #define SHARED_MEMORY_ACCESS(n) sharedMemory[((n) << 7) | threadIdx.x]
  // Same buffer viewed as doubles (slot n of the calling thread).
  #define CASTED_SHARED_MEMORY_ACCESS(n) ((double*)sharedMemory)[((n) << 7) | threadIdx.x]
  80. namespace Terrain {
  // State of a single noise octave: three coordinate offsets plus a 256-entry
  // permutation table. Filled in by initializeOctave().
  struct OctaveData {
      double xOffset;              // added to the scaled x coordinate
      double yOffset;              // added to the scaled y coordinate
      double zOffset;              // added to the scaled z coordinate
      uint8_t permutations[256];   // lookup table, indexed modulo 256
  };

  // All octave sets a thread needs. The members are initialised in declaration
  // order by initializeNoise(), which also skips generator output between
  // noise3 and noise6.
  struct NoiseData {
      OctaveData noise1[16];
      OctaveData noise2[16];
      OctaveData noise3[8];
      OctaveData noise6[16];
  };
  // Fills one octave from the generator state `random` (advanced in place):
  // three offsets in [0, 256) drawn in x, y, z order, then a shuffled
  // permutation of 0..255. The working copy of the table lives in the calling
  // thread's shared-memory byte slots 0..255.
  __device__ void initializeOctave(uint64_t &random, OctaveData *octaveData) {
      octaveData->xOffset = Random::nextDouble(random) * 256.0;
      octaveData->yOffset = Random::nextDouble(random) * 256.0;
      octaveData->zOffset = Random::nextDouble(random) * 256.0;

      // Start from the identity table.
      int slot = 0;
      while (slot < 256) {
          SHARED_MEMORY_ACCESS(slot) = slot;
          ++slot;
      }

      // Shuffle: entry `dst` of the result receives the working-table value at a
      // randomly chosen position >= dst, and that position is refilled with the
      // value previously held at `dst`.
      for (int dst = 0; dst < 256; ++dst) {
          const uint8_t src = Random::nextInt(random, 256 - dst) + dst;
          const uint8_t displaced = SHARED_MEMORY_ACCESS(dst);
          octaveData->permutations[dst] = SHARED_MEMORY_ACCESS(src);
          SHARED_MEMORY_ACCESS(src) = displaced;
      }
  }
  // Seeds a generator from `worldSeed` and initialises every octave of
  // `noiseData` in the order noise1, noise2, noise3, <skip>, noise6.
  // The skip discards the generator output that 14 further octaves would have
  // consumed (262 steps each: 7 + 255), without storing anything.
  __device__ void initializeNoise(uint64_t worldSeed, NoiseData* noiseData) {
      uint64_t rng = Random::setSeed(worldSeed);

      for (OctaveData *octave = noiseData->noise1; octave != noiseData->noise1 + 16; ++octave) {
          initializeOctave(rng, octave);
      }
      for (OctaveData *octave = noiseData->noise2; octave != noiseData->noise2 + 16; ++octave) {
          initializeOctave(rng, octave);
      }
      for (OctaveData *octave = noiseData->noise3; octave != noiseData->noise3 + 8; ++octave) {
          initializeOctave(rng, octave);
      }

  #ifndef FAST_NEXT_INT
      // Without FAST_NEXT_INT a bounded draw may take a data-dependent number of
      // steps, so the shuffle draws have to be replayed one by one.
      for (int skipped = 0; skipped < 14; ++skipped) {
          Random::advance<7>(rng);
          for (int bound = 255; bound >= 1; --bound) {
              Random::nextInt(rng, bound);
          }
      }
  #else
      // Every draw is exactly one step, so the whole gap is one jump: 14 * 262.
      Random::advance<3668>(rng);
  #endif

      for (OctaveData *octave = noiseData->noise6; octave != noiseData->noise6 + 16; ++octave) {
          initializeOctave(rng, octave);
      }
  }
  // Linear interpolation: returns a for t == 0 and b for t == 1.
  __device__ double lerp(double t, double a, double b) {
      const double span = b - a;
      return a + t * span;
  }
  // Two-dimensional gradient selector: picks one of the combinations of +-x and
  // +-z from the low four bits of `i`.
  //
  // For every hash this returns the same value as grad(i, x, 0.0, z), which is
  // what optimizedNoise2D uses for the other three corners of the cell.
  // Fix: hash 0x9 previously returned -z, whereas grad(0x9, x, 0.0, z) is
  // -0.0 + z == z; it was the only one of the 16 cases that disagreed.
  __device__ double func_4110_a(int i, double x, double z) {
      switch (i & 0xF) {
          case 0x0:
          case 0x2:
          case 0xC:
              return x;
          case 0x1:
          case 0x3:
          case 0xE:
              return -x;
          case 0x4:
              return x + z;
          case 0x5:
              return -x + z;
          case 0x6:
              return x - z;
          case 0x7:
              return -x - z;
          case 0x8:
          case 0x9:
          case 0xD:
              return z;
          case 0xA:
          case 0xB:
          case 0xF:
              return -z;
          default:
              // Unreachable: the switch value is masked to 0..15.
              return 0;
      }
  }
  // Three-dimensional gradient selector. The low four bits of `i` choose two of
  // the three coordinates and a sign for each; the result is their signed sum.
  //   first term : x for hashes 0..7, y for hashes 8..15   (negated if bit 0 set)
  //   second term: y for hashes 0..3, x for hashes 12 and 14, z otherwise
  //                                                        (negated if bit 1 set)
  __device__ double grad(int i, double x, double y, double z) {
      const int hash = i & 0xF;
      const double first = (hash < 8) ? x : y;
      double second;
      if (hash < 4) {
          second = y;
      } else if (hash == 0xC || hash == 0xE) {
          second = x;
      } else {
          second = z;
      }
      const double signedFirst = (hash & 1) ? -first : first;
      const double signedSecond = (hash & 2) ? -second : second;
      return signedFirst + signedSecond;
  }
  // Looks up a permutation entry; the index wraps modulo 256 (only the low
  // eight bits of `n` are used, so negative values are accepted).
  __device__ uint8_t getPermutation(const uint8_t* __restrict__ permutations, int n) {
      const int wrapped = n & 0xFF;
      return permutations[wrapped];
  }
  // Evaluates `numOctaves` octaves of 2D noise at a single (x, z) sample and
  // returns their weighted sum.
  //
  // Octave k uses frequency 1 / 2^k and weight 2^k. The sample position for an
  // octave is base * scale * frequency + iteration * scale * frequency + the
  // octave's stored offset. The first cell corner goes through func_4110_a; the
  // other three use grad() with y fixed to 0.0.
  __device__ double optimizedNoise2D(const OctaveData* __restrict__ octaveDatas, double baseX, double baseZ, int xIteration, int zIteration, double noiseScaleX, double noiseScaleZ, int numOctaves) {
      double total = 0;
      double frequency = 1.0;
      for (int octave = 0; octave < numOctaves; octave++) {
          const uint8_t *perm = octaveDatas[octave].permutations;

          double stepX = noiseScaleX * frequency;
          double stepZ = noiseScaleZ * frequency;
          double originX = (double)baseX * frequency * noiseScaleX;
          double originZ = (double)baseZ * frequency * noiseScaleZ;
          double weight = 1.0 / frequency;

          // X axis: floor, wrapped cell index, fractional part and fade weight.
          double fracX = originX + (double)xIteration * stepX + octaveDatas[octave].xOffset;
          int cellX = (int)fracX;
          if (fracX < (double)cellX) {
              cellX--;   // the cast truncates toward zero; correct it to a floor
          }
          int wrappedX = cellX & 0xFF;
          fracX -= cellX;
          double fadeX = fracX * fracX * fracX * (fracX * (fracX * 6.0 - 15.0) + 10.0);

          // Z axis, same treatment.
          double fracZ = originZ + (double)zIteration * stepZ + octaveDatas[octave].zOffset;
          int cellZ = (int)fracZ;
          if (fracZ < (double)cellZ) {
              cellZ--;
          }
          int wrappedZ = cellZ & 0xFF;
          fracZ -= cellZ;
          double fadeZ = fracZ * fracZ * fracZ * (fracZ * (fracZ * 6.0 - 15.0) + 10.0);

          // Hash the four cell corners through the permutation table.
          int hashX0 = getPermutation(perm, wrappedX) + 0;
          int hashX0Z = getPermutation(perm, hashX0) + wrappedZ;
          int hashX1 = getPermutation(perm, wrappedX + 1) + 0;
          int hashX1Z = getPermutation(perm, hashX1) + wrappedZ;

          double nearEdge = lerp(fadeX, func_4110_a(getPermutation(perm, hashX0Z), fracX, fracZ), grad(getPermutation(perm, hashX1Z), fracX - 1.0, 0.0, fracZ));
          double farEdge = lerp(fadeX, grad(getPermutation(perm, hashX0Z + 1), fracX, 0.0, fracZ - 1.0), grad(getPermutation(perm, hashX1Z + 1), fracX - 1.0, 0.0, fracZ - 1.0));
          double sample = lerp(fadeZ, nearEdge, farEdge);

          total += sample * weight;
          frequency /= 2.0;
      }
      return total;
  }
  // Accumulates `numOctaves` octaves of 3D noise for one (x, z) column into the
  // calling thread's shared-memory double slots.
  //
  // For each octave the column is walked from y iteration 0 up to
  // yIterationStart + yIterations - 1, but only iterations >= yIterationStart
  // are added to slot (yIteration - yIterationStart + sharedMemoryOffset). The
  // walk starts at 0 because the four X-interpolated edge values are refreshed
  // only on the first iteration or when the wrapped Y cell index changes; in
  // between, values computed on an earlier iteration are reused as-is.
  // The target slots are added to, not overwritten; the caller zeroes them.
  __device__ void optimizedNoise3D(const OctaveData* __restrict__ octaveDatas, int sharedMemoryOffset, double baseX, double baseY, double baseZ, int xIteration, int zIteration, double noiseScaleX, double noiseScaleY, double noiseScaleZ, int numOctaves, int yIterationStart, int yIterations) {
      double frequency = 1.0;
      for (int octave = 0; octave < numOctaves; octave++) {
          const uint8_t *perm = octaveDatas[octave].permutations;

          double stepX = noiseScaleX * frequency;
          double stepY = noiseScaleY * frequency;
          double stepZ = noiseScaleZ * frequency;
          double originX = (double)baseX * frequency * noiseScaleX;
          double originY = (double)baseY * frequency * noiseScaleY;
          double originZ = (double)baseZ * frequency * noiseScaleZ;

          // Cached edge values and the Y cell they were computed for.
          int cachedCellY = -1;
          double edgeLowNear = 0.0;
          double edgeHighNear = 0.0;
          double edgeLowFar = 0.0;
          double edgeHighFar = 0.0;
          double weight = 1.0 / frequency;

          // X axis: floor, wrapped cell index, fractional part and fade weight.
          double fracX = originX + (double)xIteration * stepX + octaveDatas[octave].xOffset;
          int cellX = (int)fracX;
          if (fracX < (double)cellX) {
              cellX--;   // the cast truncates toward zero; correct it to a floor
          }
          int wrappedX = cellX & 0xFF;
          fracX -= cellX;
          double fadeX = fracX * fracX * fracX * (fracX * (fracX * 6.0 - 15.0) + 10.0);

          // Z axis, same treatment.
          double fracZ = originZ + (double)zIteration * stepZ + octaveDatas[octave].zOffset;
          int cellZ = (int)fracZ;
          if (fracZ < (double)cellZ) {
              cellZ--;
          }
          int wrappedZ = cellZ & 0xFF;
          fracZ -= cellZ;
          double fadeZ = fracZ * fracZ * fracZ * (fracZ * (fracZ * 6.0 - 15.0) + 10.0);

          for (int yIteration = 0; yIteration < yIterationStart + yIterations; yIteration++) {
              double fracY = originY + (double)yIteration * stepY + octaveDatas[octave].yOffset;
              int cellY = (int)fracY;
              if (fracY < (double)cellY) {
                  cellY--;
              }
              int wrappedY = cellY & 0xFF;
              fracY -= cellY;
              double fadeY = fracY * fracY * fracY * (fracY * (fracY * 6.0 - 15.0) + 10.0);

              if (yIteration == 0 || wrappedY != cachedCellY) {
                  cachedCellY = wrappedY;
                  int hashX0Y = getPermutation(perm, wrappedX) + wrappedY;
                  int cornerX0Y0 = getPermutation(perm, hashX0Y) + wrappedZ;
                  int cornerX0Y1 = getPermutation(perm, hashX0Y + 1) + wrappedZ;
                  int hashX1Y = getPermutation(perm, wrappedX + 1) + wrappedY;
                  int cornerX1Y0 = getPermutation(perm, hashX1Y) + wrappedZ;
                  int cornerX1Y1 = getPermutation(perm, hashX1Y + 1) + wrappedZ;
                  edgeLowNear = lerp(fadeX, grad(getPermutation(perm, cornerX0Y0), fracX, fracY, fracZ), grad(getPermutation(perm, cornerX1Y0), fracX - 1.0, fracY, fracZ));
                  edgeHighNear = lerp(fadeX, grad(getPermutation(perm, cornerX0Y1), fracX, fracY - 1.0, fracZ), grad(getPermutation(perm, cornerX1Y1), fracX - 1.0, fracY - 1.0, fracZ));
                  edgeLowFar = lerp(fadeX, grad(getPermutation(perm, cornerX0Y0 + 1), fracX, fracY, fracZ - 1.0), grad(getPermutation(perm, cornerX1Y0 + 1), fracX - 1.0, fracY, fracZ - 1.0));
                  edgeHighFar = lerp(fadeX, grad(getPermutation(perm, cornerX0Y1 + 1), fracX, fracY - 1.0, fracZ - 1.0), grad(getPermutation(perm, cornerX1Y1 + 1), fracX - 1.0, fracY - 1.0, fracZ - 1.0));
              }

              double nearFace = lerp(fadeY, edgeLowNear, edgeHighNear);
              double farFace = lerp(fadeY, edgeLowFar, edgeHighFar);
              double sample = lerp(fadeZ, nearFace, farFace);
              if (yIteration >= yIterationStart) {
                  CASTED_SHARED_MEMORY_ACCESS(yIteration - yIterationStart + sharedMemoryOffset) += sample * weight;
              }
          }
          frequency /= 2.0;
      }
  }
  // Combines three accumulated noise columns and one 2D noise value into final
  // per-row values.
  //
  // For row r (0 <= r < yAreas, vertical index yAreaStart + r) it reads double
  // slot r of the noise1 / noise2 / noise3 columns, selects or blends
  // noise1 / noise2 according to noise3, subtracts a term that depends on the
  // vertical index and on `noise6`, applies a pull toward -10 for vertical
  // indices above 13, and writes the result to slot r of the output column.
  __device__ void mixNoiseValues(int sharedMemoryOutputOffset, int sharedMemoryNoise1Offset, int sharedMemoryNoise2Offset, int sharedMemoryNoise3Offset, double noise6, int yAreaStart, int yAreas) {
      float divisor = 0.37000000476837158203125f;
      float baseShift = -0.07500000298023223876953125;

      // Remap noise6 to a small offset: fold negatives, rescale, clamp, shrink.
      double noise6Term = noise6 / 8000.0;
      if (noise6Term < 0.0) {
          noise6Term = -noise6Term * 0.29999999999999999;
      }
      noise6Term = noise6Term * 3.0 - 2.0;
      if (noise6Term < 0.0) {
          noise6Term /= 2.0;
          if (noise6Term < -1.0) {
              noise6Term = -1.0;
          }
          noise6Term /= 1.3999999999999999;
          noise6Term /= 2.0;
      } else {
          if (noise6Term > 1.0) {
              noise6Term = 1.0;
          }
          noise6Term /= 8.0;
      }

      for (int level = yAreaStart, row = 0; level < (yAreaStart + yAreas); level++, row++) {
          double shift = baseShift;
          double scale = divisor;
          shift += noise6Term * 0.20000000000000001;
          shift = (shift * (double)17) / 16.0;
          double midLevel = (double)17 / 2.0 + shift * 4.0;
          double mixed = 0.0;
          double falloff = (((double)level - midLevel) * 12.0 * 128.0) / (double)(1 << 7) / scale;
          if (falloff < 0.0) {
              falloff *= 4.0;
          }

          double lowValue = CASTED_SHARED_MEMORY_ACCESS(row + sharedMemoryNoise1Offset) / 512.0;
          double highValue = CASTED_SHARED_MEMORY_ACCESS(row + sharedMemoryNoise2Offset) / 512.0;
          double selector = (CASTED_SHARED_MEMORY_ACCESS(row + sharedMemoryNoise3Offset) / 10.0 + 1.0) / 2.0;
          if (selector < 0.0) {
              mixed = lowValue;
          } else if (selector > 1.0) {
              mixed = highValue;
          } else {
              mixed = lowValue + (highValue - lowValue) * selector;
          }
          mixed -= falloff;

          if (level > 17 - 4) {
              // Computed in float and then widened, as in the original.
              double pull = (float)(level - (17 - 4)) / 3.0f;
              mixed = mixed * (1.0 - pull) + -10.0 * pull;
          }
          CASTED_SHARED_MEMORY_ACCESS(row + sharedMemoryOutputOffset) = mixed;
      }
  }
  // Produces `yAreas` final values for the column (xArea, zArea) relative to
  // the base position (x, y, z) and stores them in the calling thread's double
  // slots sharedMemoryWriteOffset .. sharedMemoryWriteOffset + yAreas - 1.
  //
  // Double slots 0 .. 3 * yAreas - 1 are used as scratch accumulators (three
  // columns of yAreas entries each), so the write offset has to lie beyond them
  // for the scratch not to alias the result.
  __device__ void optimizedNoise(const NoiseData* __restrict__ noiseData, int sharedMemoryWriteOffset, int32_t x, int32_t y, int32_t z, int xArea, int zArea, int yAreaStart, int yAreas) {
      const int column1 = 0;
      const int column2 = yAreas;
      const int column3 = yAreas + yAreas;

      double noise6Value = optimizedNoise2D(noiseData->noise6, (double)x, (double)z, xArea, zArea, 200.0, 200.0, 16);

      // optimizedNoise3D adds into its target slots, so clear them first.
      for (int row = 0; row < yAreas; row++) {
          CASTED_SHARED_MEMORY_ACCESS(row) = 0.0;
          CASTED_SHARED_MEMORY_ACCESS(row + yAreas) = 0.0;
          CASTED_SHARED_MEMORY_ACCESS(row + yAreas + yAreas) = 0.0;
      }

      optimizedNoise3D(noiseData->noise1, column1, (double)x, (double)y, (double)z, xArea, zArea, 684.41200000000003, 684.41200000000003, 684.41200000000003, 16, yAreaStart, yAreas);
      optimizedNoise3D(noiseData->noise2, column2, (double)x, (double)y, (double)z, xArea, zArea, 684.41200000000003, 684.41200000000003, 684.41200000000003, 16, yAreaStart, yAreas);
      optimizedNoise3D(noiseData->noise3, column3, (double)x, (double)y, (double)z, xArea, zArea, 8.5551500000000011, 4.2775750000000006, 8.5551500000000011, 8, yAreaStart, yAreas);

      mixNoiseValues(sharedMemoryWriteOffset, column1, column2, column3, noise6Value, yAreaStart, yAreas);
  }
  // Interpolates between two sets of four corner values over 8 vertical steps
  // and a 4 x 4 horizontal grid. For every grid cell whose interpolated value is
  // positive at step y, byte slot (x * 4 + z + sharedMemoryOffset) of the
  // calling thread is set to baseHeight + y. Later steps overwrite earlier
  // ones, so each slot ends up holding the highest positive step; slots that
  // are never positive keep their previous contents.
  __device__ void optimizedPointLerp(int sharedMemoryOffset, double bottomRight, double bottomLeft, double topRight, double topLeft, double bottomRight2, double bottomLeft2, double topRight2, double topLeft2, uint8_t baseHeight) {
      // Per-step change of each corner between the first and the second set.
      double bottomRightStep = (bottomRight2 - bottomRight) * 0.125;
      double bottomLeftStep = (bottomLeft2 - bottomLeft) * 0.125;
      double topRightStep = (topRight2 - topRight) * 0.125;
      double topLeftStep = (topLeft2 - topLeft) * 0.125;

      for (int layer = 0; layer < 8; layer++) {
          double nearValue = bottomRight;
          double farValue = topRight;
          double nearStep = (bottomLeft - bottomRight) * 0.25;
          double farStep = (topLeft - topRight) * 0.25;

          for (int column = 0; column < 4; column++) {
              double value = nearValue;
              double zStep = (farValue - nearValue) * 0.25;
              value -= zStep;   // pre-decrement so that the first cell sees nearValue
              for (int cell = 0; cell < 4; cell++) {
                  value += zStep;
                  if (value > 0.0) {
                      SHARED_MEMORY_ACCESS(column * 4 + cell + sharedMemoryOffset) = baseHeight + layer;
                  }
              }
              nearValue += nearStep;
              farValue += farStep;
          }

          bottomRight += bottomRightStep;
          bottomLeft += bottomLeftStep;
          topRight += topRightStep;
          topLeft += topLeftStep;
      }
  }
  // Walks from `a` toward `b` in 8 equal steps (b itself is not visited) and
  // returns baseHeight + index of the last step whose value is positive, or 0
  // if no step is positive.
  __device__ uint8_t optimizedMod4Lerp(double a, double b, uint8_t baseHeight) {
      const double step = (b - a) * 0.125;
      uint8_t highest = 0;
      int level = 0;
      while (level < 8) {
          if (a > 0) {
              highest = baseHeight + level;
          }
          a += step;
          ++level;
      }
      return highest;
  }
  415. }
  // Returns true when the terrain generated from `worldSeed` matches a fixed
  // set of expected heights around base position x = -22 * 4, z = 2 * 4.
  //
  // Five noise columns are evaluated one at a time, each followed by a cheap
  // height check so that most seeds are rejected early. The surviving column
  // values stay in double slots 9..19 and feed a final 4 x 4 height comparison,
  // whose results are held in byte slots 0..15.
  __device__ bool checkTerrain(uint64_t worldSeed) {
      Terrain::NoiseData noise;
      Terrain::initializeNoise(worldSeed, &noise);

      const int32_t baseX = -22 * 4;
      const int32_t baseZ = 2 * 4;

      // Column (0, 2): rows 8..9 -> double slots 9, 10.
      Terrain::optimizedNoise(&noise, 9, baseX, 0, baseZ, 0, 2, 8, 2);
      if (Terrain::optimizedMod4Lerp(CASTED_SHARED_MEMORY_ACCESS(9), CASTED_SHARED_MEMORY_ACCESS(10), 64) != 65) {
          return false;
      }

      // Column (1, 2): rows 8..9 -> double slots 11, 12.
      Terrain::optimizedNoise(&noise, 11, baseX, 0, baseZ, 1, 2, 8, 2);
      if (Terrain::optimizedMod4Lerp(CASTED_SHARED_MEMORY_ACCESS(11), CASTED_SHARED_MEMORY_ACCESS(12), 64) != 67) {
          return false;
      }

      // Column (0, 3): rows 8..9 -> double slots 13, 14.
      Terrain::optimizedNoise(&noise, 13, baseX, 0, baseZ, 0, 3, 8, 2);
      if (Terrain::optimizedMod4Lerp(CASTED_SHARED_MEMORY_ACCESS(13), CASTED_SHARED_MEMORY_ACCESS(14), 64) != 67) {
          return false;
      }

      // Column (1, 3): rows 7..9 -> double slots 15, 16, 17.
      Terrain::optimizedNoise(&noise, 15, baseX, 0, baseZ, 1, 3, 7, 3);
      if (CASTED_SHARED_MEMORY_ACCESS(16) > 0) {
          return false;
      }
      if (Terrain::optimizedMod4Lerp(CASTED_SHARED_MEMORY_ACCESS(15), CASTED_SHARED_MEMORY_ACCESS(16), 56) != 63) {
          return false;
      }

      // Column (2, 3): rows 7..8 -> double slots 18, 19.
      Terrain::optimizedNoise(&noise, 18, baseX, 0, baseZ, 2, 3, 7, 2);
      if (CASTED_SHARED_MEMORY_ACCESS(19) > 0) {
          return false;
      }
      if (Terrain::optimizedMod4Lerp(CASTED_SHARED_MEMORY_ACCESS(18), CASTED_SHARED_MEMORY_ACCESS(19), 56) != 63) {
          return false;
      }

      // Clear the 16 result bytes, then interpolate rows 8 -> 9 of the four
      // corner columns over the 4 x 4 grid.
      int heights = 0;
      for (int cell = 0; cell < 16; cell++) {
          SHARED_MEMORY_ACCESS(heights + cell) = 0;
      }
      Terrain::optimizedPointLerp(heights, CASTED_SHARED_MEMORY_ACCESS(9), CASTED_SHARED_MEMORY_ACCESS(11), CASTED_SHARED_MEMORY_ACCESS(13), CASTED_SHARED_MEMORY_ACCESS(16), CASTED_SHARED_MEMORY_ACCESS(10), CASTED_SHARED_MEMORY_ACCESS(12), CASTED_SHARED_MEMORY_ACCESS(14), CASTED_SHARED_MEMORY_ACCESS(17), 64);

      // Cells 0, 1, 5, 9, 10 and 11 are not constrained.
      return SHARED_MEMORY_ACCESS(heights + 2) == 66
          && SHARED_MEMORY_ACCESS(heights + 3) == 67
          && SHARED_MEMORY_ACCESS(heights + 4) == 65
          && SHARED_MEMORY_ACCESS(heights + 6) == 66
          && SHARED_MEMORY_ACCESS(heights + 7) == 66
          && SHARED_MEMORY_ACCESS(heights + 8) == 65
          && SHARED_MEMORY_ACCESS(heights + 12) == 66
          && SHARED_MEMORY_ACCESS(heights + 13) == 65
          && SHARED_MEMORY_ACCESS(heights + 14) == 64
          && SHARED_MEMORY_ACCESS(heights + 15) == 64;
  }
  // Number of matching seeds reported since the host last reset it to 0.
  // It keeps counting even when outputBuffer is full, so a value larger than
  // the buffer capacity tells the host that results were dropped.
  __device__ __managed__ uint32_t outputCounter = 0;
  // Matching seeds, in no particular order; entries [0, min(outputCounter,
  // capacity)) are valid after a launch has completed.
  __device__ __managed__ uint64_t outputBuffer[100000];

  // Tests one world seed per thread.
  //
  // Launch layout: 1D grid of 1D blocks with blockDim.x == THREAD_COUNT (the
  // shared-memory access macros index by threadIdx.x with a stride of 128).
  // Thread g of the grid tests seed `seedOffset + g`; every thread does work,
  // there is no tail guard.
  // Shared memory: the statically declared `sharedMemory` buffer.
  __global__ void __launch_bounds__(THREAD_COUNT, 3) gpuWork(uint64_t seedOffset) {
      const uint64_t globalThread = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x;
      const uint64_t worldSeed = globalThread + seedOffset;
      if (!checkTerrain(worldSeed)) {
          return;
      }

      const uint32_t capacity = sizeof(outputBuffer) / sizeof(outputBuffer[0]);
      const uint32_t idx = atomicAdd(&outputCounter, 1);
      // Never write past the end of the result buffer: a launch that produces
      // more matches than fit drops the excess instead of corrupting memory.
      if (idx < capacity) {
          outputBuffer[idx] = worldSeed;
      }
  }
  // Returns a millisecond timestamp for measuring elapsed time.
  //
  // The value comes from a monotonic clock, so differences between two calls
  // are never negative and are unaffected by wall-clock adjustments (NTP, DST,
  // manual changes). The absolute value has no calendar meaning; use it only
  // to compute differences.
  uint64_t milliseconds() {
      const auto sinceClockEpoch = std::chrono::steady_clock::now().time_since_epoch();
      return (uint64_t)std::chrono::duration_cast<std::chrono::milliseconds>(sinceClockEpoch).count();
  }
  // Wraps a CUDA runtime call (or any cudaError_t expression) and records the
  // call site for the error report.
  #define GPU_ASSERT(code) gpuAssert((code), __FILE__, __LINE__)

  // Does nothing on cudaSuccess. Otherwise prints the error string, numeric
  // code and call site to stderr and ends the program through boinc_finish,
  // using the CUDA error code as the exit status.
  inline void gpuAssert(cudaError_t code, const char *file, int line) {
      if (code == cudaSuccess) {
          return;
      }
      fprintf(stderr, "GPUassert: %s (code %d) %s %d\n", cudaGetErrorString(code), code, file, line);
      boinc_finish(code);
  }
  // Launches gpuWork with `blocks` blocks on seed offset 0, waits for it,
  // resets the result counter and returns the wall time of all that in seconds.
  static double probeLaunchSeconds(int blocks) {
      const uint64_t begin = milliseconds();
      gpuWork<<<blocks, THREAD_COUNT>>>(0);
      GPU_ASSERT(cudaPeekAtLastError());
      GPU_ASSERT(cudaDeviceSynchronize());
      GPU_ASSERT(cudaPeekAtLastError());
      outputCounter = 0;
      const uint64_t end = milliseconds();
      return (double)(end - begin) / 1000.0;
  }

  // Searches for a grid size (number of blocks) whose launch takes at most
  // about `threshold` seconds.
  //
  // The result is built bit by bit from the most significant end. Each round
  // tries adding bit 0, 1, 2, ... (below the last bit that was fixed) to the
  // bits chosen so far, and keeps the largest bit that did not push a timed
  // launch over the threshold. Returns 0 when even a single block is too slow.
  int calculateBlockSize(double threshold) {
      // Untimed warm-up launch, so one-time start-up cost does not skew the
      // first measurement.
      gpuWork<<<1, THREAD_COUNT>>>(0);
      GPU_ASSERT(cudaPeekAtLastError());
      GPU_ASSERT(cudaDeviceSynchronize());
      GPU_ASSERT(cudaPeekAtLastError());
      outputCounter = 0;

      int chosenBits = 0;
      int ceilingBit = 30;   // only bits strictly below this may still be added
      for (int round = 0; round < 30; round++) {
          int bit = 0;
          bool tooSlow = false;
          while (bit < ceilingBit) {
              if (probeLaunchSeconds(chosenBits | (1 << bit)) > threshold) {
                  tooSlow = true;
                  break;
              }
              bit++;
          }

          if (tooSlow && bit == 0) {
              // Even the smallest increment is over budget: nothing more fits.
              break;
          }

          // `bit - 1` is the largest candidate that stayed within budget, either
          // because `bit` was too slow or because every candidate passed.
          chosenBits |= (1 << (bit - 1));
          ceilingBit = bit - 1;
          if (ceilingBit == 0) {
              break;
          }
      }
      return chosenBits;
  }
  // Progress record stored in the checkpoint file as one raw binary struct
  // (written with fwrite and read back with fread).
  struct CheckpointData {
      int lastIteration;   // index of the last launch that finished
      double elapsed;      // accumulated kernel wall time so far, in seconds
      int blockCount;      // grid size in use, so that a resumed run keeps it
  };
  // Entry point: scans the TASK_WORK seeds belonging to one work unit.
  //
  // Command line (flag/value pairs):
  //   -t, --task   <n>   work unit number; seeds [n * TASK_WORK, (n + 1) * TASK_WORK)
  //   -d, --device <n>   CUDA device index (overridden by BOINC when it assigns one)
  //
  // Files, relative to the working directory:
  //   trailer_checkpoint.txt   binary CheckpointData, removed on success
  //   trailer_seeds.txt        one "Seed: <n>" line per match, opened in append mode
  //
  // Exits through boinc_finish(): 0 on success, non-zero on a CUDA error, an
  // unsupported device or an output file that cannot be opened.
  int main(int argc, char* argv[]) {
      int taskNumber = 0;
      int device = 0;
      // Options come in "flag value" pairs. Requiring i + 1 < argc guarantees
      // the value exists, so a trailing flag without a value is ignored instead
      // of handing a null pointer to atoi().
      for (int i = 1; i + 1 < argc; i += 2) {
          const char *param = argv[i];
          if (strcmp(param, "-t") == 0 || strcmp(param, "--task") == 0) {
              taskNumber = atoi(argv[i + 1]);
          } else if (strcmp(param, "-d") == 0 || strcmp(param, "--device") == 0) {
              device = atoi(argv[i + 1]);
          }
      }

      int startIteration = 0;
      double elapsed = 0;
      int BLOCK_COUNT = 0;
      fprintf(stderr, "Recieved work unit: %d.\n", taskNumber);
      fflush(stderr);

  #ifdef BOINC
      BOINC_OPTIONS options;
      boinc_options_defaults(options);
      options.normal_thread_priority = true;
      boinc_init_options(&options);
      APP_INIT_DATA aid;
      boinc_get_init_data(aid);
      if (aid.gpu_device_num >= 0) {
          fprintf(stderr, "boinc gpu: %d, cli gpu: %d.\n", aid.gpu_device_num, device);
          device = aid.gpu_device_num;
      } else {
          fprintf(stderr, "cli gpu: %d.\n", device);
      }
  #endif

      // Check the call itself: an invalid device index must stop the run here
      // rather than surface later as an unrelated-looking failure.
      GPU_ASSERT(cudaSetDevice(device));
      GPU_ASSERT(cudaPeekAtLastError());
      GPU_ASSERT(cudaDeviceSynchronize());
      GPU_ASSERT(cudaPeekAtLastError());

      // Resume from a checkpoint when a complete, plausible one exists.
      FILE* checkpointFile = boinc_fopen("trailer_checkpoint.txt", "rb");
      if (checkpointFile) {
          boinc_begin_critical_section();
          struct CheckpointData checkpointData;
          const size_t recordsRead = fread(&checkpointData, sizeof(checkpointData), 1, checkpointFile);
          fclose(checkpointFile);
          // A truncated or corrupt file would otherwise inject garbage into the
          // iteration index, the timing and the grid size.
          const bool usable = recordsRead == 1
              && checkpointData.lastIteration >= 0
              && checkpointData.blockCount > 0
              && checkpointData.elapsed >= 0;
          if (usable) {
              startIteration = checkpointData.lastIteration + 1;
              elapsed = checkpointData.elapsed;
              BLOCK_COUNT = checkpointData.blockCount;
              fprintf(stderr, "Loaded checkpoint %d %.2f %d.\n", startIteration, elapsed, BLOCK_COUNT);
          } else {
              fprintf(stderr, "Ignoring unreadable checkpoint.\n");
          }
          fflush(stderr);
          boinc_end_critical_section();
      } else {
          fprintf(stderr, "No checkpoint to load.\n");
      }

      // No grid size yet: pick one by timing launches against a per-architecture
      // time budget.
      if (BLOCK_COUNT == 0) {
          cudaDeviceProp deviceProp;
          GPU_ASSERT(cudaGetDeviceProperties(&deviceProp, device));
          int cc = deviceProp.major * 10 + deviceProp.minor;
          if (cc <= 52) {
              BLOCK_COUNT = calculateBlockSize(0.02);
          } else if (deviceProp.major == 6) {
              BLOCK_COUNT = calculateBlockSize(0.1);
          } else if (deviceProp.major == 7) {
              BLOCK_COUNT = calculateBlockSize(0.15);
          } else if (deviceProp.major == 8) {
              BLOCK_COUNT = calculateBlockSize(0.5);
          } else {
              fprintf(stderr, "Unrecognized compute capability.\n");
              fflush(stderr);
              boinc_finish(1);
          }
          fprintf(stderr, "Calculated block count: %d.\n", BLOCK_COUNT);
          if (BLOCK_COUNT == 0) { BLOCK_COUNT = 1; }
          fflush(stderr);
      }

      const uint64_t GRID_WORK = (uint64_t)BLOCK_COUNT * THREAD_COUNT;
      const int ITERATIONS_NEEDED = ((TASK_WORK + GRID_WORK - 1) / GRID_WORK);
      const uint64_t taskFirstSeed = TASK_WORK * taskNumber;
      const uint64_t taskEndSeed = TASK_WORK * (taskNumber + 1);
      const uint32_t outputCapacity = sizeof(outputBuffer) / sizeof(outputBuffer[0]);

      for (int i = startIteration; i < ITERATIONS_NEEDED; i++) {
          uint64_t seedOffset = taskFirstSeed + GRID_WORK * i;
          uint64_t startTime = milliseconds();
          gpuWork<<<BLOCK_COUNT, THREAD_COUNT>>>(seedOffset);
          GPU_ASSERT(cudaPeekAtLastError());
          GPU_ASSERT(cudaDeviceSynchronize());
          GPU_ASSERT(cudaPeekAtLastError());
          uint64_t endTime = milliseconds();

          boinc_begin_critical_section();
          double localElapsed = ((double)(endTime - startTime) / 1000);
          elapsed += localElapsed;

          // Store this launch's matches BEFORE recording the iteration as done.
          // With the opposite order, an interruption between the two writes
          // would resume past this iteration and lose its seeds for good.
          if (outputCounter > 0) {
              uint32_t found = outputCounter;
              if (found > outputCapacity) {
                  // The counter can exceed what the buffer holds; never read
                  // past the end of the array.
                  fprintf(stderr, "Warning: %u matches reported, only %u stored.\n", found, outputCapacity);
                  found = outputCapacity;
              }
              FILE *seedsOut = boinc_fopen("trailer_seeds.txt", "a");
              if (!seedsOut) {
                  fprintf(stderr, "Failed to open trailer_seeds.txt for appending.\n");
                  fflush(stderr);
                  boinc_finish(1);
              }
              for (uint32_t j = 0; j < found; j++) {
                  // The last launch may run past the end of this work unit;
                  // those seeds belong to the next one.
                  if (outputBuffer[j] < taskEndSeed) {
                      fprintf(seedsOut, "Seed: %llu\n", (unsigned long long)outputBuffer[j]);
                  }
              }
              fclose(seedsOut);
              outputCounter = 0;
          }

          if (boinc_time_to_checkpoint()) {
              struct CheckpointData checkpointData;
              checkpointData.lastIteration = i;
              checkpointData.elapsed = elapsed;
              checkpointData.blockCount = BLOCK_COUNT;
              FILE* checkpointOut = boinc_fopen("trailer_checkpoint.txt", "wb");
              if (checkpointOut) {
                  fwrite(&checkpointData, sizeof(checkpointData), 1, checkpointOut);
                  fclose(checkpointOut);
                  boinc_checkpoint_completed();
              } else {
                  // A missed checkpoint only costs recomputation after a
                  // restart, so report it and keep going.
                  fprintf(stderr, "Failed to write checkpoint.\n");
                  fflush(stderr);
              }
          }

          // Iteration i has finished, so i + 1 of ITERATIONS_NEEDED are done.
          double fracDone = (double)(i + 1) / ITERATIONS_NEEDED;
          boinc_fraction_done(fracDone);
          boinc_end_critical_section();
      }

      boinc_begin_critical_section();
      // Open in append mode so that the output file exists even when no seed
      // matched.
      FILE *seedsOut = boinc_fopen("trailer_seeds.txt", "a");
      if (seedsOut) {
          fclose(seedsOut);
      }
      fprintf(stderr, "Finished in %.2f seconds. Speed: %.2f/s.\n", elapsed, (double)TASK_WORK / elapsed);
      fflush(stderr);
      boinc_delete_file("trailer_checkpoint.txt");
      boinc_end_critical_section();
      boinc_finish(0);
      return 0;
  }