vertexcodec.cpp

// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD

// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#endif

// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
#define SIMD_SSE
#define SIMD_FALLBACK
#define SIMD_TARGET __attribute__((target("ssse3")))
#endif

// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
// Prevent compiling other variant when wasm simd compilation is active
#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#endif

#ifndef SIMD_TARGET
#define SIMD_TARGET
#endif

// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
#define SIMD_LATENCYOPT
#endif

#endif // !MESHOPTIMIZER_NO_SIMD

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
#ifdef _MSC_VER
#include <intrin.h> // __cpuid
#else
#include <cpuid.h> // __cpuid
#endif
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif

#ifndef TRACE
#define TRACE 0
#endif

#if TRACE
#include <stdio.h>
#endif

#ifdef SIMD_WASM
#define wasmx_splat_v32x4(v, i) wasm_i32x4_shuffle(v, v, i, i, i, i)
#define wasmx_unpacklo_v8x16(a, b) wasm_i8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasmx_unpackhi_v8x16(a, b) wasm_i8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasmx_unpacklo_v16x8(a, b) wasm_i16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_i16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unpacklo_v64x2(a, b) wasm_i64x2_shuffle(a, b, 0, 2)
#define wasmx_unpackhi_v64x2(a, b) wasm_i64x2_shuffle(a, b, 1, 3)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

static int gEncodeVertexVersion = 0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
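// worst-case number of bytes one encoded group can occupy or the decoder may read while decoding it:
// 8 bytes of packed 4-bit values plus up to 16 escape bytes (SIMD paths load a full 16-byte escape block)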
const size_t kByteGroupDecodeLimit = 24;
const size_t kTailMaxSize = 32;

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;
	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);
	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
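
// zigzag encoding maps byte deltas with small magnitude to small unsigned values:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...; e.g. zigzag8(0xff) == 1 and unzigzag8(1) == 0xff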
inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}

#if TRACE
struct Stats
{
	size_t size;
	size_t header;  // bytes for header
	size_t bitg[4]; // bytes for bit groups
	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
};

static Stats* bytestats = NULL;
static Stats vertexstats[256];
#endif

static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;
	return true;
}
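
// returns the encoded size of one 16-byte group at the given bit width;
// bits == 1 is the all-zero mode (0 payload bytes) and yields size_t(-1) if any byte is nonzero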
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);
	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
	if (bits == 8)
		return kByteGroupSize;
	size_t result = kByteGroupSize * bits / 8;
	unsigned char sentinel = (1 << bits) - 1;
	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;
	return result;
}

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);
	if (bits == 1)
		return data;
	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}
	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);
	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;
	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;
		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
			byte <<= bits;
			byte |= enc;
		}
		*data++ = byte;
	}
	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}
	return data;
}
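
// encodes one byte-plane: a header holding a 2-bit mode per 16-byte group (4 modes per header byte),
// followed by the payload of each group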
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	unsigned char* header = data;
	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;
	data += header_size;
	memset(header, 0, header_size);
	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;
		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}
		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3));
		assert((1 << bitslog2) == best_bits);
		size_t header_offset = i / kByteGroupSize;
		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
		assert(data + best_size == next);
		data = next;
#if TRACE
		bytestats->bitg[bitslog2] += best_size;
#endif
	}
#if TRACE
	bytestats->header += header_size;
#endif
	return data;
}
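
// encodes a block of up to kVertexBlockMaxSize vertices: every byte of the vertex is turned into
// a plane of zigzag-encoded deltas against the previous vertex and compressed with encodeBytes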
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);
	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));
	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;
		unsigned char p = last_vertex[k];
		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
			p = vertex_data[vertex_offset];
			vertex_offset += vertex_size;
		}
#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
		for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize)
		{
			unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k];
			unsigned char delta = 0xff;
			for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i)
				delta &= ~(vertex_data[vertex_size * i + k] ^ last);
			for (int j = 0; j < 8; ++j)
				bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? vertex_count - ig : kByteGroupSize) * ((delta >> j) & 1);
		}
#endif
		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return NULL;
#if TRACE
		bytestats = NULL;
		vertexstats[k].size += data - olddata;
#endif
	}
	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
	return data;
}

#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM))
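// scalar group decoder: READ fetches the next packed byte, NEXT extracts one field from its top bits
// and substitutes the next escape byte whenever the field equals the all-ones sentinel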
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
	unsigned char byte, enc, encv;
	const unsigned char* data_var;
	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;
		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		return data_var;
	case 2:
		data_var = data + 8;
		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
#undef READ
#undef NEXT
}

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	const unsigned char* header = data;
	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;
	data += header_size;
	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;
		size_t header_offset = i / kByteGroupSize;
		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}
	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];
	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
	assert(vertex_count <= vertex_count_aligned);
	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return NULL;
		size_t vertex_offset = k;
		unsigned char p = last_vertex[k];
		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;
			transposed[vertex_offset] = v;
			p = v;
			vertex_offset += vertex_size;
		}
	}
	memcpy(vertex_data, transposed, vertex_count * vertex_size);
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
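// for every 8-bit escape mask, kDecodeBytesGroupShuffle[mask][i] is the index of lane i's escape byte
// in the packed escape stream (0x80 when lane i is not escaped, which shuffles in a zero);
// kDecodeBytesGroupCount[mask] is the number of escaped lanes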
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

#ifdef __wasm__
__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
#endif
static bool
decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;
		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}
		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}
	return true;
}

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#ifdef SIMD_SSE
SIMD_TARGET
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
	__m128i sm1r = _mm_add_epi8(sm1, sm1off);
	return _mm_unpacklo_epi64(sm0, sm1r);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
		return data;
	}
	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif
#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;
		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif
		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);
		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}
	case 2:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;
		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif
		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);
		__m128i shuf = decodeShuffleMask(mask0, mask1);
		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}
	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
		return data + 16;
	}
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_AVX
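// entries 0/1 are the byte sentinels for the 2-bit and 4-bit modes; entries 2/3 are the per-byte bit
// offsets fed to _mm_multishift_epi64_epi8 to move each packed field into the low bits of its own byte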
static const __m128i decodeBytesGroupConfig[] = {
	_mm_set1_epi8(3),
	_mm_set1_epi8(15),
	_mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
	_mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
		return data;
	}
	case 1:
	case 2:
	{
		const unsigned char* skip = data + (bitslog2 << 2);
		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
		return skip + _mm_popcnt_u32(mask16);
	}
	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
		return data + 16;
	}
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);
	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;
	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);
		vst1q_u8(buffer, result);
		return data;
	}
	case 1:
	{
#ifdef SIMD_LATENCYOPT
		unsigned int data32;
		memcpy(&data32, data, 4);
		data32 &= data32 >> 1;
		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);
		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
		vst1q_u8(buffer, result);
#ifdef SIMD_LATENCYOPT
		return data + 4 + datacnt;
#else
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}
	case 2:
	{
#ifdef SIMD_LATENCYOPT
		unsigned long long data64;
		memcpy(&data64, data, 8);
		data64 &= data64 >> 1;
		data64 &= data64 >> 2;
		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
#endif
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);
		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
		vst1q_u8(buffer, result);
#ifdef SIMD_LATENCYOPT
		return data + 8 + datacnt;
#else
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
#endif
	}
	case 3:
	{
		uint8x16_t result = vld1q_u8(data);
		vst1q_u8(buffer, result);
		return data + 16;
	}
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
	sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
	return wasmx_unpacklo_v64x2(sm0, sm1r);
}

SIMD_TARGET
static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
	const uint64_t magic = 0x000103070f1f3f80ull;
	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}

SIMD_TARGET
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		v128_t result = wasm_i8x16_splat(0);
		wasm_v128_store(buffer, result);
		return data;
	}
	case 1:
	{
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);
		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);
		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
		wasm_v128_store(buffer, result);
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}
	case 2:
	{
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);
		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);
		v128_t shuf = decodeShuffleMask(mask0, mask1);
		v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask);
		wasm_v128_store(buffer, result);
		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}
	case 3:
	{
		v128_t result = wasm_v128_load(data);
		wasm_v128_store(buffer, result);
		return data + 16;
	}
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX)
SIMD_TARGET
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);
	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}

SIMD_TARGET
static __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
	return _mm_xor_si128(xl, xr);
}
#endif

#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);
	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);
	return veorq_u8(xl, xr);
}
#endif

#ifdef SIMD_WASM
SIMD_TARGET
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
	x0 = wasmx_unpacklo_v16x8(t0, t2);
	x1 = wasmx_unpackhi_v16x8(t0, t2);
	x2 = wasmx_unpacklo_v16x8(t1, t3);
	x3 = wasmx_unpackhi_v16x8(t1, t3);
}

SIMD_TARGET
static v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);
	return wasm_v128_xor(xl, xr);
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
SIMD_TARGET
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);
	const unsigned char* header = data;
	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
	if (size_t(data_end - data) < header_size)
		return NULL;
	data += header_size;
	size_t i = 0;
	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}
	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kByteGroupDecodeLimit)
			return NULL;
		size_t header_offset = i / kByteGroupSize;
		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}
	return data;
}

SIMD_TARGET
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];
	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return NULL;
		}
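
		// the macros below reconstruct 4 adjacent vertex bytes at a time: LOAD reads 16 deltas of one
		// byte-plane, transpose8 regroups them per-vertex, GRP4 broadcasts one vertex's 4 deltas,
		// FIXD adds them to the running previous-vertex bytes (prefix sum), SAVE stores 4 decoded bytes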
#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif

		PREP();
		unsigned char* savep = transposed + k;
		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);
			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);
			transpose8(r0, r1, r2, r3);
			TEMP t0, t1, t2, t3;
			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}
	memcpy(vertex_data, transposed, vertex_count * vertex_size);
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
	return data;
}
#endif

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
static unsigned int getCpuFeatures()
{
	int cpuinfo[4] = {};
#ifdef _MSC_VER
	__cpuid(cpuinfo, 1);
#else
	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
#endif
	return cpuinfo[2];
}

static unsigned int cpuid = getCpuFeatures();
#endif

} // namespace meshopt
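
// Encoded stream layout: one header byte (kVertexHeader | version), followed by the encoded vertex
// blocks, followed by a tail that stores the first vertex padded to at least kTailMaxSize bytes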
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;
	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);
#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif
	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;
	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;
	int version = gEncodeVertexVersion;
	*data++ = (unsigned char)(kVertexHeader | version);
	unsigned char first_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(first_vertex, vertex_data, vertex_size);
	unsigned char last_vertex[256] = {};
	memcpy(last_vertex, first_vertex, vertex_size);
	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_offset = 0;
	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;
		vertex_offset += block_size;
	}
	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
	if (size_t(data_end - data) < tail_size)
		return 0;
	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}
	memcpy(data, first_vertex, vertex_size);
	data += vertex_size;
	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);
#if TRACE
	size_t total_size = data - buffer;
	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];
		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
		printf("\n");
	}
#endif
	return data - buffer;
}

size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;
	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);
	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;
	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
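	// worst case: every group of every byte-plane is stored verbatim (16 bytes) plus its 2-bit header entry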
	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}

void meshopt_encodeVertexVersion(int version)
{
	assert(unsigned(version) <= 0);
	meshopt::gEncodeVertexVersion = version;
}

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;
	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);
	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
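	// CPUID leaf 1, ECX bit 9 indicates SSSE3 support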
	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif
	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;
	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;
	unsigned char data_header = *data++;
	if ((data_header & 0xf0) != kVertexHeader)
		return -1;
	int version = data_header & 0x0f;
	if (version > 0)
		return -1;
	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);
	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_offset = 0;
	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;
		vertex_offset += block_size;
	}
	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
	if (size_t(data_end - data) != tail_size)
		return -3;
	return 0;
}

#undef SIMD_NEON
#undef SIMD_SSE
#undef SIMD_AVX
#undef SIMD_WASM
#undef SIMD_FALLBACK
#undef SIMD_TARGET
#undef SIMD_LATENCYOPT