xz_dec_bcj.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579
  1. /* xz_dec_bcj.c - Branch/Call/Jump (BCJ) filter decoders */
  2. /*
  3. * GRUB -- GRand Unified Bootloader
  4. * Copyright (C) 2010 Free Software Foundation, Inc.
  5. *
  6. * GRUB is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * GRUB is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with GRUB. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. /*
  20. * This file is based on code from XZ embedded project
  21. * http://tukaani.org/xz/embedded.html
  22. */
  23. #include "xz_private.h"
  24. struct xz_dec_bcj {
  25. /* Type of the BCJ filter being used */
  26. enum {
  27. BCJ_X86 = 4, /* x86 or x86-64 */
  28. BCJ_POWERPC = 5, /* Big endian only */
  29. BCJ_IA64 = 6, /* Big or little endian */
  30. BCJ_ARM = 7, /* Little endian only */
  31. BCJ_ARMTHUMB = 8, /* Little endian only */
  32. BCJ_SPARC = 9 /* Big or little endian */
  33. } type;
  34. /*
  35. * Return value of the next filter in the chain. We need to preserve
  36. * this information across calls, because we must not call the next
  37. * filter anymore once it has returned XZ_STREAM_END.
  38. */
  39. enum xz_ret ret;
  40. /* True if we are operating in single-call mode. */
  41. bool single_call;
  42. /*
  43. * Absolute position relative to the beginning of the uncompressed
  44. * data (in a single .xz Block). We care only about the lowest 32
  45. * bits so this doesn't need to be uint64_t even with big files.
  46. */
  47. uint32_t pos;
  48. /* x86 filter state */
  49. uint32_t x86_prev_mask;
  50. /* Temporary space to hold the variables from struct xz_buf */
  51. uint8_t *out;
  52. size_t out_pos;
  53. size_t out_size;
  54. struct {
  55. /* Amount of already filtered data in the beginning of buf */
  56. size_t filtered;
  57. /* Total amount of data currently stored in buf */
  58. size_t size;
  59. /*
  60. * Buffer to hold a mix of filtered and unfiltered data. This
  61. * needs to be big enough to hold Alignment + 2 * Look-ahead:
  62. *
  63. * Type Alignment Look-ahead
  64. * x86 1 4
  65. * PowerPC 4 0
  66. * IA-64 16 0
  67. * ARM 4 0
  68. * ARM-Thumb 2 2
  69. * SPARC 4 0
  70. */
  71. uint8_t buf[16];
  72. } temp;
  73. };
  74. #ifdef XZ_DEC_X86
  75. /*
  76. * This is macro used to test the most significant byte of a memory address
  77. * in an x86 instruction.
  78. */
  79. #define bcj_x86_test_msbyte(b) ((b) == 0x00 || (b) == 0xFF)
  80. static noinline_for_stack size_t bcj_x86(
  81. struct xz_dec_bcj *s, uint8_t *buf, size_t size)
  82. {
  83. static const bool mask_to_allowed_status[8]
  84. = { true, true, true, false, true, false, false, false };
  85. static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };
  86. size_t i;
  87. size_t prev_pos = (size_t)-1;
  88. uint32_t prev_mask = s->x86_prev_mask;
  89. uint32_t src;
  90. uint32_t dest;
  91. uint32_t j;
  92. uint8_t b;
  93. if (size <= 4)
  94. return 0;
  95. size -= 4;
  96. for (i = 0; i < size; ++i) {
  97. if ((buf[i] & 0xFE) != 0xE8)
  98. continue;
  99. prev_pos = i - prev_pos;
  100. if (prev_pos > 3) {
  101. prev_mask = 0;
  102. } else {
  103. prev_mask = (prev_mask << (prev_pos - 1)) & 7;
  104. if (prev_mask != 0) {
  105. b = buf[i + 4 - mask_to_bit_num[prev_mask]];
  106. if (!mask_to_allowed_status[prev_mask]
  107. || bcj_x86_test_msbyte(b)) {
  108. prev_pos = i;
  109. prev_mask = (prev_mask << 1) | 1;
  110. continue;
  111. }
  112. }
  113. }
  114. prev_pos = i;
  115. if (bcj_x86_test_msbyte(buf[i + 4])) {
  116. src = get_unaligned_le32(buf + i + 1);
  117. while (true) {
  118. dest = src - (s->pos + (uint32_t)i + 5);
  119. if (prev_mask == 0)
  120. break;
  121. j = mask_to_bit_num[prev_mask] * 8;
  122. b = (uint8_t)(dest >> (24 - j));
  123. if (!bcj_x86_test_msbyte(b))
  124. break;
  125. src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
  126. }
  127. dest &= 0x01FFFFFF;
  128. dest |= (uint32_t)0 - (dest & 0x01000000);
  129. put_unaligned_le32(dest, buf + i + 1);
  130. i += 4;
  131. } else {
  132. prev_mask = (prev_mask << 1) | 1;
  133. }
  134. }
  135. prev_pos = i - prev_pos;
  136. s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
  137. return i;
  138. }
  139. #endif
  140. #ifdef XZ_DEC_POWERPC
  141. static noinline_for_stack size_t bcj_powerpc(
  142. struct xz_dec_bcj *s, uint8_t *buf, size_t size)
  143. {
  144. size_t i;
  145. uint32_t instr;
  146. for (i = 0; i + 3 < size; i += 4) {
  147. instr = get_unaligned_be32(buf + i);
  148. if ((instr & 0xFC000003) == 0x48000001) {
  149. instr &= 0x03FFFFFC;
  150. instr -= s->pos + (uint32_t)i;
  151. instr &= 0x03FFFFFC;
  152. instr |= 0x48000001;
  153. put_unaligned_be32(instr, buf + i);
  154. }
  155. }
  156. return i;
  157. }
  158. #endif
  159. #ifdef XZ_DEC_IA64
  160. static noinline_for_stack size_t bcj_ia64(
  161. struct xz_dec_bcj *s, uint8_t *buf, size_t size)
  162. {
  163. static const uint8_t branch_table[32] = {
  164. 0, 0, 0, 0, 0, 0, 0, 0,
  165. 0, 0, 0, 0, 0, 0, 0, 0,
  166. 4, 4, 6, 6, 0, 0, 7, 7,
  167. 4, 4, 0, 0, 4, 4, 0, 0
  168. };
  169. /*
  170. * The local variables take a little bit stack space, but it's less
  171. * than what LZMA2 decoder takes, so it doesn't make sense to reduce
  172. * stack usage here without doing that for the LZMA2 decoder too.
  173. */
  174. /* Loop counters */
  175. size_t i;
  176. size_t j;
  177. /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
  178. uint32_t slot;
  179. /* Bitwise offset of the instruction indicated by slot */
  180. uint32_t bit_pos;
  181. /* bit_pos split into byte and bit parts */
  182. uint32_t byte_pos;
  183. uint32_t bit_res;
  184. /* Address part of an instruction */
  185. uint32_t addr;
  186. /* Mask used to detect which instructions to convert */
  187. uint32_t mask;
  188. /* 41-bit instruction stored somewhere in the lowest 48 bits */
  189. uint64_t instr;
  190. /* Instruction normalized with bit_res for easier manipulation */
  191. uint64_t norm;
  192. for (i = 0; i + 16 <= size; i += 16) {
  193. mask = branch_table[buf[i] & 0x1F];
  194. for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
  195. if (((mask >> slot) & 1) == 0)
  196. continue;
  197. byte_pos = bit_pos >> 3;
  198. bit_res = bit_pos & 7;
  199. instr = 0;
  200. for (j = 0; j < 6; ++j)
  201. instr |= (uint64_t)(buf[i + j + byte_pos])
  202. << (8 * j);
  203. norm = instr >> bit_res;
  204. if (((norm >> 37) & 0x0F) == 0x05
  205. && ((norm >> 9) & 0x07) == 0) {
  206. addr = (norm >> 13) & 0x0FFFFF;
  207. addr |= ((uint32_t)(norm >> 36) & 1) << 20;
  208. addr <<= 4;
  209. addr -= s->pos + (uint32_t)i;
  210. addr >>= 4;
  211. norm &= ~((uint64_t)0x8FFFFF << 13);
  212. norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
  213. norm |= (uint64_t)(addr & 0x100000)
  214. << (36 - 20);
  215. instr &= (1 << bit_res) - 1;
  216. instr |= norm << bit_res;
  217. for (j = 0; j < 6; j++)
  218. buf[i + j + byte_pos]
  219. = (uint8_t)(instr >> (8 * j));
  220. }
  221. }
  222. }
  223. return i;
  224. }
  225. #endif
  226. #ifdef XZ_DEC_ARM
  227. static noinline_for_stack size_t bcj_arm(
  228. struct xz_dec_bcj *s, uint8_t *buf, size_t size)
  229. {
  230. size_t i;
  231. uint32_t addr;
  232. for (i = 0; i + 4 <= size; i += 4) {
  233. if (buf[i + 3] == 0xEB) {
  234. addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
  235. | ((uint32_t)buf[i + 2] << 16);
  236. addr <<= 2;
  237. addr -= s->pos + (uint32_t)i + 8;
  238. addr >>= 2;
  239. buf[i] = (uint8_t)addr;
  240. buf[i + 1] = (uint8_t)(addr >> 8);
  241. buf[i + 2] = (uint8_t)(addr >> 16);
  242. }
  243. }
  244. return i;
  245. }
  246. #endif
  247. #ifdef XZ_DEC_ARMTHUMB
  248. static noinline_for_stack size_t bcj_armthumb(
  249. struct xz_dec_bcj *s, uint8_t *buf, size_t size)
  250. {
  251. size_t i;
  252. uint32_t addr;
  253. for (i = 0; i + 4 <= size; i += 2) {
  254. if ((buf[i + 1] & 0xF8) == 0xF0
  255. && (buf[i + 3] & 0xF8) == 0xF8) {
  256. addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
  257. | ((uint32_t)buf[i] << 11)
  258. | (((uint32_t)buf[i + 3] & 0x07) << 8)
  259. | (uint32_t)buf[i + 2];
  260. addr <<= 1;
  261. addr -= s->pos + (uint32_t)i + 4;
  262. addr >>= 1;
  263. buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
  264. buf[i] = (uint8_t)(addr >> 11);
  265. buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
  266. buf[i + 2] = (uint8_t)addr;
  267. i += 2;
  268. }
  269. }
  270. return i;
  271. }
  272. #endif
  273. #ifdef XZ_DEC_SPARC
  274. static noinline_for_stack size_t bcj_sparc(
  275. struct xz_dec_bcj *s, uint8_t *buf, size_t size)
  276. {
  277. size_t i;
  278. uint32_t instr;
  279. for (i = 0; i + 4 <= size; i += 4) {
  280. instr = get_unaligned_be32(buf + i);
  281. if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
  282. instr <<= 2;
  283. instr -= s->pos + (uint32_t)i;
  284. instr >>= 2;
  285. instr = ((uint32_t)0x40000000 - (instr & 0x400000))
  286. | 0x40000000 | (instr & 0x3FFFFF);
  287. put_unaligned_be32(instr, buf + i);
  288. }
  289. }
  290. return i;
  291. }
  292. #endif
  293. /*
  294. * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
  295. * of data that got filtered.
  296. *
  297. * NOTE: This is implemented as a switch statement to avoid using function
  298. * pointers, which could be problematic in the kernel boot code, which must
  299. * avoid pointers to static data (at least on x86).
  300. */
  301. static void bcj_apply(struct xz_dec_bcj *s,
  302. uint8_t *buf __attribute__((unused)), size_t *pos, size_t size __attribute__((unused)))
  303. {
  304. size_t filtered;
  305. buf += *pos;
  306. size -= *pos;
  307. switch (s->type) {
  308. #ifdef XZ_DEC_X86
  309. case BCJ_X86:
  310. filtered = bcj_x86(s, buf, size);
  311. break;
  312. #endif
  313. #ifdef XZ_DEC_POWERPC
  314. case BCJ_POWERPC:
  315. filtered = bcj_powerpc(s, buf, size);
  316. break;
  317. #endif
  318. #ifdef XZ_DEC_IA64
  319. case BCJ_IA64:
  320. filtered = bcj_ia64(s, buf, size);
  321. break;
  322. #endif
  323. #ifdef XZ_DEC_ARM
  324. case BCJ_ARM:
  325. filtered = bcj_arm(s, buf, size);
  326. break;
  327. #endif
  328. #ifdef XZ_DEC_ARMTHUMB
  329. case BCJ_ARMTHUMB:
  330. filtered = bcj_armthumb(s, buf, size);
  331. break;
  332. #endif
  333. #ifdef XZ_DEC_SPARC
  334. case BCJ_SPARC:
  335. filtered = bcj_sparc(s, buf, size);
  336. break;
  337. #endif
  338. default:
  339. /* Never reached but silence compiler warnings. */
  340. filtered = 0;
  341. break;
  342. }
  343. *pos += filtered;
  344. s->pos += filtered;
  345. }
  346. /*
  347. * Flush pending filtered data from temp to the output buffer.
  348. * Move the remaining mixture of possibly filtered and unfiltered
  349. * data to the beginning of temp.
  350. */
  351. static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
  352. {
  353. size_t copy_size;
  354. copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
  355. memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
  356. b->out_pos += copy_size;
  357. s->temp.filtered -= copy_size;
  358. s->temp.size -= copy_size;
  359. memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
  360. }
  361. /*
  362. * The BCJ filter functions are primitive in sense that they process the
  363. * data in chunks of 1-16 bytes. To hide this issue, this function does
  364. * some buffering.
  365. */
  366. enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
  367. struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
  368. {
  369. size_t out_start;
  370. /*
  371. * Flush pending already filtered data to the output buffer. Return
  372. * immediatelly if we couldn't flush everything, or if the next
  373. * filter in the chain had already returned XZ_STREAM_END.
  374. */
  375. if (s->temp.filtered > 0) {
  376. bcj_flush(s, b);
  377. if (s->temp.filtered > 0)
  378. return XZ_OK;
  379. if (s->ret == XZ_STREAM_END)
  380. return XZ_STREAM_END;
  381. }
  382. /*
  383. * If we have more output space than what is currently pending in
  384. * temp, copy the unfiltered data from temp to the output buffer
  385. * and try to fill the output buffer by decoding more data from the
  386. * next filter in the chain. Apply the BCJ filter on the new data
  387. * in the output buffer. If everything cannot be filtered, copy it
  388. * to temp and rewind the output buffer position accordingly.
  389. */
  390. if (s->temp.size < b->out_size - b->out_pos) {
  391. out_start = b->out_pos;
  392. memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
  393. b->out_pos += s->temp.size;
  394. s->ret = xz_dec_lzma2_run(lzma2, b);
  395. if (s->ret != XZ_STREAM_END
  396. && (s->ret != XZ_OK || s->single_call))
  397. return s->ret;
  398. bcj_apply(s, b->out, &out_start, b->out_pos);
  399. /*
  400. * As an exception, if the next filter returned XZ_STREAM_END,
  401. * we can do that too, since the last few bytes that remain
  402. * unfiltered are meant to remain unfiltered.
  403. */
  404. if (s->ret == XZ_STREAM_END)
  405. return XZ_STREAM_END;
  406. s->temp.size = b->out_pos - out_start;
  407. b->out_pos -= s->temp.size;
  408. memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
  409. }
  410. /*
  411. * If we have unfiltered data in temp, try to fill by decoding more
  412. * data from the next filter. Apply the BCJ filter on temp. Then we
  413. * hopefully can fill the actual output buffer by copying filtered
  414. * data from temp. A mix of filtered and unfiltered data may be left
  415. * in temp; it will be taken care on the next call to this function.
  416. */
  417. if (s->temp.size > 0) {
  418. /* Make b->out{,_pos,_size} temporarily point to s->temp. */
  419. s->out = b->out;
  420. s->out_pos = b->out_pos;
  421. s->out_size = b->out_size;
  422. b->out = s->temp.buf;
  423. b->out_pos = s->temp.size;
  424. b->out_size = sizeof(s->temp.buf);
  425. s->ret = xz_dec_lzma2_run(lzma2, b);
  426. s->temp.size = b->out_pos;
  427. b->out = s->out;
  428. b->out_pos = s->out_pos;
  429. b->out_size = s->out_size;
  430. if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
  431. return s->ret;
  432. bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);
  433. /*
  434. * If the next filter returned XZ_STREAM_END, we mark that
  435. * everything is filtered, since the last unfiltered bytes
  436. * of the stream are meant to be left as is.
  437. */
  438. if (s->ret == XZ_STREAM_END)
  439. s->temp.filtered = s->temp.size;
  440. bcj_flush(s, b);
  441. if (s->temp.filtered > 0)
  442. return XZ_OK;
  443. }
  444. return s->ret;
  445. }
  446. #ifdef GRUB_EMBED_DECOMPRESSOR
  447. struct xz_dec_bcj bcj;
  448. #endif
  449. struct xz_dec_bcj * xz_dec_bcj_create(bool single_call)
  450. {
  451. struct xz_dec_bcj *s;
  452. #ifdef GRUB_EMBED_DECOMPRESSOR
  453. s = &bcj;
  454. #else
  455. s = kmalloc(sizeof(*s), GFP_KERNEL);
  456. #endif
  457. if (s != NULL)
  458. s->single_call = single_call;
  459. return s;
  460. }
  461. enum xz_ret xz_dec_bcj_reset(
  462. struct xz_dec_bcj *s, uint8_t id)
  463. {
  464. switch (id) {
  465. #ifdef XZ_DEC_X86
  466. case BCJ_X86:
  467. #endif
  468. #ifdef XZ_DEC_POWERPC
  469. case BCJ_POWERPC:
  470. #endif
  471. #ifdef XZ_DEC_IA64
  472. case BCJ_IA64:
  473. #endif
  474. #ifdef XZ_DEC_ARM
  475. case BCJ_ARM:
  476. #endif
  477. #ifdef XZ_DEC_ARMTHUMB
  478. case BCJ_ARMTHUMB:
  479. #endif
  480. #ifdef XZ_DEC_SPARC
  481. case BCJ_SPARC:
  482. #endif
  483. break;
  484. default:
  485. /* Unsupported Filter ID */
  486. return XZ_OPTIONS_ERROR;
  487. }
  488. s->type = id;
  489. s->ret = XZ_OK;
  490. s->pos = 0;
  491. s->x86_prev_mask = 0;
  492. s->temp.filtered = 0;
  493. s->temp.size = 0;
  494. return XZ_OK;
  495. }