/* filter_neon_intrinsics.c - NEON optimised filter functions
 *
 * Copyright (c) 2014,2016 Glenn Randers-Pehrson
 * Written by James Yu <james.yu at linaro.org>, October 2013.
 * Based on filter_neon.S, written by Mans Rullgard, 2011.
 *
 * Last changed in libpng 1.6.22 [May 26, 2016]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -mfpu=neon on the command line: */
#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <arm_neon.h>

/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment. arm/arm_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)

/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'. This is written this way just to hide the GCC strict aliasing
 * warning; note that the code is safe because there never is an alias between
 * the input and output pointers.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)

#if PNG_ARM_NEON_OPT > 0
  33. void
  34. png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
  35. png_const_bytep prev_row)
  36. {
  37. png_bytep rp = row;
  38. png_bytep rp_stop = row + row_info->rowbytes;
  39. png_const_bytep pp = prev_row;
  40. png_debug(1, "in png_read_filter_row_up_neon");
  41. for (; rp < rp_stop; rp += 16, pp += 16)
  42. {
  43. uint8x16_t qrp, qpp;
  44. qrp = vld1q_u8(rp);
  45. qpp = vld1q_u8(pp);
  46. qrp = vaddq_u8(qrp, qpp);
  47. vst1q_u8(rp, qrp);
  48. }
  49. }
  50. void
  51. png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
  52. png_const_bytep prev_row)
  53. {
  54. png_bytep rp = row;
  55. png_bytep rp_stop = row + row_info->rowbytes;
  56. uint8x16_t vtmp = vld1q_u8(rp);
  57. uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
  58. uint8x8x2_t vrp = *vrpt;
  59. uint8x8x4_t vdest;
  60. vdest.val[3] = vdup_n_u8(0);
  61. png_debug(1, "in png_read_filter_row_sub3_neon");
  62. for (; rp < rp_stop;)
  63. {
  64. uint8x8_t vtmp1, vtmp2;
  65. uint32x2_t *temp_pointer;
  66. vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
  67. vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
  68. vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
  69. vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
  70. vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
  71. vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
  72. vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
  73. vtmp = vld1q_u8(rp + 12);
  74. vrpt = png_ptr(uint8x8x2_t, &vtmp);
  75. vrp = *vrpt;
  76. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
  77. rp += 3;
  78. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
  79. rp += 3;
  80. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
  81. rp += 3;
  82. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
  83. rp += 3;
  84. }
  85. PNG_UNUSED(prev_row)
  86. }
  87. void
  88. png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
  89. png_const_bytep prev_row)
  90. {
  91. png_bytep rp = row;
  92. png_bytep rp_stop = row + row_info->rowbytes;
  93. uint8x8x4_t vdest;
  94. vdest.val[3] = vdup_n_u8(0);
  95. png_debug(1, "in png_read_filter_row_sub4_neon");
  96. for (; rp < rp_stop; rp += 16)
  97. {
  98. uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
  99. uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
  100. uint8x8x4_t vrp = *vrpt;
  101. uint32x2x4_t *temp_pointer;
  102. vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
  103. vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
  104. vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
  105. vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
  106. vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
  107. }
  108. PNG_UNUSED(prev_row)
  109. }
  110. void
  111. png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
  112. png_const_bytep prev_row)
  113. {
  114. png_bytep rp = row;
  115. png_const_bytep pp = prev_row;
  116. png_bytep rp_stop = row + row_info->rowbytes;
  117. uint8x16_t vtmp;
  118. uint8x8x2_t *vrpt;
  119. uint8x8x2_t vrp;
  120. uint8x8x4_t vdest;
  121. vdest.val[3] = vdup_n_u8(0);
  122. vtmp = vld1q_u8(rp);
  123. vrpt = png_ptr(uint8x8x2_t,&vtmp);
  124. vrp = *vrpt;
  125. png_debug(1, "in png_read_filter_row_avg3_neon");
  126. for (; rp < rp_stop; pp += 12)
  127. {
  128. uint8x8_t vtmp1, vtmp2, vtmp3;
  129. uint8x8x2_t *vppt;
  130. uint8x8x2_t vpp;
  131. uint32x2_t *temp_pointer;
  132. vtmp = vld1q_u8(pp);
  133. vppt = png_ptr(uint8x8x2_t,&vtmp);
  134. vpp = *vppt;
  135. vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
  136. vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
  137. vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  138. vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
  139. vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
  140. vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
  141. vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
  142. vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
  143. vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
  144. vtmp = vld1q_u8(rp + 12);
  145. vrpt = png_ptr(uint8x8x2_t,&vtmp);
  146. vrp = *vrpt;
  147. vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
  148. vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
  149. vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
  150. vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
  151. vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
  152. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
  153. rp += 3;
  154. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
  155. rp += 3;
  156. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
  157. rp += 3;
  158. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
  159. rp += 3;
  160. }
  161. }
  162. void
  163. png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
  164. png_const_bytep prev_row)
  165. {
  166. png_bytep rp = row;
  167. png_bytep rp_stop = row + row_info->rowbytes;
  168. png_const_bytep pp = prev_row;
  169. uint8x8x4_t vdest;
  170. vdest.val[3] = vdup_n_u8(0);
  171. png_debug(1, "in png_read_filter_row_avg4_neon");
  172. for (; rp < rp_stop; rp += 16, pp += 16)
  173. {
  174. uint32x2x4_t vtmp;
  175. uint8x8x4_t *vrpt, *vppt;
  176. uint8x8x4_t vrp, vpp;
  177. uint32x2x4_t *temp_pointer;
  178. vtmp = vld4_u32(png_ptr(uint32_t,rp));
  179. vrpt = png_ptr(uint8x8x4_t,&vtmp);
  180. vrp = *vrpt;
  181. vtmp = vld4_u32(png_ptrc(uint32_t,pp));
  182. vppt = png_ptr(uint8x8x4_t,&vtmp);
  183. vpp = *vppt;
  184. vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
  185. vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  186. vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
  187. vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
  188. vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
  189. vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
  190. vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
  191. vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
  192. vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
  193. }
  194. }
  195. static uint8x8_t
  196. paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
  197. {
  198. uint8x8_t d, e;
  199. uint16x8_t p1, pa, pb, pc;
  200. p1 = vaddl_u8(a, b); /* a + b */
  201. pc = vaddl_u8(c, c); /* c * 2 */
  202. pa = vabdl_u8(b, c); /* pa */
  203. pb = vabdl_u8(a, c); /* pb */
  204. pc = vabdq_u16(p1, pc); /* pc */
  205. p1 = vcleq_u16(pa, pb); /* pa <= pb */
  206. pa = vcleq_u16(pa, pc); /* pa <= pc */
  207. pb = vcleq_u16(pb, pc); /* pb <= pc */
  208. p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
  209. d = vmovn_u16(pb);
  210. e = vmovn_u16(p1);
  211. d = vbsl_u8(d, b, c);
  212. e = vbsl_u8(e, a, d);
  213. return e;
  214. }
  215. void
  216. png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
  217. png_const_bytep prev_row)
  218. {
  219. png_bytep rp = row;
  220. png_const_bytep pp = prev_row;
  221. png_bytep rp_stop = row + row_info->rowbytes;
  222. uint8x16_t vtmp;
  223. uint8x8x2_t *vrpt;
  224. uint8x8x2_t vrp;
  225. uint8x8_t vlast = vdup_n_u8(0);
  226. uint8x8x4_t vdest;
  227. vdest.val[3] = vdup_n_u8(0);
  228. vtmp = vld1q_u8(rp);
  229. vrpt = png_ptr(uint8x8x2_t,&vtmp);
  230. vrp = *vrpt;
  231. png_debug(1, "in png_read_filter_row_paeth3_neon");
  232. for (; rp < rp_stop; pp += 12)
  233. {
  234. uint8x8x2_t *vppt;
  235. uint8x8x2_t vpp;
  236. uint8x8_t vtmp1, vtmp2, vtmp3;
  237. uint32x2_t *temp_pointer;
  238. vtmp = vld1q_u8(pp);
  239. vppt = png_ptr(uint8x8x2_t,&vtmp);
  240. vpp = *vppt;
  241. vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
  242. vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  243. vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
  244. vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
  245. vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
  246. vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
  247. vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
  248. vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
  249. vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
  250. vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
  251. vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
  252. vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
  253. vtmp = vld1q_u8(rp + 12);
  254. vrpt = png_ptr(uint8x8x2_t,&vtmp);
  255. vrp = *vrpt;
  256. vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
  257. vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
  258. vlast = vtmp2;
  259. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
  260. rp += 3;
  261. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
  262. rp += 3;
  263. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
  264. rp += 3;
  265. vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
  266. rp += 3;
  267. }
  268. }
  269. void
  270. png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
  271. png_const_bytep prev_row)
  272. {
  273. png_bytep rp = row;
  274. png_bytep rp_stop = row + row_info->rowbytes;
  275. png_const_bytep pp = prev_row;
  276. uint8x8_t vlast = vdup_n_u8(0);
  277. uint8x8x4_t vdest;
  278. vdest.val[3] = vdup_n_u8(0);
  279. png_debug(1, "in png_read_filter_row_paeth4_neon");
  280. for (; rp < rp_stop; rp += 16, pp += 16)
  281. {
  282. uint32x2x4_t vtmp;
  283. uint8x8x4_t *vrpt, *vppt;
  284. uint8x8x4_t vrp, vpp;
  285. uint32x2x4_t *temp_pointer;
  286. vtmp = vld4_u32(png_ptr(uint32_t,rp));
  287. vrpt = png_ptr(uint8x8x4_t,&vtmp);
  288. vrp = *vrpt;
  289. vtmp = vld4_u32(png_ptrc(uint32_t,pp));
  290. vppt = png_ptr(uint8x8x4_t,&vtmp);
  291. vpp = *vppt;
  292. vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
  293. vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
  294. vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
  295. vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
  296. vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
  297. vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
  298. vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
  299. vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
  300. vlast = vpp.val[3];
  301. vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
  302. }
  303. }
#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */