filter_neon.S 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. /* filter_neon.S - NEON optimised filter functions
  2. *
  3. * Copyright (c) 2014,2017 Glenn Randers-Pehrson
  4. * Written by Mans Rullgard, 2011.
  5. * Last changed in libpng 1.6.31 [July 27, 2017]
  6. *
  7. * This code is released under the libpng license.
  8. * For conditions of distribution and use, see the disclaimer
  9. * and license in png.h
  10. */
  11. /* This is required to get the symbol renames, which are #defines, and the
  12. * definitions (or not) of PNG_ARM_NEON_OPT and PNG_ARM_NEON_IMPLEMENTATION.
  13. */
  14. #define PNG_VERSION_INFO_ONLY
  15. #include "../pngpriv.h"
  /* Mark the stack as non-executable.  The note section is emitted only for
   * ELF output on Linux and FreeBSD.
   */
  #if defined(__ELF__) && (defined(__linux__) || defined(__FreeBSD__))
          .section .note.GNU-stack, "", %progbits
  #endif
  19. #ifdef PNG_READ_SUPPORTED
  20. /* Assembler NEON support - only works for 32-bit ARM (i.e. it does not work for
  21. * ARM64). The code in arm/filter_neon_intrinsics.c supports ARM64, however it
  22. * only works if -mfpu=neon is specified on the GCC command line. See pngpriv.h
  23. * for the logic which sets PNG_USE_ARM_NEON_ASM:
  24. */
  25. #if PNG_ARM_NEON_IMPLEMENTATION == 2 /* hand-coded assembler */
  26. #if PNG_ARM_NEON_OPT > 0
  /* Directives that only make sense for ELF output are written with an ELF
   * prefix.  When __ELF__ is defined the prefix expands to nothing and the
   * directive is assembled; otherwise it expands to '@', which turns the rest
   * of the line into an assembler comment.
   */
  #if defined(__ELF__)
  #   define ELF
  #else
  #   define ELF @
  #endif

          /* Target selection for everything below. */
          .arch   armv7-a
          .fpu    neon
  /* func name [, export=1]
   *
   * Opens a function called \name: selects .text, forces 4-byte alignment,
   * makes the symbol global when export is non-zero, emits the ELF symbol
   * type and the .func directive, and finally places the \name label.
   *
   * Each invocation also defines a one-shot "endfunc" macro bound to the same
   * name.  endfunc emits the ELF symbol size and .endfunc, then purges itself
   * so that the next func invocation is able to define it again.
   */
  .macro func name, export=0
      .macro endfunc
  ELF     .size   \name, . - \name
          .endfunc
          .purgem endfunc
      .endm

          .text
          /* The alignment is spelled out because some versions of GAS do not
           * align code correctly.  It is harmless with versions that do.
           */
          .p2align 2
      .if \export
          .globl  \name
      .endif
  ELF     .type   \name, STT_FUNC
          .func   \name
  \name:
  .endm
  /* png_read_filter_row_sub4_neon
   *
   * Reverses the "Sub" filter in place for 4-byte pixels: every pixel has the
   * previously reconstructed pixel added to it byte by byte (modulo 256).
   * The pixel before the first one is taken as zero.
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten; accessed with a :128 hint, so
   *            it has to be 16-byte aligned
   * Uses: r3, d0-d7, flags
   *
   * Sixteen bytes (four pixels) are handled per pass.  The body runs before
   * the count is tested, so a final partial group is still processed as a
   * full 16 bytes.
   * NOTE(review): presumably the caller pads the row buffer to cover that
   * tail - confirm against the allocation code.
   */
  func png_read_filter_row_sub4_neon, export=1
          ldr             r3,  [r0, #4]                   @ r3 = rowbytes
          vmov.i8         d3,  #0                         @ left neighbour of pixel 0
  1:
          @ Load four pixels, one per register, replicated to both lanes.
          vld4.32         {d4[],d5[],d6[],d7[]}, [r1,:128]

          @ Running sum: d3 enters holding the last pixel of the previous group.
          vadd.u8         d0,  d3,  d4
          vadd.u8         d1,  d0,  d5
          vadd.u8         d2,  d1,  d6
          vadd.u8         d3,  d2,  d7

          @ Write lane 0 of each result back and step the row pointer by 16.
          vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!

          subs            r3,  r3,  #16                   @ bytes still to do
          bgt             1b

          bx              lr
  endfunc
  /* png_read_filter_row_sub3_neon
   *
   * Reverses the "Sub" filter in place for 3-byte pixels: every pixel has the
   * previously reconstructed pixel added to it byte by byte (modulo 256).
   * The pixel before the first one is taken as zero.
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten; the first store of each pass
   *            carries a :32 alignment hint
   * Uses: r0, r2, r3, r12, d0-d3, d5-d7, q11 (d22/d23), flags
   *
   * Twelve bytes (four pixels) are handled per pass, but data moves in wider
   * units: each load fetches 16 bytes and each store writes 4 bytes while the
   * pointer only advances by 3.  The surplus byte of a store is overwritten
   * by the store that follows it; the surplus byte of the last store in a
   * pass lands on the first byte of the next group.  For that reason the next
   * 16 bytes are fetched BEFORE the stores of the current pass are issued, and
   * the instruction order below must be kept.
   * NOTE(review): the last pass therefore reads and writes past the final
   * group; presumably the row buffer is padded for this - confirm.
   */
  func png_read_filter_row_sub3_neon, export=1
          ldr             r3,  [r0, #4]                   @ r3 = rowbytes
          vmov.i8         d3,  #0                         @ left neighbour of pixel 0
          mov             r0,  r1                         @ r0 reads ahead, r1 writes
          mov             r2,  #3                         @ store step: one pixel
          mov             r12, #12                        @ load step: four pixels
          vld1.8          {q11}, [r0], r12                @ prime q11 with bytes 0..15
  1:
          vext.8          d5,  d22, d23, #3               @ pixel 1 to the low bytes
          vadd.u8         d0,  d3,  d22                   @ out0 = left + pixel 0
          vext.8          d6,  d22, d23, #6               @ pixel 2 to the low bytes
          vadd.u8         d1,  d0,  d5                    @ out1 = out0 + pixel 1
          vext.8          d7,  d23, d23, #1               @ pixel 3 (chunk bytes 9..11)
          vld1.8          {q11}, [r0], r12                @ next chunk, ahead of the stores
          vst1.32         {d0[0]}, [r1,:32], r2           @ 4 bytes out, advance 3
          vadd.u8         d2,  d1,  d6                    @ out2 = out1 + pixel 2
          vst1.32         {d1[0]}, [r1], r2
          vadd.u8         d3,  d2,  d7                    @ out3, also left for next pass
          vst1.32         {d2[0]}, [r1], r2
          vst1.32         {d3[0]}, [r1], r2

          subs            r3,  r3,  #12                   @ bytes still to do
          bgt             1b

          bx              lr
  endfunc
  /* png_read_filter_row_up_neon
   *
   * Reverses the "Up" filter in place: each byte of the row has the byte at
   * the same position in the previous row added to it (modulo 256).
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten
   *       r2 = previous row, read only
   *       Both buffers are accessed with a :128 hint and so have to be
   *       16-byte aligned.
   * Uses: r3, q0, q1, flags
   *
   * Sixteen bytes are handled per pass.  The body runs before the count is
   * tested, so a final partial group is still processed as a full 16 bytes.
   */
  func png_read_filter_row_up_neon, export=1
          ldr             r3,  [r0, #4]                   @ r3 = rowbytes
  1:
          vld1.8          {q0}, [r1,:128]                 @ current row, pointer kept
          vld1.8          {q1}, [r2,:128]!                @ previous row, pointer stepped
          vadd.u8         q0,  q0,  q1
          vst1.8          {q0}, [r1,:128]!                @ write back, step the row

          subs            r3,  r3,  #16                   @ bytes still to do
          bgt             1b

          bx              lr
  endfunc
  /* png_read_filter_row_avg4_neon
   *
   * Reverses the "Average" filter in place for 4-byte pixels.  Each output
   * pixel is the stored pixel plus the halved sum of the pixel to its left
   * (already reconstructed) and the pixel above it; vhadd.u8 forms that
   * halved sum per byte, and the final add wraps modulo 256.  The pixel left
   * of the first one is taken as zero.
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten
   *       r2 = previous row, read only
   *       Both buffers are accessed with a :128 hint and so have to be
   *       16-byte aligned.
   * Uses: r12, d0-d7, d16-d19, flags
   *
   * Sixteen bytes (four pixels) are handled per pass.  The body runs before
   * the count is tested, so a final partial group is still processed as a
   * full 16 bytes.
   */
  func png_read_filter_row_avg4_neon, export=1
          ldr             r12, [r0, #4]                   @ r12 = rowbytes
          vmov.i8         d3,  #0                         @ left neighbour of pixel 0
  1:
          @ Four pixels of each row, one per register, replicated to both lanes.
          vld4.32         {d4[],d5[],d6[],d7[]}, [r1,:128]
          vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]!

          vhadd.u8        d0,  d3,  d16                   @ (left + above) / 2
          vadd.u8         d0,  d0,  d4                    @ out0
          vhadd.u8        d1,  d0,  d17
          vadd.u8         d1,  d1,  d5                    @ out1
          vhadd.u8        d2,  d1,  d18
          vadd.u8         d2,  d2,  d6                    @ out2
          vhadd.u8        d3,  d2,  d19
          vadd.u8         d3,  d3,  d7                    @ out3, also left for next pass

          @ Write lane 0 of each result back and step the row pointer by 16.
          vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!

          subs            r12, r12, #16                   @ bytes still to do
          bgt             1b

          bx              lr
  endfunc
  /* png_read_filter_row_avg3_neon
   *
   * Reverses the "Average" filter in place for 3-byte pixels.  Each output
   * pixel is the stored pixel plus the halved sum of the pixel to its left
   * (already reconstructed) and the pixel above it.  The pixel left of the
   * first one is taken as zero.
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten; the first store of each pass
   *            carries a :32 alignment hint
   *       r2 = previous row, read only
   * Uses: r0, r12, d0-d3, d5-d7, d17-d19, q10, q11, flags;
   *       r4 and lr are saved on the stack and restored on return
   *
   * Twelve bytes (four pixels) are handled per pass.  Loads fetch 16 bytes
   * and advance by 12; stores write 4 bytes and advance by 3, so every store
   * spills one byte that a later store replaces.  The next chunk of the row
   * is fetched BEFORE the stores of the current pass, which keeps the spilled
   * byte out of the input; the instruction order below must be kept.
   * NOTE(review): the last pass reads and writes past the final group;
   * presumably both buffers are padded for this - confirm.
   */
  func png_read_filter_row_avg3_neon, export=1
          push            {r4,lr}
          ldr             r12, [r0, #4]                   @ r12 = rowbytes
          vmov.i8         d3,  #0                         @ left neighbour of pixel 0
          mov             r0,  r1                         @ r0 reads ahead, r1 writes
          mov             r4,  #3                         @ store step: one pixel
          mov             lr,  #12                        @ load step: four pixels
          vld1.8          {q11}, [r0], lr                 @ prime q11 with row bytes 0..15
  1:
          vld1.8          {q10}, [r2], lr                 @ matching bytes of previous row
          vext.8          d5,  d22, d23, #3               @ row pixel 1
          vhadd.u8        d0,  d3,  d20                   @ (left + above0) / 2
          vext.8          d17, d20, d21, #3               @ above pixel 1
          vadd.u8         d0,  d0,  d22                   @ out0
          vext.8          d6,  d22, d23, #6               @ row pixel 2
          vhadd.u8        d1,  d0,  d17
          vext.8          d18, d20, d21, #6               @ above pixel 2
          vadd.u8         d1,  d1,  d5                    @ out1
          vext.8          d7,  d23, d23, #1               @ row pixel 3
          vld1.8          {q11}, [r0], lr                 @ next chunk, ahead of the stores
          vst1.32         {d0[0]}, [r1,:32], r4           @ 4 bytes out, advance 3
          vhadd.u8        d2,  d1,  d18
          vst1.32         {d1[0]}, [r1], r4
          vext.8          d19, d21, d21, #1               @ above pixel 3
          vadd.u8         d2,  d2,  d6                    @ out2
          vhadd.u8        d3,  d2,  d19
          vst1.32         {d2[0]}, [r1], r4
          vadd.u8         d3,  d3,  d7                    @ out3, also left for next pass
          vst1.32         {d3[0]}, [r1], r4

          subs            r12, r12, #12                   @ bytes still to do
          bgt             1b

          pop             {r4,pc}
  endfunc
  /* paeth rx, ra, rb, rc
   *
   * Per byte lane, writes to \rx the Paeth predictor of
   *     \ra = a   \rb = b   \rc = c
   * (the callers in this file pass left, above and upper-left for a, b, c).
   *
   * With p = a + b - c the three distances simplify to
   *     pa = |p - a| = |b - c|
   *     pb = |p - b| = |a - c|
   *     pc = |p - c| = |a + b - 2c|
   * and the result is a if pa <= pb and pa <= pc, else b if pb <= pc, else c.
   * The distances are formed in 16-bit lanes so that nothing overflows.
   *
   * Overwrites q12-q15 (d24-d31), so no argument may live there.  \rx is
   * written before \ra, \rb and \rc are read for the last time and therefore
   * has to be a different register from all three.
   */
  .macro paeth rx, ra, rb, rc
          vaddl.u8        q12, \ra, \rb                   @ a + b
          vaddl.u8        q15, \rc, \rc                   @ 2*c
          vabdl.u8        q13, \rb, \rc                   @ pa
          vabdl.u8        q14, \ra, \rc                   @ pb
          vabd.u16        q15, q12, q15                   @ pc
          vcle.u16        q12, q13, q14                   @ pa <= pb
          vcle.u16        q13, q13, q15                   @ pa <= pc
          vcle.u16        q14, q14, q15                   @ pb <= pc
          vand            q12, q12, q13                   @ pa <= pb && pa <= pc
          vmovn.u16       d28, q14                        @ byte mask: pb <= pc
          vmovn.u16       \rx, q12                        @ byte mask: a wins
          vbsl            d28, \rb, \rc                   @ d28 = mask ? b : c
          vbsl            \rx, \ra, d28                   @ rx  = mask ? a : d28
  .endm
  /* png_read_filter_row_paeth4_neon
   *
   * Reverses the "Paeth" filter in place for 4-byte pixels.  Each output
   * pixel is the stored pixel plus (modulo 256) the predictor that the paeth
   * macro picks from the pixel to the left, the pixel above and the pixel
   * above-left.  Left and above-left of the first pixel are taken as zero.
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten
   *       r2 = previous row, read only
   *       Both buffers are accessed with a :128 hint and so have to be
   *       16-byte aligned.
   * Uses: r12, d0-d7, d16-d20, q12-q15 (through the paeth macro), flags
   *
   * Sixteen bytes (four pixels) are handled per pass.  The body runs before
   * the count is tested, so a final partial group is still processed as a
   * full 16 bytes.
   */
  func png_read_filter_row_paeth4_neon, export=1
          ldr             r12, [r0, #4]                   @ r12 = rowbytes
          vmov.i8         d3,  #0                         @ left of pixel 0
          vmov.i8         d20, #0                         @ above-left of pixel 0
  1:
          @ Four pixels of each row, one per register, replicated to both lanes.
          vld4.32         {d4[],d5[],d6[],d7[]}, [r1,:128]
          vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]!

          paeth           d0,  d3,  d16, d20
          vadd.u8         d0,  d0,  d4                    @ out0
          paeth           d1,  d0,  d17, d16
          vadd.u8         d1,  d1,  d5                    @ out1
          paeth           d2,  d1,  d18, d17
          vadd.u8         d2,  d2,  d6                    @ out2
          paeth           d3,  d2,  d19, d18
          vmov            d20, d19                        @ above-left for the next pass
          vadd.u8         d3,  d3,  d7                    @ out3, also left for next pass

          @ Write lane 0 of each result back and step the row pointer by 16.
          vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!

          subs            r12, r12, #16                   @ bytes still to do
          bgt             1b

          bx              lr
  endfunc
  /* png_read_filter_row_paeth3_neon
   *
   * Reverses the "Paeth" filter in place for 3-byte pixels.  Each output
   * pixel is the stored pixel plus (modulo 256) the predictor that the paeth
   * macro picks from the pixel to the left, the pixel above and the pixel
   * above-left.  Left and above-left of the first pixel are taken as zero.
   *
   * In:   r0 = pointer to a structure whose word at offset 4 is rowbytes
   *       r1 = row buffer, read and rewritten; the first store of each pass
   *            carries a :32 alignment hint
   *       r2 = previous row, read only
   * Uses: r0, r12, d0-d7, d17-d19, q10, q11, q12-q15 (through the paeth
   *       macro), flags; r4 and lr are saved on the stack and restored on
   *       return
   *
   * Twelve bytes (four pixels) are handled per pass.  Loads fetch 16 bytes
   * and advance by 12; stores write 4 bytes and advance by 3, so every store
   * spills one byte that a later store replaces.  The next chunk of the row
   * is fetched BEFORE the final store of the current pass, the only store
   * whose spilled byte lands in that chunk; the instruction order below must
   * be kept.
   * NOTE(review): the last pass reads and writes past the final group;
   * presumably both buffers are padded for this - confirm.
   */
  func png_read_filter_row_paeth3_neon, export=1
          push            {r4,lr}
          ldr             r12, [r0, #4]                   @ r12 = rowbytes
          vmov.i8         d3,  #0                         @ left of pixel 0
          vmov.i8         d4,  #0                         @ above-left of pixel 0
          mov             r0,  r1                         @ r0 reads ahead, r1 writes
          mov             r4,  #3                         @ store step: one pixel
          mov             lr,  #12                        @ load step: four pixels
          vld1.8          {q11}, [r0], lr                 @ prime q11 with row bytes 0..15
  1:
          vld1.8          {q10}, [r2], lr                 @ matching bytes of previous row
          paeth           d0,  d3,  d20, d4
          vext.8          d5,  d22, d23, #3               @ row pixel 1
          vadd.u8         d0,  d0,  d22                   @ out0
          vext.8          d17, d20, d21, #3               @ above pixel 1
          paeth           d1,  d0,  d17, d20
          vst1.32         {d0[0]}, [r1,:32], r4           @ 4 bytes out, advance 3
          vext.8          d6,  d22, d23, #6               @ row pixel 2
          vadd.u8         d1,  d1,  d5                    @ out1
          vext.8          d18, d20, d21, #6               @ above pixel 2
          paeth           d2,  d1,  d18, d17
          vext.8          d7,  d23, d23, #1               @ row pixel 3
          vld1.8          {q11}, [r0], lr                 @ next chunk, ahead of the last store
          vst1.32         {d1[0]}, [r1], r4
          vadd.u8         d2,  d2,  d6                    @ out2
          vext.8          d19, d21, d21, #1               @ above pixel 3
          paeth           d3,  d2,  d19, d18
          vst1.32         {d2[0]}, [r1], r4
          vmov            d4,  d19                        @ above-left for the next pass
          vadd.u8         d3,  d3,  d7                    @ out3, also left for next pass
          vst1.32         {d3[0]}, [r1], r4

          subs            r12, r12, #12                   @ bytes still to do
          bgt             1b

          pop             {r4,pc}
  endfunc
  225. #endif /* PNG_ARM_NEON_OPT > 0 */
  226. #endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 (assembler) */
  227. #endif /* READ */