vpx_convolve.c
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
  10. #include <assert.h>
  11. #include <string.h>
  12. #include "./vpx_config.h"
  13. #include "./vpx_dsp_rtcd.h"
  14. #include "vpx/vpx_integer.h"
  15. #include "vpx_dsp/vpx_convolve.h"
  16. #include "vpx_dsp/vpx_dsp_common.h"
  17. #include "vpx_dsp/vpx_filter.h"
  18. #include "vpx_ports/mem.h"
  19. static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
  20. uint8_t *dst, ptrdiff_t dst_stride,
  21. const InterpKernel *x_filters,
  22. int x0_q4, int x_step_q4, int w, int h) {
  23. int x, y;
  24. src -= SUBPEL_TAPS / 2 - 1;
  25. for (y = 0; y < h; ++y) {
  26. int x_q4 = x0_q4;
  27. for (x = 0; x < w; ++x) {
  28. const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
  29. const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
  30. int k, sum = 0;
  31. for (k = 0; k < SUBPEL_TAPS; ++k)
  32. sum += src_x[k] * x_filter[k];
  33. dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
  34. x_q4 += x_step_q4;
  35. }
  36. src += src_stride;
  37. dst += dst_stride;
  38. }
  39. }
  40. static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
  41. uint8_t *dst, ptrdiff_t dst_stride,
  42. const InterpKernel *x_filters,
  43. int x0_q4, int x_step_q4, int w, int h) {
  44. int x, y;
  45. src -= SUBPEL_TAPS / 2 - 1;
  46. for (y = 0; y < h; ++y) {
  47. int x_q4 = x0_q4;
  48. for (x = 0; x < w; ++x) {
  49. const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
  50. const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
  51. int k, sum = 0;
  52. for (k = 0; k < SUBPEL_TAPS; ++k)
  53. sum += src_x[k] * x_filter[k];
  54. dst[x] = ROUND_POWER_OF_TWO(dst[x] +
  55. clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
  56. x_q4 += x_step_q4;
  57. }
  58. src += src_stride;
  59. dst += dst_stride;
  60. }
  61. }
  62. static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
  63. uint8_t *dst, ptrdiff_t dst_stride,
  64. const InterpKernel *y_filters,
  65. int y0_q4, int y_step_q4, int w, int h) {
  66. int x, y;
  67. src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  68. for (x = 0; x < w; ++x) {
  69. int y_q4 = y0_q4;
  70. for (y = 0; y < h; ++y) {
  71. const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
  72. const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
  73. int k, sum = 0;
  74. for (k = 0; k < SUBPEL_TAPS; ++k)
  75. sum += src_y[k * src_stride] * y_filter[k];
  76. dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
  77. y_q4 += y_step_q4;
  78. }
  79. ++src;
  80. ++dst;
  81. }
  82. }
  83. static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
  84. uint8_t *dst, ptrdiff_t dst_stride,
  85. const InterpKernel *y_filters,
  86. int y0_q4, int y_step_q4, int w, int h) {
  87. int x, y;
  88. src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  89. for (x = 0; x < w; ++x) {
  90. int y_q4 = y0_q4;
  91. for (y = 0; y < h; ++y) {
  92. const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
  93. const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
  94. int k, sum = 0;
  95. for (k = 0; k < SUBPEL_TAPS; ++k)
  96. sum += src_y[k * src_stride] * y_filter[k];
  97. dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
  98. clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
  99. y_q4 += y_step_q4;
  100. }
  101. ++src;
  102. ++dst;
  103. }
  104. }
  105. static void convolve(const uint8_t *src, ptrdiff_t src_stride,
  106. uint8_t *dst, ptrdiff_t dst_stride,
  107. const InterpKernel *const x_filters,
  108. int x0_q4, int x_step_q4,
  109. const InterpKernel *const y_filters,
  110. int y0_q4, int y_step_q4,
  111. int w, int h) {
  112. // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  113. // 2d filtering proceeds in 2 steps:
  114. // (1) Interpolate horizontally into an intermediate buffer, temp.
  115. // (2) Interpolate temp vertically to derive the sub-pixel result.
  116. // Deriving the maximum number of rows in the temp buffer (135):
  117. // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  118. // --Largest block size is 64x64 pixels.
  119. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  120. // original frame (in 1/16th pixel units).
  121. // --Must round-up because block may be located at sub-pixel position.
  122. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  123. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  124. uint8_t temp[135 * 64];
  125. int intermediate_height =
  126. (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  127. assert(w <= 64);
  128. assert(h <= 64);
  129. assert(y_step_q4 <= 32);
  130. assert(x_step_q4 <= 32);
  131. convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
  132. x_filters, x0_q4, x_step_q4, w, intermediate_height);
  133. convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
  134. y_filters, y0_q4, y_step_q4, w, h);
  135. }
  136. static const InterpKernel *get_filter_base(const int16_t *filter) {
  137. // NOTE: This assumes that the filter table is 256-byte aligned.
  138. // TODO(agrange) Modify to make independent of table alignment.
  139. return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
  140. }
  141. static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  142. return (int)((const InterpKernel *)(intptr_t)f - base);
  143. }
  144. void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  145. uint8_t *dst, ptrdiff_t dst_stride,
  146. const int16_t *filter_x, int x_step_q4,
  147. const int16_t *filter_y, int y_step_q4,
  148. int w, int h) {
  149. const InterpKernel *const filters_x = get_filter_base(filter_x);
  150. const int x0_q4 = get_filter_offset(filter_x, filters_x);
  151. (void)filter_y;
  152. (void)y_step_q4;
  153. convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
  154. x0_q4, x_step_q4, w, h);
  155. }
  156. void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  157. uint8_t *dst, ptrdiff_t dst_stride,
  158. const int16_t *filter_x, int x_step_q4,
  159. const int16_t *filter_y, int y_step_q4,
  160. int w, int h) {
  161. const InterpKernel *const filters_x = get_filter_base(filter_x);
  162. const int x0_q4 = get_filter_offset(filter_x, filters_x);
  163. (void)filter_y;
  164. (void)y_step_q4;
  165. convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
  166. x0_q4, x_step_q4, w, h);
  167. }
  168. void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  169. uint8_t *dst, ptrdiff_t dst_stride,
  170. const int16_t *filter_x, int x_step_q4,
  171. const int16_t *filter_y, int y_step_q4,
  172. int w, int h) {
  173. const InterpKernel *const filters_y = get_filter_base(filter_y);
  174. const int y0_q4 = get_filter_offset(filter_y, filters_y);
  175. (void)filter_x;
  176. (void)x_step_q4;
  177. convolve_vert(src, src_stride, dst, dst_stride, filters_y,
  178. y0_q4, y_step_q4, w, h);
  179. }
  180. void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  181. uint8_t *dst, ptrdiff_t dst_stride,
  182. const int16_t *filter_x, int x_step_q4,
  183. const int16_t *filter_y, int y_step_q4,
  184. int w, int h) {
  185. const InterpKernel *const filters_y = get_filter_base(filter_y);
  186. const int y0_q4 = get_filter_offset(filter_y, filters_y);
  187. (void)filter_x;
  188. (void)x_step_q4;
  189. convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
  190. y0_q4, y_step_q4, w, h);
  191. }
  192. void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
  193. uint8_t *dst, ptrdiff_t dst_stride,
  194. const int16_t *filter_x, int x_step_q4,
  195. const int16_t *filter_y, int y_step_q4,
  196. int w, int h) {
  197. const InterpKernel *const filters_x = get_filter_base(filter_x);
  198. const int x0_q4 = get_filter_offset(filter_x, filters_x);
  199. const InterpKernel *const filters_y = get_filter_base(filter_y);
  200. const int y0_q4 = get_filter_offset(filter_y, filters_y);
  201. convolve(src, src_stride, dst, dst_stride,
  202. filters_x, x0_q4, x_step_q4,
  203. filters_y, y0_q4, y_step_q4, w, h);
  204. }
  205. void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  206. uint8_t *dst, ptrdiff_t dst_stride,
  207. const int16_t *filter_x, int x_step_q4,
  208. const int16_t *filter_y, int y_step_q4,
  209. int w, int h) {
  210. /* Fixed size intermediate buffer places limits on parameters. */
  211. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  212. assert(w <= 64);
  213. assert(h <= 64);
  214. vpx_convolve8_c(src, src_stride, temp, 64,
  215. filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  216. vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
  217. }
  218. void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
  219. uint8_t *dst, ptrdiff_t dst_stride,
  220. const int16_t *filter_x, int filter_x_stride,
  221. const int16_t *filter_y, int filter_y_stride,
  222. int w, int h) {
  223. int r;
  224. (void)filter_x; (void)filter_x_stride;
  225. (void)filter_y; (void)filter_y_stride;
  226. for (r = h; r > 0; --r) {
  227. memcpy(dst, src, w);
  228. src += src_stride;
  229. dst += dst_stride;
  230. }
  231. }
  232. void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  233. uint8_t *dst, ptrdiff_t dst_stride,
  234. const int16_t *filter_x, int filter_x_stride,
  235. const int16_t *filter_y, int filter_y_stride,
  236. int w, int h) {
  237. int x, y;
  238. (void)filter_x; (void)filter_x_stride;
  239. (void)filter_y; (void)filter_y_stride;
  240. for (y = 0; y < h; ++y) {
  241. for (x = 0; x < w; ++x)
  242. dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
  243. src += src_stride;
  244. dst += dst_stride;
  245. }
  246. }
  247. void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  248. uint8_t *dst, ptrdiff_t dst_stride,
  249. const int16_t *filter_x, int x_step_q4,
  250. const int16_t *filter_y, int y_step_q4,
  251. int w, int h) {
  252. vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
  253. filter_y, y_step_q4, w, h);
  254. }
  255. void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  256. uint8_t *dst, ptrdiff_t dst_stride,
  257. const int16_t *filter_x, int x_step_q4,
  258. const int16_t *filter_y, int y_step_q4,
  259. int w, int h) {
  260. vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
  261. filter_y, y_step_q4, w, h);
  262. }
  263. void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
  264. uint8_t *dst, ptrdiff_t dst_stride,
  265. const int16_t *filter_x, int x_step_q4,
  266. const int16_t *filter_y, int y_step_q4,
  267. int w, int h) {
  268. vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
  269. filter_y, y_step_q4, w, h);
  270. }
  271. void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  272. uint8_t *dst, ptrdiff_t dst_stride,
  273. const int16_t *filter_x, int x_step_q4,
  274. const int16_t *filter_y, int y_step_q4,
  275. int w, int h) {
  276. vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
  277. x_step_q4, filter_y, y_step_q4, w, h);
  278. }
  279. void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  280. uint8_t *dst, ptrdiff_t dst_stride,
  281. const int16_t *filter_x, int x_step_q4,
  282. const int16_t *filter_y, int y_step_q4,
  283. int w, int h) {
  284. vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
  285. x_step_q4, filter_y, y_step_q4, w, h);
  286. }
  287. void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
  288. uint8_t *dst, ptrdiff_t dst_stride,
  289. const int16_t *filter_x, int x_step_q4,
  290. const int16_t *filter_y, int y_step_q4,
  291. int w, int h) {
  292. vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
  293. filter_y, y_step_q4, w, h);
  294. }
  295. #if CONFIG_VP9_HIGHBITDEPTH
  296. static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
  297. uint8_t *dst8, ptrdiff_t dst_stride,
  298. const InterpKernel *x_filters,
  299. int x0_q4, int x_step_q4,
  300. int w, int h, int bd) {
  301. int x, y;
  302. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  303. uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  304. src -= SUBPEL_TAPS / 2 - 1;
  305. for (y = 0; y < h; ++y) {
  306. int x_q4 = x0_q4;
  307. for (x = 0; x < w; ++x) {
  308. const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
  309. const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
  310. int k, sum = 0;
  311. for (k = 0; k < SUBPEL_TAPS; ++k)
  312. sum += src_x[k] * x_filter[k];
  313. dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
  314. x_q4 += x_step_q4;
  315. }
  316. src += src_stride;
  317. dst += dst_stride;
  318. }
  319. }
  320. static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
  321. uint8_t *dst8, ptrdiff_t dst_stride,
  322. const InterpKernel *x_filters,
  323. int x0_q4, int x_step_q4,
  324. int w, int h, int bd) {
  325. int x, y;
  326. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  327. uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  328. src -= SUBPEL_TAPS / 2 - 1;
  329. for (y = 0; y < h; ++y) {
  330. int x_q4 = x0_q4;
  331. for (x = 0; x < w; ++x) {
  332. const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
  333. const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
  334. int k, sum = 0;
  335. for (k = 0; k < SUBPEL_TAPS; ++k)
  336. sum += src_x[k] * x_filter[k];
  337. dst[x] = ROUND_POWER_OF_TWO(dst[x] +
  338. clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
  339. x_q4 += x_step_q4;
  340. }
  341. src += src_stride;
  342. dst += dst_stride;
  343. }
  344. }
  345. static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
  346. uint8_t *dst8, ptrdiff_t dst_stride,
  347. const InterpKernel *y_filters,
  348. int y0_q4, int y_step_q4, int w, int h,
  349. int bd) {
  350. int x, y;
  351. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  352. uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  353. src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  354. for (x = 0; x < w; ++x) {
  355. int y_q4 = y0_q4;
  356. for (y = 0; y < h; ++y) {
  357. const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
  358. const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
  359. int k, sum = 0;
  360. for (k = 0; k < SUBPEL_TAPS; ++k)
  361. sum += src_y[k * src_stride] * y_filter[k];
  362. dst[y * dst_stride] = clip_pixel_highbd(
  363. ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
  364. y_q4 += y_step_q4;
  365. }
  366. ++src;
  367. ++dst;
  368. }
  369. }
  370. static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
  371. uint8_t *dst8, ptrdiff_t dst_stride,
  372. const InterpKernel *y_filters,
  373. int y0_q4, int y_step_q4, int w, int h,
  374. int bd) {
  375. int x, y;
  376. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  377. uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  378. src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  379. for (x = 0; x < w; ++x) {
  380. int y_q4 = y0_q4;
  381. for (y = 0; y < h; ++y) {
  382. const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
  383. const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
  384. int k, sum = 0;
  385. for (k = 0; k < SUBPEL_TAPS; ++k)
  386. sum += src_y[k * src_stride] * y_filter[k];
  387. dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
  388. clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
  389. y_q4 += y_step_q4;
  390. }
  391. ++src;
  392. ++dst;
  393. }
  394. }
  395. static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
  396. uint8_t *dst, ptrdiff_t dst_stride,
  397. const InterpKernel *const x_filters,
  398. int x0_q4, int x_step_q4,
  399. const InterpKernel *const y_filters,
  400. int y0_q4, int y_step_q4,
  401. int w, int h, int bd) {
  402. // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  403. // 2d filtering proceeds in 2 steps:
  404. // (1) Interpolate horizontally into an intermediate buffer, temp.
  405. // (2) Interpolate temp vertically to derive the sub-pixel result.
  406. // Deriving the maximum number of rows in the temp buffer (135):
  407. // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  408. // --Largest block size is 64x64 pixels.
  409. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  410. // original frame (in 1/16th pixel units).
  411. // --Must round-up because block may be located at sub-pixel position.
  412. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  413. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  414. uint16_t temp[64 * 135];
  415. int intermediate_height =
  416. (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  417. assert(w <= 64);
  418. assert(h <= 64);
  419. assert(y_step_q4 <= 32);
  420. assert(x_step_q4 <= 32);
  421. highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
  422. src_stride, CONVERT_TO_BYTEPTR(temp), 64,
  423. x_filters, x0_q4, x_step_q4, w,
  424. intermediate_height, bd);
  425. highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
  426. 64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
  427. w, h, bd);
  428. }
  429. void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  430. uint8_t *dst, ptrdiff_t dst_stride,
  431. const int16_t *filter_x, int x_step_q4,
  432. const int16_t *filter_y, int y_step_q4,
  433. int w, int h, int bd) {
  434. const InterpKernel *const filters_x = get_filter_base(filter_x);
  435. const int x0_q4 = get_filter_offset(filter_x, filters_x);
  436. (void)filter_y;
  437. (void)y_step_q4;
  438. highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
  439. x0_q4, x_step_q4, w, h, bd);
  440. }
  441. void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  442. uint8_t *dst, ptrdiff_t dst_stride,
  443. const int16_t *filter_x, int x_step_q4,
  444. const int16_t *filter_y, int y_step_q4,
  445. int w, int h, int bd) {
  446. const InterpKernel *const filters_x = get_filter_base(filter_x);
  447. const int x0_q4 = get_filter_offset(filter_x, filters_x);
  448. (void)filter_y;
  449. (void)y_step_q4;
  450. highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
  451. x0_q4, x_step_q4, w, h, bd);
  452. }
  453. void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  454. uint8_t *dst, ptrdiff_t dst_stride,
  455. const int16_t *filter_x, int x_step_q4,
  456. const int16_t *filter_y, int y_step_q4,
  457. int w, int h, int bd) {
  458. const InterpKernel *const filters_y = get_filter_base(filter_y);
  459. const int y0_q4 = get_filter_offset(filter_y, filters_y);
  460. (void)filter_x;
  461. (void)x_step_q4;
  462. highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
  463. y0_q4, y_step_q4, w, h, bd);
  464. }
  465. void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  466. uint8_t *dst, ptrdiff_t dst_stride,
  467. const int16_t *filter_x, int x_step_q4,
  468. const int16_t *filter_y, int y_step_q4,
  469. int w, int h, int bd) {
  470. const InterpKernel *const filters_y = get_filter_base(filter_y);
  471. const int y0_q4 = get_filter_offset(filter_y, filters_y);
  472. (void)filter_x;
  473. (void)x_step_q4;
  474. highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
  475. y0_q4, y_step_q4, w, h, bd);
  476. }
  477. void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
  478. uint8_t *dst, ptrdiff_t dst_stride,
  479. const int16_t *filter_x, int x_step_q4,
  480. const int16_t *filter_y, int y_step_q4,
  481. int w, int h, int bd) {
  482. const InterpKernel *const filters_x = get_filter_base(filter_x);
  483. const int x0_q4 = get_filter_offset(filter_x, filters_x);
  484. const InterpKernel *const filters_y = get_filter_base(filter_y);
  485. const int y0_q4 = get_filter_offset(filter_y, filters_y);
  486. highbd_convolve(src, src_stride, dst, dst_stride,
  487. filters_x, x0_q4, x_step_q4,
  488. filters_y, y0_q4, y_step_q4, w, h, bd);
  489. }
  490. void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
  491. uint8_t *dst, ptrdiff_t dst_stride,
  492. const int16_t *filter_x, int x_step_q4,
  493. const int16_t *filter_y, int y_step_q4,
  494. int w, int h, int bd) {
  495. // Fixed size intermediate buffer places limits on parameters.
  496. DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
  497. assert(w <= 64);
  498. assert(h <= 64);
  499. vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
  500. filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
  501. vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
  502. NULL, 0, NULL, 0, w, h, bd);
  503. }
  504. void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
  505. uint8_t *dst8, ptrdiff_t dst_stride,
  506. const int16_t *filter_x, int filter_x_stride,
  507. const int16_t *filter_y, int filter_y_stride,
  508. int w, int h, int bd) {
  509. int r;
  510. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  511. uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  512. (void)filter_x;
  513. (void)filter_y;
  514. (void)filter_x_stride;
  515. (void)filter_y_stride;
  516. (void)bd;
  517. for (r = h; r > 0; --r) {
  518. memcpy(dst, src, w * sizeof(uint16_t));
  519. src += src_stride;
  520. dst += dst_stride;
  521. }
  522. }
  523. void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
  524. uint8_t *dst8, ptrdiff_t dst_stride,
  525. const int16_t *filter_x, int filter_x_stride,
  526. const int16_t *filter_y, int filter_y_stride,
  527. int w, int h, int bd) {
  528. int x, y;
  529. uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  530. uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  531. (void)filter_x;
  532. (void)filter_y;
  533. (void)filter_x_stride;
  534. (void)filter_y_stride;
  535. (void)bd;
  536. for (y = 0; y < h; ++y) {
  537. for (x = 0; x < w; ++x) {
  538. dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
  539. }
  540. src += src_stride;
  541. dst += dst_stride;
  542. }
  543. }
  544. #endif