gfxAlphaRecoverySSE2.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
  2. * This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. #include "gfxAlphaRecovery.h"
  6. #include "gfxImageSurface.h"
  7. #include <emmintrin.h>
  8. // This file should only be compiled on x86 and x64 systems. Additionally,
  9. // you'll need to compile it with -msse2 if you're using GCC on x86.
  10. #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
  11. __declspec(align(16)) static uint32_t greenMaski[] =
  12. { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
  13. __declspec(align(16)) static uint32_t alphaMaski[] =
  14. { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
  15. #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
  16. static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
  17. { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
  18. static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
  19. { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
  20. #elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
  21. #pragma align 16 (greenMaski, alphaMaski)
  22. static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
  23. static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
  24. #endif
  25. bool
  26. gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
  27. const gfxImageSurface* whiteSurf)
  28. {
  29. mozilla::gfx::IntSize size = blackSurf->GetSize();
  30. if (size != whiteSurf->GetSize() ||
  31. (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
  32. blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
  33. (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
  34. whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
  35. return false;
  36. blackSurf->Flush();
  37. whiteSurf->Flush();
  38. unsigned char* blackData = blackSurf->Data();
  39. unsigned char* whiteData = whiteSurf->Data();
  40. if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
  41. (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
  42. // Cannot keep these in alignment.
  43. return false;
  44. }
  45. __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
  46. __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
  47. for (int32_t i = 0; i < size.height; ++i) {
  48. int32_t j = 0;
  49. // Loop single pixels until at 4 byte alignment.
  50. while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
  51. *((uint32_t*)blackData) =
  52. RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
  53. *reinterpret_cast<uint32_t*>(whiteData));
  54. blackData += 4;
  55. whiteData += 4;
  56. j++;
  57. }
  58. // This extra loop allows the compiler to do some more clever registry
  59. // management and makes it about 5% faster than with only the 4 pixel
  60. // at a time loop.
  61. for (; j < size.width - 8; j += 8) {
  62. __m128i black1 = _mm_load_si128((__m128i*)blackData);
  63. __m128i white1 = _mm_load_si128((__m128i*)whiteData);
  64. __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
  65. __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
  66. // Execute the same instructions as described in RecoverPixel, only
  67. // using an SSE2 packed saturated subtract.
  68. white1 = _mm_subs_epu8(white1, black1);
  69. white2 = _mm_subs_epu8(white2, black2);
  70. white1 = _mm_subs_epu8(greenMask, white1);
  71. white2 = _mm_subs_epu8(greenMask, white2);
  72. // Producing the final black pixel in an XMM register and storing
  73. // that is actually faster than doing a masked store since that
  74. // does an unaligned storage. We have the black pixel in a register
  75. // anyway.
  76. black1 = _mm_andnot_si128(alphaMask, black1);
  77. black2 = _mm_andnot_si128(alphaMask, black2);
  78. white1 = _mm_slli_si128(white1, 2);
  79. white2 = _mm_slli_si128(white2, 2);
  80. white1 = _mm_and_si128(alphaMask, white1);
  81. white2 = _mm_and_si128(alphaMask, white2);
  82. black1 = _mm_or_si128(white1, black1);
  83. black2 = _mm_or_si128(white2, black2);
  84. _mm_store_si128((__m128i*)blackData, black1);
  85. _mm_store_si128((__m128i*)(blackData + 16), black2);
  86. blackData += 32;
  87. whiteData += 32;
  88. }
  89. for (; j < size.width - 4; j += 4) {
  90. __m128i black = _mm_load_si128((__m128i*)blackData);
  91. __m128i white = _mm_load_si128((__m128i*)whiteData);
  92. white = _mm_subs_epu8(white, black);
  93. white = _mm_subs_epu8(greenMask, white);
  94. black = _mm_andnot_si128(alphaMask, black);
  95. white = _mm_slli_si128(white, 2);
  96. white = _mm_and_si128(alphaMask, white);
  97. black = _mm_or_si128(white, black);
  98. _mm_store_si128((__m128i*)blackData, black);
  99. blackData += 16;
  100. whiteData += 16;
  101. }
  102. // Loop single pixels until we're done.
  103. while (j < size.width) {
  104. *((uint32_t*)blackData) =
  105. RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
  106. *reinterpret_cast<uint32_t*>(whiteData));
  107. blackData += 4;
  108. whiteData += 4;
  109. j++;
  110. }
  111. blackData += blackSurf->Stride() - j * 4;
  112. whiteData += whiteSurf->Stride() - j * 4;
  113. }
  114. blackSurf->MarkDirty();
  115. return true;
  116. }
  117. static int32_t
  118. ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
  119. {
  120. return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
  121. }
  122. /*static*/ mozilla::gfx::IntRect
  123. gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect,
  124. gfxImageSurface* aSurface)
  125. {
  126. NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
  127. "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
  128. static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
  129. static const int32_t bpp = 4;
  130. static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
  131. //
  132. // We're going to create a subimage of the surface with size
  133. // <sw,sh> for alpha recovery, and want a SIMD fast-path. The
  134. // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
  135. // properly aligned for SIMD. So we want to find a rect <x',y',
  136. // w',h'> that's a superset of what needs to be redrawn but is
  137. // properly aligned. Proper alignment is
  138. //
  139. // BPP * (x' + y' * sw) \cong 0 (mod ALIGN)
  140. // BPP * w' \cong BPP * sw (mod ALIGN)
  141. //
  142. // (We assume the pixel at surface <0,0> is already ALIGN'd.)
  143. // That rect (obviously) has to fit within the surface bounds, and
  144. // we should also minimize the extra pixels redrawn only for
  145. // alignment's sake. So we also want
  146. //
  147. // minimize <x',y', w',h'>
  148. // 0 <= x' <= x
  149. // 0 <= y' <= y
  150. // w <= w' <= sw
  151. // h <= h' <= sh
  152. //
  153. // This is a messy integer non-linear programming problem, except
  154. // ... we can assume that ALIGN/BPP is a very small constant. So,
  155. // brute force is viable. The algorithm below will find a
  156. // solution if one exists, but isn't guaranteed to find the
  157. // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
  158. // most 64 iterations below). In what's likely the common case,
  159. // an already-aligned rectangle, it only needs 1 iteration.
  160. //
  161. // Is this alignment worth doing? Recovering alpha will take work
  162. // proportional to w*h (assuming alpha recovery computation isn't
  163. // memory bound). This analysis can lead to O(w+h) extra work
  164. // (with small constants). In exchange, we expect to shave off a
  165. // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
  166. // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
  167. // only really care about the w*h >> w+h case anyway; others
  168. // should be fast enough even with the overhead. (Unless the cost
  169. // of repainting the expanded rect is high, but in that case
  170. // SIMD-ized alpha recovery won't make a difference so this code
  171. // shouldn't be called.)
  172. //
  173. mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
  174. const int32_t stride = bpp * surfaceSize.width;
  175. if (stride != aSurface->Stride()) {
  176. NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
  177. return aRect;
  178. }
  179. const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
  180. const int32_t r = x + w;
  181. const int32_t sw = surfaceSize.width;
  182. const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
  183. // The outer two loops below keep the rightmost (|r| above) and
  184. // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
  185. // return only a superset of the original rect. These loops
  186. // search for an aligned top-left pixel by trying to expand <x,y>
  187. // left and up by <dx,dy> pixels, respectively.
  188. //
  189. // Then if a properly-aligned top-left pixel is found, the
  190. // innermost loop tries to find an aligned stride by moving the
  191. // rightmost pixel rightward by dr.
  192. int32_t dx, dy, dr;
  193. for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
  194. for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
  195. if (0 != ByteAlignment(kByteAlignLog2,
  196. bpp * (x - dx), y - dy, stride)) {
  197. continue;
  198. }
  199. for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
  200. if (strideAlign == ByteAlignment(kByteAlignLog2,
  201. bpp * (w + dr + dx))) {
  202. goto FOUND_SOLUTION;
  203. }
  204. }
  205. }
  206. }
  207. // Didn't find a solution.
  208. return aRect;
  209. FOUND_SOLUTION:
  210. mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
  211. MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
  212. "'Solution' extends outside surface bounds!");
  213. return solution;
  214. }