Scanline.cc 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. #include "Scanline.hh"
  2. #include "PixelOperations.hh"
  3. #include "unreachable.hh"
  4. #include <cassert>
  5. #include <cstddef>
  6. #include <cstring>
  7. #ifdef __SSE2__
  8. #include <emmintrin.h>
  9. #endif
  10. namespace openmsx {
  11. // class Multiply<uint16_t>
  12. Multiply<uint16_t>::Multiply(const PixelOperations<uint16_t>& pixelOps_)
  13. : pixelOps(pixelOps_)
  14. {
  15. factor = 0;
  16. memset(tab, 0, sizeof(tab));
  17. }
  18. void Multiply<uint16_t>::setFactor(unsigned f)
  19. {
  20. if (f == factor) {
  21. return;
  22. }
  23. factor = f;
  24. for (unsigned p = 0; p < 0x10000; ++p) {
  25. tab[p] = ((((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask()) |
  26. ((((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask()) |
  27. ((((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask());
  28. }
  29. }
  30. inline uint16_t Multiply<uint16_t>::multiply(uint16_t p, unsigned f) const
  31. {
  32. unsigned r = (((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask();
  33. unsigned g = (((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask();
  34. unsigned b = (((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask();
  35. return r | g | b;
  36. }
  37. inline uint16_t Multiply<uint16_t>::multiply(uint16_t p) const
  38. {
  39. return tab[p];
  40. }
  41. inline const uint16_t* Multiply<uint16_t>::getTable() const
  42. {
  43. return tab;
  44. }
  45. // class Multiply<uint32_t>
  46. Multiply<uint32_t>::Multiply(const PixelOperations<uint32_t>& /*pixelOps*/)
  47. {
  48. }
  49. void Multiply<uint32_t>::setFactor(unsigned f)
  50. {
  51. factor = f;
  52. }
  53. inline uint32_t Multiply<uint32_t>::multiply(uint32_t p, unsigned f) const
  54. {
  55. return PixelOperations<uint32_t>::multiply(p, f);
  56. }
  57. inline uint32_t Multiply<uint32_t>::multiply(uint32_t p) const
  58. {
  59. return multiply(p, factor);
  60. }
  61. const uint32_t* Multiply<uint32_t>::getTable() const
  62. {
  63. UNREACHABLE; return nullptr;
  64. }
  65. #ifdef __SSE2__
  66. // 32bpp
  // Blend and darken one 16-byte group (four 32bpp pixels).
  //   in1, in2: the two source groups; both must be 16-byte aligned
  //   out:      destination group; must be 16-byte aligned
  //   f:        darkening factor, pre-shifted left by 8 bits and replicated
  //             in all eight 16-bit lanes
  // Each output byte is ((avg(in1, in2) * factor) >> 8), where avg is the
  // rounded-up byte average.
  static inline void drawSSE2_1(
      const char* __restrict in1, const char* __restrict in2,
      char* __restrict out, __m128i f)
  {
      const __m128i line1 = _mm_load_si128(reinterpret_cast<const __m128i*>(in1));
      const __m128i line2 = _mm_load_si128(reinterpret_cast<const __m128i*>(in2));
      const __m128i blended = _mm_avg_epu8(line1, line2);

      // Widen the bytes to 16 bits so the multiplication cannot overflow;
      // taking the high half of (byte * (factor << 8)) gives
      // (byte * factor) >> 8.
      const __m128i zero = _mm_setzero_si128();
      const __m128i darkLo = _mm_mulhi_epu16(_mm_unpacklo_epi8(blended, zero), f);
      const __m128i darkHi = _mm_mulhi_epu16(_mm_unpackhi_epi8(blended, zero), f);

      // Narrow back to bytes and write the result.
      _mm_store_si128(reinterpret_cast<__m128i*>(out),
                      _mm_packus_epi16(darkLo, darkHi));
  }
  82. static inline void drawSSE2(
  83. const uint32_t* __restrict in1_,
  84. const uint32_t* __restrict in2_,
  85. uint32_t* __restrict out_,
  86. unsigned factor,
  87. size_t width,
  88. PixelOperations<uint32_t>& /*dummy*/,
  89. Multiply<uint32_t>& /*dummy*/)
  90. {
  91. width *= sizeof(uint32_t); // in bytes
  92. assert(width >= 64);
  93. assert((reinterpret_cast<uintptr_t>(in1_) % sizeof(__m128i)) == 0);
  94. assert((reinterpret_cast<uintptr_t>(in2_) % sizeof(__m128i)) == 0);
  95. assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
  96. auto* in1 = reinterpret_cast<const char*>(in1_) + width;
  97. auto* in2 = reinterpret_cast<const char*>(in2_) + width;
  98. auto* out = reinterpret_cast< char*>(out_) + width;
  99. __m128i f = _mm_set1_epi16(factor << 8);
  100. ptrdiff_t x = -ptrdiff_t(width);
  101. do {
  102. drawSSE2_1(in1 + x + 0, in2 + x + 0, out + x + 0, f);
  103. drawSSE2_1(in1 + x + 16, in2 + x + 16, out + x + 16, f);
  104. drawSSE2_1(in1 + x + 32, in2 + x + 32, out + x + 32, f);
  105. drawSSE2_1(in1 + x + 48, in2 + x + 48, out + x + 48, f);
  106. x += 64;
  107. } while (x < 0);
  108. }
  109. // 16bpp
  110. static inline void drawSSE2(
  111. const uint16_t* __restrict in1_,
  112. const uint16_t* __restrict in2_,
  113. uint16_t* __restrict out_,
  114. unsigned factor,
  115. size_t width,
  116. PixelOperations<uint16_t>& pixelOps,
  117. Multiply<uint16_t>& darkener)
  118. {
  119. width *= sizeof(uint16_t); // in bytes
  120. assert(width >= 16);
  121. auto* in1 = reinterpret_cast<const char*>(in1_) + width;
  122. auto* in2 = reinterpret_cast<const char*>(in2_) + width;
  123. auto* out = reinterpret_cast< char*>(out_) + width;
  124. darkener.setFactor(factor);
  125. const uint16_t* table = darkener.getTable();
  126. __m128i mask = _mm_set1_epi16(pixelOps.getBlendMask());
  127. ptrdiff_t x = -ptrdiff_t(width);
  128. do {
  129. __m128i a = *reinterpret_cast<const __m128i*>(in1 + x);
  130. __m128i b = *reinterpret_cast<const __m128i*>(in2 + x);
  131. __m128i c = _mm_add_epi16(
  132. _mm_and_si128(a, b),
  133. _mm_srli_epi16(
  134. _mm_and_si128(mask, _mm_xor_si128(a, b)),
  135. 1));
  136. *reinterpret_cast<__m128i*>(out + x) = _mm_set_epi16(
  137. table[_mm_extract_epi16(c, 7)],
  138. table[_mm_extract_epi16(c, 6)],
  139. table[_mm_extract_epi16(c, 5)],
  140. table[_mm_extract_epi16(c, 4)],
  141. table[_mm_extract_epi16(c, 3)],
  142. table[_mm_extract_epi16(c, 2)],
  143. table[_mm_extract_epi16(c, 1)],
  144. table[_mm_extract_epi16(c, 0)]);
  145. // An alternative for the above statement is this block (this
  146. // is close to what we has in our old MMX routine). On gcc this
  147. // generates significantly shorter (25%) but also significantly
  148. // slower (30%) code. On clang both alternatives generate
  149. // identical code, comparable in size to the fast gcc version
  150. // (but still a bit faster).
  151. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 0)], 0);
  152. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 1)], 1);
  153. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 2)], 2);
  154. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 3)], 3);
  155. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 4)], 4);
  156. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 5)], 5);
  157. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 6)], 6);
  158. //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 7)], 7);
  159. //*reinterpret_cast<__m128i*>(out + x) = c;
  160. x += 16;
  161. } while (x < 0);
  162. }
  163. #endif
  164. // class Scanline
  165. template <class Pixel>
  166. Scanline<Pixel>::Scanline(const PixelOperations<Pixel>& pixelOps_)
  167. : darkener(pixelOps_)
  168. , pixelOps(pixelOps_)
  169. {
  170. }
  171. template <class Pixel>
  172. void Scanline<Pixel>::draw(
  173. const Pixel* __restrict src1, const Pixel* __restrict src2,
  174. Pixel* __restrict dst, unsigned factor, size_t width)
  175. {
  176. #ifdef __SSE2__
  177. drawSSE2(src1, src2, dst, factor, width, pixelOps, darkener);
  178. #else
  179. // non-SSE2 routine, both 16bpp and 32bpp
  180. darkener.setFactor(factor);
  181. for (unsigned x = 0; x < width; ++x) {
  182. dst[x] = darkener.multiply(
  183. pixelOps.template blend<1, 1>(src1[x], src2[x]));
  184. }
  185. #endif
  186. }
  187. template <class Pixel>
  188. Pixel Scanline<Pixel>::darken(Pixel p, unsigned factor)
  189. {
  190. return darkener.multiply(p, factor);
  191. }
  192. template <class Pixel>
  193. Pixel Scanline<Pixel>::darken(Pixel p1, Pixel p2, unsigned factor)
  194. {
  195. return darkener.multiply(pixelOps.template blend<1, 1>(p1, p2), factor);
  196. }
  // Explicit instantiations: the member templates above are defined in
  // this source file, so every pixel type that is used must be
  // instantiated here.
  // NOTE(review): HAVE_16BPP / HAVE_32BPP are presumably provided by the
  // build configuration -- confirm where they are defined.
  #if HAVE_16BPP
  template class Scanline<uint16_t>;
  #endif
  #if HAVE_32BPP
  template class Scanline<uint32_t>;
  #endif
  204. } // namespace openmsx