// Simple2xScaler.cc
  1. #include "Simple2xScaler.hh"
  2. #include "SuperImposedVideoFrame.hh"
  3. #include "LineScalers.hh"
  4. #include "RawFrame.hh"
  5. #include "ScalerOutput.hh"
  6. #include "RenderSettings.hh"
  7. #include "unreachable.hh"
  8. #include "vla.hh"
  9. #include <cassert>
  10. #include <cstddef>
  11. #include <cstdint>
  12. #ifdef __SSE2__
  13. #include <emmintrin.h>
  14. #endif
  15. namespace openmsx {
  16. // class Simple2xScaler
// Constructs the scaler.
//   pixelOps_       pixel-format helper; forwarded to the Scaler2 base and
//                   used to initialize the 'pixelOps' member.
//   renderSettings  initializes 'settings', which the scale routines query
//                   for the blur and scanline factors.
// Note: mult1..mult3 and scanline are built from the 'pixelOps' MEMBER (not
// the constructor parameter), so this presumably relies on 'pixelOps' being
// declared before them in the class -- TODO confirm against the header.
template <class Pixel>
Simple2xScaler<Pixel>::Simple2xScaler(
        const PixelOperations<Pixel>& pixelOps_,
        RenderSettings& renderSettings)
    : Scaler2<Pixel>(pixelOps_)
    , settings(renderSettings)
    , pixelOps(pixelOps_)
    , mult1(pixelOps)
    , mult2(pixelOps)
    , mult3(pixelOps)
    , scanline(pixelOps)
{
}
  30. template <class Pixel>
  31. void Simple2xScaler<Pixel>::scaleBlank1to2(
  32. FrameSource& src, unsigned srcStartY, unsigned srcEndY,
  33. ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
  34. {
  35. int scanlineFactor = settings.getScanlineFactor();
  36. unsigned dstHeight = dst.getHeight();
  37. unsigned stopDstY = (dstEndY == dstHeight)
  38. ? dstEndY : dstEndY - 2;
  39. unsigned srcY = srcStartY, dstY = dstStartY;
  40. for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
  41. auto color0 = src.getLineColor<Pixel>(srcY);
  42. dst.fillLine(dstY + 0, color0);
  43. Pixel color1 = scanline.darken(color0, scanlineFactor);
  44. dst.fillLine(dstY + 1, color1);
  45. }
  46. if (dstY != dstHeight) {
  47. unsigned nextLineWidth = src.getLineWidth(srcY + 1);
  48. assert(src.getLineWidth(srcY) == 1);
  49. assert(nextLineWidth != 1);
  50. this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
  51. dst, dstY, dstEndY);
  52. }
  53. }
  54. #ifdef __SSE2__
  55. // Combines upper-half of 'x' with lower half of 'y'.
  56. static inline __m128i shuffle(__m128i x, __m128i y)
  57. {
  58. // mm_shuffle_pd() actually shuffles 64-bit floating point values, we
  59. // need to shuffle integers. Though floats and ints are stored in the
  60. // same xmmN registers. So this instruction does the right thing.
  61. // However (some?) x86 CPUs keep the float and integer interpretations
  62. // of these registers in different physical locations in the chip and
  63. // there is some overhead on switching between these interpretations.
  64. // So the casts in the statement below don't generate any instructions,
  65. // but they still can cause overhead on (some?) CPUs.
  66. return _mm_castpd_si128(_mm_shuffle_pd(
  67. _mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
  68. }
  69. // 32bpp
  70. static void blur1on2_SSE2(
  71. const uint32_t* __restrict in_, uint32_t* __restrict out_,
  72. unsigned c1_, unsigned c2_, size_t width)
  73. {
  74. width *= sizeof(uint32_t); // in bytes
  75. assert(width >= (2 * sizeof(__m128i)));
  76. assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
  77. assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
  78. ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
  79. auto* in = reinterpret_cast<const char*>(in_ ) - x;
  80. auto* out = reinterpret_cast< char*>(out_) - 2 * x;
  81. // Setup first iteration
  82. __m128i c1 = _mm_set1_epi16(c1_);
  83. __m128i c2 = _mm_set1_epi16(c2_);
  84. __m128i zero = _mm_setzero_si128();
  85. __m128i abcd = *reinterpret_cast<const __m128i*>(in);
  86. __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
  87. __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
  88. __m128i d1a1 = _mm_mullo_epi16(c1, d0a0);
  89. // Each iteration reads 4 pixels and generates 8 pixels
  90. do {
  91. // At the start of each iteration these variables are live:
  92. // abcd, a0b0, d1a1
  93. __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
  94. __m128i b0c0 = shuffle(a0b0, c0d0);
  95. __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
  96. __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
  97. __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
  98. __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
  99. __m128i abab = _mm_packus_epi16(daab, abbc);
  100. *reinterpret_cast<__m128i*>(out + 2 * x) =
  101. _mm_shuffle_epi32(abab, 0xd8);
  102. abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
  103. a0b0 = _mm_unpacklo_epi8(abcd, zero);
  104. __m128i d0a0_= shuffle(c0d0, a0b0);
  105. __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
  106. d1a1 = _mm_mullo_epi16(c1, d0a0_);
  107. __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
  108. __m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
  109. __m128i cdcd = _mm_packus_epi16(bccd, cdda);
  110. *reinterpret_cast<__m128i*>(out + 2 * x + 16) =
  111. _mm_shuffle_epi32(cdcd, 0xd8);
  112. x += 16;
  113. } while (x < 0);
  114. // Last iteration (because this doesn't need to read new input)
  115. __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
  116. __m128i b0c0 = shuffle(a0b0, c0d0);
  117. __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
  118. __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
  119. __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
  120. __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
  121. __m128i abab = _mm_packus_epi16(daab, abbc);
  122. *reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
  123. __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
  124. __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
  125. __m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
  126. __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
  127. __m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
  128. __m128i cdcd = _mm_packus_epi16(bccd, cddd);
  129. *reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
  130. }
// no SSE2 16bpp routine yet (probably not worth the effort)
// This overload only exists so that the call in blur1on2() also compiles
// when Pixel is uint16_t; that call sits behind a 'sizeof(Pixel) == 4'
// check, so this function is never expected to execute.
static void blur1on2_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
                          unsigned /*c1*/, unsigned /*c2*/, size_t /*width*/)
{
    UNREACHABLE;
}
  137. #endif
  138. template <class Pixel>
  139. void Simple2xScaler<Pixel>::blur1on2(
  140. const Pixel* __restrict pIn, Pixel* __restrict pOut,
  141. unsigned alpha, size_t srcWidth)
  142. {
  143. /* This routine is functionally equivalent to the following:
  144. *
  145. * void blur1on2(const Pixel* pIn, Pixel* pOut, unsigned alpha)
  146. * {
  147. * unsigned c1 = alpha / 4;
  148. * unsigned c2 = 256 - c1;
  149. *
  150. * Pixel prev, curr, next;
  151. * prev = curr = pIn[0];
  152. *
  153. * unsigned x;
  154. * for (x = 0; x < (srcWidth - 1); ++x) {
  155. * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
  156. * Pixel next = pIn[x + 1];
  157. * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
  158. * prev = curr;
  159. * curr = next;
  160. * }
  161. *
  162. * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
  163. * next = curr;
  164. * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
  165. * }
  166. */
  167. if (alpha == 0) {
  168. Scale_1on2<Pixel> scale;
  169. scale(pIn, pOut, 2 * srcWidth);
  170. return;
  171. }
  172. assert(alpha <= 256);
  173. unsigned c1 = alpha / 4;
  174. unsigned c2 = 256 - c1;
  175. #ifdef __SSE2__
  176. if (sizeof(Pixel) == 4) {
  177. // SSE2, only 32bpp
  178. blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
  179. return;
  180. }
  181. #endif
  182. // C++ routine, both 16bpp and 32bpp.
  183. // The loop is 2x unrolled and all common subexpressions and redundant
  184. // assignments have been eliminated. 1 iteration generates 4 pixels.
  185. mult1.setFactor32(c1);
  186. mult2.setFactor32(c2);
  187. Pixel p0 = pIn[0];
  188. Pixel p1;
  189. unsigned f0 = mult1.mul32(p0);
  190. unsigned f1 = f0;
  191. unsigned tmp;
  192. size_t x;
  193. for (x = 0; x < (srcWidth - 2); x += 2) {
  194. tmp = mult2.mul32(p0);
  195. pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
  196. p1 = pIn[x + 1];
  197. f1 = mult1.mul32(p1);
  198. pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
  199. tmp = mult2.mul32(p1);
  200. pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
  201. p0 = pIn[x + 2];
  202. f0 = mult1.mul32(p0);
  203. pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
  204. }
  205. tmp = mult2.mul32(p0);
  206. pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
  207. p1 = pIn[x + 1];
  208. f1 = mult1.mul32(p1);
  209. pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
  210. tmp = mult2.mul32(p1);
  211. pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
  212. pOut[2 * x + 3] = p1;
  213. }
  214. #ifdef __SSE2__
  215. // 32bpp
  216. static void blur1on1_SSE2(
  217. const uint32_t* __restrict in_, uint32_t* __restrict out_,
  218. unsigned c1_, unsigned c2_, size_t width)
  219. {
  220. width *= sizeof(uint32_t); // in bytes
  221. assert(width >= (2 * sizeof(__m128i)));
  222. assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
  223. assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
  224. ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
  225. auto* in = reinterpret_cast<const char*>(in_ ) - x;
  226. auto* out = reinterpret_cast< char*>(out_) - x;
  227. // Setup first iteration
  228. __m128i c1 = _mm_set1_epi16(c1_);
  229. __m128i c2 = _mm_set1_epi16(c2_);
  230. __m128i zero = _mm_setzero_si128();
  231. __m128i abcd = *reinterpret_cast<const __m128i*>(in);
  232. __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
  233. __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
  234. // Each iteration reads 4 pixels and generates 4 pixels
  235. do {
  236. // At the start of each iteration these variables are live:
  237. // abcd, a0b0, d0a0
  238. __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
  239. __m128i b0c0 = shuffle(a0b0, c0d0);
  240. __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
  241. __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
  242. __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
  243. abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
  244. a0b0 = _mm_unpacklo_epi8(abcd, zero);
  245. d0a0 = shuffle(c0d0, a0b0);
  246. __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
  247. __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
  248. __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
  249. *reinterpret_cast<__m128i*>(out + x) =
  250. _mm_packus_epi16(aabb, ccdd);
  251. x += 16;
  252. } while (x < 0);
  253. // Last iteration (because this doesn't need to read new input)
  254. __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
  255. __m128i b0c0 = shuffle(a0b0, c0d0);
  256. __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
  257. __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
  258. __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
  259. __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
  260. __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
  261. __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
  262. __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
  263. *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
  264. }
// no SSE2 16bpp routine yet (probably not worth the effort)
// This overload only exists so that the call in blur1on1() also compiles
// when Pixel is uint16_t; that call sits behind a 'sizeof(Pixel) == 4'
// check, so this function is never expected to execute.
static void blur1on1_SSE2(const uint16_t* /*in*/, uint16_t* /*out*/,
                          unsigned /*c1*/, unsigned /*c2*/, size_t /*width*/)
{
    UNREACHABLE;
}
  271. #endif
  272. template <class Pixel>
  273. void Simple2xScaler<Pixel>::blur1on1(
  274. const Pixel* __restrict pIn, Pixel* __restrict pOut,
  275. unsigned alpha, size_t srcWidth)
  276. {
  277. /* This routine is functionally equivalent to the following:
  278. *
  279. * void blur1on1(const Pixel* pIn, Pixel* pOut, unsigned alpha)
  280. * {
  281. * unsigned c1 = alpha / 4;
  282. * unsigned c2 = 256 - alpha / 2;
  283. *
  284. * Pixel prev, curr, next;
  285. * prev = curr = pIn[0];
  286. *
  287. * unsigned x;
  288. * for (x = 0; x < (srcWidth - 1); ++x) {
  289. * next = pIn[x + 1];
  290. * pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
  291. * prev = curr;
  292. * curr = next;
  293. * }
  294. *
  295. * next = curr;
  296. * pOut[x] = c1 * prev + c2 * curr + c1 * next;
  297. * }
  298. */
  299. if (alpha == 0) {
  300. Scale_1on1<Pixel> copy;
  301. copy(pIn, pOut, srcWidth);
  302. return;
  303. }
  304. unsigned c1 = alpha / 4;
  305. unsigned c2 = 256 - alpha / 2;
  306. #ifdef __SSE2__
  307. if (sizeof(Pixel) == 4) {
  308. // SSE2, only 32bpp
  309. blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
  310. return;
  311. }
  312. #endif
  313. // C++ routine, both 16bpp and 32bpp.
  314. // The loop is 2x unrolled and all common subexpressions and redundant
  315. // assignments have been eliminated. 1 iteration generates 2 pixels.
  316. mult1.setFactor32(c1);
  317. mult3.setFactor32(c2);
  318. Pixel p0 = pIn[0];
  319. Pixel p1;
  320. unsigned f0 = mult1.mul32(p0);
  321. unsigned f1 = f0;
  322. size_t x;
  323. for (x = 0; x < (srcWidth - 2); x += 2) {
  324. p1 = pIn[x + 1];
  325. unsigned t0 = mult1.mul32(p1);
  326. pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
  327. f0 = t0;
  328. p0 = pIn[x + 2];
  329. unsigned t1 = mult1.mul32(p0);
  330. pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
  331. f1 = t1;
  332. }
  333. p1 = pIn[x + 1];
  334. unsigned t0 = mult1.mul32(p1);
  335. pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
  336. pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
  337. }
  338. template <class Pixel>
  339. void Simple2xScaler<Pixel>::drawScanline(
  340. const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
  341. unsigned dstWidth)
  342. {
  343. if (factor != 255) {
  344. scanline.draw(in1, in2, out, factor, dstWidth);
  345. } else {
  346. Scale_1on1<Pixel> scale;
  347. scale(in1, out, dstWidth);
  348. }
  349. }
  350. template <class Pixel>
  351. void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
  352. unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
  353. ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
  354. {
  355. VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
  356. int blur = settings.getBlurFactor();
  357. int scanlineFactor = settings.getScanlineFactor();
  358. unsigned dstY = dstStartY;
  359. auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
  360. auto* dstLine0 = dst.acquireLine(dstY + 0);
  361. blur1on2(srcLine, dstLine0, blur, srcWidth);
  362. for (/**/; dstY < dstEndY - 2; dstY += 2) {
  363. srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
  364. auto* dstLine2 = dst.acquireLine(dstY + 2);
  365. blur1on2(srcLine, dstLine2, blur, srcWidth);
  366. auto* dstLine1 = dst.acquireLine(dstY + 1);
  367. drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
  368. 2 * srcWidth);
  369. dst.releaseLine(dstY + 0, dstLine0);
  370. dst.releaseLine(dstY + 1, dstLine1);
  371. dstLine0 = dstLine2;
  372. }
  373. srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
  374. VLA_SSE_ALIGNED(Pixel, buf2, 2 * srcWidth);
  375. blur1on2(srcLine, buf2, blur, srcWidth);
  376. auto* dstLine1 = dst.acquireLine(dstY + 1);
  377. drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, 2 * srcWidth);
  378. dst.releaseLine(dstY + 0, dstLine0);
  379. dst.releaseLine(dstY + 1, dstLine1);
  380. }
  381. template <class Pixel>
  382. void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
  383. unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
  384. ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
  385. {
  386. VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
  387. int blur = settings.getBlurFactor();
  388. int scanlineFactor = settings.getScanlineFactor();
  389. unsigned dstY = dstStartY;
  390. auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
  391. auto* dstLine0 = dst.acquireLine(dstY);
  392. blur1on1(srcLine, dstLine0, blur, srcWidth);
  393. for (/**/; dstY < dstEndY - 2; dstY += 2) {
  394. srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
  395. auto* dstLine2 = dst.acquireLine(dstY + 2);
  396. blur1on1(srcLine, dstLine2, blur, srcWidth);
  397. auto* dstLine1 = dst.acquireLine(dstY + 1);
  398. drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
  399. srcWidth);
  400. dst.releaseLine(dstY + 0, dstLine0);
  401. dst.releaseLine(dstY + 1, dstLine1);
  402. dstLine0 = dstLine2;
  403. }
  404. srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
  405. VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
  406. blur1on1(srcLine, buf2, blur, srcWidth);
  407. auto* dstLine1 = dst.acquireLine(dstY + 1);
  408. drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, srcWidth);
  409. dst.releaseLine(dstY + 0, dstLine0);
  410. dst.releaseLine(dstY + 1, dstLine1);
  411. }
  412. template <class Pixel>
  413. void Simple2xScaler<Pixel>::scaleImage(
  414. FrameSource& src, const RawFrame* superImpose,
  415. unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
  416. ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
  417. {
  418. if (superImpose) {
  419. // Note: this implementation is different from the openGL
  420. // version. Here we first alpha-blend and then scale, so the
  421. // video layer will also get blurred (and possibly down-scaled
  422. // to MSX resolution). The openGL version will only blur the
  423. // MSX frame, then blend with the video frame and then apply
  424. // scanlines. I think the openGL version is visually slightly
  425. // better, but much more work to implement in software (in
  426. // openGL shaders it's very easy). Maybe we can improve this
  427. // later (if required at all).
  428. SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
  429. srcWidth = sf.getLineWidth(srcStartY);
  430. this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
  431. dst, dstStartY, dstEndY);
  432. } else {
  433. this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
  434. dst, dstStartY, dstEndY);
  435. }
  436. }
// Force template instantiation.
// The member templates are defined in this file, so the class is explicitly
// instantiated here, once per pixel type that is enabled through the
// HAVE_16BPP / HAVE_32BPP macros (defined outside this file).
#if HAVE_16BPP
template class Simple2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Simple2xScaler<uint32_t>;
#endif
  444. } // namespace openmsx