FBPostProcessor.cc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. #include "FBPostProcessor.hh"
  2. #include "RawFrame.hh"
  3. #include "StretchScalerOutput.hh"
  4. #include "ScalerOutput.hh"
  5. #include "RenderSettings.hh"
  6. #include "Scaler.hh"
  7. #include "ScalerFactory.hh"
  8. #include "SDLOutputSurface.hh"
  9. #include "Math.hh"
  10. #include "aligned.hh"
  11. #include "checked_cast.hh"
  12. #include "random.hh"
  13. #include "xrange.hh"
  14. #include <algorithm>
  15. #include <cassert>
  16. #include <cmath>
  17. #include <cstdint>
  18. #include <cstddef>
  19. #ifdef __SSE2__
  20. #include <emmintrin.h>
  21. #endif
  22. namespace openmsx {
  // Size in bytes of the window inside noiseBuf from which per-line start
  // offsets are chosen: rotateFrames() picks offsets that are multiples of 16
  // in [0, NOISE_SHIFT), and the constructor asserts that one output line
  // (logical width * sizeof(Pixel) bytes) is shorter than NOISE_SHIFT.
  23. constexpr unsigned NOISE_SHIFT = 8192;
  // Twice NOISE_SHIFT, so 'start offset + one line of noise' stays within
  // the buffer for every offset chosen as described above.
  24. constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
  // Signed noise bytes, filled by preCalcNoise() in groups of 4 consecutive
  // elements. The SSE2 code reads this buffer with aligned 128-bit loads;
  // presumably SSE_ALIGNMENT (defined elsewhere) is at least 16 -- TODO confirm.
  25. alignas(SSE_ALIGNMENT) static signed char noiseBuf[NOISE_BUF_SIZE];
  26. template <class Pixel>
  27. void FBPostProcessor<Pixel>::preCalcNoise(float factor)
  28. {
  29. // We skip noise drawing if the factor is 0, so there is no point in
  30. // initializing the random data in that case.
  31. if (factor == 0.0f) return;
  32. // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
  33. // 4 element boundaries) must have the same value. Later optimizations
  34. // depend on it.
  35. float scale[4];
  36. if (sizeof(Pixel) == 4) {
  37. // 32bpp
  38. // TODO ATM we compensate for big endian here. A better
  39. // alternative is to turn noiseBuf into an array of ints (it's
  40. // now bytes) and in the 16bpp code extract R,G,B components
  41. // from those ints
  42. const auto p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
  43. : 0x03020100);
  44. // TODO we can also fill the array with 'factor' and only set
  45. // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
  46. // way to get the position of the alpha byte (yet).
  47. scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
  48. scale[pixelOps.red (p)] = factor;
  49. scale[pixelOps.green(p)] = factor;
  50. scale[pixelOps.blue (p)] = factor;
  51. } else {
  52. // 16bpp
  53. scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
  54. scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
  55. scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
  56. scale[3] = 0.0f;
  57. }
  58. auto& generator = global_urng(); // fast (non-cryptographic) random numbers
  59. std::normal_distribution<float> distribution(0.0f, 1.0f);
  60. for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
  61. float r = distribution(generator);
  62. noiseBuf[i + 0] = Math::clip<-128, 127>(roundf(r * scale[0]));
  63. noiseBuf[i + 1] = Math::clip<-128, 127>(roundf(r * scale[1]));
  64. noiseBuf[i + 2] = Math::clip<-128, 127>(roundf(r * scale[2]));
  65. noiseBuf[i + 3] = Math::clip<-128, 127>(roundf(r * scale[3]));
  66. }
  67. }
  68. #ifdef __SSE2__
  /** Add signed noise to one line of 32bpp pixels, with per-byte clipping.
   * @param buf_  pixel data, modified in place; must be 16-byte aligned
   * @param noise one signed noise byte per pixel byte; must be 16-byte
   *              aligned (it is read with aligned loads)
   * @param width number of pixels; width * 4 must be a multiple of 64.
   *              A width of 0 is a no-op.
   */
  static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
  {
      // To each of the RGBA color components (a value in range [0..255]) we
      // want to add a signed noise value (in range [-128..127]) and also clip
      // the result to the range [0..255]. There is no SSE instruction that
      // directly performs this operation. But we can:
      //   - subtract 128 from the RGBA component to get a signed byte
      //   - perform the addition with signed saturation
      //   - add 128 to the result to get back to the unsigned byte range
      // For 8-bit values the following 3 expressions are equivalent:
      //   x + 128 == x - 128 == x ^ 128
      // So the expression becomes:
      //   signed_add_sat(value ^ 128, noise) ^ 128
      // The following loop does just that, though it processes 64 bytes per
      // iteration.
      const ptrdiff_t numBytes = static_cast<ptrdiff_t>(width * sizeof(uint32_t));
      assert((numBytes & 63) == 0);
      assert((uintptr_t(buf_) & 15) == 0);
      assert((uintptr_t(noise) & 15) == 0);

      // Both cursors point one past the end of the line; the (negative)
      // byte offset 'x' counts up towards zero.
      char* buf = reinterpret_cast<char*>(buf_) + numBytes;
      const char* nse = reinterpret_cast<const char*>(noise) + numBytes;

      const __m128i b7 = _mm_set1_epi8(-128); // 0x80
      const ptrdiff_t step = 4 * static_cast<ptrdiff_t>(sizeof(__m128i));

      // Pre-tested loop: with a post-tested (do/while) loop a zero-width
      // line would still read and rewrite 64 bytes.
      for (ptrdiff_t x = -numBytes; x < 0; x += step) {
          __m128i i0 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x +  0));
          __m128i i1 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x + 16));
          __m128i i2 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x + 32));
          __m128i i3 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x + 48));
          __m128i n0 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x +  0));
          __m128i n1 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x + 16));
          __m128i n2 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x + 32));
          __m128i n3 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x + 48));
          __m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
          __m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
          __m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
          __m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
          _mm_store_si128(reinterpret_cast<__m128i*>(buf + x +  0), o0);
          _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 16), o1);
          _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 32), o2);
          _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 48), o3);
      }
  }
  111. #endif
  /** Add noise to the given pixel, clipping every component separately.
   * @param p contains 4 8-bit unsigned components, so components have range [0, 255]
   * @param n contains 4 8-bit signed components, so components have range [-128, 127]
   * @result per component result of clip<0, 255>(p + n)
   */
  static inline uint32_t addNoise4(uint32_t p, uint32_t n)
  {
      // Unclipped result (lower 8 bits of each component). The even and
      // the odd components are summed separately, so a carry out of one
      // component lands in a masked-off position instead of in a neighbor.
      const uint32_t evenMask = 0x00FF00FF;
      const uint32_t oddMask  = 0xFF00FF00;
      const uint32_t sumEven = ((p & evenMask) + (n & evenMask)) & evenMask;
      const uint32_t sumOdd  = ((p & oddMask ) + (n & oddMask )) & oddMask;
      const uint32_t sum = sumEven | sumOdd;

      // Underflow of a component happens ONLY
      //   WHEN input component is in range [0, 127]
      //   AND noise component is negative
      //   AND result component is in range [128, 255]
      // Overflow of a component happens ONLY
      //   WHEN input component is in range [128, 255]
      //   AND noise component is positive
      //   AND result component is in range [0, 127]
      // Both cases together: the top bit of the input differs from the
      // top bit of both the noise and the result.
      //   ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
      const uint32_t wrapped = (p ^ n) & (p ^ sum) & 0x80808080;

      // Expand each flagged top bit into a full 00/FF byte mask.
      const uint32_t underflowMask = ((wrapped & sum) >> 7) * 0xFF;
      const uint32_t overflowMask  = ((wrapped & p  ) >> 7) * 0xFF;

      // Clip: underflowed components become 00, overflowed ones FF.
      return (sum & ~underflowMask) | overflowMask;
  }
  150. template <class Pixel>
  151. void FBPostProcessor<Pixel>::drawNoiseLine(
  152. Pixel* buf, signed char* noise, size_t width)
  153. {
  154. #ifdef __SSE2__
  155. if (sizeof(Pixel) == 4) {
  156. // cast to avoid compilation error in case of 16bpp (even
  157. // though this code is dead in that case).
  158. auto* buf32 = reinterpret_cast<uint32_t*>(buf);
  159. drawNoiseLineSse2(buf32, noise, width);
  160. return;
  161. }
  162. #endif
  163. // c++ version
  164. if (sizeof(Pixel) == 4) {
  165. // optimized version for 32bpp
  166. auto noise4 = reinterpret_cast<uint32_t*>(noise);
  167. for (size_t i = 0; i < width; ++i) {
  168. buf[i] = addNoise4(buf[i], noise4[i]);
  169. }
  170. } else {
  171. int mr = pixelOps.getMaxRed();
  172. int mg = pixelOps.getMaxGreen();
  173. int mb = pixelOps.getMaxBlue();
  174. for (size_t i = 0; i < width; ++i) {
  175. Pixel p = buf[i];
  176. int r = pixelOps.red(p);
  177. int g = pixelOps.green(p);
  178. int b = pixelOps.blue(p);
  179. r += noise[4 * i + 0];
  180. g += noise[4 * i + 1];
  181. b += noise[4 * i + 2];
  182. r = std::min(std::max(r, 0), mr);
  183. g = std::min(std::max(g, 0), mg);
  184. b = std::min(std::max(b, 0), mb);
  185. buf[i] = pixelOps.combine(r, g, b);
  186. }
  187. }
  188. }
  189. template <class Pixel>
  190. void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
  191. {
  192. if (renderSettings.getNoise() == 0.0f) return;
  193. auto& output = checked_cast<SDLOutputSurface&>(output_);
  194. auto [w, h] = output.getLogicalSize();
  195. auto pixelAccess = output.getDirectPixelAccess();
  196. for (int y = 0; y < h; ++y) {
  197. auto* buf = pixelAccess.getLinePtr<Pixel>(y);
  198. drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
  199. }
  200. }
  201. template <class Pixel>
  202. void FBPostProcessor<Pixel>::update(const Setting& setting)
  203. {
  204. VideoLayer::update(setting);
  205. auto& noiseSetting = renderSettings.getNoiseSetting();
  206. if (&setting == &noiseSetting) {
  207. preCalcNoise(noiseSetting.getDouble());
  208. }
  209. }
  210. template <class Pixel>
  211. FBPostProcessor<Pixel>::FBPostProcessor(MSXMotherBoard& motherBoard_,
  212. Display& display_, OutputSurface& screen_, const std::string& videoSource,
  213. unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
  214. : PostProcessor(
  215. motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
  216. canDoInterlace_)
  217. , noiseShift(screen.getLogicalHeight())
  218. , pixelOps(screen.getPixelFormat())
  219. {
  220. scaleAlgorithm = RenderSettings::NO_SCALER;
  221. scaleFactor = unsigned(-1);
  222. auto& noiseSetting = renderSettings.getNoiseSetting();
  223. noiseSetting.attach(*this);
  224. preCalcNoise(noiseSetting.getDouble());
  225. assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
  226. }
  227. template <class Pixel>
  228. FBPostProcessor<Pixel>::~FBPostProcessor()
  229. {
  230. renderSettings.getNoiseSetting().detach(*this);
  231. }
  232. template <class Pixel>
  233. void FBPostProcessor<Pixel>::paint(OutputSurface& output_)
  234. {
  235. auto& output = checked_cast<SDLOutputSurface&>(output_);
  236. if (renderSettings.getInterleaveBlackFrame()) {
  237. interleaveCount ^= 1;
  238. if (interleaveCount) {
  239. output.clearScreen();
  240. return;
  241. }
  242. }
  243. if (!paintFrame) return;
  244. // New scaler algorithm selected?
  245. auto algo = renderSettings.getScaleAlgorithm();
  246. unsigned factor = renderSettings.getScaleFactor();
  247. if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
  248. scaleAlgorithm = algo;
  249. scaleFactor = factor;
  250. currScaler = ScalerFactory<Pixel>::createScaler(
  251. PixelOperations<Pixel>(output.getPixelFormat()),
  252. renderSettings);
  253. }
  254. // Scale image.
  255. const unsigned srcHeight = paintFrame->getHeight();
  256. const unsigned dstHeight = output.getLogicalHeight();
  257. unsigned g = Math::gcd(srcHeight, dstHeight);
  258. unsigned srcStep = srcHeight / g;
  259. unsigned dstStep = dstHeight / g;
  260. // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
  261. // on the PC screen, as a preparation for resizable output window.
  262. unsigned srcStartY = 0;
  263. unsigned dstStartY = 0;
  264. while (dstStartY < dstHeight) {
  265. // Currently this is true because the source frame height
  266. // is always >= dstHeight/(dstStep/srcStep).
  267. assert(srcStartY < srcHeight);
  268. // get region with equal lineWidth
  269. unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
  270. unsigned srcEndY = srcStartY + srcStep;
  271. unsigned dstEndY = dstStartY + dstStep;
  272. while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
  273. (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
  274. srcEndY += srcStep;
  275. dstEndY += dstStep;
  276. }
  277. // fill region
  278. //fprintf(stderr, "post processing lines %d-%d: %d\n",
  279. // srcStartY, srcEndY, lineWidth );
  280. float horStretch = renderSettings.getHorizontalStretch();
  281. unsigned inWidth = lrintf(horStretch);
  282. std::unique_ptr<ScalerOutput<Pixel>> dst(
  283. StretchScalerOutputFactory<Pixel>::create(
  284. output, pixelOps, inWidth));
  285. currScaler->scaleImage(
  286. *paintFrame, superImposeVideoFrame,
  287. srcStartY, srcEndY, lineWidth, // source
  288. *dst, dstStartY, dstEndY); // dest
  289. // next region
  290. srcStartY = srcEndY;
  291. dstStartY = dstEndY;
  292. }
  293. drawNoise(output);
  294. output.flushFrameBuffer();
  295. }
  296. template <class Pixel>
  297. std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
  298. std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
  299. {
  300. auto& generator = global_urng(); // fast (non-cryptographic) random numbers
  301. std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
  302. for (auto y : xrange(screen.getLogicalHeight())) {
  303. noiseShift[y] = distribution(generator) * 16;
  304. }
  305. return PostProcessor::rotateFrames(std::move(finishedFrame), time);
  306. }
  // The member functions of FBPostProcessor are defined in this file, so the
  // class template is explicitly instantiated here, once per pixel type whose
  // HAVE_xxBPP macro evaluates to true (presumably set by the build
  // configuration -- TODO confirm).
  307. // Force template instantiation.
  308. #if HAVE_16BPP
  309. template class FBPostProcessor<uint16_t>;
  310. #endif
  311. #if HAVE_32BPP
  312. template class FBPostProcessor<uint32_t>;
  313. #endif
  314. } // namespace openmsx