123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355 |
- #include "FBPostProcessor.hh"
- #include "RawFrame.hh"
- #include "StretchScalerOutput.hh"
- #include "ScalerOutput.hh"
- #include "RenderSettings.hh"
- #include "Scaler.hh"
- #include "ScalerFactory.hh"
- #include "SDLOutputSurface.hh"
- #include "Math.hh"
- #include "aligned.hh"
- #include "checked_cast.hh"
- #include "random.hh"
- #include "xrange.hh"
- #include <algorithm>
- #include <cassert>
- #include <cmath>
- #include <cstdint>
- #include <cstddef>
- #ifdef __SSE2__
- #include <emmintrin.h>
- #endif
- namespace openmsx {
- constexpr unsigned NOISE_SHIFT = 8192;
- constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
- alignas(SSE_ALIGNMENT) static signed char noiseBuf[NOISE_BUF_SIZE];
- template <class Pixel>
- void FBPostProcessor<Pixel>::preCalcNoise(float factor)
- {
- // We skip noise drawing if the factor is 0, so there is no point in
- // initializing the random data in that case.
- if (factor == 0.0f) return;
- // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
- // 4 element boundaries) must have the same value. Later optimizations
- // depend on it.
- float scale[4];
- if (sizeof(Pixel) == 4) {
- // 32bpp
- // TODO ATM we compensate for big endian here. A better
- // alternative is to turn noiseBuf into an array of ints (it's
- // now bytes) and in the 16bpp code extract R,G,B components
- // from those ints
- const auto p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
- : 0x03020100);
- // TODO we can also fill the array with 'factor' and only set
- // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
- // way to get the position of the alpha byte (yet).
- scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
- scale[pixelOps.red (p)] = factor;
- scale[pixelOps.green(p)] = factor;
- scale[pixelOps.blue (p)] = factor;
- } else {
- // 16bpp
- scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
- scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
- scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
- scale[3] = 0.0f;
- }
- auto& generator = global_urng(); // fast (non-cryptographic) random numbers
- std::normal_distribution<float> distribution(0.0f, 1.0f);
- for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
- float r = distribution(generator);
- noiseBuf[i + 0] = Math::clip<-128, 127>(roundf(r * scale[0]));
- noiseBuf[i + 1] = Math::clip<-128, 127>(roundf(r * scale[1]));
- noiseBuf[i + 2] = Math::clip<-128, 127>(roundf(r * scale[2]));
- noiseBuf[i + 3] = Math::clip<-128, 127>(roundf(r * scale[3]));
- }
- }
#ifdef __SSE2__
/** Add noise to one line of 32bpp pixels, using SSE2 instructions.
 *
 * @param buf_  Start of the pixel line, modified in place. Must be 16-byte
 *              aligned.
 * @param noise Start of the noise values (4 signed bytes per pixel). Must be
 *              16-byte aligned. Only read, never written.
 * @param width Number of pixels in the line. 'width * 4' must be a multiple
 *              of 64 because 64 bytes are processed per iteration. A width
 *              of 0 is allowed and leaves the line untouched.
 */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
    // To each of the RGBA color components (a value in range [0..255]) we
    // want to add a signed noise value (in range [-128..127]) and also clip
    // the result to the range [0..255]. There is no SSE instruction that
    // directly performs this operation. But we can:
    //   - subtract 128 from the RGBA component to get a signed byte
    //   - perform the addition with signed saturation
    //   - add 128 to the result to get back to the unsigned byte range
    // For 8-bit values the following 3 expressions are equivalent:
    //   x + 128 == x - 128 == x ^ 128
    // So the expression becomes:
    //   signed_add_sat(value ^ 128, noise) ^ 128
    // The following loop does just that, though it processes 64 bytes per
    // iteration.
    const auto numBytes = static_cast<ptrdiff_t>(width * sizeof(uint32_t));
    assert((numBytes & 63) == 0);
    assert((uintptr_t(buf_)  & 15) == 0);
    assert((uintptr_t(noise) & 15) == 0); // also read with aligned loads

    // The loop below always executes at least once, so an empty line must
    // be handled up front to avoid touching memory outside the line.
    if (numBytes == 0) return;

    // Both pointers are positioned at the END of the line and are indexed
    // with a negative offset that counts up towards zero.
    char*       buf = reinterpret_cast<char*>(buf_) + numBytes;
    const char* nse = reinterpret_cast<const char*>(noise) + numBytes;
    ptrdiff_t x = -numBytes;

    const __m128i b7 = _mm_set1_epi8(-128); // 0x80
    do {
        __m128i i0 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x +  0));
        __m128i i1 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x + 16));
        __m128i i2 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x + 32));
        __m128i i3 = _mm_load_si128(reinterpret_cast<const __m128i*>(buf + x + 48));
        __m128i n0 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x +  0));
        __m128i n1 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x + 16));
        __m128i n2 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x + 32));
        __m128i n3 = _mm_load_si128(reinterpret_cast<const __m128i*>(nse + x + 48));
        __m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
        __m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
        __m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
        __m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
        _mm_store_si128(reinterpret_cast<__m128i*>(buf + x +  0), o0);
        _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 16), o1);
        _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 32), o2);
        _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 48), o3);
        x += static_cast<ptrdiff_t>(4 * sizeof(__m128i));
    } while (x < 0);
}
#endif
/** Add noise to the given pixel.
 * @param p contains 4 8-bit unsigned components, so components have range [0, 255]
 * @param n contains 4 8-bit signed components, so components have range [-128, 127]
 * @result per component result of clip<0, 255>(p + n)
 *
 * The function only uses unsigned integer arithmetic, so it can also be
 * evaluated at compile time.
 */
static constexpr uint32_t addNoise4(uint32_t p, uint32_t n) noexcept
{
    // unclipped result (lower 8 bits of each component)
    // alternative:
    //   uint32_t s20 = ((p & 0x00FF00FF) + (n & 0x00FF00FF)) & 0x00FF00FF;
    //   uint32_t s31 = ((p & 0xFF00FF00) + (n & 0xFF00FF00)) & 0xFF00FF00;
    //   uint32_t s   = s20 | s31;
    const uint32_t s0 = p + n;                        // carry spills to neighbors
    const uint32_t ci = (p ^ n ^ s0) & 0x01010100u;   // carry-in bits of prev sum
    const uint32_t s  = s0 - ci;                      // subtract carry bits again

    // Underflow of a component happens ONLY
    //   WHEN input  component is in range [0, 127]
    //   AND  noise  component is negative
    //   AND  result component is in range [128, 255]
    // Overflow of a component happens ONLY
    //   WHEN input  component is in range [128, 255]
    //   AND  noise  component is positive
    //   AND  result component is in range [0, 127]
    // Create a mask per component containing 00 for no under/overflow,
    //                                        FF for    under/overflow
    // ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
    const uint32_t t = (p ^ n) & (p ^ s) & 0x80808080u;

    // Expand the single flag bit (bit 7 of a component) into a full 0xFF
    // byte mask: (flag << 1) - (flag >> 7) == 0x100 - 0x01 == 0xFF per
    // component. For the topmost component the shifted-out bit is
    // compensated by the modulo 2^32 wrap-around of the subtraction.
    const uint32_t u1 = t & s; // underflow (alternative: u1 = t & n)
    // alternative1: uint32_t u2 = u1 | (u1 >> 1);
    //               uint32_t u4 = u2 | (u2 >> 2);
    //               uint32_t u8 = u4 | (u4 >> 4);
    // alternative2: uint32_t u8 = (u1 >> 7) * 0xFF;
    const uint32_t u8 = (u1 << 1) - (u1 >> 7);

    const uint32_t o1 = t & p; // overflow
    const uint32_t o8 = (o1 << 1) - (o1 >> 7);

    // clip result: underflowed components become 0x00, overflowed 0xFF
    return (s & (~u8)) | o8;
}
- template <class Pixel>
- void FBPostProcessor<Pixel>::drawNoiseLine(
- Pixel* buf, signed char* noise, size_t width)
- {
- #ifdef __SSE2__
- if (sizeof(Pixel) == 4) {
- // cast to avoid compilation error in case of 16bpp (even
- // though this code is dead in that case).
- auto* buf32 = reinterpret_cast<uint32_t*>(buf);
- drawNoiseLineSse2(buf32, noise, width);
- return;
- }
- #endif
- // c++ version
- if (sizeof(Pixel) == 4) {
- // optimized version for 32bpp
- auto noise4 = reinterpret_cast<uint32_t*>(noise);
- for (size_t i = 0; i < width; ++i) {
- buf[i] = addNoise4(buf[i], noise4[i]);
- }
- } else {
- int mr = pixelOps.getMaxRed();
- int mg = pixelOps.getMaxGreen();
- int mb = pixelOps.getMaxBlue();
- for (size_t i = 0; i < width; ++i) {
- Pixel p = buf[i];
- int r = pixelOps.red(p);
- int g = pixelOps.green(p);
- int b = pixelOps.blue(p);
- r += noise[4 * i + 0];
- g += noise[4 * i + 1];
- b += noise[4 * i + 2];
- r = std::min(std::max(r, 0), mr);
- g = std::min(std::max(g, 0), mg);
- b = std::min(std::max(b, 0), mb);
- buf[i] = pixelOps.combine(r, g, b);
- }
- }
- }
- template <class Pixel>
- void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
- {
- if (renderSettings.getNoise() == 0.0f) return;
- auto& output = checked_cast<SDLOutputSurface&>(output_);
- auto [w, h] = output.getLogicalSize();
- auto pixelAccess = output.getDirectPixelAccess();
- for (int y = 0; y < h; ++y) {
- auto* buf = pixelAccess.getLinePtr<Pixel>(y);
- drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
- }
- }
- template <class Pixel>
- void FBPostProcessor<Pixel>::update(const Setting& setting)
- {
- VideoLayer::update(setting);
- auto& noiseSetting = renderSettings.getNoiseSetting();
- if (&setting == &noiseSetting) {
- preCalcNoise(noiseSetting.getDouble());
- }
- }
- template <class Pixel>
- FBPostProcessor<Pixel>::FBPostProcessor(MSXMotherBoard& motherBoard_,
- Display& display_, OutputSurface& screen_, const std::string& videoSource,
- unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
- : PostProcessor(
- motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
- canDoInterlace_)
- , noiseShift(screen.getLogicalHeight())
- , pixelOps(screen.getPixelFormat())
- {
- scaleAlgorithm = RenderSettings::NO_SCALER;
- scaleFactor = unsigned(-1);
- auto& noiseSetting = renderSettings.getNoiseSetting();
- noiseSetting.attach(*this);
- preCalcNoise(noiseSetting.getDouble());
- assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
- }
- template <class Pixel>
- FBPostProcessor<Pixel>::~FBPostProcessor()
- {
- renderSettings.getNoiseSetting().detach(*this);
- }
- template <class Pixel>
- void FBPostProcessor<Pixel>::paint(OutputSurface& output_)
- {
- auto& output = checked_cast<SDLOutputSurface&>(output_);
- if (renderSettings.getInterleaveBlackFrame()) {
- interleaveCount ^= 1;
- if (interleaveCount) {
- output.clearScreen();
- return;
- }
- }
- if (!paintFrame) return;
- // New scaler algorithm selected?
- auto algo = renderSettings.getScaleAlgorithm();
- unsigned factor = renderSettings.getScaleFactor();
- if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
- scaleAlgorithm = algo;
- scaleFactor = factor;
- currScaler = ScalerFactory<Pixel>::createScaler(
- PixelOperations<Pixel>(output.getPixelFormat()),
- renderSettings);
- }
- // Scale image.
- const unsigned srcHeight = paintFrame->getHeight();
- const unsigned dstHeight = output.getLogicalHeight();
- unsigned g = Math::gcd(srcHeight, dstHeight);
- unsigned srcStep = srcHeight / g;
- unsigned dstStep = dstHeight / g;
- // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
- // on the PC screen, as a preparation for resizable output window.
- unsigned srcStartY = 0;
- unsigned dstStartY = 0;
- while (dstStartY < dstHeight) {
- // Currently this is true because the source frame height
- // is always >= dstHeight/(dstStep/srcStep).
- assert(srcStartY < srcHeight);
- // get region with equal lineWidth
- unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
- unsigned srcEndY = srcStartY + srcStep;
- unsigned dstEndY = dstStartY + dstStep;
- while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
- (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
- srcEndY += srcStep;
- dstEndY += dstStep;
- }
- // fill region
- //fprintf(stderr, "post processing lines %d-%d: %d\n",
- // srcStartY, srcEndY, lineWidth );
- float horStretch = renderSettings.getHorizontalStretch();
- unsigned inWidth = lrintf(horStretch);
- std::unique_ptr<ScalerOutput<Pixel>> dst(
- StretchScalerOutputFactory<Pixel>::create(
- output, pixelOps, inWidth));
- currScaler->scaleImage(
- *paintFrame, superImposeVideoFrame,
- srcStartY, srcEndY, lineWidth, // source
- *dst, dstStartY, dstEndY); // dest
- // next region
- srcStartY = srcEndY;
- dstStartY = dstEndY;
- }
- drawNoise(output);
- output.flushFrameBuffer();
- }
- template <class Pixel>
- std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
- std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
- {
- auto& generator = global_urng(); // fast (non-cryptographic) random numbers
- std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
- for (auto y : xrange(screen.getLogicalHeight())) {
- noiseShift[y] = distribution(generator) * 16;
- }
- return PostProcessor::rotateFrames(std::move(finishedFrame), time);
- }
// Force template instantiation: the member functions above are defined in
// this source file, so the class is explicitly instantiated here for every
// pixel size that is enabled in the build.
#if HAVE_16BPP
template class FBPostProcessor<uint16_t>; // 16 bits per pixel
#endif
#if HAVE_32BPP
template class FBPostProcessor<uint32_t>; // 32 bits per pixel
#endif
- } // namespace openmsx
|