diff options
Diffstat (limited to 'src/blitter')
-rw-r--r-- | src/blitter/32bpp_anim_sse4.cpp | 80 | ||||
-rw-r--r-- | src/blitter/32bpp_anim_sse4.hpp | 5 | ||||
-rw-r--r-- | src/blitter/32bpp_sse2.cpp | 61 | ||||
-rw-r--r-- | src/blitter/32bpp_sse2.hpp | 90 | ||||
-rw-r--r-- | src/blitter/32bpp_sse4.cpp | 111 | ||||
-rw-r--r-- | src/blitter/32bpp_sse4.hpp | 33 | ||||
-rw-r--r-- | src/blitter/32bpp_sse_func.hpp | 225 | ||||
-rw-r--r-- | src/blitter/32bpp_ssse3.cpp | 70 | ||||
-rw-r--r-- | src/blitter/32bpp_ssse3.hpp | 49 |
9 files changed, 356 insertions, 368 deletions
diff --git a/src/blitter/32bpp_anim_sse4.cpp b/src/blitter/32bpp_anim_sse4.cpp index e8873d5d3..ae1b34d69 100644 --- a/src/blitter/32bpp_anim_sse4.cpp +++ b/src/blitter/32bpp_anim_sse4.cpp @@ -83,12 +83,12 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL const byte m0 = mvX2; if (m0 >= PALETTE_ANIM_START) { const Colour c0 = (this->LookupColourInPalette(m0).data & 0x00FFFFFF) | (src[0].data & 0xFF000000); - INSR32(AdjustBrightness(c0, (byte) (mvX2 >> 8)).data, srcABCD, 0); + InsertFirstUint32(AdjustBrightneSSE(c0, (byte) (mvX2 >> 8)).data, srcABCD); } const byte m1 = mvX2 >> 16; if (m1 >= PALETTE_ANIM_START) { const Colour c1 = (this->LookupColourInPalette(m1).data & 0x00FFFFFF) | (src[1].data & 0xFF000000); - INSR32(AdjustBrightness(c1, (byte) (mvX2 >> 24)).data, srcABCD, 1); + InsertSecondUint32(AdjustBrightneSSE(c1, (byte) (mvX2 >> 24)).data, srcABCD); } /* Update anim buffer. */ @@ -118,7 +118,7 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL /* Blend colours. */ bmno_alpha_blend: - ALPHA_BLEND_2(); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); bmno_full_opacity: _mm_storel_epi64((__m128i *) dst, srcABCD); bmno_full_transparency: @@ -132,20 +132,19 @@ bmno_full_transparency: if (src->a == 0) { } else if (src->a == 255) { *anim = *(const uint16*) src_mv; - *dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src; + *dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v) : *src; } else { *anim = 0; __m128i srcABCD; __m128i dstABCD = _mm_cvtsi32_si128(dst->data); if (src_mv->m >= PALETTE_ANIM_START) { - Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v); + Colour colour = AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v); colour.a = src->a; srcABCD = _mm_cvtsi32_si128(colour.data); } else { srcABCD = _mm_cvtsi32_si128(src->data); } - ALPHA_BLEND_2(); - dst->data = _mm_cvtsi128_si32(srcABCD); + dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); } } break; @@ -162,24 +161,36 @@ bmno_full_transparency: const uint m1 = (byte) (mvX2 >> 16); const uint r1 = remap[m1]; if (mvX2 & 0x00FF00FF) { - /* Written so the compiler uses CMOV. */ - const Colour src0 = src[0]; - const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000); - Colour c0 = dst[0]; - c0 = r0 == 0 ? c0 : c0map; - c0 = m0 != 0 ? c0 : src0; - srcABCD = _mm_cvtsi32_si128(c0.data); - - const Colour src1 = src[1]; - const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000); - Colour c1 = dst[1]; - c1 = r1 == 0 ? c1 : c1map; - c1 = m1 != 0 ? c1 : src1; - INSR32(c1.data, srcABCD, 1); + #define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \ + /* Written so the compiler uses CMOV. */ \ + Colour m_colour = m_colour_init; \ + { \ + const Colour srcm = (Colour) (m_src); \ + const uint m = (byte) (m_m); \ + const uint r = remap[m]; \ + const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \ + m_colour = r == 0 ? m_colour : cmap; \ + m_colour = m != 0 ? m_colour : srcm; \ + } +#ifdef _SQ64 + uint64 srcs = _mm_cvtsi128_si64(srcABCD); + uint64 dsts = _mm_cvtsi128_si64(dstABCD); + uint64 remapped_src = 0; + CMOV_REMAP(c0, dsts, srcs, mvX2); + remapped_src = c0.data; + CMOV_REMAP(c1, dsts >> 32, srcs >> 32, mvX2 >> 16); + remapped_src |= (uint64) c1.data << 32; + srcABCD = _mm_cvtsi64_si128(remapped_src); +#else + Colour remapped_src[2]; + CMOV_REMAP(c0, _mm_cvtsi128_si32(dstABCD), _mm_cvtsi128_si32(srcABCD), mvX2); + remapped_src[0] = c0.data; + CMOV_REMAP(c1, dst[1], src[1], mvX2 >> 16); + remapped_src[1] = c1.data; + srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src); +#endif - if ((mvX2 & 0xFF00FF00) != 0x80008000) { - ADJUST_BRIGHTNESS_2(srcABCD, mvX2); - } + if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2); } /* Update anim buffer. */ @@ -211,7 +222,7 @@ bmno_full_transparency: /* Blend colours. */ bmcr_alpha_blend: - ALPHA_BLEND_2(); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); bmcr_full_opacity: _mm_storel_epi64((__m128i *) dst, srcABCD); bmcr_full_transparency: @@ -229,7 +240,7 @@ bmcr_full_transparency: const uint r = remap[src_mv->m]; *anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0; if (r != 0) { - Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); + Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v); if (src->a == 255) { *dst = remapped_colour; } else { @@ -244,7 +255,7 @@ bmcr_full_transparency: if (src->a < 255) { bmcr_alpha_blend_single: __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); } dst->data = _mm_cvtsi128_si32(srcABCD); } @@ -256,8 +267,7 @@ bmcr_alpha_blend_single: for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - DARKEN_2(); - _mm_storel_epi64((__m128i *) dst, dstAB); + _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base)); src += 2; dst += 2; anim += 2; @@ -268,8 +278,7 @@ bmcr_alpha_blend_single: if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - DARKEN_2(); - dst->data = _mm_cvtsi128_si32(dstAB); + dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base)); if (src[0].a) anim[0] = 0; } break; @@ -318,13 +327,4 @@ void Blitter_32bppSSE4_Anim::Draw(Blitter::BlitterParams *bp, BlitterMode mode, } } -/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */ -inline Colour Blitter_32bppSSE4_Anim::AdjustBrightness(Colour colour, uint8 brightness) -{ - /* Shortcut for normal brightness. */ - if (brightness == DEFAULT_BRIGHTNESS) return colour; - - return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness); -} - #endif /* WITH_SSE */ diff --git a/src/blitter/32bpp_anim_sse4.hpp b/src/blitter/32bpp_anim_sse4.hpp index 0f1131c88..9a3f93ca8 100644 --- a/src/blitter/32bpp_anim_sse4.hpp +++ b/src/blitter/32bpp_anim_sse4.hpp @@ -14,6 +14,9 @@ #ifdef WITH_SSE +#ifndef SSE_VERSION +#define SSE_VERSION 4 +#endif #include "32bpp_anim.hpp" #include "32bpp_sse4.hpp" @@ -28,11 +31,9 @@ public: template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last> /* virtual */ void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom); /* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom); - /* virtual */ Colour AdjustBrightness(Colour colour, uint8 brightness); /* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator) { return Blitter_32bppSSE_Base::Encode(sprite, allocator); } - /* virtual */ const char *GetName() { return "32bpp-sse4-anim"; } }; diff --git a/src/blitter/32bpp_sse2.cpp b/src/blitter/32bpp_sse2.cpp index 0b3eb1899..49fb28c35 100644 --- a/src/blitter/32bpp_sse2.cpp +++ b/src/blitter/32bpp_sse2.cpp @@ -73,8 +73,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel for (uint x = (uint) effective_width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - ALPHA_BLEND_2(); - _mm_storel_epi64((__m128i*) dst, srcABCD); + _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi)); src += 2; dst += 2; } @@ -82,8 +81,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); - dst->data = _mm_cvtsi128_si32(srcABCD); + dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi)); } break; @@ -94,7 +92,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src_mv->m) { const uint r = remap[src_mv->m]; if (r != 0) { - Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); + Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v); if (src->a == 255) { *dst = remapped_colour; } else { @@ -108,7 +106,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src->a < 255) { bmcr_alpha_blend_single: __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi); } dst->data = _mm_cvtsi128_si32(srcABCD); } @@ -123,8 +121,7 @@ bmcr_alpha_blend_single: for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - DARKEN_2(); - _mm_storel_epi64((__m128i *) dst, dstAB); + _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base)); src += 2; dst += 2; } @@ -132,8 +129,7 @@ bmcr_alpha_blend_single: if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - DARKEN_2(); - dst->data = _mm_cvtsi128_si32(dstAB); + dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base)); } break; } @@ -235,7 +231,7 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca dst_mv->v = (rgb_max == 0) ? Blitter_32bppBase::DEFAULT_BRIGHTNESS : rgb_max; /* Pre-convert the mapping channel to a RGB value. */ - const Colour colour = AdjustBrightness(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v); + const Colour colour = AdjustBrightneSSE(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v); dst_rgba->r = colour.r; dst_rgba->g = colour.g; dst_rgba->b = colour.b; @@ -282,47 +278,4 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca return dst_sprite; } -/** ReallyAdjustBrightness() is not called that often. - * Inlining this function implies a far jump, which has a huge latency. - */ -inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightness) -{ - /* Shortcut for normal brightness. */ - if (brightness == DEFAULT_BRIGHTNESS) return colour; - - return Blitter_32bppSSE2::ReallyAdjustBrightness(colour, brightness); -} - -IGNORE_UNINITIALIZED_WARNING_START -Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness) -{ - uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; - c16 *= brightness; - uint64 c16_ob = c16; // Helps out of order execution. - c16 /= DEFAULT_BRIGHTNESS; - c16 &= 0x01FF01FF01FF; - - /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */ - c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16; - const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2; - - const uint32 alpha32 = colour.data & 0xFF000000; - __m128i ret; - LOAD64(c16, ret); - if (ob != 0) { - __m128i ob128 = _mm_cvtsi32_si128(ob); - ob128 = _mm_shufflelo_epi16(ob128, 0xC0); - __m128i white = OVERBRIGHT_VALUE_MASK; - __m128i c128 = ret; - ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */ - ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */ - ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */ - ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */ - } - - ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ - return alpha32 | _mm_cvtsi128_si32(ret); -} -IGNORE_UNINITIALIZED_WARNING_STOP - #endif /* WITH_SSE */ diff --git a/src/blitter/32bpp_sse2.hpp b/src/blitter/32bpp_sse2.hpp index 1c3307c70..3bab0d752 100644 --- a/src/blitter/32bpp_sse2.hpp +++ b/src/blitter/32bpp_sse2.hpp @@ -14,91 +14,10 @@ #ifdef WITH_SSE -#include "32bpp_simple.hpp" -#include "emmintrin.h" - -#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite. -#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL. -#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP. - -#ifdef _MSC_VER - #define ALIGN(n) __declspec(align(n)) -#else - #define ALIGN(n) __attribute__ ((aligned (n))) -#endif - -typedef union ALIGN(16) um128i { - __m128i m128i; - uint8 m128i_u8[16]; - uint16 m128i_u16[8]; - uint32 m128i_u32[4]; - uint64 m128i_u64[2]; -} um128i; - -#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0) -#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1) -#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1) -#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1) -#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2) -#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0) -#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0) -#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) -#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7) -#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) - -#define EXTR32(m_from, m_rank) (*(um128i*) &m_from).m128i_u32[m_rank] -#define EXTR64(m_from, m_rank) (*(um128i*) &m_from).m128i_u64[m_rank] -#define INSR32(m_val, m_into, m_rank) { \ - (*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, m_val, (m_rank)*2); \ - (*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, (m_val) >> 16, (m_rank)*2 + 1); \ -} -#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i_u64[m_rank] = (m_val) - -#ifdef _SQ64 - #define LOAD64(m_val, m_into) m_into = _mm_cvtsi64_si128(m_val); -#else - #define LOAD64(m_val, m_into) INSR64(m_val, m_into, 0) +#ifndef SSE_VERSION +#define SSE_VERSION 2 #endif - -/* PUT_ALPHA_IN_FRONT_OF_RGB is redefined in 32bpp_ssse3.hpp. */ -#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) \ - m_into = _mm_shufflelo_epi16(m_from, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \ - m_into = _mm_shufflehi_epi16(m_into, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */ - -/* PACK_AB_WITHOUT_SATURATION is redefined in 32bpp_ssse3.hpp. */ -#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) \ - m_from = _mm_and_si128(m_from, clear_hi); /* PAND, wipe high bytes to keep low bytes when packing */ \ - m_into = _mm_packus_epi16(m_from, m_from); /* PACKUSWB, pack 2 colours (with saturation) */ - -/* Alpha blend 2 pixels. */ -#define ALPHA_BLEND_2() { \ - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, expand each uint8 into uint16 */ \ - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \ - \ - __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); /* PCMPGTW, if (alpha > 0) a++; */ \ - alphaAB = _mm_srli_epi16(alphaAB, 15); \ - alphaAB = _mm_add_epi16(alphaAB, srcAB); \ - PUT_ALPHA_IN_FRONT_OF_RGB(alphaAB, alphaAB); \ - \ - srcAB = _mm_sub_epi16(srcAB, dstAB); /* PSUBW, (r - Cr) */ \ - srcAB = _mm_mullo_epi16(srcAB, alphaAB); /* PMULLW, a*(r - Cr) */ \ - srcAB = _mm_srli_epi16(srcAB, 8); /* PSRLW, a*(r - Cr)/256 */ \ - srcAB = _mm_add_epi16(srcAB, dstAB); /* PADDW, a*(r - Cr)/256 + Cr */ \ - PACK_AB_WITHOUT_SATURATION(srcAB, srcABCD); \ -} - -/* Darken 2 pixels. - * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) - */ -#define DARKEN_2() \ - __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); \ - __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \ - __m128i PUT_ALPHA_IN_FRONT_OF_RGB(srcAB, alphaAB); \ - alphaAB = _mm_srli_epi16(alphaAB, 2); /* Reduce to 64 levels of shades so the max value fits in 16 bits. */ \ - __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); \ - dstAB = _mm_mullo_epi16(dstAB, nom); \ - dstAB = _mm_srli_epi16(dstAB, 8); \ - dstAB = _mm_packus_epi16(dstAB, dstAB); +#include "32bpp_sse_func.hpp" /** Base methods for 32bpp SSE blitters. */ class Blitter_32bppSSE_Base { @@ -138,14 +57,11 @@ public: }; Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator); - virtual Colour AdjustBrightness(Colour colour, uint8 brightness) = 0; }; /** The SSE2 32 bpp blitter (without palette animation). */ class Blitter_32bppSSE2 : public Blitter_32bppSimple, public Blitter_32bppSSE_Base { public: - virtual Colour AdjustBrightness(Colour colour, uint8 brightness); - static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness); /* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom); template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last> void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom); diff --git a/src/blitter/32bpp_sse4.cpp b/src/blitter/32bpp_sse4.cpp index 619110cb6..1403d3659 100644 --- a/src/blitter/32bpp_sse4.cpp +++ b/src/blitter/32bpp_sse4.cpp @@ -74,8 +74,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel for (uint x = (uint) effective_width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - ALPHA_BLEND_2(); - _mm_storel_epi64((__m128i*) dst, srcABCD); + _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); src += 2; dst += 2; } @@ -83,8 +82,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); - dst->data = _mm_cvtsi128_si32(srcABCD); + dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); } break; @@ -96,33 +94,39 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel /* Remap colours. */ if (mvX2 & 0x00FF00FF) { - /* Written so the compiler uses CMOV. */ - const Colour src0 = src[0]; - const uint m0 = (byte) mvX2; - const uint r0 = remap[m0]; - const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000); - Colour c0 = 0; // Use alpha of 0 to keep dst as is. - c0 = r0 == 0 ? c0 : c0map; - c0 = m0 != 0 ? c0 : src0; - srcABCD = _mm_cvtsi32_si128(c0.data); - - const Colour src1 = src[1]; - const uint m1 = (byte) (mvX2 >> 16); - const uint r1 = remap[m1]; - const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000); - Colour c1 = 0; - c1 = r1 == 0 ? c1 : c1map; - c1 = m1 != 0 ? c1 : src1; - INSR32(c1.data, srcABCD, 1); - - if ((mvX2 & 0xFF00FF00) != 0x80008000) { - ADJUST_BRIGHTNESS_2(srcABCD, mvX2); - } + #define CMOV_REMAP(m_colour, m_src, m_m) \ + /* Written so the compiler uses CMOV. */ \ + Colour m_colour = 0; \ + { \ + const Colour srcm = (Colour) (m_src); \ + const uint m = (byte) (m_m); \ + const uint r = remap[m]; \ + const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \ + m_colour = r == 0 ? m_colour : cmap; \ + m_colour = m != 0 ? m_colour : srcm; \ + } +#ifdef _SQ64 + uint64 srcs = _mm_cvtsi128_si64(srcABCD); + uint64 remapped_src = 0; + CMOV_REMAP(c0, srcs, mvX2); + remapped_src = c0.data; + CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16); + remapped_src |= (uint64) c1.data << 32; + srcABCD = _mm_cvtsi64_si128(remapped_src); +#else + Colour remapped_src[2]; + CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2); + remapped_src[0] = c0.data; + CMOV_REMAP(c1, src[1], mvX2 >> 16); + remapped_src[1] = c1.data; + srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src); +#endif + + if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2); } /* Blend colours. */ - ALPHA_BLEND_2(); - _mm_storel_epi64((__m128i *) dst, srcABCD); + _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); dst += 2; src += 2; src_mv += 2; @@ -134,7 +138,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src_mv->m) { const uint r = remap[src_mv->m]; if (r != 0) { - Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); + Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v); if (src->a == 255) { *dst = remapped_colour; } else { @@ -148,7 +152,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src->a < 255) { bmcr_alpha_blend_single: __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); } dst->data = _mm_cvtsi128_si32(srcABCD); } @@ -160,8 +164,7 @@ bmcr_alpha_blend_single: for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - DARKEN_2(); - _mm_storel_epi64((__m128i *) dst, dstAB); + _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base)); src += 2; dst += 2; } @@ -169,8 +172,7 @@ bmcr_alpha_blend_single: if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - DARKEN_2(); - dst->data = _mm_cvtsi128_si32(dstAB); + dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base)); } break; } @@ -217,45 +219,4 @@ void Blitter_32bppSSE4::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomL } } -/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */ -inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightness) -{ - /* Shortcut for normal brightness. */ - if (brightness == DEFAULT_BRIGHTNESS) return colour; - - return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness); -} - -IGNORE_UNINITIALIZED_WARNING_START -Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness) -{ - uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; - c16 *= brightness; - uint64 c16_ob = c16; // Helps out of order execution. - c16 /= DEFAULT_BRIGHTNESS; - c16 &= 0x01FF01FF01FF; - - /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */ - c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16; - const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2; - - const uint32 alpha32 = colour.data & 0xFF000000; - __m128i ret; - LOAD64(c16, ret); - if (ob != 0) { - __m128i ob128 = _mm_cvtsi32_si128(ob); - ob128 = _mm_shufflelo_epi16(ob128, 0xC0); - __m128i white = OVERBRIGHT_VALUE_MASK; - __m128i c128 = ret; - ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */ - ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */ - ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */ - ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */ - } - - ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ - return alpha32 | _mm_cvtsi128_si32(ret); -} -IGNORE_UNINITIALIZED_WARNING_STOP - #endif /* WITH_SSE */ diff --git a/src/blitter/32bpp_sse4.hpp b/src/blitter/32bpp_sse4.hpp index f8a563b85..7a3332d87 100644 --- a/src/blitter/32bpp_sse4.hpp +++ b/src/blitter/32bpp_sse4.hpp @@ -14,41 +14,14 @@ #ifdef WITH_SSE -#include "32bpp_ssse3.hpp" -#include "smmintrin.h" - -#undef EXTR32 -#define EXTR32(m_from, m_rank) _mm_extract_epi32((*(um128i*) &m_from).m128i, m_rank) -#undef INSR32 -#define INSR32(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, m_val, m_rank) - -IGNORE_UNINITIALIZED_WARNING_START -#ifdef _SQ64 - #undef INSR64 - #define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi64((*(um128i*) &m_into).m128i, m_val, m_rank) -#else - typedef union { uint64 u64; struct _u32 { uint32 low, high; } u32; } u6432; - #undef INSR64 - #define INSR64(m_val, m_into, m_rank) { \ - u6432 v; \ - v.u64 = m_val; \ - (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.low, (m_rank)*2); \ - (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.high, (m_rank)*2 + 1); \ - } - - #undef LOAD64 - #define LOAD64(m_val, m_into) \ - m_into = _mm_cvtsi32_si128(m_val); \ - INSR32((m_val) >> 32, m_into, 1); +#ifndef SSE_VERSION +#define SSE_VERSION 4 #endif -IGNORE_UNINITIALIZED_WARNING_STOP +#include "32bpp_ssse3.hpp" /** The SSE4 32 bpp blitter (without palette animation). */ class Blitter_32bppSSE4 : public Blitter_32bppSSSE3 { public: - Colour AdjustBrightness(Colour colour, uint8 brightness); - static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness); - /* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom); template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last> void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom); diff --git a/src/blitter/32bpp_sse_func.hpp b/src/blitter/32bpp_sse_func.hpp new file mode 100644 index 000000000..d6febcf49 --- /dev/null +++ b/src/blitter/32bpp_sse_func.hpp @@ -0,0 +1,225 @@ +/* $Id$ */ + +/* + * This file is part of OpenTTD. + * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2. + * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>. + */ + +/** @file 32bpp_sse_base.hpp Functions related to SSE 32 bpp blitter. */ + +#ifndef BLITTER_32BPP_SSE_BASE_HPP +#define BLITTER_32BPP_SSE_BASE_HPP + +#ifdef WITH_SSE + +#include "32bpp_simple.hpp" +#if (SSE_VERSION == 2) +#include <emmintrin.h> +#elif (SSE_VERSION == 3) +#include <tmmintrin.h> +#elif (SSE_VERSION == 4) +#include <smmintrin.h> +#endif + +#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite. +#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL. +#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP. + +#ifdef _MSC_VER + #define ALIGN(n) __declspec(align(n)) +#else + #define ALIGN(n) __attribute__ ((aligned (n))) +#endif + +typedef union ALIGN(16) um128i { + __m128i m128i; + uint8 m128i_u8[16]; + uint16 m128i_u16[8]; + uint32 m128i_u32[4]; + uint64 m128i_u64[2]; +} um128i; + +#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0) +#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1) +#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1) +#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1) +#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2) +#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0) +#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0) +#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) +#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7) +#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) + +static inline void InsertFirstUint32(const uint32 value, __m128i &into) +{ +#if (SSE_VERSION >= 4) + into = _mm_insert_epi32(into, value, 0); +#else + NOT_REACHED(); +#endif +} + +static inline void InsertSecondUint32(const uint32 value, __m128i &into) +{ +#if (SSE_VERSION >= 4) + into = _mm_insert_epi32(into, value, 1); +#else + into = _mm_insert_epi16(into, value, 2); + into = _mm_insert_epi16(into, value >> 16, 3); +#endif +} + +static inline void LoadUint64(const uint64 value, __m128i &into) +{ +#ifdef _SQ64 + into = _mm_cvtsi64_si128(value); +#else + #if (SSE_VERSION >= 4) + into = _mm_cvtsi32_si128(value); + InsertSecondUint32(value >> 32, into); + #else + (*(um128i*) &into).m128i_u64[0] = value; + #endif +#endif +} + +static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask) +{ +#if (SSE_VERSION == 2) + from = _mm_and_si128(from, mask); // PAND, wipe high bytes to keep low bytes when packing + return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation) +#else + return _mm_shuffle_epi8(from, mask); +#endif +} + +static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask) +{ +#if (SSE_VERSION == 2) + __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1 + return _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2 +#else + return _mm_shuffle_epi8(from, mask); +#endif +} + +static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask) +{ + __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16 + __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); + + __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++; + alphaAB = _mm_srli_epi16(alphaAB, 15); + alphaAB = _mm_add_epi16(alphaAB, srcAB); + alphaAB = DistributeAlpha(alphaAB, distribution_mask); + + srcAB = _mm_sub_epi16(srcAB, dstAB); // PSUBW, (r - Cr) + srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr) + srcAB = _mm_srli_epi16(srcAB, 8); // PSRLW, a*(r - Cr)/256 + srcAB = _mm_add_epi16(srcAB, dstAB); // PADDW, a*(r - Cr)/256 + Cr + return PackUnsaturated(srcAB, pack_mask); +} + +/* Darken 2 pixels. + * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) + */ +static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base) +{ + __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); + __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask); + alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. + __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); + dstAB = _mm_mullo_epi16(dstAB, nom); + dstAB = _mm_srli_epi16(dstAB, 8); + return _mm_packus_epi16(dstAB, dstAB); +} + +IGNORE_UNINITIALIZED_WARNING_START +static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness) +{ + uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; + c16 *= brightness; + uint64 c16_ob = c16; // Helps out of order execution. + c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS; + c16 &= 0x01FF01FF01FF; + + /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */ + c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16; + const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2; + + const uint32 alpha32 = colour.data & 0xFF000000; + __m128i ret; + LoadUint64(c16, ret); + if (ob != 0) { + __m128i ob128 = _mm_cvtsi32_si128(ob); + ob128 = _mm_shufflelo_epi16(ob128, 0xC0); + __m128i white = OVERBRIGHT_VALUE_MASK; + __m128i c128 = ret; + ret = _mm_subs_epu16(white, c128); // PSUBUSW, (255 - rgb) + ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb) + ret = _mm_srli_epi16(ret, 8); // PSRLW, ob*(255 - rgb)/256 + ret = _mm_add_epi16(ret, c128); // PADDW, ob*(255 - rgb)/256 + rgb + } + + ret = _mm_packus_epi16(ret, ret); // PACKUSWB, saturate and pack. + return alpha32 | _mm_cvtsi128_si32(ret); +} +IGNORE_UNINITIALIZED_WARNING_STOP + +/** ReallyAdjustBrightness() is not called that often. + * Inlining this function implies a far jump, which has a huge latency. + */ +static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness) +{ + /* Shortcut for normal brightness. */ + if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour; + + return ReallyAdjustBrightness(colour, brightness); +} + +static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness) +{ +#if (SSE_VERSION < 3) + NOT_REACHED(); +#else + /* The following dataflow differs from the one of AdjustBrightness() only for alpha. + * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a). + * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div. + */ + brightness &= 0xFF00FF00; + brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS; + + __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128()); + __m128i briAB = _mm_cvtsi32_si128(brightness); + briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2. + colAB = _mm_mullo_epi16(colAB, briAB); + __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); + colAB = _mm_srli_epi16(colAB, 7); + + /* Sum overbright. + * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright. + * -255 is changed in -256 so we just have to take the 8 lower bits into account. + */ + colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER); + colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK); + colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); + colAB_ob = _mm_and_si128(colAB_ob, colAB); + __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128()); + + obAB = _mm_srli_epi16(obAB, 1); // Reduce overbright strength. + obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); + __m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white. + retAB = _mm_subs_epu16(retAB, colAB); // (255 - rgb) + retAB = _mm_mullo_epi16(retAB, obAB); // ob*(255 - rgb) + retAB = _mm_srli_epi16(retAB, 8); // ob*(255 - rgb)/256 + retAB = _mm_add_epi16(retAB, colAB); // ob*(255 - rgb)/256 + rgb + + return _mm_packus_epi16(retAB, retAB); +#endif +} + +#endif /* WITH_SSE */ +#endif /* BLITTER_32BPP_SSE_BASE_HPP */ diff --git a/src/blitter/32bpp_ssse3.cpp b/src/blitter/32bpp_ssse3.cpp index 9cee7dbf9..3c42d359c 100644 --- a/src/blitter/32bpp_ssse3.cpp +++ b/src/blitter/32bpp_ssse3.cpp @@ -74,8 +74,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel for (uint x = (uint) effective_width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - ALPHA_BLEND_2(); - _mm_storel_epi64((__m128i*) dst, srcABCD); + _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); src += 2; dst += 2; } @@ -83,8 +82,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); - dst->data = _mm_cvtsi128_si32(srcABCD); + dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); } break; @@ -96,33 +94,39 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel /* Remap colours. */ if (mvX2 & 0x00FF00FF) { - /* Written so the compiler uses CMOV. */ - const Colour src0 = src[0]; - const uint m0 = (byte) mvX2; - const uint r0 = remap[m0]; - const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000); - Colour c0 = 0; // Use alpha of 0 to keep dst as is. - c0 = r0 == 0 ? c0 : c0map; - c0 = m0 != 0 ? c0 : src0; - srcABCD = _mm_cvtsi32_si128(c0.data); - - const Colour src1 = src[1]; - const uint m1 = (byte) (mvX2 >> 16); - const uint r1 = remap[m1]; - const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000); - Colour c1 = 0; - c1 = r1 == 0 ? c1 : c1map; - c1 = m1 != 0 ? c1 : src1; - INSR32(c1.data, srcABCD, 1); - - if ((mvX2 & 0xFF00FF00) != 0x80008000) { - ADJUST_BRIGHTNESS_2(srcABCD, mvX2); - } + #define CMOV_REMAP(m_colour, m_src, m_m) \ + /* Written so the compiler uses CMOV. */ \ + Colour m_colour = 0; \ + { \ + const Colour srcm = (Colour) (m_src); \ + const uint m = (byte) (m_m); \ + const uint r = remap[m]; \ + const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \ + m_colour = r == 0 ? m_colour : cmap; \ + m_colour = m != 0 ? m_colour : srcm; \ + } +#ifdef _SQ64 + uint64 srcs = _mm_cvtsi128_si64(srcABCD); + uint64 remapped_src = 0; + CMOV_REMAP(c0, srcs, mvX2); + remapped_src = c0.data; + CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16); + remapped_src |= (uint64) c1.data << 32; + srcABCD = _mm_cvtsi64_si128(remapped_src); +#else + Colour remapped_src[2]; + CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2); + remapped_src[0] = c0.data; + CMOV_REMAP(c1, src[1], mvX2 >> 16); + remapped_src[1] = c1.data; + srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src); +#endif + + if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2); } /* Blend colours. */ - ALPHA_BLEND_2(); - _mm_storel_epi64((__m128i *) dst, srcABCD); + _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm)); dst += 2; src += 2; src_mv += 2; @@ -134,7 +138,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src_mv->m) { const uint r = remap[src_mv->m]; if (r != 0) { - Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); + Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v); if (src->a == 255) { *dst = remapped_colour; } else { @@ -148,7 +152,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel if (src->a < 255) { bmcr_alpha_blend_single: __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - ALPHA_BLEND_2(); + srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm); } dst->data = _mm_cvtsi128_si32(srcABCD); } @@ -160,8 +164,7 @@ bmcr_alpha_blend_single: for (uint x = (uint) bp->width / 2; x > 0; x--) { __m128i srcABCD = _mm_loadl_epi64((const __m128i*) src); __m128i dstABCD = _mm_loadl_epi64((__m128i*) dst); - DARKEN_2(); - _mm_storel_epi64((__m128i *) dst, dstAB); + _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base)); src += 2; dst += 2; } @@ -169,8 +172,7 @@ bmcr_alpha_blend_single: if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) { __m128i srcABCD = _mm_cvtsi32_si128(src->data); __m128i dstABCD = _mm_cvtsi32_si128(dst->data); - DARKEN_2(); - dst->data = _mm_cvtsi128_si32(dstAB); + dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base)); } break; } diff --git a/src/blitter/32bpp_ssse3.hpp b/src/blitter/32bpp_ssse3.hpp index 23e6e4878..090dd5c42 100644 --- a/src/blitter/32bpp_ssse3.hpp +++ b/src/blitter/32bpp_ssse3.hpp @@ -14,53 +14,10 @@ #ifdef WITH_SSE +#ifndef SSE_VERSION +#define SSE_VERSION 3 +#endif #include "32bpp_sse2.hpp" -#include "tmmintrin.h" - -/* Use PSHUFB instead of PSHUFHW+PSHUFLW. */ -#undef PUT_ALPHA_IN_FRONT_OF_RGB -#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, a_cm); - -#undef PACK_AB_WITHOUT_SATURATION -#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, pack_low_cm); - -/* Adjust brightness of 2 pixels. */ -#define ADJUST_BRIGHTNESS_2(m_colourX2, m_brightnessX2) \ - /* The following dataflow differs from the one of AdjustBrightness() only for alpha. - * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a). - * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div. - */ \ - m_brightnessX2 &= 0xFF00FF00; \ - m_brightnessX2 += DEFAULT_BRIGHTNESS; \ - \ - __m128i zero = _mm_setzero_si128(); \ - __m128i colAB = _mm_unpacklo_epi8(m_colourX2, zero); \ - \ - __m128i briAB = _mm_cvtsi32_si128(m_brightnessX2); \ - briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \ - colAB = _mm_mullo_epi16(colAB, briAB); \ - __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \ - colAB = _mm_srli_epi16(colAB, 7); \ - \ - /* Sum overbright. - * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright. - * -255 is changed in -256 so we just have to take the 8 lower bits into account. - */ \ - colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER); \ - colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK); \ - colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); \ - colAB_ob = _mm_and_si128(colAB_ob, colAB); \ - __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, zero), zero); \ - \ - obAB = _mm_srli_epi16(obAB, 1); /* Reduce overbright strength. */ \ - obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); \ - __m128i retAB = OVERBRIGHT_VALUE_MASK; /* ob_mask is equal to white. */ \ - retAB = _mm_subs_epu16(retAB, colAB); /* (255 - rgb) */ \ - retAB = _mm_mullo_epi16(retAB, obAB); /* ob*(255 - rgb) */ \ - retAB = _mm_srli_epi16(retAB, 8); /* ob*(255 - rgb)/256 */ \ - retAB = _mm_add_epi16(retAB, colAB); /* ob*(255 - rgb)/256 + rgb */ \ - \ - m_colourX2 = _mm_packus_epi16(retAB, retAB); /** The SSSE3 32 bpp blitter (without palette animation). */ class Blitter_32bppSSSE3 : public Blitter_32bppSSE2 { |