diff options
Diffstat (limited to 'src/blitter/32bpp_sse_func.hpp')
-rw-r--r-- | src/blitter/32bpp_sse_func.hpp | 225 |
1 files changed, 225 insertions, 0 deletions
diff --git a/src/blitter/32bpp_sse_func.hpp b/src/blitter/32bpp_sse_func.hpp new file mode 100644 index 000000000..d6febcf49 --- /dev/null +++ b/src/blitter/32bpp_sse_func.hpp @@ -0,0 +1,225 @@ +/* $Id$ */ + +/* + * This file is part of OpenTTD. + * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2. + * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>. + */ + +/** @file 32bpp_sse_base.hpp Functions related to SSE 32 bpp blitter. */ + +#ifndef BLITTER_32BPP_SSE_BASE_HPP +#define BLITTER_32BPP_SSE_BASE_HPP + +#ifdef WITH_SSE + +#include "32bpp_simple.hpp" +#if (SSE_VERSION == 2) +#include <emmintrin.h> +#elif (SSE_VERSION == 3) +#include <tmmintrin.h> +#elif (SSE_VERSION == 4) +#include <smmintrin.h> +#endif + +#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite. +#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL. +#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP. + +#ifdef _MSC_VER + #define ALIGN(n) __declspec(align(n)) +#else + #define ALIGN(n) __attribute__ ((aligned (n))) +#endif + +typedef union ALIGN(16) um128i { + __m128i m128i; + uint8 m128i_u8[16]; + uint16 m128i_u16[8]; + uint32 m128i_u32[4]; + uint64 m128i_u64[2]; +} um128i; + +#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0) +#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1) +#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1) +#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1) +#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2) +#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0) +#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0) +#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) +#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7) +#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) + +static inline void InsertFirstUint32(const uint32 value, __m128i &into) +{ +#if (SSE_VERSION >= 4) + into = _mm_insert_epi32(into, value, 0); +#else + NOT_REACHED(); +#endif +} + +static inline void InsertSecondUint32(const uint32 value, __m128i &into) +{ +#if (SSE_VERSION >= 4) + into = _mm_insert_epi32(into, value, 1); +#else + into = _mm_insert_epi16(into, value, 2); + into = _mm_insert_epi16(into, value >> 16, 3); +#endif +} + +static inline void LoadUint64(const uint64 value, __m128i &into) +{ +#ifdef _SQ64 + into = _mm_cvtsi64_si128(value); +#else + #if (SSE_VERSION >= 4) + into = _mm_cvtsi32_si128(value); + InsertSecondUint32(value >> 32, into); + #else + (*(um128i*) &into).m128i_u64[0] = value; + #endif +#endif +} + +static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask) +{ +#if (SSE_VERSION == 2) + from = _mm_and_si128(from, mask); // PAND, wipe high bytes to keep low bytes when packing + return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation) +#else + return _mm_shuffle_epi8(from, mask); +#endif +} + +static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask) +{ +#if (SSE_VERSION == 2) + __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1 + return _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2 +#else + return _mm_shuffle_epi8(from, mask); +#endif +} + +static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask) +{ + __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16 + __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); + + __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++; + alphaAB = _mm_srli_epi16(alphaAB, 15); + alphaAB = _mm_add_epi16(alphaAB, srcAB); + alphaAB = DistributeAlpha(alphaAB, distribution_mask); + + srcAB = _mm_sub_epi16(srcAB, dstAB); // PSUBW, (r - Cr) + srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr) + srcAB = _mm_srli_epi16(srcAB, 8); // PSRLW, a*(r - Cr)/256 + srcAB = _mm_add_epi16(srcAB, dstAB); // PADDW, a*(r - Cr)/256 + Cr + return PackUnsaturated(srcAB, pack_mask); +} + +/* Darken 2 pixels. + * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4) + */ +static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base) +{ + __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); + __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask); + alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits. + __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); + dstAB = _mm_mullo_epi16(dstAB, nom); + dstAB = _mm_srli_epi16(dstAB, 8); + return _mm_packus_epi16(dstAB, dstAB); +} + +IGNORE_UNINITIALIZED_WARNING_START +static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness) +{ + uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; + c16 *= brightness; + uint64 c16_ob = c16; // Helps out of order execution. + c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS; + c16 &= 0x01FF01FF01FF; + + /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */ + c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16; + const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2; + + const uint32 alpha32 = colour.data & 0xFF000000; + __m128i ret; + LoadUint64(c16, ret); + if (ob != 0) { + __m128i ob128 = _mm_cvtsi32_si128(ob); + ob128 = _mm_shufflelo_epi16(ob128, 0xC0); + __m128i white = OVERBRIGHT_VALUE_MASK; + __m128i c128 = ret; + ret = _mm_subs_epu16(white, c128); // PSUBUSW, (255 - rgb) + ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb) + ret = _mm_srli_epi16(ret, 8); // PSRLW, ob*(255 - rgb)/256 + ret = _mm_add_epi16(ret, c128); // PADDW, ob*(255 - rgb)/256 + rgb + } + + ret = _mm_packus_epi16(ret, ret); // PACKUSWB, saturate and pack. + return alpha32 | _mm_cvtsi128_si32(ret); +} +IGNORE_UNINITIALIZED_WARNING_STOP + +/** ReallyAdjustBrightness() is not called that often. + * Inlining this function implies a far jump, which has a huge latency. + */ +static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness) +{ + /* Shortcut for normal brightness. */ + if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour; + + return ReallyAdjustBrightness(colour, brightness); +} + +static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness) +{ +#if (SSE_VERSION < 3) + NOT_REACHED(); +#else + /* The following dataflow differs from the one of AdjustBrightness() only for alpha. + * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a). + * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div. + */ + brightness &= 0xFF00FF00; + brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS; + + __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128()); + __m128i briAB = _mm_cvtsi32_si128(brightness); + briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2. + colAB = _mm_mullo_epi16(colAB, briAB); + __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); + colAB = _mm_srli_epi16(colAB, 7); + + /* Sum overbright. + * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright. + * -255 is changed in -256 so we just have to take the 8 lower bits into account. + */ + colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER); + colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK); + colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); + colAB_ob = _mm_and_si128(colAB_ob, colAB); + __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128()); + + obAB = _mm_srli_epi16(obAB, 1); // Reduce overbright strength. + obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); + __m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white. + retAB = _mm_subs_epu16(retAB, colAB); // (255 - rgb) + retAB = _mm_mullo_epi16(retAB, obAB); // ob*(255 - rgb) + retAB = _mm_srli_epi16(retAB, 8); // ob*(255 - rgb)/256 + retAB = _mm_add_epi16(retAB, colAB); // ob*(255 - rgb)/256 + rgb + + return _mm_packus_epi16(retAB, retAB); +#endif +} + +#endif /* WITH_SSE */ +#endif /* BLITTER_32BPP_SSE_BASE_HPP */ |