summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--projects/openttd_vs100.vcxproj1
-rw-r--r--projects/openttd_vs100.vcxproj.filters3
-rw-r--r--projects/openttd_vs80.vcproj4
-rw-r--r--projects/openttd_vs90.vcproj4
-rw-r--r--source.list1
-rw-r--r--src/blitter/32bpp_anim_sse4.cpp80
-rw-r--r--src/blitter/32bpp_anim_sse4.hpp5
-rw-r--r--src/blitter/32bpp_sse2.cpp61
-rw-r--r--src/blitter/32bpp_sse2.hpp90
-rw-r--r--src/blitter/32bpp_sse4.cpp111
-rw-r--r--src/blitter/32bpp_sse4.hpp33
-rw-r--r--src/blitter/32bpp_sse_func.hpp225
-rw-r--r--src/blitter/32bpp_ssse3.cpp70
-rw-r--r--src/blitter/32bpp_ssse3.hpp49
14 files changed, 369 insertions, 368 deletions
diff --git a/projects/openttd_vs100.vcxproj b/projects/openttd_vs100.vcxproj
index 127077c77..ba10f2794 100644
--- a/projects/openttd_vs100.vcxproj
+++ b/projects/openttd_vs100.vcxproj
@@ -1123,6 +1123,7 @@
<ClInclude Include="..\src\blitter\32bpp_optimized.hpp" />
<ClCompile Include="..\src\blitter\32bpp_simple.cpp" />
<ClInclude Include="..\src\blitter\32bpp_simple.hpp" />
+ <ClInclude Include="..\src\blitter\32bpp_sse_func.hpp" />
<ClCompile Include="..\src\blitter\32bpp_sse2.cpp" />
<ClInclude Include="..\src\blitter\32bpp_sse2.hpp" />
<ClCompile Include="..\src\blitter\32bpp_sse4.cpp" />
diff --git a/projects/openttd_vs100.vcxproj.filters b/projects/openttd_vs100.vcxproj.filters
index 7ecffde58..613b1a3e1 100644
--- a/projects/openttd_vs100.vcxproj.filters
+++ b/projects/openttd_vs100.vcxproj.filters
@@ -2598,6 +2598,9 @@
<ClInclude Include="..\src\blitter\32bpp_simple.hpp">
<Filter>Blitters</Filter>
</ClInclude>
+ <ClInclude Include="..\src\blitter\32bpp_sse_func.hpp">
+ <Filter>Blitters</Filter>
+ </ClInclude>
<ClCompile Include="..\src\blitter\32bpp_sse2.cpp">
<Filter>Blitters</Filter>
</ClCompile>
diff --git a/projects/openttd_vs80.vcproj b/projects/openttd_vs80.vcproj
index a343c5acf..7dde340e9 100644
--- a/projects/openttd_vs80.vcproj
+++ b/projects/openttd_vs80.vcproj
@@ -3835,6 +3835,10 @@
>
</File>
<File
+ RelativePath=".\..\src\blitter\32bpp_sse_func.hpp"
+ >
+ </File>
+ <File
RelativePath=".\..\src\blitter\32bpp_sse2.cpp"
>
</File>
diff --git a/projects/openttd_vs90.vcproj b/projects/openttd_vs90.vcproj
index 0faf59fbf..3f14faacb 100644
--- a/projects/openttd_vs90.vcproj
+++ b/projects/openttd_vs90.vcproj
@@ -3832,6 +3832,10 @@
>
</File>
<File
+ RelativePath=".\..\src\blitter\32bpp_sse_func.hpp"
+ >
+ </File>
+ <File
RelativePath=".\..\src\blitter\32bpp_sse2.cpp"
>
</File>
diff --git a/source.list b/source.list
index 13a6e7239..04bc67d55 100644
--- a/source.list
+++ b/source.list
@@ -912,6 +912,7 @@ blitter/32bpp_optimized.hpp
blitter/32bpp_simple.cpp
blitter/32bpp_simple.hpp
#if SSE
+blitter/32bpp_sse_func.hpp
blitter/32bpp_sse2.cpp
blitter/32bpp_sse2.hpp
blitter/32bpp_sse4.cpp
diff --git a/src/blitter/32bpp_anim_sse4.cpp b/src/blitter/32bpp_anim_sse4.cpp
index e8873d5d3..ae1b34d69 100644
--- a/src/blitter/32bpp_anim_sse4.cpp
+++ b/src/blitter/32bpp_anim_sse4.cpp
@@ -83,12 +83,12 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
const byte m0 = mvX2;
if (m0 >= PALETTE_ANIM_START) {
const Colour c0 = (this->LookupColourInPalette(m0).data & 0x00FFFFFF) | (src[0].data & 0xFF000000);
- INSR32(AdjustBrightness(c0, (byte) (mvX2 >> 8)).data, srcABCD, 0);
+ InsertFirstUint32(AdjustBrightneSSE(c0, (byte) (mvX2 >> 8)).data, srcABCD);
}
const byte m1 = mvX2 >> 16;
if (m1 >= PALETTE_ANIM_START) {
const Colour c1 = (this->LookupColourInPalette(m1).data & 0x00FFFFFF) | (src[1].data & 0xFF000000);
- INSR32(AdjustBrightness(c1, (byte) (mvX2 >> 24)).data, srcABCD, 1);
+ InsertSecondUint32(AdjustBrightneSSE(c1, (byte) (mvX2 >> 24)).data, srcABCD);
}
/* Update anim buffer. */
@@ -118,7 +118,7 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
/* Blend colours. */
bmno_alpha_blend:
- ALPHA_BLEND_2();
+ srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
bmno_full_opacity:
_mm_storel_epi64((__m128i *) dst, srcABCD);
bmno_full_transparency:
@@ -132,20 +132,19 @@ bmno_full_transparency:
if (src->a == 0) {
} else if (src->a == 255) {
*anim = *(const uint16*) src_mv;
- *dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
+ *dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
} else {
*anim = 0;
__m128i srcABCD;
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
if (src_mv->m >= PALETTE_ANIM_START) {
- Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v);
+ Colour colour = AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v);
colour.a = src->a;
srcABCD = _mm_cvtsi32_si128(colour.data);
} else {
srcABCD = _mm_cvtsi32_si128(src->data);
}
- ALPHA_BLEND_2();
- dst->data = _mm_cvtsi128_si32(srcABCD);
+ dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
}
}
break;
@@ -162,24 +161,36 @@ bmno_full_transparency:
const uint m1 = (byte) (mvX2 >> 16);
const uint r1 = remap[m1];
if (mvX2 & 0x00FF00FF) {
- /* Written so the compiler uses CMOV. */
- const Colour src0 = src[0];
- const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
- Colour c0 = dst[0];
- c0 = r0 == 0 ? c0 : c0map;
- c0 = m0 != 0 ? c0 : src0;
- srcABCD = _mm_cvtsi32_si128(c0.data);
-
- const Colour src1 = src[1];
- const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
- Colour c1 = dst[1];
- c1 = r1 == 0 ? c1 : c1map;
- c1 = m1 != 0 ? c1 : src1;
- INSR32(c1.data, srcABCD, 1);
+ #define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
+ /* Written so the compiler uses CMOV. */ \
+ Colour m_colour = m_colour_init; \
+ { \
+ const Colour srcm = (Colour) (m_src); \
+ const uint m = (byte) (m_m); \
+ const uint r = remap[m]; \
+ const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
+ m_colour = r == 0 ? m_colour : cmap; \
+ m_colour = m != 0 ? m_colour : srcm; \
+ }
+#ifdef _SQ64
+ uint64 srcs = _mm_cvtsi128_si64(srcABCD);
+ uint64 dsts = _mm_cvtsi128_si64(dstABCD);
+ uint64 remapped_src = 0;
+ CMOV_REMAP(c0, dsts, srcs, mvX2);
+ remapped_src = c0.data;
+ CMOV_REMAP(c1, dsts >> 32, srcs >> 32, mvX2 >> 16);
+ remapped_src |= (uint64) c1.data << 32;
+ srcABCD = _mm_cvtsi64_si128(remapped_src);
+#else
+ Colour remapped_src[2];
+ CMOV_REMAP(c0, _mm_cvtsi128_si32(dstABCD), _mm_cvtsi128_si32(srcABCD), mvX2);
+ remapped_src[0] = c0.data;
+ CMOV_REMAP(c1, dst[1], src[1], mvX2 >> 16);
+ remapped_src[1] = c1.data;
+ srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
+#endif
- if ((mvX2 & 0xFF00FF00) != 0x80008000) {
- ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
- }
+ if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
}
/* Update anim buffer. */
@@ -211,7 +222,7 @@ bmno_full_transparency:
/* Blend colours. */
bmcr_alpha_blend:
- ALPHA_BLEND_2();
+ srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
bmcr_full_opacity:
_mm_storel_epi64((__m128i *) dst, srcABCD);
bmcr_full_transparency:
@@ -229,7 +240,7 @@ bmcr_full_transparency:
const uint r = remap[src_mv->m];
*anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0;
if (r != 0) {
- Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+ Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@@ -244,7 +255,7 @@ bmcr_full_transparency:
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
+ srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@@ -256,8 +267,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- DARKEN_2();
- _mm_storel_epi64((__m128i *) dst, dstAB);
+ _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
src += 2;
dst += 2;
anim += 2;
@@ -268,8 +278,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- DARKEN_2();
- dst->data = _mm_cvtsi128_si32(dstAB);
+ dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
if (src[0].a) anim[0] = 0;
}
break;
@@ -318,13 +327,4 @@ void Blitter_32bppSSE4_Anim::Draw(Blitter::BlitterParams *bp, BlitterMode mode,
}
}
-/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
-inline Colour Blitter_32bppSSE4_Anim::AdjustBrightness(Colour colour, uint8 brightness)
-{
- /* Shortcut for normal brightness. */
- if (brightness == DEFAULT_BRIGHTNESS) return colour;
-
- return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
-}
-
#endif /* WITH_SSE */
diff --git a/src/blitter/32bpp_anim_sse4.hpp b/src/blitter/32bpp_anim_sse4.hpp
index 0f1131c88..9a3f93ca8 100644
--- a/src/blitter/32bpp_anim_sse4.hpp
+++ b/src/blitter/32bpp_anim_sse4.hpp
@@ -14,6 +14,9 @@
#ifdef WITH_SSE
+#ifndef SSE_VERSION
+#define SSE_VERSION 4
+#endif
#include "32bpp_anim.hpp"
#include "32bpp_sse4.hpp"
@@ -28,11 +31,9 @@ public:
template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
/* virtual */ void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
- /* virtual */ Colour AdjustBrightness(Colour colour, uint8 brightness);
/* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator) {
return Blitter_32bppSSE_Base::Encode(sprite, allocator);
}
-
/* virtual */ const char *GetName() { return "32bpp-sse4-anim"; }
};
diff --git a/src/blitter/32bpp_sse2.cpp b/src/blitter/32bpp_sse2.cpp
index 0b3eb1899..49fb28c35 100644
--- a/src/blitter/32bpp_sse2.cpp
+++ b/src/blitter/32bpp_sse2.cpp
@@ -73,8 +73,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- ALPHA_BLEND_2();
- _mm_storel_epi64((__m128i*) dst, srcABCD);
+ _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi));
src += 2;
dst += 2;
}
@@ -82,8 +81,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
- dst->data = _mm_cvtsi128_si32(srcABCD);
+ dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi));
}
break;
@@ -94,7 +92,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src_mv->m) {
const uint r = remap[src_mv->m];
if (r != 0) {
- Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+ Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@@ -108,7 +106,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
+ srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@@ -123,8 +121,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- DARKEN_2();
- _mm_storel_epi64((__m128i *) dst, dstAB);
+ _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base));
src += 2;
dst += 2;
}
@@ -132,8 +129,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- DARKEN_2();
- dst->data = _mm_cvtsi128_si32(dstAB);
+ dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base));
}
break;
}
@@ -235,7 +231,7 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca
dst_mv->v = (rgb_max == 0) ? Blitter_32bppBase::DEFAULT_BRIGHTNESS : rgb_max;
/* Pre-convert the mapping channel to a RGB value. */
- const Colour colour = AdjustBrightness(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v);
+ const Colour colour = AdjustBrightneSSE(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v);
dst_rgba->r = colour.r;
dst_rgba->g = colour.g;
dst_rgba->b = colour.b;
@@ -282,47 +278,4 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca
return dst_sprite;
}
-/** ReallyAdjustBrightness() is not called that often.
- * Inlining this function implies a far jump, which has a huge latency.
- */
-inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightness)
-{
- /* Shortcut for normal brightness. */
- if (brightness == DEFAULT_BRIGHTNESS) return colour;
-
- return Blitter_32bppSSE2::ReallyAdjustBrightness(colour, brightness);
-}
-
-IGNORE_UNINITIALIZED_WARNING_START
-Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
-{
- uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
- c16 *= brightness;
- uint64 c16_ob = c16; // Helps out of order execution.
- c16 /= DEFAULT_BRIGHTNESS;
- c16 &= 0x01FF01FF01FF;
-
- /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
- c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
- const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
-
- const uint32 alpha32 = colour.data & 0xFF000000;
- __m128i ret;
- LOAD64(c16, ret);
- if (ob != 0) {
- __m128i ob128 = _mm_cvtsi32_si128(ob);
- ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
- __m128i white = OVERBRIGHT_VALUE_MASK;
- __m128i c128 = ret;
- ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */
- ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
- ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */
- ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */
- }
-
- ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */
- return alpha32 | _mm_cvtsi128_si32(ret);
-}
-IGNORE_UNINITIALIZED_WARNING_STOP
-
#endif /* WITH_SSE */
diff --git a/src/blitter/32bpp_sse2.hpp b/src/blitter/32bpp_sse2.hpp
index 1c3307c70..3bab0d752 100644
--- a/src/blitter/32bpp_sse2.hpp
+++ b/src/blitter/32bpp_sse2.hpp
@@ -14,91 +14,10 @@
#ifdef WITH_SSE
-#include "32bpp_simple.hpp"
-#include "emmintrin.h"
-
-#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
-#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.
-#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
-
-#ifdef _MSC_VER
- #define ALIGN(n) __declspec(align(n))
-#else
- #define ALIGN(n) __attribute__ ((aligned (n)))
-#endif
-
-typedef union ALIGN(16) um128i {
- __m128i m128i;
- uint8 m128i_u8[16];
- uint16 m128i_u16[8];
- uint32 m128i_u32[4];
- uint64 m128i_u64[2];
-} um128i;
-
-#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0)
-#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
-#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
-#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1)
-#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2)
-#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0)
-#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0)
-#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0)
-#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7)
-#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)
-
-#define EXTR32(m_from, m_rank) (*(um128i*) &m_from).m128i_u32[m_rank]
-#define EXTR64(m_from, m_rank) (*(um128i*) &m_from).m128i_u64[m_rank]
-#define INSR32(m_val, m_into, m_rank) { \
- (*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, m_val, (m_rank)*2); \
- (*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, (m_val) >> 16, (m_rank)*2 + 1); \
-}
-#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i_u64[m_rank] = (m_val)
-
-#ifdef _SQ64
- #define LOAD64(m_val, m_into) m_into = _mm_cvtsi64_si128(m_val);
-#else
- #define LOAD64(m_val, m_into) INSR64(m_val, m_into, 0)
+#ifndef SSE_VERSION
+#define SSE_VERSION 2
#endif
-
-/* PUT_ALPHA_IN_FRONT_OF_RGB is redefined in 32bpp_ssse3.hpp. */
-#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) \
- m_into = _mm_shufflelo_epi16(m_from, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \
- m_into = _mm_shufflehi_epi16(m_into, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */
-
-/* PACK_AB_WITHOUT_SATURATION is redefined in 32bpp_ssse3.hpp. */
-#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) \
- m_from = _mm_and_si128(m_from, clear_hi); /* PAND, wipe high bytes to keep low bytes when packing */ \
- m_into = _mm_packus_epi16(m_from, m_from); /* PACKUSWB, pack 2 colours (with saturation) */
-
-/* Alpha blend 2 pixels. */
-#define ALPHA_BLEND_2() { \
- __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, expand each uint8 into uint16 */ \
- __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
- \
- __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); /* PCMPGTW, if (alpha > 0) a++; */ \
- alphaAB = _mm_srli_epi16(alphaAB, 15); \
- alphaAB = _mm_add_epi16(alphaAB, srcAB); \
- PUT_ALPHA_IN_FRONT_OF_RGB(alphaAB, alphaAB); \
- \
- srcAB = _mm_sub_epi16(srcAB, dstAB); /* PSUBW, (r - Cr) */ \
- srcAB = _mm_mullo_epi16(srcAB, alphaAB); /* PMULLW, a*(r - Cr) */ \
- srcAB = _mm_srli_epi16(srcAB, 8); /* PSRLW, a*(r - Cr)/256 */ \
- srcAB = _mm_add_epi16(srcAB, dstAB); /* PADDW, a*(r - Cr)/256 + Cr */ \
- PACK_AB_WITHOUT_SATURATION(srcAB, srcABCD); \
-}
-
-/* Darken 2 pixels.
- * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
- */
-#define DARKEN_2() \
- __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); \
- __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
- __m128i PUT_ALPHA_IN_FRONT_OF_RGB(srcAB, alphaAB); \
- alphaAB = _mm_srli_epi16(alphaAB, 2); /* Reduce to 64 levels of shades so the max value fits in 16 bits. */ \
- __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); \
- dstAB = _mm_mullo_epi16(dstAB, nom); \
- dstAB = _mm_srli_epi16(dstAB, 8); \
- dstAB = _mm_packus_epi16(dstAB, dstAB);
+#include "32bpp_sse_func.hpp"
/** Base methods for 32bpp SSE blitters. */
class Blitter_32bppSSE_Base {
@@ -138,14 +57,11 @@ public:
};
Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator);
- virtual Colour AdjustBrightness(Colour colour, uint8 brightness) = 0;
};
/** The SSE2 32 bpp blitter (without palette animation). */
class Blitter_32bppSSE2 : public Blitter_32bppSimple, public Blitter_32bppSSE_Base {
public:
- virtual Colour AdjustBrightness(Colour colour, uint8 brightness);
- static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
diff --git a/src/blitter/32bpp_sse4.cpp b/src/blitter/32bpp_sse4.cpp
index 619110cb6..1403d3659 100644
--- a/src/blitter/32bpp_sse4.cpp
+++ b/src/blitter/32bpp_sse4.cpp
@@ -74,8 +74,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- ALPHA_BLEND_2();
- _mm_storel_epi64((__m128i*) dst, srcABCD);
+ _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
src += 2;
dst += 2;
}
@@ -83,8 +82,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
- dst->data = _mm_cvtsi128_si32(srcABCD);
+ dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
}
break;
@@ -96,33 +94,39 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Remap colours. */
if (mvX2 & 0x00FF00FF) {
- /* Written so the compiler uses CMOV. */
- const Colour src0 = src[0];
- const uint m0 = (byte) mvX2;
- const uint r0 = remap[m0];
- const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
- Colour c0 = 0; // Use alpha of 0 to keep dst as is.
- c0 = r0 == 0 ? c0 : c0map;
- c0 = m0 != 0 ? c0 : src0;
- srcABCD = _mm_cvtsi32_si128(c0.data);
-
- const Colour src1 = src[1];
- const uint m1 = (byte) (mvX2 >> 16);
- const uint r1 = remap[m1];
- const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
- Colour c1 = 0;
- c1 = r1 == 0 ? c1 : c1map;
- c1 = m1 != 0 ? c1 : src1;
- INSR32(c1.data, srcABCD, 1);
-
- if ((mvX2 & 0xFF00FF00) != 0x80008000) {
- ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
- }
+ #define CMOV_REMAP(m_colour, m_src, m_m) \
+ /* Written so the compiler uses CMOV. */ \
+ Colour m_colour = 0; \
+ { \
+ const Colour srcm = (Colour) (m_src); \
+ const uint m = (byte) (m_m); \
+ const uint r = remap[m]; \
+ const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
+ m_colour = r == 0 ? m_colour : cmap; \
+ m_colour = m != 0 ? m_colour : srcm; \
+ }
+#ifdef _SQ64
+ uint64 srcs = _mm_cvtsi128_si64(srcABCD);
+ uint64 remapped_src = 0;
+ CMOV_REMAP(c0, srcs, mvX2);
+ remapped_src = c0.data;
+ CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16);
+ remapped_src |= (uint64) c1.data << 32;
+ srcABCD = _mm_cvtsi64_si128(remapped_src);
+#else
+ Colour remapped_src[2];
+ CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2);
+ remapped_src[0] = c0.data;
+ CMOV_REMAP(c1, src[1], mvX2 >> 16);
+ remapped_src[1] = c1.data;
+ srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
+#endif
+
+ if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
}
/* Blend colours. */
- ALPHA_BLEND_2();
- _mm_storel_epi64((__m128i *) dst, srcABCD);
+ _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
dst += 2;
src += 2;
src_mv += 2;
@@ -134,7 +138,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src_mv->m) {
const uint r = remap[src_mv->m];
if (r != 0) {
- Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+ Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@@ -148,7 +152,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
+ srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@@ -160,8 +164,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- DARKEN_2();
- _mm_storel_epi64((__m128i *) dst, dstAB);
+ _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
src += 2;
dst += 2;
}
@@ -169,8 +172,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- DARKEN_2();
- dst->data = _mm_cvtsi128_si32(dstAB);
+ dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
}
break;
}
@@ -217,45 +219,4 @@ void Blitter_32bppSSE4::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomL
}
}
-/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
-inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightness)
-{
- /* Shortcut for normal brightness. */
- if (brightness == DEFAULT_BRIGHTNESS) return colour;
-
- return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
-}
-
-IGNORE_UNINITIALIZED_WARNING_START
-Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
-{
- uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
- c16 *= brightness;
- uint64 c16_ob = c16; // Helps out of order execution.
- c16 /= DEFAULT_BRIGHTNESS;
- c16 &= 0x01FF01FF01FF;
-
- /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
- c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
- const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
-
- const uint32 alpha32 = colour.data & 0xFF000000;
- __m128i ret;
- LOAD64(c16, ret);
- if (ob != 0) {
- __m128i ob128 = _mm_cvtsi32_si128(ob);
- ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
- __m128i white = OVERBRIGHT_VALUE_MASK;
- __m128i c128 = ret;
- ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */
- ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
- ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */
- ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */
- }
-
- ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */
- return alpha32 | _mm_cvtsi128_si32(ret);
-}
-IGNORE_UNINITIALIZED_WARNING_STOP
-
#endif /* WITH_SSE */
diff --git a/src/blitter/32bpp_sse4.hpp b/src/blitter/32bpp_sse4.hpp
index f8a563b85..7a3332d87 100644
--- a/src/blitter/32bpp_sse4.hpp
+++ b/src/blitter/32bpp_sse4.hpp
@@ -14,41 +14,14 @@
#ifdef WITH_SSE
-#include "32bpp_ssse3.hpp"
-#include "smmintrin.h"
-
-#undef EXTR32
-#define EXTR32(m_from, m_rank) _mm_extract_epi32((*(um128i*) &m_from).m128i, m_rank)
-#undef INSR32
-#define INSR32(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, m_val, m_rank)
-
-IGNORE_UNINITIALIZED_WARNING_START
-#ifdef _SQ64
- #undef INSR64
- #define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi64((*(um128i*) &m_into).m128i, m_val, m_rank)
-#else
- typedef union { uint64 u64; struct _u32 { uint32 low, high; } u32; } u6432;
- #undef INSR64
- #define INSR64(m_val, m_into, m_rank) { \
- u6432 v; \
- v.u64 = m_val; \
- (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.low, (m_rank)*2); \
- (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.high, (m_rank)*2 + 1); \
- }
-
- #undef LOAD64
- #define LOAD64(m_val, m_into) \
- m_into = _mm_cvtsi32_si128(m_val); \
- INSR32((m_val) >> 32, m_into, 1);
+#ifndef SSE_VERSION
+#define SSE_VERSION 4
#endif
-IGNORE_UNINITIALIZED_WARNING_STOP
+#include "32bpp_ssse3.hpp"
/** The SSE4 32 bpp blitter (without palette animation). */
class Blitter_32bppSSE4 : public Blitter_32bppSSSE3 {
public:
- Colour AdjustBrightness(Colour colour, uint8 brightness);
- static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
-
/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
diff --git a/src/blitter/32bpp_sse_func.hpp b/src/blitter/32bpp_sse_func.hpp
new file mode 100644
index 000000000..d6febcf49
--- /dev/null
+++ b/src/blitter/32bpp_sse_func.hpp
@@ -0,0 +1,225 @@
+/* $Id$ */
+
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file 32bpp_sse_func.hpp Functions related to SSE 32 bpp blitter. */
+
+#ifndef BLITTER_32BPP_SSE_BASE_HPP
+#define BLITTER_32BPP_SSE_BASE_HPP
+
+#ifdef WITH_SSE
+
+#include "32bpp_simple.hpp"
+#if (SSE_VERSION == 2)
+#include <emmintrin.h>
+#elif (SSE_VERSION == 3)
+#include <tmmintrin.h>
+#elif (SSE_VERSION == 4)
+#include <smmintrin.h>
+#endif
+
+#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
+#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.
+#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
+
+#ifdef _MSC_VER
+ #define ALIGN(n) __declspec(align(n))
+#else
+ #define ALIGN(n) __attribute__ ((aligned (n)))
+#endif
+/**
+ * 16-byte aligned union that overlays a __m128i with arrays of
+ * 8, 16, 32 and 64 bit unsigned integers.
+ */
+typedef union ALIGN(16) um128i {
+ __m128i m128i; ///< The value as a whole vector.
+ uint8 m128i_u8[16]; ///< The value as 16 elements of 8 bits.
+ uint16 m128i_u16[8]; ///< The value as 8 elements of 16 bits.
+ uint32 m128i_u32[4]; ///< The value as 4 elements of 32 bits.
+ uint64 m128i_u64[2]; ///< The value as 2 elements of 64 bits.
+} um128i;
+/*
+ * Constant vectors for the functions below. Bytes and 16-bit elements are listed lowest first.
+ * In a mask that is used as byte shuffle control (PSHUFB) each byte is a source byte index,
+ * and an index of -1 makes the destination byte zero.
+ */
+#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0) ///< 0x00FF in every 16-bit element.
+#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1) ///< Byte indices copying 16-bit element 3 into elements 0-2 and element 7 into elements 4-6.
+#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1) ///< Byte indices gathering the low bytes of 16-bit elements 0-2 and 4-6 into the low 8 bytes.
+#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1) ///< Same byte indices as PACK_LOW_CONTROL_MASK, but placed in the high 8 bytes.
+#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2) ///< Shuffle control applied to the brightness value in AdjustBrightnessOfTwoPixels().
+#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0) ///< 0x01FF in 16-bit elements 0-2 and 4-6, 0x00FF in elements 3 and 7.
+#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0) ///< 1 in 16-bit elements 0-2 and 4-6, 0 in elements 3 and 7.
+#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) ///< 0x00FF in 16-bit elements 0-2 and 4-6, 0 in elements 3 and 7; also used as "white".
+#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7) ///< Byte indices copying 16-bit element 0 into elements 0-2 and element 1 into elements 4-6.
+#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) ///< 256 in every 16-bit element.
+/**
+ * Replace the 32-bit element at index 0 of a vector.
+ * Only implemented for SSE_VERSION >= 4; any other build ends up in NOT_REACHED().
+ * @param value The 32 bits to insert.
+ * @param into The vector to modify.
+ */
+static inline void InsertFirstUint32(const uint32 value, __m128i &into)
+{
+#if (SSE_VERSION >= 4)
+ into = _mm_insert_epi32(into, value, 0);
+#else
+ NOT_REACHED();
+#endif
+}
+/**
+ * Replace the 32-bit element at index 1 of a vector.
+ * With SSE_VERSION >= 4 this is a single 32-bit insert, otherwise the value is
+ * written as two 16-bit inserts.
+ * @param value The 32 bits to insert.
+ * @param into The vector to modify.
+ */
+static inline void InsertSecondUint32(const uint32 value, __m128i &into)
+{
+#if (SSE_VERSION >= 4)
+ into = _mm_insert_epi32(into, value, 1);
+#else
+ into = _mm_insert_epi16(into, value, 2); // Low 16 bits go to 16-bit element 2.
+ into = _mm_insert_epi16(into, value >> 16, 3); // High 16 bits go to 16-bit element 3.
+#endif
+}
+/**
+ * Load a 64-bit value into the lower half of a vector and zero the upper half.
+ * @param value The 64 bits to load.
+ * @param into The vector receiving the value.
+ */
+static inline void LoadUint64(const uint64 value, __m128i &into)
+{
+#ifdef _SQ64
+ into = _mm_cvtsi64_si128(value);
+#else
+ #if (SSE_VERSION >= 4)
+ into = _mm_cvtsi32_si128(value);
+ InsertSecondUint32(value >> 32, into);
+ #else
+ /* MOVQ reads exactly 64 bits and clears the upper half, which matches the other branches.
+ * Writing the value through a um128i cast would leave the upper half of 'into' indeterminate. */
+ into = _mm_loadl_epi64((const __m128i*) &value);
+ #endif
+#endif
+}
+/**
+ * Pack 16-bit elements into bytes, keeping only their low byte.
+ * The meaning of the mask depends on SSE_VERSION: for version 2 it is AND-ed with the
+ * input before a saturating pack, for later versions it is the byte shuffle control.
+ * @param from The 16-bit elements to pack.
+ * @param mask AND mask (SSE_VERSION == 2) or shuffle control (otherwise).
+ * @return The packed bytes.
+ */
+static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask)
+{
+#if (SSE_VERSION == 2)
+ from = _mm_and_si128(from, mask); // PAND, wipe high bytes to keep low bytes when packing
+ return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation)
+#else
+ return _mm_shuffle_epi8(from, mask);
+#endif
+}
+/**
+ * Spread the alpha element of each pixel over the colour elements of that pixel.
+ * With SSE_VERSION == 2 this is done with two 16-bit shuffles and the mask is not used;
+ * otherwise a single byte shuffle controlled by the mask is performed.
+ * @param from Pixels expanded to 16-bit elements.
+ * @param mask Byte shuffle control, only used when SSE_VERSION != 2.
+ * @return The shuffled vector.
+ */
+static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask)
+{
+#if (SSE_VERSION == 2)
+ __m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1
+ return _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2
+#else
+ return _mm_shuffle_epi8(from, mask);
+#endif
+}
+/**
+ * Alpha blend the pixels in the low 8 bytes of src over those in the low 8 bytes of dst.
+ * Per element the result is a*(src - dst)/256 + dst, where a is the source alpha
+ * incremented by one when it is not zero.
+ * @param src Source pixels.
+ * @param dst Destination pixels.
+ * @param distribution_mask Mask handed to DistributeAlpha().
+ * @param pack_mask Mask handed to PackUnsaturated().
+ * @return The blended pixels as packed by PackUnsaturated().
+ */
+static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask)
+{
+ __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16
+ __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+
+ __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++;
+ alphaAB = _mm_srli_epi16(alphaAB, 15); // PSRLW, reduce the all-ones compare result to 1
+ alphaAB = _mm_add_epi16(alphaAB, srcAB); // PADDW, add that 1 to every non-zero element
+ alphaAB = DistributeAlpha(alphaAB, distribution_mask);
+
+ srcAB = _mm_sub_epi16(srcAB, dstAB); // PSUBW, (r - Cr)
+ srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr)
+ srcAB = _mm_srli_epi16(srcAB, 8); // PSRLW, a*(r - Cr)/256
+ srcAB = _mm_add_epi16(srcAB, dstAB); // PADDW, a*(r - Cr)/256 + Cr
+ return PackUnsaturated(srcAB, pack_mask);
+}
+
+/**
+ * Darken 2 pixels.
+ * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
+ * Only the alpha of the source pixels is used; the colour comes from the destination pixels.
+ * @param src Source pixels in the low 8 bytes.
+ * @param dst Destination pixels in the low 8 bytes.
+ * @param distribution_mask Mask handed to DistributeAlpha().
+ * @param tr_nom_base Value per 16-bit element from which alpha/4 is subtracted.
+ * @return The darkened destination pixels, packed with saturation.
+ */
+static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base)
+{
+ __m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ __m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+ __m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
+ alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
+ __m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
+ dstAB = _mm_mullo_epi16(dstAB, nom);
+ dstAB = _mm_srli_epi16(dstAB, 8);
+ return _mm_packus_epi16(dstAB, dstAB);
+}
+/**
+ * Scale the rgb channels of a colour by brightness / DEFAULT_BRIGHTNESS.
+ * The part of a channel that exceeds 8 bits ("overbright") is summed over the channels,
+ * halved, and used to move all three channels towards white. The alpha channel is kept as is.
+ * @param colour The colour to adjust.
+ * @param brightness The brightness to apply.
+ * @return The adjusted colour.
+ */
+IGNORE_UNINITIALIZED_WARNING_START
+static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness)
+{
+ uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; // One 16-bit field per channel.
+ c16 *= brightness;
+ uint64 c16_ob = c16; // Helps out of order execution.
+ c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;
+ c16 &= 0x01FF01FF01FF; // Keep 9 bits per channel.
+
+ /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
+ c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
+ const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
+
+ const uint32 alpha32 = colour.data & 0xFF000000; // Alpha is passed through unchanged.
+ __m128i ret;
+ LoadUint64(c16, ret);
+ if (ob != 0) {
+ __m128i ob128 = _mm_cvtsi32_si128(ob);
+ ob128 = _mm_shufflelo_epi16(ob128, 0xC0); // PSHUFLW, copy ob into the three colour elements.
+ __m128i white = OVERBRIGHT_VALUE_MASK;
+ __m128i c128 = ret;
+ ret = _mm_subs_epu16(white, c128); // PSUBUSW, (255 - rgb)
+ ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb)
+ ret = _mm_srli_epi16(ret, 8); // PSRLW, ob*(255 - rgb)/256
+ ret = _mm_add_epi16(ret, c128); // PADDW, ob*(255 - rgb)/256 + rgb
+ }
+
+ ret = _mm_packus_epi16(ret, ret); // PACKUSWB, saturate and pack.
+ return alpha32 | _mm_cvtsi128_si32(ret);
+}
+IGNORE_UNINITIALIZED_WARNING_STOP
+
+/**
+ * Adjust the brightness of a colour.
+ * The colour is returned untouched for the default brightness; every other
+ * brightness is handled by ReallyAdjustBrightness().
+ * ReallyAdjustBrightness() is not called that often.
+ * Inlining that function implies a far jump, which has a huge latency.
+ * @param colour The colour to adjust.
+ * @param brightness The brightness to apply.
+ * @return The adjusted colour.
+ */
+static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness)
+{
+ /* Shortcut for normal brightness. */
+ if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour;
+
+ return ReallyAdjustBrightness(colour, brightness);
+}
+/**
+ * Adjust the brightness of the two pixels in the low 8 bytes of a vector.
+ * Only implemented for SSE_VERSION >= 3; a lower version ends up in NOT_REACHED().
+ * @param from The pixels to adjust.
+ * @param brightness Only the bits selected by 0xFF00FF00 are used; presumably one
+ *                   brightness byte per pixel (verify against the caller).
+ * @return The adjusted pixels, packed with saturation.
+ */
+static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness)
+{
+#if (SSE_VERSION < 3)
+ NOT_REACHED();
+#else
+ /* The following dataflow differs from the one of AdjustBrightness() only for alpha.
+ * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
+ * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
+ */
+ brightness &= 0xFF00FF00;
+ brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;
+
+ __m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
+ __m128i briAB = _mm_cvtsi32_si128(brightness);
+ briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2.
+ colAB = _mm_mullo_epi16(colAB, briAB);
+ __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); // Top bit of each product.
+ colAB = _mm_srli_epi16(colAB, 7);
+
+ /* Sum overbright.
+ * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
+ * -255 is changed in -256 so we just have to take the 8 lower bits into account.
+ */
+ colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
+ colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
+ colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
+ colAB_ob = _mm_and_si128(colAB_ob, colAB);
+ __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());
+
+ obAB = _mm_srli_epi16(obAB, 1); // Reduce overbright strength.
+ obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
+ __m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white.
+ retAB = _mm_subs_epu16(retAB, colAB); // (255 - rgb)
+ retAB = _mm_mullo_epi16(retAB, obAB); // ob*(255 - rgb)
+ retAB = _mm_srli_epi16(retAB, 8); // ob*(255 - rgb)/256
+ retAB = _mm_add_epi16(retAB, colAB); // ob*(255 - rgb)/256 + rgb
+
+ return _mm_packus_epi16(retAB, retAB);
+#endif
+}
+
+#endif /* WITH_SSE */
+#endif /* BLITTER_32BPP_SSE_BASE_HPP */
diff --git a/src/blitter/32bpp_ssse3.cpp b/src/blitter/32bpp_ssse3.cpp
index 9cee7dbf9..3c42d359c 100644
--- a/src/blitter/32bpp_ssse3.cpp
+++ b/src/blitter/32bpp_ssse3.cpp
@@ -74,8 +74,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- ALPHA_BLEND_2();
- _mm_storel_epi64((__m128i*) dst, srcABCD);
+ _mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
src += 2;
dst += 2;
}
@@ -83,8 +82,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
- dst->data = _mm_cvtsi128_si32(srcABCD);
+ dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
}
break;
@@ -96,33 +94,39 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Remap colours. */
if (mvX2 & 0x00FF00FF) {
- /* Written so the compiler uses CMOV. */
- const Colour src0 = src[0];
- const uint m0 = (byte) mvX2;
- const uint r0 = remap[m0];
- const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
- Colour c0 = 0; // Use alpha of 0 to keep dst as is.
- c0 = r0 == 0 ? c0 : c0map;
- c0 = m0 != 0 ? c0 : src0;
- srcABCD = _mm_cvtsi32_si128(c0.data);
-
- const Colour src1 = src[1];
- const uint m1 = (byte) (mvX2 >> 16);
- const uint r1 = remap[m1];
- const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
- Colour c1 = 0;
- c1 = r1 == 0 ? c1 : c1map;
- c1 = m1 != 0 ? c1 : src1;
- INSR32(c1.data, srcABCD, 1);
-
- if ((mvX2 & 0xFF00FF00) != 0x80008000) {
- ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
- }
+ #define CMOV_REMAP(m_colour, m_src, m_m) \
+ /* Written so the compiler uses CMOV. */ \
+ Colour m_colour = 0; \
+ { \
+ const Colour srcm = (Colour) (m_src); \
+ const uint m = (byte) (m_m); \
+ const uint r = remap[m]; \
+ const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
+ m_colour = r == 0 ? m_colour : cmap; \
+ m_colour = m != 0 ? m_colour : srcm; \
+ }
+#ifdef _SQ64
+ uint64 srcs = _mm_cvtsi128_si64(srcABCD);
+ uint64 remapped_src = 0;
+ CMOV_REMAP(c0, srcs, mvX2);
+ remapped_src = c0.data;
+ CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16);
+ remapped_src |= (uint64) c1.data << 32;
+ srcABCD = _mm_cvtsi64_si128(remapped_src);
+#else
+ Colour remapped_src[2];
+ CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2);
+ remapped_src[0] = c0.data;
+ CMOV_REMAP(c1, src[1], mvX2 >> 16);
+ remapped_src[1] = c1.data;
+ srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
+#endif
+
+ if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
}
/* Blend colours. */
- ALPHA_BLEND_2();
- _mm_storel_epi64((__m128i *) dst, srcABCD);
+ _mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
dst += 2;
src += 2;
src_mv += 2;
@@ -134,7 +138,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src_mv->m) {
const uint r = remap[src_mv->m];
if (r != 0) {
- Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+ Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@@ -148,7 +152,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- ALPHA_BLEND_2();
+ srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@@ -160,8 +164,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
- DARKEN_2();
- _mm_storel_epi64((__m128i *) dst, dstAB);
+ _mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
src += 2;
dst += 2;
}
@@ -169,8 +172,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
- DARKEN_2();
- dst->data = _mm_cvtsi128_si32(dstAB);
+ dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
}
break;
}
diff --git a/src/blitter/32bpp_ssse3.hpp b/src/blitter/32bpp_ssse3.hpp
index 23e6e4878..090dd5c42 100644
--- a/src/blitter/32bpp_ssse3.hpp
+++ b/src/blitter/32bpp_ssse3.hpp
@@ -14,53 +14,10 @@
#ifdef WITH_SSE
+#ifndef SSE_VERSION
+#define SSE_VERSION 3
+#endif
#include "32bpp_sse2.hpp"
-#include "tmmintrin.h"
-
-/* Use PSHUFB instead of PSHUFHW+PSHUFLW. */
-#undef PUT_ALPHA_IN_FRONT_OF_RGB
-#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, a_cm);
-
-#undef PACK_AB_WITHOUT_SATURATION
-#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, pack_low_cm);
-
-/* Adjust brightness of 2 pixels. */
-#define ADJUST_BRIGHTNESS_2(m_colourX2, m_brightnessX2) \
- /* The following dataflow differs from the one of AdjustBrightness() only for alpha.
- * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
- * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
- */ \
- m_brightnessX2 &= 0xFF00FF00; \
- m_brightnessX2 += DEFAULT_BRIGHTNESS; \
- \
- __m128i zero = _mm_setzero_si128(); \
- __m128i colAB = _mm_unpacklo_epi8(m_colourX2, zero); \
- \
- __m128i briAB = _mm_cvtsi32_si128(m_brightnessX2); \
- briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \
- colAB = _mm_mullo_epi16(colAB, briAB); \
- __m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \
- colAB = _mm_srli_epi16(colAB, 7); \
- \
- /* Sum overbright.
- * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
- * -255 is changed in -256 so we just have to take the 8 lower bits into account.
- */ \
- colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER); \
- colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK); \
- colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); \
- colAB_ob = _mm_and_si128(colAB_ob, colAB); \
- __m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, zero), zero); \
- \
- obAB = _mm_srli_epi16(obAB, 1); /* Reduce overbright strength. */ \
- obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); \
- __m128i retAB = OVERBRIGHT_VALUE_MASK; /* ob_mask is equal to white. */ \
- retAB = _mm_subs_epu16(retAB, colAB); /* (255 - rgb) */ \
- retAB = _mm_mullo_epi16(retAB, obAB); /* ob*(255 - rgb) */ \
- retAB = _mm_srli_epi16(retAB, 8); /* ob*(255 - rgb)/256 */ \
- retAB = _mm_add_epi16(retAB, colAB); /* ob*(255 - rgb)/256 + rgb */ \
- \
- m_colourX2 = _mm_packus_epi16(retAB, retAB);
/** The SSSE3 32 bpp blitter (without palette animation). */
class Blitter_32bppSSSE3 : public Blitter_32bppSSE2 {