9 files changed, 356 insertions, 368 deletions
diff --git a/src/blitter/32bpp_anim_sse4.cpp b/src/blitter/32bpp_anim_sse4.cpp
index e8873d5d3..ae1b34d69 100644
--- a/src/blitter/32bpp_anim_sse4.cpp
+++ b/src/blitter/32bpp_anim_sse4.cpp
@@ -83,12 +83,12 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
 					const byte m0 = mvX2;
 					if (m0 >= PALETTE_ANIM_START) {
 						const Colour c0 = (this->LookupColourInPalette(m0).data & 0x00FFFFFF) | (src[0].data & 0xFF000000);
-						INSR32(AdjustBrightness(c0, (byte) (mvX2 >> 8)).data, srcABCD, 0);
+						InsertFirstUint32(AdjustBrightneSSE(c0, (byte) (mvX2 >> 8)).data, srcABCD);
 					}
 					const byte m1 = mvX2 >> 16;
 					if (m1 >= PALETTE_ANIM_START) {
 						const Colour c1 = (this->LookupColourInPalette(m1).data & 0x00FFFFFF) | (src[1].data & 0xFF000000);
-						INSR32(AdjustBrightness(c1, (byte) (mvX2 >> 24)).data, srcABCD, 1);
+						InsertSecondUint32(AdjustBrightneSSE(c1, (byte) (mvX2 >> 24)).data, srcABCD);
 					}
 
 					/* Update anim buffer. */
@@ -118,7 +118,7 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
 
 					/* Blend colours. */
 bmno_alpha_blend:
-					ALPHA_BLEND_2();
+					srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
 bmno_full_opacity:
 					_mm_storel_epi64((__m128i *) dst, srcABCD);
 bmno_full_transparency:
@@ -132,20 +132,19 @@ bmno_full_transparency:
 					if (src->a == 0) {
 					} else if (src->a == 255) {
 						*anim = *(const uint16*) src_mv;
-						*dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
+						*dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
 					} else {
 						*anim = 0;
 						__m128i srcABCD;
 						__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
 						if (src_mv->m >= PALETTE_ANIM_START) {
-							Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v);
+							Colour colour = AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v);
 							colour.a = src->a;
 							srcABCD = _mm_cvtsi32_si128(colour.data);
 						} else {
 							srcABCD = _mm_cvtsi32_si128(src->data);
 						}
-						ALPHA_BLEND_2();
-						dst->data = _mm_cvtsi128_si32(srcABCD);
+						dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 					}
 				}
 				break;
@@ -162,24 +161,36 @@ bmno_full_transparency:
 					const uint m1 = (byte) (mvX2 >> 16);
 					const uint r1 = remap[m1];
 					if (mvX2 & 0x00FF00FF) {
-						/* Written so the compiler uses CMOV. */
-						const Colour src0 = src[0];
-						const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
-						Colour c0 = dst[0];
-						c0 = r0 == 0 ? c0 : c0map;
-						c0 = m0 != 0 ? c0 : src0;
-						srcABCD = _mm_cvtsi32_si128(c0.data);
-
-						const Colour src1 = src[1];
-						const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
-						Colour c1 = dst[1];
-						c1 = r1 == 0 ? c1 : c1map;
-						c1 = m1 != 0 ? c1 : src1;
-						INSR32(c1.data, srcABCD, 1);
+						#define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
+							/* Written so the compiler uses CMOV. */ \
+							Colour m_colour = m_colour_init; \
+							{ \
+							const Colour srcm = (Colour) (m_src); \
+							const uint m = (byte) (m_m); \
+							const uint r = remap[m]; \
+							const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
+							m_colour = r == 0 ? m_colour : cmap; \
+							m_colour = m != 0 ? m_colour : srcm; \
+							}
+#ifdef _SQ64
+						uint64 srcs = _mm_cvtsi128_si64(srcABCD);
+						uint64 dsts = _mm_cvtsi128_si64(dstABCD);
+						uint64 remapped_src = 0;
+						CMOV_REMAP(c0, dsts, srcs, mvX2);
+						remapped_src = c0.data;
+						CMOV_REMAP(c1, dsts >> 32, srcs >> 32, mvX2 >> 16);
+						remapped_src |= (uint64) c1.data << 32;
+						srcABCD = _mm_cvtsi64_si128(remapped_src);
+#else
+						Colour remapped_src[2];
+						CMOV_REMAP(c0, _mm_cvtsi128_si32(dstABCD), _mm_cvtsi128_si32(srcABCD), mvX2);
+						remapped_src[0] = c0.data;
+						CMOV_REMAP(c1, dst[1], src[1], mvX2 >> 16);
+						remapped_src[1] = c1.data;
+						srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
+#endif
 
-						if ((mvX2 & 0xFF00FF00) != 0x80008000) {
-							ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
-						}
+						if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
 					}
 
 					/* Update anim buffer. */
@@ -211,7 +222,7 @@ bmno_full_transparency:
 
 					/* Blend colours. */
 bmcr_alpha_blend:
-					ALPHA_BLEND_2();
+					srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
 bmcr_full_opacity:
 					_mm_storel_epi64((__m128i *) dst, srcABCD);
 bmcr_full_transparency:
@@ -229,7 +240,7 @@ bmcr_full_transparency:
 						const uint r = remap[src_mv->m];
 						*anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0;
 						if (r != 0) {
-							Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+							Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
 							if (src->a == 255) {
 								*dst = remapped_colour;
 							} else {
@@ -244,7 +255,7 @@ bmcr_full_transparency:
 						if (src->a < 255) {
 bmcr_alpha_blend_single:
 							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-							ALPHA_BLEND_2();
+							srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
 						}
 						dst->data = _mm_cvtsi128_si32(srcABCD);
 					}
@@ -256,8 +267,7 @@ bmcr_alpha_blend_single:
 				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					DARKEN_2();
-					_mm_storel_epi64((__m128i *) dst, dstAB);
+					_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
 					src += 2;
 					dst += 2;
 					anim += 2;
@@ -268,8 +278,7 @@ bmcr_alpha_blend_single:
 				if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					DARKEN_2();
-					dst->data = _mm_cvtsi128_si32(dstAB);
+					dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
 					if (src[0].a) anim[0] = 0;
 				}
 				break;
@@ -318,13 +327,4 @@ void Blitter_32bppSSE4_Anim::Draw(Blitter::BlitterParams *bp, BlitterMode mode,
 	}
 }
 
-/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
-inline Colour Blitter_32bppSSE4_Anim::AdjustBrightness(Colour colour, uint8 brightness)
-{
-	/* Shortcut for normal brightness. */
-	if (brightness == DEFAULT_BRIGHTNESS) return colour;
-
-	return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
-}
-
 #endif /* WITH_SSE */
diff --git a/src/blitter/32bpp_anim_sse4.hpp b/src/blitter/32bpp_anim_sse4.hpp
index 0f1131c88..9a3f93ca8 100644
--- a/src/blitter/32bpp_anim_sse4.hpp
+++ b/src/blitter/32bpp_anim_sse4.hpp
@@ -14,6 +14,9 @@
 
 #ifdef WITH_SSE
 
+#ifndef SSE_VERSION
+#define SSE_VERSION 4
+#endif
 #include "32bpp_anim.hpp"
 #include "32bpp_sse4.hpp"
 
@@ -28,11 +31,9 @@ public:
 	template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
 	/* virtual */ void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
 	/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
-	/* virtual */ Colour AdjustBrightness(Colour colour, uint8 brightness);
 	/* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator) {
 		return Blitter_32bppSSE_Base::Encode(sprite, allocator);
 	}
-
 	/* virtual */ const char *GetName() { return "32bpp-sse4-anim"; }
 };
 
diff --git a/src/blitter/32bpp_sse2.cpp b/src/blitter/32bpp_sse2.cpp
index 0b3eb1899..49fb28c35 100644
--- a/src/blitter/32bpp_sse2.cpp
+++ b/src/blitter/32bpp_sse2.cpp
@@ -73,8 +73,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 				for (uint x = (uint) effective_width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					ALPHA_BLEND_2();
-					_mm_storel_epi64((__m128i*) dst, srcABCD);
+					_mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi));
 					src += 2;
 					dst += 2;
 				}
@@ -82,8 +81,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 				if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					ALPHA_BLEND_2();
-					dst->data = _mm_cvtsi128_si32(srcABCD);
+					dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi));
 				}
 				break;
 
@@ -94,7 +92,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 					if (src_mv->m) {
 						const uint r = remap[src_mv->m];
 						if (r != 0) {
-							Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+							Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
 							if (src->a == 255) {
 								*dst = remapped_colour;
 							} else {
@@ -108,7 +106,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 						if (src->a < 255) {
 bmcr_alpha_blend_single:
 							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-							ALPHA_BLEND_2();
+							srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi);
 						}
 						dst->data = _mm_cvtsi128_si32(srcABCD);
 					}
@@ -123,8 +121,7 @@ bmcr_alpha_blend_single:
 				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					DARKEN_2();
-					_mm_storel_epi64((__m128i *) dst, dstAB);
+					_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base));
 					src += 2;
 					dst += 2;
 				}
@@ -132,8 +129,7 @@ bmcr_alpha_blend_single:
 				if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					DARKEN_2();
-					dst->data = _mm_cvtsi128_si32(dstAB);
+					dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base));
 				}
 				break;
 		}
@@ -235,7 +231,7 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca
 						dst_mv->v = (rgb_max == 0) ? Blitter_32bppBase::DEFAULT_BRIGHTNESS : rgb_max;
 
 						/* Pre-convert the mapping channel to a RGB value. */
-						const Colour colour = AdjustBrightness(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v);
+						const Colour colour = AdjustBrightneSSE(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v);
 						dst_rgba->r = colour.r;
 						dst_rgba->g = colour.g;
 						dst_rgba->b = colour.b;
@@ -282,47 +278,4 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca
 	return dst_sprite;
 }
 
-/** ReallyAdjustBrightness() is not called that often.
- * Inlining this function implies a far jump, which has a huge latency.
- */
-inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightness)
-{
-	/* Shortcut for normal brightness. */
-	if (brightness == DEFAULT_BRIGHTNESS) return colour;
-
-	return Blitter_32bppSSE2::ReallyAdjustBrightness(colour, brightness);
-}
-
-IGNORE_UNINITIALIZED_WARNING_START
-Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
-{
-	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
-	c16 *= brightness;
-	uint64 c16_ob = c16; // Helps out of order execution.
-	c16 /= DEFAULT_BRIGHTNESS;
-	c16 &= 0x01FF01FF01FF;
-
-	/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
-	c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
-	const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
-
-	const uint32 alpha32 = colour.data & 0xFF000000;
-	__m128i ret;
-	LOAD64(c16, ret);
-	if (ob != 0) {
-		__m128i ob128 = _mm_cvtsi32_si128(ob);
-		ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
-		__m128i white = OVERBRIGHT_VALUE_MASK;
-		__m128i c128 = ret;
-		ret = _mm_subs_epu16(white, c128); /* PSUBUSW,   (255 - rgb) */
-		ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
-		ret = _mm_srli_epi16(ret, 8);      /* PSRLW,  ob*(255 - rgb)/256 */
-		ret = _mm_add_epi16(ret, c128);    /* PADDW,  ob*(255 - rgb)/256 + rgb */
-	}
-
-	ret = _mm_packus_epi16(ret, ret);      /* PACKUSWB, saturate and pack. */
-	return alpha32 | _mm_cvtsi128_si32(ret);
-}
-IGNORE_UNINITIALIZED_WARNING_STOP
-
 #endif /* WITH_SSE */
diff --git a/src/blitter/32bpp_sse2.hpp b/src/blitter/32bpp_sse2.hpp
index 1c3307c70..3bab0d752 100644
--- a/src/blitter/32bpp_sse2.hpp
+++ b/src/blitter/32bpp_sse2.hpp
@@ -14,91 +14,10 @@
 
 #ifdef WITH_SSE
 
-#include "32bpp_simple.hpp"
-#include "emmintrin.h"
-
-#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
-#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.
-#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
-
-#ifdef _MSC_VER
-	#define ALIGN(n) __declspec(align(n))
-#else
-	#define ALIGN(n) __attribute__ ((aligned (n)))
-#endif
-
-typedef union ALIGN(16) um128i {
-	__m128i m128i;
-	uint8 m128i_u8[16];
-	uint16 m128i_u16[8];
-	uint32 m128i_u32[4];
-	uint64 m128i_u64[2];
-} um128i;
-
-#define CLEAR_HIGH_BYTE_MASK        _mm_setr_epi8(-1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0)
-#define ALPHA_CONTROL_MASK          _mm_setr_epi8( 6,  7,  6,  7,  6,  7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
-#define PACK_LOW_CONTROL_MASK       _mm_setr_epi8( 0,  2,  4, -1,  8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
-#define PACK_HIGH_CONTROL_MASK      _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,  0,  2,  4, -1,  8, 10, 12, -1)
-#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1,  2,  1,  2,  1,  2,  0,  2,  3,  2,  3,  2,  3,  2,  0,  2)
-#define BRIGHTNESS_DIV_CLEANER      _mm_setr_epi8(-1,  1, -1,  1, -1,  1, -1,  0, -1,  1, -1,  1, -1,  1, -1,  0)
-#define OVERBRIGHT_PRESENCE_MASK    _mm_setr_epi8( 1,  0,  1,  0,  1,  0,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0)
-#define OVERBRIGHT_VALUE_MASK       _mm_setr_epi8(-1,  0, -1,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0,  0,  0)
-#define OVERBRIGHT_CONTROL_MASK     _mm_setr_epi8( 0,  1,  0,  1,  0,  1,  7,  7,  2,  3,  2,  3,  2,  3,  7,  7)
-#define TRANSPARENT_NOM_BASE        _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)
-
-#define EXTR32(m_from, m_rank) (*(um128i*) &m_from).m128i_u32[m_rank]
-#define EXTR64(m_from, m_rank) (*(um128i*) &m_from).m128i_u64[m_rank]
-#define INSR32(m_val, m_into, m_rank) { \
-	(*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, m_val, (m_rank)*2); \
-	(*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, (m_val) >> 16, (m_rank)*2 + 1); \
-}
-#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i_u64[m_rank] = (m_val)
-
-#ifdef _SQ64
-	#define LOAD64(m_val, m_into) m_into = _mm_cvtsi64_si128(m_val);
-#else
-	#define LOAD64(m_val, m_into) INSR64(m_val, m_into, 0)
+#ifndef SSE_VERSION
+#define SSE_VERSION 2
 #endif
-
-/* PUT_ALPHA_IN_FRONT_OF_RGB is redefined in 32bpp_ssse3.hpp. */
-#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) \
-	m_into = _mm_shufflelo_epi16(m_from, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \
-	m_into = _mm_shufflehi_epi16(m_into, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */
-
-/* PACK_AB_WITHOUT_SATURATION is redefined in 32bpp_ssse3.hpp. */
-#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) \
-	m_from = _mm_and_si128(m_from, clear_hi);  /* PAND, wipe high bytes to keep low bytes when packing */ \
-	m_into = _mm_packus_epi16(m_from, m_from); /* PACKUSWB, pack 2 colours (with saturation) */
-
-/* Alpha blend 2 pixels. */
-#define ALPHA_BLEND_2() { \
-	__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, expand each uint8 into uint16 */ \
-	__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
-	\
-	__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128());   /* PCMPGTW, if (alpha > 0) a++; */ \
-	alphaAB = _mm_srli_epi16(alphaAB, 15); \
-	alphaAB = _mm_add_epi16(alphaAB, srcAB); \
-	PUT_ALPHA_IN_FRONT_OF_RGB(alphaAB, alphaAB); \
-	\
-	srcAB = _mm_sub_epi16(srcAB, dstAB);          /* PSUBW,    (r - Cr) */ \
-	srcAB = _mm_mullo_epi16(srcAB, alphaAB);      /* PMULLW, a*(r - Cr) */ \
-	srcAB = _mm_srli_epi16(srcAB, 8);             /* PSRLW,  a*(r - Cr)/256 */ \
-	srcAB = _mm_add_epi16(srcAB, dstAB);          /* PADDW,  a*(r - Cr)/256 + Cr */ \
-	PACK_AB_WITHOUT_SATURATION(srcAB, srcABCD); \
-}
-
-/* Darken 2 pixels.
- * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
- */
-#define DARKEN_2() \
-	__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); \
-	__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
-	__m128i PUT_ALPHA_IN_FRONT_OF_RGB(srcAB, alphaAB); \
-	alphaAB = _mm_srli_epi16(alphaAB, 2); /* Reduce to 64 levels of shades so the max value fits in 16 bits. */ \
-	__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); \
-	dstAB = _mm_mullo_epi16(dstAB, nom); \
-	dstAB = _mm_srli_epi16(dstAB, 8); \
-	dstAB = _mm_packus_epi16(dstAB, dstAB);
+#include "32bpp_sse_func.hpp"
 
 /** Base methods for 32bpp SSE blitters. */
 class Blitter_32bppSSE_Base {
@@ -138,14 +57,11 @@ public:
 	};
 
 	Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator);
-	virtual Colour AdjustBrightness(Colour colour, uint8 brightness) = 0;
 };
 
 /** The SSE2 32 bpp blitter (without palette animation). */
 class Blitter_32bppSSE2 : public Blitter_32bppSimple, public Blitter_32bppSSE_Base {
 public:
-	virtual Colour AdjustBrightness(Colour colour, uint8 brightness);
-	static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
 	/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
 	template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
 	void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
diff --git a/src/blitter/32bpp_sse4.cpp b/src/blitter/32bpp_sse4.cpp
index 619110cb6..1403d3659 100644
--- a/src/blitter/32bpp_sse4.cpp
+++ b/src/blitter/32bpp_sse4.cpp
@@ -74,8 +74,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 				for (uint x = (uint) effective_width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					ALPHA_BLEND_2();
-					_mm_storel_epi64((__m128i*) dst, srcABCD);
+					_mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 					src += 2;
 					dst += 2;
 				}
@@ -83,8 +82,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 				if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					ALPHA_BLEND_2();
-					dst->data = _mm_cvtsi128_si32(srcABCD);
+					dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 				}
 				break;
 
@@ -96,33 +94,39 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 
 					/* Remap colours. */
 					if (mvX2 & 0x00FF00FF) {
-						/* Written so the compiler uses CMOV. */
-						const Colour src0 = src[0];
-						const uint m0 = (byte) mvX2;
-						const uint r0 = remap[m0];
-						const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
-						Colour c0 = 0; // Use alpha of 0 to keep dst as is.
-						c0 = r0 == 0 ? c0 : c0map;
-						c0 = m0 != 0 ? c0 : src0;
-						srcABCD = _mm_cvtsi32_si128(c0.data);
-
-						const Colour src1 = src[1];
-						const uint m1 = (byte) (mvX2 >> 16);
-						const uint r1 = remap[m1];
-						const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
-						Colour c1 = 0;
-						c1 = r1 == 0 ? c1 : c1map;
-						c1 = m1 != 0 ? c1 : src1;
-						INSR32(c1.data, srcABCD, 1);
-
-						if ((mvX2 & 0xFF00FF00) != 0x80008000) {
-							ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
-						}
+						#define CMOV_REMAP(m_colour, m_src, m_m) \
+							/* Written so the compiler uses CMOV. */ \
+							Colour m_colour = 0; \
+							{ \
+							const Colour srcm = (Colour) (m_src); \
+							const uint m = (byte) (m_m); \
+							const uint r = remap[m]; \
+							const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
+							m_colour = r == 0 ? m_colour : cmap; \
+							m_colour = m != 0 ? m_colour : srcm; \
+							}
+#ifdef _SQ64
+						uint64 srcs = _mm_cvtsi128_si64(srcABCD);
+						uint64 remapped_src = 0;
+						CMOV_REMAP(c0, srcs, mvX2);
+						remapped_src = c0.data;
+						CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16);
+						remapped_src |= (uint64) c1.data << 32;
+						srcABCD = _mm_cvtsi64_si128(remapped_src);
+#else
+						Colour remapped_src[2];
+						CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2);
+						remapped_src[0] = c0.data;
+						CMOV_REMAP(c1, src[1], mvX2 >> 16);
+						remapped_src[1] = c1.data;
+						srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
+#endif
+
+						if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
 					}
 
 					/* Blend colours. */
-					ALPHA_BLEND_2();
-					_mm_storel_epi64((__m128i *) dst, srcABCD);
+					_mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 					dst += 2;
 					src += 2;
 					src_mv += 2;
@@ -134,7 +138,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 					if (src_mv->m) {
 						const uint r = remap[src_mv->m];
 						if (r != 0) {
-							Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+							Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
 							if (src->a == 255) {
 								*dst = remapped_colour;
 							} else {
@@ -148,7 +152,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 						if (src->a < 255) {
 bmcr_alpha_blend_single:
 							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-							ALPHA_BLEND_2();
+							srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
 						}
 						dst->data = _mm_cvtsi128_si32(srcABCD);
 					}
@@ -160,8 +164,7 @@ bmcr_alpha_blend_single:
 				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					DARKEN_2();
-					_mm_storel_epi64((__m128i *) dst, dstAB);
+					_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
 					src += 2;
 					dst += 2;
 				}
@@ -169,8 +172,7 @@ bmcr_alpha_blend_single:
 				if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					DARKEN_2();
-					dst->data = _mm_cvtsi128_si32(dstAB);
+					dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
 				}
 				break;
 		}
@@ -217,45 +219,4 @@ void Blitter_32bppSSE4::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomL
 	}
 }
 
-/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
-inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightness)
-{
-	/* Shortcut for normal brightness. */
-	if (brightness == DEFAULT_BRIGHTNESS) return colour;
-
-	return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
-}
-
-IGNORE_UNINITIALIZED_WARNING_START
-Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
-{
-	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
-	c16 *= brightness;
-	uint64 c16_ob = c16; // Helps out of order execution.
-	c16 /= DEFAULT_BRIGHTNESS;
-	c16 &= 0x01FF01FF01FF;
-
-	/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
-	c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
-	const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
-
-	const uint32 alpha32 = colour.data & 0xFF000000;
-	__m128i ret;
-	LOAD64(c16, ret);
-	if (ob != 0) {
-		__m128i ob128 = _mm_cvtsi32_si128(ob);
-		ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
-		__m128i white = OVERBRIGHT_VALUE_MASK;
-		__m128i c128 = ret;
-		ret = _mm_subs_epu16(white, c128); /* PSUBUSW,   (255 - rgb) */
-		ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
-		ret = _mm_srli_epi16(ret, 8);      /* PSRLW,  ob*(255 - rgb)/256 */
-		ret = _mm_add_epi16(ret, c128);    /* PADDW,  ob*(255 - rgb)/256 + rgb */
-	}
-
-	ret = _mm_packus_epi16(ret, ret);      /* PACKUSWB, saturate and pack. */
-	return alpha32 | _mm_cvtsi128_si32(ret);
-}
-IGNORE_UNINITIALIZED_WARNING_STOP
-
 #endif /* WITH_SSE */
diff --git a/src/blitter/32bpp_sse4.hpp b/src/blitter/32bpp_sse4.hpp
index f8a563b85..7a3332d87 100644
--- a/src/blitter/32bpp_sse4.hpp
+++ b/src/blitter/32bpp_sse4.hpp
@@ -14,41 +14,14 @@
 
 #ifdef WITH_SSE
 
-#include "32bpp_ssse3.hpp"
-#include "smmintrin.h"
-
-#undef EXTR32
-#define EXTR32(m_from, m_rank) _mm_extract_epi32((*(um128i*) &m_from).m128i, m_rank)
-#undef INSR32
-#define INSR32(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, m_val, m_rank)
-
-IGNORE_UNINITIALIZED_WARNING_START
-#ifdef _SQ64
-	#undef INSR64
-	#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi64((*(um128i*) &m_into).m128i, m_val, m_rank)
-#else
-	typedef union { uint64 u64; struct _u32 { uint32 low, high; } u32; } u6432;
-	#undef INSR64
-	#define INSR64(m_val, m_into, m_rank) { \
-		u6432 v; \
-		v.u64 = m_val; \
-		(*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.low, (m_rank)*2); \
-		(*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.high, (m_rank)*2 + 1); \
-	}
-
-	#undef LOAD64
-	#define LOAD64(m_val, m_into) \
-		m_into = _mm_cvtsi32_si128(m_val); \
-		INSR32((m_val) >> 32, m_into, 1);
+#ifndef SSE_VERSION
+#define SSE_VERSION 4
 #endif
-IGNORE_UNINITIALIZED_WARNING_STOP
+#include "32bpp_ssse3.hpp"
 
 /** The SSE4 32 bpp blitter (without palette animation). */
 class Blitter_32bppSSE4 : public Blitter_32bppSSSE3 {
 public:
-	Colour AdjustBrightness(Colour colour, uint8 brightness);
-	static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
-
 	/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
 	template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
 	void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
diff --git a/src/blitter/32bpp_sse_func.hpp b/src/blitter/32bpp_sse_func.hpp
new file mode 100644
index 000000000..d6febcf49
--- /dev/null
+++ b/src/blitter/32bpp_sse_func.hpp
@@ -0,0 +1,225 @@
+/* $Id$ */
+
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file 32bpp_sse_func.hpp Functions related to SSE 32 bpp blitter. */
+
+#ifndef BLITTER_32BPP_SSE_BASE_HPP
+#define BLITTER_32BPP_SSE_BASE_HPP
+
+#ifdef WITH_SSE
+
+#include "32bpp_simple.hpp"
+#if (SSE_VERSION == 2)
+#include <emmintrin.h>
+#elif (SSE_VERSION == 3)
+#include <tmmintrin.h>
+#elif (SSE_VERSION == 4)
+#include <smmintrin.h>
+#endif
+
+#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
+#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL. Expands to an expression reading a variable named 'zoom' at the place of use.
+#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
+
+#ifdef _MSC_VER
+	#define ALIGN(n) __declspec(align(n)) ///< Request an alignment of n bytes (MSVC spelling).
+#else
+	#define ALIGN(n) __attribute__ ((aligned (n))) ///< Request an alignment of n bytes (attribute spelling used when _MSC_VER is not defined).
+#endif
+
+typedef union ALIGN(16) um128i { ///< 16-byte aligned union giving element-wise views of a __m128i.
+	__m128i m128i;       ///< The value as a whole.
+	uint8 m128i_u8[16];  ///< The value as 16 elements of 8 bits.
+	uint16 m128i_u16[8]; ///< The value as 8 elements of 16 bits.
+	uint32 m128i_u32[4]; ///< The value as 4 elements of 32 bits.
+	uint64 m128i_u64[2]; ///< The value as 2 elements of 64 bits.
+} um128i;
+
+#define CLEAR_HIGH_BYTE_MASK        _mm_setr_epi8(-1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0) ///< 0x00FF in every 16-bit element.
+#define ALPHA_CONTROL_MASK          _mm_setr_epi8( 6,  7,  6,  7,  6,  7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1) ///< As a PSHUFB control: 16-bit element 3 copied into elements 0-2, element 7 into elements 4-6, elements 3 and 7 zeroed.
+#define PACK_LOW_CONTROL_MASK       _mm_setr_epi8( 0,  2,  4, -1,  8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1) ///< As a PSHUFB control: low bytes of 16-bit elements 0-2 and 4-6 gathered into the low 8 bytes, all other result bytes zeroed.
+#define PACK_HIGH_CONTROL_MASK      _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,  0,  2,  4, -1,  8, 10, 12, -1) ///< Same selection as PACK_LOW_CONTROL_MASK, but gathered into the high 8 bytes.
+#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1,  2,  1,  2,  1,  2,  0,  2,  3,  2,  3,  2,  3,  2,  0,  2) ///< As a PSHUFB control: 16-bit elements built from source bytes (1,2) three times, (0,2), (3,2) three times, (0,2).
+#define BRIGHTNESS_DIV_CLEANER      _mm_setr_epi8(-1,  1, -1,  1, -1,  1, -1,  0, -1,  1, -1,  1, -1,  1, -1,  0) ///< 0x01FF in 16-bit elements 0-2 and 4-6, 0x00FF in elements 3 and 7.
+#define OVERBRIGHT_PRESENCE_MASK    _mm_setr_epi8( 1,  0,  1,  0,  1,  0,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0) ///< 0x0001 in 16-bit elements 0-2 and 4-6, zero in elements 3 and 7.
+#define OVERBRIGHT_VALUE_MASK       _mm_setr_epi8(-1,  0, -1,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0,  0,  0) ///< 0x00FF in 16-bit elements 0-2 and 4-6, zero in elements 3 and 7.
+#define OVERBRIGHT_CONTROL_MASK     _mm_setr_epi8( 0,  1,  0,  1,  0,  1,  7,  7,  2,  3,  2,  3,  2,  3,  7,  7) ///< As a PSHUFB control: 16-bit element 0 copied into elements 0-2, element 1 into elements 4-6, elements 3 and 7 filled with source byte 7.
+#define TRANSPARENT_NOM_BASE        _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) ///< 256 in every 16-bit element.
+
+static inline void InsertFirstUint32(const uint32 value, __m128i &into) // Replace 32-bit element 0 of 'into' with 'value'; only implemented for SSE_VERSION >= 4.
+{
+#if (SSE_VERSION >= 4)
+	into = _mm_insert_epi32(into, value, 0); // PINSRD
+#else
+	NOT_REACHED(); // No fallback is provided for the older SSE versions.
+#endif
+}
+
+static inline void InsertSecondUint32(const uint32 value, __m128i &into) // Replace 32-bit element 1 of 'into' with 'value', leaving the other elements as they are.
+{
+#if (SSE_VERSION >= 4)
+	into = _mm_insert_epi32(into, value, 1);       // PINSRD
+#else
+	into = _mm_insert_epi16(into, value, 2);       // PINSRW, low 16 bits of value into 16-bit element 2
+	into = _mm_insert_epi16(into, value >> 16, 3); // PINSRW, high 16 bits of value into 16-bit element 3
+#endif
+}
+
+static inline void LoadUint64(const uint64 value, __m128i &into) // Set 'into' to 'value' in its low 64 bits and to zero in its high 64 bits.
+{
+#ifdef _SQ64
+	into = _mm_cvtsi64_si128(value); // Zero-extends value to 128 bits.
+#else
+	#if (SSE_VERSION >= 4)
+		into = _mm_cvtsi32_si128(value);       // MOVD, low 32 bits of value, the rest of 'into' is zeroed.
+		InsertSecondUint32(value >> 32, into); // High 32 bits of value into 32-bit element 1.
+	#else
+		into = _mm_loadl_epi64((const __m128i *) &value); // MOVQ; zeroes the high 64 bits like the other branches, instead of leaving them untouched (and thus indeterminate for an uninitialised 'into').
+	#endif
+#endif
+}
+
+static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask) // Pack the 16-bit elements of 'from' into bytes; 'mask' is an AND mask for SSE2 and a PSHUFB control otherwise, callers must pass the matching one.
+{
+#if (SSE_VERSION == 2)
+	from = _mm_and_si128(from, mask);    // PAND, wipe high bytes to keep low bytes when packing
+	return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation)
+#else
+	return _mm_shuffle_epi8(from, mask); // PSHUFB, the bytes to keep are selected by 'mask'
+#endif
+}
+
+static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask) // Spread the alpha of each of the 2 unpacked pixels over the rgb elements of that pixel; 'mask' is not used by the SSE2 variant.
+{
+#if (SSE_VERSION == 2)
+	__m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1
+	return _mm_shufflehi_epi16(alphaAB, 0x3F);         // PSHUFHW, put alpha2 in front of each rgb2
+#else
+	return _mm_shuffle_epi8(from, mask); // PSHUFB, the distribution is entirely defined by 'mask'
+#endif
+}
+
+static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask) // Blend the 2 pixels in the low 64 bits of 'src' over the 2 pixels in the low 64 bits of 'dst'; the masks are forwarded to DistributeAlpha() and PackUnsaturated().
+{
+	__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());   // PUNPCKLBW, expand each uint8 into uint16
+	__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+
+	__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++;
+	alphaAB = _mm_srli_epi16(alphaAB, 15);                         // PSRLW, turn the all-ones compare result into a 1
+	alphaAB = _mm_add_epi16(alphaAB, srcAB);                       // PADDW, every non-zero element gets incremented
+	alphaAB = DistributeAlpha(alphaAB, distribution_mask);         // Spread the alpha of each pixel over its elements
+
+	srcAB = _mm_sub_epi16(srcAB, dstAB);     // PSUBW,    (r - Cr)
+	srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr)
+	srcAB = _mm_srli_epi16(srcAB, 8);        // PSRLW,  a*(r - Cr)/256
+	srcAB = _mm_add_epi16(srcAB, dstAB);     // PADDW,  a*(r - Cr)/256 + Cr
+	return PackUnsaturated(srcAB, pack_mask); // Back to one byte per element
+}
+
+/** Darken 2 pixels: scale down the colours of 'dst' according to the alpha of 'src'.
+ * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
+ */
+static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base)
+{
+	__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16
+	__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+	__m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
+	alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
+	__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); // Nominator of the formula above, assuming tr_nom_base holds 256 per element like TRANSPARENT_NOM_BASE (to be confirmed at the call sites).
+	dstAB = _mm_mullo_epi16(dstAB, nom);   // PMULLW, rgb * nom
+	dstAB = _mm_srli_epi16(dstAB, 8);      // PSRLW,  rgb * nom / 256
+	return _mm_packus_epi16(dstAB, dstAB); // PACKUSWB, saturate and pack.
+}
+
+IGNORE_UNINITIALIZED_WARNING_START
+static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness) // Scale the rgb of 'colour' by brightness / DEFAULT_BRIGHTNESS, move overbright channels towards white and keep the alpha of 'colour'.
+{
+	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; // One 16-bit field per channel: b in bits 0-15, g in bits 16-31, r in bits 32-47.
+	c16 *= brightness;   // Multiplies the three fields at once.
+	uint64 c16_ob = c16; // Helps out of order execution.
+	c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;
+	c16 &= 0x01FF01FF01FF; // Keep 9 bits per field.
+
+	/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed to -256 so we just have to take the 8 lower bits into account). */
+	c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16; // Bit 15 of each undivided field selects a 0xFF mask that keeps the low 8 bits of the matching divided field.
+	const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2; // Half of the sum of the three kept values.
+
+	const uint32 alpha32 = colour.data & 0xFF000000; // Alpha of the input, merged back into the result at the end.
+	__m128i ret;
+	LoadUint64(c16, ret);
+	if (ob != 0) {
+		__m128i ob128 = _mm_cvtsi32_si128(ob);
+		ob128 = _mm_shufflelo_epi16(ob128, 0xC0); // PSHUFLW, copy ob into the 16-bit elements 0-2.
+		__m128i white = OVERBRIGHT_VALUE_MASK;
+		__m128i c128 = ret;
+		ret = _mm_subs_epu16(white, c128); // PSUBUSW,   (255 - rgb)
+		ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb)
+		ret = _mm_srli_epi16(ret, 8);      // PSRLW,  ob*(255 - rgb)/256
+		ret = _mm_add_epi16(ret, c128);    // PADDW,  ob*(255 - rgb)/256 + rgb
+	}
+
+	ret = _mm_packus_epi16(ret, ret);      // PACKUSWB, saturate and pack.
+	return alpha32 | _mm_cvtsi128_si32(ret); // The low 32 bits hold the packed rgb, the alpha comes from the input colour.
+}
+IGNORE_UNINITIALIZED_WARNING_STOP
+
+/** Adjust the brightness of a colour; the default brightness leaves the colour as is.
+ * ReallyAdjustBrightness() is not called that often; inlining it implies a far jump, which has a huge latency.
+ */
+static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness)
+{
+	/* Only leave the fast path when the brightness is not the normal one. */
+	const bool needs_adjusting = (brightness != Blitter_32bppBase::DEFAULT_BRIGHTNESS);
+	if (needs_adjusting) return ReallyAdjustBrightness(colour, brightness);
+	return colour;
+}
+
+static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness) // Adjust the brightness of the 2 pixels in the low 64 bits of 'from'; only bytes 1 and 3 of 'brightness' are used, one per pixel.
+{
+#if (SSE_VERSION < 3)
+	NOT_REACHED(); // The implementation below needs PSHUFB and PHADDW.
+#else
+	/* The following dataflow differs from the one of ReallyAdjustBrightness() only for alpha.
+	 * In order to keep alpha in colAB, insert a 1 in an unused brightness byte (a*1->a).
+	 * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
+	 */
+	brightness &= 0xFF00FF00; // Keep bytes 1 and 3 only.
+	brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;
+
+	__m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16
+	__m128i briAB = _mm_cvtsi32_si128(brightness);
+	briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2.
+	colAB = _mm_mullo_epi16(colAB, briAB);         // PMULLW, element * brightness
+	__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); // PSRLW, highest bit of each product
+	colAB = _mm_srli_epi16(colAB, 7);              // PSRLW, product / 128
+
+	/* Sum overbright.
+	 * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
+	 * -255 is changed to -256 so we just have to take the 8 lower bits into account.
+	 */
+	colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
+	colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
+	colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); // 0x00FF for each rgb element with overbright, 0 otherwise
+	colAB_ob = _mm_and_si128(colAB_ob, colAB);                   // The 8 lower bits of the overbright elements
+	__m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128()); // PHADDW twice, sum per pixel in the 16-bit elements 0 and 1
+
+	obAB = _mm_srli_epi16(obAB, 1);        // Reduce overbright strength.
+	obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); // PSHUFB, put the sum of each pixel next to the rgb of that pixel
+	__m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white.
+	retAB = _mm_subs_epu16(retAB, colAB);  //    (255 - rgb)
+	retAB = _mm_mullo_epi16(retAB, obAB);  // ob*(255 - rgb)
+	retAB = _mm_srli_epi16(retAB, 8);      // ob*(255 - rgb)/256
+	retAB = _mm_add_epi16(retAB, colAB);   // ob*(255 - rgb)/256 + rgb
+
+	return _mm_packus_epi16(retAB, retAB); // PACKUSWB, saturate and pack.
+#endif
+}
+
+#endif /* WITH_SSE */
+#endif /* BLITTER_32BPP_SSE_BASE_HPP */
diff --git a/src/blitter/32bpp_ssse3.cpp b/src/blitter/32bpp_ssse3.cpp
index 9cee7dbf9..3c42d359c 100644
--- a/src/blitter/32bpp_ssse3.cpp
+++ b/src/blitter/32bpp_ssse3.cpp
@@ -74,8 +74,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 				for (uint x = (uint) effective_width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					ALPHA_BLEND_2();
-					_mm_storel_epi64((__m128i*) dst, srcABCD);
+					_mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 					src += 2;
 					dst += 2;
 				}
@@ -83,8 +82,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 				if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					ALPHA_BLEND_2();
-					dst->data = _mm_cvtsi128_si32(srcABCD);
+					dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 				}
 				break;
 
@@ -96,33 +94,39 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 
 					/* Remap colours. */
 					if (mvX2 & 0x00FF00FF) {
-						/* Written so the compiler uses CMOV. */
-						const Colour src0 = src[0];
-						const uint m0 = (byte) mvX2;
-						const uint r0 = remap[m0];
-						const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
-						Colour c0 = 0; // Use alpha of 0 to keep dst as is.
-						c0 = r0 == 0 ? c0 : c0map;
-						c0 = m0 != 0 ? c0 : src0;
-						srcABCD = _mm_cvtsi32_si128(c0.data);
-
-						const Colour src1 = src[1];
-						const uint m1 = (byte) (mvX2 >> 16);
-						const uint r1 = remap[m1];
-						const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
-						Colour c1 = 0;
-						c1 = r1 == 0 ? c1 : c1map;
-						c1 = m1 != 0 ? c1 : src1;
-						INSR32(c1.data, srcABCD, 1);
-
-						if ((mvX2 & 0xFF00FF00) != 0x80008000) {
-							ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
-						}
+						#define CMOV_REMAP(m_colour, m_src, m_m) \
+							/* Written so the compiler uses CMOV. */ \
+							Colour m_colour = 0; \
+							{ \
+							const Colour srcm = (Colour) (m_src); \
+							const uint m = (byte) (m_m); \
+							const uint r = remap[m]; \
+							const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
+							m_colour = r == 0 ? m_colour : cmap; \
+							m_colour = m != 0 ? m_colour : srcm; \
+							}
+#ifdef _SQ64
+						uint64 srcs = _mm_cvtsi128_si64(srcABCD);
+						uint64 remapped_src = 0;
+						CMOV_REMAP(c0, srcs, mvX2);
+						remapped_src = c0.data;
+						CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16);
+						remapped_src |= (uint64) c1.data << 32;
+						srcABCD = _mm_cvtsi64_si128(remapped_src);
+#else
+						Colour remapped_src[2];
+						CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2);
+						remapped_src[0] = c0.data;
+						CMOV_REMAP(c1, src[1], mvX2 >> 16);
+						remapped_src[1] = c1.data;
+						srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
+#endif
+
+						if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
 					}
 
 					/* Blend colours. */
-					ALPHA_BLEND_2();
-					_mm_storel_epi64((__m128i *) dst, srcABCD);
+					_mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
 					dst += 2;
 					src += 2;
 					src_mv += 2;
@@ -134,7 +138,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 					if (src_mv->m) {
 						const uint r = remap[src_mv->m];
 						if (r != 0) {
-							Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
+							Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
 							if (src->a == 255) {
 								*dst = remapped_colour;
 							} else {
@@ -148,7 +152,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
 						if (src->a < 255) {
 bmcr_alpha_blend_single:
 							__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-							ALPHA_BLEND_2();
+							srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
 						}
 						dst->data = _mm_cvtsi128_si32(srcABCD);
 					}
@@ -160,8 +164,7 @@ bmcr_alpha_blend_single:
 				for (uint x = (uint) bp->width / 2; x > 0; x--) {
 					__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
 					__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
-					DARKEN_2();
-					_mm_storel_epi64((__m128i *) dst, dstAB);
+					_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
 					src += 2;
 					dst += 2;
 				}
@@ -169,8 +172,7 @@ bmcr_alpha_blend_single:
 				if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
 					__m128i srcABCD = _mm_cvtsi32_si128(src->data);
 					__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
-					DARKEN_2();
-					dst->data = _mm_cvtsi128_si32(dstAB);
+					dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
 				}
 				break;
 		}
diff --git a/src/blitter/32bpp_ssse3.hpp b/src/blitter/32bpp_ssse3.hpp
index 23e6e4878..090dd5c42 100644
--- a/src/blitter/32bpp_ssse3.hpp
+++ b/src/blitter/32bpp_ssse3.hpp
@@ -14,53 +14,10 @@
 
 #ifdef WITH_SSE
 
+#ifndef SSE_VERSION
+#define SSE_VERSION 3
+#endif
 #include "32bpp_sse2.hpp"
-#include "tmmintrin.h"
-
-/* Use PSHUFB instead of PSHUFHW+PSHUFLW. */
-#undef PUT_ALPHA_IN_FRONT_OF_RGB
-#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, a_cm);
-
-#undef PACK_AB_WITHOUT_SATURATION
-#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, pack_low_cm);
-
-/* Adjust brightness of 2 pixels. */
-#define ADJUST_BRIGHTNESS_2(m_colourX2, m_brightnessX2) \
-	/* The following dataflow differs from the one of AdjustBrightness() only for alpha.
-	 * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
-	 * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
-	 */ \
-	m_brightnessX2 &= 0xFF00FF00; \
-	m_brightnessX2 += DEFAULT_BRIGHTNESS; \
-	\
-	__m128i zero = _mm_setzero_si128(); \
-	__m128i colAB = _mm_unpacklo_epi8(m_colourX2, zero); \
-	\
-	__m128i briAB = _mm_cvtsi32_si128(m_brightnessX2); \
-	briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \
-	colAB = _mm_mullo_epi16(colAB, briAB); \
-	__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \
-	colAB = _mm_srli_epi16(colAB, 7); \
-	\
-	/* Sum overbright.
-	 * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
-	 * -255 is changed in -256 so we just have to take the 8 lower bits into account.
-	 */ \
-	colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER); \
-	colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK); \
-	colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); \
-	colAB_ob = _mm_and_si128(colAB_ob, colAB); \
-	__m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, zero), zero); \
-	\
-	obAB = _mm_srli_epi16(obAB, 1);        /* Reduce overbright strength. */ \
-	obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); \
-	__m128i retAB = OVERBRIGHT_VALUE_MASK; /* ob_mask is equal to white. */ \
-	retAB = _mm_subs_epu16(retAB, colAB);  /*    (255 - rgb) */ \
-	retAB = _mm_mullo_epi16(retAB, obAB);  /* ob*(255 - rgb) */ \
-	retAB = _mm_srli_epi16(retAB, 8);      /* ob*(255 - rgb)/256 */ \
-	retAB = _mm_add_epi16(retAB, colAB);   /* ob*(255 - rgb)/256 + rgb */ \
-	\
-	m_colourX2 = _mm_packus_epi16(retAB, retAB);
 
 /** The SSSE3 32 bpp blitter (without palette animation). */
 class Blitter_32bppSSSE3 : public Blitter_32bppSSE2 {