summaryrefslogtreecommitdiff
path: root/src/blitter/32bpp_sse2.hpp
diff options
context:
space:
mode:
authorrubidium <rubidium@openttd.org>2014-01-02 23:12:32 +0000
committerrubidium <rubidium@openttd.org>2014-01-02 23:12:32 +0000
commitfb05674cb76e05d12763539ae3f8fd18414a6660 (patch)
tree952796fb20c183992a4072913815d00aea06628e /src/blitter/32bpp_sse2.hpp
parent899c0f9cd230bc36fc006e54bf88f598ab725257 (diff)
downloadopenttd-fb05674cb76e05d12763539ae3f8fd18414a6660.tar.xz
(svn r26211) -Add: specialised non-animated SSE2 blitter (MJP)
In the Draw function this is about 30% faster than 32bpp-optimized with a 32bpp base set, or about 10% faster with 8bpp base sets. That corresponds to about 5% and 1% of total run time, respectively.
Diffstat (limited to 'src/blitter/32bpp_sse2.hpp')
-rw-r--r--src/blitter/32bpp_sse2.hpp128
1 files changed, 128 insertions, 0 deletions
diff --git a/src/blitter/32bpp_sse2.hpp b/src/blitter/32bpp_sse2.hpp
new file mode 100644
index 000000000..0e11510cf
--- /dev/null
+++ b/src/blitter/32bpp_sse2.hpp
@@ -0,0 +1,128 @@
+/* $Id$ */
+
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file 32bpp_sse2.hpp SSE2 32 bpp blitter. */
+
+#ifndef BLITTER_32BPP_SSE2_HPP
+#define BLITTER_32BPP_SSE2_HPP
+
+#ifdef WITH_SSE
+
+#include "32bpp_simple.hpp"
+#include "emmintrin.h"
+
+#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
+/* NOTE: MARGIN_NORMAL_THRESHOLD expands to an expression that reads a variable named 'zoom'; it can only be used where such a variable is in scope. */
+#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL (8 at ZOOM_LVL_OUT_32X, 4 at every other zoom level).
+#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
+
+/* ALIGN(n): compiler specific spelling of "align this declaration on n bytes". */
+#ifdef _MSC_VER
+ #define ALIGN(n) __declspec(align(n)) ///< Spelling used when _MSC_VER is defined.
+#else
+ #define ALIGN(n) __attribute__ ((aligned (n))) ///< GCC style attribute spelling used for every other compiler.
+#endif
+
+/**
+ * 16-byte aligned overlay of a 128 bit SSE register.
+ * All members share the same storage, so the register can be read or written either as a
+ * whole (m128i) or as an array of unsigned 8, 16, 32 or 64 bit elements.
+ */
+union ALIGN(16) um128i {
+ __m128i m128i; ///< The register as a single 128 bit value.
+ uint8 m128i_u8[16]; ///< The register as sixteen 8 bit elements.
+ uint16 m128i_u16[8]; ///< The register as eight 16 bit elements.
+ uint32 m128i_u32[4]; ///< The register as four 32 bit elements.
+ uint64 m128i_u64[2]; ///< The register as two 64 bit elements.
+};
+
+/*
+ * 128 bit constants. Each macro expands to an _mm_setr_* call, whose arguments are listed starting with the lowest element of the register.
+ * NOTE(review): the *_CONTROL_MASK values look like byte shuffle selectors (source byte index per destination byte, -1 to zero the byte);
+ * their use is not visible in this header, confirm at the use sites.
+ */
+#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0) ///< 0xFF in the low byte and 0x00 in the high byte of every 16 bit element.
+#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
+#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
+#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1)
+#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2)
+#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0)
+#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0) ///< 16 bit elements 1, 1, 1, 0 in each 64 bit half.
+#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) ///< 16 bit elements 0x00FF, 0x00FF, 0x00FF, 0 in each 64 bit half.
+#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7)
+#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) ///< 256 in each of the eight 16 bit elements.
+
+/*
+ * Element access helpers for 128 bit values.
+ * 'from' / 'into' must be an lvalue holding a 128 bit value: its address is taken and reinterpreted as um128i.
+ * 'rank' is the index of the 32 bit (EXTR32/INSR32) or 64 bit (EXTR64/INSR64) element.
+ * All macro parameters are parenthesised so that arbitrary lvalue expressions can be passed safely.
+ */
+#define EXTR32(from, rank) ((*(um128i*) &(from)).m128i_u32[rank])
+#define EXTR64(from, rank) ((*(um128i*) &(from)).m128i_u64[rank])
+/*
+ * Store the low 32 bits of 'val' in element 'rank' of 'into', as two 16 bit inserts.
+ * 'val' is evaluated exactly once; 'into' is evaluated several times and must have no side effects.
+ * 'rank' must be a compile time constant, as _mm_insert_epi16 needs an immediate index.
+ * Expands to a brace-enclosed block (kept as such for existing callers), so it is a statement, not an expression.
+ */
+#define INSR32(val, into, rank) { \
+ const uint32 insr32_val_ = (val); /* Single evaluation of 'val'. */ \
+ (*(um128i*) &(into)).m128i = _mm_insert_epi16((*(um128i*) &(into)).m128i, insr32_val_, (rank)*2); /* Low 16 bits. */ \
+ (*(um128i*) &(into)).m128i = _mm_insert_epi16((*(um128i*) &(into)).m128i, insr32_val_ >> 16, (rank)*2 + 1); /* High 16 bits. */ \
+}
+/* Store 'val' in 64 bit element 'rank' of 'into'; usable as an expression. */
+#define INSR64(val, into, rank) ((*(um128i*) &(into)).m128i_u64[rank] = (val))
+
+/*
+ * Alpha blend 2 pixels.
+ * This expands to a brace-enclosed block, not a function. It expects the variables 'srcABCD' and
+ * 'dstABCD' (values usable as __m128i) and 'clear_hi' (a __m128i mask) to exist in the surrounding scope.
+ * Only the low 64 bits of srcABCD and dstABCD are used as input. The result is written back to
+ * srcABCD, where the packed bytes appear both in the low and in the high 64 bits; dstABCD is not modified.
+ * The shuffle constant 0x3F copies 16 bit element 3 of each half into elements 0..2 and element 0 into element 3.
+ * NOTE(review): clear_hi is presumably CLEAR_HIGH_BYTE_MASK; confirm at the use sites.
+ */
+#define ALPHA_BLEND_2() { \
+ __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, expand each uint8 into uint16 */ \
+ __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
+ \
+ __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); /* PCMPGTW, if (alpha > 0) a++; */ \
+ alphaAB = _mm_srli_epi16(alphaAB, 15); \
+ alphaAB = _mm_add_epi16(alphaAB, srcAB); \
+ alphaAB = _mm_shufflelo_epi16(alphaAB, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \
+ alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */ \
+ \
+ srcAB = _mm_sub_epi16(srcAB, dstAB); /* PSUBW, (r - Cr) */ \
+ srcAB = _mm_mullo_epi16(srcAB, alphaAB); /* PMULLW, a*(r - Cr) */ \
+ srcAB = _mm_srli_epi16(srcAB, 8); /* PSRLW, a*(r - Cr)/256 */ \
+ srcAB = _mm_add_epi16(srcAB, dstAB); /* PADDW, a*(r - Cr)/256 + Cr */ \
+ srcAB = _mm_and_si128(srcAB, clear_hi); /* PAND, wipe high bytes to keep low bytes when packing */ \
+ srcABCD = _mm_packus_epi16(srcAB, srcAB); /* PACKUSWB, pack 2 colours (with saturation) */ \
+}
+
+/**
+ * The SSE2 32 bpp blitter (without palette animation).
+ * Derives from Blitter_32bppSimple and declares its own Draw, Encode, AdjustBrightness and GetName.
+ */
+class Blitter_32bppSSE2 : public Blitter_32bppSimple {
+public:
+ /** Two byte record; presumably one is stored per pixel at SpriteInfo::mv_offset (confirm in Encode). */
+ struct MapValue {
+ uint8 m; ///< NOTE(review): presumably the remap/palette index of the pixel; confirm in Encode.
+ uint8 v; ///< NOTE(review): presumably the brightness value of the pixel; confirm in Encode.
+ };
+ assert_compile(sizeof(MapValue) == 2); ///< Guard that MapValue is exactly two bytes.
+
+ /** Helper for creating specialised functions for specific optimisations. */
+ enum ReadMode {
+ RM_WITH_SKIP, ///< Use normal code for skipping empty pixels.
+ RM_WITH_MARGIN, ///< Use cached number of empty pixels at begin and end of line to reduce work.
+ RM_NONE, ///< No specialisation.
+ };
+
+ /** Helper for creating specialised functions for the case where the sprite width is odd or even. */
+ enum BlockType {
+ BT_EVEN, ///< An even number of pixels in the width; no need for a special case for the last pixel.
+ BT_ODD, ///< An odd number of pixels in the width; special case for the last pixel.
+ BT_NONE, ///< No specialisation for either case.
+ };
+
+ /** Data stored about a (single) sprite. */
+ struct SpriteInfo {
+ uint32 sprite_offset; ///< The offset to the sprite data.
+ uint32 mv_offset; ///< The offset to the map value data.
+ uint16 sprite_line_size; ///< The size of a single line (pitch).
+ uint16 sprite_width; ///< The width of the sprite.
+ };
+ /** Header with one SpriteInfo per zoom level, followed by the data of all zoom levels. */
+ struct SpriteData {
+ SpriteInfo infos[ZOOM_LVL_COUNT]; ///< Per zoom level information, one entry for each of the ZOOM_LVL_COUNT levels.
+ byte data[]; ///< Data, all zoomlevels.
+ };
+
+ /** Virtual brightness adjustment entry point. */
+ virtual Colour AdjustBrightness(Colour colour, uint8 brightness);
+ /** Non-virtual brightness adjustment; NOTE(review): presumably the actual computation behind AdjustBrightness, confirm in the implementation. */
+ Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
+ /** Draw with the blitter mode given at run time. */
+ /* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
+ /** Draw with the blitter mode, read mode and block type fixed at compile time. */
+ template <BlitterMode mode, ReadMode read_mode, BlockType bt_last>
+ void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
+ /** Convert a loaded sprite into this blitter's format, allocating via the given allocator. */
+ /* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator);
+ /** Name of this blitter; the same string the factory registers it under. */
+ /* virtual */ const char *GetName() { return "32bpp-sse2"; }
+};
+
+/**
+ * Factory for the SSE2 32 bpp blitter (without palette animation).
+ * Registers the blitter under the name "32bpp-sse2".
+ */
+class FBlitter_32bppSSE2 : public BlitterFactory {
+public:
+ /*
+ * The third constructor argument is the result of HasCPUIDFlag(1, 3, 26).
+ * NOTE(review): presumably CPUID leaf 1, register index 3 (EDX), bit 26, i.e. the SSE2 feature flag,
+ * making the blitter usable only on CPUs with SSE2; confirm against HasCPUIDFlag and BlitterFactory.
+ */
+ FBlitter_32bppSSE2() : BlitterFactory("32bpp-sse2", "32bpp SSE2 Blitter (no palette animation)", HasCPUIDFlag(1, 3, 26)) {}
+ /** Create a new, heap allocated Blitter_32bppSSE2; the caller receives a raw pointer to it. */
+ /* virtual */ Blitter *CreateInstance() { return new Blitter_32bppSSE2(); }
+};
+
+#endif /* WITH_SSE */
+#endif /* BLITTER_32BPP_SSE2_HPP */