diff options
author | rubidium <rubidium@openttd.org> | 2014-01-02 23:12:32 +0000 |
---|---|---|
committer | rubidium <rubidium@openttd.org> | 2014-01-02 23:12:32 +0000 |
commit | fb05674cb76e05d12763539ae3f8fd18414a6660 (patch) | |
tree | 952796fb20c183992a4072913815d00aea06628e /src/blitter | |
parent | 899c0f9cd230bc36fc006e54bf88f598ab725257 (diff) | |
download | openttd-fb05674cb76e05d12763539ae3f8fd18414a6660.tar.xz |
(svn r26211) -Add: specialised non-animated SSE2 blitter (MJP)
In the Draw function this blitter is about 30% faster than 32bpp-optimized with a 32bpp base set, and about 10% faster with 8bpp base sets. That corresponds to about 5% and 1% of total run time, respectively.
Diffstat (limited to 'src/blitter')
-rw-r--r-- | src/blitter/32bpp_sse2.cpp | 348 | ||||
-rw-r--r-- | src/blitter/32bpp_sse2.hpp | 128 |
2 files changed, 476 insertions, 0 deletions
diff --git a/src/blitter/32bpp_sse2.cpp b/src/blitter/32bpp_sse2.cpp new file mode 100644 index 000000000..09c0ffa8a --- /dev/null +++ b/src/blitter/32bpp_sse2.cpp @@ -0,0 +1,348 @@ +/* $Id$ */ + +/* + * This file is part of OpenTTD. + * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2. + * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>. + */ + +/** @file 32bpp_sse2.cpp Implementation of the SSE2 32 bpp blitter. */ + +#ifdef WITH_SSE + +#include "../stdafx.h" +#include "../zoom_func.h" +#include "../settings_type.h" +#include "32bpp_sse2.hpp" + +/** Instantiation of the SSE2 32bpp blitter factory. */ +static FBlitter_32bppSSE2 iFBlitter_32bppSSE2; + +/** + * Draws a sprite to a (screen) buffer. It is templated to allow faster operation. 
+ * + * @tparam mode blitter mode + * @param bp further blitting parameters + * @param zoom zoom level at which we are drawing + */ +template <BlitterMode mode, Blitter_32bppSSE2::ReadMode read_mode, Blitter_32bppSSE2::BlockType bt_last> +inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom) +{ + Colour *dst_line = (Colour *) bp->dst + bp->top * bp->pitch + bp->left; + int effective_width = bp->width; + + /* Find where to start reading in the source sprite */ + const SpriteData * const sd = (const SpriteData *) bp->sprite; + const SpriteInfo * const si = &sd->infos[zoom]; + const MapValue *src_mv_line = (const MapValue *) &sd->data[si->mv_offset] + bp->skip_top * si->sprite_width; + const Colour *src_rgba_line = (const Colour *) ((const byte *) &sd->data[si->sprite_offset] + bp->skip_top * si->sprite_line_size); + + if (read_mode != RM_WITH_MARGIN) { + src_rgba_line += bp->skip_left; + src_mv_line += bp->skip_left; + } + + /* Load these variables into register before loop. */ + const __m128i clear_hi = CLEAR_HIGH_BYTE_MASK; + + for (int y = bp->height; y != 0; y--) { + Colour *dst = dst_line; + const Colour *src = src_rgba_line + META_LENGTH; + const MapValue *src_mv = src_mv_line; + + switch (mode) { + default: { + switch (read_mode) { + case RM_WITH_MARGIN: { + src += src_rgba_line[0].data; + dst += src_rgba_line[0].data; + const int width_diff = si->sprite_width - bp->width; + effective_width = bp->width - (int) src_rgba_line[0].data; + const int delta_diff = (int) src_rgba_line[1].data - width_diff; + const int new_width = effective_width - (delta_diff & ~1); + effective_width = delta_diff > 0 ? 
new_width : effective_width; + if (effective_width <= 0) break; + /* FALLTHROUGH */ + } + + case RM_WITH_SKIP: { + for (uint x = (uint) effective_width / 2; x > 0; x--) { + __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); + __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); + ALPHA_BLEND_2(); + *(uint64*) dst = EXTR64(srcABCD, 0); + src += 2; + dst += 2; + } + if (bt_last == BT_ODD) { + __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); + __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); + ALPHA_BLEND_2(); + (*dst).data = EXTR32(srcABCD, 0); + } + break; + } + + default: NOT_REACHED(); + } + break; + } + case BM_COLOUR_REMAP: { + switch (read_mode) { + case RM_WITH_MARGIN: { + src += src_rgba_line[0].data; + src_mv += src_rgba_line[0].data; + dst += src_rgba_line[0].data; + const int width_diff = si->sprite_width - bp->width; + effective_width = bp->width - (int) src_rgba_line[0].data; + const int delta_diff = (int) src_rgba_line[1].data - width_diff; + const int nd = effective_width - delta_diff; + effective_width = delta_diff > 0 ? 
nd : effective_width; + if (effective_width <= 0) break; + /* FALLTHROUGH */ + } + + case RM_WITH_SKIP: { + const byte *remap = bp->remap; + for (uint x = (uint) effective_width; x != 0; x--) { + /* In case the m-channel is zero, do not remap this pixel in any way */ + if (src_mv->m == 0) { + if (src->a < 255) { + __m128i srcABCD = _mm_loadu_si128((const __m128i*) src); + __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); + ALPHA_BLEND_2(); + (*dst).data = EXTR32(srcABCD, 0); + } else { + *dst = src->data; + } + } else { + const uint r = remap[src_mv->m]; + if (r != 0) { + Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v); + if (src->a < 255) { + __m128i srcABCD; + __m128i dstABCD = _mm_loadu_si128((__m128i*) dst); + remapped_colour.a = src->a; + INSR32(remapped_colour.data, srcABCD, 0); + ALPHA_BLEND_2(); + (*dst).data = EXTR32(srcABCD, 0); + } else + *dst = remapped_colour; + } + } + src_mv++; + dst++; + src++; + } + break; + } + + default: NOT_REACHED(); + } + src_mv_line += si->sprite_width; + break; + } + case BM_TRANSPARENT: + /* Make the current colour a bit more black, so it looks like this image is transparent */ + for (int x = bp->width; x != 0; x--) { + if (src->a == 255) { + *dst = MakeTransparent(*dst, 3, 4); + } else { + *dst = MakeTransparent(*dst, (256 * 4 - src->a), 256 * 4); + } + dst++; + src++; + } + break; + } + + src_rgba_line = (const Colour*) ((const byte*) src_rgba_line + si->sprite_line_size); + dst_line += bp->pitch; + } +} + +/** + * Draws a sprite to a (screen) buffer. Calls adequate templated function. 
+ * + * @param bp further blitting parameters + * @param mode blitter mode + * @param zoom zoom level at which we are drawing + */ +void Blitter_32bppSSE2::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom) +{ + switch (mode) { + case BM_NORMAL: { + const BlockType bt_last = (BlockType) (bp->width & 1); + if (bp->skip_left != 0 || bp->width <= MARGIN_NORMAL_THRESHOLD) { + switch (bt_last) { + case BT_EVEN: Draw<BM_NORMAL, RM_WITH_SKIP, BT_EVEN>(bp, zoom); return; + case BT_ODD: Draw<BM_NORMAL, RM_WITH_SKIP, BT_ODD>(bp, zoom); return; + default: NOT_REACHED(); + } + } else { + switch (bt_last) { + case BT_EVEN: Draw<BM_NORMAL, RM_WITH_MARGIN, BT_EVEN>(bp, zoom); return; + case BT_ODD: Draw<BM_NORMAL, RM_WITH_MARGIN, BT_ODD>(bp, zoom); return; + default: NOT_REACHED(); + } + } + break; + } + case BM_COLOUR_REMAP: + if (bp->skip_left != 0 || bp->width <= MARGIN_REMAP_THRESHOLD) { + Draw<BM_COLOUR_REMAP, RM_WITH_SKIP, BT_NONE>(bp, zoom); return; + } else { + Draw<BM_COLOUR_REMAP, RM_WITH_MARGIN, BT_NONE>(bp, zoom); return; + } + case BM_TRANSPARENT: Draw<BM_TRANSPARENT, RM_NONE, BT_NONE>(bp, zoom); return; + default: NOT_REACHED(); + } +} + +Sprite *Blitter_32bppSSE2::Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator) +{ + /* First uint32 of a line = ~1 & the number of transparent pixels from the left. + * Second uint32 of a line = the number of transparent pixels from the right. + * Then all RGBA then all MV. + */ + ZoomLevel zoom_min = ZOOM_LVL_NORMAL; + ZoomLevel zoom_max = ZOOM_LVL_NORMAL; + if (sprite->type != ST_FONT) { + zoom_min = _settings_client.gui.zoom_min; + zoom_max = _settings_client.gui.zoom_max; + if (zoom_max == zoom_min) zoom_max = ZOOM_LVL_MAX; + } + + /* Calculate sizes and allocate. 
*/ + SpriteData sd; + uint all_sprites_size = 0; + for (ZoomLevel z = zoom_min; z <= zoom_max; z++) { + const SpriteLoader::Sprite *src_sprite = &sprite[z]; + sd.infos[z].sprite_width = src_sprite->width; + sd.infos[z].sprite_offset = all_sprites_size; + sd.infos[z].sprite_line_size = sizeof(Colour) * src_sprite->width + sizeof(uint32) * META_LENGTH; + + const uint rgba_size = sd.infos[z].sprite_line_size * src_sprite->height; + sd.infos[z].mv_offset = all_sprites_size + rgba_size; + + const uint mv_size = sizeof(MapValue) * src_sprite->width * src_sprite->height; + all_sprites_size += rgba_size + mv_size; + } + + Sprite *dst_sprite = (Sprite *) allocator(sizeof(Sprite) + sizeof(SpriteData) + all_sprites_size); + dst_sprite->height = sprite->height; + dst_sprite->width = sprite->width; + dst_sprite->x_offs = sprite->x_offs; + dst_sprite->y_offs = sprite->y_offs; + memcpy(dst_sprite->data, &sd, sizeof(SpriteData)); + + /* Copy colours. */ + for (ZoomLevel z = zoom_min; z <= zoom_max; z++) { + const SpriteLoader::Sprite *src_sprite = &sprite[z]; + const SpriteLoader::CommonPixel *src = (const SpriteLoader::CommonPixel *) src_sprite->data; + Colour *dst_rgba_line = (Colour *) &dst_sprite->data[sizeof(SpriteData) + sd.infos[z].sprite_offset]; + MapValue *dst_mv = (MapValue *) &dst_sprite->data[sizeof(SpriteData) + sd.infos[z].mv_offset]; + for (uint y = src_sprite->height; y != 0; y--) { + Colour *dst_rgba = dst_rgba_line + META_LENGTH; + for (uint x = src_sprite->width; x != 0; x--) { + if (src->a != 0) { + dst_rgba->a = src->a; + dst_mv->m = src->m; + if (src->m != 0) { + /* Get brightest value (or default brightness if it's a black pixel). */ + const uint8 rgb_max = max(src->r, max(src->g, src->b)); + dst_mv->v = (rgb_max == 0) ? DEFAULT_BRIGHTNESS : rgb_max; + + /* Pre-convert the mapping channel to a RGB value. 
*/ + const Colour colour = AdjustBrightness(LookupColourInPalette(src->m), dst_mv->v); + dst_rgba->r = colour.r; + dst_rgba->g = colour.g; + dst_rgba->b = colour.b; + } else { + dst_rgba->r = src->r; + dst_rgba->g = src->g; + dst_rgba->b = src->b; + dst_mv->v = DEFAULT_BRIGHTNESS; + } + } else { + dst_rgba->data = 0; + *(uint16*) dst_mv = 0; + } + dst_rgba++; + dst_mv++; + src++; + } + + /* Count the number of transparent pixels from the left. */ + dst_rgba = dst_rgba_line + META_LENGTH; + uint32 nb_pix_transp = 0; + for (uint x = src_sprite->width; x != 0; x--) { + if (dst_rgba->a == 0) nb_pix_transp++; + else break; + dst_rgba++; + } + (*dst_rgba_line).data = nb_pix_transp & ~1; // "& ~1" to preserve the last block type + + Colour *nb_right = dst_rgba_line + 1; + dst_rgba_line = (Colour*) ((byte*) dst_rgba_line + sd.infos[z].sprite_line_size); + + /* Count the number of transparent pixels from the right. */ + dst_rgba = dst_rgba_line - 1; + nb_pix_transp = 0; + for (uint x = src_sprite->width; x != 0; x--) { + if (dst_rgba->a == 0) nb_pix_transp++; + else break; + dst_rgba--; + } + (*nb_right).data = nb_pix_transp; // no "& ~1" here, must be done when we know bp->width + } + } + + return dst_sprite; +} + +/** ReallyAdjustBrightness() is not called that often. + * Inlining this function implies a far jump, which has a huge latency. + */ +inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightness) +{ + /* Shortcut for normal brightness. */ + if (brightness == DEFAULT_BRIGHTNESS) return colour; + + return this->ReallyAdjustBrightness(colour, brightness); +} + +Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness) +{ + ALIGN(16) uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32; + c16 *= brightness; + uint64 c16_ob = c16; // Helps out of order execution. 
+ c16 /= DEFAULT_BRIGHTNESS; + c16 &= 0x01FF01FF01FF; + + /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */ + c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16; + uint64 ob = (uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32); + + const uint32 alpha32 = colour.data & 0xFF000000; + __m128i ret; + INSR64(c16, ret, 0); + if (ob != 0) { + /* Reduce overbright strength. */ + ob /= 2; + __m128i ob128; + INSR64(ob | ob << 16 | ob << 32, ob128, 0); + __m128i white = OVERBRIGHT_VALUE_MASK; + __m128i c128 = ret; + ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */ + ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */ + ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */ + ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */ + } + + ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */ + return alpha32 | EXTR32(ret, 0); +} + +#endif /* WITH_SSE */ diff --git a/src/blitter/32bpp_sse2.hpp b/src/blitter/32bpp_sse2.hpp new file mode 100644 index 000000000..0e11510cf --- /dev/null +++ b/src/blitter/32bpp_sse2.hpp @@ -0,0 +1,128 @@ +/* $Id$ */ + +/* + * This file is part of OpenTTD. + * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2. + * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>. + */ + +/** @file 32bpp_sse2.hpp SSE2 32 bpp blitter. 
*/ + +#ifndef BLITTER_32BPP_SSE2_HPP +#define BLITTER_32BPP_SSE2_HPP + +#ifdef WITH_SSE + +#include "32bpp_simple.hpp" +#include "emmintrin.h" + +#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite. +#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL. +#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP. + +#ifdef _MSC_VER + #define ALIGN(n) __declspec(align(n)) +#else + #define ALIGN(n) __attribute__ ((aligned (n))) +#endif + +typedef union ALIGN(16) um128i { + __m128i m128i; + uint8 m128i_u8[16]; + uint16 m128i_u16[8]; + uint32 m128i_u32[4]; + uint64 m128i_u64[2]; +} um128i; + +#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0) +#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1) +#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1) +#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1) +#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2) +#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0) +#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0) +#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0) +#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7) +#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256) + +#define EXTR32(from, rank) (*(um128i*) &from).m128i_u32[rank] +#define EXTR64(from, rank) (*(um128i*) &from).m128i_u64[rank] +#define INSR32(val, into, rank) { \ + (*(um128i*) &into).m128i = _mm_insert_epi16((*(um128i*) &into).m128i, val, (rank)*2); \ + (*(um128i*) 
&into).m128i = _mm_insert_epi16((*(um128i*) &into).m128i, (val) >> 16, (rank)*2 + 1); \ +} +#define INSR64(val, into, rank) (*(um128i*) &into).m128i_u64[rank] = (val) + +/* Alpha blend 2 pixels. */ +#define ALPHA_BLEND_2() { \ + __m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, expand each uint8 into uint16 */ \ + __m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \ + \ + __m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); /* PCMPGTW, if (alpha > 0) a++; */ \ + alphaAB = _mm_srli_epi16(alphaAB, 15); \ + alphaAB = _mm_add_epi16(alphaAB, srcAB); \ + alphaAB = _mm_shufflelo_epi16(alphaAB, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \ + alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */ \ + \ + srcAB = _mm_sub_epi16(srcAB, dstAB); /* PSUBW, (r - Cr) */ \ + srcAB = _mm_mullo_epi16(srcAB, alphaAB); /* PMULLW, a*(r - Cr) */ \ + srcAB = _mm_srli_epi16(srcAB, 8); /* PSRLW, a*(r - Cr)/256 */ \ + srcAB = _mm_add_epi16(srcAB, dstAB); /* PADDW, a*(r - Cr)/256 + Cr */ \ + srcAB = _mm_and_si128(srcAB, clear_hi); /* PAND, wipe high bytes to keep low bytes when packing */ \ + srcABCD = _mm_packus_epi16(srcAB, srcAB); /* PACKUSWB, pack 2 colours (with saturation) */ \ +} + +/** The SSE2 32 bpp blitter (without palette animation). */ +class Blitter_32bppSSE2 : public Blitter_32bppSimple { +public: + struct MapValue { + uint8 m; + uint8 v; + }; + assert_compile(sizeof(MapValue) == 2); + + /** Helper for creating specialised functions for specific optimisations. */ + enum ReadMode { + RM_WITH_SKIP, ///< Use normal code for skipping empty pixels. + RM_WITH_MARGIN, ///< Use cached number of empty pixels at begin and end of line to reduce work. + RM_NONE, ///< No specialisation. + }; + + /** Helper for creating specialised functions for the case where the sprite width is odd or even. 
*/ + enum BlockType { + BT_EVEN, ///< An even number of pixels in the width; no need for a special case for the last pixel. + BT_ODD, ///< An odd number of pixels in the width; special case for the last pixel. + BT_NONE, ///< No specialisation for either case. + }; + + /** Data stored about a (single) sprite. */ + struct SpriteInfo { + uint32 sprite_offset; ///< The offset to the sprite data. + uint32 mv_offset; ///< The offset to the map value data. + uint16 sprite_line_size; ///< The size of a single line (pitch). + uint16 sprite_width; ///< The width of the sprite. + }; + struct SpriteData { + SpriteInfo infos[ZOOM_LVL_COUNT]; + byte data[]; ///< Data, all zoomlevels. + }; + + virtual Colour AdjustBrightness(Colour colour, uint8 brightness); + Colour ReallyAdjustBrightness(Colour colour, uint8 brightness); + /* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom); + template <BlitterMode mode, ReadMode read_mode, BlockType bt_last> + void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom); + /* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator); + /* virtual */ const char *GetName() { return "32bpp-sse2"; } +}; + +/** Factory for the SSE2 32 bpp blitter (without palette animation). */ +class FBlitter_32bppSSE2 : public BlitterFactory { +public: + FBlitter_32bppSSE2() : BlitterFactory("32bpp-sse2", "32bpp SSE2 Blitter (no palette animation)", HasCPUIDFlag(1, 3, 26)) {} + /* virtual */ Blitter *CreateInstance() { return new Blitter_32bppSSE2(); } +}; + +#endif /* WITH_SSE */ +#endif /* BLITTER_32BPP_SSE2_HPP */ |