summaryrefslogtreecommitdiff
path: root/src/blitter/32bpp_sse2.hpp
blob: a0ed74cdbc303657e980b1854db344907917c13f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/* $Id$ */

/*
 * This file is part of OpenTTD.
 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
 */

/** @file 32bpp_sse2.hpp SSE2 32 bpp blitter. */

#ifndef BLITTER_32BPP_SSE2_HPP
#define BLITTER_32BPP_SSE2_HPP

#ifdef WITH_SSE

#include "32bpp_simple.hpp"
#include "emmintrin.h"

/** Number of uint32 inserted before each line of pixels in a sprite. */
#define META_LENGTH 2
/**
 * Minimum width to use margins with BM_NORMAL.
 * @note Not a plain constant: the expansion reads a variable named \c zoom from the
 *       scope in which the macro is used and yields 8 for ZOOM_LVL_OUT_32X, 4 otherwise.
 */
#define MARGIN_NORMAL_THRESHOLD (zoom != ZOOM_LVL_OUT_32X ? 4 : 8)
/** Minimum width to use margins with BM_COLOUR_REMAP. */
#define MARGIN_REMAP_THRESHOLD 4

/*
 * ALIGN(n): ask the compiler for n-byte alignment of the type it is attached to.
 * MSVC (detected through _MSC_VER) gets the __declspec spelling, every other
 * compiler gets the GCC-style attribute.
 */
#if !defined(_MSC_VER)
	#define ALIGN(n) __attribute__ ((aligned (n)))
#else
	#define ALIGN(n) __declspec(align(n))
#endif

/**
 * 16-byte aligned overlay of one __m128i with plain integer arrays, so that the
 * same 128 bits can be addressed as 16 bytes, 8 words, 4 dwords or 2 qwords.
 */
union ALIGN(16) um128i {
	__m128i m128i;       ///< The value as a single SSE integer vector.
	uint8 m128i_u8[16];  ///< The value as 16 unsigned 8-bit elements.
	uint16 m128i_u16[8]; ///< The value as 8 unsigned 16-bit elements.
	uint32 m128i_u32[4]; ///< The value as 4 unsigned 32-bit elements.
	uint64 m128i_u64[2]; ///< The value as 2 unsigned 64-bit elements.
};

/*
 * Constant 128-bit vectors.
 *
 * Every macro below expands to an _mm_setr_epi8() / _mm_setr_epi16() call, so the vector is
 * built at each place the macro is used. With the "setr" forms the first argument is the lowest
 * byte / 16-bit lane. A byte pair (lo, hi) therefore forms the 16-bit lane value hi * 256 + lo.
 *
 * NOTE(review): none of these macros is used by code in this header; the descriptions are
 * limited to the bit patterns. The *_CONTROL_MASK tables contain byte indices and -1 entries,
 * which looks like a byte-shuffle selector (PSHUFB style, where -1 produces a zero byte) --
 * confirm at the use sites.
 */
/* 0x00FF in all eight 16-bit lanes: AND with it to keep only the low byte of each lane. */
#define CLEAR_HIGH_BYTE_MASK        _mm_setr_epi8(-1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0)
/* Byte indices: (6,7) three times then (-1,-1); (14,15) three times then (-1,-1). */
#define ALPHA_CONTROL_MASK          _mm_setr_epi8( 6,  7,  6,  7,  6,  7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
/* Byte indices 0,2,4,-1,8,10,12,-1 in the low 8 bytes; the high 8 bytes are all -1. */
#define PACK_LOW_CONTROL_MASK       _mm_setr_epi8( 0,  2,  4, -1,  8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
/* Same indices as PACK_LOW_CONTROL_MASK but placed in the high 8 bytes; the low 8 bytes are all -1. */
#define PACK_HIGH_CONTROL_MASK      _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,  0,  2,  4, -1,  8, 10, 12, -1)
/* Byte indices: (1,2) three times then (0,2); (3,2) three times then (0,2). */
#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1,  2,  1,  2,  1,  2,  0,  2,  3,  2,  3,  2,  3,  2,  0,  2)
/* 16-bit lanes per 64-bit half: 0x01FF, 0x01FF, 0x01FF, 0x00FF. */
#define BRIGHTNESS_DIV_CLEANER      _mm_setr_epi8(-1,  1, -1,  1, -1,  1, -1,  0, -1,  1, -1,  1, -1,  1, -1,  0)
/* 16-bit lanes per 64-bit half: 1, 1, 1, 0. */
#define OVERBRIGHT_PRESENCE_MASK    _mm_setr_epi8( 1,  0,  1,  0,  1,  0,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0)
/* 16-bit lanes per 64-bit half: 0x00FF, 0x00FF, 0x00FF, 0. */
#define OVERBRIGHT_VALUE_MASK       _mm_setr_epi8(-1,  0, -1,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0,  0,  0)
/* Byte indices: (0,1) three times then (7,7); (2,3) three times then (7,7). */
#define OVERBRIGHT_CONTROL_MASK     _mm_setr_epi8( 0,  1,  0,  1,  0,  1,  7,  7,  2,  3,  2,  3,  2,  3,  7,  7)
/* The value 256 in all eight 16-bit lanes. */
#define TRANSPARENT_NOM_BASE        _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)

/*
 * Scalar access to single elements of a 128-bit value, done by viewing its storage as a um128i.
 *
 * All macro parameters are parenthesised in the expansions, so any lvalue expression
 * (e.g. a conditional or a dereference) can be passed for 'from' / 'into'.
 * The arguments are substituted textually and may be evaluated more than once:
 * do not pass expressions with side effects.
 */

/** Element \a rank of \a from viewed as four uint32 (um128i::m128i_u32 has 4 entries). */
#define EXTR32(from, rank) (*(um128i*) &(from)).m128i_u32[(rank)]
/** Element \a rank of \a from viewed as two uint64 (um128i::m128i_u64 has 2 entries). */
#define EXTR64(from, rank) (*(um128i*) &(from)).m128i_u64[(rank)]
/**
 * Store the 32-bit \a val into 32-bit element \a rank of \a into.
 * Done as two 16-bit inserts: the low half of \a val goes to 16-bit lane rank*2 and
 * (val >> 16) to lane rank*2 + 1. The lane number is handed to _mm_insert_epi16(),
 * so \a rank has to be usable as that intrinsic's index operand.
 * Expands to a braced statement block, not to an expression.
 */
#define INSR32(val, into, rank) { \
	(*(um128i*) &(into)).m128i = _mm_insert_epi16((*(um128i*) &(into)).m128i, (val), (rank) * 2); \
	(*(um128i*) &(into)).m128i = _mm_insert_epi16((*(um128i*) &(into)).m128i, (val) >> 16, (rank) * 2 + 1); \
}
/** Store \a val into element \a rank of \a into viewed as two uint64. */
#define INSR64(val, into, rank) (*(um128i*) &(into)).m128i_u64[(rank)] = (val)

/*
 * Alpha blend 2 pixels.
 *
 * Statement macro; it relies on three names being visible where it is expanded:
 *  - srcABCD: __m128i, its low 8 bytes are the source data; it is overwritten with the result.
 *  - dstABCD: __m128i, its low 8 bytes are the destination data; it is only read.
 *  - clear_hi: __m128i mask ANDed with the 16-bit results before packing
 *              (presumably CLEAR_HIGH_BYTE_MASK -- confirm at the expansion sites).
 *
 * The low 8 bytes of both inputs are widened to 16-bit lanes and every lane is computed as
 * ((src - dst) * factor >> 8) + dst, then narrowed back to bytes. Since srcAB is packed with
 * itself, the 8 result bytes end up in both the low and the high half of srcABCD.
 *
 * NOTE(review): the 0x3F shuffles copy lane 3 of each 4-lane group into lanes 0-2, while lane 3
 * itself receives lane 0; so the factor used for lane 3 is taken from lane 0, not from lane 3.
 */
#define ALPHA_BLEND_2() { \
	__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, interleave with zero bytes: each uint8 of the low half becomes a uint16 */ \
	__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); /* same widening for the destination */ \
	\
	__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128());   /* PCMPGTW, 0xFFFF in every lane whose value is > 0 */ \
	alphaAB = _mm_srli_epi16(alphaAB, 15); /* PSRLW, turn 0xFFFF into 1 */ \
	alphaAB = _mm_add_epi16(alphaAB, srcAB); /* PADDW, lane value + 1 when it was non-zero, i.e. if (alpha > 0) a++ */ \
	alphaAB = _mm_shufflelo_epi16(alphaAB, 0x3F); /* PSHUFLW, low 4 lanes: lanes 0-2 <- lane 3 (alpha1), lane 3 <- lane 0 */ \
	alphaAB = _mm_shufflehi_epi16(alphaAB, 0x3F); /* PSHUFHW, same pattern for the high 4 lanes (alpha2) */ \
	\
	srcAB = _mm_sub_epi16(srcAB, dstAB);          /* PSUBW,    (r - Cr), wraps modulo 2^16 when negative */ \
	srcAB = _mm_mullo_epi16(srcAB, alphaAB);      /* PMULLW, a*(r - Cr), low 16 bits of the product */ \
	srcAB = _mm_srli_epi16(srcAB, 8);             /* PSRLW,  a*(r - Cr)/256 (logical shift) */ \
	srcAB = _mm_add_epi16(srcAB, dstAB);          /* PADDW,  a*(r - Cr)/256 + Cr */ \
	srcAB = _mm_and_si128(srcAB, clear_hi);       /* PAND, wipe high bytes to keep low bytes when packing */ \
	srcABCD = _mm_packus_epi16(srcAB, srcAB);     /* PACKUSWB, pack 2 colours (with saturation); both halves get the same 8 bytes */ \
}

/**
 * Base methods for 32bpp SSE blitters.
 *
 * Declares the nested types that describe an encoded sprite and the two entry points
 * Encode() and AdjustBrightness(); neither of them is defined in this header.
 */
class Blitter_32bppSSE_Base {
public:
	virtual ~Blitter_32bppSSE_Base() {}

	/**
	 * Two-byte record made of the fields 'm' and 'v'.
	 * NOTE(review): presumably one record per pixel, with 'm' a remap index and 'v' a
	 * brightness value -- confirm against Encode() and the Draw() implementations.
	 */
	struct MapValue {
		uint8 m;
		uint8 v;
	};
	/* Build-time check that MapValue is exactly two bytes, i.e. contains no padding. */
	assert_compile(sizeof(MapValue) == 2);

	/** Helper for creating specialised functions for specific optimisations. */
	enum ReadMode {
		RM_WITH_SKIP,   ///< Use normal code for skipping empty pixels.
		RM_WITH_MARGIN, ///< Use cached number of empty pixels at begin and end of line to reduce work.
		RM_NONE,        ///< No specialisation.
	};

	/** Helper for creating specialised functions for the case where the sprite width is odd or even. */
	enum BlockType {
		BT_EVEN, ///< An even number of pixels in the width; no need for a special case for the last pixel.
		BT_ODD,  ///< An odd number of pixels in the width; special case for the last pixel.
		BT_NONE, ///< No specialisation for either case.
	};

	/** Data stored about a (single) sprite. */
	struct SpriteInfo {
		uint32 sprite_offset;    ///< The offset to the sprite data.
		uint32 mv_offset;        ///< The offset to the map value data.
		uint16 sprite_line_size; ///< The size of a single line (pitch).
		uint16 sprite_width;     ///< The width of the sprite.
	};
	/**
	 * Header with one SpriteInfo per ZOOM_LVL_COUNT entry, followed by the raw bytes.
	 * 'data' is declared without a size, so the storage for it has to be allocated
	 * together with (directly behind) the structure by whoever creates it.
	 */
	struct SpriteData {
		SpriteInfo infos[ZOOM_LVL_COUNT];
		byte data[]; ///< Data, all zoomlevels.
	};

	/**
	 * Convert a loaded sprite; declared here, defined elsewhere.
	 * @param sprite    The sprite as delivered by the sprite loader.
	 * @param allocator Allocation callback (presumably used to obtain the memory of the result -- confirm in the definition).
	 * @return Pointer to the resulting Sprite.
	 */
	Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator);
	/**
	 * Pure virtual hook that derived blitters must implement.
	 * @param colour     Input colour.
	 * @param brightness Brightness value to apply.
	 * @return The resulting colour.
	 */
	virtual Colour AdjustBrightness(Colour colour, uint8 brightness) = 0;
};

/**
 * The SSE2 32 bpp blitter (without palette animation).
 *
 * Inherits from both Blitter_32bppSimple and Blitter_32bppSSE_Base.
 */
class Blitter_32bppSSE2 : public Blitter_32bppSimple, public Blitter_32bppSSE_Base {
public:
	/** Implementation of the pure virtual from Blitter_32bppSSE_Base; defined outside this header. */
	virtual Colour AdjustBrightness(Colour colour, uint8 brightness);
	/** Static function with the same parameters as AdjustBrightness(); defined outside this header. */
	static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
	/**
	 * Draw with the blitter mode given at run time; defined outside this header.
	 * NOTE(review): presumably selects one of the template instantiations below -- confirm in the definition.
	 */
	/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
	/**
	 * Draw variant whose blitter mode, read mode and block type are compile-time template arguments.
	 * @tparam mode      Blitter mode.
	 * @tparam read_mode One of Blitter_32bppSSE_Base::ReadMode.
	 * @tparam bt_last   One of Blitter_32bppSSE_Base::BlockType.
	 */
	template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
	void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);

	/**
	 * Forwards to Blitter_32bppSSE_Base::Encode() with unchanged arguments and returns its result.
	 * NOTE(review): presumably needed because the other base class has an Encode() of its own -- confirm.
	 */
	/* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator) {
		return Blitter_32bppSSE_Base::Encode(sprite, allocator);
	}

	/** @return The fixed name string "32bpp-sse2". */
	/* virtual */ const char *GetName() { return "32bpp-sse2"; }
};

/** Factory for the SSE2 32 bpp blitter (without palette animation). */
class FBlitter_32bppSSE2 : public BlitterFactory {
public:
	/**
	 * Passes the blitter name, its description and the result of HasCPUIDFlag(1, 3, 26)
	 * to the BlitterFactory constructor.
	 * NOTE(review): (1, 3, 26) presumably means CPUID leaf 1, register EDX, bit 26, which is the
	 * SSE2 feature bit; the third constructor argument would then be whether this blitter is
	 * usable on the running CPU -- confirm against HasCPUIDFlag() and BlitterFactory.
	 */
	FBlitter_32bppSSE2() : BlitterFactory("32bpp-sse2", "32bpp SSE2 Blitter (no palette animation)", HasCPUIDFlag(1, 3, 26)) {}
	/** @return A Blitter_32bppSSE2 created with new; releasing it is up to the caller (presumably the blitter framework -- confirm). */
	/* virtual */ Blitter *CreateInstance() { return new Blitter_32bppSSE2(); }
};

#endif /* WITH_SSE */
#endif /* BLITTER_32BPP_SSE2_HPP */