2 files changed, 124 insertions, 0 deletions
diff --git a/src/os/windows/string_uniscribe.cpp b/src/os/windows/string_uniscribe.cpp
index fbf908e09..ea8b8c022 100644
--- a/src/os/windows/string_uniscribe.cpp
+++ b/src/os/windows/string_uniscribe.cpp
@@ -16,6 +16,7 @@
 #include "string_uniscribe.h"
 #include "../../language.h"
 #include "../../strings_func.h"
+#include "../../string_func.h"
 #include "../../table/control_codes.h"
 #include "win32.h"
 #include <vector>
@@ -505,4 +506,105 @@ const int *UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCharMap() con
 	return this->glyph_to_char;
 }
 
+
+/* virtual */ void UniscribeStringIterator::SetString(const char *s)
+{
+	const char *string_base = s; ///< Start of the input, used to compute byte offsets.
+
+	this->utf16_to_utf8.clear();
+	this->str_info.clear();
+	this->cur_pos = 0;
+
+	/* Uniscribe operates on UTF-16, thus we have to convert the input string.
+	 * To be able to return proper offsets, we have to create a mapping at the same time. */
+	std::vector<wchar_t> utf16_str;     ///< UTF-16 copy of the string.
+	while (*s != '\0') {
+		size_t idx = s - string_base; ///< Byte offset of the character that is decoded next.
+
+		WChar c = Utf8Consume(&s);
+		if (c < 0x10000) {
+			utf16_str.push_back((wchar_t)c);
+		} else {
+			/* Make a surrogate pair; both halves map to the same source offset. */
+			utf16_str.push_back((wchar_t)(0xD800 + ((c - 0x10000) >> 10)));
+			utf16_str.push_back((wchar_t)(0xDC00 + ((c - 0x10000) & 0x3FF)));
+			this->utf16_to_utf8.push_back(idx);
+		}
+		this->utf16_to_utf8.push_back(idx);
+	}
+	this->utf16_to_utf8.push_back(s - string_base); /* Terminal entry: offset of the end of the string. */
+
+	/* Query Uniscribe for word and cluster break information; there is one slot per mapping entry. */
+	this->str_info.resize(utf16_to_utf8.size());
+
+	if (utf16_str.size() > 0) {
+		/* Itemize string into language runs; the last item only marks the end of the run before it. */
+		std::vector<SCRIPT_ITEM> runs = UniscribeItemizeString(&utf16_str[0], (int32)utf16_str.size());
+
+		for (std::vector<SCRIPT_ITEM>::const_iterator run = runs.begin(); runs.size() > 0 && run != runs.end() - 1; run++) {
+			/* Get information on valid word and character breaks. */
+			int len = (run + 1)->iCharPos - run->iCharPos;
+			std::vector<SCRIPT_LOGATTR> attr(len);
+			ScriptBreak(&utf16_str[run->iCharPos], len, &run->a, &attr[0]); /* NOTE(review): the return value is not checked. */
+
+			/* Extract the information we're interested in. */
+			for (size_t c = 0; c < attr.size(); c++) {
+				/* First character of a run is always a valid word break. */
+				this->str_info[c + run->iCharPos].word_stop = attr[c].fWordStop || c == 0;
+				this->str_info[c + run->iCharPos].char_stop = attr[c].fCharStop;
+			}
+		}
+	}
+
+	/* End-of-string is always a valid stopping point. */
+	this->str_info.back().char_stop = true;
+	this->str_info.back().word_stop = true;
+}
+
+/* virtual */ size_t UniscribeStringIterator::SetCurPosition(size_t pos)
+{
+	/* Convert the incoming UTF-8 offset to a UTF-16 string index; without an exact match the index stays 0. */
+	size_t utf16_pos = 0;
+	for (size_t i = 0; i < this->utf16_to_utf8.size(); i++) {
+		if (this->utf16_to_utf8[i] == pos) {
+			utf16_pos = i;
+			break; /* First match wins, i.e. the leading half of a surrogate pair. */
+		}
+	}
+
+	/* Sanitize in case we get a position inside a grapheme cluster: back up to the closest character stop or index 0. */
+	while (utf16_pos > 0 && !this->str_info[utf16_pos].char_stop) utf16_pos--;
+	this->cur_pos = utf16_pos;
+
+	return this->utf16_to_utf8[this->cur_pos];
+}
+
+/* virtual */ size_t UniscribeStringIterator::Next(IterType what)
+{
+	assert(this->cur_pos <= this->utf16_to_utf8.size());
+	assert(what == StringIterator::ITER_CHARACTER || what == StringIterator::ITER_WORD);
+
+	if (this->cur_pos == this->utf16_to_utf8.size()) return END; /* Already past the last mapping entry. */
+
+	do { /* Step forward until a stop of the requested kind is reached or the mapping is exhausted. */
+		this->cur_pos++;
+	} while (this->cur_pos < this->utf16_to_utf8.size() && (what  == ITER_WORD ? !this->str_info[this->cur_pos].word_stop : !this->str_info[this->cur_pos].char_stop));
+
+	return this->cur_pos == this->utf16_to_utf8.size() ? END : this->utf16_to_utf8[this->cur_pos]; /* Translate back to an offset in the UTF-8 string. */
+}
+
+/* virtual */ size_t UniscribeStringIterator::Prev(IterType what)
+{
+	assert(this->cur_pos <= this->utf16_to_utf8.size());
+	assert(what == StringIterator::ITER_CHARACTER || what == StringIterator::ITER_WORD);
+
+	if (this->cur_pos == 0) return END; /* Already at the first mapping entry, nothing precedes it. */
+
+	do { /* Step backward until a stop of the requested kind is reached; index 0 always ends the search. */
+		this->cur_pos--;
+	} while (this->cur_pos > 0 && (what == ITER_WORD ? !this->str_info[this->cur_pos].word_stop : !this->str_info[this->cur_pos].char_stop));
+
+	return this->utf16_to_utf8[this->cur_pos]; /* Translate back to an offset in the UTF-8 string. */
+}
+
 #endif /* defined(WITH_UNISCRIBE) */
diff --git a/src/os/windows/string_uniscribe.h b/src/os/windows/string_uniscribe.h
index a43a2a96e..6af858a88 100644
--- a/src/os/windows/string_uniscribe.h
+++ b/src/os/windows/string_uniscribe.h
@@ -15,6 +15,8 @@
 #if defined(WITH_UNISCRIBE)
 
 #include "../../gfx_layout.h"
+#include "../../string_base.h"
+#include <vector>
 
 
 void UniscribeResetScriptCache(FontSize size);
@@ -65,6 +67,26 @@ public:
 	}
 };
 
+/** String iterator using Uniscribe as a backend. */
+class UniscribeStringIterator : public StringIterator {
+	/** Break flags for a single position in the UTF-16 copy of the string. */
+	struct CharInfo {
+		bool word_stop : 1; ///< Position is suitable as a word break.
+		bool char_stop : 1; ///< Position is the start of a grapheme cluster, i.e. a "character".
+	};
+
+	std::vector<CharInfo> str_info;      ///< Break information for each UTF-16 code unit, plus one entry for the end of the string.
+	std::vector<size_t>   utf16_to_utf8; ///< Mapping from UTF-16 code unit position to index in the UTF-8 source string.
+
+	size_t cur_pos; ///< Current iteration position, as an index into the UTF-16 mapping.
+
+public:
+	virtual void SetString(const char *s); ///< Set a new string and compute its break information.
+	virtual size_t SetCurPosition(size_t pos); ///< Move to a position in the UTF-8 string; returns the position actually used.
+	virtual size_t Next(IterType what); ///< Advance to the next character or word stop.
+	virtual size_t Prev(IterType what); ///< Move back to the previous character or word stop.
+};
+
 #endif /* defined(WITH_UNISCRIBE) */
 
 #endif /* STRING_UNISCRIBE_H */