summary | refs | log | tree | commit | diff
path: root/src/os
diff options
context:
space:
mode:
Diffstat (limited to 'src/os')
-rw-r--r-- src/os/windows/string_uniscribe.cpp 102
-rw-r--r-- src/os/windows/string_uniscribe.h 22
2 files changed, 124 insertions, 0 deletions
diff --git a/src/os/windows/string_uniscribe.cpp b/src/os/windows/string_uniscribe.cpp
index fbf908e09..ea8b8c022 100644
--- a/src/os/windows/string_uniscribe.cpp
+++ b/src/os/windows/string_uniscribe.cpp
@@ -16,6 +16,7 @@
#include "string_uniscribe.h"
#include "../../language.h"
#include "../../strings_func.h"
+#include "../../string_func.h"
#include "../../table/control_codes.h"
#include "win32.h"
#include <vector>
@@ -505,4 +506,105 @@ const int *UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCharMap() con
return this->glyph_to_char;
}
+
+/**
+ * Set a new iteration string and compute its break information.
+ *
+ * The UTF-8 input is converted to UTF-16 for Uniscribe. While converting, a mapping
+ * from each UTF-16 code unit (plus one entry for the end of the string) back to the
+ * byte offset in the UTF-8 source is built. The current position is reset to 0.
+ * @param s NUL-terminated UTF-8 string to iterate over.
+ */
+/* virtual */ void UniscribeStringIterator::SetString(const char *s)
+{
+	const char *string_base = s;
+
+	this->utf16_to_utf8.clear();
+	this->str_info.clear();
+	this->cur_pos = 0;
+
+	/* Uniscribe operates on UTF-16, thus we have to convert the input string.
+	 * To be able to return proper offsets, we have to create a mapping at the same time. */
+	std::vector<wchar_t> utf16_str; ///< UTF-16 copy of the string.
+	while (*s != '\0') {
+		size_t idx = s - string_base;
+
+		WChar c = Utf8Consume(&s);
+		if (c < 0x10000) {
+			utf16_str.push_back((wchar_t)c);
+		} else {
+			/* Make a surrogate pair; both halves map back to the same UTF-8 offset. */
+			utf16_str.push_back((wchar_t)(0xD800 + ((c - 0x10000) >> 10)));
+			utf16_str.push_back((wchar_t)(0xDC00 + ((c - 0x10000) & 0x3FF)));
+			this->utf16_to_utf8.push_back(idx);
+		}
+		this->utf16_to_utf8.push_back(idx);
+	}
+	/* One extra entry for the end-of-string position. */
+	this->utf16_to_utf8.push_back(s - string_base);
+
+	/* Query Uniscribe for word and cluster break information.
+	 * The resize value-initialises every entry, i.e. "no stop" by default. */
+	this->str_info.resize(this->utf16_to_utf8.size());
+
+	if (utf16_str.size() > 0) {
+		/* Itemize string into language runs. */
+		std::vector<SCRIPT_ITEM> runs = UniscribeItemizeString(&utf16_str[0], (int32)utf16_str.size());
+
+		/* The last item is only used for its iCharPos, which ends the preceding run. */
+		for (std::vector<SCRIPT_ITEM>::const_iterator run = runs.begin(); runs.size() > 0 && run != runs.end() - 1; run++) {
+			/* Get information on valid word and character breaks. */
+			int len = (run + 1)->iCharPos - run->iCharPos;
+			if (len <= 0) continue; // Nothing to analyse; also keeps &attr[0] valid below.
+
+			std::vector<SCRIPT_LOGATTR> attr(len);
+			bool have_breaks = SUCCEEDED(ScriptBreak(&utf16_str[run->iCharPos], len, &run->a, &attr[0]));
+
+			/* Extract the information we're interested in. */
+			for (size_t c = 0; c < attr.size(); c++) {
+				CharInfo &info = this->str_info[c + run->iCharPos];
+				if (have_breaks) {
+					/* First character of a run is always a valid word break. */
+					info.word_stop = attr[c].fWordStop || c == 0;
+					info.char_stop = attr[c].fCharStop;
+				} else {
+					/* Uniscribe gave us nothing usable for this run. Fall back to treating
+					 * every code point as a character (never stopping on the low half of a
+					 * surrogate pair) and only the start of the run as a word break. */
+					wchar_t unit = utf16_str[c + run->iCharPos];
+					info.word_stop = c == 0;
+					info.char_stop = unit < 0xDC00 || unit > 0xDFFF;
+				}
+			}
+		}
+	}
+
+	/* End-of-string is always a valid stopping point. */
+	this->str_info.back().char_stop = true;
+	this->str_info.back().word_stop = true;
+}
+
+/**
+ * Change the current string cursor.
+ *
+ * An offset that does not match the start of an encoded character is snapped back to
+ * the closest preceding one, and the result is then moved back to the start of its
+ * grapheme cluster.
+ * @param pos Byte offset into the UTF-8 string that was passed to SetString.
+ * @return Byte offset (into the UTF-8 string) of the cursor position actually set.
+ */
+/* virtual */ size_t UniscribeStringIterator::SetCurPosition(size_t pos)
+{
+	/* Without a string there is no mapping to index into. */
+	if (this->utf16_to_utf8.empty()) {
+		this->cur_pos = 0;
+		return 0;
+	}
+
+	/* Convert incoming position to an UTF-16 string index. The mapping is filled in
+	 * increasing offset order, so remember the last entry not beyond pos and stop at
+	 * the first exact match. */
+	size_t utf16_pos = 0;
+	for (size_t i = 0; i < this->utf16_to_utf8.size(); i++) {
+		if (this->utf16_to_utf8[i] > pos) break;
+		utf16_pos = i;
+		if (this->utf16_to_utf8[i] == pos) break;
+	}
+
+	/* Sanitize in case we get a position inside a grapheme cluster. */
+	while (utf16_pos > 0 && !this->str_info[utf16_pos].char_stop) utf16_pos--;
+	this->cur_pos = utf16_pos;
+
+	return this->utf16_to_utf8[this->cur_pos];
+}
+
+/**
+ * Advance the cursor to the next character or word stop.
+ *
+ * The last mapping entry stands for the end of the string; once the cursor is there it
+ * is left untouched, so a following Prev steps back from the end of the string.
+ * @param what Whether to move by character (ITER_CHARACTER) or by word (ITER_WORD).
+ * @return Byte offset (into the UTF-8 string) of the new cursor position, or END if the
+ *         cursor was already at the end of the string.
+ */
+/* virtual */ size_t UniscribeStringIterator::Next(IterType what)
+{
+	const size_t count = this->utf16_to_utf8.size();
+
+	assert(this->cur_pos <= count);
+	assert(what == StringIterator::ITER_CHARACTER || what == StringIterator::ITER_WORD);
+
+	/* Already at (or past) the end-of-string entry: nowhere left to go. */
+	if (this->cur_pos + 1 >= count) return END;
+
+	/* The end-of-string entry is always a stop, so this loop ends inside the mapping. */
+	do {
+		this->cur_pos++;
+	} while (this->cur_pos + 1 < count && (what == ITER_WORD ? !this->str_info[this->cur_pos].word_stop : !this->str_info[this->cur_pos].char_stop));
+
+	return this->utf16_to_utf8[this->cur_pos];
+}
+
+/**
+ * Move the cursor back to the previous character or word stop.
+ * @param what Whether to move by character (ITER_CHARACTER) or by word (ITER_WORD).
+ * @return Byte offset (into the UTF-8 string) of the new cursor position, or END if the
+ *         cursor was already at the start of the string.
+ */
+/* virtual */ size_t UniscribeStringIterator::Prev(IterType what)
+{
+	assert(this->cur_pos <= this->utf16_to_utf8.size());
+	assert(what == StringIterator::ITER_CHARACTER || what == StringIterator::ITER_WORD);
+
+	if (this->cur_pos == 0) return END;
+
+	/* Always step back at least once, then keep going until a stop of the requested
+	 * kind is found. Index 0 ends the search regardless of its flags. */
+	this->cur_pos--;
+	while (this->cur_pos > 0) {
+		const CharInfo &info = this->str_info[this->cur_pos];
+		const bool is_stop = (what == ITER_WORD) ? info.word_stop : info.char_stop;
+		if (is_stop) break;
+		this->cur_pos--;
+	}
+
+	return this->utf16_to_utf8[this->cur_pos];
+}
+
#endif /* defined(WITH_UNISCRIBE) */
diff --git a/src/os/windows/string_uniscribe.h b/src/os/windows/string_uniscribe.h
index a43a2a96e..6af858a88 100644
--- a/src/os/windows/string_uniscribe.h
+++ b/src/os/windows/string_uniscribe.h
@@ -15,6 +15,8 @@
#if defined(WITH_UNISCRIBE)
#include "../../gfx_layout.h"
+#include "../../string_base.h"
+#include <vector>
void UniscribeResetScriptCache(FontSize size);
@@ -65,6 +67,26 @@ public:
}
};
+/** String iterator using Uniscribe as a backend. */
+class UniscribeStringIterator : public StringIterator {
+	/** Break information for a single UTF-16 code unit of the string. */
+	struct CharInfo {
+		bool word_stop : 1; ///< Position is suitable as a word break.
+		bool char_stop : 1; ///< Position is the start of a grapheme cluster, i.e. a "character".
+	};
+
+	std::vector<CharInfo> str_info;    ///< Break information for each UTF-16 code unit, plus one entry for the end of the string.
+	std::vector<size_t> utf16_to_utf8; ///< Mapping from UTF-16 code unit position to index in the UTF-8 source string.
+
+	size_t cur_pos; ///< Current iteration position, as an index into #utf16_to_utf8.
+
+public:
+	/** Start out at position 0 so the cursor is never read uninitialised. */
+	UniscribeStringIterator() : cur_pos(0) {}
+
+	virtual void SetString(const char *s);
+	virtual size_t SetCurPosition(size_t pos);
+	virtual size_t Next(IterType what);
+	virtual size_t Prev(IterType what);
+};
+
#endif /* defined(WITH_UNISCRIBE) */
#endif /* STRING_UNISCRIBE_H */