diff options
author | michi_cc <michi_cc@openttd.org> | 2013-08-05 20:35:31 +0000 |
---|---|---|
committer | michi_cc <michi_cc@openttd.org> | 2013-08-05 20:35:31 +0000 |
commit | 76367f6bf1b5b459c9a15faa0cc0ea1dab191c6f (patch) | |
tree | da45ccb0547732c9ff21f670971d5e7979c38f8c /src/string.cpp | |
parent | e7dc14b25af4b2802a956dd1cd99c187fb4acb56 (diff) | |
download | openttd-76367f6bf1b5b459c9a15faa0cc0ea1dab191c6f.tar.xz |
(svn r25653) -Add: Caret movement by words for CJK languages.
Diffstat (limited to 'src/string.cpp')
-rw-r--r-- | src/string.cpp | 167 |
1 files changed, 151 insertions, 16 deletions
diff --git a/src/string.cpp b/src/string.cpp index bb1f2bbd0..ada9f9022 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front) class IcuStringIterator : public StringIterator { icu::BreakIterator *char_itr; ///< ICU iterator for characters. + icu::BreakIterator *word_itr; ///< ICU iterator for words. const char *string; ///< Iteration string in UTF-8. + SmallVector<UChar, 32> utf16_str; ///< UTF-16 copy of the string. + SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code point position to index in the UTF-8 source string. + public: - IcuStringIterator() : char_itr(NULL) + IcuStringIterator() : char_itr(NULL), word_itr(NULL) { UErrorCode status = U_ZERO_ERROR; this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); + this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); + + *this->utf16_str.Append() = '\0'; + *this->utf16_to_utf8.Append() = 0; } virtual ~IcuStringIterator() { delete this->char_itr; + delete this->word_itr; } virtual void SetString(const char *s) { this->string = s; + /* Unfortunately current ICU versions only provide rudimentary support + * for word break iterators (especially for CJK languages) in combination + * with UTF-8 input. As a work around we have to convert the input to + * UTF-16 and create a mapping back to UTF-8 character indices. */ + this->utf16_str.Clear(); + this->utf16_to_utf8.Clear(); + + while (*s != '\0') { + size_t idx = s - this->string; + + WChar c = Utf8Consume(&s); + if (c < 0x10000) { + *this->utf16_str.Append() = (UChar)c; + } else { + /* Make a surrogate pair. */ + *this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10)); + *this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF)); + *this->utf16_to_utf8.Append() = idx; + } + *this->utf16_to_utf8.Append() = idx; + } + *this->utf16_str.Append() = '\0'; + *this->utf16_to_utf8.Append() = s - this->string; + UText text = UTEXT_INITIALIZER; UErrorCode status = U_ZERO_ERROR; - utext_openUTF8(&text, s, -1, &status); + utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status); this->char_itr->setText(&text, status); + this->word_itr->setText(&text, status); this->char_itr->first(); + this->word_itr->first(); } virtual size_t SetCurPosition(size_t pos) { + /* Convert incoming position to an UTF-16 string index. */ + uint utf16_pos = 0; + for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) { + if (this->utf16_to_utf8[i] == pos) { + utf16_pos = i; + break; + } + } + /* isBoundary has the documented side-effect of setting the current * position to the first valid boundary equal to or greater than * the passed value. */ - this->char_itr->isBoundary((int32_t)pos); - return this->char_itr->current(); + this->char_itr->isBoundary(utf16_pos); + return this->utf16_to_utf8[this->char_itr->current()]; } - virtual size_t Next() + virtual size_t Next(IterType what) { - int32_t pos = this->char_itr->next(); - return pos == icu::BreakIterator::DONE ? END : pos; + int32_t pos; + switch (what) { + case ITER_CHARACTER: + pos = this->char_itr->next(); + break; + + case ITER_WORD: + pos = this->word_itr->following(this->char_itr->current()); + /* The ICU word iterator considers both the start and the end of a word a valid + * break point, but we only want word starts. Move to the next location in + * case the new position points to whitespace. */ + while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next(); + + this->char_itr->isBoundary(pos); + break; + + default: + NOT_REACHED(); + } + + return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos]; } - virtual size_t Prev() + virtual size_t Prev(IterType what) { - int32_t pos = this->char_itr->previous(); - return pos == icu::BreakIterator::DONE ? END : pos; + int32_t pos; + switch (what) { + case ITER_CHARACTER: + pos = this->char_itr->previous(); + break; + + case ITER_WORD: + pos = this->word_itr->preceding(this->char_itr->current()); + /* The ICU word iterator considers both the start and the end of a word a valid + * break point, but we only want word starts. Move to the previous location in + * case the new position points to whitespace. */ + while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous(); + + this->char_itr->isBoundary(pos); + break; + + default: + NOT_REACHED(); + } + + return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos]; } }; @@ -742,26 +824,79 @@ public: return this->cur_pos = pos; } - virtual size_t Next() + virtual size_t Next(IterType what) { assert(this->string != NULL); /* Already at the end? */ if (this->cur_pos >= this->len) return END; - WChar c; - this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos); - return this->cur_pos; + switch (what) { + case ITER_CHARACTER: { + WChar c; + this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos); + return this->cur_pos; + } + + case ITER_WORD: { + WChar c; + /* Consume current word. */ + size_t offs = Utf8Decode(&c, this->string + this->cur_pos); + while (this->cur_pos < this->len && !IsWhitespace(c)) { + this->cur_pos += offs; + offs = Utf8Decode(&c, this->string + this->cur_pos); + } + /* Consume whitespace to the next word. */ + while (this->cur_pos < this->len && IsWhitespace(c)) { + this->cur_pos += offs; + offs = Utf8Decode(&c, this->string + this->cur_pos); + } + + return this->cur_pos; + } + + default: + NOT_REACHED(); + } + + return END; } - virtual size_t Prev() + virtual size_t Prev(IterType what) { assert(this->string != NULL); /* Already at the beginning? */ if (this->cur_pos == 0) return END; - return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string; + switch (what) { + case ITER_CHARACTER: + return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string; + + case ITER_WORD: { + const char *s = this->string + this->cur_pos; + WChar c; + /* Consume preceding whitespace. */ + do { + s = Utf8PrevChar(s); + Utf8Decode(&c, s); + } while (s > this->string && IsWhitespace(c)); + /* Consume preceding word. */ + while (s > this->string && !IsWhitespace(c)) { + s = Utf8PrevChar(s); + Utf8Decode(&c, s); + } + /* Move caret back to the beginning of the word. */ + if (IsWhitespace(c)) Utf8Consume(&s); + + return this->cur_pos = s - this->string; + } + + default: + NOT_REACHED(); + } + + return END; } }; |