summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/string.cpp 167
-rw-r--r-- src/string_base.h 10
-rw-r--r-- src/string_func.h 46
-rw-r--r-- src/textbuf.cpp 120
-rw-r--r-- src/textbuf_type.h 4
5 files changed, 222 insertions, 125 deletions
diff --git a/src/string.cpp b/src/string.cpp
index bb1f2bbd0..ada9f9022 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
class IcuStringIterator : public StringIterator
{
icu::BreakIterator *char_itr; ///< ICU iterator for characters.
+ icu::BreakIterator *word_itr; ///< ICU iterator for words.
const char *string; ///< Iteration string in UTF-8.
+ SmallVector<UChar, 32> utf16_str; ///< UTF-16 copy of the string.
+ SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code unit position to byte index in the UTF-8 source string (one entry per code unit plus one for the terminator).
+
public:
- IcuStringIterator() : char_itr(NULL)
+ IcuStringIterator() : char_itr(NULL), word_itr(NULL)
{
UErrorCode status = U_ZERO_ERROR;
this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
+ this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); /* NOTE(review): status is never inspected here; confirm the iterators cannot be NULL afterwards. */
+
+ *this->utf16_str.Append() = '\0'; /* Start out as an empty, terminated UTF-16 string. */
+ *this->utf16_to_utf8.Append() = 0; /* The terminator of the empty string maps to UTF-8 offset 0. */
}
virtual ~IcuStringIterator()
{
delete this->char_itr;
+ delete this->word_itr;
}
virtual void SetString(const char *s)
{
this->string = s;
+ /* Unfortunately current ICU versions only provide rudimentary support
+ * for word break iterators (especially for CJK languages) in combination
+ * with UTF-8 input. As a workaround we have to convert the input to
+ * UTF-16 and create a mapping back to UTF-8 character indices. */
+ this->utf16_str.Clear();
+ this->utf16_to_utf8.Clear();
+
+ while (*s != '\0') {
+ size_t idx = s - this->string; /* Byte offset of the character about to be consumed. */
+
+ WChar c = Utf8Consume(&s);
+ if (c < 0x10000) {
+ *this->utf16_str.Append() = (UChar)c;
+ } else {
+ /* Make a surrogate pair. */
+ *this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
+ *this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
+ *this->utf16_to_utf8.Append() = idx; /* Extra entry so both halves of the pair map to the same UTF-8 offset. */
+ }
+ *this->utf16_to_utf8.Append() = idx;
+ }
+ *this->utf16_str.Append() = '\0';
+ *this->utf16_to_utf8.Append() = s - this->string; /* The terminator maps to the end of the UTF-8 input. */
+
UText text = UTEXT_INITIALIZER;
UErrorCode status = U_ZERO_ERROR;
- utext_openUTF8(&text, s, -1, &status);
+ utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status); /* Length excludes the terminator appended above. */
this->char_itr->setText(&text, status);
+ this->word_itr->setText(&text, status);
this->char_itr->first();
+ this->word_itr->first();
}
virtual size_t SetCurPosition(size_t pos)
{
+ /* Convert the incoming UTF-8 byte position to a UTF-16 string index; stays 0 when no mapping entry matches. */
+ uint utf16_pos = 0;
+ for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
+ if (this->utf16_to_utf8[i] == pos) {
+ utf16_pos = i;
+ break;
+ }
+ }
+
/* isBoundary has the documented side-effect of setting the current
* position to the first valid boundary equal to or greater than
* the passed value. */
- this->char_itr->isBoundary((int32_t)pos);
- return this->char_itr->current();
+ this->char_itr->isBoundary(utf16_pos);
+ return this->utf16_to_utf8[this->char_itr->current()]; /* Report the boundary as a UTF-8 byte offset. */
}
- virtual size_t Next()
+ virtual size_t Next(IterType what)
{
- int32_t pos = this->char_itr->next();
- return pos == icu::BreakIterator::DONE ? END : pos;
+ int32_t pos;
+ switch (what) {
+ case ITER_CHARACTER:
+ pos = this->char_itr->next();
+ break;
+
+ case ITER_WORD:
+ pos = this->word_itr->following(this->char_itr->current());
+ /* The ICU word iterator considers both the start and the end of a word a valid
+ * break point, but we only want word starts. Move to the next location in
+ * case the new position points to whitespace. */
+ while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
+
+ if (pos != icu::BreakIterator::DONE) this->char_itr->isBoundary(pos); /* Sync the character iterator, but never hand it the DONE sentinel as an offset. */
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+
+ return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
}
- virtual size_t Prev()
+ virtual size_t Prev(IterType what)
{
- int32_t pos = this->char_itr->previous();
- return pos == icu::BreakIterator::DONE ? END : pos;
+ int32_t pos;
+ switch (what) {
+ case ITER_CHARACTER:
+ pos = this->char_itr->previous();
+ break;
+
+ case ITER_WORD:
+ pos = this->word_itr->preceding(this->char_itr->current());
+ /* The ICU word iterator considers both the start and the end of a word a valid
+ * break point, but we only want word starts. Step back again while the position
+ * points to whitespace, but stop at offset 0 so a valid position never becomes DONE. */
+ while (pos > 0 && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
+
+ if (pos != icu::BreakIterator::DONE) this->char_itr->isBoundary(pos); /* Sync the character iterator, but never hand it the DONE sentinel as an offset. */
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+
+ return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
}
};
@@ -742,26 +824,79 @@ public:
return this->cur_pos = pos;
}
- virtual size_t Next()
+ virtual size_t Next(IterType what)
{
assert(this->string != NULL);
/* Already at the end? */
if (this->cur_pos >= this->len) return END;
- WChar c;
- this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
- return this->cur_pos;
+ switch (what) {
+ case ITER_CHARACTER: { /* Advance by the length Utf8Decode reports for the character at the cursor. */
+ WChar c;
+ this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
+ return this->cur_pos;
+ }
+
+ case ITER_WORD: {
+ WChar c;
+ /* Consume current word: skip forward until whitespace or the end of the string. */
+ size_t offs = Utf8Decode(&c, this->string + this->cur_pos); /* offs: length reported for the character at the cursor. */
+ while (this->cur_pos < this->len && !IsWhitespace(c)) {
+ this->cur_pos += offs;
+ offs = Utf8Decode(&c, this->string + this->cur_pos);
+ }
+ /* Consume whitespace to the next word; the cursor ends on a non-whitespace character or at the end. */
+ while (this->cur_pos < this->len && IsWhitespace(c)) {
+ this->cur_pos += offs;
+ offs = Utf8Decode(&c, this->string + this->cur_pos);
+ }
+
+ return this->cur_pos;
+ }
+
+ default:
+ NOT_REACHED();
+ }
+
+ return END; /* Only reached if NOT_REACHED() returns; every handled case returns above. */
}
- virtual size_t Prev()
+ virtual size_t Prev(IterType what)
{
assert(this->string != NULL);
/* Already at the beginning? */
if (this->cur_pos == 0) return END;
- return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+ switch (what) {
+ case ITER_CHARACTER:
+ return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+
+ case ITER_WORD: {
+ const char *s = this->string + this->cur_pos;
+ WChar c;
+ /* Consume preceding whitespace; always steps back at least one character. */
+ do {
+ s = Utf8PrevChar(s);
+ Utf8Decode(&c, s);
+ } while (s > this->string && IsWhitespace(c));
+ /* Consume preceding word; stops on whitespace or at the start of the string. */
+ while (s > this->string && !IsWhitespace(c)) {
+ s = Utf8PrevChar(s);
+ Utf8Decode(&c, s);
+ }
+ /* Move caret back to the beginning of the word. */
+ if (IsWhitespace(c)) Utf8Consume(&s); /* NOTE(review): also taken when the loops stop at the string start on whitespace, so the result is then presumably the offset after that first character rather than 0 - confirm this is intended. */
+
+ return this->cur_pos = s - this->string;
+ }
+
+ default:
+ NOT_REACHED();
+ }
+
+ return END; /* Only reached if NOT_REACHED() returns; every handled case returns above. */
}
};
diff --git a/src/string_base.h b/src/string_base.h
index 73439f639..e1eaed349 100644
--- a/src/string_base.h
+++ b/src/string_base.h
@@ -15,6 +15,12 @@
/** Class for iterating over different kind of parts of a string. */
class StringIterator {
public:
+ /** Type of the iterator. */
+ enum IterType {
+ ITER_CHARACTER, ///< Iterate over characters (or more exactly grapheme clusters).
+ ITER_WORD, ///< Iterate over words.
+ };
+
/** Sentinel to indicate end-of-iteration. */
static const size_t END = SIZE_MAX;
@@ -45,13 +51,13 @@ public:
* Advance the cursor by one iteration unit.
* @return New cursor position (in bytes) or #END if the cursor is already at the end of the string.
*/
- virtual size_t Next() = 0;
+ virtual size_t Next(IterType what = ITER_CHARACTER) = 0;
/**
* Move the cursor back by one iteration unit.
* @return New cursor position (in bytes) or #END if the cursor is already at the start of the string.
*/
- virtual size_t Prev() = 0;
+ virtual size_t Prev(IterType what = ITER_CHARACTER) = 0;
protected:
StringIterator() {}
diff --git a/src/string_func.h b/src/string_func.h
index b0a42b808..d7056f1be 100644
--- a/src/string_func.h
+++ b/src/string_func.h
@@ -90,7 +90,6 @@ static inline WChar Utf8Consume(const char **s)
return c;
}
-
/**
* Return the length of a UTF-8 encoded character.
* @param c Unicode character.
@@ -157,6 +156,51 @@ static inline const char *Utf8PrevChar(const char *s)
size_t Utf8StringLength(const char *s);
/**
+ * Is the given character a lead surrogate code point (0xD800 to 0xDBFF inclusive)?
+ * @param c The character to test.
+ * @return True if the character is a lead surrogate code point.
+ */
+static inline bool Utf16IsLeadSurrogate(uint c)
+{
+ return c >= 0xD800 && c <= 0xDBFF;
+}
+
+/**
+ * Is the given character a trail surrogate code point (0xDC00 to 0xDFFF inclusive)?
+ * @param c The character to test.
+ * @return True if the character is a trail surrogate code point.
+ */
+static inline bool Utf16IsTrailSurrogate(uint c)
+{
+ return c >= 0xDC00 && c <= 0xDFFF;
+}
+
+/**
+ * Convert a UTF-16 surrogate pair to the corresponding Unicode character.
+ * @param lead Lead surrogate code point; not validated here.
+ * @param trail Trail surrogate code point; not validated here.
+ * @return Decoded Unicode character.
+ */
+static inline WChar Utf16DecodeSurrogate(uint lead, uint trail)
+{
+ return 0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00));
+}
+
+/**
+ * Decode a UTF-16 character.
+ * @param c Pointer to one or two UTF-16 code units; c[1] is read whenever c[0] is a lead surrogate.
+ * @return Decoded Unicode character.
+ */
+static inline WChar Utf16DecodeChar(const uint16 *c)
+{
+ if (Utf16IsLeadSurrogate(c[0])) {
+ return Utf16DecodeSurrogate(c[0], c[1]);
+ } else {
+ return *c;
+ }
+}
+
+/**
* Is the given character a text direction character.
* @param c The character to test.
* @return true iff the character is used to influence
diff --git a/src/textbuf.cpp b/src/textbuf.cpp
index 6ea042244..9a307058f 100644
--- a/src/textbuf.cpp
+++ b/src/textbuf.cpp
@@ -219,70 +219,12 @@ bool Textbuf::InsertClipboard()
return true;
}
-/**
- * Checks if it is possible to move caret to the left
- * @return true if the caret can be moved to the left, otherwise false.
- */
-bool Textbuf::CanMoveCaretLeft()
-{
- return this->caretpos != 0;
-}
-
-/**
- * Moves the caret to the left.
- * @pre Ensure that Textbuf::CanMoveCaretLeft returns true
- * @return The character under the caret.
- */
-WChar Textbuf::MoveCaretLeft()
-{
- assert(this->CanMoveCaretLeft());
-
- size_t pos = this->char_iter->Prev();
- if (pos == StringIterator::END) pos = 0;
-
- this->caretpos = (uint16)pos;
- this->UpdateCaretPosition();
-
- WChar c;
- Utf8Decode(&c, this->buf + this->caretpos);
-
- return c;
-}
-
-/**
- * Checks if it is possible to move caret to the right
- * @return true if the caret can be moved to the right, otherwise false.
- */
-bool Textbuf::CanMoveCaretRight()
-{
- return this->caretpos < this->bytes - 1;
-}
-
-/**
- * Moves the caret to the right.
- * @pre Ensure that Textbuf::CanMoveCaretRight returns true
- * @return The character under the caret.
- */
-WChar Textbuf::MoveCaretRight()
-{
- assert(this->CanMoveCaretRight());
-
- size_t pos = this->char_iter->Next();
- if (pos == StringIterator::END) pos = this->bytes - 1;
-
- this->caretpos = (uint16)pos;
- this->UpdateCaretPosition();
-
- WChar c;
- Utf8Decode(&c, this->buf + this->caretpos);
- return c;
-}
-
/** Update the character iter after the text has changed. */
void Textbuf::UpdateStringIter()
{
this->char_iter->SetString(this->buf);
- this->caretpos = (uint16)this->char_iter->SetCurPosition(this->caretpos);
+ size_t pos = this->char_iter->SetCurPosition(this->caretpos);
+ this->caretpos = pos == StringIterator::END ? 0 : (uint16)pos; /* Fall back to the start of the buffer if the iterator reports END. */
}
/** Update pixel width of the text. */
@@ -307,64 +249,38 @@ bool Textbuf::MovePos(uint16 keycode)
{
switch (keycode) {
case WKC_LEFT:
- if (this->CanMoveCaretLeft()) {
- this->MoveCaretLeft();
- return true;
- }
- break;
-
case WKC_CTRL | WKC_LEFT: {
- if (!this->CanMoveCaretLeft()) break;
-
- /* Unconditionally move one char to the left. */
- WChar c = this->MoveCaretLeft();
- /* Consume left whitespaces. */
- while (IsWhitespace(c)) {
- if (!this->CanMoveCaretLeft()) return true;
- c = this->MoveCaretLeft();
- }
- /* Consume left word. */
- while (!IsWhitespace(c)) {
- if (!this->CanMoveCaretLeft()) return true;
- c = this->MoveCaretLeft();
- }
- /* Place caret at the beginning of the left word. */
- this->MoveCaretRight();
+ if (this->caretpos == 0) break;
+
+ size_t pos = this->char_iter->Prev(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
+ if (pos == StringIterator::END) return true;
+
+ this->caretpos = (uint16)pos;
+ this->UpdateCaretPosition();
return true;
}
case WKC_RIGHT:
- if (this->CanMoveCaretRight()) {
- this->MoveCaretRight();
- return true;
- }
- break;
-
case WKC_CTRL | WKC_RIGHT: {
- if (!this->CanMoveCaretRight()) break;
-
- /* Unconditionally move one char to the right. */
- WChar c = this->MoveCaretRight();
- /* Continue to consume current word. */
- while (!IsWhitespace(c)) {
- if (!this->CanMoveCaretRight()) return true;
- c = this->MoveCaretRight();
- }
- /* Consume right whitespaces. */
- while (IsWhitespace(c)) {
- if (!this->CanMoveCaretRight()) return true;
- c = this->MoveCaretRight();
- }
+ if (this->caretpos >= this->bytes - 1) break;
+
+ size_t pos = this->char_iter->Next(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
+ if (pos == StringIterator::END) return true;
+
+ this->caretpos = (uint16)pos;
+ this->UpdateCaretPosition();
return true;
}
case WKC_HOME:
this->caretpos = 0;
+ this->char_iter->SetCurPosition(this->caretpos);
this->UpdateCaretPosition();
return true;
case WKC_END:
this->caretpos = this->bytes - 1;
+ this->char_iter->SetCurPosition(this->caretpos);
this->UpdateCaretPosition();
return true;
diff --git a/src/textbuf_type.h b/src/textbuf_type.h
index 611d7e443..f5100249c 100644
--- a/src/textbuf_type.h
+++ b/src/textbuf_type.h
@@ -67,10 +67,6 @@ private:
bool CanDelChar(bool backspace);
WChar GetNextDelChar(bool backspace);
void DelChar(bool backspace);
- bool CanMoveCaretLeft();
- WChar MoveCaretLeft();
- bool CanMoveCaretRight();
- WChar MoveCaretRight();
void UpdateStringIter();
void UpdateWidth();