diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/string.cpp | 42 |
1 files changed, 30 insertions, 12 deletions
diff --git a/src/string.cpp b/src/string.cpp index dfd01450e..38f7d1bd1 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -192,19 +192,35 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida while (str <= last && *str != '\0') { size_t len = Utf8EncodedCharLen(*str); - /* If the character is unknown, i.e. encoded length is 0 - * we assume worst case for the length check. - * The length check is needed to prevent Utf8Decode to read - * over the terminating '\0' if that happens to be placed - * within the encoding of an UTF8 character. */ - if ((len == 0 && str + 4 > last) || str + len > last) break; - WChar c; - len = Utf8Decode(&c, str); - /* It's possible to encode the string termination character - * into a multiple bytes. This prevents those termination - * characters to be skipped */ - if (c == '\0') break; + /* If the first byte does not look like the first byte of an encoded + * character, i.e. encoded length is 0, then this byte is definitely bad + * and it should be skipped. + * When the first byte looks like the first byte of an encoded character, + * then the remaining bytes in the string are checked whether the whole + * encoded character can be there. If that is not the case, this byte is + * skipped. + * Finally we attempt to decode the encoded character, which does certain + * extra validations to see whether the correct number of bytes were used + * to encode the character. If that is not the case, the byte is probably + * invalid and it is skipped. We could emit a question mark, but then the + * logic below cannot just copy bytes, it would need to re-encode the + * decoded characters as the length in bytes may have changed. + * + * The goals here is to get as much valid Utf8 encoded characters from the + * source string to the destination string. + * + * Note: a multi-byte encoded termination ('\0') will trigger the encoded + * char length and the decoded length to differ, so it will be ignored as + * invalid character data. If it were to reach the termination, then we + * would also reach the "last" byte of the string and a normal '\0' + * termination will be placed after it. + */ + if (len == 0 || str + len > last || len != Utf8Decode(&c, str)) { + /* Maybe the next byte is still a valid character? */ + str++; + continue; + } if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) { /* Copy the character back. Even if dst is current the same as str @@ -225,6 +241,8 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?'; } } + + /* String termination, if needed, is left to the caller of this function. */ } /** |