1 files changed, 30 insertions, 12 deletions
diff --git a/src/string.cpp b/src/string.cpp
index dfd01450e..38f7d1bd1 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -192,19 +192,35 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
 
 	while (str <= last && *str != '\0') {
 		size_t len = Utf8EncodedCharLen(*str);
-		/* If the character is unknown, i.e. encoded length is 0
-		 * we assume worst case for the length check.
-		 * The length check is needed to prevent Utf8Decode to read
-		 * over the terminating '\0' if that happens to be placed
-		 * within the encoding of an UTF8 character. */
-		if ((len == 0 && str + 4 > last) || str + len > last) break;
-
 		WChar c;
-		len = Utf8Decode(&c, str);
-		/* It's possible to encode the string termination character
-		 * into a multiple bytes. This prevents those termination
-		 * characters to be skipped */
-		if (c == '\0') break;
+		/* If the first byte does not look like the first byte of an encoded
+		 * character, i.e. encoded length is 0, then this byte is definitely bad
+		 * and it should be skipped.
+		 * When the first byte looks like the first byte of an encoded character,
+		 * then the remaining bytes in the string are checked whether the whole
+		 * encoded character can be there. If that is not the case, this byte is
+		 * skipped.
+		 * Finally we attempt to decode the encoded character, which does certain
+		 * extra validations to see whether the correct number of bytes were used
+		 * to encode the character. If that is not the case, the byte is probably
+		 * invalid and it is skipped. We could emit a question mark, but then the
+		 * logic below cannot just copy bytes, it would need to re-encode the
+		 * decoded characters as the length in bytes may have changed.
+		 *
+		 * The goal here is to get as many valid UTF-8 encoded characters from the
+		 * source string to the destination string.
+		 *
+		 * Note: a multi-byte encoded termination ('\0') will cause the encoded
+		 * char length and the decoded length to differ, so it will be ignored as
+		 * invalid character data. If it were to reach the termination, then we
+		 * would also reach the "last" byte of the string and a normal '\0'
+		 * termination will be placed after it.
+		 */
+		if (len == 0 || str + len > last || len != Utf8Decode(&c, str)) {
+			/* Maybe the next byte is still a valid character? */
+			str++;
+			continue;
+		}
 
 		if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
 			/* Copy the character back. Even if dst is current the same as str
@@ -225,6 +241,8 @@ static void str_validate(T &dst, const char *str, const char *last, StringValida
 			if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
 		}
 	}
+
+	/* String termination, if needed, is left to the caller of this function. */
 }
 
 /**