diff options
author | peter1138 <peter1138@openttd.org> | 2006-11-16 22:05:33 +0000 |
---|---|---|
committer | peter1138 <peter1138@openttd.org> | 2006-11-16 22:05:33 +0000 |
commit | 1a4f1c8177f7ee351cb0096e3456d055b97dc60a (patch) | |
tree | 4fb6c0fac873efffc85cef437baa70d50d51fdfb /string.c | |
parent | 40d647ddde652bb8f1c7b4215279cc82d01ca38f (diff) | |
download | openttd-1a4f1c8177f7ee351cb0096e3456d055b97dc60a.tar.xz |
(svn r7182) -Feature: Merge utf8 branch. This brings us support for Unicode/UTF-8 and the option for fonts rendered by FreeType. Language changes to come.
Diffstat (limited to 'string.c')
-rw-r--r-- | string.c | 124 |
1 files changed, 103 insertions, 21 deletions
@@ -4,6 +4,8 @@ #include "openttd.h" #include "functions.h" #include "string.h" +#include "macros.h" +#include "table/control_codes.h" #include <stdarg.h> #include <ctype.h> // required for tolower() @@ -68,8 +70,27 @@ char* CDECL str_fmt(const char* str, ...) void str_validate(char *str) { - for (; *str != '\0'; str++) - if (!IsValidAsciiChar(*str, CS_ALPHANUMERAL)) *str = '?'; + char *dst = str; + WChar c; + size_t len = Utf8Decode(&c, str); + + for (; c != '\0'; len = Utf8Decode(&c, str)) { + if (IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END || + IsValidChar(c - SCC_SPRITE_START, CS_ALPHANUMERAL))) { + /* Copy the character back. Even if dst is current the same as str + * (i.e. no characters have been changed) this is quicker than + * moving the pointers ahead by len */ + do { + *dst++ = *str++; + } while (--len); + } else { + /* Replace the undesirable character with a question mark */ + str += len; + *dst++ = '?'; + } + } + + *dst = '\0'; } void str_strip_colours(char *str) @@ -92,29 +113,15 @@ void str_strip_colours(char *str) * @param afilter the filter to use * @return true or false depending if the character is printable/valid or not */ -bool IsValidAsciiChar(byte key, CharSetFilter afilter) +bool IsValidChar(WChar key, CharSetFilter afilter) { - bool firsttest = false; - switch (afilter) { - case CS_ALPHANUMERAL: - firsttest = (key >= ' ' && key < 127); - break; - - /* We are very strict here */ - case CS_NUMERAL: - return (key >= '0' && key <= '9'); - - case CS_ALPHA: - default: - firsttest = ((key >= 'A' && key <= 'Z') || (key >= 'a' && key <= 'z')); - break; + case CS_ALPHANUMERAL: return IsPrintable(key); + case CS_NUMERAL: return (key >= '0' && key <= '9'); + case CS_ALPHA: return IsPrintable(key) && !(key >= '0' && key <= '9'); } - /* Allow some special chars too that are non-ASCII but still valid (like '^' above 'a') */ - return (firsttest || (key >= 160 && - key != 0xAA && key != 0xAC && key != 0xAD && key != 0xAF && - key != 0xB5 && key != 0xB6 && key != 0xB7 && key != 0xB9)); + return false; } void strtolower(char *str) @@ -145,3 +152,78 @@ int CDECL vsnprintf(char *str, size_t size, const char *format, va_list ap) #endif /* _MSC_VER */ #endif /* WIN32 */ + + +/* UTF-8 handling routines */ + + +/* Decode and consume the next UTF-8 encoded character + * @param c Buffer to place decoded character. + * @param s Character stream to retrieve character from. + * @return Number of characters in the sequence. + */ +size_t Utf8Decode(WChar *c, const char *s) +{ + assert(c != NULL); + + if (!HASBIT(s[0], 7)) { + /* Single byte character: 0xxxxxxx */ + *c = s[0]; + return 1; + } else if (GB(s[0], 5, 3) == 6) { + if (IsUtf8Part(s[1])) { + /* Double byte character: 110xxxxx 10xxxxxx */ + *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6); + if (*c >= 0x80) return 2; + } + } else if (GB(s[0], 4, 4) == 14) { + if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) { + /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */ + *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6); + if (*c >= 0x800) return 3; + } + } else if (GB(s[0], 3, 5) == 30) { + if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) { + /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6); + if (*c >= 0x10000 && *c <= 0x10FFFF) return 4; + } + } + + //DEBUG(misc, 1) ("Invalid UTF-8 sequence"); + *c = '?'; + return 1; +} + + +/* Encode a unicode character and place it in the buffer + * @param buf Buffer to place character. + * @param c Unicode character to encode. + * @return Number of characters in the encoded sequence. + */ +size_t Utf8Encode(char *buf, WChar c) +{ + if (c < 0x80) { + *buf = c; + return 1; + } else if (c < 0x800) { + *buf++ = 0xC0 + GB(c, 6, 5); + *buf = 0x80 + GB(c, 0, 6); + return 2; + } else if (c < 0x10000) { + *buf++ = 0xE0 + GB(c, 12, 4); + *buf++ = 0x80 + GB(c, 6, 6); + *buf = 0x80 + GB(c, 0, 6); + return 3; + } else if (c < 0x110000) { + *buf++ = 0xF0 + GB(c, 18, 3); + *buf++ = 0x80 + GB(c, 12, 6); + *buf++ = 0x80 + GB(c, 6, 6); + *buf = 0x80 + GB(c, 0, 6); + return 4; + } + + //DEBUG(misc, 1) ("Can't UTF-8 encode value 0x%X", c); + *buf = '?'; + return 1; +} |