diff options
Diffstat (limited to 'src/strgen')
-rw-r--r-- | src/strgen/strgen.cpp | 1024 | ||||
-rw-r--r-- | src/strgen/strgen.h | 10 | ||||
-rw-r--r-- | src/strgen/strgen_base.cpp | 1030 |
3 files changed, 1054 insertions, 1010 deletions
diff --git a/src/strgen/strgen.cpp b/src/strgen/strgen.cpp index 110e87fa0..4ca02a1a7 100644 --- a/src/strgen/strgen.cpp +++ b/src/strgen/strgen.cpp @@ -40,219 +40,6 @@ #include "../table/strgen_tables.h" -/* Compiles a list of strings into a compiled string list */ - -static bool _translated; ///< Whether the current language is not the master language -static bool _translation; ///< Is the current file actually a translation or not -static const char *_file = "(unknown file)"; ///< The filename of the input, so we can refer to it in errors/warnings -static int _cur_line; ///< The current line we're parsing in the input file -static int _errors, _warnings, _show_todo; - -static const ptrdiff_t MAX_COMMAND_PARAM_SIZE = 100; ///< Maximum size of every command block, not counting the name of the command itself -static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei); - -/** - * Create a new case. - * @param caseidx The index of the case. - * @param string The translation of the case. - * @param next The next chained case. - */ -Case::Case(int caseidx, const char *string, Case *next) : - caseidx(caseidx), string(strdup(string)), next(next) -{ -} - -/** Free everything we allocated. */ -Case::~Case() -{ - free(this->string); - delete this->next; -} - -/** - * Create a new string. - * @param name The name of the string. - * @param english The english "translation" of the string. - * @param index The index in the string table. - * @param line The line this string was found on. - */ -LangString::LangString(const char *name, const char *english, int index, int line) : - name(strdup(name)), english(strdup(english)), translated(NULL), - hash_next(0), index(index), line(line), translated_case(NULL) -{ -} - -/** Free everything we allocated. */ -LangString::~LangString() -{ - free(this->name); - free(this->english); - free(this->translated); - delete this->translated_case; -} - -/** Free all data related to the translation. */ -void LangString::FreeTranslation() -{ - free(this->translated); - this->translated = NULL; - - delete this->translated_case; - this->translated_case = NULL; -} - -/** - * Create a new string data container. - * @param max_strings The maximum number of strings. - */ -StringData::StringData(size_t tabs) : tabs(tabs), max_strings(tabs * STRINGS_IN_TAB) -{ - this->strings = CallocT<LangString *>(max_strings); - this->hash_heads = CallocT<uint16>(max_strings); - this->next_string_id = 0; -} - -/** Free everything we allocated. */ -StringData::~StringData() -{ - for (size_t i = 0; i < this->max_strings; i++) delete this->strings[i]; - free(this->strings); - free(this->hash_heads); -} - -/** Free all data related to the translation. */ -void StringData::FreeTranslation() -{ - for (size_t i = 0; i < this->max_strings; i++) { - LangString *ls = this->strings[i]; - if (ls != NULL) ls->FreeTranslation(); - } -} - -/** - * Create a hash of the string for finding them back quickly. - * @param s The string to hash. - * @return The hashed string. - */ -uint StringData::HashStr(const char *s) const -{ - uint hash = 0; - for (; *s != '\0'; s++) hash = ROL(hash, 3) ^ *s; - return hash % this->max_strings; -} - -/** - * Add a newly created LangString. - * @param s The name of the string. - * @param ls The string to add. - */ -void StringData::Add(const char *s, LangString *ls) -{ - uint hash = this->HashStr(s); - ls->hash_next = this->hash_heads[hash]; - /* Off-by-one for hash find. */ - this->hash_heads[hash] = ls->index + 1; - this->strings[ls->index] = ls; -} - -/** - * Find a LangString based on the string name. - * @param s The string name to search on. - * @return The LangString or NULL if it is not known. - */ -LangString *StringData::Find(const char *s) -{ - int idx = this->hash_heads[this->HashStr(s)]; - - while (--idx >= 0) { - LangString *ls = this->strings[idx]; - - if (strcmp(ls->name, s) == 0) return ls; - idx = ls->hash_next; - } - return NULL; -} - -/** - * Create a compound hash. - * @param hash The hash to add the string hash to. - * @param s The string hash. - * @return The new hash. - */ -uint StringData::VersionHashStr(uint hash, const char *s) const -{ - for (; *s != '\0'; s++) { - hash = ROL(hash, 3) ^ *s; - hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1); - } - return hash; -} - -/** - * Make a hash of the file to get a unique "version number" - * @return The version number. - */ -uint StringData::Version() const -{ - uint hash = 0; - - for (size_t i = 0; i < this->max_strings; i++) { - const LangString *ls = this->strings[i]; - - if (ls != NULL) { - const CmdStruct *cs; - const char *s; - char buf[MAX_COMMAND_PARAM_SIZE]; - int argno; - int casei; - - s = ls->name; - hash ^= i * 0x717239; - hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1); - hash = this->VersionHashStr(hash, s + 1); - - s = ls->english; - while ((cs = ParseCommandString(&s, buf, &argno, &casei)) != NULL) { - if (cs->flags & C_DONTCOUNT) continue; - - hash ^= (cs - _cmd_structs) * 0x1234567; - hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1); - } - } - } - - return hash; -} - -/** - * Count the number of tab elements that are in use. - * @param tab The tab to count the elements of. - */ -uint StringData::CountInUse(uint tab) const -{ - int i; - for (i = STRINGS_IN_TAB; --i >= 0;) if (this->strings[(tab * STRINGS_IN_TAB) + i] != NULL) break; - return i + 1; -} - -static LanguagePackHeader _lang; ///< Header information about a language. - -static const char *_cur_ident; - -struct CmdPair { - const CmdStruct *a; - const char *v; -}; - -struct ParsedCommandStruct { - uint np; - CmdPair pairs[32]; - const CmdStruct *cmd[32]; // ordered by param # -}; - -/* Used when generating some advanced commands. */ -static ParsedCommandStruct _cur_pcs; -static int _cur_argidx; #ifdef _MSC_VER # define LINE_NUM_FMT(s) "%s (%d): warning: %s (" s ")\n" @@ -260,9 +47,7 @@ static int _cur_argidx; # define LINE_NUM_FMT(s) "%s:%d: " s ": %s\n" #endif -static void CDECL strgen_warning(const char *s, ...) WARN_FORMAT(1, 2); - -static void CDECL strgen_warning(const char *s, ...) +void CDECL strgen_warning(const char *s, ...) { char buf[1024]; va_list va; @@ -273,9 +58,7 @@ static void CDECL strgen_warning(const char *s, ...) _warnings++; } -static void CDECL strgen_error(const char *s, ...) WARN_FORMAT(1, 2); - -static void CDECL strgen_error(const char *s, ...) +void CDECL strgen_error(const char *s, ...) { char buf[1024]; va_list va; @@ -286,7 +69,7 @@ static void CDECL strgen_error(const char *s, ...) _errors++; } -void NORETURN CDECL error(const char *s, ...) +void NORETURN CDECL strgen_fatal(const char *s, ...) { char buf[1024]; va_list va; @@ -300,356 +83,18 @@ void NORETURN CDECL error(const char *s, ...) throw std::exception(); } -/** The buffer for writing a single string. */ -struct Buffer : SmallVector<byte, 256> { - /** - * Conveniance method for adding a byte. - * @param value The value to add. - */ - void AppendByte(byte value) - { - *this->Append() = value; - } - - /** - * Add an Unicode character encoded in UTF-8 to the buffer. - * @param value The character to add. - */ - void AppendUtf8(uint32 value) - { - if (value < 0x80) { - *this->Append() = value; - } else if (value < 0x800) { - *this->Append() = 0xC0 + GB(value, 6, 5); - *this->Append() = 0x80 + GB(value, 0, 6); - } else if (value < 0x10000) { - *this->Append() = 0xE0 + GB(value, 12, 4); - *this->Append() = 0x80 + GB(value, 6, 6); - *this->Append() = 0x80 + GB(value, 0, 6); - } else if (value < 0x110000) { - *this->Append() = 0xF0 + GB(value, 18, 3); - *this->Append() = 0x80 + GB(value, 12, 6); - *this->Append() = 0x80 + GB(value, 6, 6); - *this->Append() = 0x80 + GB(value, 0, 6); - } else { - strgen_warning("Invalid unicode value U+0x%X", value); - } - } -}; - -size_t Utf8Validate(const char *s) -{ - uint32 c; - - if (!HasBit(s[0], 7)) { - /* 1 byte */ - return 1; - } else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) { - /* 2 bytes */ - c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6); - if (c >= 0x80) return 2; - } else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) { - /* 3 bytes */ - c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6); - if (c >= 0x800) return 3; - } else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) { - /* 4 bytes */ - c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6); - if (c >= 0x10000 && c <= 0x10FFFF) return 4; - } - - return 0; -} - - -static void EmitSingleChar(Buffer *buffer, char *buf, int value) -{ - if (*buf != '\0') strgen_warning("Ignoring trailing letters in command"); - buffer->AppendUtf8(value); -} - - -/* The plural specifier looks like - * {NUM} {PLURAL -1 passenger passengers} then it picks either passenger/passengers depending on the count in NUM */ - -/* This is encoded like - * CommandByte <ARG#> <NUM> {Length of each string} {each string} */ - -bool ParseRelNum(char **buf, int *value, int *offset) -{ - const char *s = *buf; - char *end; - bool rel = false; - - while (*s == ' ' || *s == '\t') s++; - if (*s == '+') { - rel = true; - s++; - } - int v = strtol(s, &end, 0); - if (end == s) return false; - if (rel || v < 0) { - *value += v; - } else { - *value = v; - } - if (offset != NULL && *end == ':') { - /* Take the Nth within */ - s = end + 1; - *offset = strtol(s, &end, 0); - if (end == s) return false; - } - *buf = end; - return true; -} - -/* Parse out the next word, or NULL */ -char *ParseWord(char **buf) -{ - char *s = *buf, *r; - - while (*s == ' ' || *s == '\t') s++; - if (*s == '\0') return NULL; - - if (*s == '"') { - r = ++s; - /* parse until next " or NUL */ - for (;;) { - if (*s == '\0') break; - if (*s == '"') { - *s++ = '\0'; - break; - } - s++; - } - } else { - /* proceed until whitespace or NUL */ - r = s; - for (;;) { - if (*s == '\0') break; - if (*s == ' ' || *s == '\t') { - *s++ = '\0'; - break; - } - s++; - } - } - *buf = s; - return r; -} - -/* Forward declaration */ -static int TranslateArgumentIdx(int arg, int offset = 0); - -static void EmitWordList(Buffer *buffer, const char * const *words, uint nw) -{ - buffer->AppendByte(nw); - for (uint i = 0; i < nw; i++) buffer->AppendByte(strlen(words[i]) + 1); - for (uint i = 0; i < nw; i++) { - for (uint j = 0; words[i][j] != '\0'; j++) buffer->AppendByte(words[i][j]); - buffer->AppendByte(0); - } -} - -static void EmitPlural(Buffer *buffer, char *buf, int value) -{ - int argidx = _cur_argidx; - int offset = 0; - const char *words[5]; - int nw = 0; - - /* Parse out the number, if one exists. Otherwise default to prev arg. */ - if (!ParseRelNum(&buf, &argidx, &offset)) argidx--; - - /* Parse each string */ - for (nw = 0; nw < 5; nw++) { - words[nw] = ParseWord(&buf); - if (words[nw] == NULL) break; - } - - if (nw == 0) { - error("%s: No plural words", _cur_ident); - } - - if (_plural_forms[_lang.plural_form].plural_count != nw) { - if (_translated) { - error("%s: Invalid number of plural forms. Expecting %d, found %d.", _cur_ident, - _plural_forms[_lang.plural_form].plural_count, nw); - } else { - if ((_show_todo & 2) != 0) strgen_warning("'%s' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident); - if (nw > _plural_forms[_lang.plural_form].plural_count) { - nw = _plural_forms[_lang.plural_form].plural_count; - } else { - for (; nw < _plural_forms[_lang.plural_form].plural_count; nw++) { - words[nw] = words[nw - 1]; - } - } - } - } - - buffer->AppendUtf8(SCC_PLURAL_LIST); - buffer->AppendByte(_lang.plural_form); - buffer->AppendByte(TranslateArgumentIdx(argidx, offset)); - EmitWordList(buffer, words, nw); -} - - -static void EmitGender(Buffer *buffer, char *buf, int value) -{ - int argidx = _cur_argidx; - int offset = 0; - uint nw; - - if (buf[0] == '=') { - buf++; - - /* This is a {G=DER} command */ - nw = _lang.GetGenderIndex(buf); - if (nw >= MAX_NUM_GENDERS) error("G argument '%s' invalid", buf); - - /* now nw contains the gender index */ - buffer->AppendUtf8(SCC_GENDER_INDEX); - buffer->AppendByte(nw); - } else { - const char *words[MAX_NUM_GENDERS]; - - /* This is a {G 0 foo bar two} command. - * If no relative number exists, default to +0 */ - if (!ParseRelNum(&buf, &argidx, &offset)) {} - - const CmdStruct *cmd = _cur_pcs.cmd[argidx]; - if (cmd == NULL || (cmd->flags & C_GENDER) == 0) { - error("Command '%s' can't have a gender", cmd == NULL ? "<empty>" : cmd->cmd); - } - - for (nw = 0; nw < MAX_NUM_GENDERS; nw++) { - words[nw] = ParseWord(&buf); - if (words[nw] == NULL) break; - } - if (nw != _lang.num_genders) error("Bad # of arguments for gender command"); - - assert(IsInsideBS(cmd->value, SCC_CONTROL_START, UINT8_MAX)); - buffer->AppendUtf8(SCC_GENDER_LIST); - buffer->AppendByte(TranslateArgumentIdx(argidx, offset)); - EmitWordList(buffer, words, nw); - } -} - -static const CmdStruct *FindCmd(const char *s, int len) -{ - for (const CmdStruct *cs = _cmd_structs; cs != endof(_cmd_structs); cs++) { - if (strncmp(cs->cmd, s, len) == 0 && cs->cmd[len] == '\0') return cs; - } - return NULL; -} - -static uint ResolveCaseName(const char *str, uint len) -{ - /* First get a clean copy of only the case name, then resolve it. */ - char case_str[CASE_GENDER_LEN]; - len = min(lengthof(case_str) - 1, len); - memcpy(case_str, str, len); - case_str[len] = '\0'; - - uint8 case_idx = _lang.GetCaseIndex(case_str); - if (case_idx >= MAX_NUM_CASES) error("Invalid case-name '%s'", case_str); - return case_idx + 1; -} - - -/* returns NULL on eof - * else returns command struct */ -static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei) -{ - const char *s = *str, *start; - char c; - - *argno = -1; - *casei = -1; - - /* Scan to the next command, exit if there's no next command. */ - for (; *s != '{'; s++) { - if (*s == '\0') return NULL; - } - s++; // Skip past the { - - if (*s >= '0' && *s <= '9') { - char *end; - - *argno = strtoul(s, &end, 0); - if (*end != ':') error("missing arg #"); - s = end + 1; - } - - /* parse command name */ - start = s; - do { - c = *s++; - } while (c != '}' && c != ' ' && c != '=' && c != '.' && c != 0); - - const CmdStruct *cmd = FindCmd(start, s - start - 1); - if (cmd == NULL) { - strgen_error("Undefined command '%.*s'", (int)(s - start - 1), start); - return NULL; - } - - if (c == '.') { - const char *casep = s; - - if (!(cmd->flags & C_CASE)) { - error("Command '%s' can't have a case", cmd->cmd); - } - - do { - c = *s++; - } while (c != '}' && c != ' ' && c != '\0'); - *casei = ResolveCaseName(casep, s - casep - 1); - } - - if (c == '\0') { - strgen_error("Missing } from command '%s'", start); - return NULL; - } - - - if (c != '}') { - if (c == '=') s--; - /* copy params */ - start = s; - for (;;) { - c = *s++; - if (c == '}') break; - if (c == '\0') { - strgen_error("Missing } from command '%s'", start); - return NULL; - } - if (s - start == MAX_COMMAND_PARAM_SIZE) error("param command too long"); - *param++ = c; - } - } - *param = '\0'; - - *str = s; - - return cmd; -} - -/** - * Prepare reading. - * @param data The data to fill during reading. - * @param file The file we are reading. - * @param master Are we reading the master file? - * @param translation Are we reading a translation? - */ -StringReader::StringReader(StringData &data, const char *file, bool master, bool translation) : - data(data), file(strdup(file)), master(master), translation(translation) -{ -} - -/** Make sure the right reader gets freed. */ -StringReader::~StringReader() +void NORETURN CDECL error(const char *s, ...) { - free(file); + char buf[1024]; + va_list va; + va_start(va, s); + vsnprintf(buf, lengthof(buf), s, va); + va_end(va); + fprintf(stderr, LINE_NUM_FMT("FATAL"), _file, _cur_line, buf); +#ifdef _MSC_VER + fprintf(stderr, LINE_NUM_FMT("warning"), _file, _cur_line, "language is not compiled"); +#endif + exit(2); } /** A reader that simply reads using fopen. */ @@ -768,233 +213,6 @@ void FileStringReader::HandlePragma(char *str) } } -static void ExtractCommandString(ParsedCommandStruct *p, const char *s, bool warnings) -{ - char param[MAX_COMMAND_PARAM_SIZE]; - int argno; - int argidx = 0; - int casei; - - memset(p, 0, sizeof(*p)); - - for (;;) { - /* read until next command from a. */ - const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei); - - if (ar == NULL) break; - - /* Sanity checking */ - if (argno != -1 && ar->consumes == 0) error("Non consumer param can't have a paramindex"); - - if (ar->consumes) { - if (argno != -1) argidx = argno; - if (argidx < 0 || (uint)argidx >= lengthof(p->cmd)) error("invalid param idx %d", argidx); - if (p->cmd[argidx] != NULL && p->cmd[argidx] != ar) error("duplicate param idx %d", argidx); - - p->cmd[argidx++] = ar; - } else if (!(ar->flags & C_DONTCOUNT)) { // Ignore some of them - if (p->np >= lengthof(p->pairs)) error("too many commands in string, max " PRINTF_SIZE, lengthof(p->pairs)); - p->pairs[p->np].a = ar; - p->pairs[p->np].v = param[0] != '\0' ? strdup(param) : ""; - p->np++; - } - } -} - - -static const CmdStruct *TranslateCmdForCompare(const CmdStruct *a) -{ - if (a == NULL) return NULL; - - if (strcmp(a->cmd, "STRING1") == 0 || - strcmp(a->cmd, "STRING2") == 0 || - strcmp(a->cmd, "STRING3") == 0 || - strcmp(a->cmd, "STRING4") == 0 || - strcmp(a->cmd, "STRING5") == 0 || - strcmp(a->cmd, "RAW_STRING") == 0) { - return FindCmd("STRING", 6); - } - - return a; -} - - -static bool CheckCommandsMatch(char *a, char *b, const char *name) -{ - /* If we're not translating, i.e. we're compiling the base language, - * it is pointless to do all these checks as it'll always be correct. - * After all, all checks are based on the base language. - */ - if (!_translation) return true; - - ParsedCommandStruct templ; - ParsedCommandStruct lang; - bool result = true; - - ExtractCommandString(&templ, b, true); - ExtractCommandString(&lang, a, true); - - /* For each string in templ, see if we find it in lang */ - if (templ.np != lang.np) { - strgen_warning("%s: template string and language string have a different # of commands", name); - result = false; - } - - for (uint i = 0; i < templ.np; i++) { - /* see if we find it in lang, and zero it out */ - bool found = false; - for (uint j = 0; j < lang.np; j++) { - if (templ.pairs[i].a == lang.pairs[j].a && - strcmp(templ.pairs[i].v, lang.pairs[j].v) == 0) { - /* it was found in both. zero it out from lang so we don't find it again */ - lang.pairs[j].a = NULL; - found = true; - break; - } - } - - if (!found) { - strgen_warning("%s: command '%s' exists in template file but not in language file", name, templ.pairs[i].a->cmd); - result = false; - } - } - - /* if we reach here, all non consumer commands match up. - * Check if the non consumer commands match up also. */ - for (uint i = 0; i < lengthof(templ.cmd); i++) { - if (TranslateCmdForCompare(templ.cmd[i]) != lang.cmd[i]) { - strgen_warning("%s: Param idx #%d '%s' doesn't match with template command '%s'", name, i, - lang.cmd[i] == NULL ? "<empty>" : TranslateCmdForCompare(lang.cmd[i])->cmd, - templ.cmd[i] == NULL ? "<empty>" : templ.cmd[i]->cmd); - result = false; - } - } - - return result; -} - -void StringReader::HandleString(char *str) -{ - if (*str == '#') { - if (str[1] == '#' && str[2] != '#') this->HandlePragma(str + 2); - return; - } - - /* Ignore comments & blank lines */ - if (*str == ';' || *str == ' ' || *str == '\0') return; - - char *s = strchr(str, ':'); - if (s == NULL) { - strgen_error("Line has no ':' delimiter"); - return; - } - - char *t; - /* Trim spaces. - * After this str points to the command name, and s points to the command contents */ - for (t = s; t > str && (t[-1] == ' ' || t[-1] == '\t'); t--) {} - *t = 0; - s++; - - /* Check string is valid UTF-8 */ - const char *tmp; - for (tmp = s; *tmp != '\0';) { - size_t len = Utf8Validate(tmp); - if (len == 0) error("Invalid UTF-8 sequence in '%s'", s); - - WChar c; - Utf8Decode(&c, tmp); - if (c <= 0x001F || // ASCII control character range - (c >= 0xE000 && c <= 0xF8FF) || // Private range - (c >= 0xFFF0 && c <= 0xFFFF)) { // Specials range - error("Unwanted UTF-8 character U+%04X in sequence '%s'", c, s); - } - - tmp += len; - } - - /* Check if the string has a case.. - * The syntax for cases is IDENTNAME.case */ - char *casep = strchr(str, '.'); - if (casep != NULL) *casep++ = '\0'; - - /* Check if this string already exists.. */ - LangString *ent = this->data.Find(str); - - if (this->master) { - if (casep != NULL) { - strgen_error("Cases in the base translation are not supported."); - return; - } - - if (ent != NULL) { - strgen_error("String name '%s' is used multiple times", str); - return; - } - - if (this->data.strings[this->data.next_string_id] != NULL) { - strgen_error("String ID 0x%X for '%s' already in use by '%s'", this->data.next_string_id, str, this->data.strings[this->data.next_string_id]->name); - return; - } - - /* Allocate a new LangString */ - this->data.Add(str, new LangString(str, s, this->data.next_string_id++, _cur_line)); - } else { - if (ent == NULL) { - strgen_warning("String name '%s' does not exist in master file", str); - return; - } - - if (ent->translated && casep == NULL) { - strgen_error("String name '%s' is used multiple times", str); - return; - } - - /* make sure that the commands match */ - if (!CheckCommandsMatch(s, ent->english, str)) return; - - if (casep != NULL) { - ent->translated_case = new Case(ResolveCaseName(casep, strlen(casep)), s, ent->translated_case); - } else { - ent->translated = strdup(s); - /* If the string was translated, use the line from the - * translated language so errors in the translated file - * are properly referenced to. */ - ent->line = _cur_line; - } - } -} - - -static void rstrip(char *buf) -{ - int i = strlen(buf); - while (i > 0 && (buf[i - 1] == '\r' || buf[i - 1] == '\n' || buf[i - 1] == ' ')) i--; - buf[i] = '\0'; -} - -void StringReader::ParseFile() -{ - char buf[2048]; - _warnings = _errors = 0; - - _translation = this->master || this->translation; - _file = this->file; - - /* For each new file we parse, reset the genders, and language codes. */ - MemSetT(&_lang, 0); - strecpy(_lang.digit_group_separator, ",", lastof(_lang.digit_group_separator)); - strecpy(_lang.digit_group_separator_currency, ",", lastof(_lang.digit_group_separator_currency)); - strecpy(_lang.digit_decimal_separator, ".", lastof(_lang.digit_decimal_separator)); - - _cur_line = 1; - while (this->ReadLine(buf, sizeof(buf)) != NULL) { - rstrip(buf); - this->HandleString(buf); - _cur_line++; - } -} - bool CompareFiles(const char *n1, const char *n2) { FILE *f2 = fopen(n2, "rb"); @@ -1060,23 +278,6 @@ struct FileWriter { } }; -/** - * Write the header information. - * @param data The data about the string. - */ -void HeaderWriter::WriteHeader(const StringData &data) -{ - int last = 0; - for (size_t i = 0; i < data.max_strings; i++) { - if (data.strings[i] != NULL) { - this->WriteStringID(data.strings[i]->name, i); - last = i; - } - } - - this->WriteStringID("STR_LAST_STRINGID", last); -} - struct HeaderFileWriter : HeaderWriter, FileWriter { /** The real file name we eventually want to write to. */ const char *real_filename; @@ -1135,203 +336,6 @@ struct HeaderFileWriter : HeaderWriter, FileWriter { } }; -static int TranslateArgumentIdx(int argidx, int offset) -{ - int sum; - - if (argidx < 0 || (uint)argidx >= lengthof(_cur_pcs.cmd)) { - error("invalid argidx %d", argidx); - } - const CmdStruct *cs = _cur_pcs.cmd[argidx]; - if (cs != NULL && cs->consumes <= offset) { - error("invalid argidx offset %d:%d", argidx, offset); - } - - if (_cur_pcs.cmd[argidx] == NULL) { - error("no command for this argidx %d", argidx); - } - - for (int i = sum = 0; i < argidx; i++) { - const CmdStruct *cs = _cur_pcs.cmd[i]; - - sum += (cs != NULL) ? cs->consumes : 1; - } - - return sum + offset; -} - -static void PutArgidxCommand(Buffer *buffer) -{ - buffer->AppendUtf8(SCC_ARG_INDEX); - buffer->AppendByte(TranslateArgumentIdx(_cur_argidx)); -} - - -static void PutCommandString(Buffer *buffer, const char *str) -{ - _cur_argidx = 0; - - while (*str != '\0') { - /* Process characters as they are until we encounter a { */ - if (*str != '{') { - buffer->AppendByte(*str++); - continue; - } - - char param[MAX_COMMAND_PARAM_SIZE]; - int argno; - int casei; - const CmdStruct *cs = ParseCommandString(&str, param, &argno, &casei); - if (cs == NULL) break; - - if (casei != -1) { - buffer->AppendUtf8(SCC_SET_CASE); // {SET_CASE} - buffer->AppendByte(casei); - } - - /* For params that consume values, we need to handle the argindex properly */ - if (cs->consumes > 0) { - /* Check if we need to output a move-param command */ - if (argno != -1 && argno != _cur_argidx) { - _cur_argidx = argno; - PutArgidxCommand(buffer); - } - - /* Output the one from the master string... it's always accurate. */ - cs = _cur_pcs.cmd[_cur_argidx++]; - if (cs == NULL) { - error("%s: No argument exists at position %d", _cur_ident, _cur_argidx - 1); - } - } - - cs->proc(buffer, param, cs->value); - } -} - -/** - * Write the length as a simple gamma. - * @param length The number to write. - */ -void LanguageWriter::WriteLength(uint length) -{ - char buffer[2]; - int offs = 0; - if (length >= 0x4000) { - error("string too long"); - } - - if (length >= 0xC0) { - buffer[offs++] = (length >> 8) | 0xC0; - } - buffer[offs++] = length & 0xFF; - this->Write((byte*)buffer, offs); -} - -/** - * Actually write the language. - * @param data The data about the string. - */ -void LanguageWriter::WriteLang(const StringData &data) -{ - uint *in_use = AllocaM(uint, data.tabs); - for (size_t tab = 0; tab < data.tabs; tab++) { - uint n = data.CountInUse(tab); - - in_use[tab] = n; - _lang.offsets[tab] = TO_LE16(n); - - for (uint j = 0; j != in_use[tab]; j++) { - const LangString *ls = data.strings[(tab * StringData::STRINGS_IN_TAB) + j]; - if (ls != NULL && ls->translated == NULL) _lang.missing++; - } - } - - _lang.ident = TO_LE32(LanguagePackHeader::IDENT); - _lang.version = TO_LE32(data.Version()); - _lang.missing = TO_LE16(_lang.missing); - _lang.winlangid = TO_LE16(_lang.winlangid); - - this->WriteHeader(&_lang); - Buffer buffer; - - for (size_t tab = 0; tab < data.tabs; tab++) { - for (uint j = 0; j != in_use[tab]; j++) { - const LangString *ls = data.strings[(tab * StringData::STRINGS_IN_TAB) + j]; - const Case *casep; - const char *cmdp; - - /* For undefined strings, just set that it's an empty string */ - if (ls == NULL) { - this->WriteLength(0); - continue; - } - - _cur_ident = ls->name; - _cur_line = ls->line; - - /* Produce a message if a string doesn't have a translation. */ - if (_show_todo > 0 && ls->translated == NULL) { - if ((_show_todo & 2) != 0) { - strgen_warning("'%s' is untranslated", ls->name); - } - if ((_show_todo & 1) != 0) { - const char *s = "<TODO> "; - while (*s != '\0') buffer.AppendByte(*s++); - } - } - - /* Extract the strings and stuff from the english command string */ - ExtractCommandString(&_cur_pcs, ls->english, false); - - if (ls->translated_case != NULL || ls->translated != NULL) { - casep = ls->translated_case; - cmdp = ls->translated; - } else { - casep = NULL; - cmdp = ls->english; - } - - _translated = cmdp != ls->english; - - if (casep != NULL) { - const Case *c; - uint num; - - /* Need to output a case-switch. - * It has this format - * <0x9E> <NUM CASES> <CASE1> <LEN1> <STRING1> <CASE2> <LEN2> <STRING2> <CASE3> <LEN3> <STRING3> <STRINGDEFAULT> - * Each LEN is printed using 2 bytes in big endian order. */ - buffer.AppendUtf8(SCC_SWITCH_CASE); - /* Count the number of cases */ - for (num = 0, c = casep; c; c = c->next) num++; - buffer.AppendByte(num); - - /* Write each case */ - for (c = casep; c != NULL; c = c->next) { - buffer.AppendByte(c->caseidx); - /* Make some space for the 16-bit length */ - size_t pos = buffer.Length(); - buffer.AppendByte(0); - buffer.AppendByte(0); - /* Write string */ - PutCommandString(&buffer, c->string); - buffer.AppendByte(0); // terminate with a zero - /* Fill in the length */ - size_t size = buffer.Length() - (pos + 2); - buffer[pos + 0] = GB(size, 8, 8); - buffer[pos + 1] = GB(size, 0, 8); - } - } - - if (cmdp != NULL) PutCommandString(&buffer, cmdp); - - this->WriteLength(buffer.Length()); - this->Write(buffer.Begin(), buffer.Length()); - buffer.Clear(); - } - } -} - /** Class for writing a language to disk. */ struct LanguageFileWriter : LanguageWriter, FileWriter { /** diff --git a/src/strgen/strgen.h b/src/strgen/strgen.h index 80737f6eb..b3584ab5f 100644 --- a/src/strgen/strgen.h +++ b/src/strgen/strgen.h @@ -138,4 +138,14 @@ struct LanguageWriter { void WriteLang(const StringData &data); }; +void CDECL strgen_warning(const char *s, ...) WARN_FORMAT(1, 2); +void CDECL strgen_error(const char *s, ...) WARN_FORMAT(1, 2); +void NORETURN CDECL strgen_fatal(const char *s, ...) WARN_FORMAT(1, 2); +char *ParseWord(char **buf); + +extern const char *_file; +extern int _cur_line; +extern int _errors, _warnings, _show_todo; +extern LanguagePackHeader _lang; + #endif /* STRGEN_H */ diff --git a/src/strgen/strgen_base.cpp b/src/strgen/strgen_base.cpp new file mode 100644 index 000000000..98e5aefb1 --- /dev/null +++ b/src/strgen/strgen_base.cpp @@ -0,0 +1,1030 @@ +/* $Id$ */ + +/* + * This file is part of OpenTTD. + * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2. + * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>. + */ + +/** @file strgen.cpp Tool to create computer readable (stand-alone) translation files. */ + +#include "../stdafx.h" +#include "../core/endian_func.hpp" +#include "../string_func.h" +#include "../strings_type.h" +#include "../language.h" +#include "../table/control_codes.h" + +#include "strgen.h" + +#include <stdarg.h> +#include <exception> + +#include "../table/strgen_tables.h" + +/* Compiles a list of strings into a compiled string list */ + +static bool _translated; ///< Whether the current language is not the master language +static bool _translation; ///< Is the current file actually a translation or not +const char *_file = "(unknown file)"; ///< The filename of the input, so we can refer to it in errors/warnings +int _cur_line; ///< The current line we're parsing in the input file +int _errors, _warnings, _show_todo; +LanguagePackHeader _lang; ///< Header information about a language. + +static const ptrdiff_t MAX_COMMAND_PARAM_SIZE = 100; ///< Maximum size of every command block, not counting the name of the command itself +static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei); + +/** + * Create a new case. + * @param caseidx The index of the case. + * @param string The translation of the case. + * @param next The next chained case. + */ +Case::Case(int caseidx, const char *string, Case *next) : + caseidx(caseidx), string(strdup(string)), next(next) +{ +} + +/** Free everything we allocated. */ +Case::~Case() +{ + free(this->string); + delete this->next; +} + +/** + * Create a new string. + * @param name The name of the string. + * @param english The english "translation" of the string. + * @param index The index in the string table. + * @param line The line this string was found on. + */ +LangString::LangString(const char *name, const char *english, int index, int line) : + name(strdup(name)), english(strdup(english)), translated(NULL), + hash_next(0), index(index), line(line), translated_case(NULL) +{ +} + +/** Free everything we allocated. */ +LangString::~LangString() +{ + free(this->name); + free(this->english); + free(this->translated); + delete this->translated_case; +} + +/** Free all data related to the translation. */ +void LangString::FreeTranslation() +{ + free(this->translated); + this->translated = NULL; + + delete this->translated_case; + this->translated_case = NULL; +} + +/** + * Create a new string data container. + * @param max_strings The maximum number of strings. + */ +StringData::StringData(size_t tabs) : tabs(tabs), max_strings(tabs * STRINGS_IN_TAB) +{ + this->strings = CallocT<LangString *>(max_strings); + this->hash_heads = CallocT<uint16>(max_strings); + this->next_string_id = 0; +} + +/** Free everything we allocated. */ +StringData::~StringData() +{ + for (size_t i = 0; i < this->max_strings; i++) delete this->strings[i]; + free(this->strings); + free(this->hash_heads); +} + +/** Free all data related to the translation. */ +void StringData::FreeTranslation() +{ + for (size_t i = 0; i < this->max_strings; i++) { + LangString *ls = this->strings[i]; + if (ls != NULL) ls->FreeTranslation(); + } +} + +/** + * Create a hash of the string for finding them back quickly. + * @param s The string to hash. + * @return The hashed string. + */ +uint StringData::HashStr(const char *s) const +{ + uint hash = 0; + for (; *s != '\0'; s++) hash = ROL(hash, 3) ^ *s; + return hash % this->max_strings; +} + +/** + * Add a newly created LangString. + * @param s The name of the string. + * @param ls The string to add. + */ +void StringData::Add(const char *s, LangString *ls) +{ + uint hash = this->HashStr(s); + ls->hash_next = this->hash_heads[hash]; + /* Off-by-one for hash find. */ + this->hash_heads[hash] = ls->index + 1; + this->strings[ls->index] = ls; +} + +/** + * Find a LangString based on the string name. + * @param s The string name to search on. + * @return The LangString or NULL if it is not known. + */ +LangString *StringData::Find(const char *s) +{ + int idx = this->hash_heads[this->HashStr(s)]; + + while (--idx >= 0) { + LangString *ls = this->strings[idx]; + + if (strcmp(ls->name, s) == 0) return ls; + idx = ls->hash_next; + } + return NULL; +} + +/** + * Create a compound hash. + * @param hash The hash to add the string hash to. + * @param s The string hash. + * @return The new hash. + */ +uint StringData::VersionHashStr(uint hash, const char *s) const +{ + for (; *s != '\0'; s++) { + hash = ROL(hash, 3) ^ *s; + hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1); + } + return hash; +} + +/** + * Make a hash of the file to get a unique "version number" + * @return The version number. + */ +uint StringData::Version() const +{ + uint hash = 0; + + for (size_t i = 0; i < this->max_strings; i++) { + const LangString *ls = this->strings[i]; + + if (ls != NULL) { + const CmdStruct *cs; + const char *s; + char buf[MAX_COMMAND_PARAM_SIZE]; + int argno; + int casei; + + s = ls->name; + hash ^= i * 0x717239; + hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1); + hash = this->VersionHashStr(hash, s + 1); + + s = ls->english; + while ((cs = ParseCommandString(&s, buf, &argno, &casei)) != NULL) { + if (cs->flags & C_DONTCOUNT) continue; + + hash ^= (cs - _cmd_structs) * 0x1234567; + hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1); + } + } + } + + return hash; +} + +/** + * Count the number of tab elements that are in use. + * @param tab The tab to count the elements of. + */ +uint StringData::CountInUse(uint tab) const +{ + int i; + for (i = STRINGS_IN_TAB; --i >= 0;) if (this->strings[(tab * STRINGS_IN_TAB) + i] != NULL) break; + return i + 1; +} + +static const char *_cur_ident; + +struct CmdPair { + const CmdStruct *a; + const char *v; +}; + +struct ParsedCommandStruct { + uint np; + CmdPair pairs[32]; + const CmdStruct *cmd[32]; // ordered by param # +}; + +/* Used when generating some advanced commands. */ +static ParsedCommandStruct _cur_pcs; +static int _cur_argidx; + +/** The buffer for writing a single string. */ +struct Buffer : SmallVector<byte, 256> { + /** + * Conveniance method for adding a byte. + * @param value The value to add. + */ + void AppendByte(byte value) + { + *this->Append() = value; + } + + /** + * Add an Unicode character encoded in UTF-8 to the buffer. + * @param value The character to add. + */ + void AppendUtf8(uint32 value) + { + if (value < 0x80) { + *this->Append() = value; + } else if (value < 0x800) { + *this->Append() = 0xC0 + GB(value, 6, 5); + *this->Append() = 0x80 + GB(value, 0, 6); + } else if (value < 0x10000) { + *this->Append() = 0xE0 + GB(value, 12, 4); + *this->Append() = 0x80 + GB(value, 6, 6); + *this->Append() = 0x80 + GB(value, 0, 6); + } else if (value < 0x110000) { + *this->Append() = 0xF0 + GB(value, 18, 3); + *this->Append() = 0x80 + GB(value, 12, 6); + *this->Append() = 0x80 + GB(value, 6, 6); + *this->Append() = 0x80 + GB(value, 0, 6); + } else { + strgen_warning("Invalid unicode value U+0x%X", value); + } + } +}; + +size_t Utf8Validate(const char *s) +{ + uint32 c; + + if (!HasBit(s[0], 7)) { + /* 1 byte */ + return 1; + } else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) { + /* 2 bytes */ + c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6); + if (c >= 0x80) return 2; + } else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) { + /* 3 bytes */ + c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6); + if (c >= 0x800) return 3; + } else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) { + /* 4 bytes */ + c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6); + if (c >= 0x10000 && c <= 0x10FFFF) return 4; + } + + return 0; +} + + +void EmitSingleChar(Buffer *buffer, char *buf, int value) +{ + if (*buf != '\0') strgen_warning("Ignoring trailing letters in command"); + buffer->AppendUtf8(value); +} + + +/* The plural specifier looks like + * {NUM} {PLURAL -1 passenger passengers} then it picks either passenger/passengers depending on the count in NUM */ + +/* This is encoded like + * CommandByte <ARG#> <NUM> {Length of each string} {each string} */ + +bool ParseRelNum(char **buf, int *value, int *offset) +{ + const char *s = *buf; + char *end; + bool rel = false; + + while (*s == ' ' || *s == '\t') s++; + if (*s == '+') { + rel = true; + s++; + } + int v = strtol(s, &end, 0); + if (end == s) return false; + if (rel || v < 0) { + *value += v; + } else { + *value = v; + } + if (offset != NULL && *end == ':') { + /* Take the Nth within */ + s = end + 1; + *offset = strtol(s, &end, 0); + if (end == s) return false; + } + *buf = end; + return true; +} + +/* Parse out the next word, or NULL */ +char *ParseWord(char **buf) +{ + char *s = *buf, *r; + + while (*s == ' ' || *s == '\t') s++; + if (*s == '\0') return NULL; + + if (*s == '"') { + r = ++s; + /* parse until next " or NUL */ + for (;;) { + if (*s == '\0') break; + if (*s == '"') { + *s++ = '\0'; + break; + } + s++; + } + } else { + /* proceed until whitespace or NUL */ + r = s; + for (;;) { + if (*s == '\0') break; + if (*s == ' ' || *s == '\t') { + *s++ = '\0'; + break; + } + s++; + } + } + *buf = s; + return r; +} + +/* Forward declaration */ +static int TranslateArgumentIdx(int arg, int offset = 0); + +static void EmitWordList(Buffer *buffer, const char * const *words, uint nw) +{ + buffer->AppendByte(nw); + for (uint i = 0; i < nw; i++) buffer->AppendByte(strlen(words[i]) + 1); + for (uint i = 0; i < nw; i++) { + for (uint j = 0; words[i][j] != '\0'; j++) buffer->AppendByte(words[i][j]); + buffer->AppendByte(0); + } +} + +void EmitPlural(Buffer *buffer, char *buf, int value) +{ + int argidx = _cur_argidx; + int offset = 0; + const char *words[5]; + int nw = 0; + + /* Parse out the number, if one exists. Otherwise default to prev arg. */ + if (!ParseRelNum(&buf, &argidx, &offset)) argidx--; + + /* Parse each string */ + for (nw = 0; nw < 5; nw++) { + words[nw] = ParseWord(&buf); + if (words[nw] == NULL) break; + } + + if (nw == 0) { + strgen_fatal("%s: No plural words", _cur_ident); + } + + if (_plural_forms[_lang.plural_form].plural_count != nw) { + if (_translated) { + strgen_fatal("%s: Invalid number of plural forms. Expecting %d, found %d.", _cur_ident, + _plural_forms[_lang.plural_form].plural_count, nw); + } else { + if ((_show_todo & 2) != 0) strgen_warning("'%s' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident); + if (nw > _plural_forms[_lang.plural_form].plural_count) { + nw = _plural_forms[_lang.plural_form].plural_count; + } else { + for (; nw < _plural_forms[_lang.plural_form].plural_count; nw++) { + words[nw] = words[nw - 1]; + } + } + } + } + + buffer->AppendUtf8(SCC_PLURAL_LIST); + buffer->AppendByte(_lang.plural_form); + buffer->AppendByte(TranslateArgumentIdx(argidx, offset)); + EmitWordList(buffer, words, nw); +} + + +void EmitGender(Buffer *buffer, char *buf, int value) +{ + int argidx = _cur_argidx; + int offset = 0; + uint nw; + + if (buf[0] == '=') { + buf++; + + /* This is a {G=DER} command */ + nw = _lang.GetGenderIndex(buf); + if (nw >= MAX_NUM_GENDERS) strgen_fatal("G argument '%s' invalid", buf); + + /* now nw contains the gender index */ + buffer->AppendUtf8(SCC_GENDER_INDEX); + buffer->AppendByte(nw); + } else { + const char *words[MAX_NUM_GENDERS]; + + /* This is a {G 0 foo bar two} command. + * If no relative number exists, default to +0 */ + if (!ParseRelNum(&buf, &argidx, &offset)) {} + + const CmdStruct *cmd = _cur_pcs.cmd[argidx]; + if (cmd == NULL || (cmd->flags & C_GENDER) == 0) { + strgen_fatal("Command '%s' can't have a gender", cmd == NULL ? "<empty>" : cmd->cmd); + } + + for (nw = 0; nw < MAX_NUM_GENDERS; nw++) { + words[nw] = ParseWord(&buf); + if (words[nw] == NULL) break; + } + if (nw != _lang.num_genders) strgen_fatal("Bad # of arguments for gender command"); + + assert(IsInsideBS(cmd->value, SCC_CONTROL_START, UINT8_MAX)); + buffer->AppendUtf8(SCC_GENDER_LIST); + buffer->AppendByte(TranslateArgumentIdx(argidx, offset)); + EmitWordList(buffer, words, nw); + } +} + +static const CmdStruct *FindCmd(const char *s, int len) +{ + for (const CmdStruct *cs = _cmd_structs; cs != endof(_cmd_structs); cs++) { + if (strncmp(cs->cmd, s, len) == 0 && cs->cmd[len] == '\0') return cs; + } + return NULL; +} + +static uint ResolveCaseName(const char *str, uint len) +{ + /* First get a clean copy of only the case name, then resolve it. */ + char case_str[CASE_GENDER_LEN]; + len = min(lengthof(case_str) - 1, len); + memcpy(case_str, str, len); + case_str[len] = '\0'; + + uint8 case_idx = _lang.GetCaseIndex(case_str); + if (case_idx >= MAX_NUM_CASES) strgen_fatal("Invalid case-name '%s'", case_str); + return case_idx + 1; +} + + +/* returns NULL on eof + * else returns command struct */ +static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei) +{ + const char *s = *str, *start; + char c; + + *argno = -1; + *casei = -1; + + /* Scan to the next command, exit if there's no next command. */ + for (; *s != '{'; s++) { + if (*s == '\0') return NULL; + } + s++; // Skip past the { + + if (*s >= '0' && *s <= '9') { + char *end; + + *argno = strtoul(s, &end, 0); + if (*end != ':') strgen_fatal("missing arg #"); + s = end + 1; + } + + /* parse command name */ + start = s; + do { + c = *s++; + } while (c != '}' && c != ' ' && c != '=' && c != '.' && c != 0); + + const CmdStruct *cmd = FindCmd(start, s - start - 1); + if (cmd == NULL) { + strgen_error("Undefined command '%.*s'", (int)(s - start - 1), start); + return NULL; + } + + if (c == '.') { + const char *casep = s; + + if (!(cmd->flags & C_CASE)) { + strgen_fatal("Command '%s' can't have a case", cmd->cmd); + } + + do { + c = *s++; + } while (c != '}' && c != ' ' && c != '\0'); + *casei = ResolveCaseName(casep, s - casep - 1); + } + + if (c == '\0') { + strgen_error("Missing } from command '%s'", start); + return NULL; + } + + + if (c != '}') { + if (c == '=') s--; + /* copy params */ + start = s; + for (;;) { + c = *s++; + if (c == '}') break; + if (c == '\0') { + strgen_error("Missing } from command '%s'", start); + return NULL; + } + if (s - start == MAX_COMMAND_PARAM_SIZE) error("param command too long"); + *param++ = c; + } + } + *param = '\0'; + + *str = s; + + return cmd; +} + +/** + * Prepare reading. + * @param data The data to fill during reading. + * @param file The file we are reading. + * @param master Are we reading the master file? + * @param translation Are we reading a translation? + */ +StringReader::StringReader(StringData &data, const char *file, bool master, bool translation) : + data(data), file(strdup(file)), master(master), translation(translation) +{ +} + +/** Make sure the right reader gets freed. */ +StringReader::~StringReader() +{ + free(file); +} + +static void ExtractCommandString(ParsedCommandStruct *p, const char *s, bool warnings) +{ + char param[MAX_COMMAND_PARAM_SIZE]; + int argno; + int argidx = 0; + int casei; + + memset(p, 0, sizeof(*p)); + + for (;;) { + /* read until next command from a. */ + const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei); + + if (ar == NULL) break; + + /* Sanity checking */ + if (argno != -1 && ar->consumes == 0) strgen_fatal("Non consumer param can't have a paramindex"); + + if (ar->consumes) { + if (argno != -1) argidx = argno; + if (argidx < 0 || (uint)argidx >= lengthof(p->cmd)) strgen_fatal("invalid param idx %d", argidx); + if (p->cmd[argidx] != NULL && p->cmd[argidx] != ar) strgen_fatal("duplicate param idx %d", argidx); + + p->cmd[argidx++] = ar; + } else if (!(ar->flags & C_DONTCOUNT)) { // Ignore some of them + if (p->np >= lengthof(p->pairs)) strgen_fatal("too many commands in string, max " PRINTF_SIZE, lengthof(p->pairs)); + p->pairs[p->np].a = ar; + p->pairs[p->np].v = param[0] != '\0' ? strdup(param) : ""; + p->np++; + } + } +} + + +static const CmdStruct *TranslateCmdForCompare(const CmdStruct *a) +{ + if (a == NULL) return NULL; + + if (strcmp(a->cmd, "STRING1") == 0 || + strcmp(a->cmd, "STRING2") == 0 || + strcmp(a->cmd, "STRING3") == 0 || + strcmp(a->cmd, "STRING4") == 0 || + strcmp(a->cmd, "STRING5") == 0 || + strcmp(a->cmd, "RAW_STRING") == 0) { + return FindCmd("STRING", 6); + } + + return a; +} + + +static bool CheckCommandsMatch(char *a, char *b, const char *name) +{ + /* If we're not translating, i.e. we're compiling the base language, + * it is pointless to do all these checks as it'll always be correct. + * After all, all checks are based on the base language. + */ + if (!_translation) return true; + + ParsedCommandStruct templ; + ParsedCommandStruct lang; + bool result = true; + + ExtractCommandString(&templ, b, true); + ExtractCommandString(&lang, a, true); + + /* For each string in templ, see if we find it in lang */ + if (templ.np != lang.np) { + strgen_warning("%s: template string and language string have a different # of commands", name); + result = false; + } + + for (uint i = 0; i < templ.np; i++) { + /* see if we find it in lang, and zero it out */ + bool found = false; + for (uint j = 0; j < lang.np; j++) { + if (templ.pairs[i].a == lang.pairs[j].a && + strcmp(templ.pairs[i].v, lang.pairs[j].v) == 0) { + /* it was found in both. zero it out from lang so we don't find it again */ + lang.pairs[j].a = NULL; + found = true; + break; + } + } + + if (!found) { + strgen_warning("%s: command '%s' exists in template file but not in language file", name, templ.pairs[i].a->cmd); + result = false; + } + } + + /* if we reach here, all non consumer commands match up. + * Check if the non consumer commands match up also. */ + for (uint i = 0; i < lengthof(templ.cmd); i++) { + if (TranslateCmdForCompare(templ.cmd[i]) != lang.cmd[i]) { + strgen_warning("%s: Param idx #%d '%s' doesn't match with template command '%s'", name, i, + lang.cmd[i] == NULL ? "<empty>" : TranslateCmdForCompare(lang.cmd[i])->cmd, + templ.cmd[i] == NULL ? "<empty>" : templ.cmd[i]->cmd); + result = false; + } + } + + return result; +} + +void StringReader::HandleString(char *str) +{ + if (*str == '#') { + if (str[1] == '#' && str[2] != '#') this->HandlePragma(str + 2); + return; + } + + /* Ignore comments & blank lines */ + if (*str == ';' || *str == ' ' || *str == '\0') return; + + char *s = strchr(str, ':'); + if (s == NULL) { + strgen_error("Line has no ':' delimiter"); + return; + } + + char *t; + /* Trim spaces. + * After this str points to the command name, and s points to the command contents */ + for (t = s; t > str && (t[-1] == ' ' || t[-1] == '\t'); t--) {} + *t = 0; + s++; + + /* Check string is valid UTF-8 */ + const char *tmp; + for (tmp = s; *tmp != '\0';) { + size_t len = Utf8Validate(tmp); + if (len == 0) strgen_fatal("Invalid UTF-8 sequence in '%s'", s); + + WChar c; + Utf8Decode(&c, tmp); + if (c <= 0x001F || // ASCII control character range + (c >= 0xE000 && c <= 0xF8FF) || // Private range + (c >= 0xFFF0 && c <= 0xFFFF)) { // Specials range + strgen_fatal("Unwanted UTF-8 character U+%04X in sequence '%s'", c, s); + } + + tmp += len; + } + + /* Check if the string has a case.. + * The syntax for cases is IDENTNAME.case */ + char *casep = strchr(str, '.'); + if (casep != NULL) *casep++ = '\0'; + + /* Check if this string already exists.. */ + LangString *ent = this->data.Find(str); + + if (this->master) { + if (casep != NULL) { + strgen_error("Cases in the base translation are not supported."); + return; + } + + if (ent != NULL) { + strgen_error("String name '%s' is used multiple times", str); + return; + } + + if (this->data.strings[this->data.next_string_id] != NULL) { + strgen_error("String ID 0x%X for '%s' already in use by '%s'", this->data.next_string_id, str, this->data.strings[this->data.next_string_id]->name); + return; + } + + /* Allocate a new LangString */ + this->data.Add(str, new LangString(str, s, this->data.next_string_id++, _cur_line)); + } else { + if (ent == NULL) { + strgen_warning("String name '%s' does not exist in master file", str); + return; + } + + if (ent->translated && casep == NULL) { + strgen_error("String name '%s' is used multiple times", str); + return; + } + + /* make sure that the commands match */ + if (!CheckCommandsMatch(s, ent->english, str)) return; + + if (casep != NULL) { + ent->translated_case = new Case(ResolveCaseName(casep, strlen(casep)), s, ent->translated_case); + } else { + ent->translated = strdup(s); + /* If the string was translated, use the line from the + * translated language so errors in the translated file + * are properly referenced to. */ + ent->line = _cur_line; + } + } +} + + +static void rstrip(char *buf) +{ + int i = strlen(buf); + while (i > 0 && (buf[i - 1] == '\r' || buf[i - 1] == '\n' || buf[i - 1] == ' ')) i--; + buf[i] = '\0'; +} + +void StringReader::ParseFile() +{ + char buf[2048]; + _warnings = _errors = 0; + + _translation = this->master || this->translation; + _file = this->file; + + /* For each new file we parse, reset the genders, and language codes. */ + MemSetT(&_lang, 0); + strecpy(_lang.digit_group_separator, ",", lastof(_lang.digit_group_separator)); + strecpy(_lang.digit_group_separator_currency, ",", lastof(_lang.digit_group_separator_currency)); + strecpy(_lang.digit_decimal_separator, ".", lastof(_lang.digit_decimal_separator)); + + _cur_line = 1; + while (this->ReadLine(buf, sizeof(buf)) != NULL) { + rstrip(buf); + this->HandleString(buf); + _cur_line++; + } +} + +/** + * Write the header information. + * @param data The data about the string. + */ +void HeaderWriter::WriteHeader(const StringData &data) +{ + int last = 0; + for (size_t i = 0; i < data.max_strings; i++) { + if (data.strings[i] != NULL) { + this->WriteStringID(data.strings[i]->name, i); + last = i; + } + } + + this->WriteStringID("STR_LAST_STRINGID", last); +} + +static int TranslateArgumentIdx(int argidx, int offset) +{ + int sum; + + if (argidx < 0 || (uint)argidx >= lengthof(_cur_pcs.cmd)) { + strgen_fatal("invalid argidx %d", argidx); + } + const CmdStruct *cs = _cur_pcs.cmd[argidx]; + if (cs != NULL && cs->consumes <= offset) { + strgen_fatal("invalid argidx offset %d:%d", argidx, offset); + } + + if (_cur_pcs.cmd[argidx] == NULL) { + strgen_fatal("no command for this argidx %d", argidx); + } + + for (int i = sum = 0; i < argidx; i++) { + const CmdStruct *cs = _cur_pcs.cmd[i]; + + sum += (cs != NULL) ? cs->consumes : 1; + } + + return sum + offset; +} + +static void PutArgidxCommand(Buffer *buffer) +{ + buffer->AppendUtf8(SCC_ARG_INDEX); + buffer->AppendByte(TranslateArgumentIdx(_cur_argidx)); +} + + +static void PutCommandString(Buffer *buffer, const char *str) +{ + _cur_argidx = 0; + + while (*str != '\0') { + /* Process characters as they are until we encounter a { */ + if (*str != '{') { + buffer->AppendByte(*str++); + continue; + } + + char param[MAX_COMMAND_PARAM_SIZE]; + int argno; + int casei; + const CmdStruct *cs = ParseCommandString(&str, param, &argno, &casei); + if (cs == NULL) break; + + if (casei != -1) { + buffer->AppendUtf8(SCC_SET_CASE); // {SET_CASE} + buffer->AppendByte(casei); + } + + /* For params that consume values, we need to handle the argindex properly */ + if (cs->consumes > 0) { + /* Check if we need to output a move-param command */ + if (argno != -1 && argno != _cur_argidx) { + _cur_argidx = argno; + PutArgidxCommand(buffer); + } + + /* Output the one from the master string... it's always accurate. */ + cs = _cur_pcs.cmd[_cur_argidx++]; + if (cs == NULL) { + strgen_fatal("%s: No argument exists at position %d", _cur_ident, _cur_argidx - 1); + } + } + + cs->proc(buffer, param, cs->value); + } +} + +/** + * Write the length as a simple gamma. + * @param length The number to write. + */ +void LanguageWriter::WriteLength(uint length) +{ + char buffer[2]; + int offs = 0; + if (length >= 0x4000) { + strgen_fatal("string too long"); + } + + if (length >= 0xC0) { + buffer[offs++] = (length >> 8) | 0xC0; + } + buffer[offs++] = length & 0xFF; + this->Write((byte*)buffer, offs); +} + +/** + * Actually write the language. + * @param data The data about the string. + */ +void LanguageWriter::WriteLang(const StringData &data) +{ + uint *in_use = AllocaM(uint, data.tabs); + for (size_t tab = 0; tab < data.tabs; tab++) { + uint n = data.CountInUse(tab); + + in_use[tab] = n; + _lang.offsets[tab] = TO_LE16(n); + + for (uint j = 0; j != in_use[tab]; j++) { + const LangString *ls = data.strings[(tab * StringData::STRINGS_IN_TAB) + j]; + if (ls != NULL && ls->translated == NULL) _lang.missing++; + } + } + + _lang.ident = TO_LE32(LanguagePackHeader::IDENT); + _lang.version = TO_LE32(data.Version()); + _lang.missing = TO_LE16(_lang.missing); + _lang.winlangid = TO_LE16(_lang.winlangid); + + this->WriteHeader(&_lang); + Buffer buffer; + + for (size_t tab = 0; tab < data.tabs; tab++) { + for (uint j = 0; j != in_use[tab]; j++) { + const LangString *ls = data.strings[(tab * StringData::STRINGS_IN_TAB) + j]; + const Case *casep; + const char *cmdp; + + /* For undefined strings, just set that it's an empty string */ + if (ls == NULL) { + this->WriteLength(0); + continue; + } + + _cur_ident = ls->name; + _cur_line = ls->line; + + /* Produce a message if a string doesn't have a translation. */ + if (_show_todo > 0 && ls->translated == NULL) { + if ((_show_todo & 2) != 0) { + strgen_warning("'%s' is untranslated", ls->name); + } + if ((_show_todo & 1) != 0) { + const char *s = "<TODO> "; + while (*s != '\0') buffer.AppendByte(*s++); + } + } + + /* Extract the strings and stuff from the english command string */ + ExtractCommandString(&_cur_pcs, ls->english, false); + + if (ls->translated_case != NULL || ls->translated != NULL) { + casep = ls->translated_case; + cmdp = ls->translated; + } else { + casep = NULL; + cmdp = ls->english; + } + + _translated = cmdp != ls->english; + + if (casep != NULL) { + const Case *c; + uint num; + + /* Need to output a case-switch. + * It has this format + * <0x9E> <NUM CASES> <CASE1> <LEN1> <STRING1> <CASE2> <LEN2> <STRING2> <CASE3> <LEN3> <STRING3> <STRINGDEFAULT> + * Each LEN is printed using 2 bytes in big endian order. */ + buffer.AppendUtf8(SCC_SWITCH_CASE); + /* Count the number of cases */ + for (num = 0, c = casep; c; c = c->next) num++; + buffer.AppendByte(num); + + /* Write each case */ + for (c = casep; c != NULL; c = c->next) { + buffer.AppendByte(c->caseidx); + /* Make some space for the 16-bit length */ + size_t pos = buffer.Length(); + buffer.AppendByte(0); + buffer.AppendByte(0); + /* Write string */ + PutCommandString(&buffer, c->string); + buffer.AppendByte(0); // terminate with a zero + /* Fill in the length */ + size_t size = buffer.Length() - (pos + 2); + buffer[pos + 0] = GB(size, 8, 8); + buffer[pos + 1] = GB(size, 0, 8); + } + } + + if (cmdp != NULL) PutCommandString(&buffer, cmdp); + + this->WriteLength(buffer.Length()); + this->Write(buffer.Begin(), buffer.Length()); + buffer.Clear(); + } + } +} |