diff options
author | Eduardo Chappa <echappa@gmx.com> | 2013-02-03 00:59:38 -0700 |
---|---|---|
committer | Eduardo Chappa <echappa@gmx.com> | 2013-02-03 00:59:38 -0700 |
commit | 094ca96844842928810f14844413109fc6cdd890 (patch) | |
tree | e60efbb980f38ba9308ccb4fb2b77b87bbc115f3 /imap/src/c-client/utf8.h | |
download | alpine-094ca96844842928810f14844413109fc6cdd890.tar.xz |
Initial Alpine Version
Diffstat (limited to 'imap/src/c-client/utf8.h')
-rw-r--r-- | imap/src/c-client/utf8.h | 584 |
1 files changed, 584 insertions, 0 deletions
diff --git a/imap/src/c-client/utf8.h b/imap/src/c-client/utf8.h new file mode 100644 index 00000000..105f856d --- /dev/null +++ b/imap/src/c-client/utf8.h @@ -0,0 +1,584 @@ +/* ======================================================================== + * Copyright 1988-2008 University of Washington + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * + * ======================================================================== + */ + +/* + * Program: UTF-8 routines + * + * Author: Mark Crispin + * Networks and Distributed Computing + * Computing & Communications + * University of Washington + * Administration Building, AG-44 + * Seattle, WA 98195 + * Internet: MRC@CAC.Washington.EDU + * + * Date: 11 June 1997 + * Last Edited: 17 January 2008 + */ + +/* UTF-8 size and conversion routines from UCS-2 values (thus in the BMP). + * Don't use these if UTF-16 data (surrogate pairs) are an issue. + * For UCS-4 values, use the utf8_size() and utf8_put() functions. + */ + +#define UTF8_SIZE_BMP(c) ((c & 0xff80) ? ((c & 0xf800) ? 3 : 2) : 1) + +#define UTF8_PUT_BMP(b,c) { \ + if (c & 0xff80) { /* non-ASCII? */ \ + if (c & 0xf800) { /* three byte code */ \ + *b++ = 0xe0 | (c >> 12); \ + *b++ = 0x80 | ((c >> 6) & 0x3f); \ + } \ + else *b++ = 0xc0 | ((c >> 6) & 0x3f); \ + *b++ = 0x80 | (c & 0x3f); \ + } \ + else *b++ = c; \ +} + +/* utf8_text() flag values */ + +#define U8T_CASECANON 2 /* canonicalize case */ +#define U8T_DECOMPOSE 4 /* decompose */ + /* full canonicalization */ +#define U8T_CANONICAL (U8T_CASECANON | U8T_DECOMPOSE) + + +/* utf8_get() return values */ + + /* 0x0000 - 0xffff BMP plane */ +#define U8GM_NONBMP 0xffff0000 /* mask for non-BMP values */ + /* 0x10000 - 0x10ffff extended planes */ + /* 0x110000 - 0x7ffffff non-Unicode */ +#define U8G_ERROR 0x80000000 /* error flag */ +#define U8G_BADCONT U8G_ERROR+1 /* continuation when not in progress */ +#define U8G_INCMPLT U8G_ERROR+2 /* incomplete UTF-8 character */ +#define U8G_NOTUTF8 U8G_ERROR+3 /* not a valid UTF-8 octet */ +#define U8G_ENDSTRG U8G_ERROR+4 /* end of string */ +#define U8G_ENDSTRI U8G_ERROR+5 /* end of string w/ incomplete UTF-8 char */ +#define U8G_SURROGA U8G_ERROR+6 /* surrogate codepoint */ +#define U8G_NOTUNIC U8G_ERROR+7 /* non-Unicode codepoint */ + + +/* ucs4_width() return values */ + +#define U4W_ERROR 0x80000000 /* error flags */ +#define U4W_NOTUNCD U4W_ERROR+1 /* not a Unicode char */ +#define U4W_PRIVATE U4W_ERROR+2 /* private-space plane */ +#define U4W_SSPCHAR U4W_ERROR+3 /* Supplementary Special-purpose Plane */ +#define U4W_UNASSGN U4W_ERROR+4 /* unassigned space plane */ +#define U4W_CONTROL U4W_ERROR+5 /* C0/C1 control */ +#define U4W_CTLSRGT U4W_CONTROL /* in case legacy code references this */ + +/* ISO-2022 engine states */ + +#define I2S_CHAR 0 /* character */ +#define I2S_ESC 1 /* previous character was ESC */ +#define I2S_MUL 2 /* previous character was multi-byte code */ +#define I2S_INT 3 /* previous character was intermediate */ + + +/* ISO-2022 Gn selections */ + +#define I2C_G0 0 /* G0 */ +#define I2C_G1 1 /* G1 */ +#define I2C_G2 2 /* G2 */ +#define I2C_G3 3 /* G3 */ +#define I2C_SG2 (2 << 2) /* single shift G2 */ +#define I2C_SG3 (3 << 2) /* single shift G2 */ + + +/* ISO-2022 octet definitions */ + +#define I2C_ESC 0x1b /* ESCape */ + + /* Intermediate character */ +#define I2C_STRUCTURE 0x20 /* announce code structure */ +#define I2C_C0 0x21 /* C0 */ +#define I2C_C1 0x22 /* C1 */ +#define I2C_CONTROL 0x23 /* single control function */ +#define I2C_MULTI 0x24 /* multi-byte character set */ +#define I2C_OTHER 0x25 /* other coding system */ +#define I2C_REVISED 0x26 /* revised registration */ +#define I2C_G0_94 0x28 /* G0 94-character set */ +#define I2C_G1_94 0x29 /* G1 94-character set */ +#define I2C_G2_94 0x2A /* G2 94-character set */ +#define I2C_G3_94 0x2B /* G3 94-character set */ +#define I2C_G0_96 0x2C /* (not in ISO-2022) G0 96-character set */ +#define I2C_G1_96 0x2D /* G1 96-character set */ +#define I2C_G2_96 0x2E /* G2 96-character set */ +#define I2C_G3_96 0x2F /* G3 96-character set */ + + /* Locking shifts */ +#define I2C_SI 0x0f /* lock shift to G0 (Shift In) */ +#define I2C_SO 0x0e /* lock shift to G1 (Shift Out) */ + /* prefixed by ESC */ +#define I2C_LS2 0x6e /* lock shift to G2 */ +#define I2C_LS3 0x6f /* lock shift to G3 */ +#define I2C_LS1R 0x7e /* lock shift GR to G1 */ +#define I2C_LS2R 0x7d /* lock shift GR to G2 */ +#define I2C_LS3R 0x7c /* lock shift GR to G3 */ + + /* Single shifts */ +#define I2C_SS2_ALT 0x8e /* single shift to G2 (SS2) */ +#define I2C_SS3_ALT 0x8f /* single shift to G3 (SS3) */ +#define I2C_SS2_ALT_7 0x19 /* single shift to G2 (SS2) */ +#define I2C_SS3_ALT_7 0x1d /* single shift to G3 (SS3) */ + /* prefixed by ESC */ +#define I2C_SS2 0x4e /* single shift to G2 (SS2) */ +#define I2C_SS3 0x4f /* single shift to G3 (SS3) */ + +/* 94 character sets */ + + /* 4/0 ISO 646 IRV */ +#define I2CS_94_BRITISH 0x41 /* 4/1 ISO 646 British */ +#define I2CS_94_ASCII 0x42 /* 4/2 ISO 646 USA (ASCII) */ + /* 4/3 NATS Finland/Sweden (primary) */ + /* 4/4 NATS Finland/Sweden (secondary) */ + /* 4/5 NATS Denmark/Norway (primary) */ + /* 4/6 NATS Denmark/Norway (secondary) */ + /* 4/7 ISO 646 Swedish SEN 850200 */ + /* 4/8 ISO 646 Swedish names */ +#define I2CS_94_JIS_BUGROM 0x48 /* 4/8 some buggy software does this */ +#define I2CS_94_JIS_KANA 0x49 /* 4/9 JIS X 0201-1976 right half */ +#define I2CS_94_JIS_ROMAN 0x4a /* 4/a JIS X 0201-1976 left half */ + /* 4/b ISO 646 German */ + /* 4/c ISO 646 Portuguese (Olivetti) */ + /* 4/d ISO 6438 African */ + /* 4/e ISO 5427 Cyrillic (Honeywell-Bull) */ + /* 4/f DIN 31624 extended bibliography */ + /* 5/0 ISO 5426-1980 Bibliography */ + /* 5/1 ISO 5427-1981 Cyrillic*/ + /* 5/2 ISO 646 French (withdrawn) */ + /* 5/3 ISO 5428-1980 Greek bibliography */ + /* 5/4 GB 1988-80 Chinese */ + /* 5/5 Latin-Greek (Honeywell-Bull) */ + /* 5/6 UK Viewdata/Teletext */ + /* 5/7 INIS (IRV subset) */ + /* 5/8 ISO 5428 Greek Bibliography */ + /* 5/9 ISO 646 Italian (Olivetti) */ + /* 5/a ISO 646 Spanish (Olivetti) */ + /* 5/b Greek (Olivetti) */ + /* 5/c Latin-Greek (Olivetti) */ + /* 5/d INIS non-standard extension */ + /* 5/e INIS Cyrillic extension */ + /* 5/f Arabic CODAR-U IERA */ + /* 6/0 ISO 646 Norwegian */ + /* 6/1 Norwegian version 2 (withdrawn) */ + /* 6/2 Videotex supplementary */ + /* 6/3 Videotex supplementary #2 */ + /* 6/4 Videotex supplementary #3 */ + /* 6/5 APL */ + /* 6/6 ISO 646 French */ + /* 6/7 ISO 646 Portuguese (IBM) */ + /* 6/8 ISO 646 Spanish (IBM) */ + /* 6/9 ISO 646 Hungarian */ + /* 6/a Greek ELOT (withdrawn) */ + /* 6/b ISO 9036 Arabic 7-bit */ + /* 6/c ISO 646 IRV supplementary set */ + /* 6/d JIS C6229-1984 OCR-A */ + /* 6/e JIS C6229-1984 OCR-B */ + /* 6/f JIS C6229-1984 OCR-B additional */ + /* 7/0 JIS C6229-1984 hand-printed */ + /* 7/1 JIS C6229-1984 additional hand-printd */ + /* 7/2 JIS C6229-1984 katakana hand-printed */ + /* 7/3 E13B Japanese graphic */ + /* 7/4 Supplementary Videotex (withdrawn) */ + /* 7/5 Teletex primary CCITT T.61 */ + /* 7/6 Teletex secondary CCITT T.61 */ + /* 7/7 CSA Z 243.4-1985 Alternate primary #1 */ + /* 7/8 CSA Z 243.4-1985 Alternate primary #2 */ + /* 7/9 Mosaic CCITT T.101 */ + /* 7/a Serbocroatian/Slovenian Latin */ + /* 7/b Serbocroatian Cyrillic */ + /* 7/c Supplementary CCITT T.101 */ + /* 7/d Macedonian Cyrillic */ + +/* 94 character sets - second intermediate byte */ + + /* 4/0 Greek primary CCITT */ + /* 4/1 Cuba */ + /* 4/2 ISO/IEC 646 invariant */ + /* 4/3 Irish Gaelic 7-bit */ + /* 4/4 Turkmen */ + + +/* 94x94 character sets */ + +#define I2CS_94x94_JIS_OLD 0x40 /* 4/0 JIS X 0208-1978 */ +#define I2CS_94x94_GB 0x41 /* 4/1 GB 2312 */ +#define I2CS_94x94_JIS_NEW 0x42 /* 4/2 JIS X 0208-1983 */ +#define I2CS_94x94_KSC 0x43 /* 4/3 KSC 5601 */ +#define I2CS_94x94_JIS_EXT 0x44 /* 4/4 JIS X 0212-1990 */ + /* 4/5 CCITT Chinese */ + /* 4/6 Blisssymbol Graphic */ +#define I2CS_94x94_CNS1 0x47 /* 4/7 CNS 11643 plane 1 */ +#define I2CS_94x94_CNS2 0x48 /* 4/8 CNS 11643 plane 2 */ +#define I2CS_94x94_CNS3 0x49 /* 4/9 CNS 11643 plane 3 */ +#define I2CS_94x94_CNS4 0x4a /* 4/a CNS 11643 plane 4 */ +#define I2CS_94x94_CNS5 0x4b /* 4/b CNS 11643 plane 5 */ +#define I2CS_94x94_CNS6 0x4c /* 4/c CNS 11643 plane 6 */ +#define I2CS_94x94_CNS7 0x4d /* 4/d CNS 11643 plane 7 */ + /* 4/e DPRK (North Korea) KGCII */ + /* 4/f JGCII plane 1 */ + /* 5/0 JGCII plane 2 */ + +/* 96 character sets */ + +#define I2CS_96_ISO8859_1 0x41 /* 4/1 Latin-1 (Western Europe) */ +#define I2CS_96_ISO8859_2 0x42 /* 4/2 Latin-2 (Czech, Slovak) */ +#define I2CS_96_ISO8859_3 0x43 /* 4/3 Latin-3 (Dutch, Turkish) */ +#define I2CS_96_ISO8859_4 0x44 /* 4/4 Latin-4 (Scandinavian) */ + /* 4/5 CSA Z 243.4-1985 */ +#define I2CS_96_ISO8859_7 0x46 /* 4/6 Greek */ +#define I2CS_96_ISO8859_6 0x47 /* 4/7 Arabic */ +#define I2CS_96_ISO8859_8 0x48 /* 4/8 Hebrew */ + /* 4/9 Czechoslovak CSN 369103 */ + /* 4/a Supplementary Latin and non-alpha */ + /* 4/b Technical */ +#define I2CS_96_ISO8859_5 0x4c /* 4/c Cyrillic */ +#define I2CS_96_ISO8859_9 0x4d /* 4/d Latin-5 (Finnish, Portuguese) */ + /* 4/e ISO 6937-2 residual */ + /* 4/f Basic Cyrillic */ + /* 5/0 Supplementary Latin 1, 2 and 5 */ + /* 5/1 Basic Box */ + /* 5/2 Supplementary ISO/IEC 6937 : 1992 */ + /* 5/3 CCITT Hebrew supplementary */ +#define I2CS_96_TIS620 0x54 /* 5/4 TIS 620 */ + /* 5/5 Arabic/French/German */ +#define I2CS_96_ISO8859_10 0x56 /* 5/6 Latin-6 (Northern Europe) */ + /* 5/7 ??? */ + /* 5/8 Sami (Lappish) supplementary */ +#define I2CS_96_ISO8859_13 0x59 /* 5/9 Latin-7 (Baltic) */ +#define I2CS_96_VSCII 0x5a /* 5/a Vietnamese */ + /* 5/b Technical #1 IEC 1289 */ +#define I2CS_96_ISO8859_14 0x5c /* 5/c Latin-8 (Celtic) */ + /* 5/d Sami supplementary Latin */ + /* 5/e Latin/Hebrew */ + /* 5/f Celtic supplementary Latin */ + /* 6/0 Uralic supplementary Cyrillic */ + /* 6/1 Volgaic supplementary Cyrillic */ +#define I2CS_96_ISO8859_15 0x62 /* 6/2 Latin-9 (Euro) */ + /* 6/3 Latin-1 with Euro */ + /* 6/4 Latin-4 with Euro */ + /* 6/5 Latin-7 with Euro */ +#define I2CS_96_ISO8859_16 0x66 /* 6/6 Latin-10 (Balkan) */ + /* 6/7 Ogham */ + /* 6/8 Sami supplementary Latin #2 */ + /* 7/d Supplementary Mosaic for CCITT 101 */ + +/* 96x96 character sets */ + +/* Types of character sets */ + +#define I2CS_94 0x000 /* 94 character set */ +#define I2CS_96 0x100 /* 96 character set */ +#define I2CS_MUL 0x200 /* multi-byte */ +#define I2CS_94x94 (I2CS_MUL | I2CS_94) +#define I2CS_96x96 (I2CS_MUL | I2CS_96) + + +/* Character set identifiers stored in Gn */ + +#define I2CS_BRITISH (I2CS_94 | I2CS_94_BRITISH) +#define I2CS_ASCII (I2CS_94 | I2CS_94_ASCII) +#define I2CS_JIS_BUGROM (I2CS_94 | I2CS_94_JIS_BUGROM) +#define I2CS_JIS_KANA (I2CS_94 | I2CS_94_JIS_KANA) +#define I2CS_JIS_ROMAN (I2CS_94 | I2CS_94_JIS_ROMAN) +#define I2CS_JIS_OLD (I2CS_94x94 | I2CS_94x94_JIS_OLD) +#define I2CS_GB (I2CS_94x94 | I2CS_94x94_GB) +#define I2CS_JIS_NEW (I2CS_94x94 | I2CS_94x94_JIS_NEW) +#define I2CS_KSC (I2CS_94x94 | I2CS_94x94_KSC) +#define I2CS_JIS_EXT (I2CS_94x94 | I2CS_94x94_JIS_EXT) +#define I2CS_CNS1 (I2CS_94x94 | I2CS_94x94_CNS1) +#define I2CS_CNS2 (I2CS_94x94 | I2CS_94x94_CNS2) +#define I2CS_CNS3 (I2CS_94x94 | I2CS_94x94_CNS3) +#define I2CS_CNS4 (I2CS_94x94 | I2CS_94x94_CNS4) +#define I2CS_CNS5 (I2CS_94x94 | I2CS_94x94_CNS5) +#define I2CS_CNS6 (I2CS_94x94 | I2CS_94x94_CNS6) +#define I2CS_CNS7 (I2CS_94x94 | I2CS_94x94_CNS7) +#define I2CS_ISO8859_1 (I2CS_96 | I2CS_96_ISO8859_1) +#define I2CS_ISO8859_2 (I2CS_96 | I2CS_96_ISO8859_2) +#define I2CS_ISO8859_3 (I2CS_96 | I2CS_96_ISO8859_3) +#define I2CS_ISO8859_4 (I2CS_96 | I2CS_96_ISO8859_4) +#define I2CS_ISO8859_7 (I2CS_96 | I2CS_96_ISO8859_7) +#define I2CS_ISO8859_6 (I2CS_96 | I2CS_96_ISO8859_6) +#define I2CS_ISO8859_8 (I2CS_96 | I2CS_96_ISO8859_8) +#define I2CS_ISO8859_5 (I2CS_96 | I2CS_96_ISO8859_5) +#define I2CS_ISO8859_9 (I2CS_96 | I2CS_96_ISO8859_9) +#define I2CS_TIS620 (I2CS_96 | I2CS_96_TIS620) +#define I2CS_ISO8859_10 (I2CS_96 | I2CS_96_ISO8859_10) +#define I2CS_ISO8859_13 (I2CS_96 | I2CS_96_ISO8859_13) +#define I2CS_VSCII (I2CS_96 | I2CS_96_VSCII) +#define I2CS_ISO8859_14 (I2CS_96 | I2CS_96_ISO8859_14) +#define I2CS_ISO8859_15 (I2CS_96 | I2CS_96_ISO8859_15) +#define I2CS_ISO8859_16 (I2CS_96 | I2CS_96_ISO8859_16) + + +/* Miscellaneous ISO 2022 definitions */ + +#define EUC_CS2 0x8e /* single shift CS2 */ +#define EUC_CS3 0x8f /* single shift CS3 */ + +#define BITS7 0x7f /* 7-bit value mask */ +#define BIT8 0x80 /* 8th bit mask */ + +/* The following saves us from having to have yet more charset tables */ + +/* Unicode codepoints */ + +#define UCS2_C0CONTROL 0x00 /* first C0 control */ +#define UCS2_C0CONTROLEND 0x1F /* last C0 control */ +#define UCS2_C1CONTROL 0x80 /* first C1 control */ +#define UCS2_C1CONTROLEND 0x9F /* last C1 control */ + + /* ISO 646 substituted Unicode codepoints */ +#define UCS2_POUNDSTERLING 0x00a3 +#define UCS2_YEN 0x00a5 +#define UCS2_OVERLINE 0x203e +#define UCS2_EURO 0x20ac +#define UCS2_KATAKANA 0xff61 /* first katakana codepoint */ +#define UCS2_BOM 0xfeff /* byte order mark */ +#define UCS2_BOGON 0xfffd /* replacement character */ + /* next two codepoints are not Unicode chars */ +#define UCS2_BOMCHECK 0xfffe /* used to check byte order with UCS2_BOM */ +#define UCS2_NOTCHAR 0xffff /* not a character */ + +#define UCS4_BMPBASE 0x0000 /* Basic Multilingual Plane */ +#define UCS4_SMPBASE 0x10000 /* Supplementary Multilinugual Plane */ +#define UCS4_SIPBASE 0x20000 /* Supplementary Ideographic Plane */ + /* EastAsianWidth says plane 3 is wide */ +#define UCS4_UNABASE 0x40000 /* unassigned space */ +#define UCS4_SSPBASE 0xe0000 /* Supplementary Special-purpose Plane */ +#define UCS4_PVTBASE 0xf0000 /* private-space (two planes) */ +#define UCS4_MAXUNICODE 0x10ffff/* highest Unicode codepoint */ + +#define UTF16_BASE 0x10000 /* base of codepoints needing surrogates */ +#define UTF16_SHIFT 10 /* surrogate shift */ +#define UTF16_MASK 0x3ff /* surrogate mask */ +#define UTF16_SURR 0xd800 /* UTF-16 surrogate area */ +#define UTF16_SURRH 0xd800 /* UTF-16 first high surrogate */ +#define UTF16_SURRHEND 0xdbff /* UTF-16 last high surrogate */ +#define UTF16_SURRL 0xdc00 /* UTF-16 first low surrogate */ +#define UTF16_SURRLEND 0xdfff /* UTF-16 last low surrogate */ +#define UTF16_MAXSURR 0xdfff /* end of UTF-16 surrogates */ + + +/* UBOGON is used to represent a codepoint in a character set which does not + * map to Unicode. It is also used for mapping failures, e.g. incomplete + * shift sequences. This name has the same text width as 0x????, for + * convenience in the mapping tables. + * + * NOCHAR is used to represent a codepoint in Unicode which does not map to + * the target character set in a reverse mapping table. This name has the + * same text width as 0x???? in case we ever add static reverse mapping tables. + */ + +#define UBOGON UCS2_BOGON +#define NOCHAR UCS2_NOTCHAR + +/* Codepoints in non-Unicode character sets */ + +/* Codepoints in ISO 646 character sets */ + +/* British ASCII codepoints */ + +#define BRITISH_POUNDSTERLING 0x23 + +/* JIS Roman codepoints */ + +#define JISROMAN_YEN 0x5c +#define JISROMAN_OVERLINE 0x7e + + +/* Hankaku katakana codepoints & parameters + * + * In earlier versions, MAX_KANA_7 and MAX_KANA_8 were the maximum codepoint + * values. Although this made sense, it was confusing with the "max ku" and + * "max ten" values used in the double-byte tables; there are 1-origin, but + * the calculated values used for "ku" and "ten" are 0-origin (derived by + * substracting the "base"). What this all meant is that for double byte + * characters the limit test is of the form (value < max_ku), but for single + * byte characters (which used the same cell to hold the max ku) the limit + * test was (value <= max_ku). + * + * By making MAX_KANA_[78] be maximum+1, the same (value < max_ku) limit test + * is used throughout. - 6/15/2006 + */ + +#define MIN_KANA_7 0x21 +#define MAX_KANA_7 0x60 /* maximum value + 1 */ +#define KANA_7 (UCS2_KATAKANA - MIN_KANA_7) +#define MIN_KANA_8 (MIN_KANA_7 | BIT8) +#define MAX_KANA_8 (MAX_KANA_7 | BIT8) +#define KANA_8 (UCS2_KATAKANA - MIN_KANA_8) + +/* Charset scripts */ + +/* The term "script" is used here in a very loose sense, enough to make + * purists cringe. Basically, the idea is to give the main program some + * idea of how it should treat the characters of text in a charset with + * respect to font, drawing routines, etc. + * + * In some cases, "script" is associated with a charset; in other cases, + * it's more closely tied to a language. + */ + +#define SC_UNICODE 0x1 /* Unicode */ +#define SC_LATIN_1 0x10 /* Western Europe */ +#define SC_LATIN_2 0x20 /* Eastern Europe */ +#define SC_LATIN_3 0x40 /* Southern Europe */ +#define SC_LATIN_4 0x80 /* Northern Europe */ +#define SC_LATIN_5 0x100 /* Turkish */ +#define SC_LATIN_6 0x200 /* Nordic */ +#define SC_LATIN_7 0x400 /* Baltic */ +#define SC_LATIN_8 0x800 /* Celtic */ +#define SC_LATIN_9 0x1000 /* Euro */ +#define SC_LATIN_0 SC_LATIN_9 /* colloquial name for Latin-9 */ +#define SC_ARABIC 0x2000 +#define SC_CYRILLIC 0x4000 +#define SC_GREEK 0x8000 +#define SC_HEBREW 0x10000 +#define SC_THAI 0x20000 +#define SC_UKRANIAN 0x40000 +#define SC_LATIN_10 0x80000 /* Balkan */ +#define SC_VIETNAMESE 0x100000 +#define SC_CHINESE_SIMPLIFIED 0x1000000 +#define SC_CHINESE_TRADITIONAL 0x2000000 +#define SC_JAPANESE 0x4000000 +#define SC_KOREAN 0x8000000 + + +/* Script table */ + +typedef struct utf8_scent { + char *name; /* script name */ + char *description; /* script description */ + unsigned long script; /* script bitmask */ +} SCRIPT; + +/* Character set table support */ + +typedef struct utf8_csent { + char *name; /* charset name */ + unsigned short type; /* type of charset */ + unsigned short flags; /* charset flags */ + void *tab; /* additional data */ + unsigned long script; /* script(s) implemented by this charset */ + char *preferred; /* preferred charset over this one */ +} CHARSET; + + +struct utf8_eucparam { + unsigned int base_ku : 8; /* base row */ + unsigned int base_ten : 8; /* base column */ + unsigned int max_ku : 8; /* maximum row */ + unsigned int max_ten : 8; /* maximum column */ + void *tab; /* conversion table */ +}; + + +/* Charset types */ + +#define CT_UNKNOWN 0 /* unknown 8-bit */ +#define CT_ASCII 1 /* 7-bit ASCII no table */ +#define CT_UCS2 2 /* 2 byte 16-bit Unicode no table */ +#define CT_UCS4 3 /* 4 byte 32-bit Unicode no table */ +#define CT_1BYTE0 10 /* 1 byte ISO 8859-1 no table */ +#define CT_1BYTE 11 /* 1 byte ASCII + table 0x80-0xff */ +#define CT_1BYTE8 12 /* 1 byte table 0x00 - 0xff */ +#define CT_EUC 100 /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ +#define CT_DBYTE 101 /* 2 byte ASCII + utf8_eucparam */ +#define CT_DBYTE2 102 /* 2 byte ASCII + utf8_eucparam plane1/2 */ +#define CT_UTF16 1000 /* variable UTF-16 encoded Unicode no table */ +#define CT_UTF8 1001 /* variable UTF-8 encoded Unicode no table */ +#define CT_UTF7 1002 /* variable UTF-7 encoded Unicode no table */ +#define CT_2022 10000 /* variable ISO-2022 encoded no table */ +#define CT_SJIS 10001 /* 2 byte Shift-JIS encoded JIS no table */ + + +/* Character set flags */ + +#define CF_PRIMARY 0x1 /* primary name for this charset */ +#define CF_DISPLAY 0x2 /* charset used in displays */ +#define CF_POSTING 0x4 /* charset used in email posting */ +#define CF_UNSUPRT 0x8 /* charset unsupported (can't convert to it) */ +#define CF_NOEMAIL 0x10 /* charset not used in email */ + + +/* UTF-7 engine states */ + +#define U7_ASCII 0 /* ASCII character */ +#define U7_PLUS 1 /* plus seen */ +#define U7_UNICODE 2 /* Unicode characters */ +#define U7_MINUS 3 /* absorbed minus seen */ + +/* Function prototypes */ + +typedef unsigned long (*ucs4cn_t) (unsigned long c); +typedef unsigned long (*ucs4de_t) (unsigned long c,void **more); + +SCRIPT *utf8_script (char *script); +const CHARSET *utf8_charset (char *charset); +char *utf8_badcharset (char *charset); +long utf8_text (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret,long flags); +long utf8_text_cs (SIZEDTEXT *text,const CHARSET *cs,SIZEDTEXT *ret, + ucs4cn_t cv,ucs4de_t de); +long utf8_cstext (SIZEDTEXT *text,char *charset,SIZEDTEXT *ret, + unsigned long errch); +long utf8_cstocstext (SIZEDTEXT *text,char *sc,SIZEDTEXT *ret,char *dc, + unsigned long errch); +unsigned short *utf8_rmap (char *charset); +unsigned short *utf8_rmap_cs (const CHARSET *cs); +unsigned short *utf8_rmap_gen (const CHARSET *cs,unsigned short *oldmap); +long utf8_rmaptext (SIZEDTEXT *text,unsigned short *rmap,SIZEDTEXT *ret, + unsigned long errch,long iso2022jp); +unsigned long utf8_rmapsize (SIZEDTEXT *text,unsigned short *rmap, + unsigned long errch,long iso2022jp); +long ucs4_rmaptext (unsigned long *ucs4,unsigned long len,unsigned short *rmap, + SIZEDTEXT *ret,unsigned long errch); +long ucs4_rmaplen (unsigned long *ucs4,unsigned long len,unsigned short *rmap, + unsigned long errch); +long ucs4_rmapbuf (unsigned char *t,unsigned long *ucs4,unsigned long len, + unsigned short *rmap,unsigned long errch); +unsigned long utf8_get (unsigned char **s,unsigned long *i); +unsigned long utf8_get_raw (unsigned char **s,unsigned long *i); +unsigned long ucs4_cs_get (CHARSET *cs,unsigned char **s,unsigned long *i); +unsigned long *utf8_csvalidmap (char *charsets[]); +const CHARSET *utf8_infercharset (SIZEDTEXT *src); +long utf8_validate (unsigned char *s,unsigned long i); +void utf8_text_1byte0 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_1byte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, + ucs4de_t de); +void utf8_text_1byte8 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, + ucs4de_t de); +void utf8_text_euc (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, + ucs4de_t de); +void utf8_text_dbyte (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, + ucs4de_t de); +void utf8_text_dbyte2 (SIZEDTEXT *text,SIZEDTEXT *ret,void *tab,ucs4cn_t cv, + ucs4de_t de); +void utf8_text_sjis (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_2022 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_utf7 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_utf8 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_ucs2 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_ucs4 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +void utf8_text_utf16 (SIZEDTEXT *text,SIZEDTEXT *ret,ucs4cn_t cv,ucs4de_t de); +unsigned long utf8_size (unsigned long c); +unsigned char *utf8_put (unsigned char *s,unsigned long c); +unsigned long ucs4_titlecase (unsigned long c); +long ucs4_width (unsigned long c); +long utf8_strwidth (unsigned char *s); +long utf8_textwidth (SIZEDTEXT *utf8); +unsigned long ucs4_decompose (unsigned long c,void **more); +unsigned long ucs4_decompose_recursive (unsigned long c,void **more); |