diff options
Diffstat (limited to 'pith/charset.c')
-rw-r--r-- | pith/charset.c | 929 |
1 files changed, 929 insertions, 0 deletions
diff --git a/pith/charset.c b/pith/charset.c new file mode 100644 index 00000000..6177c7c4 --- /dev/null +++ b/pith/charset.c @@ -0,0 +1,929 @@ +#if !defined(lint) && !defined(DOS) +static char rcsid[] = "$Id: charset.c 1032 2008-04-11 00:30:04Z hubert@u.washington.edu $"; +#endif + +/* + * ======================================================================== + * Copyright 2006-2008 University of Washington + * Copyright 2013 Eduardo Chappa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * ======================================================================== + */ + +#include "../pith/headers.h" +#include "../pith/charset.h" +#include "../pith/state.h" +#include "../pith/conf.h" +#include "../pith/escapes.h" +#include "../pith/mimedesc.h" +#include "../pith/filter.h" +#include "../pith/string.h" +#include "../pith/options.h" + + +/* + * Internal prototypes + */ +int rfc1522_token(char *, int (*)(int), char *, char **); +int rfc1522_valtok(int); +int rfc1522_valenc(int); +int rfc1522_valid(char *, char **, char **, char **, char **); +void rfc1522_copy_and_transliterate(unsigned char *, unsigned char **, size_t, + unsigned char *, unsigned long, char *); +unsigned char *rfc1522_encoded_word(unsigned char *, int, char *); +char *rfc1522_8bit(void *, int); +char *rfc1522_binary(void *, int); + + +char * +body_charset(MAILSTREAM *stream, long int msgno, unsigned char *section) +{ + BODY *body; + char *charset; + + + if((body = mail_body(stream, msgno, section)) && body->type == TYPETEXT){ + if(!(charset = parameter_val(body->parameter, "charset"))) + charset = cpystr("US-ASCII"); + + return(charset); + } + + return(NULL); +} + + +/* + * Copies the source string into allocated space with the 8-bit EUC codes + * (on Unix) or the Shift-JIS (on PC) converted into ISO-2022-JP. 
/*
 * trans_euc_to_2022_jp - copy src into newly allocated space, wrapping
 *                        every run of 8-bit bytes in ISO-2022-JP escape
 *                        sequences and clearing the high bit of each
 *                        byte inside such a run.
 *
 * Returns: the allocated, NUL-terminated result, or NULL if src is NULL.
 *          Caller is responsible for freeing the result.
 *
 * NOTE(review): only high-bit stripping is performed here.  The old
 * Shift-JIS pair arithmetic was guarded by a variable that was never
 * assigned anything but -1, so it could not run and has been removed.
 */
unsigned char *
trans_euc_to_2022_jp(unsigned char *src)
{
    size_t         len, alloc;
    unsigned char *rv, *p, *q;
    int            in_kanji = 0;        /* currently after ESC $ B ? */

    if(!src)
      return(NULL);

    len = strlen((char *) src);

    /*
     * Worst possible increase is every other character an 8-bit character.
     * In that case, each of those gets 6 extra characters for the escape
     * sequences (3 to shift in, 3 to shift back out). We're not too
     * concerned about the extra length because these are relatively
     * short strings.
     */
    alloc = len + 1 + ((len+1)/2) * 6;
    rv = (unsigned char *) fs_get(alloc * sizeof(char));

    for(p = src, q = rv; *p; p++){
        int eightbit = (*p & 0x80) != 0;

        if(eightbit != in_kanji){       /* crossing a run boundary */
            *q++ = '\033';
            *q++ = eightbit ? '$' : '(';
            *q++ = 'B';
            in_kanji = eightbit;
        }

        *q++ = (*p & 0x7f);             /* no-op for 7-bit bytes */
    }

    if(in_kanji){                       /* leave the stream in ASCII */
        *q++ = '\033';
        *q++ = '(';
        *q++ = 'B';
    }

    *q = '\0';

    return(rv);
}


/*
 *  * * * * * * * *      RFC 1522 support routines      * * * * * * * *
 *
 *   RFC 1522 support is *very* loosely based on code contributed
 *   by Lars-Erik Johansson <lej@cdg.chalmers.se>.  Thanks to Lars-Erik,
 *   and apologies for taking such liberties with his code.
 */

#define RFC1522_INIT    "=?"
#define RFC1522_INIT_L  2
#define RFC1522_TERM    "?="
#define RFC1522_TERM_L  2
#define RFC1522_DLIM    "?"
+#define RFC1522_DLIM_L 1 +#define RFC1522_MAXW 256 /* RFC's say 75, but no senders seem to care*/ +#define ESPECIALS "()<>@,;:\"/[]?.=" +#define RFC1522_OVERHEAD(S) (RFC1522_INIT_L + RFC1522_TERM_L + \ + (2 * RFC1522_DLIM_L) + strlen(S) + 1); +#define RFC1522_ENC_CHAR(C) (((C) & 0x80) || !rfc1522_valtok(C) \ + || (C) == '_' ) + +/* + * rfc1522_decode_to_utf8 - try to decode the given source string ala RFC 2047 + * (obsoleted RFC 1522) into the given destination buffer, + * encoded in UTF-8. + * + * How large should d be? The decoded string of octets will fit in + * the same size string as the source string. However, because we're + * translating that into UTF-8 the result may expand. Currently the + * Thai character set has single octet characters which expand to + * three octets in UTF-8. So it would be safe to use 3 * strlen(s) + * for the size of d. One can imagine a currently non-existent + * character set that expanded to 4 octets instead, so use 4 to be + * super safe. + * + * Returns: pointer to either the destination buffer containing the + * decoded text, or a pointer to the source buffer if there was + * no valid 'encoded-word' found during scanning. + */ +unsigned char * +rfc1522_decode_to_utf8(unsigned char *d, size_t len, char *s) +{ + unsigned char *rv = NULL, *p; + char *start = s, *sw, *enc, *txt, *ew, **q, *lang; + char *cset; + unsigned long l; + int i; + + *d = '\0'; /* init destination */ + + while(s && (sw = strstr(s, RFC1522_INIT))){ + if(!rv) /* there's something to do, init it */ + rv = d; + /* validate the rest of the encoded-word */ + if(rfc1522_valid(sw, &cset, &enc, &txt, &ew)){ + /* + * We may have been putting off copying the first part of the + * source while waiting to see if we have to copy at all. 
+ */ + if(rv == d && s != start){ + rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) start, + sw - start, NULL); + s = sw; + } + + /* copy everything between s and sw to destination */ + for(i = 0; &s[i] < sw; i++) + if(!isspace((unsigned char)s[i])){ /* if some non-whitespace */ + while(s < sw && d-rv<len-1) + *d++ = (unsigned char) *s++; + + break; + } + + enc[-1] = txt[-1] = ew[0] = '\0'; /* tie off token strings */ + + if((lang = strchr(cset, '*')) != NULL) + *lang++ = '\0'; + + /* based on encoding, write the encoded text to output buffer */ + switch(*enc){ + case 'Q' : /* 'Q' encoding */ + case 'q' : + /* special hocus-pocus to deal with '_' exception, too bad */ + for(l = 0L, i = 0; txt[l]; l++) + if(txt[l] == '_') + i++; + + if(i){ + q = (char **) fs_get((i + 1) * sizeof(char *)); + for(l = 0L, i = 0; txt[l]; l++) + if(txt[l] == '_'){ + q[i++] = &txt[l]; + txt[l] = SPACE; + } + + q[i] = NULL; + } + else + q = NULL; + + if((p = rfc822_qprint((unsigned char *)txt, strlen(txt), &l)) != NULL){ + rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset); + fs_give((void **)&p); /* free encoded buf */ + } + else{ + if(q) + fs_give((void **) &q); + + goto bogus; + } + + if(q){ /* restore underscores */ + for(i = 0; q[i]; i++) + *(q[i]) = '_'; + + fs_give((void **)&q); + } + + break; + + case 'B' : /* 'B' encoding */ + case 'b' : + if((p = rfc822_base64((unsigned char *) txt, strlen(txt), &l)) != NULL){ + rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset); + fs_give((void **)&p); /* free encoded buf */ + } + else + goto bogus; + + break; + + default: + rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) txt, + strlen(txt), NULL); + dprint((1, "RFC1522_decode: Unknown ENCODING: %s\n", + enc ? 
enc : "?")); + break; + } + + /* restore trompled source string */ + enc[-1] = txt[-1] = '?'; + ew[0] = RFC1522_TERM[0]; + + /* advance s to start of text after encoded-word */ + s = ew + RFC1522_TERM_L; + + if(lang) + lang[-1] = '*'; + } + else{ + /* + * Found intro, but bogus data followed, treat it as normal text. + */ + l = (sw - s) + RFC1522_INIT_L; + rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, l, NULL); + for(; isspace((unsigned char) *(s+l)) && d-rv<len-1;l++) + *d++ = *(s+l); /* copy any trailing space */ + rv[len-1] = '\0'; + *d = '\0'; + s += l; + } + } + + if(rv){ + if(s && *s){ /* copy remaining text */ + rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL); + rv[len-1] = '\0'; + } + } + else if(s){ + rv = d; + rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL); + rv[len-1] = '\0'; + } + + return(rv ? rv : (unsigned char *) start); + + bogus: + dprint((1, "RFC1522_decode: BOGUS INPUT: -->%s<--\n", + start ? start : "?")); + return((unsigned char *) start); +} + + +/* + * rfc1522_token - scan the given source line up to the end_str making + * sure all subsequent chars are "valid" leaving endp + * a the start of the end_str. + * Returns: TRUE if we got a valid token, FALSE otherwise + */ +int +rfc1522_token(char *s, int (*valid) (int), char *end_str, char **endp) +{ + while(*s){ + if((char) *s == *end_str /* test for matching end_str */ + && ((end_str[1]) + ? !strncmp((char *)s + 1, end_str + 1, strlen(end_str + 1)) + : 1)){ + *endp = s; + return(TRUE); + } + + if(!(*valid)(*s++)) /* test for valid char */ + break; + } + + return(FALSE); +} + + +/* + * rfc1522_valtok - test for valid character in the RFC 1522 encoded + * word's charset and encoding fields. + */ +int +rfc1522_valtok(int c) +{ + return(!(c == SPACE || iscntrl(c & 0x7f) || strindex(ESPECIALS, c))); +} + + +/* + * rfc1522_valenc - test for valid character in the RFC 1522 encoded + * word's encoded-text field. 
+ */ +int +rfc1522_valenc(int c) +{ + return(!(c == '?' || c == SPACE) && isprint((unsigned char)c)); +} + + +/* + * rfc1522_valid - validate the given string as to it's rfc1522-ness + */ +int +rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp) +{ + char *c, *e, *t, *p; + int rv; + + rv = rfc1522_token(c = s+RFC1522_INIT_L, rfc1522_valtok, RFC1522_DLIM, &e) + && rfc1522_token(++e, rfc1522_valtok, RFC1522_DLIM, &t) + && rfc1522_token(++t, rfc1522_valenc, RFC1522_TERM, &p) + && p - s <= RFC1522_MAXW; + + if(charset) + *charset = c; + + if(enc) + *enc = e; + + if(txt) + *txt = t; + + if(endp) + *endp = p; + + return(rv); +} + + +/* + * rfc1522_copy_and_transliterate - copy given buf to destination buffer + * as UTF-8 characters + */ +void +rfc1522_copy_and_transliterate(unsigned char *rv, + unsigned char **d, + size_t len, + unsigned char *s, + unsigned long l, + char *cset) +{ + unsigned long i; + SIZEDTEXT src, xsrc; + + src.data = s; + src.size = l; + memset(&xsrc, 0, sizeof(SIZEDTEXT)); + + /* transliterate decoded segment to utf-8 */ + if(cset){ + if(strucmp((char *) cset, "us-ascii") + && strucmp((char *) cset, "utf-8")){ + if(utf8_charset(cset)){ + if(!utf8_text(&src, cset, &xsrc, 0L)){ + /* should not happen */ + panic("c-client failed to transliterate recognized characterset"); + } + } + else{ + /* non-xlatable charset */ + for(i = 0; i < l; i++) + if(src.data[i] & 0x80){ + xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char)); + xsrc.size = l; + for(i = 0; i < l; i++) + xsrc.data[i] = (src.data[i] & 0x80) ? '?' 
: src.data[i]; + + break; + } + } + } + } + else{ + const CHARSET *cs; + + src.data = s; + src.size = strlen((char *) s); + + if((cs = utf8_infercharset(&src))){ + if(!(cs->type == CT_ASCII || cs->type == CT_UTF8)){ + if(!utf8_text_cs(&src, cs, &xsrc, 0L, 0L)){ + /* should not happen */ + panic("c-client failed to transliterate recognized characterset"); + } + } + } + else if((cset=ps_global->VAR_UNK_CHAR_SET) + && strucmp((char *) cset, "us-ascii") + && strucmp((char *) cset, "utf-8") + && utf8_charset(cset)){ + if(!utf8_text(&src, cset, &xsrc, 0L)){ + /* should not happen */ + panic("c-client failed to transliterate recognized character set"); + } + } + else{ + /* unknown bytes - mask off high bit chars */ + for(i = 0; i < l; i++) + if(src.data[i] & 0x80){ + xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char)); + xsrc.size = l; + for(i = 0; i < l; i++) + xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i]; + + break; + } + } + } + + if(xsrc.data){ + s = xsrc.data; + l = xsrc.size; + } + + i = MIN(l,len-1-((*d)-rv)); + strncpy((char *) (*d), (char *) s, i); + (*d)[i] = '\0'; + *d += l; /* advance dest ptr to EOL */ + if((*d)-rv > len-1) + *d = rv+len-1; + + if(xsrc.data && src.data != xsrc.data) + fs_give((void **) &xsrc.data); +} + + + +/* + * rfc1522_encode - encode the given source string ala RFC 1522, + * IF NECESSARY, into the given destination buffer. + * Don't bother copying if it turns out encoding + * isn't necessary. + * + * Returns: pointer to either the destination buffer containing the + * encoded text, or a pointer to the source buffer if we didn't + * have to encode anything. 
+ */ +char * +rfc1522_encode(char *d, size_t dlen, unsigned char *s, char *charset) +{ + unsigned char *p, *q; + int n; + + if(!s) + return((char *) s); + + if(!charset) + charset = UNKNOWN_CHARSET; + + /* look for a reason to encode */ + for(p = s, n = 0; *p; p++) + if((*p) & 0x80){ + n++; + } + else if(*p == RFC1522_INIT[0] + && !strncmp((char *) p, RFC1522_INIT, RFC1522_INIT_L)){ + if(rfc1522_valid((char *) p, NULL, NULL, NULL, (char **) &q)) + p = q + RFC1522_TERM_L - 1; /* advance past encoded gunk */ + } + else if(*p == ESCAPE && match_escapes((char *)(p+1))){ + n++; + } + + if(n){ /* found, encoding to do */ + char *rv = d, *t, + enc = (n > (2 * (p - s)) / 3) ? 'B' : 'Q'; + + while(*s){ + if(d-rv < dlen-1-(RFC1522_INIT_L+2*RFC1522_DLIM_L+1)){ + sstrncpy(&d, RFC1522_INIT, dlen-(d-rv)); /* insert intro header, */ + sstrncpy(&d, charset, dlen-(d-rv)); /* character set tag, */ + sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv)); /* and encoding flavor */ + if(dlen-(d-rv) > 0) + *d++ = enc; + + sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv)); + } + + /* + * feed lines to encoder such that they're guaranteed + * less than RFC1522_MAXW. + */ + p = rfc1522_encoded_word(s, enc, charset); + if(enc == 'B') /* insert encoded data */ + sstrncpy(&d, t = rfc1522_binary(s, p - s), dlen-1-(d-rv)); + else /* 'Q' encoding */ + sstrncpy(&d, t = rfc1522_8bit(s, p - s), dlen-1-(d-rv)); + + sstrncpy(&d, RFC1522_TERM, dlen-1-(d-rv)); /* insert terminator */ + fs_give((void **) &t); + if(*p) /* more src string follows */ + sstrncpy(&d, "\015\012 ", dlen-1-(d-rv)); /* insert cont. 
line */ + + s = p; /* advance s */ + } + + rv[dlen-1] = '\0'; + return(rv); + } + else + return((char *) s); /* no work for us here */ +} + + + +/* + * rfc1522_encoded_word -- cut given string into max length encoded word + * + * Return: pointer into 's' such that the encoded 's' is no greater + * than RFC1522_MAXW + * + * NOTE: this line break code is NOT cognizant of any SI/SO + * charset requirements nor similar strategies using escape + * codes. Hopefully this will matter little and such + * representation strategies don't also include 8bit chars. + */ +unsigned char * +rfc1522_encoded_word(unsigned char *s, int enc, char *charset) +{ + int goal = RFC1522_MAXW - RFC1522_OVERHEAD(charset); + + if(enc == 'B') /* base64 encode */ + for(goal = ((goal / 4) * 3) - 2; goal && *s; goal--, s++) + ; + else /* special 'Q' encoding */ + for(; goal && *s; s++) + if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0) + break; + + return(s); +} + + + +/* + * rfc1522_8bit -- apply RFC 1522 'Q' encoding to the given 8bit buffer + * + * Return: alloc'd buffer containing encoded string + */ +char * +rfc1522_8bit(void *src, int slen) +{ + char *ret = (char *) fs_get ((size_t) (3*slen + 2)); + char *d = ret; + unsigned char c; + unsigned char *s = (unsigned char *) src; + + while (slen--) { /* for each character */ + if (((c = *s++) == '\015') && (*s == '\012') && slen) { + *d++ = '\015'; /* true line break */ + *d++ = *s++; + slen--; + } + else if(c == SPACE){ /* special encoding case */ + *d++ = '_'; + } + else if(RFC1522_ENC_CHAR(c)){ + *d++ = '='; /* quote character */ + C2XPAIR(c, d); + } + else + *d++ = (char) c; /* ordinary character */ + } + + *d = '\0'; /* tie off destination */ + return(ret); +} + + +/* + * rfc1522_binary -- apply RFC 1522 'B' encoding to the given 8bit buffer + * + * Return: alloc'd buffer containing encoded string + */ +char * +rfc1522_binary (void *src, int srcl) +{ + static char *v = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + 
unsigned char *s = (unsigned char *) src; + char *ret, *d; + + d = ret = (char *) fs_get ((size_t) ((((srcl + 2) / 3) * 4) + 1)); + for (; srcl; s += 3) { /* process tuplets */ + /* byte 1: high 6 bits (1) */ + *d++ = v[s[0] >> 2]; + /* byte 2: low 2 bits (1), high 4 bits (2) */ + *d++ = v[((s[0] << 4) + (--srcl ? (s[1] >> 4) : 0)) & 0x3f]; + /* byte 3: low 4 bits (2), high 2 bits (3) */ + *d++ = srcl ? v[((s[1] << 2) + (--srcl ? (s[2] >> 6) :0)) & 0x3f] :'='; + /* byte 4: low 6 bits (3) */ + *d++ = srcl ? v[s[2] & 0x3f] : '='; + if(srcl) + srcl--; /* count third character if processed */ + } + + *d = '\0'; /* tie off string */ + return(ret); /* return the resulting string */ +} + + +/* + * Checks if charset conversion is possible and which quality could be achieved + * + * args: from_cs -- charset to convert from + * to_cs -- charset to convert to + * + * Results: + * CONV_TABLE->table -- conversion table, NULL if conversion not needed + * or not supported + * CONV_TABLE->quality -- conversion quality (conversion not supported, not + * needed, loses special chars, or loses letters + * + * The other entries of CONV_TABLE are used inside this function only + * and may not be used outside unless this documentation is updated. + */ +CONV_TABLE * +conversion_table(char *from_cs, char *to_cs) +{ + int i, j; + unsigned char *p = NULL; + unsigned short *fromtab, *totab; + CONV_TABLE *ct = NULL; + const CHARSET *from, *to; + static CONV_TABLE null_tab; + + if(!(from_cs && *from_cs && to_cs && *to_cs) || !strucmp(from_cs, to_cs)){ + memset(&null_tab, 0, sizeof(null_tab)); + null_tab.quality = CV_NO_TRANSLATE_NEEDED; + return(&null_tab); + } + + /* + * First check to see if we are already set up for this pair of charsets. + */ + if((ct = ps_global->conv_table) != NULL + && ct->from_charset && ct->to_charset + && !strucmp(ct->from_charset, from_cs) + && !strucmp(ct->to_charset, to_cs)) + return(ct); + + /* + * No such luck. 
Get rid of the cache of the previous translation table + * and build a new one. + */ + if(ct){ + if(ct->table && (ct->convert != gf_convert_utf8_charset)) + fs_give((void **) &ct->table); + + if(ct->from_charset) + fs_give((void **) &ct->from_charset); + + if(ct->to_charset) + fs_give((void **) &ct->to_charset); + } + else + ct = ps_global->conv_table = (CONV_TABLE *) fs_get(sizeof(*ct)); + + memset(ct, 0, sizeof(*ct)); + + ct->from_charset = cpystr(from_cs); + ct->to_charset = cpystr(to_cs); + ct->quality = CV_NO_TRANSLATE_POSSIBLE; + + /* + * Check to see if a translation is feasible. + */ + from = utf8_charset(from_cs); + to = utf8_charset(to_cs); + + if(from && to){ /* if both charsets found */ + /* no mapping if same or from is ASCII */ + if((from->type == to->type && from->tab == to->tab) + || (from->type == CT_ASCII)) + ct->quality = CV_NO_TRANSLATE_NEEDED; + else switch(from->type){ + case CT_1BYTE0: /* 1 byte no table */ + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ + switch(to->type){ + case CT_1BYTE0: /* 1 byte no table */ + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ + ct->quality = (from->script & to->script) ? + CV_LOSES_SOME_LETTERS : CV_LOSES_SPECIAL_CHARS; + break; + } + break; + case CT_UTF8: /* variable UTF-8 encoded Unicode no table */ + /* If source is UTF-8, see if destination charset has an 8 or 16 bit + * coded character set that we can translate to. By special + * dispensation, kludge ISO-2022-JP to EUC or Shift-JIS, but don't + * try to do any other ISO 2022 charsets or UTF-7. 
+ */ + switch (to->type){ + case CT_SJIS: /* 2 byte Shift-JIS */ + /* only win if can get EUC-JP chartab */ + if(utf8_charset("EUC-JP")) + ct->quality = CV_LOSES_SOME_LETTERS; + break; + case CT_ASCII: /* 7-bit ASCII no table */ + case CT_1BYTE0: /* 1 byte no table */ + case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ + case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ + case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ + case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ + case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ + ct->quality = CV_LOSES_SOME_LETTERS; + break; + } + break; + } + + switch (ct->quality) { /* need to map? */ + case CV_NO_TRANSLATE_POSSIBLE: + case CV_NO_TRANSLATE_NEEDED: + break; /* no mapping needed */ + default: /* do mapping */ + switch (from->type) { + case CT_UTF8: /* UTF-8 to legacy character set */ + if((ct->table = utf8_rmap (to_cs)) != NULL) + ct->convert = gf_convert_utf8_charset; + break; + + case CT_1BYTE0: /* ISO 8859-1 */ + case CT_1BYTE: /* low part ASCII, high part other */ + case CT_1BYTE8: /* low part has some non-ASCII */ + /* + * The fromtab and totab tables are mappings from the 128 character + * positions 128-255 to their Unicode values (so unsigned shorts). + * The table we are creating is such that if + * + * from_char_value -> unicode_value + * to_char_value -> same_unicode_value + * + * then we want to map from_char_value -> to_char_value + * + * To simplify conversions we create the whole 256 element array, + * with the first 128 positions just the identity. If there is no + * conversion for a particular from_char_value (that is, no + * to_char_value maps to the same unicode character) then we put + * '?' in that character. We may want to output blob on the PC, + * but don't so far. + * + * If fromtab or totab are NULL, that means the mapping is simply + * the identity mapping. Since that is still useful to us, we + * create it on the fly. 
+ */ + fromtab = (unsigned short *) from->tab; + totab = (unsigned short *) to->tab; + + ct->convert = gf_convert_8bit_charset; + p = ct->table = (unsigned char *) + fs_get(256 * sizeof(unsigned char)); + for(i = 0; i < 256; i++){ + unsigned int fc; + p[i] = '?'; + switch(from->type){ /* get "from" UCS-2 codepoint */ + case CT_1BYTE0: /* ISO 8859-1 */ + fc = i; + break; + case CT_1BYTE: /* low part ASCII, high part other */ + fc = (i < 128) ? i : fromtab[i-128]; + break; + case CT_1BYTE8: /* low part has some non-ASCII */ + fc = fromtab[i]; + break; + } + switch(to->type){ /* match against "to" UCS-2 codepoint */ + case CT_1BYTE0: /* identity match for ISO 8859-1*/ + if(fc < 256) + p[i] = fc; + break; + case CT_1BYTE: /* ASCII is identity, search high part */ + if(fc < 128) p[i] = fc; + else for(j = 0; j < 128; j++){ + if(fc == totab[j]){ + p[i] = 128 + j; + break; + } + } + break; + case CT_1BYTE8: /* search all codepoints */ + for(j = 0; j < 256; j++){ + if(fc == totab[j]){ + p[i] = j; + break; + } + } + break; + } + } + break; + } + } + } + + return(ct); +} + + +/* + * Replace personal names in list of addresses with + * decoded personal names in UTF-8. + * Assumes we can free and reallocate the name. + */ +void +decode_addr_names_to_utf8(struct mail_address *a) +{ + for(; a; a = a->next) + if(a->personal) + convert_possibly_encoded_str_to_utf8(&a->personal); +} + + +/* + * Strp is a pointer to an allocated string. + * This routine will convert the string to UTF-8, possibly + * freeing and re-allocating it. + * The source string may or may not have RFC1522 encoding + * which will be undone using rfc1522_decode. + * The string will have been converted on return. 
/*
 * convert_possibly_encoded_str_to_utf8 - run *strp through
 *      rfc1522_decode_to_utf8() and leave the result in *strp.
 *
 * Args: strp -- address of an allocated string.  Nothing is done when
 *               strp or *strp is NULL or the string is empty.
 *
 * A result no longer than the original is written over the original in
 * place.  A longer result replaces it: the original is freed and *strp
 * is pointed at newly sized storage.
 */
void
convert_possibly_encoded_str_to_utf8(char **strp)
{
    char   *scratch, *result;
    size_t  scratch_size, old_len, new_len;

    if(!strp || !*strp || **strp == '\0')
      return;

    /* allow for every source octet growing to four */
    scratch_size = 4 * strlen(*strp) + 1;
    scratch      = (char *) fs_get(scratch_size);

    result = (char *) rfc1522_decode_to_utf8((unsigned char *) scratch,
                                             scratch_size, *strp);

    /* getting the source pointer back means there is nothing to store */
    if(result != *strp){
        old_len = strlen(*strp);
        new_len = strlen(result);

        if(old_len >= new_len){                 /* fits where it is */
            strncpy(*strp, result, old_len);
            (*strp)[old_len] = '\0';
        }
        else{
            fs_give((void **) strp);
            if(result == scratch){              /* this will be true */
                /* trim the scratch buffer and hand it over */
                fs_resize((void **) &scratch, new_len+1);
                *strp   = scratch;
                scratch = NULL;
            }
            else                                /* this is unreachable */
              *strp = cpystr(result);
        }
    }

    if(scratch)
      fs_give((void **) &scratch);
}