#if !defined(lint) && !defined(DOS) static char rcsid[] = "$Id: charset.c 1032 2008-04-11 00:30:04Z hubert@u.washington.edu $"; #endif /* * ======================================================================== * Copyright 2013-2021 Eduardo Chappa * Copyright 2006-2008 University of Washington * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * ======================================================================== */ #include "../pith/headers.h" #include "../pith/charset.h" #include "../pith/state.h" #include "../pith/conf.h" #include "../pith/escapes.h" #include "../pith/mimedesc.h" #include "../pith/filter.h" #include "../pith/string.h" #include "../pith/options.h" /* * Internal prototypes */ int rfc1522_token(char *, int (*)(int), char *, char **); int rfc1522_valtok(int); int rfc1522_valenc(int); int rfc1522_valid(char *, char **, char **, char **, char **); void rfc1522_copy_and_transliterate(unsigned char *, unsigned char **, size_t, unsigned char *, unsigned long, char *); unsigned char *rfc1522_encoded_word(unsigned char *, int, char *); char *rfc1522_8bit(void *, int); char *rfc1522_binary(void *, int); char * body_charset(MAILSTREAM *stream, long int msgno, unsigned char *section) { BODY *body; char *charset; if((body = mail_body(stream, msgno, section)) && body->type == TYPETEXT){ if(!(charset = parameter_val(body->parameter, "charset"))) charset = cpystr("US-ASCII"); return(charset); } return(NULL); } /* * Copies the source string into allocated space with the 8-bit EUC codes * (on Unix) or the Shift-JIS (on PC) converted into ISO-2022-JP. * Caller is responsible for freeing the result. 
*/ unsigned char * trans_euc_to_2022_jp(unsigned char *src) { size_t len, alloc; unsigned char *rv, *p, *q; int inside_esc_seq = 0; int c1 = -1; /* remembers first of pair for Shift-JIS */ if(!src) return(NULL); len = strlen((char *) src); /* * Worst possible increase is every other character an 8-bit character. * In that case, each of those gets 6 extra characters for the escape * sequences. We're not too concerned about the extra length because * these are relatively short strings. */ alloc = len + 1 + ((len+1)/2) * 6; rv = (unsigned char *) fs_get(alloc * sizeof(char)); for(p = src, q = rv; *p; p++){ if(inside_esc_seq){ if(c1 >= 0){ /* second of a pair? */ int adjust = *p < 159; int rowOffset = c1 < 160 ? 112 : 176; int cellOffset = adjust ? (*p > 127 ? 32 : 31) : 126; *q++ = ((c1 - rowOffset) << 1) - adjust; *q++ = *p - cellOffset; c1 = -1; } else if(*p & 0x80){ *q++ = (*p & 0x7f); } else{ *q++ = '\033'; *q++ = '('; *q++ = 'B'; *q++ = (*p); c1 = -1; inside_esc_seq = 0; } } else{ if(*p & 0x80){ *q++ = '\033'; *q++ = '$'; *q++ = 'B'; *q++ = (*p & 0x7f); inside_esc_seq = 1; } else{ *q++ = (*p); } } } if(inside_esc_seq){ *q++ = '\033'; *q++ = '('; *q++ = 'B'; } *q = '\0'; return(rv); } /* * * * * * * * * * RFC 1522 support routines * * * * * * * * * * RFC 1522 support is *very* loosely based on code contributed * by Lars-Erik Johansson . Thanks to Lars-Erik, * and apologies for taking such liberties with his code. */ #define RFC1522_INIT "=?" #define RFC1522_INIT_L 2 #define RFC1522_TERM "?=" #define RFC1522_TERM_L 2 #define RFC1522_DLIM "?" 
#define RFC1522_DLIM_L 1 #define RFC1522_MAXW 75 /* RFC's say 75, but no senders seem to care*/ #define ESPECIALS "()<>@,;:\"/[]?.=" #define RFC1522_OVERHEAD(S) (RFC1522_INIT_L + RFC1522_TERM_L + \ (2 * RFC1522_DLIM_L) + strlen(S) + 1); #define RFC1522_ENC_CHAR(C) (((C) & 0x80) || !rfc1522_valtok(C) \ || (C) == '_' ) /* * rfc1522_decode_to_utf8 - try to decode the given source string ala RFC 2047 * (obsoleted RFC 1522) into the given destination buffer, * encoded in UTF-8. * * How large should d be? The decoded string of octets will fit in * the same size string as the source string. However, because we're * translating that into UTF-8 the result may expand. Currently the * Thai character set has single octet characters which expand to * three octets in UTF-8. So it would be safe to use 3 * strlen(s) * for the size of d. One can imagine a currently non-existent * character set that expanded to 4 octets instead, so use 4 to be * super safe. * * Returns: pointer to either the destination buffer containing the * decoded text, or a pointer to the source buffer if there was * no valid 'encoded-word' found during scanning. */ unsigned char * rfc1522_decode_to_utf8(unsigned char *d, size_t len, char *s) { unsigned char *rv = NULL, *p; char *start = s, *sw, *enc, *txt, *ew, **q, *lang; char *cset; unsigned long l; int i; *d = '\0'; /* init destination */ while(s && (sw = strstr(s, RFC1522_INIT))){ if(!rv) /* there's something to do, init it */ rv = d; /* validate the rest of the encoded-word */ if(rfc1522_valid(sw, &cset, &enc, &txt, &ew)){ /* * We may have been putting off copying the first part of the * source while waiting to see if we have to copy at all. */ if(rv == d && s != start){ rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) start, sw - start, NULL); s = sw; } /* copy everything between s and sw to destination */ for(i = 0; &s[i] < sw; i++) if(!isspace((unsigned char)s[i])){ /* if some non-whitespace */ while(s < sw && d-rv%s<--\n", start ? 
start : "?")); return((unsigned char *) start); } /* * rfc1522_token - scan the given source line up to the end_str making * sure all subsequent chars are "valid" leaving endp * a the start of the end_str. * Returns: TRUE if we got a valid token, FALSE otherwise */ int rfc1522_token(char *s, int (*valid) (int), char *end_str, char **endp) { while(*s){ if((char) *s == *end_str /* test for matching end_str */ && ((end_str[1]) ? !strncmp((char *)s + 1, end_str + 1, strlen(end_str + 1)) : 1)){ *endp = s; return(TRUE); } if(!(*valid)(*s++)) /* test for valid char */ break; } return(FALSE); } /* * rfc1522_valtok - test for valid character in the RFC 1522 encoded * word's charset and encoding fields. */ int rfc1522_valtok(int c) { return(!(c == SPACE || iscntrl(c & 0x7f) || strindex(ESPECIALS, c))); } /* * rfc1522_valenc - test for valid character in the RFC 1522 encoded * word's encoded-text field. */ int rfc1522_valenc(int c) { return(!(c == '?' || c == SPACE) && isprint((unsigned char)c)); } /* * rfc1522_valid - validate the given string as to it's rfc1522-ness */ int rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp) { char *c, *e, *t, *p; int rv; rv = rfc1522_token(c = s+RFC1522_INIT_L, rfc1522_valtok, RFC1522_DLIM, &e) && rfc1522_token(++e, rfc1522_valtok, RFC1522_DLIM, &t) && rfc1522_token(++t, rfc1522_valenc, RFC1522_TERM, &p); if(charset) *charset = c; if(enc) *enc = e; if(txt) *txt = t; if(endp) *endp = p; return(rv); } /* * rfc1522_copy_and_transliterate - copy given buf to destination buffer * as UTF-8 characters */ void rfc1522_copy_and_transliterate(unsigned char *rv, unsigned char **d, size_t len, unsigned char *s, unsigned long l, char *cset) { unsigned long i; SIZEDTEXT src, xsrc; src.data = s; src.size = l; memset(&xsrc, 0, sizeof(SIZEDTEXT)); /* transliterate decoded segment to utf-8 */ if(cset){ if(strucmp((char *) cset, "us-ascii") && strucmp((char *) cset, "utf-8")){ if(utf8_charset(cset)){ if(!utf8_text(&src, cset, &xsrc, 
0L)){ /* should not happen */ alpine_panic("c-client failed to transliterate recognized characterset"); } } else{ /* non-xlatable charset */ for(i = 0; i < l; i++) if(src.data[i] & 0x80){ xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char)); xsrc.size = l; for(i = 0; i < l; i++) xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i]; break; } } } } else{ const CHARSET *cs; src.data = s; src.size = strlen((char *) s); if((cs = utf8_infercharset(&src))){ if(!(cs->type == CT_ASCII || cs->type == CT_UTF8)){ if(!utf8_text_cs(&src, cs, &xsrc, 0L, 0L)){ /* should not happen */ alpine_panic("c-client failed to transliterate recognized characterset"); } } } else if((cset=ps_global->VAR_UNK_CHAR_SET) && strucmp((char *) cset, "us-ascii") && strucmp((char *) cset, "utf-8") && utf8_charset(cset)){ if(!utf8_text(&src, cset, &xsrc, 0L)){ /* should not happen */ alpine_panic("c-client failed to transliterate recognized character set"); } } else{ /* unknown bytes - mask off high bit chars */ for(i = 0; i < l; i++) if(src.data[i] & 0x80){ xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char)); xsrc.size = l; for(i = 0; i < l; i++) xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i]; break; } } } if(xsrc.data){ s = xsrc.data; l = xsrc.size; } i = MIN(l,len-1-((*d)-rv)); strncpy((char *) (*d), (char *) s, i); (*d)[i] = '\0'; *d += l; /* advance dest ptr to EOL */ if((*d)-rv > len-1) *d = rv+len-1; if(xsrc.data && src.data != xsrc.data) fs_give((void **) &xsrc.data); } /* * rfc1522_encode - encode the given source string ala RFC 1522, * IF NECESSARY, into the given destination buffer. * Don't bother copying if it turns out encoding * isn't necessary. * * Returns: pointer to either the destination buffer containing the * encoded text, or a pointer to the source buffer if we didn't * have to encode anything. 
*/
/*
 * Args: d       -- destination buffer, only written when encoding is needed
 *       dlen    -- size of d
 *       s       -- NUL-terminated source text; a NULL s is handed straight back
 *       charset -- charset tag for the encoded-words, UNKNOWN_CHARSET is
 *                  used when this is NULL
 *
 * Encoding is triggered by any 8-bit octet or by an ESCAPE that
 * match_escapes() recognizes; text that already is a valid encoded-word
 * is stepped over while looking.  'B' encoding is picked when more than
 * two thirds of the source triggered, 'Q' otherwise.
 */
char *
rfc1522_encode(char *d, size_t dlen, unsigned char *s, char *charset)
{
    unsigned char *scan, *ew_end;
    char          *out_start, *encoded, flavor;
    int            triggers = 0;

    if(!s)
      return((char *) s);

    if(!charset)
      charset = UNKNOWN_CHARSET;

    /* look for a reason to encode */
    scan = s;
    while(*scan){
        if((*scan) & 0x80){
            triggers++;
        }
        else if(*scan == RFC1522_INIT[0]
                && !strncmp((char *) scan, RFC1522_INIT, RFC1522_INIT_L)){
            /* leave scan on the last octet of the encoded-word */
            if(rfc1522_valid((char *) scan, NULL, NULL, NULL, (char **) &ew_end))
              scan = ew_end + RFC1522_TERM_L - 1;
        }
        else if(*scan == ESCAPE && match_escapes((char *)(scan+1))){
            triggers++;
        }

        scan++;
    }

    if(!triggers)
      return((char *) s);               /* no work for us here */

    out_start = d;
    flavor    = (triggers > (2 * (scan - s)) / 3) ? 'B' : 'Q';

    while(*s){
        /* "=?" charset "?" flavor "?", as long as there is room for it */
        if(d-out_start < dlen-1-(RFC1522_INIT_L+2*RFC1522_DLIM_L+1)){
            sstrncpy(&d, RFC1522_INIT, dlen-(d-out_start));
            sstrncpy(&d, charset, dlen-(d-out_start));
            sstrncpy(&d, RFC1522_DLIM, dlen-(d-out_start));
            if(dlen-(d-out_start) > 0)
              *d++ = flavor;

            sstrncpy(&d, RFC1522_DLIM, dlen-(d-out_start));
        }

        /*
         * hand the encoder only as much source as is meant to fit
         * in one encoded-word of RFC1522_MAXW
         */
        ew_end = rfc1522_encoded_word(s, flavor, charset);

        if(flavor == 'B')
          encoded = rfc1522_binary(s, ew_end - s);
        else
          encoded = rfc1522_8bit(s, ew_end - s);

        sstrncpy(&d, encoded, dlen-1-(d-out_start));
        sstrncpy(&d, RFC1522_TERM, dlen-1-(d-out_start));
        fs_give((void **) &encoded);

        if(*ew_end)                     /* more to come: continuation line */
          sstrncpy(&d, "\015\012 ", dlen-1-(d-out_start));

        s = ew_end;
    }

    out_start[dlen-1] = '\0';
    return(out_start);
}


/*
 * rfc1522_encoded_word -- cut given string into max length encoded word
 *
 * Return: pointer into 's' such that the encoded 's' is no greater
 *         than RFC1522_MAXW
 *
 *  NOTE: this line break code is NOT cognizant of any SI/SO
 *        charset requirements nor similar strategies using escape
 *        codes.  Hopefully this will matter little and such
 *        representation strategies don't also include 8bit chars.
*/ unsigned char * rfc1522_encoded_word(unsigned char *s, int enc, char *charset) { int goal = RFC1522_MAXW - RFC1522_OVERHEAD(charset); if(enc == 'B') /* base64 encode */ for(goal = ((goal / 4) * 3) - 2; goal && *s; goal--, s++) ; else /* special 'Q' encoding */ if(!strucmp(charset, "UTF-8")){ /* special handling for utf-8 */ int i,more; unsigned char *p; for(; goal && *s; s++){ more = *s < 0x80 ? 0 : *s < 0xe0 ? 1 : *s < 0xf0 ? 2 : *s < 0xf8 ? 3 : *s < 0xfc ? 4 : *s < 0xfe ? 5 : -1; if(more >= 0){ /* check that we have at least more characters */ for(p = s, i = 0; i <= more && *p != '\0'; i++, p++) goal -= RFC1522_ENC_CHAR(*p) ? 3 : 1; if(goal < 0) /* does not fit in encoded word */ break; s += i - 1; /* i - 1 should be equal to more */ } else /* encode it, and skip it */ if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0) break; } } else for(; goal && *s; s++) if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0) break; return(s); } /* * rfc1522_8bit -- apply RFC 1522 'Q' encoding to the given 8bit buffer * * Return: alloc'd buffer containing encoded string */ char * rfc1522_8bit(void *src, int slen) { char *ret = (char *) fs_get ((size_t) (3*slen + 2)); char *d = ret; unsigned char c; unsigned char *s = (unsigned char *) src; while (slen--) { /* for each character */ if (((c = *s++) == '\015') && (*s == '\012') && slen) { *d++ = '\015'; /* true line break */ *d++ = *s++; slen--; } else if(c == SPACE){ /* special encoding case */ *d++ = '_'; } else if(RFC1522_ENC_CHAR(c)){ *d++ = '='; /* quote character */ C2XPAIR(c, d); } else *d++ = (char) c; /* ordinary character */ } *d = '\0'; /* tie off destination */ return(ret); } /* * rfc1522_binary -- apply RFC 1522 'B' encoding to the given 8bit buffer * * Return: alloc'd buffer containing encoded string */ char * rfc1522_binary (void *src, int srcl) { static char *v = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; unsigned char *s = (unsigned char *) src; char *ret, *d; d = ret = (char *) fs_get ((size_t) 
((((srcl + 2) / 3) * 4) + 1)); for (; srcl; s += 3) { /* process tuplets */ /* byte 1: high 6 bits (1) */ *d++ = v[s[0] >> 2]; /* byte 2: low 2 bits (1), high 4 bits (2) */ *d++ = v[((s[0] << 4) + (--srcl ? (s[1] >> 4) : 0)) & 0x3f]; /* byte 3: low 4 bits (2), high 2 bits (3) */ *d++ = srcl ? v[((s[1] << 2) + (--srcl ? (s[2] >> 6) :0)) & 0x3f] :'='; /* byte 4: low 6 bits (3) */ *d++ = srcl ? v[s[2] & 0x3f] : '='; if(srcl) srcl--; /* count third character if processed */ } *d = '\0'; /* tie off string */ return(ret); /* return the resulting string */ } /* * Checks if charset conversion is possible and which quality could be achieved * * args: from_cs -- charset to convert from * to_cs -- charset to convert to * * Results: * CONV_TABLE->table -- conversion table, NULL if conversion not needed * or not supported * CONV_TABLE->quality -- conversion quality (conversion not supported, not * needed, loses special chars, or loses letters * * The other entries of CONV_TABLE are used inside this function only * and may not be used outside unless this documentation is updated. */ CONV_TABLE * conversion_table(char *from_cs, char *to_cs) { int i, j; unsigned char *p = NULL; unsigned short *fromtab, *totab; CONV_TABLE *ct = NULL; const CHARSET *from, *to; static CONV_TABLE null_tab; if(!(from_cs && *from_cs && to_cs && *to_cs) || !strucmp(from_cs, to_cs)){ memset(&null_tab, 0, sizeof(null_tab)); null_tab.quality = CV_NO_TRANSLATE_NEEDED; return(&null_tab); } /* * First check to see if we are already set up for this pair of charsets. */ if((ct = ps_global->conv_table) != NULL && ct->from_charset && ct->to_charset && !strucmp(ct->from_charset, from_cs) && !strucmp(ct->to_charset, to_cs)) return(ct); /* * No such luck. Get rid of the cache of the previous translation table * and build a new one. 
*/ if(ct){ if(ct->table && (ct->convert != gf_convert_utf8_charset)) fs_give((void **) &ct->table); if(ct->from_charset) fs_give((void **) &ct->from_charset); if(ct->to_charset) fs_give((void **) &ct->to_charset); } else ct = ps_global->conv_table = (CONV_TABLE *) fs_get(sizeof(*ct)); memset(ct, 0, sizeof(*ct)); ct->from_charset = cpystr(from_cs); ct->to_charset = cpystr(to_cs); ct->quality = CV_NO_TRANSLATE_POSSIBLE; /* * Check to see if a translation is feasible. */ from = utf8_charset(from_cs); to = utf8_charset(to_cs); if(from && to){ /* if both charsets found */ /* no mapping if same or from is ASCII */ if((from->type == to->type && from->tab == to->tab) || (from->type == CT_ASCII)) ct->quality = CV_NO_TRANSLATE_NEEDED; else switch(from->type){ case CT_1BYTE0: /* 1 byte no table */ case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ switch(to->type){ case CT_1BYTE0: /* 1 byte no table */ case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ ct->quality = (from->script & to->script) ? CV_LOSES_SOME_LETTERS : CV_LOSES_SPECIAL_CHARS; break; } break; case CT_UTF8: /* variable UTF-8 encoded Unicode no table */ /* If source is UTF-8, see if destination charset has an 8 or 16 bit * coded character set that we can translate to. By special * dispensation, kludge ISO-2022-JP to EUC or Shift-JIS, but don't * try to do any other ISO 2022 charsets or UTF-7. 
*/ switch (to->type){ case CT_SJIS: /* 2 byte Shift-JIS */ /* only win if can get EUC-JP chartab */ if(utf8_charset("EUC-JP")) ct->quality = CV_LOSES_SOME_LETTERS; break; case CT_ASCII: /* 7-bit ASCII no table */ case CT_1BYTE0: /* 1 byte no table */ case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */ case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */ case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */ case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */ case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */ ct->quality = CV_LOSES_SOME_LETTERS; break; } break; } switch (ct->quality) { /* need to map? */ case CV_NO_TRANSLATE_POSSIBLE: case CV_NO_TRANSLATE_NEEDED: break; /* no mapping needed */ default: /* do mapping */ switch (from->type) { case CT_UTF8: /* UTF-8 to legacy character set */ if((ct->table = utf8_rmap (to_cs)) != NULL) ct->convert = gf_convert_utf8_charset; break; case CT_1BYTE0: /* ISO 8859-1 */ case CT_1BYTE: /* low part ASCII, high part other */ case CT_1BYTE8: /* low part has some non-ASCII */ /* * The fromtab and totab tables are mappings from the 128 character * positions 128-255 to their Unicode values (so unsigned shorts). * The table we are creating is such that if * * from_char_value -> unicode_value * to_char_value -> same_unicode_value * * then we want to map from_char_value -> to_char_value * * To simplify conversions we create the whole 256 element array, * with the first 128 positions just the identity. If there is no * conversion for a particular from_char_value (that is, no * to_char_value maps to the same unicode character) then we put * '?' in that character. We may want to output blob on the PC, * but don't so far. * * If fromtab or totab are NULL, that means the mapping is simply * the identity mapping. Since that is still useful to us, we * create it on the fly. 
*/ fromtab = (unsigned short *) from->tab; totab = (unsigned short *) to->tab; ct->convert = gf_convert_8bit_charset; p = ct->table = (unsigned char *) fs_get(256 * sizeof(unsigned char)); for(i = 0; i < 256; i++){ unsigned int fc; p[i] = '?'; switch(from->type){ /* get "from" UCS-2 codepoint */ case CT_1BYTE0: /* ISO 8859-1 */ fc = i; break; case CT_1BYTE: /* low part ASCII, high part other */ fc = (i < 128) ? i : fromtab[i-128]; break; case CT_1BYTE8: /* low part has some non-ASCII */ fc = fromtab[i]; break; } switch(to->type){ /* match against "to" UCS-2 codepoint */ case CT_1BYTE0: /* identity match for ISO 8859-1*/ if(fc < 256) p[i] = fc; break; case CT_1BYTE: /* ASCII is identity, search high part */ if(fc < 128) p[i] = fc; else for(j = 0; j < 128; j++){ if(fc == totab[j]){ p[i] = 128 + j; break; } } break; case CT_1BYTE8: /* search all codepoints */ for(j = 0; j < 256; j++){ if(fc == totab[j]){ p[i] = j; break; } } break; } } break; } } } return(ct); } /* * Replace personal names in list of addresses with * decoded personal names in UTF-8. * Assumes we can free and reallocate the name. */ void decode_addr_names_to_utf8(struct mail_address *a) { for(; a; a = a->next) if(a->personal) convert_possibly_encoded_str_to_utf8(&a->personal); } /* * Strp is a pointer to an allocated string. * This routine will convert the string to UTF-8, possibly * freeing and re-allocating it. * The source string may or may not have RFC1522 encoding * which will be undone using rfc1522_decode. * The string will have been converted on return. 
*/
/*
 * A NULL strp, a NULL *strp and an empty string are all left alone.
 * The work buffer handed to rfc1522_decode_to_utf8() is four times the
 * source length plus one, the expansion allowance that function documents.
 */
void
convert_possibly_encoded_str_to_utf8(char **strp)
{
    size_t  worklen, srclen, newlen;
    char   *work, *result;

    if(!strp || !*strp || **strp == '\0')
      return;

    worklen = 4 * strlen(*strp) + 1;
    work    = (char *) fs_get(worklen);

    result = (char *) rfc1522_decode_to_utf8((unsigned char *) work,
                                             worklen, *strp);

    /* getting the source pointer back means there was nothing to decode */
    if(result != (*strp)){
        srclen = strlen(*strp);
        newlen = strlen(result);

        if(srclen >= newlen){
            /* the decoded text fits in the space we already own */
            strncpy(*strp, result, srclen);
            (*strp)[srclen] = '\0';
        }
        else{
            /* it grew, so swap the old string for a right-sized one */
            fs_give((void **) strp);

            if(result == work){         /* expected: the text is in work */
                fs_resize((void **) &work, newlen+1);
                *strp = work;
                work  = NULL;           /* ownership moved to *strp */
            }
            else{                       /* not expected to happen */
                *strp = cpystr(result);
            }
        }
    }

    if(work)
      fs_give((void **) &work);
}