diff options
Diffstat (limited to 'pith/url.c')
-rw-r--r-- | pith/url.c | 542 |
1 files changed, 542 insertions, 0 deletions
diff --git a/pith/url.c b/pith/url.c new file mode 100644 index 00000000..173cb879 --- /dev/null +++ b/pith/url.c @@ -0,0 +1,542 @@ +#if !defined(lint) && !defined(DOS) +static char rcsid[] = "$Id: url.c 769 2007-10-24 00:15:40Z hubert@u.washington.edu $"; +#endif + +/* + * ======================================================================== + * Copyright 2006-2007 University of Washington + * Copyright 2013 Eduardo Chappa + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * ======================================================================== + */ + +#include "../pith/headers.h" +#include "../pith/url.h" +#include "../pith/mailview.h" +#include "../pith/string.h" + +/* + * Internal prototypes + */ +char *rfc1738_scheme_part(char *); +int rfc1738uchar(char *); +int rfc1738xchar(char *); + + +/* + * * * * * * * * * RFC 1738 support routines * * * * * * * * + */ + + +/* + * Various helpful definitions + */ +#define RFC1738_SAFE "$-_.+" /* "safe" */ +#define RFC1738_EXTRA "!*'()," /* "extra" */ +#define RFC1738_RSVP ";/?:@&=" /* "reserved" */ +#define RFC1738_NEWS "-.+_" /* valid for "news:" URL */ +#define RFC1738_FUDGE "#{}|\\^~[]" /* Unsafe, but popular */ +#define RFC1738_ESC(S) (*(S) == '%' && isxpair((S) + 1)) + + +/* + * rfc1738_scan -- Scan the given line for possible URLs as defined + * in RFC1738 + */ +char * +rfc1738_scan(char *line, int *len) +{ + char *colon, *start, *end; + int n; + + /* process each : in the line */ + for(; (colon = strindex(line, ':')) != NULL; line = end){ + end = colon + 1; + if(colon == line) /* zero length scheme? */ + continue; + + /* + * Valid URL (ala RFC1738 BNF)? First, first look to the + * left to make sure there are valid "scheme" chars... + */ + start = colon - 1; + while(1) + if(!(isdigit((unsigned char) *start) + || isalpha((unsigned char) *start) + || strchr("+-.", *start))){ + start++; /* advance over bogus char */ + break; + } + else if(start > line) + start--; + else + break; + + /* + * Make sure everyhing up to the colon is a known scheme... + */ + if(start && (n = colon - start) && !isdigit((unsigned char) *start) + && (((n == 3 + && (*start == 'F' || *start == 'f') + && !struncmp(start+1, "tp", 2)) + || (n == 4 + && (((*start == 'H' || *start == 'h') + && !struncmp(start + 1, "ttp", 3)) + || ((*start == 'N' || *start == 'n') + && !struncmp(start + 1, "ews", 3)) + || ((*start == 'N' || *start == 'n') + && !struncmp(start + 1, "ntp", 3)) + || ((*start == 'W' || *start == 'w') + && !struncmp(start + 1, "ais", 3)) +#ifdef ENABLE_LDAP + || ((*start == 'L' || *start == 'l') + && !struncmp(start + 1, "dap", 3)) +#endif + || ((*start == 'I' || *start == 'i') + && !struncmp(start + 1, "map", 3)) + || ((*start == 'F' || *start == 'f') + && !struncmp(start + 1, "ile", 3)))) + || (n == 5 + && (*start == 'H' || *start == 'h') + && !struncmp(start+1, "ttps", 4)) + || (n == 6 + && (((*start == 'G' || *start == 'g') + && !struncmp(start+1, "opher", 5)) + || ((*start == 'M' || *start == 'm') + && !struncmp(start + 1, "ailto", 5)) + || ((*start == 'T' || *start == 't') + && !struncmp(start + 1, "elnet", 5)))) + || (n == 8 + && (*start == 'P' || *start == 'p') + && !struncmp(start + 1, "rospero", 7)) + || (n == 11 + && (*start == 'x' || *start == 'X') + && !struncmp(start + 1, "-pine-help", 10)) + || (n == 13 + && (*start == 'x' || *start == 'X') + && !struncmp(start + 1, "-alpine-help", 12))) + || url_external_specific_handler(start, n))){ + /* + * Second, make sure that everything to the right of the + * colon is valid for a "schemepart"... + */ + + if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){ + int i, j; + + /* make sure something useful follows colon */ + for(i = 0, j = end - colon; i < j; i++) + if(!strchr(RFC1738_RSVP, colon[i])) + break; + + if(i != j){ + *len = end - start; + + /* + * Special case handling for comma. + * See the problem is comma's valid, but if it's the + * last character in the url, it's likely intended + * as a delimiter in the text rather part of the URL. + * In most cases any way, that's why we have the + * exception. + */ + if(*(end - 1) == ',' + || (*(end - 1) == '.' && (!*end || *end == ' '))) + (*len)--; + + if(*len - (colon - start) > 0) + return(start); + } + } + } + } + + return(NULL); +} + + +/* + * rfc1738_scheme_part - make sure what's to the right of the + * colon is valid + * + * NOTE: we have a problem matching closing parens when users + * bracket the url in parens. So, lets try terminating our + * match on any closing paren that doesn't have a coresponding + * open-paren. + */ +char * +rfc1738_scheme_part(char *s) +{ + int n, paren = 0, bracket = 0; + + while(1) + switch(*s){ + default : + if((n = rfc1738xchar(s)) != 0){ + s += n; + break; + } + + case '\0' : + return(s); + + case '[' : + bracket++; + s++; + break; + + case ']' : + if(bracket--){ + s++; + break; + } + + return(s); + + case '(' : + paren++; + s++; + break; + + case ')' : + if(paren--){ + s++; + break; + } + + return(s); + } +} + + + +/* + * rfc1738_str - convert rfc1738 escaped octets in place + */ +char * +rfc1738_str(char *s) +{ + register char *p = s, *q = s; + + while(1) + switch(*q = *p++){ + case '%' : + if(isxpair(p)){ + *q = X2C(p); + p += 2; + } + + default : + q++; + break; + + case '\0': + return(s); + } +} + + +/* + * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF + */ +int +rfc1738uchar(char *s) +{ + return((RFC1738_ESC(s)) /* "escape" */ + ? 2 + : (isalnum((unsigned char) *s) /* alphanumeric */ + || strchr(RFC1738_SAFE, *s) /* other special stuff */ + || strchr(RFC1738_EXTRA, *s))); +} + + +/* + * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF + */ +int +rfc1738xchar(char *s) +{ + int n; + + return((n = rfc1738uchar(s)) + ? n + : (strchr(RFC1738_RSVP, *s) != NULL + || strchr(RFC1738_FUDGE, *s))); +} + + +/* + * rfc1738_num - return long value of a string of digits, possibly escaped + */ +unsigned long +rfc1738_num(char **s) +{ + register char *p = *s; + unsigned long n = 0L; + + for(; *p; p++) + if(*p == '%' && isxpair(p+1)){ + int c = X2C(p+1); + if(isdigit((unsigned char) c)){ + n = (c - '0') + (n * 10); + p += 2; + } + else + break; + } + else if(isdigit((unsigned char) *p)) + n = (*p - '0') + (n * 10); + else + break; + + *s = p; + return(n); +} + + +int +rfc1738_group(char *s) +{ + return(isalnum((unsigned char) *s) + || RFC1738_ESC(s) + || strchr(RFC1738_NEWS, *s)); +} + + +/* + * Encode (hexify) a mailto url. + * + * Args s -- src url + * + * Returns An allocated string which is suitably encoded. + * Result should be freed by caller. + * + * Since we don't know here which characters are reserved characters (? and &) + * for use in delimiting the pieces of the url and which are just those + * characters contained in the data that should be encoded, we always encode + * them. That's because we know we don't use those as reserved characters. + * If you do use those as reserved characters you have to encode each part + * separately. + */ +char * +rfc1738_encode_mailto(char *s) +{ + char *d, *ret = NULL; + + if(s){ + /* Worst case, encode every character */ + ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char)); + while(*s){ + if(isalnum((unsigned char)*s) + || strchr(RFC1738_SAFE, *s) + || strchr(RFC1738_EXTRA, *s)) + *d++ = *s++; + else{ + *d++ = '%'; + C2XPAIR(*s, d); + s++; + } + } + + *d = '\0'; + } + + return(ret); +} + + +/* + * * * * * * * * * RFC 1808 support routines * * * * * * * * + */ + + +int +rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path, + char **parms, char **query, char **frag) +{ + char *p, *q, *start, *tmp = cpystr(url); + + start = tmp; + if((p = strchr(start, '#')) != NULL){ /* fragment spec? */ + *p++ = '\0'; + if(*p) + *frag = cpystr(p); + } + + if((p = strchr(start, ':')) && p != start){ /* scheme part? */ + for(q = start; q < p; q++) + if(!(isdigit((unsigned char) *q) + || isalpha((unsigned char) *q) + || strchr("+-.", *q))) + break; + + if(p == q){ + *p++ = '\0'; + *scheme = cpystr(start); + start = p; + } + } + + if(*start == '/' && *(start+1) == '/'){ /* net_loc */ + if((p = strchr(start+2, '/')) != NULL) + *p++ = '\0'; + + *net_loc = cpystr(start+2); + if(p) + start = p; + else *start = '\0'; /* End of parse */ + } + + if((p = strchr(start, '?')) != NULL){ + *p++ = '\0'; + *query = cpystr(p); + } + + if((p = strchr(start, ';')) != NULL){ + *p++ = '\0'; + *parms = cpystr(p); + } + + if(*start) + *path = cpystr(start); + + fs_give((void **) &tmp); + + return(1); +} + + + +/* + * web_host_scan -- Scan the given line for possible web host names + * + * NOTE: scan below is limited to DNS names ala RFC1034 + */ +char * +web_host_scan(char *line, int *len) +{ + char *end, last = '\0'; + + for(; *line; last = *line++) + if((*line == 'w' || *line == 'W') + && (!last || !(isalnum((unsigned char) last) + || last == '.' || last == '-' || last == '/')) + && (((*(line + 1) == 'w' || *(line + 1) == 'W') /* "www." */ + && (*(line + 2) == 'w' || *(line + 2) == 'W')) + || ((*(line + 1) == 'e' || *(line + 1) == 'E') /* "web." */ + && (*(line + 2) == 'b' || *(line + 2) == 'B'))) + && (*(line + 3) == '.')){ + end = rfc1738_scheme_part(line + 3); + if((*len = end - line) > ((*(line+3) == '.') ? 4 : 3)){ + /* Dread comma exception, see note in rfc1738_scan */ + if(strchr(",:", *(line + (*len) - 1)) + || (*(line + (*len) - 1) == '.' + && (!*(line + (*len)) || *(line + (*len)) == ' '))) + (*len)--; + + return(line); + } + else + line += 3; + } + + return(NULL); +} + + +/* + * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's + * + * NOTE: Well, OK, not strictly addr-specs since there's alot of junk + * we're tying to sift thru and we'd like to minimize false-pos + * matches. + */ +char * +mail_addr_scan(char *line, int *len) +{ + char *amp, *start, *end; +/* + * This list is not the whole standards-based list, this is just a list + * of likely email address characters. We don't want to include everything + * because punctuation in the text might get mixed in with the address. + */ +#define NONALPHANUMOK ".-_+%/=" + + /* process each : in the line */ + for(; (amp = strindex(line, '@')) != NULL; line = end){ + end = amp + 1; + /* zero length addr? */ + if(amp == line || !(isalnum((unsigned char) *(start = amp - 1)) + || strchr(NONALPHANUMOK, *start))) + continue; + + /* + * Valid address (ala RFC822 BNF)? First, first look to the + * left to make sure there are valid "scheme" chars... + */ + while(1) + /* NOTE: we're not doing quoted-strings */ + if(!(isalnum((unsigned char) *start) || strchr(NONALPHANUMOK, *start))){ + /* advance over bogus char, and erase leading punctuation */ + for(start++; *start && strchr(NONALPHANUMOK, *start); start++) + ; + + break; + } + else if(start > line) + start--; + else + break; + + /* + * Make sure everyhing up to the colon is a known scheme... + */ + if(start && (amp - start) > 0){ + /* + * Second, make sure that everything to the right of + * amp is valid for a "domain"... + */ + if(*(end = amp + 1) == '['){ /* domain literal */ + int dots = 3; + + for(++end; *end ; end++) + if(*end == ']'){ + if(!dots){ + *len = end - start + 1; + return(start); + } + else + break; /* bogus */ + } + else if(*end == '.'){ + if(--dots < 0) + break; /* bogus */ + } + else if(!isdigit((unsigned char) *end)) + break; /* bogus */ + } + else if(isalnum((unsigned char) *end)){ /* domain name? */ + for(++end; ; end++) + if(!(*end && (isalnum((unsigned char) *end) + || *end == '-' + || *end == '.' + || *end == '_'))){ + /* can't end with dash, dot or underscore */ + while(!isalnum((unsigned char) *(end - 1))) + end--; + + *len = end - start; + return(start); + } + } + } + } + + return(NULL); +} |