1 files changed, 542 insertions, 0 deletions
diff --git a/pith/url.c b/pith/url.c
new file mode 100644
index 00000000..173cb879
--- /dev/null
+++ b/pith/url.c
@@ -0,0 +1,542 @@
+#if !defined(lint) && !defined(DOS)
+static char rcsid[] = "$Id: url.c 769 2007-10-24 00:15:40Z hubert@u.washington.edu $";
+#endif
+
+/*
+ * ========================================================================
+ * Copyright 2006-2007 University of Washington
+ * Copyright 2013 Eduardo Chappa
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * ========================================================================
+ */
+
+#include "../pith/headers.h"
+#include "../pith/url.h"
+#include "../pith/mailview.h"
+#include "../pith/string.h"
+
+/*
+ * Internal prototypes
+ */
+char *rfc1738_scheme_part(char *);
+int   rfc1738uchar(char *);
+int   rfc1738xchar(char *);
+
+
+/*
+ *  * * * * * * * *      RFC 1738 support routines      * * * * * * * *
+ */
+
+
+/*
+ * Various helpful definitions
+ */
+#define	RFC1738_SAFE	"$-_.+"			/* "safe" */
+#define	RFC1738_EXTRA	"!*'(),"		/* "extra" */
+#define	RFC1738_RSVP	";/?:@&="		/* "reserved" */
+#define	RFC1738_NEWS	"-.+_"			/* valid for "news:" URL */
+#define	RFC1738_FUDGE	"#{}|\\^~[]"		/* Unsafe, but popular */
+#define	RFC1738_ESC(S)	(*(S) == '%' && isxpair((S) + 1))
+
+
+/*
+ * rfc1738_scan -- Scan the given line for possible URLs as defined
+ *		   in RFC1738
+ */
+char *
+rfc1738_scan(char *line, int *len)
+{
+    char *colon, *start, *end;
+    int   n;
+
+    /* process each : in the line */
+    for(; (colon = strindex(line, ':')) != NULL; line = end){
+	end = colon + 1;
+	if(colon == line)		/* zero length scheme? */
+	  continue;
+
+	/*
+	 * Valid URL (ala RFC1738 BNF)?  First, first look to the
+	 * left to make sure there are valid "scheme" chars...
+	 */
+	start = colon - 1;
+	while(1)
+	  if(!(isdigit((unsigned char) *start)
+	       || isalpha((unsigned char) *start)
+	       || strchr("+-.", *start))){
+	      start++;			/* advance over bogus char */
+	      break;
+	  }
+	  else if(start > line)
+	    start--;
+	  else
+	    break;
+
+	/*
+	 * Make sure everyhing up to the colon is a known scheme...
+	 */
+	if(start && (n = colon - start) && !isdigit((unsigned char) *start)
+	   && (((n == 3
+		 && (*start == 'F' || *start == 'f')
+		 && !struncmp(start+1, "tp", 2))
+		|| (n == 4
+		    && (((*start == 'H' || *start == 'h') 
+			 && !struncmp(start + 1, "ttp", 3))
+			|| ((*start == 'N' || *start == 'n')
+			    && !struncmp(start + 1, "ews", 3))
+			|| ((*start == 'N' || *start == 'n')
+			    && !struncmp(start + 1, "ntp", 3))
+			|| ((*start == 'W' || *start == 'w')
+			    && !struncmp(start + 1, "ais", 3))
+#ifdef	ENABLE_LDAP
+			|| ((*start == 'L' || *start == 'l')
+			    && !struncmp(start + 1, "dap", 3))
+#endif
+			|| ((*start == 'I' || *start == 'i')
+			    && !struncmp(start + 1, "map", 3))
+			|| ((*start == 'F' || *start == 'f')
+			    && !struncmp(start + 1, "ile", 3))))
+		|| (n == 5
+		    && (*start == 'H' || *start == 'h')
+		    && !struncmp(start+1, "ttps", 4))
+		|| (n == 6
+		    && (((*start == 'G' || *start == 'g')
+			 && !struncmp(start+1, "opher", 5))
+			|| ((*start == 'M' || *start == 'm')
+			    && !struncmp(start + 1, "ailto", 5))
+			|| ((*start == 'T' || *start == 't')
+			    && !struncmp(start + 1, "elnet", 5))))
+		|| (n == 8
+		    && (*start == 'P' || *start == 'p')
+		    && !struncmp(start + 1, "rospero", 7))
+		|| (n == 11
+		    && (*start == 'x' || *start == 'X')
+		    && !struncmp(start + 1, "-pine-help", 10))
+		|| (n == 13
+		    && (*start == 'x' || *start == 'X')
+		    && !struncmp(start + 1, "-alpine-help", 12)))
+	       || url_external_specific_handler(start, n))){
+		/*
+		 * Second, make sure that everything to the right of the
+		 * colon is valid for a "schemepart"...
+		 */
+
+	    if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){
+		int i, j;
+
+		/* make sure something useful follows colon */
+		for(i = 0, j = end - colon; i < j; i++)
+		  if(!strchr(RFC1738_RSVP, colon[i]))
+		    break;
+
+		if(i != j){
+		    *len = end - start;
+
+		    /*
+		     * Special case handling for comma.
+		     * See the problem is comma's valid, but if it's the
+		     * last character in the url, it's likely intended
+		     * as a delimiter in the text rather part of the URL.
+		     * In most cases any way, that's why we have the
+		     * exception.
+		     */
+		    if(*(end - 1) == ','
+		       || (*(end - 1) == '.' && (!*end  || *end == ' ')))
+		      (*len)--;
+
+		    if(*len - (colon - start) > 0)
+		      return(start);
+		}
+	    }
+	}
+    }
+
+    return(NULL);
+}
+
+
+/*
+ * rfc1738_scheme_part - make sure what's to the right of the
+ *			 colon is valid
+ *
+ * NOTE: we have a problem matching closing parens when users 
+ *       bracket the url in parens.  So, lets try terminating our
+ *	 match on any closing paren that doesn't have a coresponding
+ *       open-paren.
+ */
+char *
+rfc1738_scheme_part(char *s)
+{
+    int n, paren = 0, bracket = 0;
+
+    while(1)
+      switch(*s){
+	default :
+	  if((n = rfc1738xchar(s)) != 0){
+	      s += n;
+	      break;
+	  }
+
+	case '\0' :
+	  return(s);
+
+	case '[' :
+	  bracket++;
+	  s++;
+	  break;
+
+	case ']' :
+	  if(bracket--){
+	      s++;
+	      break;
+	  }
+
+	  return(s);
+
+	case '(' :
+	  paren++;
+	  s++;
+	  break;
+
+	case ')' :
+	  if(paren--){
+	      s++;
+	      break;
+	  }
+
+	  return(s);
+      }
+}
+
+
+
+/*
+ * rfc1738_str - convert rfc1738 escaped octets in place
+ */
+char *
+rfc1738_str(char *s)
+{
+    register char *p = s, *q = s;
+
+    while(1)
+      switch(*q = *p++){
+	case '%' :
+	  if(isxpair(p)){
+	      *q = X2C(p);
+	      p += 2;
+	  }
+
+	default :
+	  q++;
+	  break;
+
+	case '\0':
+	  return(s);
+      }
+}
+
+
+/*
+ * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
+ */
+int
+rfc1738uchar(char *s)
+{
+    return((RFC1738_ESC(s))		/* "escape" */
+	     ? 2
+	     : (isalnum((unsigned char) *s)	/* alphanumeric */
+		|| strchr(RFC1738_SAFE, *s)	/* other special stuff */
+		|| strchr(RFC1738_EXTRA, *s)));
+}
+
+
+/*
+ * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
+ */
+int
+rfc1738xchar(char *s)
+{
+    int n;
+
+    return((n = rfc1738uchar(s))
+	    ? n
+	    : (strchr(RFC1738_RSVP, *s) != NULL
+	       || strchr(RFC1738_FUDGE, *s)));
+}
+
+
+/*
+ * rfc1738_num - return long value of a string of digits, possibly escaped
+ */
+unsigned long
+rfc1738_num(char **s)
+{
+    register char *p = *s;
+    unsigned long n = 0L;
+
+    for(; *p; p++)
+      if(*p == '%' && isxpair(p+1)){
+	  int c = X2C(p+1);
+	  if(isdigit((unsigned char) c)){
+	      n = (c - '0') + (n * 10);
+	      p += 2;
+	  }
+	  else
+	    break;
+      }
+      else if(isdigit((unsigned char) *p))
+	n = (*p - '0') + (n * 10);
+      else
+	break;
+
+    *s = p;
+    return(n);
+}
+
+
+int
+rfc1738_group(char *s)
+{
+    return(isalnum((unsigned char) *s)
+	   || RFC1738_ESC(s)
+	   || strchr(RFC1738_NEWS, *s));
+}
+
+
+/*
+ * Encode (hexify) a mailto url.
+ *
+ * Args  s -- src url
+ *
+ * Returns  An allocated string which is suitably encoded.
+ *          Result should be freed by caller.
+ *
+ * Since we don't know here which characters are reserved characters (? and &)
+ * for use in delimiting the pieces of the url and which are just those
+ * characters contained in the data that should be encoded, we always encode
+ * them. That's because we know we don't use those as reserved characters.
+ * If you do use those as reserved characters you have to encode each part
+ * separately.
+ */
+char *
+rfc1738_encode_mailto(char *s)
+{
+    char *d, *ret = NULL;
+
+    if(s){
+	/* Worst case, encode every character */
+	ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char));
+	while(*s){
+	    if(isalnum((unsigned char)*s)
+	       || strchr(RFC1738_SAFE, *s)
+	       || strchr(RFC1738_EXTRA, *s))
+	      *d++ = *s++;
+	    else{
+		*d++ = '%';
+		C2XPAIR(*s, d);
+		s++;
+	    }
+	}
+
+	*d = '\0';
+    }
+
+    return(ret);
+}
+
+
+/*
+ *  * * * * * * * *      RFC 1808 support routines      * * * * * * * *
+ */
+
+
+int
+rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path,
+	       char **parms, char **query, char **frag)
+{
+    char *p, *q, *start, *tmp = cpystr(url);
+
+    start = tmp;
+    if((p = strchr(start, '#')) != NULL){	/* fragment spec? */
+	*p++ = '\0';
+	if(*p)
+	  *frag = cpystr(p);
+    }
+
+    if((p = strchr(start, ':')) && p != start){ /* scheme part? */
+	for(q = start; q < p; q++)
+	  if(!(isdigit((unsigned char) *q)
+	       || isalpha((unsigned char) *q)
+	       || strchr("+-.", *q)))
+	    break;
+
+	if(p == q){
+	    *p++ = '\0';
+	    *scheme = cpystr(start);
+	    start = p;
+	}
+    }
+
+    if(*start == '/' && *(start+1) == '/'){ /* net_loc */
+	if((p = strchr(start+2, '/')) != NULL)
+	  *p++ = '\0';
+
+	*net_loc = cpystr(start+2);
+	if(p)
+	  start = p;
+	else *start = '\0';		/* End of parse */
+    }
+
+    if((p = strchr(start, '?')) != NULL){
+	*p++ = '\0';
+	*query = cpystr(p);
+    }
+
+    if((p = strchr(start, ';')) != NULL){
+	*p++ = '\0';
+	*parms = cpystr(p);
+    }
+
+    if(*start)
+      *path = cpystr(start);
+
+    fs_give((void **) &tmp);
+
+    return(1);
+}
+
+
+
+/*
+ * web_host_scan -- Scan the given line for possible web host names
+ *
+ * NOTE: scan below is limited to DNS names ala RFC1034
+ */
+char *
+web_host_scan(char *line, int *len)
+{
+    char *end, last = '\0';
+
+    for(; *line; last = *line++)
+      if((*line == 'w' || *line == 'W')
+	 && (!last || !(isalnum((unsigned char) last)
+			|| last == '.' || last == '-' || last == '/'))
+	 && (((*(line + 1) == 'w' || *(line + 1) == 'W')	/* "www." */
+	      && (*(line + 2) == 'w' || *(line + 2) == 'W'))
+	     || ((*(line + 1) == 'e' || *(line + 1) == 'E')	/* "web." */
+		 && (*(line + 2) == 'b' || *(line + 2) == 'B')))
+	 && (*(line + 3) == '.')){
+	  end = rfc1738_scheme_part(line + 3);
+	  if((*len = end - line) > ((*(line+3) == '.') ? 4 : 3)){
+	      /* Dread comma exception, see note in rfc1738_scan */
+	      if(strchr(",:", *(line + (*len) - 1))
+		 || (*(line + (*len) - 1) == '.'
+		     && (!*(line + (*len)) || *(line + (*len)) == ' ')))
+		(*len)--;
+
+	      return(line);
+	  }
+	  else
+	    line += 3;
+      }
+
+    return(NULL);
+}
+
+
+/*
+ * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
+ *
+ * NOTE: Well, OK, not strictly addr-specs since there's alot of junk
+ *	 we're tying to sift thru and we'd like to minimize false-pos
+ *	 matches.
+ */
+char *
+mail_addr_scan(char *line, int *len)
+{
+    char *amp, *start, *end;
+/*
+ * This list is not the whole standards-based list, this is just a list
+ * of likely email address characters. We don't want to include everything
+ * because punctuation in the text might get mixed in with the address.
+ */
+#define NONALPHANUMOK ".-_+%/="
+
+    /* process each : in the line */
+    for(; (amp = strindex(line, '@')) != NULL; line = end){
+	end = amp + 1;
+	/* zero length addr? */
+	if(amp == line || !(isalnum((unsigned char) *(start = amp - 1))
+			    || strchr(NONALPHANUMOK, *start)))
+	  continue;
+
+	/*
+	 * Valid address (ala RFC822 BNF)?  First, first look to the
+	 * left to make sure there are valid "scheme" chars...
+	 */
+	while(1)
+	  /* NOTE: we're not doing quoted-strings */
+	  if(!(isalnum((unsigned char) *start) || strchr(NONALPHANUMOK, *start))){
+	      /* advance over bogus char, and erase leading punctuation */
+	      for(start++; *start && strchr(NONALPHANUMOK, *start); start++)
+	        ;
+
+	      break;
+	  }
+	  else if(start > line)
+	    start--;
+	  else
+	    break;
+
+	/*
+	 * Make sure everyhing up to the colon is a known scheme...
+	 */
+	if(start && (amp - start) > 0){
+	    /*
+	     * Second, make sure that everything to the right of
+	     * amp is valid for a "domain"...
+	     */
+	    if(*(end = amp + 1) == '['){ /* domain literal */
+		int dots = 3;
+
+		for(++end; *end ; end++)
+		  if(*end == ']'){
+		      if(!dots){
+			  *len = end - start + 1;
+			  return(start);
+		      }
+		      else
+			break;		/* bogus */
+		  }
+		  else if(*end == '.'){
+		      if(--dots < 0)
+			break;		/* bogus */
+		  }
+		  else if(!isdigit((unsigned char) *end))
+		    break;		/* bogus */
+	    }
+	    else if(isalnum((unsigned char) *end)){ /* domain name? */
+		for(++end; ; end++)
+		  if(!(*end && (isalnum((unsigned char) *end)
+				|| *end == '-'
+				|| *end == '.'
+				|| *end == '_'))){
+		      /* can't end with dash, dot or underscore */
+		      while(!isalnum((unsigned char) *(end - 1)))
+			end--;
+
+		      *len = end - start;
+		      return(start);
+		  }
+	    }
+	}
+    }
+
+    return(NULL);
+}