#if !defined(lint) && !defined(DOS)
static char rcsid[] = "$Id: url.c 769 2007-10-24 00:15:40Z hubert@u.washington.edu $";
#endif

/*
 * ========================================================================
 * Copyright 2006-2007 University of Washington
 * Copyright 2013-2021 Eduardo Chappa
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * ========================================================================
 */

#include "../pith/headers.h"
#include "../pith/url.h"
#include "../pith/mailview.h"
#include "../pith/string.h"

/*
 * Internal prototypes
 */
char *rfc1738_scheme_part(char *);
int   rfc1738uchar(char *);
int   rfc1738xchar(char *);


/*
 *  * * * * * * * *      RFC 1738 support routines      * * * * * * * *
 */


/*
 * Various helpful definitions
 */
#define	RFC1738_SAFE	"$-_.+"			/* "safe" */
#define	RFC1738_EXTRA	"!*'(),"		/* "extra" */
#define	RFC1738_RSVP	";/?:@&="		/* "reserved" */
#define	RFC1738_NEWS	"-.+_"			/* valid for "news:" URL */
#define	RFC1738_FUDGE	"#{}|\\^~[]"		/* Unsafe, but popular */
#define	RFC1738_ESC(S)	(*(S) == '%' && isxpair((S) + 1))


/*
 * rfc1738_scan -- Scan the given line for possible URLs as defined
 *		   in RFC1738
 */
char *
rfc1738_scan(char *line, int *len)
{
    char *colon, *start, *end;
    int   n;

    /* process each : in the line */
    for(; (colon = strindex(line, ':')) != NULL; line = end){
	end = colon + 1;
	if(colon == line)		/* zero length scheme? */
	  continue;

	/*
	 * Valid URL (ala RFC1738 BNF)?  First, first look to the
	 * left to make sure there are valid "scheme" chars...
	 */
	start = colon - 1;
	while(1)
	  if(!(isdigit((unsigned char) *start)
	       || isalpha((unsigned char) *start)
	       || strchr("+-.", *start))){
	      start++;			/* advance over bogus char */
	      break;
	  }
	  else if(start > line)
	    start--;
	  else
	    break;

	/*
	 * Make sure everything up to the colon is a known scheme...
	 */
	if(start && (n = colon - start) && !isdigit((unsigned char) *start)
	   && (((n == 2
		 && (*start == 'w' || *start == 'W')
		 && (*(start+1) == 's' || *(start+1) == 'S'))
		|| (n == 3
		 && (((*start == 'F' || *start == 'f')
		 	&& !struncmp(start+1, "tp", 2))
		     ||
		    ((*start == 'w' || *start == 'W')
		 	&& !struncmp(start+1, "ss", 2))))
		|| (n == 4
		    && (((*start == 'H' || *start == 'h') 
			 && !struncmp(start + 1, "ttp", 3))
			|| ((*start == 'N' || *start == 'n')
			    && !struncmp(start + 1, "ews", 3))
			|| ((*start == 'N' || *start == 'n')
			    && !struncmp(start + 1, "ntp", 3))
			|| ((*start == 'W' || *start == 'w')
			    && !struncmp(start + 1, "ais", 3))
#ifdef	ENABLE_LDAP
			|| ((*start == 'L' || *start == 'l')
			    && !struncmp(start + 1, "dap", 3))
#endif
			|| ((*start == 'I' || *start == 'i')
			    && !struncmp(start + 1, "map", 3))
			|| ((*start == 'F' || *start == 'f')
			    && !struncmp(start + 1, "ile", 3))))
		|| (n == 5
		    && (*start == 'H' || *start == 'h')
		    && !struncmp(start+1, "ttps", 4))
		|| (n == 6
		    && (((*start == 'G' || *start == 'g')
			 && !struncmp(start+1, "opher", 5))
			|| ((*start == 'M' || *start == 'm')
			    && !struncmp(start + 1, "ailto", 5))
			|| ((*start == 'T' || *start == 't')
			    && !struncmp(start + 1, "elnet", 5))))
		|| (n == 8
		    && (*start == 'P' || *start == 'p')
		    && !struncmp(start + 1, "rospero", 7))
		|| (n == 11
		    && (*start == 'x' || *start == 'X')
		    && !struncmp(start + 1, "-pine-help", 10))
		|| (n == 13
		    && (*start == 'x' || *start == 'X')
		    && !struncmp(start + 1, "-alpine-help", 12)))
	       || url_external_specific_handler(start, n))){
		/*
		 * Second, make sure that everything to the right of the
		 * colon is valid for a "schemepart"...
		 */

	    if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){
		int i, j;

		/* make sure something useful follows colon */
		for(i = 0, j = end - colon; i < j; i++)
		  if(!strchr(RFC1738_RSVP, colon[i]))
		    break;

		if(i != j){
		    *len = end - start;

		    /*
		     * Special case handling for comma.
		     * See the problem is comma's valid, but if it's the
		     * last character in the url, it's likely intended
		     * as a delimiter in the text rather part of the URL.
		     * In most cases any way, that's why we have the
		     * exception.
		     */
		    if(*(end - 1) == ','
		       || (*(end - 1) == '.' && (!*end  || *end == ' ')))
		      (*len)--;

		    if(*len - (colon - start) > 0)
		      return(start);
		}
	    }
	}
    }

    return(NULL);
}


/*
 * rfc1738_scheme_part - make sure what's to the right of the
 *			 colon is valid
 *
 * NOTE: we have a problem matching closing parens when users 
 *       bracket the url in parens.  So, lets try terminating our
 *	 match on any closing paren that doesn't have a corresponding
 *       open-paren.
 */
char *
rfc1738_scheme_part(char *s)
{
    int n, paren = 0, bracket = 0;

    while(1)
      switch(*s){
	default :
	  if((n = rfc1738xchar(s)) != 0){
	      s += n;
	      break;
	  }

	case '\0' :
	  return(s);

	case '[' :
	  bracket++;
	  s++;
	  break;

	case ']' :
	  if(bracket--){
	      s++;
	      break;
	  }

	  return(s);

	case '(' :
	  paren++;
	  s++;
	  break;

	case ')' :
	  if(paren--){
	      s++;
	      break;
	  }

	  return(s);
      }
}


/*
 * rfc1738_str - convert rfc1738 escaped octets in place
 */
char *
rfc1738_str(char *s)
{
    register char *p = s, *q = s;

    while(1)
      switch(*q = *p++){
	case '%' :
	  if(isxpair(p)){
	      *q = X2C(p);
	      p += 2;
	  }

	default :
	  q++;
	  break;

	case '\0':
	  return(s);
      }
}


/*
 * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
 */
int
rfc1738uchar(char *s)
{
    int valid = (RFC1738_ESC(s))		/* "escape" */
	     ? 2
	     : (isalnum((unsigned char) *s)	/* alphanumeric */
		|| strchr(RFC1738_SAFE, *s)	/* other special stuff */
		|| strchr(RFC1738_EXTRA, *s));

    if(!valid){
	char *t;
	UCS ucs;
        CBUF_S cbuf;

        cbuf.cbuf[0] = '\0';
	cbuf.cbufp = cbuf.cbuf;
        cbuf.cbufend = cbuf.cbuf;

	for(t = s; t && *t; t++){
	   if(utf8_to_ucs4_oneatatime((unsigned char) *t & 0xff, &cbuf, &ucs, NULL)){
	     if ((ucs >= 0x00A0 && ucs <= 0xD7FF)
		|| (ucs >= 0xE000 && ucs <= 0xFDCF) 
		|| (ucs >= 0xFDF0 && ucs <= 0xFFEF)
		|| (ucs >= 0x10000 && ucs <= 0x1FFFD)
		|| (ucs >= 0x20000 && ucs <= 0x2FFFD)
		|| (ucs >= 0x30000 && ucs <= 0x3FFFD)
		|| (ucs >= 0x40000 && ucs <= 0x4FFFD)
		|| (ucs >= 0x50000 && ucs <= 0x5FFFD)
		|| (ucs >= 0x60000 && ucs <= 0x6FFFD)
		|| (ucs >= 0x70000 && ucs <= 0x7FFFD)
		|| (ucs >= 0x80000 && ucs <= 0x8FFFD)
		|| (ucs >= 0x90000 && ucs <= 0x9FFFD)
		|| (ucs >= 0xA0000 && ucs <= 0xAFFFD)
		|| (ucs >= 0xB0000 && ucs <= 0xBFFFD)
		|| (ucs >= 0xC0000 && ucs <= 0xCFFFD)
		|| (ucs >= 0xD0000 && ucs <= 0xDFFFD)
		|| (ucs >= 0xE0000 && ucs <= 0xEFFFD)
		|| (ucs >= 0xF0000 && ucs <= 0xFFFFD)
		|| (ucs >= 0x100000 && ucs <= 0x10FFFD))
                   valid = t-s+1;
		break;
	   }
	}
    }
    return valid;
}


/*
 * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
 */
int
rfc1738xchar(char *s)
{
    int n;

    return((n = rfc1738uchar(s))
	    ? n
	    : (strchr(RFC1738_RSVP, *s) != NULL
	       || strchr(RFC1738_FUDGE, *s)));
}


/*
 * rfc1738_num - return long value of a string of digits, possibly escaped
 */
unsigned long
rfc1738_num(char **s)
{
    register char *p = *s;
    unsigned long n = 0L;

    for(; *p; p++)
      if(*p == '%' && isxpair(p+1)){
	  int c = X2C(p+1);
	  if(isdigit((unsigned char) c)){
	      n = (c - '0') + (n * 10);
	      p += 2;
	  }
	  else
	    break;
      }
      else if(isdigit((unsigned char) *p))
	n = (*p - '0') + (n * 10);
      else
	break;

    *s = p;
    return(n);
}


int
rfc1738_group(char *s)
{
    return(isalnum((unsigned char) *s)
	   || RFC1738_ESC(s)
	   || strchr(RFC1738_NEWS, *s));
}


/*
 * Encode (hexify) a mailto url.
 *
 * Args  s -- src url
 *
 * Returns  An allocated string which is suitably encoded.
 *          Result should be freed by caller.
 *
 * Since we don't know here which characters are reserved characters (? and &)
 * for use in delimiting the pieces of the url and which are just those
 * characters contained in the data that should be encoded, we always encode
 * them. That's because we know we don't use those as reserved characters.
 * If you do use those as reserved characters you have to encode each part
 * separately.
 */
char *
rfc1738_encode_mailto(char *s)
{
    char *d, *ret = NULL;

    if(s){
	/* Worst case, encode every character */
	ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char));
	while(*s){
	    if(isalnum((unsigned char)*s)
	       || strchr(RFC1738_SAFE, *s)
	       || strchr(RFC1738_EXTRA, *s))
	      *d++ = *s++;
	    else{
		*d++ = '%';
		C2XPAIR(*s, d);
		s++;
	    }
	}

	*d = '\0';
    }

    return(ret);
}


/*
 *  * * * * * * * *      RFC 1808 support routines      * * * * * * * *
 */


int
rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path,
	       char **parms, char **query, char **frag)
{
    char *p, *q, *start, *tmp = cpystr(url);

    start = tmp;
    if((p = strchr(start, '#')) != NULL){	/* fragment spec? */
	*p++ = '\0';
	if(*p)
	  *frag = cpystr(p);
    }

    if((p = strchr(start, ':')) && p != start){ /* scheme part? */
	for(q = start; q < p; q++)
	  if(!(isdigit((unsigned char) *q)
	       || isalpha((unsigned char) *q)
	       || strchr("+-.", *q)))
	    break;

	if(p == q){
	    *p++ = '\0';
	    *scheme = cpystr(start);
	    start = p;
	}
    }

    if(*start == '/' && *(start+1) == '/'){ /* net_loc */
	if((p = strchr(start+2, '/')) != NULL)
	  *p++ = '\0';

	*net_loc = cpystr(start+2);
	if(p)
	  start = p;
	else *start = '\0';		/* End of parse */
    }

    if((p = strchr(start, '?')) != NULL){
	*p++ = '\0';
	*query = cpystr(p);
    }

    if((p = strchr(start, ';')) != NULL){
	*p++ = '\0';
	*parms = cpystr(p);
    }

    if(*start)
      *path = cpystr(start);

    fs_give((void **) &tmp);

    return(1);
}


/*
 * web_host_scan -- Scan the given line for possible web host names
 *
 * NOTE: scan below is limited to DNS names ala RFC1034
 */
char *
web_host_scan(char *line, int *len)
{
    char *end, last = '\0';

    for(; *line; last = *line++)
      if((*line == 'w' || *line == 'W')
	 && (!last || !(isalnum((unsigned char) last)
			|| last == '.' || last == '-' || last == '/'))
	 && (((*(line + 1) == 'w' || *(line + 1) == 'W')	/* "www." */
	      && (*(line + 2) == 'w' || *(line + 2) == 'W'))
	     || ((*(line + 1) == 'e' || *(line + 1) == 'E')	/* "web." */
		 && (*(line + 2) == 'b' || *(line + 2) == 'B')))
	 && (*(line + 3) == '.')){
	  end = rfc1738_scheme_part(line + 3);
	  if((*len = end - line) > ((*(line+3) == '.') ? 4 : 3)){
	      /* Dread comma exception, see note in rfc1738_scan */
	      if(strchr(",:", *(line + (*len) - 1))
		 || (*(line + (*len) - 1) == '.'
		     && (!*(line + (*len)) || *(line + (*len)) == ' ')))
		(*len)--;

	      return(line);
	  }
	  else
	    line += 3;
      }

    return(NULL);
}


/*
 * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
 *
 * NOTE: Well, OK, not strictly addr-specs since there's a lot of junk
 *	 we're tying to sift thru and we'd like to minimize false-pos
 *	 matches.
 */
char *
mail_addr_scan(char *line, int *len)
{
    char *amp, *start, *end;
/*
 * This list is not the whole standards-based list, this is just a list
 * of likely email address characters. We don't want to include everything
 * because punctuation in the text might get mixed in with the address.
 */
#define NONALPHANUMOK ".-_+%/="

    /* process each : in the line */
    for(; (amp = strindex(line, '@')) != NULL; line = end){
	end = amp + 1;
	/* zero length addr? */
	if(amp == line || !(isalnum((unsigned char) *(start = amp - 1))
			    || strchr(NONALPHANUMOK, *start)))
	  continue;

	/*
	 * Valid address (ala RFC822 BNF)?  First, first look to the
	 * left to make sure there are valid "scheme" chars...
	 */
	while(1)
	  /* NOTE: we're not doing quoted-strings */
	  if(!(isalnum((unsigned char) *start) || strchr(NONALPHANUMOK, *start))){
	      /* advance over bogus char, and erase leading punctuation */
	      for(start++; *start && strchr(NONALPHANUMOK, *start); start++)
	        ;

	      break;
	  }
	  else if(start > line)
	    start--;
	  else
	    break;

	/*
	 * Make sure everything up to the colon is a known scheme...
	 */
	if(start && (amp - start) > 0){
	    /*
	     * Second, make sure that everything to the right of
	     * amp is valid for a "domain"...
	     */
	    if(*(end = amp + 1) == '['){ /* domain literal */
		int dots = 3;

		for(++end; *end ; end++)
		  if(*end == ']'){
		      if(!dots){
			  *len = end - start + 1;
			  return(start);
		      }
		      else
			break;		/* bogus */
		  }
		  else if(*end == '.'){
		      if(--dots < 0)
			break;		/* bogus */
		  }
		  else if(!isdigit((unsigned char) *end))
		    break;		/* bogus */
	    }
	    else if(isalnum((unsigned char) *end)){ /* domain name? */
		for(++end; ; end++)
		  if(!(*end && (isalnum((unsigned char) *end)
				|| *end == '-'
				|| *end == '.'
				|| *end == '_'))){
		      /* can't end with dash, dot or underscore */
		      while(!isalnum((unsigned char) *(end - 1)))
			end--;

		      *len = end - start;
		      return(start);
		  }
	    }
	}
    }

    return(NULL);
}