/*
 * parse.c - word chopper for jittr commands.
 * 16.3.96 jw.
 *
 * parse_hexval() fixed. Did not recognize uppercase. 28.5.96
 *
 * {foo\{bar} fixed to be parsed as "foo\{bar". This is TCL behaviour.
 * The backslash is not interpolated within {}, but it removes the magic 
 * of a subsequent { or } character. Is that still sane?  
 * 22.6.96 jw.
 *
 * TCL does not honour a single quote in any way. I give the single quote
 * the same magic, that it has in every unix shell (except tcsh):
 * Nothing is magic until the next single quote. We thus have 4 levels of
 * quoting (\, "", '', and {}), which is quite handy for complicated ping-pong
 * commands.
 *
 * "\@" timestamp expansion added. It prints the date in american style
 * MM/DD/YYYY format and the time in UTC, which should be compared with the
 * output of the unix command "date -u". 27.6.96 jw.
 *
 * Stupid bug in parse_check fixed. Leading delimiters could confuse the
 * tokenizer in rare cases. 11.7.96 jw.
 *
 * parse_word() now understands "\x{8950 4e47 0d0a 1a0a 0000 000d 4948 4452}".
 * 12.May 97, jw
 *
 */
#include <sys/time.h>		/* struct timeval */
#include "jittr/atom.h"		/* struct dstring */
#include "jittr/jittr.h"	/* debug.h, struct exec_context & prototypes */
#include "jittr/schedule.h"	/* sched_format_tv() */

/* ============ generic parsing, somewhat similar to tcl ========= */
/*
 * Convert a single hexadecimal digit character to its numeric value.
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F' and returns 0..15.
 * Returns -1 if byte is not a hexadecimal digit.
 */
int
parse_hexval(byte)
int byte;
{
  int value = -1;

  if ('0' <= byte && byte <= '9')
    value = byte - '0';
  else if ('a' <= byte && byte <= 'f')
    value = 10 + (byte - 'a');
  else if ('A' <= byte && byte <= 'F')
    value = 10 + (byte - 'A');
  return value;
}

/*
 * Membership test: returns 1 if character c occurs in the set s, else 0.
 *
 * s is a null-terminated string; its terminating null character is not a
 * member of the set. To put the null character itself into the set, let
 * s start with '\0': the remaining members then follow that leading null
 * and are terminated by a second null character.
 */
static int
issep(c, s)
int c;
char *s;
{
  char *scan = s;

  ASSERT(s);

  if (*scan == '\0')
    {
      /* a leading null means: the null character is a member, too */
      if (c == 0)
        return 1;
      scan++;
    }
  for (; *scan != '\0'; scan++)
    {
      if (*scan == c)
        return 1;
    }
  return 0;
}

/*
 * Expand a backslash sequence that parse_word() does not handle itself.
 * c is the character that followed the backslash. The expansion is written
 * to *d, where parse_word() promises at least 40 characters of room
 * (max_size is handed through to the formatter). The length of the product
 * is returned; 0 if c cannot be expanded, and negative when in trouble.
 *
 * Only "\@" is known here: it asks sched_format_tv() to format the time
 * obtained from jittr_gettimeval().
 *
 * XXX Fixme: This function itself should expand the size of the dstring.
 *     If it returns more than max_size bytes per invocation, the parse_word()
 *     may crash!
 */
static int
parse_expand_special(d, c, max_size)
char *d;
int c, max_size;
{
  struct timeval now;

  if (c != '@')
    {
      debug1("parse_expand_special: cannot expand \"\\%c\".\n", c);
      return 0;
    }

  jittr_gettimeval(&now);
  return sched_format_tv(d, max_size, &now, NULL);
}

/*
 * Find the first word in the buffer found at *pp. This buffer is considered
 * *blenp bytes long. Words are delimited by one of the characters in seperator.
 * See issep() for details. Characters used for the syntax of the strings
 * (' " \ ^ { }) are interpreted as syntactic characters, unless preceded by a
 * backslash.
 *
 * Leading whitespace characters (that means: characters from the seperator 
 * string) are skipped before parsing.
 *
 * The string is assumed to be in tcl-syntax, as described in the ``Tcl(n)'' 
 * man page. Bracket ('[') command substitution, dollar ('$') variable 
 * substitution and hash ('#') comments are currently not supported.
 * All other uses of backslash ('\') substitution are supported as described,
 * including octal and hexadecimal byte representation, plus an additional "\@"
 * for timestamp conversion.
 *
 * The word is parsed for escaped character and the resulting text is written
 * into (*dst)->data. If data was pointing outside of (*dst)->buf, it is 
 * set equal to &(*dst)->buf. The address of this location is returned. 
 * The dstring found at *dst will be automatically expanded to hold all the
 * data.
 *
 * If you prefer an in-place parser then pass NULL as dst.
 *
 * The word will be null-terminated, to help dumb parsers that may follow. 
 * If the buffer ends where the word ends and dst is NULL, no '\0' byte can
 * be provided, but the word is considered valid.
 * NULL is returned when there were no words in the buffer.
 * If wlenp is nonzero, the length of the word (without the '\0' byte)
 * is written to *wlenp.
 *
 * *pp itself is incremented to point after the parsed word, so that scanning
 * may easily continue at the position where we left off. *blenp is updated to
 * contain the number of bytes remaining at *pp. Trailing whitespace is not
 * skipped here. *pp will be incremented by *wlenp+1 bytes (more, if quoting
 * or escape characters were removed); plus any amount of leading whitespace
 * (if present), minus one byte, if the buffer ended before a separator was seen.
 *
 * Parse_word handles true binary strings, which are *NOT* terminated by null
 * characters. Keep this code in sync with parse_check() below.
 */
char *
parse_word(dst, pp, blenp, wlenp, seperator)
dstring **dst;
char **pp;
int *blenp, *wlenp;
char *seperator;
{
  char *d, *ds;		/* pointers for destination */
  char *p, *s; 		/* pointers for source */
  int c;
  /*
   * Parser state. slash and caret mean "the previous character was an
   * unquoted \ or ^". brace is the current {} nesting depth. dquote and
   * squote are toggles. hexblock is set while inside "\x{ ... }".
   */
  int slash = 0;
  int brace = 0;
  int caret = 0;
  int dquote = 0;
  int squote = 0;
  int hexblock = 0;
  int l, len = *blenp;
  
  if (!seperator)
    seperator = "\0 \t\n\r;";	/* leading '\0': the null byte separates, too */

  /* default is in-place parsing: the destination is the source buffer */
  d = ds = s = p = *pp;

  if (dst)	/* do not modify the original buffer, use this one */
    {
      int i = 0;
      
      if (*dst)
        {
	  /* offset of the current write position within the dstring buffer */
	  i = (char *)(*dst)->data - (*dst)->buf;

	  /* 
	   * The byte after the last valid one is a '\0' byte usually.
	   * Data points behind that, most of the time. We accept that. Thus 
	   * (*dst)->length is not enough to check for a valid data pointer.
	   */
	  if (i < 0 || i > (*dst)->allocated)
	    {
	      (*dst)->data = (void *)(*dst)->buf;
	      i = 0;
	    }
	}

      /* 
       * Have enough memory available.  40 characters are for
       * parse_expand_special, all other conversions shorten the text.
       */
      dstring_append(dst, 0, NULL, i + len + 40);
      /* recompute the write position from the (possibly new) buffer address */
      ds = d = (char *)((*dst)->data = (void *)((*dst)->buf + i));
    }

  /*
   * The loop below decrements len once more than it advances p, because
   * the failing test decrements, too. This is compensated by len++ below.
   */
  while ((len-- > 0) && issep(*p, seperator))	/* strip leading whitespace */
    p++;

  if (len < 0)
    {
      /*
       * NOTE(review): *pp is advanced past the separators here, but *blenp
       * is left unchanged on this path -- confirm that no caller continues
       * with this pointer/length pair.
       */
      *pp = p;
      if (wlenp)
        *wlenp = 0;
      return NULL;				/* pp was an empty buffer */
    }

  len++;

  /* from here on, len counts the source bytes that follow the current one */
  while (len-- > 0)
    {
      c = *p++;

      /* inside braces, the byte after a backslash is copied verbatim */
      if (brace && slash)
        {
	  slash = 0;
	  *d++ = c;
	  continue;
	}
      if (brace)
        {
	  /*
	   * Everything is copied, including nested braces and backslashes.
	   * Only the closing brace that ends the outermost level is dropped:
	   * d is not advanced when brace reaches 0.
	   */
	  *d = c;
	  if (c == '{') brace++;
	  if (c == '}') brace--;
	  if (c == '\\') slash = 1;
	  if (brace) d++;
	  continue;
	}
      if (caret)
        {
	  /* ^x and ^X both yield the control character X ^ 0x40 */
	  if (c >= 'a' && c <= 'z')
	    c -= 'a' - 'A';
	  *d++ = c ^ 0x40;
	  caret = 0; 
	  continue;
	}
      if (slash)
        {
	  /* characters without a case below are stored as they are */
	  switch (c)
	    {
	    case 'a': c = 'G' ^ 0x40; break;
	    case 'b': c = 'H' ^ 0x40; break;
	    case 'f': c = 'L' ^ 0x40; break;
	    case 'e': c = '[' ^ 0x40; break;
	    case 'n': c = '\n'; break;
	    case 'r': c = '\r'; break;
	    case 't': c = '\t'; break;
	    case 'v': c = 0xb;  break;
	    case '\n': c = ' '; break;
	    case '\\': c = '\\'; break;
	    /*
	     * Up to three digits. The first one may be any decimal digit
	     * (so '8' and '9' are accepted here), the following two must be
	     * octal digits.
	     */
	    case '0': /* c = '\0'; break;		*/
	    case '1': case '2': case '3':
	    case '4': case '5': case '6':
	    case '7': case '8': case '9':
	      c -= '0';
	      if (len <= 0 || *p < '0' || *p > '7') break;
	      c = (*p - '0') | (c << 3);
	      len--; p++; 
	      if (len <= 0 || *p < '0' || *p > '7') break;
	      c = (*p - '0') | (c << 3);
	      len--; p++; 
	      break;
	    case 'x': 
	      /* "\x{" opens a block of hex digit pairs, handled further below */
	      if (len > 0 && *p == '{')
		{
		  slash = 0;
		  hexblock = 1;
		  len--; p++;
		  continue;
		}
	      else
	      /* "\x" without any hex digit yields a plain 'x' */
	      if (len <= 0 || parse_hexval(*p) < 0) break;
	      c = parse_hexval(*p); len--; p++; 
	      if (len <= 0 || parse_hexval(*p) < 0) break;
	      c = parse_hexval(*p) | (c << 4); len--; p++; 
	      break;
	    case '@':
	      /* 
	       * If we are an in-place parser, we may not have 
	       * enough room for fancy expansions.
	       */
	      l = parse_expand_special(d, c, dst ? 40 : p - d);
	      /*
	       * NOTE(review): (c = '@') is an assignment, not a comparison.
	       * c already is '@' in this case, so the condition reduces to
	       * (l <= 0): nothing was expanded, store a plain '@'.
	       */
	      if ((l <= 0) && (c = '@'))
	        break;
	      /*
	       * Step back onto the last byte of the expansion, so that the
	       * common store below rewrites it and leaves d behind the text.
	       */
	      d += l; c = *(--d);
	      break;
	    }
	  *d++ = c;
	  slash = 0;
	  continue;
	}
      if (hexblock)
        {
	  /*
	   * Inside "\x{...}": pairs of hex digits become one byte each.
	   * Other characters are skipped, and so is a hex digit that is not
	   * directly followed by a second one.
	   */
	  if (c == '}')
	    {
	      hexblock = 0;
	      continue;
	    }
	  if (parse_hexval(c) < 0) continue;
	  c = parse_hexval(c);
	  if (len <= 0 || parse_hexval(*p) < 0) continue;
	  c = parse_hexval(*p) | (c << 4); len--; p++; 
	  *d++ = c;
	  continue;
	}
      /* no pending state: look for syntactic characters and separators */
      switch (c)
        {
        case  '\\':
	  if (squote)
	    *d++ = c;
	  else
	    slash = 1;
	  break;
	case '^':
	  if (squote)
	    *d++ = c;
	  else
	    caret = 1;
	  break;
        case '"':
	  if (squote)
	    *d++ = c;
	  else
	    dquote = 1 - dquote;
	  break;
        case '\'':
	  if (dquote)
	    *d++ = c;
	  else
	  squote = 1 - squote;
	  break;
	case '{':
	  if (dquote || squote) 
	    *d++ = c;
	  else
	    brace = 1;
	  break;
	default:
	  /* p already points behind the separator that ends the word */
	  if (!dquote && !squote && issep(c, seperator))
	    len = 0;	/* a sure way to break this loop */
	  else
	    *d++ = c;
	}
    }
  /*
   * NOTE(review): this looks unreachable -- the loop above runs at least
   * once whenever we get here, so p has moved beyond s.
   */
  if (s == p)
    {
      if (wlenp)
        *wlenp = 0;
      return NULL;
    }
  *blenp -= p - *pp;
  /*
   * An in-place parser may only write the '\0' byte if that stays within
   * the source buffer: either bytes remain, or the word got shorter.
   */
  if (dst || *blenp > 0 || d < p)
    *d = '\0';
  if (dst)
    {
      (*dst)->length = d - (*dst)->buf;	/* '\0'-byte *not* included */
      (*dst)->data = (void *)(d+1);	/* '\0'-byte included */
    }
  *pp = p;
  if (wlenp)
    *wlenp = d-ds;
  return ds;
}

/*
 * Scan a buffer with the quoting rules of parse_word() above, without
 * modifying it. Word boundaries are not recognized; the scan only stops at
 * an unquoted character from delim. Delim defaults to "\0\n\r;".
 * Characters in delim are not magic until the first character is seen that
 * is not in delim.
 *
 * parse_check() returns
 *  >= 0 the byte offset (counted from buf) where an unquoted character from
 *     delim was found. Checking ends there. 0 is also returned when the
 *     buffer holds nothing but delimiters.
 *  -1 if everything is correct, but no delimiter is found.
 *  -2 if the malformed / incomplete buffer should be corrected by appending 
 *     more text (open quote or brace, trailing backslash or caret).
 *  -3 if it can't become correct syntax by appending text. Currently can not
 *     happen. Excessive closing braces are - like in tcl - not magic.
 * 
 * Keep this code in sync with parse_word().
 */
int
parse_check(buf, len, delim)
char *buf;
int len;
char *delim;
{
  int depth = 0;	/* nesting level of open braces */
  int in_dq = 0;	/* inside "..." */
  int in_sq = 0;	/* inside '...' */
  int caret = 0;	/* previous character was an unquoted ^ */
  int esc = 0;		/* previous character was an unquoted \ */
  int pos = 0;
  int ch;

  if (!delim)
    delim = "\0\n\r;";

  /* skip leading delimiters */
  while (pos < len && issep(buf[pos], delim))
    {
      if (++pos == len)
        return 0;
    }

  for (; pos < len; pos++)
    {
      ch = buf[pos];

      if (depth)
        {
	  /* within braces only braces and backslashes matter */
	  if (esc)
	    esc = 0;
	  else if (ch == '{')
	    depth++;
	  else if (ch == '}')
	    depth--;
	  else if (ch == '\\')
	    esc = 1;
	  continue;
	}

      /* the character following ^ or \ is never magic */
      if (caret)
        {
	  caret = 0;
	  continue;
	}
      if (esc)
        {
	  esc = 0;
	  continue;
	}

      if (!in_sq)
        {
	  if (ch == '^')
	    {
	      caret = 1;
	      continue;
	    }
	  if (ch == '\\')
	    {
	      esc = 1;
	      continue;
	    }
	  if (ch == '"')
	    {
	      in_dq = !in_dq;
	      continue;
	    }
	}

      if (!in_dq && ch == '\'')
        {
	  in_sq = !in_sq;
	  continue;
	}

      /* braces and delimiters count outside of quotes only */
      if (!in_dq && !in_sq)
        {
	  if (ch == '{')
	    depth = 1;
	  else if (issep(ch, delim))
	    return pos;
	}
    }

  return (caret || esc || depth || in_dq || in_sq) ? -2 : -1;
}

/*
 * parse_needsq() is a predicate that checks if the string needs quoting when
 * presented to parse_word() or parse_check(). If len is negative, strlen is
 * called.
 *
 * If parse_needsq() returns 0, 
 *	the string will do fine as a bare word.
 * If parse_needsq() returns 1,
 *	buf contains multiple bare words and whitespace (blanks or tabs).
 * If parse_needsq() returns > 1, 
 *	putting it in braces is sufficient. This is the case when buf holds
 *	a quote, backslash, caret or newline character.
 * If parse_needsq() returns negative,
 *	you should process it by dstring_appendq()
 *	AND put double quotes around it. This is the case when buf holds
 *	control or other unprintable bytes, unbalanced braces, or ends in
 *	a backslash.
 *
 * The whole string is always examined, so the result does not depend on the
 * order of its characters: a quote character no longer hides unbalanced
 * braces or control characters that follow it. Like parse_word() does within
 * braces, a character preceded by a backslash is not counted as a brace. A
 * backslash at the very end would remove the magic of the closing brace that
 * the caller appends, thus braces are not sufficient there.
 *
 * NOTE(review): ';' (a default separator of parse_word()) and balanced
 * braces are still reported as 0 or 1 -- confirm that callers want this.
 */
int
parse_needsq(buf, len)
char *buf;
int len;
{
  unsigned char *p = (unsigned char *)buf;
  int depth = 0;	/* nesting level of unescaped braces */
  int blanks = 0;	/* number of blanks and tabs seen */
  int magic = 0;	/* seen something that braces can protect */
  int escaped = 0;	/* previous character was an unescaped backslash */

  if (len < 0)
    len = strlen(buf);

  for (; len > 0; len--, p++)
    {
      unsigned char ch = *p;

      /* newline is fine within braces, other unprintable bytes are not */
      if (ch == '\n')
        magic = 1;
      else if ((ch < ' ' && ch != '\t') || ch == 127 || ch == 128 || ch > 252)
        return -1;

      if (escaped)
        {
	  escaped = 0;		/* no brace counting after a backslash */
	  continue;
	}

      switch (ch)
        {
	case '\\':
	  escaped = 1;
	  magic = 1;
	  break;
	case '"':
	case '\'':
	case '^':
	  magic = 1;
	  break;
	case ' ':
	case '\t':
	  blanks++;
	  break;
	case '{':
	  depth++;
	  break;
	case '}':
	  if (--depth < 0)
	    return -1;		/* closing brace without an opening one */
	  break;
	default:
	  break;
	}
    }

  if (depth || escaped)
    return -1;
  if (magic)
    return 2;
  return blanks ? 1 : 0;
}

/* ============ jittr specific argument chopper ================= */

/*
 * Read position within the raw (not yet parsed) argument text of the current
 * command. NULL means "not started": jarg_reset() stores NULL here, and
 * jarg_rawp() / jarg_next_word() then restart at the data pointer of the
 * argument dstring. parse_word() advances it as words are consumed.
 */
static char     *jarg_raw = NULL;  /* points into exec_context->what->buf */

/*
 * Return the address of the raw argument pointer, so that a caller can read
 * or advance the position where jarg_next_word() will continue parsing.
 * If the pointer was not yet set, it is first set to the data pointer of
 * the argument dstring.
 *
 * If lenp is nonzero, the number of bytes from the raw pointer to the end
 * of the argument text is written to *lenp; 0 if there are no arguments.
 *
 * A missing exec_context is treated like a command without arguments, as
 * jarg_name() shows that exec_context may be NULL.
 */
char **
jarg_rawp(lenp)
int *lenp;
{
  dstring *args = exec_context ? *(struct dstring **)exec_context : NULL;

  if (!jarg_raw && args)
    jarg_raw = (char *)args->data;

  if (lenp)
    *lenp = args ? (args->buf + args->length - jarg_raw) : 0;

  return &jarg_raw;
}

/* 
 * Return the name of the running command: the text it was called with, if
 * that was recorded in the exec_context, otherwise the full name of its atom.
 * Without any exec_context, the string "????" is returned.
 */
char *
jarg_name()
{
  dstring *args;

  if (exec_context)
    {
      args = *(struct dstring **)exec_context;
      if (args)
        return args->buf;
      return atom_name(exec_context->which);
    }

  debug("jarg_name: no exec_context, cannot find command text\n");
  return "????";
}

/*
 * These static buffers are a hack: they save malloc and free cycles but
 * break under recursion. atom_exec() currently forbids recursion by
 * asserting that jarg_reset() returns 0 before calling a callback.
 *
 * If you need recursion, change atom_exec() to provide a buffer from
 * its stack or to cause push/pop operations on a stack implemented here.
 * 
 * For today, I need the value returned by jarg_first_word() to be still
 * valid after jarg_next_word() was called. This requires separate dstrings for
 * each word, because the entire dstring can move when appended to.
 * 
 * A chained list of dstring structures is used here.   None of the buffers is
 * ever freed or shortened. They always grow. If you want to implement a
 * garbage collector, start here, continue with client I/O buffers and then
 * with undead clients.
 */

/* One element of the chained list of word buffers. */
struct jarg_node
{
  struct dstring *buf;		/* receives one parsed word from parse_word() */
  struct jarg_node *next;	/* buffer for the following word, or NULL */
};

/* head of the list; its buffer is the one used by jarg_first_word() */
static struct jarg_node *jarg_av0 = NULL;
/* address of the link to the node that the next jarg_next_word() will use */
static struct jarg_node **jarg_ap = &jarg_av0;

/*
 * Make all word buffers unused again and forget the raw argument position.
 * Returns 0 if the word buffers were unused already, nonzero if the list
 * position had to be rewound to the first buffer.
 */
int
jarg_reset()
{
  int rewound;

  ASSERT(jarg_ap);	/* must always contain an address */

  jarg_raw = NULL;
  rewound = (*jarg_ap != jarg_av0);
  if (rewound)
    jarg_ap = &jarg_av0;
  return rewound;
}

/*
 * Parse the next word from the raw arguments of the current command into
 * a buffer of its own and return a pointer to it. NULL is returned when
 * there is no further word, no argument text, no exec_context, or no memory
 * for a new list node.
 *
 * If lenp is nonzero, the length of the word is written to *lenp; it is
 * set to 0 whenever NULL is returned, including the out-of-memory case.
 *
 * Every call uses the dstring of the next node of the list, so that words
 * returned earlier remain untouched by later calls. Nodes are allocated on
 * demand and are kept for reuse after jarg_reset().
 *
 * NOTE(review): parse_word() writes at the data pointer of the dstring, and
 * nothing visible here rewinds that pointer when a node is reused after
 * jarg_reset() -- presumably every reuse appends behind the previous word;
 * confirm whether this growth is intended.
 */
char *
jarg_next_word(lenp)
int *lenp;
{
  dstring *args = exec_context ? *(struct dstring **)exec_context : NULL;
  char *s = NULL;
  int rest = 0;

  if (args)
    {
      if (!jarg_raw) 
        jarg_raw = (char *)args->data;
      rest = args->length - (jarg_raw - args->buf);
      if (!*jarg_ap)
        *jarg_ap = (struct jarg_node *)calloc(1, sizeof(struct jarg_node));
      if (*jarg_ap)
        {
	  s = parse_word(&(*jarg_ap)->buf, &jarg_raw, &rest, lenp, NULL);
	  /* only advance if parse_word() provided a dstring for this node */
	  if ((*jarg_ap)->buf)
	    jarg_ap = &((*jarg_ap)->next);
	}
      /* else: out of memory, fall through and report "no word" */
    }
  if (!s && lenp)
    *lenp = 0;
  return s;
}

/*
 * Restart argument parsing: make all word buffers unused via jarg_reset(),
 * then return the first word exactly as jarg_next_word() delivers it.
 */
char *
jarg_first_word(lenp)
int *lenp;
{
  char *word;

  (void)jarg_reset();
  word = jarg_next_word(lenp);
  return word;
}
