45872bfa8b5701e2da983a5052c9f5c18c661621
kent
  Thu Dec 9 12:54:18 2021 -0800
Adding a twoCharOps option so the tokenizer returns common two-character operators (>=, <=, ==, !=, ++, --, ||, &&) as single tokens.

diff --git src/lib/tokenizer.c src/lib/tokenizer.c
index 7fc9e41..0a5c5de 100644
--- src/lib/tokenizer.c
+++ src/lib/tokenizer.c
@@ -1,216 +1,243 @@
 /* tokenizer - A tokenizer structure that will chop up file into
  * tokens.  It is aware of quoted strings and otherwise tends to return
  * white-space or punctuation-separated words, with punctuation in
  * a separate token.  This is used by autoSql. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "errAbort.h"
 #include "linefile.h"
 #include "tokenizer.h"
 
 
 struct tokenizer *tokenizerOnLineFile(struct lineFile *lf)
 /* Allocate a tokenizer that reads from the already-open lineFile lf.
  * The token buffer starts out at 128 bytes, and both line pointers
  * start on the same empty string. */
 {
 struct tokenizer *tokenizer;
 int initialSize = 128;
 AllocVar(tokenizer);
 tokenizer->lf = lf;
 tokenizer->sAlloc = initialSize;
 tokenizer->string = needMem(initialSize);
 tokenizer->linePt = "";
 tokenizer->curLine = tokenizer->linePt;
 return tokenizer;
 }
 
 struct tokenizer *tokenizerNew(char *fileName)
 /* Open fileName with lineFileOpen and return a new tokenizer on it. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 return tokenizerOnLineFile(lf);
 }
 
 void tokenizerFree(struct tokenizer **pTkz)
 /* Free the token buffer, close the lineFile, and release the tokenizer
  * itself through freez(pTkz).  Does nothing when *pTkz is NULL. */
 {
 struct tokenizer *tokenizer = *pTkz;
 if (tokenizer == NULL)
     return;
 freeMem(tokenizer->string);
 lineFileClose(&tokenizer->lf);
 freez(pTkz);
 }
 
 void tokenizerReuse(struct tokenizer *tkz)
 /* Set the reuse flag, which makes the next tokenizerNext call return
  * the current token again.  The flag is left alone once eof is set. */
 {
 if (tkz->eof)
     return;
 tkz->reuse = TRUE;
 }
 
 int tokenizerLineCount(struct tokenizer *tkz)
 /* Return line of current token. */
 {
 return tkz->lf->lineIx;
 }
 
 char *tokenizerFileName(struct tokenizer *tkz)
 /* Return name of file. */
 {
 return tkz->lf->fileName;
 }
 
 char *tokenizerNext(struct tokenizer *tkz)
 /* Return token's next string (also available as tkz->string) or
  * NULL at EOF.  If tkz->reuse is set it is cleared and the current
  * tkz->string is returned again without reading anything.  Otherwise
  * white space is skipped, along with C-style comments when
  * tkz->uncommentC is set and '#' comments when tkz->uncommentShell is
  * set, and one token is copied into tkz->string: a run of alphanumerics
  * and underscores, a quoted string, or punctuation.  tkz->leadingSpaces
  * is reset and then counts the white space skipped before the token,
  * plus one for each new line read by the outer loop.  The returned
  * buffer is overwritten by the next call. */
 {
 char *start, *end;
 char c, *s;
 int size;
 /* An earlier tokenizerReuse call asked for the same token again. */
 if (tkz->reuse)
     {
     tkz->reuse = FALSE;
     return tkz->string;
     }
 tkz->leadingSpaces = 0;
 for (;;)	/* Skip over white space and comments. */
     {
     int lineSize;
     s = start = skipLeadingSpaces(tkz->linePt);
     tkz->leadingSpaces += s - tkz->linePt;
     if ((c = start[0]) != 0)
 	{
 	if (tkz->uncommentC && c == '/')
 	     {
 	     if (start[1] == '/')
 		 ;  /* Keep going in loop effectively ignoring rest of line. */
 	     else if (start[1] == '*')
 		 {
 		 start += 2;
 		 /* Search for the comment terminator, reading more lines as needed. */
 		 for (;;)
 		     {
 		     char *end = stringIn("*/", start);
 		     if (end != NULL)
 			  {
 			  /* Resume scanning just past the terminator. */
 			  tkz->linePt = end+2;
 			  break;
 			  }
 		     if (!lineFileNext(tkz->lf, &tkz->curLine, &lineSize))
 			  errAbort("End of file (%s) in comment", tokenizerFileName(tkz));
 		     start = tkz->curLine;
 		     }
 		 continue;
 		 }
 	     else
 		 break;	/* A lone '/' is an ordinary punctuation token. */
 	     }
 	else if (tkz->uncommentShell && c == '#')
 	     ;  /* Keep going in loop effectively ignoring rest of line. */
 	else
 	    break;	/* Got something real. */
 	}
     /* Nothing more to use on this line: fetch the next one, or report EOF. */
     if (!lineFileNext(tkz->lf, &tkz->curLine, &lineSize))
 	{
 	tkz->eof = TRUE;
 	return NULL;
 	}
     tkz->leadingSpaces += 1;
     tkz->linePt = tkz->curLine;
     }
 /* Word token: a run of alphanumerics and underscores. */
 if (isalnum(c) || (c == '_'))
     {
     for (;;)
 	{
         s++;
 	if (!(isalnum(*s) || (*s == '_')))
 	    break;
 	}
     end = s;
     }
 /* Quoted string, delimited by a matching single or double quote. */
 else if (c == '"' || c == '\'')
     {
     char quot = c;
     /* The token includes the opening quote only when leaveQuotes is set. */
     if (tkz->leaveQuotes)
 	start = s++;
     else
 	start = ++s;
     for (;;)
 	{
 	c = *s;
 	if (c == quot)
 	    {
 	    /* A quote right after a backslash does not end the string,
 	     * unless that backslash is itself preceded by a backslash. */
 	    if (s[-1] == '\\')
 		{
 		if (s >= start+2 && s[-2] == '\\')
 		    break;
 		}
 	    else
 		break;
 	    }
 	else if (c == 0)
 	    {
 	    /* Hit the end of the line without finding a closing quote. */
 	    break;
 	    }
 	++s;
 	}
     end = s;
     /* Step past the closing quote if one was found. */
     if (c != 0)
 	++s;
     /* Take in the closing quote as part of the token.
      * NOTE(review): when the string is unterminated this extends the
      * token over the terminating zero instead - confirm that is intended. */
     if (tkz->leaveQuotes)
 	end += 1;
     }
-else
+else /* case for punctuation etc. */
+    {
+    if (tkz->twoCharOps)
+        {
+	int opSize = 1;
+	switch (c)
 	   {
+	   case '>':
+	   case '<':
+	   case '=':
+	   case '!':
+	       if (start[1] == '=')	/* >= <= == != */
+		   opSize = 2;
+	       break;
+	   case '+':
+	   case '-':
+	   case '|':
+	   case '&':
+	       if (start[1] == c)	/* ++ -- || && */
+		   opSize = 2;
+	       break;
+	   default:
+	       opSize = 1;
+	       break;
+	   }
+	s += opSize;
+	end = s;
+	}
+    else
 	end = ++s;
     }
 /* Remember where the next call should resume scanning. */
 tkz->linePt = s;
 size = end - start;
 /* Grow the buffer so the token and its terminating zero both fit. */
 if (size >= tkz->sAlloc)
     {
     tkz->sAlloc = size+128;
     tkz->string = needMoreMem(tkz->string, 0, tkz->sAlloc);
     }
 memcpy(tkz->string, start, size);
 tkz->string[size] = 0;
 return tkz->string;
 }
 
 
 void tokenizerErrAbort(struct tokenizer *tkz, char *format, ...)
 /* Print the formatted message through vaWarn, then abort with the
  * line number, file name, and text of the tokenizer's current line. */
 {
 va_list args;
 va_start(args, format);
 vaWarn(format, args);
 /* Pair va_start with va_end here, before the errAbort call below. */
 va_end(args);
 errAbort("line %d of %s:\n%s",
 	tokenizerLineCount(tkz), tokenizerFileName(tkz), tkz->curLine);
 }
 
 void tokenizerNotEnd(struct tokenizer *tkz)
 /* Abort with an error message if the tokenizer's eof flag is set. */
 {
 if (!tkz->eof)
     return;
 errAbort("Unexpected end of input line %d of %s", tokenizerLineCount(tkz), tokenizerFileName(tkz));
 }
 
 char *tokenizerMustHaveNext(struct tokenizer *tkz)
 /* Advance to the next token and return it.  Abort with an error
  * message if tokenizerNext returns NULL. */
 {
 char *token = tokenizerNext(tkz);
 if (token != NULL)
     return token;
 errAbort("Unexpected end of input line %d of %s", tokenizerLineCount(tkz), tokenizerFileName(tkz));
 return token;
 }
 
 void tokenizerMustMatch(struct tokenizer *tkz, char *string)
 /* Require the current token (tkz->string) to match string according
  * to sameWord.  On a match, advance to the next token, which must
  * exist.  Otherwise abort with a message giving the expected and
  * actual tokens. */
 {
 if (!sameWord(tkz->string, string))
     tokenizerErrAbort(tkz, "Expecting %s got %s", string, tkz->string);
 else
     tokenizerMustHaveNext(tkz);
 }