45872bfa8b5701e2da983a5052c9f5c18c661621 kent Thu Dec 9 12:54:18 2021 -0800 Adding option for tokenizer to handle a pretty good spectrum of two character operations. diff --git src/lib/tokenizer.c src/lib/tokenizer.c index 7fc9e41..0a5c5de 100644 --- src/lib/tokenizer.c +++ src/lib/tokenizer.c @@ -1,216 +1,243 @@ /* tokenizer - A tokenizer structure that will chop up file into * tokens. It is aware of quoted strings and otherwise tends to return * white-space or punctuated-separated words, with punctuation in * a separate token. This is used by autoSql. */ /* Copyright (C) 2011 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "errAbort.h" #include "linefile.h" #include "tokenizer.h" struct tokenizer *tokenizerOnLineFile(struct lineFile *lf) /* Create a new tokenizer on an already open lineFile. The token buffer starts at 128 bytes. curLine and linePt start as an empty string, so the first tokenizerNext call has to read a line. tokenizerFree closes lf. */ { struct tokenizer *tkz; AllocVar(tkz); tkz->sAlloc = 128; tkz->string = needMem(tkz->sAlloc); tkz->lf = lf; tkz->curLine = tkz->linePt = ""; return tkz; } struct tokenizer *tokenizerNew(char *fileName) /* Open fileName with lineFileOpen and return a new tokenizer reading from it. */ { return tokenizerOnLineFile(lineFileOpen(fileName, TRUE)); } void tokenizerFree(struct tokenizer **pTkz) /* Tear down a tokenizer: free its token buffer, close its lineFile, and release the struct through freez(pTkz). Does nothing when *pTkz is NULL. */ { struct tokenizer *tkz; if ((tkz = *pTkz) != NULL) { freeMem(tkz->string); lineFileClose(&tkz->lf); freez(pTkz); } } void tokenizerReuse(struct tokenizer *tkz) /* Make the next tokenizerNext call return the current token again instead of reading a new one. Ignored once eof is set. */ { if (!tkz->eof) tkz->reuse = TRUE; } int tokenizerLineCount(struct tokenizer *tkz) /* Return the current line index (lineIx) of the underlying lineFile. */ { return tkz->lf->lineIx; } char *tokenizerFileName(struct tokenizer *tkz) /* Return the file name recorded in the underlying lineFile. */ { return tkz->lf->fileName; } char *tokenizerNext(struct tokenizer *tkz) /* Return the next token (also available as tkz->string) or NULL at EOF. The result lives in a buffer owned by the tokenizer and is overwritten by the following call. 
tkz->leadingSpaces is reset and then counts the white space skipped ahead of the token, plus one for each line read by the outer skip loop. */ { char *start, *end; char c, *s; int size; if (tkz->reuse) { tkz->reuse = FALSE; return tkz->string; } tkz->leadingSpaces = 0; for (;;) /* Skip over white space and comments. */ { int lineSize; s = start = skipLeadingSpaces(tkz->linePt); tkz->leadingSpaces += s - tkz->linePt; if ((c = start[0]) != 0) { if (tkz->uncommentC && c == '/') { if (start[1] == '/') ; /* Double-slash comment: keep going in loop, effectively ignoring rest of line. */ else if (start[1] == '*') { /* Block comment: search for the closing marker, reading further lines as needed, then resume scanning right after it. */ start += 2; for (;;) { char *end = stringIn("*/", start); if (end != NULL) { tkz->linePt = end+2; break; } if (!lineFileNext(tkz->lf, &tkz->curLine, &lineSize)) errAbort("End of file (%s) in comment", tokenizerFileName(tkz)); start = tkz->curLine; } continue; } else break; /* A lone slash is an ordinary token. */ } else if (tkz->uncommentShell && c == '#') ; /* Shell-style comment: keep going in loop, effectively ignoring rest of line. */ else break; /* Got something real. */ } if (!lineFileNext(tkz->lf, &tkz->curLine, &lineSize)) { tkz->eof = TRUE; return NULL; } tkz->leadingSpaces += 1; /* Count the line break. */ tkz->linePt = tkz->curLine; } /* Word token: a run of alphanumerics and underscores. NOTE(review): c is a plain char handed to isalnum; confirm input never has bytes that are negative as char. */ if (isalnum(c) || (c == '_')) { for (;;) { s++; if (!(isalnum(*s) || (*s == '_'))) break; } end = s; } else if (c == '"' || c == '\'') { /* Quoted string, single or double quoted. With leaveQuotes the token keeps both quote characters, otherwise they are stripped. */ char quot = c; if (tkz->leaveQuotes) start = s++; else start = ++s; for (;;) { c = *s; if (c == quot) { /* A backslash right before the quote escapes it, unless that backslash is itself preceded by a backslash. */ if (s[-1] == '\\') { if (s >= start+2 && s[-2] == '\\') break; } else break; } else if (c == 0) { /* Ran out of line before a closing quote was found. */ break; } ++s; } end = s; if (c != 0) ++s; /* Step past the closing quote. */ if (tkz->leaveQuotes) end += 1; } -else +else /* Punctuation and anything else. Normally a single character is the token. With twoCharOps set, 
the pairs >= <= == != ++ -- || && come back as one two-character token. */ + { + if (tkz->twoCharOps) + { + int opSize = 1; + switch (c) { + case '>': + case '<': + case '=': + case '!': /* Comparison style operators: two characters when followed by an equals sign. */ + if (start[1] == '=') + opSize = 2; + break; + case '+': + case '-': + case '|': + case '&': /* Doubled operators: two characters when the same character repeats. */ + if (start[1] == c) + opSize = 2; + break; + default: + opSize = 1; + break; + } + s += opSize; + end = s; + } + else end = ++s; } tkz->linePt = s; size = end - start; /* Grow the buffer when needed so the token and its zero terminator both fit. */ if (size >= tkz->sAlloc) { tkz->sAlloc = size+128; tkz->string = needMoreMem(tkz->string, 0, tkz->sAlloc); } memcpy(tkz->string, start, size); tkz->string[size] = 0; return tkz->string; } void tokenizerErrAbort(struct tokenizer *tkz, char *format, ...) /* Pass the formatted message to vaWarn, then errAbort with the line number, the file name, and the text of the current line. */ { va_list args; va_start(args, format); vaWarn(format, args); /* NOTE(review): there is no va_end(args) before errAbort; presumably errAbort never returns - confirm. */ errAbort("line %d of %s:\n%s", tokenizerLineCount(tkz), tokenizerFileName(tkz), tkz->curLine); } void tokenizerNotEnd(struct tokenizer *tkz) /* Abort with an error if the tokenizer has reached end of file. */ { if (tkz->eof) errAbort("Unexpected end of input line %d of %s", tkz->lf->lineIx, tkz->lf->fileName); } char *tokenizerMustHaveNext(struct tokenizer *tkz) /* Get the next token, aborting with an error if there is none. */ { char *s = tokenizerNext(tkz); if (s == NULL) errAbort("Unexpected end of input line %d of %s", tkz->lf->lineIx, tkz->lf->fileName); return s; } void tokenizerMustMatch(struct tokenizer *tkz, char *string) /* Require the current token (tkz->string) to match string, as compared by sameWord. If it matches, advance to the next token, which must exist. Otherwise abort with an error naming what was expected and what was found. Returns nothing. */ { if (sameWord(tkz->string, string)) tokenizerMustHaveNext(tkz); else tokenizerErrAbort(tkz, "Expecting %s got %s", string, tkz->string); }