src/utils/raSqlQuery/raSqlQuery.c 1.1
1.1 2009/11/20 00:24:55 kent
Starting work on raSqlQuery.
Index: src/utils/raSqlQuery/raSqlQuery.c
===================================================================
RCS file: src/utils/raSqlQuery/raSqlQuery.c
diff -N src/utils/raSqlQuery/raSqlQuery.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/utils/raSqlQuery/raSqlQuery.c 20 Nov 2009 00:24:55 -0000 1.1
@@ -0,0 +1,658 @@
+/* raSqlQuery - Do a SQL-like query on a RA file. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "obscure.h"
+#include "ra.h"
+#include "localmem.h"
+#include "tokenizer.h"
+#include "sqlNum.h"
+
+static char const rcsid[] = "$Id$"; /* Revision-control keyword string; nothing else in this file reads it. */
+
+void usage()
+/* Hand the command-line help text to errAbort, which reports it and exits. */
+{
+static char *helpText =
+    "raSqlQuery - Do a SQL-like query on a RA file.\n"
+    "usage:\n"
+    " raSqlQuery raFile(s) query-options\n"
+    "One of the following query-options must be specified\n"
+    " -queryFile=fileName\n"
+    " \"-query=select list,of,fields where field='this'\"\n"
+    "The queryFile just has a query in it in the same form as the query option, but\n"
+    "without needing the quotes necessarily.\n"
+    " The syntax of a query statement is very SQL-like. It must begin with either\n"
+    "'select' or 'count'. Select is followed by a field list, or '*' for all fields\n"
+    "Count is not followed by anything. The 'where' clause is optional, and if it\n"
+    "exists it can contain expressions involving fields, numbers, strings, arithmetic, 'and'\n"
+    "'or' and so forth.\n"
+    "Other options:\n"
+    " -merge=keyField - If there are multiple raFiles, records with the same keyField will be\n"
+    " merged together with fields in later files overriding fields in earlier files\n"
+    " -skipMissing - If merging, skip records without keyfield rather than abort\n"
+    "The output will be to stdout, in the form of a .ra file if the select command is used\n"
+    "and just a simple number if the count command is used\n";
+errAbort("%s", helpText);
+}
+
+static char *clQueryFile = NULL; /* Value of -queryFile; assigned in main. */
+static char *clQuery = NULL; /* Value of -query; assigned in main. */
+static char *clMerge = NULL; /* Value of -merge; assigned in main and passed on as mergeField. */
+static boolean clSkipMissing = FALSE; /* Whether -skipMissing was given; assigned in main, not read anywhere else in this file. */
+
+static struct optionSpec options[] = { /* Option table handed to optionInit in main. */
+ {"queryFile", OPTION_STRING},
+ {"query", OPTION_STRING},
+ {"merge", OPTION_STRING},
+ {"skipMissing", OPTION_BOOLEAN},
+ {NULL, 0},
+};
+
+struct raField
+/* A single name/value field of an RA record. */
+ {
+ struct raField *next; /* Next in list. */
+ char *name; /* Field name: the first word of the field's line. */
+ char *val; /* Field value: rest of the line after the name, leading spaces skipped. */
+ };
+
+struct raRecord
+/* A single RA record: the fields read from one stanza of an RA file. */
+ {
+ struct raRecord *next; /* Next in list. */
+ struct raField *key; /* Key field if any.  No code in this file assigns it yet. */
+ struct raField *fieldList; /* List of fields, in the order they were read. */
+ };
+
+enum rqlParseOp /* Operation stored in the op member of a struct rqlParse node. */
+ {
+ rqlParseUnknown, /* Should not occur */
+ rqlParseLiteral, /* Literal string or number. */
+ rqlParseSymbol, /* A symbol name; evaluated by looking up the field of that name. */
+ rqlParseEq, /* An equals comparison */
+ rqlParseNe, /* A not equals comparison */
+ rqlParseAnd, /* An and.  Not produced or evaluated by the code in this file yet. */
+ rqlParseOr, /* An or.  Not produced or evaluated by the code in this file yet. */
+ };
+
+char *rqlParseOpToString(enum rqlParseOp op)
+/* Give back the enum identifier of op spelled out as a string constant.
+ * rqlParseUnknown, and any value that is not one of the named ops, both
+ * come back as "rqlParseUnknown".  The result is a string literal, so it
+ * must not be modified or freed. */
+{
+if (op == rqlParseLiteral)
+    return "rqlParseLiteral";
+if (op == rqlParseSymbol)
+    return "rqlParseSymbol";
+if (op == rqlParseEq)
+    return "rqlParseEq";
+if (op == rqlParseNe)
+    return "rqlParseNe";
+if (op == rqlParseAnd)
+    return "rqlParseAnd";
+if (op == rqlParseOr)
+    return "rqlParseOr";
+/* Everything else, rqlParseUnknown included, ends up here. */
+return "rqlParseUnknown";
+}
+
+enum rqlType
+/* Type tag saying which member of a union rqlVal holds the value. */
+ {
+ rqlTypeBoolean = 1, /* Value is in val.b */
+ rqlTypeString = 2, /* Value is in val.s */
+ rqlTypeInt = 3, /* Value is in val.i */
+ rqlTypeDouble = 4, /* Value is in val.x */
+ };
+
+union rqlVal
+/* Some value of arbitrary type that can be of any type corresponding to rqlType */
+ {
+ boolean b; /* Used with rqlTypeBoolean. */
+ char *s; /* Used with rqlTypeString. */
+ int i; /* Used with rqlTypeInt. */
+ double x; /* Used with rqlTypeDouble. */
+ };
+
+struct rqlEval
+/* Result of evaluation of parse tree: a value together with its type tag. */
+ {
+ enum rqlType type; /* Says which member of val is meaningful. */
+ union rqlVal val; /* The value itself. */
+ };
+
+struct rqlParse
+/* A rql parse-tree node.  Comparison nodes hold their two operands as children. */
+ {
+ struct rqlParse *next; /* Points to younger sibling if any. */
+ struct rqlParse *children; /* Points to oldest child if any. */
+ enum rqlParseOp op; /* Operation at this node. */
+ enum rqlType type; /* Return type of this operation. */
+ union rqlVal val; /* Literal value for a literal node; field name (val.s) for a symbol node. */
+ };
+
+void rqlValDump(union rqlVal val, enum rqlType type, FILE *f)
+/* Write val to f in the form chosen by type: a boolean as the word
+ * true or false, a string as is, an int with %d and a double with %f.
+ * No newline or other separator is added.  A type that is not one of
+ * the four rqlType values writes nothing at all. */
+/* The member of val that is read is always the one type names. */
+{
+if (type == rqlTypeBoolean)
+    {
+    char *word = (val.b ? "true" : "false");
+    fprintf(f, "%s", word);
+    }
+else if (type == rqlTypeString)
+    fprintf(f, "%s", val.s);
+else if (type == rqlTypeInt)
+    fprintf(f, "%d", val.i);
+else if (type == rqlTypeDouble)
+    fprintf(f, "%f", val.x);
+}
+
+struct rqlStatement
+/* A parsed out RQL statement */
+    {
+    struct rqlStatement *next;     /* Next in list.  Points at a statement, not at a char. */
+    char *command;                 /* Generally the first word in the statement. */
+    struct slName *fieldList;      /* List of fields if any. */
+    struct rqlParse *whereClause;  /* Where clause if any - parsed into a tree. */
+    };
+
+void rqlParseDump(struct rqlParse *p, int depth, FILE *f)
+/* Print p's op name and value on one line after spaceOut(f, 3*depth), then each child at depth+1. */
+{
+struct rqlParse *kid = p->children;
+spaceOut(f, 3*depth);
+fprintf(f, "%s ", rqlParseOpToString(p->op));
+rqlValDump(p->val, p->type, f);
+fprintf(f, "\n");
+for ( ; kid != NULL; kid = kid->next)
+    rqlParseDump(kid, depth+1, f);
+}
+
+struct rqlParse *rqlParseAtom(struct tokenizer *tkz)
+/* Turn the next token into a leaf node: quoted string literal, symbol, or number
+ * (double if the token contains a '.', int otherwise).  Aborts on anything else. */
+{
+char *tok = tokenizerMustHaveNext(tkz);
+struct rqlParse *p;
+AllocVar(p);
+/* isalpha/isdigit are undefined for negative arguments, so classify the byte as unsigned. */
+unsigned char c = tok[0];
+if (c == '\'' || c == '"')
+    {
+    p->op = rqlParseLiteral;
+    p->type = rqlTypeString;
+    int len = strlen(tok+1);    /* Length after the opening quote, closing quote included. */
+    p->val.s = cloneStringZ(tok+1, len-1);
+    }
+else if (isalpha(c) || c == '_')
+    {
+    p->op = rqlParseSymbol;
+    p->type = rqlTypeString;    /* String until promoted at least. */
+    p->val.s = cloneString(tok);
+    }
+else if (isdigit(c))
+    {
+    p->op = rqlParseLiteral;
+    if (strchr(tok, '.'))
+        {
+        p->type = rqlTypeDouble;
+        p->val.x = sqlDouble(tok);
+        }
+    else
+        {
+        p->type = rqlTypeInt;
+        p->val.i = sqlUnsigned(tok);
+        }
+    }
+else
+    errAbort("Unexpected %s line %d of %s", tok, tkz->lf->lineIx, tkz->lf->fileName);
+return p;
+}
+
+static void expectingGot(struct tokenizer *tkz, char *expecting, char *got)
+/* Abort, naming what was wanted, what was found, and the tokenizer's current line and file. */
+{
+struct lineFile *lf = tkz->lf;
+errAbort("Expecting %s, got %s, line %d of %s", expecting, got, lf->lineIx, lf->fileName);
+}
+
+static void skipOverRequired(struct tokenizer *tkz, char *expecting)
+/* Advance one token; abort through expectingGot unless that token equals expecting. */
+{
+tokenizerMustHaveNext(tkz);
+char *got = tkz->string;
+if (!sameString(got, expecting)) expectingGot(tkz, expecting, got);
+}
+
+struct rqlParse *rqlParseCmp(struct tokenizer *tkz)
+/* Parse an atom, optionally followed by '=' or '!' '=' and a second atom.
+ * With a comparison, returns a new boolean rqlParseEq/rqlParseNe node whose
+ * children are the two atoms.  Without one, the token that was looked at
+ * (if any) is handed back to the tokenizer and the lone atom is returned. */
+{
+struct rqlParse *left = rqlParseAtom(tkz);
+char *tok = tokenizerNext(tkz);
+if (tok == NULL)
+    return left;
+enum rqlParseOp op = rqlParseUnknown;
+if (sameString(tok, "="))
+    op = rqlParseEq;
+else if (sameString(tok, "!"))
+    {
+    op = rqlParseNe;
+    skipOverRequired(tkz, "=");
+    }
+else
+    {
+    tokenizerReuse(tkz);
+    return left;
+    }
+struct rqlParse *right = rqlParseAtom(tkz);
+struct rqlParse *cmp;
+AllocVar(cmp);
+cmp->op = op;
+cmp->type = rqlTypeBoolean;
+cmp->children = left;
+left->next = right;
+return cmp;
+}
+
+struct rqlParse *rqlParseClause(struct tokenizer *tkz)
+/* Parse out a clause, usually a where clause.  For now this only forwards to rqlParseCmp. */
+{
+return rqlParseCmp(tkz);
+}
+
+void rqlStatementDump(struct rqlStatement *rql, FILE *f)
+/* Write the command, the comma-joined field list if any, and the where-clause tree if any to f. */
+{
+struct slName *field = rql->fieldList;
+fprintf(f, "%s", rql->command);
+if (field != NULL)
+    {
+    fprintf(f, " ");
+    fprintf(f, "%s", field->name);
+    while ((field = field->next) != NULL)
+        fprintf(f, ",%s", field->name);
+    }
+if (rql->whereClause != NULL)
+    {
+    fprintf(f, " where:\n");
+    rqlParseDump(rql->whereClause, 0, f);
+    }
+fprintf(f, "\n");
+}
+
+struct rqlEval rqlEvalOnRecord(struct rqlParse *p, struct raRecord *ra);
+/* Evaluate self on ra.  Declared ahead of its definition because rqlEvalEq calls it first. */
+
+struct rqlEval rqlEvalCoerceToBoolean(struct rqlEval r)
+/* Convert r to a boolean eval.  A string is true when it is non-NULL and
+ * non-empty, an int or double is true when it is non-zero, and a boolean
+ * keeps its value.  Aborts on a type that is not one of the four rqlType
+ * values. */
+{
+boolean truth = FALSE;
+
+if (r.type == rqlTypeBoolean)
+    truth = r.val.b;    /* It's already done. */
+else if (r.type == rqlTypeString)
+    truth = (r.val.s != NULL && r.val.s[0] != 0);
+else if (r.type == rqlTypeInt)
+    truth = (r.val.i != 0);
+else if (r.type == rqlTypeDouble)
+    truth = (r.val.x != 0.0);
+else
+    errAbort("Unknown type %d", r.type);
+
+/* Store the verdict and retag; this overwrites the union member read above. */
+r.val.b = truth;
+r.type = rqlTypeBoolean;
+return r;
+}
+
+struct rqlEval rqlEvalCoerceToInt(struct rqlEval r)
+/* Convert r to an int eval.  A boolean is copied over as is, a double is
+ * converted by plain C assignment (the fraction is dropped), a string goes
+ * through atoi, and an int keeps its value.  Aborts on a type that is not
+ * one of the four rqlType values. */
+{
+int n = 0;
+
+if (r.type == rqlTypeBoolean)
+    n = r.val.b;
+else if (r.type == rqlTypeInt)
+    n = r.val.i;        /* It's already done. */
+else if (r.type == rqlTypeDouble)
+    n = r.val.x;
+else if (r.type == rqlTypeString)
+    n = atoi(r.val.s);
+else
+    errAbort("Unknown type %d", r.type);
+
+/* Store the number and retag; this overwrites the union member read above. */
+r.val.i = n;
+r.type = rqlTypeInt;
+return r;
+}
+
+struct rqlEval rqlEvalCoerceToFloat(struct rqlEval r)
+/* Convert r to a double eval.  A boolean or an int is widened by plain C
+ * assignment, a string goes through atof, and a double keeps its value.
+ * Aborts on a type that is not one of the four rqlType values.  The
+ * result is always tagged rqlTypeDouble. */
+{
+double x = 0;
+
+if (r.type == rqlTypeBoolean)
+    x = r.val.b;
+else if (r.type == rqlTypeInt)
+    x = r.val.i;
+else if (r.type == rqlTypeDouble)
+    x = r.val.x;        /* It's already done. */
+else if (r.type == rqlTypeString)
+    x = atof(r.val.s);
+else
+    errAbort("Unknown type %d", r.type);
+
+/* Store the number and retag; this overwrites the union member read above. */
+r.val.x = x;
+r.type = rqlTypeDouble;
+return r;
+}
+
+struct raField *raRecordField(struct raRecord *ra, char *fieldName)
+/* Walk ra's field list from the front and return the first field whose
+ * name is the same string as fieldName.  Returns NULL when no field of
+ * that name is present. */
+{
+struct raField *field = ra->fieldList;
+while (field != NULL && !sameString(field->name, fieldName))
+    field = field->next;
+/* Either the first match, or NULL after running off the end of the list. */
+return field;
+}
+
+struct rqlEval rqlEvalEq(struct rqlParse *p, struct raRecord *ra)
+/* Return boolean TRUE if the two children of p evaluate equal on ra.  Operands
+ * of different types are cast before comparing: if either is a double both go
+ * through rqlEvalCoerceToFloat, else if either is an int both go through
+ * rqlEvalCoerceToInt (so string fields convert the way atof/atoi do).  A
+ * boolean against a string is still rejected as mixed types. */
+{
+struct rqlParse *lp = p->children;
+struct rqlParse *rp = lp->next;
+struct rqlEval lv = rqlEvalOnRecord(lp, ra);
+struct rqlEval rv = rqlEvalOnRecord(rp, ra);
+if (lv.type != rv.type)
+    {
+    boolean toDouble = (lv.type == rqlTypeDouble || rv.type == rqlTypeDouble);
+    if (!toDouble && lv.type != rqlTypeInt && rv.type != rqlTypeInt)
+        errAbort("Mixed types in rqlParseEquals");
+    lv = (toDouble ? rqlEvalCoerceToFloat(lv) : rqlEvalCoerceToInt(lv));
+    rv = (toDouble ? rqlEvalCoerceToFloat(rv) : rqlEvalCoerceToInt(rv));
+    }
+struct rqlEval res;
+res.type = rqlTypeBoolean;
+switch (lv.type)
+    {
+    case rqlTypeBoolean: res.val.b = (lv.val.b == rv.val.b); break;
+    case rqlTypeString: res.val.b = sameString(lv.val.s, rv.val.s); break;
+    case rqlTypeInt: res.val.b = (lv.val.i == rv.val.i); break;
+    case rqlTypeDouble: res.val.b = (lv.val.x == rv.val.x); break;
+    default: errAbort("Unknown type %d", lv.type); res.val.b = FALSE; break;
+    }
+return res;
+}
+
+struct rqlEval rqlEvalOnRecord(struct rqlParse *p, struct raRecord *ra)
+/* Evaluate parse node p against record ra.  A literal yields its stored value,
+ * a symbol yields the value of the field of that name as a string ("" when ra
+ * has no such field), and Eq/Ne yield a boolean from rqlEvalEq, negated for Ne.
+ * Any other op aborts. */
+{
+struct rqlEval res;
+enum rqlParseOp op = p->op;
+if (op == rqlParseLiteral)
+    {
+    res.val = p->val;
+    res.type = p->type;
+    }
+else if (op == rqlParseSymbol)
+    {
+    struct raField *field = raRecordField(ra, p->val.s);
+    res.type = rqlTypeString;
+    res.val.s = (field == NULL ? "" : field->val);
+    }
+else if (op == rqlParseEq || op == rqlParseNe)
+    {
+    res = rqlEvalEq(p, ra);
+    if (op == rqlParseNe)
+        res.val.b = !res.val.b;
+    }
+else
+    {
+    errAbort("Unknown op %s\n", rqlParseOpToString(p->op));
+    res.type = rqlTypeInt;      // Keep compiler from complaining.
+    res.val.i = 0;              // Keep compiler from complaining.
+    }
+return res;
+}
+
+struct rqlStatement *rqlStatementParse(struct lineFile *lf)
+/* Parse an RQL statement out of text: 'select' plus a comma-separated field list, or
+ * 'count', then an optional 'where' clause.  Aborts on anything else or on leftovers. */
+{
+struct tokenizer *tkz = tokenizerOnLineFile(lf);
+tkz->uncommentShell = TRUE;
+tkz->uncommentC = TRUE;
+tkz->leaveQuotes = TRUE;
+struct rqlStatement *rql;
+AllocVar(rql);
+rql->command = cloneString(tokenizerMustHaveNext(tkz));
+if (sameString(rql->command, "select"))
+    {
+    char *tok = tokenizerMustHaveNext(tkz);
+    struct slName *list = slNameNew(tok);
+    for (;;)
+        {
+        /* Parse out comma-separated field list. */
+        char *comma = tokenizerNext(tkz);
+        if (comma == NULL || comma[0] != ',')
+            {
+            tokenizerReuse(tkz);
+            break;
+            }
+        slNameAddHead(&list, tokenizerMustHaveNext(tkz));
+        }
+    slReverse(&list);
+    rql->fieldList = list;
+    }
+else if (sameString(rql->command, "count"))
+    {
+    /* No parameters to count. */
+    }
+else
+    errAbort("Unknown RQL command '%s' line %d of %s", rql->command, lf->lineIx, lf->fileName);
+
+char *where = tokenizerNext(tkz);
+if (where != NULL)
+    {
+    if (!sameString(where, "where"))
+        errAbort("Unknown clause '%s' line %d of %s", where, lf->lineIx, lf->fileName);
+    rql->whereClause = rqlParseClause(tkz);
+    }
+
+char *extra = tokenizerNext(tkz);
+if (extra != NULL)
+    errAbort("Extra stuff starting with '%s' past end of statement line %d of %s",
+        extra, lf->lineIx, lf->fileName);
+return rql;
+}
+
+struct raField *raFieldFromLine(char *line, struct lm *lm)
+/* Make a raField out of line: the first word becomes the name and whatever follows,
+ * leading spaces skipped ("" if nothing), becomes the value; both are cloned into lm.
+ * Returns NULL when the line holds no word.  Per the original note, line gets zeroes written into it. */
+{
+char *name = nextWord(&line);
+if (name == NULL)
+    return NULL;
+struct raField *field;
+lmAllocVar(lm, field);
+field->name = lmCloneString(lm, name);
+field->val = lmCloneString(lm, emptyForNull(skipLeadingSpaces(line)));
+return field;
+}
+
+struct raRecord *raRecordReadOne(struct lineFile *lf, struct lm *lm)
+/* Read next record from file.  Returns NULL at end of file.  A record is a
+ * run of field lines ended by a line with no words or by end of file; the
+ * fields are kept in file order and allocated out of lm. */
+{
+struct raField *field, *fieldList = NULL;
+char *line;
+
+/* Read first line and start fieldList on it. */
+if (!lineFileNextReal(lf, &line))
+    return NULL;
+fieldList = raFieldFromLine(line, lm);
+
+/* Keep going until get a blank line. */
+while (lineFileNext(lf, &line, NULL))
+    {
+    field = raFieldFromLine(line, lm);
+    if (field == NULL)
+        break;
+    slAddHead(&fieldList, field);
+    }
+
+slReverse(&fieldList);
+struct raRecord *record;
+lmAllocVar(lm, record);
+record->fieldList = fieldList;
+return record;
+}
+
+static struct raRecord *readRaRecords(int inCount, char *inNames[], char *mergeField, struct lm *lm)
+/* Read every record from each of the inCount files named in inNames and return
+ * them as one list, in file order.  Returns NULL when inCount is not positive.
+ * Merging is not implemented: a non-NULL mergeField aborts. */
+{
+if (inCount <= 0)
+    return NULL;
+if (mergeField != NULL)
+    {
+    errAbort("mergeField not yet supported");
+    return NULL;
+    }
+struct raRecord *recordList = NULL;
+int fileIx = 0;
+while (fileIx < inCount)
+    {
+    struct lineFile *lf = lineFileOpen(inNames[fileIx], TRUE);
+    struct raRecord *record = raRecordReadOne(lf, lm);
+    while (record != NULL)
+        {
+        slAddHead(&recordList, record);
+        record = raRecordReadOne(lf, lm);
+        }
+    lineFileClose(&lf);
+    fileIx += 1;
+    }
+slReverse(&recordList);
+return recordList;
+}
+
+boolean rqlStatementMatch(struct rqlStatement *rql, struct raRecord *ra)
+/* Decide whether ra passes the statement's where clause.  A statement with
+ * no where clause matches every record; otherwise the clause is evaluated
+ * on ra and the result coerced to boolean. */
+{
+struct rqlParse *clause = rql->whereClause;
+if (clause == NULL)
+    return TRUE;
+
+struct rqlEval verdict = rqlEvalOnRecord(clause, ra);
+verdict = rqlEvalCoerceToBoolean(verdict);
+return verdict.val.b;
+}
+
+void rqlStatementOutput(struct rqlStatement *rql, struct raRecord *ra, FILE *out)
+/* Write the selected fields of ra to out as "name value" lines followed by one
+ * empty line.  For each entry of the statement's field list, every field of ra
+ * whose name matches is written: by wildMatch when anyWild says the entry has
+ * wildcards, by exact string comparison otherwise. */
+{
+struct slName *want;
+for (want = rql->fieldList; want != NULL; want = want->next)
+    {
+    char *pattern = want->name;
+    boolean isWild = anyWild(pattern);
+    struct raField *have;
+    for (have = ra->fieldList; have != NULL; have = have->next)
+        {
+        boolean hit = (isWild ? wildMatch(pattern, have->name) : strcmp(pattern, have->name) == 0);
+        if (hit)
+            fprintf(out, "%s %s\n", have->name, have->val);
+        }
+    }
+fprintf(out, "\n");
+}
+
+void raSqlQuery(int inCount, char *inNames[], struct lineFile *query, char *mergeField, struct lm *lm,
+        FILE *out)
+/* raSqlQuery - Do a SQL-like query on a RA file.  Reads all records from inNames, parses
+ * the statement from query, then writes each matching record (select) or the number of
+ * matching records (any other command, i.e. count) to out. */
+{
+struct raRecord *raList = readRaRecords(inCount, inNames, mergeField, lm);
+uglyf("Got %d ra records\n", slCount(raList));  /* NOTE(review): debugging output, still enabled. */
+struct rqlStatement *rql = rqlStatementParse(query);
+rqlStatementDump(rql, uglyOut);
+struct raRecord *ra;
+int matchCount = 0;
+boolean doSelect = sameString(rql->command, "select");
+for (ra = raList; ra != NULL; ra = ra->next)
+    {
+    if (!rqlStatementMatch(rql, ra))
+        continue;
+    matchCount += 1;
+    if (doSelect)
+        rqlStatementOutput(rql, ra, out);
+    }
+/* The count goes to out like the select output does, rather than always to stdout. */
+if (!doSelect)
+    fprintf(out, "%d\n", matchCount);
+}
+
+int main(int argc, char *argv[])
+/* Process command line: read the options, insist on exactly one of -query and
+ * -queryFile, open the query as a lineFile, and run raSqlQuery over the
+ * remaining arguments with output going to stdout. */
+{
+optionInit(&argc, argv, options);
+if (argc < 2)
+    usage();
+clMerge = optionVal("merge", NULL);
+clSkipMissing = optionExists("skipMissing");
+clQueryFile = optionVal("queryFile", NULL);
+clQuery = optionVal("query", NULL);
+boolean haveFile = (clQueryFile != NULL), haveText = (clQuery != NULL);
+if (!haveFile && !haveText)
+    errAbort("Please specify either the query or queryFile option.");
+if (haveFile && haveText)
+    errAbort("Please specify just one of the query or queryFile options.");
+struct lineFile *query = (haveText ? lineFileOnString("query", TRUE, cloneString(clQuery))
+                                   : lineFileOpen(clQueryFile, TRUE));
+struct lm *lm = lmInit(0);
+raSqlQuery(argc-1, argv+1, query, clMerge, lm, stdout);
+return 0;
+}