src/utils/raSqlQuery/raSqlQuery.c 1.22
1.22 2009/11/22 05:58:13 kent
Making count statement work better with db=all and with from clauses.
Index: src/utils/raSqlQuery/raSqlQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/raSqlQuery/raSqlQuery.c,v
retrieving revision 1.21
retrieving revision 1.22
diff -b -B -U 1000000 -r1.21 -r1.22
--- src/utils/raSqlQuery/raSqlQuery.c 22 Nov 2009 05:28:52 -0000 1.21
+++ src/utils/raSqlQuery/raSqlQuery.c 22 Nov 2009 05:58:13 -0000 1.22
@@ -1,545 +1,549 @@
/* raSqlQuery - Do a SQL-like query on a RA file. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "options.h"
#include "obscure.h"
#include "ra.h"
#include "localmem.h"
#include "tokenizer.h"
#include "sqlNum.h"
#include "raRecord.h"
#include "rql.h"
#include "portable.h"
#include "../../hg/inc/hdb.h" /* Just for strict option. */
/* File-scope settings.  The cl* variables hold the defaults; main() overwrites most of
 * them from the command line option of the same name (minus the "cl" prefix). */
static char const rcsid[] = "$Id$";
static char *clQueryFile = NULL;	/* -queryFile: file holding the query text. */
static char *clQuery = NULL;	/* -query: query text given directly on the command line. */
static char *clKey = "name";	/* -key: key field; main() forces "track" when -db is used. */
static char *clParentField = "subTrack";	/* -parentField: field naming a record's parent. */
static char *clNoInheritField = "noInherit";	/* -noInheritField: if present, skip inheritance. */
static boolean clMerge = FALSE;	/* -merge: merge records sharing a key across files. */
static boolean clParent = FALSE;	/* -parent: inherit missing fields from parent records. */
static boolean clAddFile = FALSE;	/* -addFile: add a "file" line to each output record. */
static boolean clAddDb = FALSE;	/* -addDb: add a "db" line; main() turns it on for -db=all. */
static char *clRestrict = NULL;	/* -restrict: file of keys that output is limited to. */
static boolean clStrict = FALSE;	/* -strict: only count/report keys that exist as tables in db. */
static char *clDb = NULL;	/* -db: database name, or "all". */
static boolean clOverrideNeeded = FALSE;	/* -overrideNeeded: see usage() text. */
/* The next two are not settable from the command line.  The root is handed to
 * getDbPathList(); the space-separated patterns are expanded relative to each
 * database directory by dbPathToFiles(). */
static char *clTrackDbRootDir = "~/kent/src/hg/makeDb/trackDb";
static char *clTrackDbRelPath = "../../trackDb*.ra ../trackDb*.ra trackDb*.ra";
void usage()
/* Explain usage and exit.  The help text is sent through errAbort(), with the current
 * defaults of -key and -parentField substituted for the two %s directives.  Literal
 * percent signs in the text are written %% because the text is a format string. */
{
    errAbort(
        "raSqlQuery - Do a SQL-like query on a RA file.\n"
        " raSqlQuery raFile(s) query-options\n"
        "or\n"
        " raSqlQuery -db=dbName query-options\n"
        "Where dbName is a UCSC Genome database like hg18, sacCer1, etc.\n"
        "One of the following query-options must be specified\n"
        " -queryFile=fileName\n"
        " \"-query=select list,of,fields where field='this'\"\n"
        "The queryFile just has a query in it in the same form as the query option.\n"
        "The syntax of a query statement is very SQL-like. The most common commands are:\n"
        " select tag1,tag2,tag3 where tag1 like 'prefix%%'\n"
        "where the %% is a SQL wildcard. Sorry to mix wildcards. Another common query is\n"
        " select count(*) where tag = 'val'\n"
        "The from list is optional. If it exists it is a list of raFile names\n"
        " select track,type from *Encode* where type like 'bigWig%%'\n"
        "Other command line options:\n"
        " -addFile - Add 'file' field to say where record is defined\n"
        " -addDb - Add 'db' field to say where record is defined\n"
        " -strict - Used only with db option. Only report tracks that exist in db\n"
        " -key=keyField - Use this as the key field for merges and parenting. Default %s\n"
        " -parent - Merge together inheriting on parentField\n"
        " -parentField=field - Use field as the one that tells us who is our parent. Default %s\n"
        " -overrideNeeded - If set records are only overridden field-by-field by later records\n"
        " if 'override' follows the track name. Otherwise later record replaces\n"
        " earlier record completely. If not set all records overridden field by field\n"
        " -noInheritField=field - If field is present don't inherit fields from parent\n"
        " -merge - If there are multiple raFiles, records with the same keyField will be\n"
        " merged together with fields in later files overriding fields in earlier files\n"
        " -restrict=keyListFile - restrict output to only ones with keys in file.\n"
        " -db=hg19 - Acts on trackDb files for the given database. Sets up list of files\n"
        " appropriately and sets parent, merge, and override all.\n"
        " Use db=all for all databases\n"
        , clKey, clParentField
        );
}
/* Table of recognized command line options, passed to optionInit() in main().
 * Each entry pairs an option name with its value type; the closing {NULL, 0}
 * entry presumably marks the end of the table for the options library. */
static struct optionSpec options[] = {
{"queryFile", OPTION_STRING},
{"query", OPTION_STRING},
{"merge", OPTION_BOOLEAN},
{"key", OPTION_STRING},
{"parent", OPTION_BOOLEAN},
{"parentField", OPTION_STRING},
{"noInheritField", OPTION_STRING},
{"addFile", OPTION_BOOLEAN},
{"addDb", OPTION_BOOLEAN},
{"restrict", OPTION_STRING},
{"strict", OPTION_BOOLEAN},
{"db", OPTION_STRING},
{"overrideNeeded", OPTION_BOOLEAN},
{NULL, 0},
};
struct dbPath
/* A database directory and path.  One element is built per database by
 * getDbPathList(). */
{
struct dbPath *next;	/* Next element in the singly-linked list. */
char *db;	/* Database name: the last '/'-separated component of dir. */
char *dir;	/* Directory in which the database's trackDb.ra was found. */
};
static struct dbPath *getDbPathList(char *rootDir)
/* Scan the directories two levels below rootDir (rootDir/<organism>/<database>) and
 * return a dbPath for each one that contains a trackDb.ra file, in the order the
 * directory listings produced them. */
{
    char *cleanRoot = simplifyPathToDir(rootDir);
    struct dbPath *found = NULL;
    struct fileInfo *organisms = listDirX(cleanRoot, "*", TRUE);
    struct fileInfo *orgEntry;

    for (orgEntry = organisms; orgEntry != NULL; orgEntry = orgEntry->next) {
        if (!orgEntry->isDir)
            continue;

        struct fileInfo *databases = listDirX(orgEntry->name, "*", TRUE);
        struct fileInfo *dbEntry;
        for (dbEntry = databases; dbEntry != NULL; dbEntry = dbEntry->next) {
            if (!dbEntry->isDir)
                continue;

            char raPath[PATH_LEN];
            safef(raPath, sizeof(raPath), "%s/trackDb.ra", dbEntry->name);
            if (!fileExists(raPath))
                continue;

            /* The database name is whatever follows the last slash of the directory. */
            char *lastSlash = strrchr(dbEntry->name, '/');
            assert(lastSlash != NULL);

            struct dbPath *el;
            AllocVar(el);
            el->dir = cloneString(dbEntry->name);
            el->db = cloneString(lastSlash + 1);
            slAddHead(&found, el);
        }
        slFreeList(&databases);
    }

    slFreeList(&organisms);
    slReverse(&found);	/* slAddHead built the list backwards. */
    freez(&cleanRoot);
    return found;
}
static struct slName *dbPathToFiles(struct dbPath *p)
/* Expand each space-separated pattern of clTrackDbRelPath relative to p->dir and
 * return the names of all files found, pattern by pattern, as an slName list. */
{
    struct slName *fileList = NULL;
    char *patterns = cloneString(clTrackDbRelPath);	/* nextWord() writes into its input. */
    char *cursor = patterns;
    char *relPattern;

    for (relPattern = nextWord(&cursor); relPattern != NULL; relPattern = nextWord(&cursor)) {
        /* Split "../../trackDb*.ra" style pattern into directory part and file part. */
        char relDir[PATH_LEN], relFile[PATH_LEN], relSuffix[PATH_LEN];
        splitPath(relPattern, relDir, relFile, relSuffix);

        char joined[PATH_LEN];
        safef(joined, sizeof(joined), "%s/%s", p->dir, relDir);
        char *searchDir = simplifyPathToDir(joined);

        char wild[PATH_LEN];
        safef(wild, sizeof(wild), "%s%s", relFile, relSuffix);

        struct fileInfo *hits = listDirX(searchDir, wild, TRUE);
        struct fileInfo *hit = hits;
        while (hit != NULL) {
            slNameAddHead(&fileList, hit->name);
            hit = hit->next;
        }

        freeMem(searchDir);
        slFreeList(&hits);
    }

    freeMem(patterns);
    slReverse(&fileList);
    return fileList;
}
static void mergeRecords(struct raRecord *old, struct raRecord *record, char *key, struct lm *lm)
/* Fold record into old: every field of record except the key field overwrites the
 * same-named field of old, or is appended to old if old lacks it.  New field
 * structs come from lm; name and value strings are shared, not copied.  The
 * file positions of record are appended to those of old. */
{
    struct raField *src;
    for (src = record->fieldList; src != NULL; src = src->next) {
        if (sameString(src->name, key))
            continue;

        struct raField *dest = raRecordField(old, src->name);
        if (dest == NULL) {
            lmAllocVar(lm, dest);
            dest->name = src->name;
            slAddTail(&old->fieldList, dest);
        }
        dest->val = src->val;
    }
    old->posList = slCat(old->posList, record->posList);
}
static void mergeParentRecord(struct raRecord *record, struct raRecord *parent,
	struct lm *lm)
/* Give record every field that parent has and record lacks.  Fields already
 * present in record are left alone.  New field structs come from lm and share
 * the parent's name and value strings. */
{
    struct raField *inherited = parent->fieldList;
    while (inherited != NULL) {
        if (raRecordField(record, inherited->name) == NULL) {
            struct raField *copy;
            lmAllocVar(lm, copy);
            copy->name = inherited->name;
            copy->val = inherited->val;
            slAddTail(&record->fieldList, copy);
        }
        inherited = inherited->next;
    }
}
static struct raRecord *readRaRecords(int inCount, char *inNames[], char *keyField,
	boolean doMerge, char *db, boolean addDb,
	boolean overrideNeeded, struct lm *lm)
/* Read all records from the inCount files named in inNames and return them as a
 * list in the order first seen.  Records are allocated from lm.
 *
 * keyField      - name of the field handed to raRecordReadOne() as the key.
 * doMerge       - if TRUE, records sharing a key are combined into the first one
 *                 seen; records with no key are dropped.  If FALSE every record
 *                 is returned as read.
 * db, addDb     - if addDb is TRUE each record's db member is set to db.  This is
 *                 done in both merge and non-merge mode so that output code can
 *                 print record->db whenever addDb was requested.
 * overrideNeeded- in merge mode, a later record that is not flagged 'override'
 *                 replaces the earlier record's fields and positions wholesale;
 *                 otherwise fields are merged one by one via mergeRecords().
 *
 * Returns NULL if inCount <= 0. */
{
    if (inCount <= 0)
        return NULL;

    struct raRecord *recordList = NULL;
    /* Key -> first record with that key; only needed when merging. */
    struct hash *recordHash = (doMerge ? hashNew(0) : NULL);
    int i;

    for (i = 0; i < inCount; ++i) {
        char *fileName = inNames[i];
        struct lineFile *lf = lineFileOpen(fileName, TRUE);
        struct raRecord *record;

        while ((record = raRecordReadOne(lf, keyField, lm)) != NULL) {
            record->posList = raFilePosNew(lm, fileName, lf->lineIx);
            if (addDb)
                record->db = db;

            if (!doMerge) {
                slAddHead(&recordList, record);
                continue;
            }

            char *key = record->key;
            if (key == NULL)
                continue;	/* Nothing to merge on; merge mode skips keyless records. */

            struct raRecord *oldRecord = hashFindVal(recordHash, key);
            if (oldRecord == NULL) {
                slAddHead(&recordList, record);
                hashAdd(recordHash, key, record);
            } else if (overrideNeeded && !record->override) {
                /* Full replacement: later record supersedes earlier one. */
                oldRecord->fieldList = record->fieldList;
                oldRecord->posList = record->posList;
            } else {
                mergeRecords(oldRecord, record, keyField, lm);
            }
        }
        lineFileClose(&lf);
    }

    /* The hash only indexes the records; release it now that reading is done. */
    if (recordHash != NULL)
        hashFree(&recordHash);

    slReverse(&recordList);
    return recordList;
}
boolean rqlStatementMatch(struct rqlStatement *rql, struct raRecord *ra)
-/* Return TRUE if where clause in statement evaluates true for ra. */
-{
-struct rqlParse *whereClause = rql->whereClause;
-if (whereClause == NULL)
- return TRUE;
-else
- {
- struct rqlEval res = rqlEvalOnRecord(whereClause, ra);
- res = rqlEvalCoerceToBoolean(res);
- return res.val.b;
- }
-}
-
-void rqlStatementOutput(struct rqlStatement *rql, struct raRecord *ra,
- char *addFileField, char *addDbField, FILE *out)
-/* Output fields from ra to file. If addFileField is non-null add a new
- * field with this name at end of output. */
+/* Return TRUE if where clause and tableList in statement evaluates true for ra. */
{
/* From-list check: when the statement has a table list, the record must have at
 * least one file position whose file name matches at least one of the table
 * names, which are treated as wildcard patterns.  In this revision the check
 * moves out of rqlStatementOutput and into rqlStatementMatch, so that it also
 * applies to statements that produce no per-record output. */
if (rql->tableList != NULL)
{
boolean gotMatch = FALSE;
struct slName *table;
for (table = rql->tableList; table != NULL; table = table->next)
{
struct raFilePos *fp;
for (fp = ra->posList; fp != NULL; fp = fp->next)
{
if (wildMatch(table->name, fp->fileName))
{
gotMatch = TRUE;
break;
}
}
if (gotMatch)
break;
}
if (!gotMatch)
- return;
+ return FALSE;
+ }
/* No where clause means every record (that passed the from-list check) matches. */
+struct rqlParse *whereClause = rql->whereClause;
+if (whereClause == NULL)
+ return TRUE;
+else
+ {
+ struct rqlEval res = rqlEvalOnRecord(whereClause, ra);
+ res = rqlEvalCoerceToBoolean(res);
+ return res.val.b;
}
+}
+
+void rqlStatementOutput(struct rqlStatement *rql, struct raRecord *ra,
+ char *addFileField, char *addDbField, FILE *out)
+/* Output fields from ra to file. If addFileField is non-null add a new
+ * field with this name at end of output. */
+{
/* If addDbField is non-null a line "<addDbField> <ra->db>" is written first.
 * ra->db is printed as-is, so it is expected to have been set by whoever built
 * the record whenever addDbField is requested. */
if (addDbField)
fprintf(out, "%s %s\n", addDbField, ra->db);
/* For each requested field, in statement order, print every record field whose
 * name matches: a wildcard match if the requested name contains wildcards,
 * otherwise an exact string comparison. */
struct slName *fieldList = rql->fieldList, *field;
for (field = fieldList; field != NULL; field = field->next)
{
struct raField *r;
boolean doWild = anyWild(field->name);
for (r = ra->fieldList; r != NULL; r = r->next)
{
boolean match;
if (doWild)
match = wildMatch(field->name, r->name);
else
match = (strcmp(field->name, r->name) == 0);
if (match)
fprintf(out, "%s %s\n", r->name, r->val);
}
}
/* Optional file line: the field name followed by a "fileName lineIx" pair for
 * every position on the record's posList, all on one line. */
if (addFileField != NULL)
{
fprintf(out, "%s", addFileField);
struct raFilePos *fp;
for (fp = ra->posList; fp != NULL; fp = fp->next)
{
fprintf(out, " %s %d", fp->fileName, fp->lineIx);
}
fprintf(out, "\n");
}
/* Blank line terminates the output record. */
fprintf(out, "\n");
}
struct hash *hashAllWordsInFile(char *fileName)
/* Read fileName and return a new hash with one entry (value NULL) added for each
 * whitespace-separated word on each line. */
{
    struct hash *wordHash = hashNew(0);
    struct lineFile *lf = lineFileOpen(fileName, TRUE);
    char *line;

    while (lineFileNext(lf, &line, NULL)) {
        char *word;
        for (word = nextWord(&line); word != NULL; word = nextWord(&line))
            hashAdd(wordHash, word, NULL);
    }

    lineFileClose(&lf);
    return wordHash;
}
static struct raRecord *findParent(struct raRecord *rec,
	char *parentFieldName, char *noInheritFieldName, struct hash *hash)
/* Look up the parent record of rec in hash.  Returns NULL if rec carries the
 * no-inherit field, has no parent field, or names a parent that is not in hash
 * (a warning is issued in the last case).  Only the first word of the parent
 * field's value is used as the parent's key. */
{
    if (raRecordField(rec, noInheritFieldName) != NULL)
        return NULL;

    struct raField *parentField = raRecordField(rec, parentFieldName);
    if (parentField == NULL)
        return NULL;

    /* firstWordInLine() is given a private copy so the field value stays intact. */
    char scratch[strlen(parentField->val) + 1];
    strcpy(scratch, parentField->val);
    char *parentName = firstWordInLine(scratch);

    struct raRecord *parent = hashFindVal(hash, parentName);
    if (parent == NULL) {
        warn("%s is a subTrack of %s, but %s doesn't exist", rec->key,
             parentField->val, parentField->val);
    }
    return parent;
}
static void inheritFromParents(struct raRecord *list, char *parentField, char *noInheritField,
	struct lm *lm)
/* Go through list.  For every record, walk up its chain of ancestors (as resolved
 * by findParent()) and copy in any field the record does not yet have.  Ancestors
 * are visited nearest first and mergeParentRecord() never overwrites an existing
 * field, so a value from a closer ancestor takes precedence over a more distant
 * one.  New fields are allocated from lm. */
{
    /* Index records by key so parents can be found by name.  Keyless records
     * cannot be anyone's parent and are left out of the index. */
    struct hash *hash = hashNew(0);
    struct raRecord *rec;
    for (rec = list; rec != NULL; rec = rec->next) {
        if (rec->key != NULL)
            hashAdd(hash, rec->key, rec);
    }

    /* Scan through doing inheritance. */
    for (rec = list; rec != NULL; rec = rec->next) {
        struct raRecord *parent = findParent(rec, parentField, noInheritField, hash);
        while (parent != NULL) {
            mergeParentRecord(rec, parent, lm);
            parent = findParent(parent, parentField, noInheritField, hash);
        }
    }

    /* The index is only needed during this call; release it so repeated calls
     * (one per database) do not accumulate hashes.  The records are not freed. */
    hashFree(&hash);
}
-void raSqlQuery(int inCount, char *inNames[],
+int raSqlQuery(struct rqlStatement *rql, int inCount, char *inNames[],
char *db, char *parentField, struct lm *lm, FILE *out)
/* raSqlQuery - Do a SQL-like query on a RA file.. */
/* Reads the inCount files in inNames, applies parent inheritance (if parentField
 * is non-NULL) and the -restrict filter (if given), then runs the statement over
 * every remaining record.  For a "select" statement each matching record is
 * written to out.  In the new revision the already-parsed statement is passed in
 * and the number of matching records is returned rather than printed here. */
{
-struct lineFile *query;
-if (clQuery)
- query = lineFileOnString("query", TRUE, cloneString(clQuery));
-else
- query = lineFileOpen(clQueryFile, TRUE);
struct raRecord *raList = readRaRecords(inCount, inNames, clKey,
clMerge, db, clAddDb, clOverrideNeeded, lm);
if (parentField != NULL)
{
inheritFromParents(raList, parentField, clNoInheritField, lm);
}
if (clRestrict)
{
/* NOTE(review): hashAllWordsInFile() is called twice in a row below.  The first
 * hash is overwritten without being freed and the restrict file is read twice;
 * a single call looks sufficient -- confirm and drop the duplicate. */
struct hash *restrictHash = hashAllWordsInFile(clRestrict);
restrictHash = hashAllWordsInFile(clRestrict);
/* Keep only records whose key appears in the restrict file, preserving order. */
struct raRecord *newList = NULL, *next, *rec;
for (rec = raList; rec != NULL; rec = next)
{
next = rec->next;
if (rec->key && hashLookup(restrictHash, rec->key))
{
slAddHead(&newList, rec);
}
}
slReverse(&newList);
raList = newList;
hashFree(&restrictHash);
}
-struct rqlStatement *rql = rqlStatementParse(query);
verbose(2, "Got %d records in raFiles\n", slCount(raList));
if (verboseLevel() > 1)
rqlStatementDump(rql, stderr);
struct raRecord *ra;
int matchCount = 0;
boolean doSelect = sameString(rql->command, "select");
for (ra = raList; ra != NULL; ra = ra->next)
{
if (rqlStatementMatch(rql, ra))
{
/* With -strict a matching record only counts if it has a key and
 * hTableOrSplitExists() reports a table of that name in db. */
if (!clStrict || (ra->key && hTableOrSplitExists(db, ra->key)))
{
matchCount += 1;
if (doSelect)
{
rqlStatementOutput(rql, ra, (clAddFile ? "file" : NULL),
(clAddDb ? "db" : NULL), out);
}
}
}
}
-if (!doSelect)
- printf("%d\n", matchCount);
+return matchCount;
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
clMerge = optionExists("merge");
clParent = optionExists("parent");
clParentField = optionVal("parentField", clParentField);
clKey = optionVal("key", clKey);
clQueryFile = optionVal("queryFile", NULL);
clQuery = optionVal("query", NULL);
clNoInheritField = optionVal("noInheritField", clNoInheritField);
clAddFile = optionExists("addFile");
clAddDb = optionExists("addDb");
clRestrict = optionVal("restrict", NULL);
clStrict = optionExists("strict");
clOverrideNeeded = optionExists("overrideNeeded");
clDb = optionVal("db", NULL);
/* Need either at least one input file or -db, and exactly one of -query/-queryFile. */
if (argc < 2 && !clDb)
usage();
if (clQueryFile == NULL && clQuery == NULL)
errAbort("Please specify either the query or queryFile option.");
if (clQueryFile != NULL && clQuery != NULL)
errAbort("Please specify just one of the query or queryFile options.");
struct lm *lm = lmInit(0);
if (clStrict && clDb == NULL)
errAbort("Only can use -strict with -db.");
/* -db overrides several settings regardless of what was given on the command
 * line: merge, parent and overrideNeeded are forced on and the key field
 * becomes "track".  -db=all additionally turns on the db output line. */
if (clDb != NULL)
{
clMerge = TRUE;
clParent = TRUE;
clOverrideNeeded = TRUE;
clKey = "track";
if (sameString(clDb, "all"))
clAddDb = TRUE;
}
+/* Parse query */
+struct lineFile *query;
+if (clQuery)
+ query = lineFileOnString("query", TRUE, cloneString(clQuery));
+else
+ query = lineFileOpen(clQueryFile, TRUE);
+struct rqlStatement *rql = rqlStatementParse(query);
+
char *parentField = (clParent ? clParentField : NULL);
char **fileNames;
int fileCount;
+int matchCount = 0;
if (clDb)
{
if (argc != 1)
errAbort("You can't specify any input files with the -db option.");
/* Run the query once per selected database directory found under the trackDb
 * root; "all" selects every database. */
struct dbPath *db, *dbList = getDbPathList(clTrackDbRootDir);
boolean gotAny = FALSE;
for (db = dbList; db != NULL; db = db->next)
{
if (sameString(clDb, "all") || sameString(clDb, db->db))
{
struct slName *path, *pathList = dbPathToFiles(db);
fileCount = slCount(pathList);
/* NOTE(review): the message below names dbToTrackDbFiles, but the function
 * actually called above is dbPathToFiles -- message looks stale. */
if (fileCount == 0)
errAbort("No paths returned by dbToTrackDbFiles(%s)", clDb);
/* NOTE(review): fileNames (and pathList) are allocated per database and not
 * released in this loop; harmless for a short-lived process, but confirm. */
AllocArray(fileNames, fileCount);
int i;
for (i=0, path = pathList; path != NULL; path = path->next, ++i)
{
fileNames[i] = path->name;
}
- raSqlQuery(fileCount, fileNames, db->db, parentField, lm, stdout);
+ matchCount += raSqlQuery(rql, fileCount, fileNames, db->db, parentField, lm, stdout);
gotAny = TRUE;
}
}
if (!gotAny)
errAbort("No database named %s found off %s\n", clDb, clTrackDbRootDir);
}
else
{
/* Plain file mode: remaining arguments are the ra files; "n/a" stands in for db. */
fileNames = argv+1;
fileCount = argc-1;
- raSqlQuery(fileCount, fileNames, "n/a", parentField, lm, stdout);
+ matchCount += raSqlQuery(rql, fileCount, fileNames, "n/a", parentField, lm, stdout);
}
/* For a count statement print a single total accumulated over all raSqlQuery()
 * calls, rather than one number per database. */
+if (sameString(rql->command, "count"))
+ printf("%d\n", matchCount);
return 0;
}