src/utils/raSqlQuery/raSqlQuery.c 1.20
1.20 2009/11/22 02:10:20 kent
Improving usage statement.
Index: src/utils/raSqlQuery/raSqlQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/raSqlQuery/raSqlQuery.c,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 1000000 -r1.19 -r1.20
--- src/utils/raSqlQuery/raSqlQuery.c 22 Nov 2009 02:06:06 -0000 1.19
+++ src/utils/raSqlQuery/raSqlQuery.c 22 Nov 2009 02:10:20 -0000 1.20
@@ -1,528 +1,526 @@
/* raSqlQuery - Do a SQL-like query on a RA file.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "dystring.h"
#include "options.h"
#include "obscure.h"
#include "ra.h"
#include "localmem.h"
#include "tokenizer.h"
#include "sqlNum.h"
#include "raRecord.h"
#include "rql.h"
#include "portable.h"
-#include "../../hg/inc/hdb.h"
+#include "../../hg/inc/hdb.h" /* Just for strict option. */
static char const rcsid[] = "$Id$";
/* Command line option values.  The initializers are the defaults; main()
 * overwrites them from the parsed options. */
/* -queryFile: name of a file containing the query, or NULL. */
static char *clQueryFile = NULL;
/* -query: the query text itself, or NULL. */
static char *clQuery = NULL;
/* -key: field used as record key for merging and parenting.  main() forces
 * this to "track" when -db is given. */
static char *clKey = "name";
/* -parentField: field naming a record's parent. */
static char *clParentField = "subTrack";
/* -noInheritField: a record carrying this field is not given a parent. */
static char *clNoInheritField = "noInherit";
/* -merge: merge records sharing a key. */
static boolean clMerge = FALSE;
/* -parent: inherit missing fields from the parent record. */
static boolean clParent = FALSE;
/* -addFile: emit a 'file' field with the record's file positions. */
static boolean clAddFile = FALSE;
/* -addDb: emit a 'db' field.  main() turns this on for -db=all. */
static boolean clAddDb = FALSE;
/* -restrict: name of a file of keys; only records with these keys are kept. */
static char *clRestrict = NULL;
/* -strict: only report records whose key passes hTableOrSplitExists(). */
static boolean clStrict = FALSE;
/* -db: database name, or "all", or NULL when raFiles are listed explicitly. */
static char *clDb = NULL;
/* -overrideNeeded: passed through to readRaRecords() as overrideNeeded. */
static boolean clOverrideNeeded = FALSE;
/* Directory handed to getDbPathList() when -db is used. */
static char *clTrackDbRootDir = "~/kent/src/hg/makeDb/trackDb";
/* Whitespace separated file patterns, each taken relative to a database
 * directory by dbPathToFiles(). */
static char *clTrackDbRelPath = "../../trackDb*.ra ../trackDb*.ra trackDb*.ra";
void usage()
/* Explain usage and exit.  The two %s placeholders in the message are filled
 * with the current values of clKey and clParentField, in that order. */
{
errAbort(
"raSqlQuery - Do a SQL-like query on a RA file.\n"
- "usage:\n"
" raSqlQuery raFile(s) query-options\n"
"or\n"
" raSqlQuery -db=dbName query-options\n"
"Where dbName is a UCSC Genome database like hg18, sacCer1, etc.\n"
"One of the following query-options must be specified\n"
" -queryFile=fileName\n"
" \"-query=select list,of,fields where field='this'\"\n"
- "The queryFile just has a query in it in the same form as the query option, but\n"
- "without needing the quotes necessarily.\n"
- " The syntax of a query statement is very SQL-like. It must begin with either\n"
+ "The queryFile just has a query in it in the same form as the query option.\n"
+ "The syntax of a query statement is very SQL-like. It must begin with either\n"
"'select' or 'count'. Select is followed by a field list, or '*' for all fields\n"
"Count is not followed by anything. The 'where' clause is optional, and if it\n"
"exists it can contain expressions involving fields, numbers, strings, arithmetic, 'and'\n"
- "'or' and so forth. Unlike SQL there is no 'from' claus.\n"
+ "'or' and so forth. Unlike SQL there is no 'from' clause.\n"
"Other options:\n"
+ " -addFile - Add 'file' field to say where record is defined\n"
+ " -addDb - Add 'db' field to say where record is defined\n"
+ " -strict - Used only with db option. Only report tracks that exist in db\n"
" -key=keyField - Use the as the key field for merges and parenting. Default %s\n"
" -parent - Merge together inheriting on parentField\n"
" -parentField=field - Use field as the one that tells us who is our parent. Default %s\n"
" -overrideNeeded - If set records are only overridden field-by-field by later records\n"
" if 'override' follows the track name. Otherwiser later record replaces\n"
" earlier record completely. If not set all records overridden field by field\n"
" -noInheritField=field - If field is present don't inherit fields from parent\n"
" -merge - If there are multiple raFiles, records with the same keyField will be\n"
" merged together with fields in later files overriding fields in earlier files\n"
- " -addFile - Add 'file' field to say where record is defined\n"
- " -addDb - Add 'db' field to say where record is defined\n"
" -restrict=keyListFile - restrict output to only ones with keys in file.\n"
- " -strict - Used only with db option. Only report tracks that exist in db\n"
" -db=hg19 - Acts on trackDb files for the given database. Sets up list of files\n"
" appropriately and sets parent, merge, and override all.\n"
" Use db=all for all databases\n"
"The output will be to stdout, in the form of a .ra file if the select command is used\n"
- "and just a simple number if the count command is used\n"
+ "and just a simple number if the count command is used.\n"
, clKey, clParentField
);
}
/* Command line options accepted by this program; handed to optionInit() in
 * main().  The {NULL, 0} entry terminates the table. */
static struct optionSpec options[] = {
{"queryFile", OPTION_STRING},
{"query", OPTION_STRING},
{"merge", OPTION_BOOLEAN},
{"key", OPTION_STRING},
{"parent", OPTION_BOOLEAN},
{"parentField", OPTION_STRING},
{"noInheritField", OPTION_STRING},
{"addFile", OPTION_BOOLEAN},
{"addDb", OPTION_BOOLEAN},
{"restrict", OPTION_STRING},
{"strict", OPTION_BOOLEAN},
{"db", OPTION_STRING},
{"overrideNeeded", OPTION_BOOLEAN},
{NULL, 0},
};
struct dbPath
/* A database directory and path. */
{
struct dbPath *next; /* Next in singly linked list. */
char *db; /* Database name: the last component of dir. */
char *dir; /* Directory in which a trackDb.ra file was found. */
};
static struct dbPath *getDbPathList(char *rootDir)
/* Return a list with one dbPath for every directory two levels below rootDir
 * that contains a trackDb.ra file.  The db name of each element is the last
 * path component of its directory. */
{
char *rootPath = simplifyPathToDir(rootDir);
struct dbPath *result = NULL;
struct fileInfo *orgEntries = listDirX(rootPath, "*", TRUE);
struct fileInfo *orgEntry;
for (orgEntry = orgEntries; orgEntry != NULL; orgEntry = orgEntry->next)
    {
    if (!orgEntry->isDir)
        continue;
    struct fileInfo *dbEntries = listDirX(orgEntry->name, "*", TRUE);
    struct fileInfo *dbEntry;
    for (dbEntry = dbEntries; dbEntry != NULL; dbEntry = dbEntry->next)
        {
        if (!dbEntry->isDir)
            continue;
        char raPath[PATH_LEN];
        safef(raPath, sizeof(raPath), "%s/trackDb.ra", dbEntry->name);
        if (!fileExists(raPath))
            continue;
        struct dbPath *el;
        AllocVar(el);
        el->dir = cloneString(dbEntry->name);
        /* Database name is whatever follows the final slash. */
        char *lastSlash = strrchr(dbEntry->name, '/');
        assert(lastSlash != NULL);
        el->db = cloneString(lastSlash+1);
        slAddHead(&result, el);
        }
    slFreeList(&dbEntries);
    }
slFreeList(&orgEntries);
slReverse(&result);	/* Restore directory listing order. */
freez(&rootPath);
return result;
}
static struct slName *dbPathToFiles(struct dbPath *p)
/* Expand each whitespace separated pattern in clTrackDbRelPath relative to
 * p->dir and return the names of all matching files as one list. */
{
struct slName *fileList = NULL;
char *patterns = cloneString(clTrackDbRelPath);	/* nextWord needs a writable cursor */
char *cursor = patterns;
char *relPattern;
for (relPattern = nextWord(&cursor); relPattern != NULL; relPattern = nextWord(&cursor))
    {
    /* Separate the directory part from the file name pattern. */
    char relDir[PATH_LEN], relFile[PATH_LEN], relSuffix[PATH_LEN];
    splitPath(relPattern, relDir, relFile, relSuffix);

    char joined[PATH_LEN];
    safef(joined, sizeof(joined), "%s/%s", p->dir, relDir);
    char *searchDir = simplifyPathToDir(joined);

    char wild[PATH_LEN];
    safef(wild, sizeof(wild), "%s%s", relFile, relSuffix);

    struct fileInfo *matches = listDirX(searchDir, wild, TRUE);
    struct fileInfo *match;
    for (match = matches; match != NULL; match = match->next)
        slNameAddHead(&fileList, match->name);
    freeMem(searchDir);
    slFreeList(&matches);
    }
freeMem(patterns);
slReverse(&fileList);
return fileList;
}
static void mergeRecords(struct raRecord *old, struct raRecord *record, char *key, struct lm *lm)
/* Fold record into old.  Every field of record other than the key field
 * either replaces the value of the same-named field in old or is appended to
 * old as a new field.  The position list of record is appended to old's. */
{
struct raField *src;
for (src = record->fieldList; src != NULL; src = src->next)
    {
    if (sameString(src->name, key))
        continue;
    struct raField *dest = raRecordField(old, src->name);
    if (dest == NULL)
        {
        /* Field is new to old: allocate it and hang it on the end. */
        lmAllocVar(lm, dest);
        dest->name = src->name;
        slAddTail(&old->fieldList, dest);
        }
    dest->val = src->val;
    }
old->posList = slCat(old->posList, record->posList);
}
static void mergeParentRecord(struct raRecord *record, struct raRecord *parent,
	struct lm *lm)
/* Copy into record each field of parent that record does not already have.
 * Fields already present in record are left untouched. */
{
struct raField *inherited;
for (inherited = parent->fieldList; inherited != NULL; inherited = inherited->next)
    {
    if (raRecordField(record, inherited->name) != NULL)
        continue;
    struct raField *copy;
    lmAllocVar(lm, copy);
    copy->name = inherited->name;
    copy->val = inherited->val;
    slAddTail(&record->fieldList, copy);
    }
}
static struct raRecord *readRaRecords(int inCount, char *inNames[], char *keyField,
	boolean doMerge, boolean addFile, char *db, boolean addDb,
	boolean overrideNeeded, struct lm *lm)
/* Read all records from the inCount files named in inNames and return them
 * as a list in the order read.  Returns NULL if inCount <= 0.
 *   keyField - name of field passed to raRecordReadOne as the key
 *   doMerge - if set, records sharing a key are combined into the first
 *             record seen with that key, and records without a key are
 *             dropped
 *   addFile - if set, each record gets a position (file name, line)
 *   db, addDb - if addDb is set, each record's db is set to db.  This now
 *             happens whether or not doMerge is set; previously unmerged
 *             records were left with a NULL db which was later printed.
 *   overrideNeeded - if set, a later record without its override flag
 *             replaces the earlier record's fields and positions wholesale
 *             instead of being merged field by field
 *   lm - local memory pool records are allocated from */
{
if (inCount <= 0)
    return NULL;

struct raRecord *recordList = NULL;
struct hash *recordHash = (doMerge ? hashNew(0) : NULL);
int i;
for (i=0; i<inCount; ++i)
    {
    char *fileName = inNames[i];
    struct lineFile *lf = lineFileOpen(fileName, TRUE);
    struct raRecord *record;
    while ((record = raRecordReadOne(lf, keyField, lm)) != NULL)
        {
        if (addFile)
            record->posList = raFilePosNew(lm, fileName, lf->lineIx);
        if (addDb)
            record->db = db;

        if (!doMerge)
            {
            slAddHead(&recordList, record);
            continue;
            }

        /* Merging is by key, so a record without one is skipped. */
        char *key = record->key;
        if (key == NULL)
            continue;

        struct raRecord *oldRecord = hashFindVal(recordHash, key);
        if (oldRecord == NULL)
            {
            slAddHead(&recordList, record);
            hashAdd(recordHash, key, record);
            }
        else if (overrideNeeded && !record->override)
            {
            /* No override flag: later record replaces earlier completely. */
            oldRecord->fieldList = record->fieldList;
            oldRecord->posList = record->posList;
            }
        else
            mergeRecords(oldRecord, record, keyField, lm);
        }
    lineFileClose(&lf);
    }
/* The hash was only an index onto the records; it is no longer needed. */
if (recordHash != NULL)
    hashFree(&recordHash);
slReverse(&recordList);
return recordList;
}
boolean rqlStatementMatch(struct rqlStatement *rql, struct raRecord *ra)
/* Return TRUE if where clause in statement evaluates true for ra. */
{
struct rqlParse *whereClause = rql->whereClause;
if (whereClause == NULL)
return TRUE;
else
{
struct rqlEval res = rqlEvalOnRecord(whereClause, ra);
res = rqlEvalCoerceToBoolean(res);
return res.val.b;
}
}
void rqlStatementOutput(struct rqlStatement *rql, struct raRecord *ra,
	char *addFileField, boolean addDb, FILE *out)
/* Write the selected fields of ra to out as "name value" lines followed by a
 * blank line.  If addDb is set a "db" line comes first.  If addFileField is
 * non-NULL a final line with that name lists the file name and line number
 * of each position of the record. */
{
if (addDb)
    fprintf(out, "db %s\n", ra->db);

/* For each requested field (possibly a wildcard) print every record field
 * that it matches. */
struct slName *wanted;
for (wanted = rql->fieldList; wanted != NULL; wanted = wanted->next)
    {
    char *pattern = wanted->name;
    boolean isWild = anyWild(pattern);
    struct raField *have;
    for (have = ra->fieldList; have != NULL; have = have->next)
        {
        boolean hit = (isWild ? wildMatch(pattern, have->name)
                              : (strcmp(pattern, have->name) == 0));
        if (hit)
            fprintf(out, "%s %s\n", have->name, have->val);
        }
    }

if (addFileField != NULL)
    {
    fputs(addFileField, out);
    struct raFilePos *pos;
    for (pos = ra->posList; pos != NULL; pos = pos->next)
        fprintf(out, " %s %d", pos->fileName, pos->lineIx);
    fputc('\n', out);
    }
fputc('\n', out);
}
struct hash *hashAllWordsInFile(char *fileName)
/* Return a hash keyed by every word found in fileName, where words are
 * separated by white space or line breaks.  All hash values are NULL. */
{
struct hash *words = hashNew(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
while (lineFileNext(lf, &line, NULL))
    {
    char *word;
    for (word = nextWord(&line); word != NULL; word = nextWord(&line))
        hashAdd(words, word, NULL);
    }
lineFileClose(&lf);
return words;
}
static struct raRecord *findParent(struct raRecord *rec,
	char *parentFieldName, char *noInheritFieldName, struct hash *hash)
/* Look up the parent record of rec in hash.  Returns NULL if rec has the
 * no-inherit field, has no parent field, or names a parent that is not in
 * hash (in which case a warning is also issued).  The parent name is the
 * first word of the parent field's value. */
{
if (raRecordField(rec, noInheritFieldName) != NULL)
    return NULL;
struct raField *parentField = raRecordField(rec, parentFieldName);
if (parentField == NULL)
    return NULL;

/* Extract the first word from a scratch copy so the field value itself is
 * never handed to firstWordInLine. */
char *parentLine = parentField->val;
int lineSize = strlen(parentLine);
char scratch[lineSize+1];
strcpy(scratch, parentLine);
struct raRecord *parent = hashFindVal(hash, firstWordInLine(scratch));
if (parent == NULL)
    warn("%s is a subTrack of %s, but %s doesn't exist", rec->key,
	parentField->val, parentField->val);
return parent;
}
static void inheritFromParents(struct raRecord *list, char *parentField, char *noInheritField,
	struct lm *lm)
/* Go through list.  For each record walk up its chain of ancestors, as found
 * by findParent, and fill in any fields the record lacks from each ancestor
 * in turn, nearest ancestor first.
 *   list - records to process; also the pool in which parents are looked up
 *   parentField - name of field holding the parent's key
 *   noInheritField - name of field that suppresses inheritance
 *   lm - local memory pool for newly created fields
 * A chain of ancestors that loops back on itself is abandoned with a
 * warning rather than followed forever. */
{
/* Build up hash of records indexed by key field, counting records as we go. */
struct hash *hash = hashNew(0);
struct raRecord *rec;
int recordCount = 0;
for (rec = list; rec != NULL; rec = rec->next)
    {
    ++recordCount;
    if (rec->key != NULL)
	hashAdd(hash, rec->key, rec);
    }

/* Scan through doing inheritance. */
for (rec = list; rec != NULL; rec = rec->next)
    {
    struct raRecord *parent;
    int depth = 0;
    for (parent = findParent(rec, parentField, noInheritField, hash); parent != NULL;
	parent = findParent(parent, parentField, noInheritField, hash) )
	{
	/* A loop-free chain visits each record at most once, so going deeper
	 * than the number of records means the chain is circular. */
	if (++depth > recordCount)
	    {
	    warn("%s has a circular %s chain, giving up on further inheritance",
		(rec->key != NULL ? rec->key : "(record without key)"), parentField);
	    break;
	    }
	mergeParentRecord(rec, parent, lm);
	}
    }

/* The hash only indexed the records; release it now that we're done. */
hashFree(&hash);
}
void raSqlQuery(int inCount, char *inNames[],
	char *db, char *parentField, struct lm *lm, FILE *out)
/* raSqlQuery - Do a SQL-like query on a RA file.
 *   inCount, inNames - the .ra files to read
 *   db - database name, stored on records when clAddDb is set and used for
 *        the table existence test when clStrict is set
 *   parentField - if non-NULL, records inherit missing fields from the
 *        parent named by this field
 *   lm - local memory pool for records
 *   out - where results go: one .ra stanza per matching record for a
 *        'select' query, otherwise a single line holding the match count
 * The query text comes from clQuery if set, otherwise from the file named
 * by clQueryFile. */
{
struct lineFile *query;
if (clQuery)
    query = lineFileOnString("query", TRUE, cloneString(clQuery));
else
    query = lineFileOpen(clQueryFile, TRUE);

struct raRecord *raList = readRaRecords(inCount, inNames, clKey,
	clMerge, clAddFile, db, clAddDb, clOverrideNeeded, lm);
if (parentField != NULL)
    inheritFromParents(raList, parentField, clNoInheritField, lm);

if (clRestrict)
    {
    /* Keep only the records whose key is listed in the restrict file. */
    struct hash *restrictHash = hashAllWordsInFile(clRestrict);
    struct raRecord *newList = NULL, *next, *rec;
    for (rec = raList; rec != NULL; rec = next)
	{
	next = rec->next;	/* Saved since slAddHead rewrites rec->next. */
	if (rec->key && hashLookup(restrictHash, rec->key))
	    slAddHead(&newList, rec);
	}
    slReverse(&newList);
    raList = newList;
    hashFree(&restrictHash);
    }

struct rqlStatement *rql = rqlStatementParse(query);
verbose(2, "Got %d records in raFiles\n", slCount(raList));
if (verboseLevel() > 1)
    rqlStatementDump(rql, stderr);

struct raRecord *ra;
int matchCount = 0;
boolean doSelect = sameString(rql->command, "select");
for (ra = raList; ra != NULL; ra = ra->next)
    {
    if (!rqlStatementMatch(rql, ra))
	continue;
    /* In strict mode the record's key must also pass hTableOrSplitExists. */
    if (clStrict && !(ra->key && hTableOrSplitExists(db, ra->key)))
	continue;
    matchCount += 1;
    if (doSelect)
	rqlStatementOutput(rql, ra, (clAddFile ? "file" : NULL), clAddDb, out);
    }

/* Anything other than 'select' reports just the count, to the same stream
 * the caller asked the results to be written to. */
if (!doSelect)
    fprintf(out, "%d\n", matchCount);
}
int main(int argc, char *argv[])
/* Parse the command line, check that the options make sense together, then
 * run the query either on the files named on the command line or, with -db,
 * on the trackDb files of each selected database. */
{
optionInit(&argc, argv, options);

/* Query source. */
clQueryFile = optionVal("queryFile", NULL);
clQuery = optionVal("query", NULL);

/* Field names. */
clKey = optionVal("key", clKey);
clParentField = optionVal("parentField", clParentField);
clNoInheritField = optionVal("noInheritField", clNoInheritField);

/* Flags and the rest. */
clMerge = optionExists("merge");
clParent = optionExists("parent");
clAddFile = optionExists("addFile");
clAddDb = optionExists("addDb");
clStrict = optionExists("strict");
clOverrideNeeded = optionExists("overrideNeeded");
clRestrict = optionVal("restrict", NULL);
clDb = optionVal("db", NULL);

if (argc < 2 && !clDb)
    usage();
boolean haveQueryFile = (clQueryFile != NULL);
boolean haveQuery = (clQuery != NULL);
if (!haveQueryFile && !haveQuery)
    errAbort("Please specify either the query or queryFile option.");
if (haveQueryFile && haveQuery)
    errAbort("Please specify just one of the query or queryFile options.");
struct lm *lm = lmInit(0);
if (clStrict && clDb == NULL)
    errAbort("Only can use -strict with -db.");

if (clDb == NULL)
    {
    /* Plain mode: every remaining argument is a .ra file. */
    char *parentField = (clParent ? clParentField : NULL);
    raSqlQuery(argc-1, argv+1, "n/a", parentField, lm, stdout);
    return 0;
    }

/* Database mode implies merging, parenting, override handling and the
 * "track" key. */
boolean allDbs = sameString(clDb, "all");
clMerge = TRUE;
clParent = TRUE;
clOverrideNeeded = TRUE;
clKey = "track";
if (allDbs)
    clAddDb = TRUE;
char *parentField = clParentField;

if (argc != 1)
    errAbort("You can't specify any input files with the -db option.");
struct dbPath *dbList = getDbPathList(clTrackDbRootDir);
struct dbPath *db;
int queriedCount = 0;
for (db = dbList; db != NULL; db = db->next)
    {
    if (!allDbs && !sameString(clDb, db->db))
	continue;
    struct slName *pathList = dbPathToFiles(db);
    int fileCount = slCount(pathList);
    if (fileCount == 0)
	errAbort("No paths returned by dbToTrackDbFiles(%s)", clDb);

    /* raSqlQuery wants an array of names rather than a list. */
    char **fileNames;
    AllocArray(fileNames, fileCount);
    int i = 0;
    struct slName *path;
    for (path = pathList; path != NULL; path = path->next)
	fileNames[i++] = path->name;

    raSqlQuery(fileCount, fileNames, db->db, parentField, lm, stdout);
    ++queriedCount;
    }
if (queriedCount == 0)
    errAbort("No database named %s found off %s\n", clDb, clTrackDbRootDir);
return 0;
}