src/utils/raSqlQuery/raSqlQuery.c 1.21

1.21 2009/11/22 05:28:52 kent
Implementing from as file list.
Index: src/utils/raSqlQuery/raSqlQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/raSqlQuery/raSqlQuery.c,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/utils/raSqlQuery/raSqlQuery.c	22 Nov 2009 02:10:20 -0000	1.20
+++ src/utils/raSqlQuery/raSqlQuery.c	22 Nov 2009 05:28:52 -0000	1.21
@@ -1,526 +1,545 @@
 /* raSqlQuery - Do a SQL-like query on a RA file.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "dystring.h"
 #include "options.h"
 #include "obscure.h"
 #include "ra.h"
 #include "localmem.h"
 #include "tokenizer.h"
 #include "sqlNum.h"
 #include "raRecord.h"
 #include "rql.h"
 #include "portable.h"
 #include "../../hg/inc/hdb.h"  /* Just for strict option. */
 
 static char const rcsid[] = "$Id$";
 
 /* Command line settings.  Most of these are filled in from the option table
  * by main(); the two clTrackDb* strings are fixed at compile time. */
 static char *clQueryFile = NULL;	/* -queryFile: file holding the query text. */
 static char *clQuery = NULL;		/* -query: query text given directly. */
 static char *clKey = "name";		/* -key: key field; main() forces "track" when -db is used. */
 static char *clParentField = "subTrack";	/* -parentField: field naming a record's parent. */
 static char *clNoInheritField = "noInherit";	/* -noInheritField: presence blocks inheritance. */
 static boolean clMerge = FALSE;		/* -merge: merge records sharing a key. */
 static boolean clParent = FALSE;	/* -parent: inherit fields through clParentField. */
 static boolean clAddFile = FALSE;	/* -addFile: emit a 'file' field on output. */
 static boolean clAddDb = FALSE;		/* -addDb: emit a 'db' field on output. */
 static char *clRestrict = NULL;		/* -restrict: file of keys to limit output to. */
 static boolean clStrict = FALSE;	/* -strict: only report records whose table exists in db. */
 static char *clDb = NULL;		/* -db: database name, or "all". */
 static boolean clOverrideNeeded = FALSE;	/* -overrideNeeded: see usage text. */
 
 /* Root directory handed to getDbPathList() when -db is used. */
 static char *clTrackDbRootDir = "~/kent/src/hg/makeDb/trackDb";
 /* Space separated file patterns, each relative to a database directory;
  * expanded by dbPathToFiles(). */
 static char *clTrackDbRelPath = "../../trackDb*.ra ../trackDb*.ra trackDb*.ra"; 
 
 void usage()
 /* Explain usage and exit.  The text is passed to errAbort as a format string,
  * so literal percent signs are written as %% and the two %s slots are filled
  * with the current defaults of clKey and clParentField. */
 {
 errAbort(
   "raSqlQuery - Do a SQL-like query on a RA file.\n"
   "   raSqlQuery raFile(s) query-options\n"
   "or\n"
   "   raSqlQuery -db=dbName query-options\n"
   "Where dbName is a UCSC Genome database like hg18, sacCer1, etc.\n"
   "One of the following query-options must be specified\n"
   "   -queryFile=fileName\n"
   "   \"-query=select list,of,fields where field='this'\"\n"
   "The queryFile just has a query in it in the same form as the query option.\n"
-  "The syntax of a query statement is very SQL-like.  It must begin with either\n"
-  "'select' or 'count'.  Select is followed by a field list, or '*' for all fields\n"
-  "Count is not followed by anything.  The 'where' clause is optional, and if it\n"
-  "exists it can contain expressions involving fields, numbers, strings, arithmetic, 'and'\n"
-  "'or' and so forth.  Unlike SQL there is no 'from' clause.\n"
-  "Other options:\n"
+  "The syntax of a query statement is very SQL-like. The most common commands are:\n"
+  "    select tag1,tag2,tag3 where tag1 like 'prefix%%'\n"
+  "where the %% is a SQL wildcard.  Sorry to mix wildcards. Another command query is\n"
+  "    select count(*) where tag = 'val'\n"
+  "The from list is optional.  If it exists it is a list of raFile names\n"
+  "    select track,type from *Encode* where type like 'bigWig%%'\n"
+  "Other command line options:\n"
   "   -addFile - Add 'file' field to say where record is defined\n"
   "   -addDb - Add 'db' field to say where record is defined\n"
   "   -strict - Used only with db option.  Only report tracks that exist in db\n"
   "   -key=keyField - Use this as the key field for merges and parenting. Default %s\n"
   "   -parent - Merge together inheriting on parentField\n"
   "   -parentField=field - Use field as the one that tells us who is our parent. Default %s\n"
   "   -overrideNeeded - If set records are only overridden field-by-field by later records\n"
   "               if 'override' follows the track name. Otherwise later record replaces\n"
   "               earlier record completely.  If not set all records overridden field by field\n"
   "   -noInheritField=field - If field is present don't inherit fields from parent\n"
   "   -merge - If there are multiple raFiles, records with the same keyField will be\n"
   "          merged together with fields in later files overriding fields in earlier files\n"
   "   -restrict=keyListFile - restrict output to only ones with keys in file.\n"
   "   -db=hg19 - Acts on trackDb files for the given database.  Sets up list of files\n"
   "              appropriately and sets parent, merge, and override all.\n"
   "              Use db=all for all databases\n"
-  "The output will be to stdout, in the form of a .ra file if the select command is used\n"
-  "and just a simple number if the count command is used.\n"
   , clKey, clParentField
   );
 }
 
 
 /* Command line options accepted by this program.  The table is handed to
  * optionInit() in main() and ends with a NULL-named entry. */
 static struct optionSpec options[] = {
    {"queryFile", OPTION_STRING},
    {"query", OPTION_STRING},
    {"merge", OPTION_BOOLEAN},
    {"key", OPTION_STRING},
    {"parent", OPTION_BOOLEAN},
    {"parentField", OPTION_STRING},
    {"noInheritField", OPTION_STRING},
    {"addFile", OPTION_BOOLEAN},
    {"addDb", OPTION_BOOLEAN},
    {"restrict", OPTION_STRING},
    {"strict", OPTION_BOOLEAN},
    {"db", OPTION_STRING},
    {"overrideNeeded", OPTION_BOOLEAN},
    {NULL, 0},	/* End of table marker. */
 };
 
 
 struct dbPath
 /* A database directory and path.  Built by getDbPathList(), one per
  * directory that contains a trackDb.ra file. */
     {
     struct dbPath *next;	/* Next in singly linked list. */
     char *db;			/* Database name: the part of dir after the last '/'. */
     char *dir;			/* Path of the directory holding trackDb.ra. */
     };
 
 static struct dbPath *getDbPathList(char *rootDir)
 /* Look at every directory two levels below rootDir and return a dbPath for
  * each one that contains a trackDb.ra file.  The db name of an entry is the
  * last component of its directory path. */
 {
 char *topDir = simplifyPathToDir(rootDir);
 struct dbPath *resultList = NULL;
 struct fileInfo *orgList = listDirX(topDir, "*", TRUE);
 struct fileInfo *orgEntry;
 for (orgEntry = orgList; orgEntry != NULL; orgEntry = orgEntry->next)
     {
     if (!orgEntry->isDir)
         continue;
     struct fileInfo *dbDirList = listDirX(orgEntry->name, "*", TRUE);
     struct fileInfo *dbEntry;
     for (dbEntry = dbDirList; dbEntry != NULL; dbEntry = dbEntry->next)
         {
         if (!dbEntry->isDir)
             continue;
         char raPath[PATH_LEN];
         safef(raPath, sizeof(raPath), "%s/trackDb.ra", dbEntry->name);
         if (!fileExists(raPath))
             continue;
         struct dbPath *el;
         AllocVar(el);
         el->dir = cloneString(dbEntry->name);
         /* The database name is whatever follows the final slash. */
         char *s = strrchr(dbEntry->name, '/');
         assert(s != NULL);
         el->db = cloneString(s+1);
         slAddHead(&resultList, el);
         }
     slFreeList(&dbDirList);
     }
 slFreeList(&orgList);
 slReverse(&resultList);
 freez(&topDir);
 return resultList;
 }
 
 static struct slName *dbPathToFiles(struct dbPath *p)
 /* Expand each space separated pattern of clTrackDbRelPath relative to p->dir
  * and return the names of all files found, as an slName list. */
 {
 struct slName *fileList = NULL;
 char *patterns = cloneString(clTrackDbRelPath);	/* nextWord modifies its input. */
 char *cursor = patterns;
 char *relPattern;
 for (relPattern = nextWord(&cursor); relPattern != NULL; relPattern = nextWord(&cursor))
     {
     /* Break the pattern into directory part and file-name part. */
     char relDir[PATH_LEN], relFile[PATH_LEN], relSuffix[PATH_LEN];
     splitPath(relPattern, relDir, relFile, relSuffix);

     char joinedDir[PATH_LEN];
     safef(joinedDir, sizeof(joinedDir), "%s/%s", p->dir, relDir);
     char *searchDir = simplifyPathToDir(joinedDir);

     char filePattern[PATH_LEN];
     safef(filePattern, sizeof(filePattern), "%s%s", relFile, relSuffix);

     struct fileInfo *foundList = listDirX(searchDir, filePattern, TRUE);
     struct fileInfo *found;
     for (found = foundList; found != NULL; found = found->next)
         slNameAddHead(&fileList, found->name);
     freeMem(searchDir);
     slFreeList(&foundList);
     }
 freeMem(patterns);
 slReverse(&fileList);
 return fileList;
 }
 
 
 
 static void mergeRecords(struct raRecord *old, struct raRecord *record, char *key, struct lm *lm)
 /* Fold record into old.  Every field of record other than the key field
  * either overwrites the value of the same-named field in old or is appended
  * to old as a new field.  The file positions of record are appended to those
  * of old. */
 {
 struct raField *src;
 for (src = record->fieldList; src != NULL; src = src->next)
     {
     if (sameString(src->name, key))
         continue;
     struct raField *dest = raRecordField(old, src->name);
     if (dest == NULL)
         {
         /* Not present in old yet: make a new field at the end of its list. */
         lmAllocVar(lm, dest);
         dest->name = src->name;
         slAddTail(&old->fieldList, dest);
         }
     dest->val = src->val;
     }
 old->posList = slCat(old->posList, record->posList);
 }
 
 static void mergeParentRecord(struct raRecord *record, struct raRecord *parent, 
 	struct lm *lm)
 /* Copy into record every field that parent has and record lacks.  Fields
  * already present in record are left alone. */
 {
 struct raField *inherited;
 for (inherited = parent->fieldList; inherited != NULL; inherited = inherited->next)
     {
     if (raRecordField(record, inherited->name) != NULL)
         continue;
     struct raField *copy;
     lmAllocVar(lm, copy);
     copy->name = inherited->name;
     copy->val = inherited->val;
     slAddTail(&record->fieldList, copy);
     }
 }
 
 static struct raRecord *readRaRecords(int inCount, char *inNames[], char *keyField,
-	boolean doMerge, boolean addFile, char *db, boolean addDb,
+	boolean doMerge, char *db, boolean addDb,
 	boolean overrideNeeded, struct lm *lm)
 /* Scan through files, merging records on key if doMerge.
  *   inCount, inNames - the .ra files to read, in order.
  *   keyField - name of the field that supplies each record's key.
  *   db, addDb - if addDb is set every record read is tagged with db.
  *   overrideNeeded - when merging, a later record that is not flagged as an
  *       override replaces the earlier record's fields and positions outright
  *       instead of being merged field by field.
  *   lm - local memory pool the records are allocated from.
  * Every record gets a file position.  When merging, records without a key are
  * dropped.  Returns the records in the order first seen, or NULL if inCount
  * is not positive. */
 {
 if (inCount <= 0)
     return NULL;
 if (doMerge)
     {
     struct raRecord *recordList = NULL, *record;
     struct hash *recordHash = hashNew(0);
     int i;
     for (i=0; i<inCount; ++i)
         {
 	char *fileName = inNames[i];
 	struct lineFile *lf = lineFileOpen(fileName, TRUE);
 	while ((record = raRecordReadOne(lf, keyField, lm)) != NULL)
 	    {
-	    if (addFile)
 	        record->posList = raFilePosNew(lm, fileName, lf->lineIx);
 	    if (addDb)
 		record->db = db;
 	    char *key = record->key;
 	    if (key != NULL)
 		{
 		struct raRecord *oldRecord = hashFindVal(recordHash, key);
 		if (oldRecord != NULL)
 		    {
 		    if (overrideNeeded && !record->override)
 		        {
 			/* Plain redefinition: later record wins completely. */
 			oldRecord->fieldList = record->fieldList;
 			oldRecord->posList = record->posList;
 			}
 		    else
 			mergeRecords(oldRecord, record, keyField, lm);
 		    }
 		else
 		    {
 		    slAddHead(&recordList, record);
 		    hashAdd(recordHash, key, record);
 		    }
 		}
 	    }
 	lineFileClose(&lf);
 	}
     slReverse(&recordList);
     return recordList;
     }
 else
     {
     struct raRecord *recordList = NULL;
     int i;
     for (i=0; i<inCount; ++i)
         {
 	char *fileName = inNames[i];
 	struct lineFile *lf = lineFileOpen(fileName, TRUE);
 	struct raRecord *record;
 	while ((record = raRecordReadOne(lf, keyField, lm)) != NULL)
 	    {
-	    if (addFile)
 	        record->posList = raFilePosNew(lm, fileName, lf->lineIx);
 	    /* Tag with db here too, so that output with -addDb but without
 	     * -merge does not print an unset db. */
 	    if (addDb)
 		record->db = db;
 	    slAddHead(&recordList, record);
 	    }
 	lineFileClose(&lf);
 	}
     slReverse(&recordList);
     return recordList;
     }
 }
 
 boolean rqlStatementMatch(struct rqlStatement *rql, struct raRecord *ra)
 /* Return TRUE if where clause in statement evaluates true for ra. */
 {
 struct rqlParse *whereClause = rql->whereClause;
 if (whereClause == NULL)
     return TRUE;
 else
     {
     struct rqlEval res = rqlEvalOnRecord(whereClause, ra);
     res = rqlEvalCoerceToBoolean(res);
     return res.val.b;
     }
 }
 
 void rqlStatementOutput(struct rqlStatement *rql, struct raRecord *ra, 
-	char *addFileField, boolean addDb, FILE *out)
+	char *addFileField, char *addDbField, FILE *out)
 /* Output fields from ra to file as "name value" lines followed by a blank line.
  * If the statement has a table (from) list, the record is written only when
  * one of its file names matches one of the table patterns; otherwise nothing
  * at all is written.  If addDbField is non-null a field with this name and
  * the record's db is written first.  The fields selected by rql->fieldList
  * follow; a selector containing wildcards is matched with wildMatch, others
  * by exact comparison.  If addFileField is non-null a field with this name is
  * added at the end, listing the file name and line number of every position
  * of the record. */
 {
-if (addDb)
-    fprintf(out, "db %s\n", ra->db);
+if (rql->tableList != NULL)
+    {
+    boolean gotMatch = FALSE;
+    struct slName *table;
+    for (table = rql->tableList; table != NULL; table = table->next)
+        {
+	struct raFilePos *fp;
+	for (fp = ra->posList; fp != NULL; fp = fp->next)
+	    {
+	    if (wildMatch(table->name, fp->fileName))
+	         {
+		 gotMatch = TRUE;
+		 break;
+		 }
+	    }
+	if (gotMatch)
+	    break;
+	}
+    if (!gotMatch)
+        return;
+    }
+if (addDbField)
+    fprintf(out, "%s %s\n", addDbField, ra->db);
 struct slName *fieldList = rql->fieldList, *field;
 for (field = fieldList; field != NULL; field = field->next)
     {
     struct raField *r;
     boolean doWild = anyWild(field->name);
     /* Scan every field of the record; a selector can match more than one. */
     for (r = ra->fieldList; r != NULL; r = r->next)
         {
 	boolean match;
 	if (doWild)
 	    match = wildMatch(field->name, r->name);
 	else
 	    match = (strcmp(field->name, r->name) == 0);
 	if (match)
 	    fprintf(out, "%s %s\n", r->name, r->val);
 	}
     }
 if (addFileField != NULL)
     {
     fprintf(out, "%s", addFileField);
     struct raFilePos *fp;
     for (fp = ra->posList; fp != NULL; fp = fp->next)
 	{
 	fprintf(out, " %s %d", fp->fileName, fp->lineIx);
 	}
     fprintf(out, "\n");
     }
 /* Blank line ends the record. */
 fprintf(out, "\n");
 }
 
 struct hash *hashAllWordsInFile(char *fileName)
 /* Read fileName and return a hash keyed by every white space separated word
  * in it.  The hash values are all NULL. */
 {
 struct hash *wordHash = hashNew(0);
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     char *word;
     for (word = nextWord(&line); word != NULL; word = nextWord(&line))
         hashAdd(wordHash, word, NULL);
     }
 lineFileClose(&lf);
 return wordHash;
 }
 
 
 static struct raRecord *findParent(struct raRecord *rec, 
 	char *parentFieldName, char *noInheritFieldName, struct hash *hash)
 /* Find parent record if possible.
  *   rec - record whose parent is wanted.
  *   parentFieldName - field of rec whose first word names the parent.
  *   noInheritFieldName - if rec has this field no parent is reported.
  *   hash - records indexed by key.
  * Returns NULL if rec has the no-inherit field, has no parent field, or names
  * a parent that is not in hash; the last case also produces a warning. */
 {
 if (raRecordField(rec, noInheritFieldName) != NULL)
     return NULL;
 struct raField *parentField = raRecordField(rec, parentFieldName);
 if (parentField == NULL)
     return NULL;
 /* Only the first word of the field value is the parent's name.  Work on a
  * heap copy because firstWordInLine gets a writable buffer, and a copy sized
  * by the input does not belong on the stack. */
 char *buf = cloneString(parentField->val);
 char *parentName = firstWordInLine(buf);
 struct raRecord *parent = hashFindVal(hash, parentName);
 if (parent == NULL)
     {
     /* Report the name that was actually looked up as the missing one. */
     warn("%s is a subTrack of %s, but %s doesn't exist", rec->key,
     	parentField->val, parentName);
     }
 freeMem(buf);
 return parent;
 }
 
 static void inheritFromParents(struct raRecord *list, char *parentField, char *noInheritField,
 	struct lm *lm)
 /* Go through list.  If an element has a parent field, then fill in non-existent fields from
  * parent, then from the parent's parent and so on up the chain.
  *   list - records to process; modified in place.
  *   parentField - name of the field that names a record's parent.
  *   noInheritField - name of the field that stops inheritance when present.
  *   lm - local memory pool for the fields that get added. */
 {
 /* Build up hash of records indexed by key field, counting records as we go. */
 struct hash *hash = hashNew(0);
 struct raRecord *rec;
 int recordCount = 0;
 for (rec = list; rec != NULL; rec = rec->next)
     {
     ++recordCount;
     if (rec->key != NULL)
 	hashAdd(hash, rec->key, rec);
     }
 
 /* Scan through doing inheritance. */
 for (rec = list; rec != NULL; rec = rec->next)
     {
     struct raRecord *parent;
     int depth = 0;
     for (parent = findParent(rec, parentField, noInheritField, hash); parent != NULL;
     	parent = findParent(parent, parentField, noInheritField, hash) )
 	{
 	/* A chain without a cycle visits each record at most once, so a
 	 * chain longer than the record count has to be circular.  Stop
 	 * instead of looping forever. */
 	if (++depth > recordCount)
 	    {
 	    warn("Circular %s chain above %s, not following it further",
 	    	parentField, (rec->key != NULL ? rec->key : "(record without key)"));
 	    break;
 	    }
 	mergeParentRecord(rec, parent, lm);
 	}
     }
 
 /* The hash only indexes the records; release it now that we are done. */
 hashFree(&hash);
 }
 
 void raSqlQuery(int inCount, char *inNames[], 
 	char *db, char *parentField, struct lm *lm, FILE *out)
 /* raSqlQuery - Do a SQL-like query on a RA file..
  *   inCount, inNames - the .ra files to read.
  *   db - database name, used to tag records and for the -strict table check.
  *   parentField - field to inherit through, or NULL for no inheritance.
  *   lm - local memory pool for the records.
  *   out - where results are written.
  * The query comes from clQuery or clQueryFile.  A select statement writes the
  * matching records; any other command writes the number of matches. */
 {
 struct lineFile *query;
 if (clQuery)
     query = lineFileOnString("query", TRUE, cloneString(clQuery));
 else
     query = lineFileOpen(clQueryFile, TRUE);
 struct raRecord *raList = readRaRecords(inCount, inNames, clKey, 
-	clMerge, clAddFile, db, clAddDb, clOverrideNeeded, lm);
+	clMerge, db, clAddDb, clOverrideNeeded, lm);
 if (parentField != NULL)
     {
     inheritFromParents(raList, parentField, clNoInheritField, lm);
     }
 if (clRestrict)
     {
     /* Keep only the records whose key is listed in the restrict file. */
     struct hash *restrictHash = hashAllWordsInFile(clRestrict);
     struct raRecord *newList = NULL, *next, *rec;
     for (rec = raList; rec != NULL; rec = next)
         {
 	next = rec->next;
 	if (rec->key && hashLookup(restrictHash, rec->key))
 	    {
 	    slAddHead(&newList, rec);
 	    }
 	}
     slReverse(&newList);
     raList = newList;
     hashFree(&restrictHash);
     }
 struct rqlStatement *rql = rqlStatementParse(query);
 verbose(2, "Got %d records in raFiles\n", slCount(raList));
 if (verboseLevel() > 1)
     rqlStatementDump(rql, stderr);
 struct raRecord *ra;
 int matchCount = 0;
 boolean doSelect = sameString(rql->command, "select");
 for (ra = raList; ra != NULL; ra = ra->next)
     {
     if (rqlStatementMatch(rql, ra))
         {
 	/* With -strict a record also needs a table of its name in db. */
 	if (!clStrict || (ra->key && hTableOrSplitExists(db, ra->key)))
 	    {
 	    matchCount += 1;
 	    if (doSelect)
 		{
-		rqlStatementOutput(rql, ra, (clAddFile ? "file" : NULL), clAddDb, out);
+		rqlStatementOutput(rql, ra, (clAddFile ? "file" : NULL), 
+			(clAddDb ? "db" : NULL), out);
 		}
 	    }
 	}
     }
 /* Write the count to the same stream as everything else. */
 if (!doSelect)
     fprintf(out, "%d\n", matchCount);
 }
 
 int main(int argc, char *argv[])
 /* Process command line.  Reads the options into the cl* globals, checks them,
  * then runs the query either once on the files named on the command line or,
  * with -db, once per matching database directory under clTrackDbRootDir. */
 {
 optionInit(&argc, argv, options);
 clMerge = optionExists("merge");
 clParent = optionExists("parent");
 clParentField = optionVal("parentField", clParentField);
 clKey = optionVal("key", clKey);
 clQueryFile = optionVal("queryFile", NULL);
 clQuery = optionVal("query", NULL);
 clNoInheritField = optionVal("noInheritField", clNoInheritField);
 clAddFile = optionExists("addFile");
 clAddDb = optionExists("addDb");
 clRestrict = optionVal("restrict", NULL);
 clStrict = optionExists("strict");
 clOverrideNeeded = optionExists("overrideNeeded");
 clDb = optionVal("db", NULL);
 /* Need at least one input file unless -db supplies the file list. */
 if (argc < 2 && !clDb)
     usage();
 /* Exactly one of -query and -queryFile has to be given. */
 if (clQueryFile == NULL && clQuery == NULL)
     errAbort("Please specify either the query or queryFile option.");
 if (clQueryFile != NULL && clQuery != NULL)
     errAbort("Please specify just one of the query or queryFile options.");
 struct lm *lm = lmInit(0);
 if (clStrict && clDb == NULL)
     errAbort("Only can use -strict with -db.");
 
 /* The -db option overrides merge, parent, overrideNeeded and key settings,
  * and -db=all also turns on the db field in the output. */
 if (clDb != NULL)
     {
     clMerge = TRUE;
     clParent = TRUE;
     clOverrideNeeded = TRUE;
     clKey = "track";
     if (sameString(clDb, "all"))
         clAddDb = TRUE;
     }
 char *parentField = (clParent ? clParentField : NULL);
 char **fileNames;
 int fileCount;
 if (clDb)
     {
     if (argc != 1)
 	 errAbort("You can't specify any input files with the -db option.");
     struct dbPath *db, *dbList = getDbPathList(clTrackDbRootDir);
     boolean gotAny = FALSE;
     for (db = dbList; db != NULL; db = db->next)
 	{
 	if (sameString(clDb, "all") || sameString(clDb, db->db))
 	    {
 	    /* Turn the file list for this database into an array and run
 	     * the query on it. */
 	    struct slName *path, *pathList = dbPathToFiles(db);
 	    fileCount = slCount(pathList);
 	    if (fileCount == 0)
 		errAbort("No paths returned by dbToTrackDbFiles(%s)", clDb);
 	    AllocArray(fileNames, fileCount);
 	    int i;
 	    for (i=0, path = pathList; path != NULL; path = path->next, ++i)
 		{
 		fileNames[i] = path->name;
 		}
 	    raSqlQuery(fileCount, fileNames, db->db, parentField, lm, stdout);
 	    gotAny = TRUE;
 	    }
 	}
     if (!gotAny)
         errAbort("No database named %s found off %s\n", clDb, clTrackDbRootDir);
     }
 else
     {
     /* Everything after the program name is an input file. */
     fileNames = argv+1;
     fileCount = argc-1;
     raSqlQuery(fileCount, fileNames, "n/a", parentField, lm, stdout);
     }
 return 0;
 }