src/utils/raSqlQuery/raSqlQuery.c 1.15

1.15 2009/11/22 00:25:26 kent
Making it deal with overrides. Making it start to work with our usual trackDb directory structure when using -db= flag.
Index: src/utils/raSqlQuery/raSqlQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/raSqlQuery/raSqlQuery.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/utils/raSqlQuery/raSqlQuery.c	20 Nov 2009 20:25:54 -0000	1.14
+++ src/utils/raSqlQuery/raSqlQuery.c	22 Nov 2009 00:25:26 -0000	1.15
@@ -10,19 +10,26 @@
 #include "tokenizer.h"
 #include "sqlNum.h"
 #include "raRecord.h"
 #include "rql.h"
+#include "portable.h"
 
 static char const rcsid[] = "$Id$";
 
 static char *clQueryFile = NULL;
 static char *clQuery = NULL;
-static char *clKey = "track";
+static char *clKey = "name";	/* Default key field; -key= replaces it and -db= forces "track". */
 static char *clParentField = "subTrack";
 static char *clNoInheritField = "noInherit";
 static boolean clMerge = FALSE;
 static boolean clParent = FALSE;
 static boolean clAddFile = FALSE;
+static char *clRestrict = NULL;	/* Value of -restrict=: file of keys to keep in output, or NULL. */
+static char *clDb = NULL;	/* Value of -db=: database name, or NULL if option not given. */
+static boolean clOverrideNeeded = FALSE;	/* Set by -overrideNeeded; also forced on by -db=. */
+
+static char *clTrackDbRootDir = "~/kent/src/hg/makeDb/trackDb";	/* Directory the dbPath dir entries are appended to. */
+static char *clTrackDbRelPath = "../../trackDb*.ra ../trackDb*.ra trackDb*.ra"; /* Space separated file patterns, each relative to a database's directory. */
 
 void usage()
 /* Explain usage and exit. */
 {
@@ -43,12 +50,18 @@
   "Other options:\n"
   "   -key=keyField - Use the as the key field for merges and parenting. Default %s\n"
   "   -parent - Merge together inheriting on parentField\n"
   "   -parentField=field - Use field as the one that tells us who is our parent. Default %s\n"
+  "   -overrideNeeded - If set records are only overridden field-by-field by later records\n"
+  "               if 'override' follows the track name. Otherwiser later record replaces\n"
+  "               earlier record completely.  If not set all records overridden field by field\n"
   "   -noInheritField=field - If field is present don't inherit fields from parent\n"
   "   -merge - If there are multiple raFiles, records with the same keyField will be\n"
   "          merged together with fields in later files overriding fields in earlier files\n"
   "   -addFile - Add 'file' field to say where record is defined\n"
+  "   -restrict=keyListFile - restrict output to only ones with keys in file, which\n"
+  "   -db=hg19 - Acts on trackDb files for the given database.  Sets up list of files\n"
+  "              appropriately and sets parent, merge, and override all.\n"
   "The output will be to stdout, in the form of a .ra file if the select command is used\n"
   "and just a simple number if the count command is used\n"
   , clKey, clParentField
   );
@@ -62,17 +76,83 @@
    {"parent", OPTION_BOOLEAN},
    {"parentField", OPTION_STRING},
    {"noInheritField", OPTION_STRING},
    {"addFile", OPTION_BOOLEAN},
+   {"restrict", OPTION_STRING},
+   {"db", OPTION_STRING},
+   {"overrideNeeded", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
-static void mergeRecords(struct raRecord *old, struct raRecord *record, struct lm *lm)
+
+struct dbPath	/* Pairs a database name with its directory under clTrackDbRootDir. */
+    {
+    char *db;	/* Database name; compared against the -db= value. */
+    char *dir;	/* Directory for this database, appended to clTrackDbRootDir. */
+    };
+
+struct dbPath dbPath[] = {	/* Databases that dbToTrackDbFiles can resolve; any other db is an error. */
+    {"hg19", "human/hg19"},
+    {"hg18", "human/hg18"},
+    {"hg17", "human/hg17"},
+    {"mm9", "mouse/mm9"},
+    {"mm8", "mouse/mm8"},
+};
+
+static struct slName *dbPathToFiles(struct dbPath *p)
+/* Convert dbPath to a list of files.  Each word of clTrackDbRelPath is treated as a file pattern relative to the database's directory; matches are returned in word order. */
+{
+struct slName *pathList = NULL;
+char dbDir[PATH_LEN];
+safef(dbDir, sizeof(dbDir), "%s/%s", clTrackDbRootDir, p->dir);	/* Root directory plus this database's subdirectory. */
+char *buf = cloneString(clTrackDbRelPath);	/* Work on a copy, freed below; presumably nextWord modifies its input - confirm. */
+char *line = buf, *word;
+while ((word = nextWord(&line)) != NULL)
+    {
+    char relDir[PATH_LEN], relFile[PATH_LEN], relSuffix[PATH_LEN];
+    splitPath(word, relDir, relFile, relSuffix);	/* Separate the pattern's directory part from its file name and suffix. */
+    char dir[PATH_LEN];
+    safef(dir, sizeof(dir), "%s/%s", dbDir, relDir);
+    char *path = simplifyPathToDir(dir);	/* Presumably resolves "~" and ".." components - confirm against the library. */
+    char pattern[PATH_LEN];
+    safef(pattern, sizeof(pattern), "%s%s", relFile, relSuffix);	/* File name plus suffix, without the directory part. */
+    struct fileInfo *fi, *fiList = listDirX(path, pattern, TRUE);	/* NOTE(review): TRUE presumably requests full path names - confirm. */
+    for (fi = fiList; fi != NULL; fi = fi->next)
+	slNameAddHead(&pathList, fi->name);	/* Prepend now; the list is reversed once after the loop. */
+    freeMem(path);
+    slFreeList(&fiList);
+    }
+freeMem(buf);
+slReverse(&pathList);
+return pathList;
+}
+
+static struct slName *dbToTrackDbFiles(char *db)
+/* Given a database, figure out list of trackDb files.  Looks db up in the dbPath table and calls errAbort if it is not there. */
+{
+int i;
+for (i=0; i<ArraySize(dbPath); ++i)
+    {
+    struct dbPath *p = &dbPath[i];
+    if (sameString(p->db, db))
+        {
+	return dbPathToFiles(p);	/* First table entry with a matching name wins. */
+	}
+    }
+errAbort("Couldn't find db %s", db);
+return NULL;	/* Presumably never reached, assuming errAbort does not return - confirm. */
+}
+
+
+
+static void mergeRecords(struct raRecord *old, struct raRecord *record, char *key, struct lm *lm)
 /* Merge record into old,  updating any old fields with new record values. */
 {
 struct raField *field;
 for (field = record->fieldList; field != NULL; field = field->next)
     {
+    if (!sameString(field->name, key))
+	{
     struct raField *oldField = raRecordField(old, field->name);
     if (oldField != NULL)
         oldField->val = field->val;
     else
@@ -82,12 +162,14 @@
 	oldField->val = field->val;
 	slAddTail(&old->fieldList, oldField);
 	}
     }
+    }
 old->posList = slCat(old->posList, record->posList);
 }
 
-static void mergeParentRecord(struct raRecord *record, struct raRecord *parent, struct lm *lm)
+static void mergeParentRecord(struct raRecord *record, struct raRecord *parent, 
+	struct lm *lm)
 /* Merge in parent record.  This only updates fields that are in parent but not record. */
 {
 struct raField *parentField;
 for (parentField= parent->fieldList; parentField!= NULL; parentField= parentField->next)
@@ -103,64 +185,45 @@
 	}
     }
 }
 
-struct raField *makeKeyField(struct raRecord *record, char *key, struct lm *lm)
-/* Make up key field if possible from field with name of key.  If not possible then
- * return NULL.  May have to munge keyField to just include first word. */
-{
-/* See if can find key at all. */
-struct raField *fullKey = raRecordField(record, key);
-if (fullKey == NULL)
-    return NULL;
-
-/* See if it has more than one word by looking for spaces. */
-char *fullKeyVal = fullKey->val;
-char *endFirstWord = skipToSpaces(fullKeyVal);
-if (endFirstWord == NULL)
-    return fullKey;
-
-/* If it does have more than one word, make up a new key. */
-struct raField *shortKey;
-lmAllocVar(lm, shortKey);
-shortKey->name = fullKey->name;
-shortKey->val = lmCloneStringZ(lm, fullKeyVal, endFirstWord - fullKeyVal);
-return shortKey;
-}
-
-static struct raRecord *readRaRecords(int inCount, char *inNames[], 
-	char *mergeField, boolean addFile, struct lm *lm)
-/* Scan through files, merging records on mergeField if it is non-NULL. */
+static struct raRecord *readRaRecords(int inCount, char *inNames[], char *keyField,
+	boolean doMerge, boolean addFile, boolean overrideNeeded, struct lm *lm)
+/* Scan through files, merging records on key if doMerge. */
 {
 if (inCount <= 0)
     return NULL;
-if (mergeField)
+if (doMerge)
     {
     struct raRecord *recordList = NULL, *record;
     struct hash *recordHash = hashNew(0);
-    slReverse(&recordList);
     int i;
     for (i=0; i<inCount; ++i)
         {
 	char *fileName = inNames[i];
 	struct lineFile *lf = lineFileOpen(fileName, TRUE);
-	while ((record = raRecordReadOne(lf, lm)) != NULL)
+	while ((record = raRecordReadOne(lf, keyField, lm)) != NULL)
 	    {
 	    if (addFile)
 	        record->posList = raFilePosNew(lm, fileName, lf->lineIx);
-	    struct raField *keyField = makeKeyField(record, mergeField, lm);
-	    if (keyField != NULL)
+	    char *key = record->key;
+	    if (key != NULL)
 		{
-		struct raRecord *oldRecord = hashFindVal(recordHash, keyField->val);
+		struct raRecord *oldRecord = hashFindVal(recordHash, key);
 		if (oldRecord != NULL)
 		    {
-		    mergeRecords(oldRecord, record, lm);
+		    if (overrideNeeded && !record->override)
+		        {
+			oldRecord->fieldList = record->fieldList;
+			oldRecord->posList = record->posList;
+			}
+		    else
+			mergeRecords(oldRecord, record, keyField, lm);
 		    }
 		else
 		    {
-		    record->key = keyField;
 		    slAddHead(&recordList, record);
-		    hashAdd(recordHash, keyField->val, record);
+		    hashAdd(recordHash, key, record);
 		    }
 		}
 	    }
 	lineFileClose(&lf);
@@ -176,9 +239,9 @@
         {
 	char *fileName = inNames[i];
 	struct lineFile *lf = lineFileOpen(fileName, TRUE);
 	struct raRecord *record;
-	while ((record = raRecordReadOne(lf, lm)) != NULL)
+	while ((record = raRecordReadOne(lf, keyField, lm)) != NULL)
 	    {
 	    if (addFile)
 	        record->posList = raFilePosNew(lm, fileName, lf->lineIx);
 	    slAddHead(&recordList, record);
@@ -237,19 +300,24 @@
     }
 fprintf(out, "\n");
 }
 
-static void addMissingKeys(struct raRecord *list, char *keyField, struct lm *lm)
-/* Add key to all raRecords that don't already have it. */
+struct hash *hashAllWordsInFile(char *fileName)
+/* Make a hash of all space or line delimited words in file.  Every word is stored as a key with a NULL value; the returned hash is not freed here. */
 {
-struct raRecord *rec;
-for (rec = list; rec != NULL; rec = rec->next)
+struct hash *hash = hashNew(0);
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *line, *word;
+while (lineFileNext(lf, &line, NULL))
     {
-    if (rec->key == NULL)
-        rec->key = makeKeyField(rec, keyField, lm);
+    while ((word = nextWord(&line)) != NULL)
+        hashAdd(hash, word, NULL);	/* Only the key is used; hashAdd is called once per word occurrence. */
     }
+lineFileClose(&lf);
+return hash;
 }
 
+
 static struct raRecord *findParent(struct raRecord *rec, 
 	char *parentFieldName, char *noInheritFieldName, struct hash *hash)
 /* Find parent field if possible. */
 {
@@ -265,9 +333,9 @@
 strcpy(buf, parentLine);
 char *parentName = firstWordInLine(buf);
 struct raRecord *parent = hashFindVal(hash, parentName);
 if (parent == NULL)
-     warn("%s is a subTrack of %s, but %s doesn't exist", rec->key->val,
+     warn("%s is a subTrack of %s, but %s doesn't exist", rec->key,
      	parentField->val, parentField->val);
 return parent;
 }
 
@@ -281,9 +349,9 @@
 struct raRecord *rec;
 for (rec = list; rec != NULL; rec = rec->next)
     {
     if (rec->key != NULL)
-	hashAdd(hash, rec->key->val, rec);
+	hashAdd(hash, rec->key, rec);
     }
 
 /* Scan through doing inheritance. */
 for (rec = list; rec != NULL; rec = rec->next)
@@ -296,18 +364,35 @@
 	}
     }
 }
 
-void raSqlQuery(int inCount, char *inNames[], struct lineFile *query, char *mergeField, 
+void raSqlQuery(int inCount, char *inNames[], struct lineFile *query, boolean doMerge, 
 	char *parentField, char *noInheritField, struct lm *lm, FILE *out)
 /* raSqlQuery - Do a SQL-like query on a RA file.. */
 {
-struct raRecord *raList = readRaRecords(inCount, inNames, mergeField, clAddFile, lm);
+struct raRecord *raList = readRaRecords(inCount, inNames, clKey, 
+	doMerge, clAddFile, clOverrideNeeded, lm);
 if (parentField != NULL)
     {
-    addMissingKeys(raList, clKey, lm);
     inheritFromParents(raList, parentField, noInheritField, lm);
     }
+if (clRestrict)
+    {
+    struct hash *restrictHash = hashAllWordsInFile(clRestrict);
+    restrictHash = hashAllWordsInFile(clRestrict);
+    struct raRecord *newList = NULL, *next, *rec;
+    for (rec = raList; rec != NULL; rec = next)
+        {
+	next = rec->next;
+	if (rec->key && hashLookup(restrictHash, rec->key))
+	    {
+	    slAddHead(&newList, rec);
+	    }
+	}
+    slReverse(&newList);
+    raList = newList;
+    hashFree(&restrictHash);
+    }
 struct rqlStatement *rql = rqlStatementParse(query);
 verbose(2, "Got %d records in raFiles\n", slCount(raList));
 if (verboseLevel() > 1)
     rqlStatementDump(rql, stderr);
@@ -332,18 +417,21 @@
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
-if (argc < 2)
-    usage();
 clMerge = optionExists("merge");
 clParent = optionExists("parent");
 clParentField = optionVal("parentField", clParentField);
 clKey = optionVal("key", clKey);
 clQueryFile = optionVal("queryFile", NULL);
 clQuery = optionVal("query", NULL);
 clNoInheritField = optionVal("noInheritField", clNoInheritField);
 clAddFile = optionExists("addFile");
+clRestrict = optionVal("restrict", NULL);
+clOverrideNeeded = optionExists("overrideNeeded");
+clDb = optionVal("db", NULL);
+if (argc < 2 && !clDb)
+    usage();
 if (clQueryFile == NULL && clQuery == NULL)
     errAbort("Please specify either the query or queryFile option.");
 if (clQueryFile != NULL && clQuery != NULL)
     errAbort("Please specify just one of the query or queryFile options.");
@@ -352,9 +440,38 @@
     query = lineFileOnString("query", TRUE, cloneString(clQuery));
 else
     query = lineFileOpen(clQueryFile, TRUE);
 struct lm *lm = lmInit(0);
-char *mergeField = (clMerge ? clKey : NULL);
+
+if (clDb != NULL)
+    {
+    clMerge = TRUE;
+    clParent = TRUE;
+    clOverrideNeeded = TRUE;
+    clKey = "track";
+    }
+char **fileNames;
+int fileCount;
+if (clDb)
+    {
+    if (argc != 1)
+         usage();
+    struct slName *path, *pathList = dbToTrackDbFiles(clDb);
+    fileCount = slCount(pathList);
+    if (fileCount == 0)
+        errAbort("No paths returned by dbToTrackDbFiles(%s)", clDb);
+    AllocArray(fileNames, fileCount);
+    int i;
+    for (i=0, path = pathList; path != NULL; path = path->next, ++i)
+	{
+	fileNames[i] = path->name;
+	}
+    }
+else
+    {
+    fileNames = argv+1;
+    fileCount = argc-1;
+    }
 char *parentField = (clParent ? clParentField : NULL);
-raSqlQuery(argc-1, argv+1, query, mergeField, parentField, clNoInheritField, lm, stdout);
+raSqlQuery(fileCount, fileNames, query, clMerge, parentField, clNoInheritField, lm, stdout);
 return 0;
 }