fcfb73cd2dc944a0a10ef34699dcb0c185ea8e6a max Wed May 26 05:12:50 2021 -0700 Trying to give users a clearer idea how many licenses this repo contains, what is under which license and who to contact to get the license. refs #27614 diff --git src/hg/utils/raSqlQuery/raSqlQuery.c src/hg/utils/raSqlQuery/raSqlQuery.c new file mode 100644 index 0000000..3b9a548 --- /dev/null +++ src/hg/utils/raSqlQuery/raSqlQuery.c @@ -0,0 +1,622 @@ +/* raSqlQuery - Do a SQL-like query on a RA file.. */ + +/* Copyright (C) 2012 The Regents of the University of California + * See README in this or parent directory for licensing information. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "dystring.h" +#include "options.h" +#include "obscure.h" +#include "ra.h" +#include "localmem.h" +#include "tokenizer.h" +#include "sqlNum.h" +#include "raRecord.h" +#include "rql.h" +#include "portable.h" +#include "../../hg/inc/hdb.h" /* Just for strict option. */ + + +static char *clQueryFile = NULL; +static char *clQuery = NULL; +static char *clKey = "name"; +static char *clParentField = "subTrack"; +static char *clNoInheritField = "noInherit"; +static boolean clMerge = FALSE; +static boolean clParent = FALSE; +static boolean clAddFile = FALSE; +static boolean clAddDb = FALSE; +static char *clRestrict = NULL; +static boolean clStrict = FALSE; +static char *clDb = NULL; +static boolean clOverrideNeeded = FALSE; + +static char *clTrackDbRootDir = "~/kent/src/hg/makeDb/trackDb"; +static char *clTrackDbRelPath = "../../trackDb*.ra ../trackDb*.ra trackDb*.ra"; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "raSqlQuery - Do a SQL-like query on a RA file.\n" + " raSqlQuery raFile(s) query-options\n" + "or\n" + " raSqlQuery -db=dbName query-options\n" + "Where dbName is a UCSC Genome database like hg18, sacCer1, etc.\n" + "One of the following query-options must be specified\n" + " -queryFile=fileName\n" + " \"-query=select list,of,fields from file where field='this'\"\n" + "The queryFile just has a query in it in the same form as the query option.\n" + "The syntax of a query statement is very SQL-like. The most common commands are:\n" + " select tag1,tag2,tag3 where tag1 like 'prefix%%'\n" + "where the %% is a SQL wildcard. Sorry to mix wildcards. Another command query is\n" + " select count(*) from * where tag = 'val\n" + "The from list is optional. If it exists it is a list of raFile names\n" + " select track,type from *Encode* where type like 'bigWig%%'\n" + "Other command line options:\n" + " -addFile - Add 'file' field to say where record is defined\n" + " -addDb - Add 'db' field to say where record is defined\n" + " -strict - Used only with db option. Only report tracks that exist in db\n" + " -key=keyField - Use the as the key field for merges and parenting. Default %s\n" + " -parent - Merge together inheriting on parentField\n" + " -parentField=field - Use field as the one that tells us who is our parent. Default %s\n" + " -overrideNeeded - If set records are only overridden field-by-field by later records\n" + " if 'override' follows the track name. Otherwiser later record replaces\n" + " earlier record completely. If not set all records overridden field by field\n" + " -noInheritField=field - If field is present don't inherit fields from parent\n" + " -merge - If there are multiple raFiles, records with the same keyField will be\n" + " merged together with fields in later files overriding fields in earlier files\n" + " -restrict=keyListFile - restrict output to only ones with keys in file.\n" + " -db=hg19 - Acts on trackDb files for the given database. Sets up list of files\n" + " appropriately and sets parent, merge, and override all.\n" + " Use db=all for all databases\n" + , clKey, clParentField + ); +} + + +static struct optionSpec options[] = { + {"queryFile", OPTION_STRING}, + {"query", OPTION_STRING}, + {"merge", OPTION_BOOLEAN}, + {"key", OPTION_STRING}, + {"parent", OPTION_BOOLEAN}, + {"parentField", OPTION_STRING}, + {"noInheritField", OPTION_STRING}, + {"addFile", OPTION_BOOLEAN}, + {"addDb", OPTION_BOOLEAN}, + {"restrict", OPTION_STRING}, + {"strict", OPTION_BOOLEAN}, + {"db", OPTION_STRING}, + {"overrideNeeded", OPTION_BOOLEAN}, + {NULL, 0}, +}; + + +struct dbPath +/* A database directory and path. */ + { + struct dbPath *next; + char *db; + char *dir; + }; + +static struct dbPath *getDbPathList(char *rootDir) +/* Get list of all "database" directories with any trackDb.ra files two under us. */ +{ +char *root = simplifyPathToDir(rootDir); +struct dbPath *pathList = NULL, *path; +struct fileInfo *org, *orgList = listDirX(root, "*", TRUE); +for (org = orgList; org != NULL; org = org->next) + { + if (org->isDir) + { + struct fileInfo *db, *dbList = listDirX(org->name, "*", TRUE); + for (db = dbList; db != NULL; db = db->next) + { + if (db->isDir) + { + char trackDbPath[PATH_LEN]; + safef(trackDbPath, sizeof(trackDbPath), "%s/trackDb.ra", db->name); + if (fileExists(trackDbPath)) + { + AllocVar(path); + path->dir = cloneString(db->name); + char *s = strrchr(db->name, '/'); + assert(s != NULL); + path->db = cloneString(s+1); + slAddHead(&pathList, path); + } + } + } + slFreeList(&dbList); + } + } +slFreeList(&orgList); +slReverse(&pathList); +freez(&root); +return pathList; +} + +static struct slName *dbPathToFiles(struct dbPath *p) +/* Convert dbPath to a list of files. */ +{ +struct slName *pathList = NULL; +char *dbDir = p->dir; +char *buf = cloneString(clTrackDbRelPath); +char *line = buf, *word; +while ((word = nextWord(&line)) != NULL) + { + char relDir[PATH_LEN], relFile[PATH_LEN], relSuffix[PATH_LEN]; + splitPath(word, relDir, relFile, relSuffix); + char dir[PATH_LEN]; + safef(dir, sizeof(dir), "%s/%s", dbDir, relDir); + char *path = simplifyPathToDir(dir); + char pattern[PATH_LEN]; + safef(pattern, sizeof(pattern), "%s%s", relFile, relSuffix); + struct fileInfo *fi, *fiList = listDirX(path, pattern, TRUE); + for (fi = fiList; fi != NULL; fi = fi->next) + slNameAddHead(&pathList, fi->name); + freeMem(path); + slFreeList(&fiList); + } +freeMem(buf); +slReverse(&pathList); +return pathList; +} + + + +static void mergeRecords(struct raRecord *old, struct raRecord *record, char *key, struct lm *lm) +/* Merge record into old, updating any old fields with new record values. */ +{ +struct raField *field; +for (field = record->fieldList; field != NULL; field = field->next) + { + if (!sameString(field->name, key)) + { + struct raField *oldField = raRecordField(old, field->name); + if (oldField != NULL) + oldField->val = field->val; + else + { + lmAllocVar(lm, oldField); + oldField->name = field->name; + oldField->val = field->val; + slAddTail(&old->fieldList, oldField); + } + } + } +old->posList = slCat(old->posList, record->posList); +} + +static void mergeParentRecord(struct raRecord *record, struct raRecord *parent, + struct lm *lm) +/* Merge in parent record. This only updates fields that are in parent but not record. */ +{ +struct raField *parentField; +for (parentField= parent->fieldList; parentField!= NULL; parentField= parentField->next) + { + struct raField *oldField = raRecordField(record, parentField->name); + if (oldField == NULL) + { + struct raField *newField; + lmAllocVar(lm, newField); + newField->name = parentField->name; + newField->val = parentField->val; + slAddTail(&record->fieldList, newField); + } + } +} + +static struct raRecord *readRaRecords(int inCount, char *inNames[], char *keyField, + boolean doMerge, char *db, boolean addDb, + boolean overrideNeeded, struct lm *lm) +/* Scan through files, merging records on key if doMerge. */ +{ +if (inCount <= 0) + return NULL; +if (doMerge) + { + struct raRecord *recordList = NULL, *record; + struct hash *recordHash = hashNew(0); + int i; + for (i=0; i<inCount; ++i) + { + char *fileName = inNames[i]; + struct lineFile *lf = lineFileOpen(fileName, TRUE); + while ((record = raRecordReadOne(lf, keyField, lm)) != NULL) + { + record->posList = raFilePosNew(lm, fileName, lf->lineIx); + if (addDb) + record->db = db; + char *key = record->key; + if (key != NULL) + { + struct raRecord *oldRecord = hashFindVal(recordHash, key); + if (oldRecord != NULL) + { + if (overrideNeeded && !record->override) + { + oldRecord->fieldList = record->fieldList; + oldRecord->posList = record->posList; + oldRecord->settingsByView = record->settingsByView; + oldRecord->subGroups = record->subGroups; + oldRecord->view = record->view; + oldRecord->viewHash = record->viewHash; + } + else + mergeRecords(oldRecord, record, keyField, lm); + } + else + { + slAddHead(&recordList, record); + hashAdd(recordHash, key, record); + } + } + } + lineFileClose(&lf); + } + slReverse(&recordList); + return recordList; + } +else + { + struct raRecord *recordList = NULL; + int i; + for (i=0; i<inCount; ++i) + { + char *fileName = inNames[i]; + struct lineFile *lf = lineFileOpen(fileName, TRUE); + struct raRecord *record; + while ((record = raRecordReadOne(lf, keyField, lm)) != NULL) + { + record->posList = raFilePosNew(lm, fileName, lf->lineIx); + slAddHead(&recordList, record); + } + lineFileClose(&lf); + } + slReverse(&recordList); + return recordList; + } +} + +static char *lookupField(void *record, char *key) +/* Lookup a field in a raRecord. */ +{ +struct raRecord *ra = record; +struct raField *field = raRecordField(ra, key); +if (field == NULL) + return NULL; +else + return field->val; +} + +boolean rqlStatementMatch(struct rqlStatement *rql, struct raRecord *ra, struct lm *lm) +/* Return TRUE if where clause and tableList in statement evaluates true for ra. */ +{ +if (rql->tableList != NULL) + { + boolean gotMatch = FALSE; + struct slName *table; + for (table = rql->tableList; table != NULL; table = table->next) + { + struct raFilePos *fp; + for (fp = ra->posList; fp != NULL; fp = fp->next) + { + if (wildMatch(table->name, fp->fileName)) + { + gotMatch = TRUE; + break; + } + } + if (gotMatch) + break; + } + if (!gotMatch) + return FALSE; + } +struct rqlParse *whereClause = rql->whereClause; +if (whereClause == NULL) + return TRUE; +else + { + struct rqlEval res = rqlEvalOnRecord(whereClause, ra, lookupField, lm); + res = rqlEvalCoerceToBoolean(res); + return res.val.b; + } +} + +void rqlStatementOutput(struct rqlStatement *rql, struct raRecord *ra, + char *addFileField, char *addDbField, FILE *out) +/* Output fields from ra to file. If addFileField is non-null add a new + * field with this name at end of output. */ +{ +if (addDbField) + fprintf(out, "%s %s\n", addDbField, ra->db); +struct slName *fieldList = rql->fieldList, *field; +for (field = fieldList; field != NULL; field = field->next) + { + struct raField *r; + boolean doWild = anyWild(field->name); + for (r = ra->fieldList; r != NULL; r = r->next) + { + boolean match; + if (doWild) + match = wildMatch(field->name, r->name); + else + match = (strcmp(field->name, r->name) == 0); + if (match) + fprintf(out, "%s %s\n", r->name, r->val); + } + } +if (addFileField != NULL) + { + fprintf(out, "%s", addFileField); + struct raFilePos *fp; + for (fp = ra->posList; fp != NULL; fp = fp->next) + { + fprintf(out, " %s %d", fp->fileName, fp->lineIx); + } + fprintf(out, "\n"); + } +fprintf(out, "\n"); +} + +struct hash *hashAllWordsInFile(char *fileName) +/* Make a hash of all space or line delimited words in file. */ +{ +struct hash *hash = hashNew(0); +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *line, *word; +while (lineFileNext(lf, &line, NULL)) + { + while ((word = nextWord(&line)) != NULL) + hashAdd(hash, word, NULL); + } +lineFileClose(&lf); +return hash; +} + + +static struct raRecord *findParent(struct raRecord *rec, + char *parentFieldName, struct hash *hash) +/* Find parent field if possible. */ +{ +struct raField *parentField = raRecordField(rec, parentFieldName); +if (parentField == NULL) + return NULL; +char *parentLine = parentField->val; +int len = strlen(parentLine); +char buf[len+1]; +strcpy(buf, parentLine); +char *parentName = firstWordInLine(buf); +struct raRecord *parent = hashFindVal(hash, parentName); +if (parent == NULL) + warn("%s is a subTrack of %s, but %s doesn't exist", rec->key, + parentField->val, parentField->val); +return parent; +} + +static void inheritFromParents(struct raRecord *list, char *parentField, char *noInheritField, + struct lm *lm) +/* Go through list. If an element has a parent field, then fill in non-existent fields from + * parent. */ +{ +/* Build up hash of records indexed by key field. */ +struct hash *hash = hashNew(0); +struct raRecord *rec; +for (rec = list; rec != NULL; rec = rec->next) + { + if (rec->key != NULL) + hashAdd(hash, rec->key, rec); + } + +/* Scan through linking up parents. */ +for (rec = list; rec != NULL; rec = rec->next) + { + struct raRecord *parent = findParent(rec, parentField, hash); + if (parent != NULL) + { + rec->parent = parent; + rec->olderSibling = parent->children; + parent->children = rec; + } + } + +/* Scan through doing inheritance. */ +for (rec = list; rec != NULL; rec = rec->next) + { + /* First inherit from view. */ + char *viewName = rec->view; + if (viewName != NULL) + { + struct slPair *view; + if (rec->parent == NULL) + { + verbose(2, "%s has view %s but no parent\n", rec->key, viewName); + continue; + } + for (view = rec->parent->settingsByView; view != NULL; view = view->next) + { + if (sameString(view->name, viewName)) + break; + else + { + if (rec->parent->viewHash != NULL) + { + char *alias = hashFindVal(rec->parent->viewHash, viewName); + if (alias != NULL) + if (sameString(view->name, alias)) + break; + } + } + } + if (view != NULL) + { + struct slPair *setting; + for (setting = view->val; setting != NULL; setting = setting->next) + { + struct raField *oldField = raRecordField(rec, setting->name); + if (oldField == NULL) + { + struct raField *newField; + lmAllocVar(lm, newField); + newField->name = lmCloneString(lm, setting->name); + newField->val = lmCloneString(lm, setting->val); + slAddTail(&rec->fieldList, newField); + } + } + } + else + { + verbose(3, "view %s not in parent settingsByView of %s\n", viewName, rec->key); + } + } + + /* Then inherit from parents. */ + struct raRecord *parent; + for (parent = rec->parent; parent != NULL; parent = parent->parent) + { + mergeParentRecord(rec, parent, lm); + } + } +} + +int raSqlQuery(struct rqlStatement *rql, int inCount, char *inNames[], + char *db, char *parentField, struct lm *lm, FILE *out) +/* raSqlQuery - Do a SQL-like query on a RA file.. */ +{ +struct raRecord *raList = readRaRecords(inCount, inNames, clKey, + clMerge, db, clAddDb, clOverrideNeeded, lm); +if (parentField != NULL) + { + inheritFromParents(raList, parentField, clNoInheritField, lm); + } +if (clRestrict) + { + struct hash *restrictHash = hashAllWordsInFile(clRestrict); + restrictHash = hashAllWordsInFile(clRestrict); + struct raRecord *newList = NULL, *next, *rec; + for (rec = raList; rec != NULL; rec = next) + { + next = rec->next; + if (rec->key && hashLookup(restrictHash, rec->key)) + { + slAddHead(&newList, rec); + } + } + slReverse(&newList); + raList = newList; + hashFree(&restrictHash); + } +verbose(2, "Got %d records in raFiles\n", slCount(raList)); +if (verboseLevel() > 1) + rqlStatementDump(rql, stderr); +struct raRecord *ra; +int matchCount = 0; +boolean doSelect = sameString(rql->command, "select"); +for (ra = raList; ra != NULL; ra = ra->next) + { + if (rqlStatementMatch(rql, ra, lm)) + { + if (!clStrict || (ra->key && hTableOrSplitExists(db, ra->key))) + { + matchCount += 1; + if (doSelect) + { + rqlStatementOutput(rql, ra, (clAddFile ? "file" : NULL), + (clAddDb ? "db" : NULL), out); + } + } + } + } +return matchCount; +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +clMerge = optionExists("merge"); +clParent = optionExists("parent"); +clParentField = optionVal("parentField", clParentField); +clKey = optionVal("key", clKey); +clQueryFile = optionVal("queryFile", NULL); +clQuery = optionVal("query", NULL); +clNoInheritField = optionVal("noInheritField", clNoInheritField); +clAddFile = optionExists("addFile"); +clAddDb = optionExists("addDb"); +clRestrict = optionVal("restrict", NULL); +clStrict = optionExists("strict"); +clOverrideNeeded = optionExists("overrideNeeded"); +clDb = optionVal("db", NULL); +if (argc < 2 && !clDb) + usage(); +if (clQueryFile == NULL && clQuery == NULL) + errAbort("Please specify either the query or queryFile option."); +if (clQueryFile != NULL && clQuery != NULL) + errAbort("Please specify just one of the query or queryFile options."); +struct lm *lm = lmInit(0); +if (clStrict && clDb == NULL) + errAbort("Only can use -strict with -db."); + +if (clDb != NULL) + { + clMerge = TRUE; + clParent = TRUE; + clOverrideNeeded = TRUE; + clKey = "track"; + if (sameString(clDb, "all")) + clAddDb = TRUE; + } +/* Parse query */ +struct lineFile *query; +if (clQuery) + query = lineFileOnString("query", TRUE, cloneString(clQuery)); +else + query = lineFileOpen(clQueryFile, TRUE); +struct rqlStatement *rql = rqlStatementParse(query); + +char *parentField = (clParent ? clParentField : NULL); +char **fileNames; +int fileCount; +int matchCount = 0; +if (clDb) + { + if (argc != 1) + errAbort("You can't specify any input files with the -db option."); + struct dbPath *db, *dbList = getDbPathList(clTrackDbRootDir); + boolean gotAny = FALSE; + for (db = dbList; db != NULL; db = db->next) + { + if (sameString(clDb, "all") || sameString(clDb, db->db)) + { + struct slName *path, *pathList = dbPathToFiles(db); + fileCount = slCount(pathList); + if (fileCount == 0) + errAbort("No paths returned by dbToTrackDbFiles(%s)", clDb); + AllocArray(fileNames, fileCount); + int i; + for (i=0, path = pathList; path != NULL; path = path->next, ++i) + { + fileNames[i] = path->name; + } + matchCount += raSqlQuery(rql, fileCount, fileNames, db->db, parentField, lm, stdout); + gotAny = TRUE; + } + } + if (!gotAny) + errAbort("No database named %s found off %s\n", clDb, clTrackDbRootDir); + } +else + { + fileNames = argv+1; + fileCount = argc-1; + matchCount += raSqlQuery(rql, fileCount, fileNames, "n/a", parentField, lm, stdout); + } +if (sameString(rql->command, "count")) + printf("%d\n", matchCount); +return 0; +}