c1702f4d7e442e45db8d6f40d581f70b975d1340 kent Fri May 26 10:59:08 2017 -0700 Creating a new directory for tab-separated-file utilities and seeding it with tabQuery formerly in the utils directory. diff --git src/tabFile/tabQuery/tabQuery.c src/tabFile/tabQuery/tabQuery.c new file mode 100644 index 0000000..4aed2d0 --- /dev/null +++ src/tabFile/tabQuery/tabQuery.c @@ -0,0 +1,157 @@ +/* tabQuery - Run sql-like query on a tab separated file.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "localmem.h" +#include "dystring.h" +#include "fieldedTable.h" +#include "rql.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "tabQuery - Run sql-like query on a tab separated file.\n" + "usage:\n" + " tabQuery rqlStatement\n" + "where rqlStatement is much like a SQL statement, but with no joins and no commands\n" + "other than select allowed. The input file name is taken from the 'from' clause.\n" + "examples\n" + " tabQuery select file,date from manifest.tsv\n" + "This will output the file and date fields from the manifest.tsv file\n" + " tabQuery select file,date,lab from manifest.tsv where lab like 'myLab%%'\n" + "This will output the selected three fields from the file where the lab starts with myLab\n" + " tabQuery select file,data from manifest.tsv where lab='myLab'\n" + "This will output the selected two fields where the lab field is exactly myLab.\n" + " tabQuery select * from manifest.tsv where lab='myLab'\n" + "This will output all fields where the lab field is exactly myLab.\n" + " tabQuery select a*,b* from manifest.tsv where lab='myLab'\n" + "This will output all fields starting with a or b where the lab field is exactly myLab.\n" + " tabQuery select count(*) from manifest.tsv where type='fastq' and size < 1000\n" + "This will count the number of records where type is fastq and size less than 1000\n" + ); +} + +struct fieldedTable *gTable; +struct hash *gFieldHash; + +char *lookup(void *record, char *key) +/* Lookup key in record */ +{ +struct fieldedRow *row = record; +int fieldIx = hashIntValDefault(gFieldHash, key, -1); +if (fieldIx < 0) + errAbort("Field %s isn't found in %s", key, gTable->name); +return row->row[fieldIx]; +} + +void tabQuery(char *query) +/* tabQuery - Run sql-like query on a tab separated file.. */ +{ +/* Parse statement and make sure that it just references one table */ +struct rqlStatement *rql = rqlStatementParseString(query); +int tableCount = slCount(rql->tableList); +if (tableCount != 1) + errAbort("One and only one file allowed in the from clause\n"); + +boolean doCount = FALSE; +if (sameWord(rql->command, "count")) + doCount = TRUE; +else if (sameWord(rql->command, "select")) + doCount = FALSE; +else + errAbort("Unrecognized rql command %s", rql->command); + +/* Read in tab separated value file */ +char *tabFile = rql->tableList->name; +gTable = fieldedTableFromTabFile(tabFile, tabFile, NULL, 0); + +/* Make an integer valued hash of field indexes */ +gFieldHash = hashNew(0); +int i; +for (i=0; i<gTable->fieldCount; ++i) + hashAddInt(gFieldHash, gTable->fields[i], i); + +/* Make sure all fields in query exist */ +struct slName *field; +for (field = rql->fieldList; field != NULL; field = field->next) + if (!hashLookup(gFieldHash, field->name)) + { + if (!anyWild(field->name)) + errAbort("field %s doesn't exist in %s", field->name, tabFile); + } + +/* Make list of fields as opposed to array */ +struct slName *allFieldList = NULL; +for (i=0; i<gTable->fieldCount; ++i) + slNameAddHead(&allFieldList, gTable->fields[i]); +slReverse(&allFieldList); + +/* Expand any field names with wildcards. */ +rql->fieldList = wildExpandList(allFieldList, rql->fieldList, TRUE); + + +/* Print out label row. */ +if (!doCount) + { + printf("#"); + char *sep = ""; + for (field = rql->fieldList; field != NULL; field = field->next) + { + printf("%s%s", sep, field->name); + sep = "\t"; + } + printf("\n"); + } + +/* Print out or just count selected fields that match query */ +int matchCount = 0; +struct lm *lm = lmInit(0); +struct fieldedRow *row; +for (row = gTable->rowList; row != NULL; row = row->next) + { + boolean pass = TRUE; + if (rql->whereClause != NULL) + { + struct rqlEval res = rqlEvalOnRecord(rql->whereClause, row, lookup, lm); + res = rqlEvalCoerceToBoolean(res); + pass = res.val.b; + } + if (pass) + { + if (doCount) + ++matchCount; + else + { + char *sep = ""; + for (field = rql->fieldList; field != NULL; field = field->next) + { + int fieldIx = hashIntVal(gFieldHash, field->name); + printf("%s%s", sep, row->row[fieldIx]); + sep = "\t"; + } + printf("\n"); + } + } + } + +if (doCount) + printf("%d\n", matchCount); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +if (argc < 2) + usage(); +struct dyString *query = dyStringNew(0); +int i; +for (i=1; i<argc; ++i) + { + if (i != 1) + dyStringAppendC(query, ' '); + dyStringAppend(query, argv[i]); + } +tabQuery(query->string); +return 0; +}