3cbffa5fa5e6432fa9a0aed338a7c6aa4ac80875 kent Sun Sep 1 11:41:08 2019 -0700 Starting to work on a utility to help load things from GEO into HCAT diff --git src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c new file mode 100644 index 0000000..ba0270a --- /dev/null +++ src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c @@ -0,0 +1,298 @@ +/* hcatTabUpdate - take the tabToTabDir result of the geo/sra import + * and unpack a few fields that are just too hard in strex. + * Put results in an output dir. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "fieldedTable.h" +#include "portable.h" +#include "csv.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "hcatTabUpdate - Update the hcat database given a tab seperated input and output dir.\n" + "hcatTabUpdate - take the tabToTabDir result of the geo/sra import\n" + "and unpack a few fields that are just too hard in strex. \n" + "Put results in an output dir.\n" + "usage:\n" + " hcatTabUpdate inDir outDir\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +void addIfReal(int fieldIx, char *oldFieldNames[], + char *newFields[], int newIxs[], int maxNewCount,int *pCurCount) +/* If fieldIx is positive we consider it real anc add it to the newFields */ +{ +if (fieldIx > 0) + { + char *oldName = cloneString(oldFieldNames[fieldIx]); + char *newName = oldName; + char *prefix = "contact_"; + if (startsWith(prefix, oldName)) + { + newName += strlen(prefix);; + } + int curCount = *pCurCount; + if (curCount >= maxNewCount) + errAbort("Too many fields in addIfRead on %s, %d max", oldName, curCount); + newIxs[curCount] = fieldIx; + newFields[curCount] = newName; + *pCurCount = curCount+1; + } +} + + +struct fieldedTable *makeContributors(struct fieldedTable *inProject) +/* Make a fielded table from project contact info and contributors list */ +{ +char **projectRow = inProject->rowList->row; + +/* Make the contributors list in two pieces first off of the contact */ +int contact_name = fieldedTableFindFieldIx(inProject, "contact_name"); +int contact_email = fieldedTableFindFieldIx(inProject, "contact_email"); +int contact_phone = fieldedTableFindFieldIx(inProject, "contact_phone"); +int contact_department = fieldedTableFindFieldIx(inProject, "contact_department"); +int contact_institute = fieldedTableFindFieldIx(inProject, "contact_institute"); +int contact_address = fieldedTableFindFieldIx(inProject, "contact_address"); +int contact_city = fieldedTableFindFieldIx(inProject, "contact_city"); +int contact_country = fieldedTableFindFieldIx(inProject, "contact_country"); +int contact_zip_postal_code = fieldedTableFindFieldIx(inProject, "contact_zip_postal_code"); + + +/* Figure out which contact data we actually have */ +const int maxContacts = 32; +char *contactFields[maxContacts+1]; // An extra for the project_role +int contactIx[maxContacts]; +int realFieldCount = 0; +char **oldFields = inProject->fields; +addIfReal(contact_name, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_email, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_phone, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_department, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_institute, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_address, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_city, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_country, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_zip_postal_code, oldFields, + contactFields, contactIx, maxContacts, &realFieldCount); +contactFields[realFieldCount] = "project_role"; +realFieldCount += 1; +uglyf("Got %d real contact fields\n", realFieldCount); + +/* Make contributor output table. The first row of it will be seeded with the contact. + * We can fill out names, but not other info on the other contributors, who will make + * up the rest of the rows. */ +struct fieldedTable *contributors = fieldedTableNew("contributors", contactFields, + realFieldCount); +contributors->startsSharp = inProject->startsSharp; + +/* Make up first row from contacts */ +char *outVals[realFieldCount]; +int outIx; +struct dyString *scratch = dyStringNew(0); +for (outIx=0; outIx<realFieldCount-1; ++outIx) + { + char *inTsv = projectRow[contactIx[outIx]]; + char *inVal = emptyForNull(cloneString(csvParseNext(&inTsv, scratch))); + outVals[outIx] = inVal; + } +outVals[outIx] = "lab contact"; +char *contactName = cloneString(outVals[0]); +fieldedTableAdd(contributors, outVals, realFieldCount, 1); + +/* Unroll the contributors field into further rows*/ +for (outIx=0; outIx<realFieldCount; ++outIx) + outVals[outIx] = ""; // Empty out all rows. +int inContribIx = fieldedTableMustFindFieldIx(inProject, "contributors"); +int outContribIx = fieldedTableMustFindFieldIx(contributors, "name"); +char *inTsv = projectRow[inContribIx]; +char *oneVal; +while ((oneVal = csvParseNext(&inTsv, scratch)) != NULL) + { + if (differentString(oneVal, contactName)) // We already got the contact as a contributor + { + outVals[outContribIx] = cloneString(oneVal); + outVals[realFieldCount-1] = "contributor"; + fieldedTableAdd(contributors, outVals, realFieldCount, contributors->rowCount+1); + } + } +return contributors; +} + +char *lookupSpecies(char *taxon) +/* Some day we may query a decent database, for now + * just have some of the most common */ +{ +if (sameString(taxon, "9606")) return "human"; +if (sameString(taxon, "10090")) return "mouse"; +if (sameString(taxon, "10116")) return "rat"; +if (sameString(taxon, "7955")) return "zebrafish"; +if (sameString(taxon, "7227")) return "fly"; +if (sameString(taxon, "6239")) return "worm"; +if (sameString(taxon, "4932")) return "yeast"; +errAbort("Unknown taxon %s", taxon); +return NULL; +} + +char *taxonsToSpecies(char *inVal, struct dyString *scratch) +/* Convert a comma separated list of taxons in inVal to a + * comma separated species list, using scratch. */ +{ +struct dyString *result = dyStringNew(64); +char *taxon; +char *s = inVal; +while ((taxon = csvParseNext(&s, scratch)) != NULL) + { + char *species = lookupSpecies(taxon); + csvEscapeAndAppend(result, species); + } +return dyStringCannibalize(&result); +} + +struct fieldedTable *makeProject(struct fieldedTable *inProject) +/* Make output project table. This is the big one - 35 fields now + * probably twice that by the time HCA is done. Fortunately we only need + * to deal with some of the fields and it only has one row. */ +{ +char **inFields = inProject->fields; +char **inRow = inProject->rowList->row; +int inFieldCount = inProject->fieldCount; +int outFieldMax = inFieldCount + 16; // Mostly we remove fields but we do add a few +char *outFields[outFieldMax]; +char *outRow[outFieldMax]; +int outFieldCount = 0; +struct dyString *scratch = dyStringNew(0); + +/* First we make up the basics of the outProject table. Mostly this is just + * passing through from the inProject, but there's exceptions like contacts. */ +int inIx; +for (inIx=0; inIx<inFieldCount; ++inIx) + { + /* Fetch input name and value */ + char *inName = inFields[inIx]; + char *inVal = inRow[inIx]; + + /* Go through list of input fields we tweak slightly */ + if (sameString("taxons", inName)) + { + inName = "species"; + inVal = taxonsToSpecies(inVal, scratch); + } + + /* Output all the ones we haven't dealt with already */ + if (!startsWith("contact_", inName) && !sameString("contributors", inName)) + { + outFields[outFieldCount] = inName; + outRow[outFieldCount] = inVal; + ++outFieldCount; + } + } +struct fieldedTable *outTable = fieldedTableNew("project", outFields, outFieldCount); +outTable->startsSharp = inProject->startsSharp; +fieldedTableAdd(outTable, outRow, outFieldCount, 1); +dyStringFree(&scratch); +return outTable; +} + +char *fieldedTableLookupNamedFieldInRow(struct fieldedTable *table, char *field, char **row) +/* Look up field by name, not as effient as by index but what the heck. */ +{ +int ix = fieldedTableFindFieldIx(table, field); +if (ix < 0) + return NULL; +return row[ix]; +} + +struct fieldedTable *makeLab(struct fieldedTable *inProject) +/* If there's a lab field we make a lab table and seed it with the contacts. */ +{ +int labIx = fieldedTableFindFieldIx(inProject, "lab"); +if (labIx >= 0) + { + char **inRow = inProject->rowList->row; + char *short_name = inRow[labIx]; + char *contact = fieldedTableLookupNamedFieldInRow(inProject, "contact_name", inRow); + char *institute = emptyForNull(fieldedTableLookupNamedFieldInRow(inProject, "contact_institute", inRow)); + char labName[256]; + if (strlen(short_name) < 20) // Unlikely to be unique, may cause trouble + safef(labName, sizeof(labName), "%s %s", short_name, institute); + else + safef(labName, sizeof(labName), "%s", short_name); + labName[50] = 0; // not too long + + char *outFields[3] = {"short_name", "institute", "contact"}; + struct fieldedTable *labTable = fieldedTableNew("lab", outFields, ArraySize(outFields)); + char *outRow[3] = {labName, institute, contact}; + fieldedTableAdd(labTable, outRow, ArraySize(outRow), 1); + return labTable; + } +else + return NULL; +} + +void hcatTabUpdate(char *inDir, char *outDir) +/* hcatTabUpdate - Update the hcat database given a tab seperated input and output dir. */ +{ +// We are actually just looking for specific files in inDir. */ + +/* Load up input projects table */ +char *projectFile = "hcat_project.tsv"; +char inPath[PATH_LEN]; +safef(inPath, sizeof(inPath), "%s/%s", inDir, projectFile); +char *projectRequired[] = {"short_name", "contact_name"}; +struct fieldedTable *inProject = fieldedTableFromTabFile(inPath, inPath, + projectRequired, ArraySize(projectRequired)); +uglyf("Got %d rows, %d columns from %s\n", + inProject->rowCount, inProject->fieldCount, inProject->name); + +/* Load up samples table */ +char *sampleFile = "hcat_sample.tsv"; +safef(inPath, sizeof(inPath), "%s/%s", inDir, sampleFile); +char *sampleRequired[] = {"short_name",}; +struct fieldedTable *inSample = fieldedTableFromTabFile(inPath, inPath, + sampleRequired, ArraySize(sampleRequired)); +uglyf("Got %d rows, %d columns from %s\n", + inSample->rowCount, inSample->fieldCount, inSample->name); + + +/* Make sure inProject table makes sense by having exactly one row */ +if (inProject->rowCount != 1) + errAbort("Expected one row in %s, got %d\n", projectFile, inProject->rowCount); + +struct fieldedTable *outContributor = makeContributors(inProject); +struct fieldedTable *outProject = makeProject(inProject); +struct fieldedTable *outLab = makeLab(inProject); + +/* Write output */ +makeDirsOnPath(outDir); +char outPath[PATH_LEN]; +safef(outPath, sizeof(outPath), "%s/%s", outDir, "contributors.tsv"); +fieldedTableToTabFile(outContributor, outPath); +safef(outPath, sizeof(outPath), "%s/%s", outDir, "project.tsv"); +fieldedTableToTabFile(outProject, outPath); +if (outLab != NULL) + { + safef(outPath, sizeof(outPath), "%s/%s", outDir, "lab.tsv"); + fieldedTableToTabFile(outLab, outPath); + } +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +hcatTabUpdate(argv[1], argv[2]); +return 0; +}