42616d3f95e6b0ca9b8ab64fd02e4ebcd154ad4c kent Tue Sep 3 15:45:48 2019 -0700 Improving usage message and commenting. Took out the lies about strex, which actually isn't used here after all, just in the previous step. diff --git src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c index 666da90..94ee53b 100644 --- src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c +++ src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c @@ -1,38 +1,34 @@ -/* hcatTabUpdate - take the tabToTabDir result of the geo/sra import - * and unpack a few fields that are just too hard in strex. - * Put results in an output dir. */ +/* hcatTabUpdate - take the tabToTabDir result of the geo/sra import. + * Put results in an output dir in a format sqlUpdateRelated understands. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "fieldedTable.h" #include "portable.h" #include "csv.h" void usage() /* Explain usage and exit. */ { errAbort( - "hcatTabUpdate - Update the hcat database given a tab seperated input and output dir.\n" "hcatTabUpdate - take the tabToTabDir result of the geo/sra import\n" - "and unpack a few fields that are just too hard in strex. \n" - "Put results in an output dir.\n" + "and turn it into food for sqlUpdateRelated, which is what actually\n" + "updates the database.\n" "usage:\n" " hcatTabUpdate inDir outDir\n" - "options:\n" - " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; char *fieldedTableLookupNamedFieldInRow(struct fieldedTable *table, char *field, char **row) /* Look up field by name, not as effient as by index but what the heck. */ { int ix = fieldedTableFindFieldIx(table, field); if (ix < 0) return NULL; return row[ix]; @@ -97,60 +93,60 @@ for (el = list; el != NULL; el = el->next) csvEscapeAndAppend(dy, el->name); return dyStringCannibalize(&dy); } struct fieldedTable *makeContributors(struct fieldedTable *inProject) /* Make a fielded table from project contact info and contributors list */ { char **projectRow = inProject->rowList->row; struct dyString *csvScratch = dyStringNew(0); /* Make the contributors list in two pieces first off of the contact */ int contact_email = fieldedTableFindFieldIx(inProject, "contact_email"); int contact_phone = fieldedTableFindFieldIx(inProject, "contact_phone"); -//int contact_department = fieldedTableFindFieldIx(inProject, "contact_department"); -//int contact_institute = fieldedTableFindFieldIx(inProject, "contact_institute"); -//int contact_address = fieldedTableFindFieldIx(inProject, "contact_address"); -//int contact_city = fieldedTableFindFieldIx(inProject, "contact_city"); -//int contact_country = fieldedTableFindFieldIx(inProject, "contact_country"); -//int contact_zip_postal_code = fieldedTableFindFieldIx(inProject, "contact_zip_postal_code"); +int contact_department = fieldedTableFindFieldIx(inProject, "contact_department"); +int contact_institute = fieldedTableFindFieldIx(inProject, "contact_institute"); +int contact_address = fieldedTableFindFieldIx(inProject, "contact_address"); +int contact_city = fieldedTableFindFieldIx(inProject, "contact_city"); +int contact_country = fieldedTableFindFieldIx(inProject, "contact_country"); +int contact_zip_postal_code = fieldedTableFindFieldIx(inProject, "contact_zip_postal_code"); /* Figure out which contact data we actually have */ const int maxContacts = 32; char *contactFields[maxContacts+1]; // An extra for the project_role int contactIx[maxContacts]; char **oldFields = inProject->fields; // Add contact name field separately from the rest. We know it's there since it's a // required field, and also we need to decorate it's name contactFields[0] = "?name"; contactIx[0] = fieldedTableMustFindFieldIx(inProject, "contact_name"); int realFieldCount = 1; // The rest of the contact pieces are added just conditionally addIfReal(contact_email, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); addIfReal(contact_phone, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); -//addIfReal(contact_department, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); -//#addIfReal(contact_institute, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); -//#addIfReal(contact_address, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); -//addIfReal(contact_city, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); -//addIfReal(contact_country, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); -//addIfReal(contact_zip_postal_code, oldFields, -// contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_department, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_institute, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_address, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_city, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_country, oldFields, contactFields, contactIx, maxContacts, &realFieldCount); +addIfReal(contact_zip_postal_code, oldFields, + contactFields, contactIx, maxContacts, &realFieldCount); contactFields[realFieldCount] = "project_role"; realFieldCount += 1; /* Make contributor output table. The first row of it will be seeded with the contact. * We can fill out names, but not other info on the other contributors, who will make * up the rest of the rows. */ struct fieldedTable *contributors = fieldedTableNew("contributor", contactFields, realFieldCount); contributors->startsSharp = inProject->startsSharp; /* Make up first row from contacts */ char *outVals[realFieldCount]; int outIx; struct dyString *scratch = dyStringNew(0); for (outIx=0; outIx<realFieldCount-1; ++outIx) @@ -203,56 +199,108 @@ /* Convert a comma separated list of taxons in inVal to a * comma separated species list, using scratch. */ { struct dyString *result = dyStringNew(64); char *taxon; char *s = inVal; while ((taxon = csvParseNext(&s, scratch)) != NULL) { char *species = lookupSpecies(taxon); csvEscapeAndAppend(result, species); } return dyStringCannibalize(&result); } void addListFieldIfNonempty(char *field, struct slName *list, - char *newFields[], char *newVals[], int maxNewCount,int *pCurCount) + char **newFields, char **newVals, int maxNewCount,int *pCurCount) /* Add field to newFields if list is non-empty, taking care not to go past end. */ { if (list != NULL) { int curCount = *pCurCount; if (curCount >= maxNewCount) errAbort("Too many fields in addListFieldIfNonempty on %s, %d max", field, curCount); char fieldName[256]; char *strippedField = cloneString(field); stripChar(strippedField, '_'); safef(fieldName, sizeof(fieldName), "@@%s@id@hcat_project_%s@project_id@%s_id@hcat_%s@short_name@id", - field, field, field, strippedField); + field, field, strippedField, strippedField); newFields[curCount] = cloneString(fieldName); newVals[curCount] = slNameToCsv(list); *pCurCount = curCount+1; freez(&strippedField); } } +struct fieldedTable *makeVocabTable(struct fieldedTable *inProject, struct slName *valList, + char *projectFieldName, char *tableName) +/* Make a generic short_name/description table. */ +{ +/* Make up our little generic table */ +char *outFields[] = {"?short_name", "description",}; +struct fieldedTable *vocabTable = fieldedTableNew(tableName, outFields, ArraySize(outFields)); + +/* Fetch the input value, which is a comma separated list. Then go through + * the list and make a table entry for each item. */ +struct dyString *csvScratch = dyStringNew(0); +struct slName *val; +for (val = valList; val != NULL; val = val->next) + { + char *outRow[2] = {val->name, "NEEDS DESCRIPTION"}; + fieldedTableAdd(vocabTable, outRow, ArraySize(outRow), 0); + } + +/* Clean up and go home. */ +dyStringFree(&csvScratch); +return vocabTable; +} + +void projectVocabField(struct fieldedTable *inProject, struct fieldedTable *inSample, + char *underbarred, char *outDir, char **outFields, + char **outRow, int outFieldMax, int *pOutFieldCount) +/* Cope with a list field in project we get from scanning samples. Kind of a + * technical routine, with 7 parameters, but since it gets called like 5 times + * best to encapsulate it. */ +{ +/* Get list of the different values from the inSample table */ +struct slName *valList = uniqVals(inSample, underbarred); + +/* Django has this weird thing about underbars - sometimes it strips them and we have to + * match it's behavior */ +char strippedName[strlen(underbarred) + 1]; +strcpy(strippedName, underbarred); +stripChar(strippedName, '_'); + +/* Ok, try and make the table. If we succeed write it out. */ +struct fieldedTable *outTable = makeVocabTable(inProject,valList,underbarred,strippedName); +if (outTable != NULL) + { + addListFieldIfNonempty(underbarred, valList, outFields, outRow, outFieldMax, pOutFieldCount); + char outPath[PATH_LEN]; + safef(outPath, sizeof(outPath), "%s/hcat_%s.tsv", outDir, strippedName); + fieldedTableToTabFile(outTable, outPath); + } +} -struct fieldedTable *makeProject(struct fieldedTable *inProject, struct fieldedTable *inSample) +struct fieldedTable *makeProject(struct fieldedTable *inProject, + struct fieldedTable *inSample, char *outDir) /* Make output project table. This is the big one - 35 fields now * probably twice that by the time HCA is done. Fortunately we only need - * to deal with some of the fields and it only has one row. */ + * to deal with some of the fields and it only has one row. + * In a move to save code this will also as a side effect write out + * any vocab tables implicitly mentioned.*/ { char **inFields = inProject->fields; char **inRow = inProject->rowList->row; int inFieldCount = inProject->fieldCount; struct dyString *scratch = dyStringNew(0); int outFieldMax = inFieldCount + 16; // Mostly we remove fields but we do add a few. Gets checked char *outFields[outFieldMax]; char *outRow[outFieldMax]; int outFieldCount = 0; /* First we make up the basics of the outProject table. Mostly this is just * passing through from the inProject, but there's exceptions like contacts. */ int inIx; for (inIx=0; inIx<inFieldCount; ++inIx) @@ -274,169 +322,189 @@ if (sameString("state_reached", inName) || sameString("cur_state", inName)) { safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_projectstate@state@id", inName); inName = cloneString(nameBuf); } else if (sameString("consent", inName)) { safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_%s@short_name@id", inName, inName); inName = cloneString(nameBuf); } else if (sameString("effort", inName)) { safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_efforttype@short_name@id", inName); inName = cloneString(nameBuf); } - else if (sameString("effort", inName)) + else if (sameString("lab", inName)) + { + safef(nameBuf, sizeof(nameBuf), "%s", + "@@lab@id@hcat_project_labs@project_id@lab_id@hcat_lab@short_name@id"); + inName = cloneString(nameBuf); + } + else if (sameString("publications", inName)) + { + safef(nameBuf, sizeof(nameBuf), "%s", + "@@publications@id@hcat_project_publications@project_id@publication_id@hcat_publication@short_name@id"); + inName = cloneString(nameBuf); + } /* Output all the ones we haven't dealt with already or will deal with later */ if (!startsWith("contact_", inName) && !sameString("contributors", inName)) { outFields[outFieldCount] = inName; outRow[outFieldCount] = inVal; ++outFieldCount; } } /* Add in contributors as a multi to multi field */ -outFields[outFieldCount] = "@@contributors@hcat_project_contributors@id@project_id@contributor_id@hcat_contributor@name@id"; +outFields[outFieldCount] = "@@contributors@id@hcat_project_contributors@project_id@contributor_id@hcat_contributor@name@id"; outRow[outFieldCount] = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow); +outFieldCount += 1; /* Add the fields we scan and merge from sample at end */ -struct slName *organList = uniqVals(inSample, "organ"); -struct slName *organPartList = uniqVals(inSample, "organ_part"); -struct slName *assayTypeList = uniqVals(inSample, "assay_type"); -struct slName *diseaseList = uniqVals(inSample, "disease"); -addListFieldIfNonempty("organ", organList, outFields, outRow, outFieldMax, &outFieldCount); -addListFieldIfNonempty("organ_part", organPartList, outFields, outRow, outFieldMax, &outFieldCount); -addListFieldIfNonempty("assay_type", assayTypeList, outFields, outRow, outFieldMax, &outFieldCount); -addListFieldIfNonempty("disease", diseaseList, outFields, outRow, outFieldMax, &outFieldCount); - +projectVocabField(inProject, inSample, "organ", outDir, + outFields, outRow, outFieldMax, &outFieldCount); +projectVocabField(inProject, inSample, "organ_part", outDir, + outFields, outRow, outFieldMax, &outFieldCount); +projectVocabField(inProject, inSample, "assay_type", outDir, + outFields, outRow, outFieldMax, &outFieldCount); +projectVocabField(inProject, inSample, "disease", outDir, + outFields, outRow, outFieldMax, &outFieldCount); + +uglyf("making project table with %d fields\n", outFieldCount); struct fieldedTable *outTable = fieldedTableNew("project", outFields, outFieldCount); outTable->startsSharp = inProject->startsSharp; -fieldedTableAdd(outTable, outRow, outFieldCount, 1); +fieldedTableAdd(outTable, outRow, outFieldCount, 2); dyStringFree(&scratch); return outTable; } + +struct fieldedTable *makePublication(struct fieldedTable *inProject) +/* If there's a publication field we make a publication table and seed it with the pmid + * and stuff. */ +{ +int pubIx = fieldedTableFindFieldIx(inProject, "publications"); +if (pubIx >= 0) + { + struct dyString *newPubNames = dyStringNew(0); + char **inRow = inProject->rowList->row; + char *nameCsv = inRow[pubIx]; + struct dyString *csvScratch = dyStringNew(0); + char *pmid; + char *outFields[2] = {"?short_name", "pmid"}; + struct fieldedTable *pubTable = fieldedTableNew("publications", + outFields, ArraySize(outFields)); + while ((pmid = csvParseNext(&nameCsv, csvScratch)) != NULL) + { + char name[64]; + safef(name, sizeof(name), "pmid: %s", pmid); + char *outRow[2] = {name, pmid}; + fieldedTableAdd(pubTable, outRow, ArraySize(outRow), 0); + csvEscapeAndAppend(newPubNames, name); + } + inRow[pubIx] = dyStringCannibalize(&newPubNames); // Other people need to use new value too + dyStringFree(&csvScratch); + return pubTable; + } +else + return NULL; +} + struct fieldedTable *makeLab(struct fieldedTable *inProject) /* If there's a lab field we make a lab table and seed it with the contacts. */ { int labIx = fieldedTableFindFieldIx(inProject, "lab"); if (labIx >= 0) { char **inRow = inProject->rowList->row; char *short_name = inRow[labIx]; char *contact = fieldedTableLookupNamedFieldInRow(inProject, "contact_name", inRow); char *contributors = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow); - char *institute = emptyForNull(fieldedTableLookupNamedFieldInRow(inProject, "contact_institute", inRow)); + char *institute = fieldedTableLookupNamedFieldInRow(inProject, "contact_institute", inRow); char labName[256]; if (strlen(short_name) < 20) // Unlikely to be unique, may cause trouble - safef(labName, sizeof(labName), "%s %s", short_name, institute); + safef(labName, sizeof(labName), "%s %s", short_name, emptyForNull(institute)); else safef(labName, sizeof(labName), "%s", short_name); labName[50] = 0; // not too long + inRow[labIx] = cloneString(labName); /* Other people need to know about this too. */ char *outFields[4] = {"?short_name", "institution", "@contact_id@hcat_contributor@name@id", "@@contributors@id@hcat_lab_contributors@lab_id@contributor_id@hcat_contributor@name@id"}; struct fieldedTable *labTable = fieldedTableNew("lab", outFields, ArraySize(outFields)); char *outRow[4] = {labName, institute, contact, contributors}; fieldedTableAdd(labTable, outRow, ArraySize(outRow), 1); return labTable; } else return NULL; } -struct fieldedTable *makeOrgan(struct fieldedTable *inProject) -/* If there's and organ field we make an organ table. */ -{ -uglyf("Trying to make organs for %s\n", inProject->name); - -/* See if it's even in the inProject table, and return quickly with a NULL if not. */ -int organIx = fieldedTableFindFieldIx(inProject, "organ"); -uglyf("organsIx %d\n", organIx); -if (organIx < 0) - return NULL; - -/* Make up our little generic table */ -char *outFields[] = {"?short_name", "description",}; -struct fieldedTable *organTable = fieldedTableNew("organ", outFields, ArraySize(outFields)); - -/* Fetch the input value, which is a comma separated list. Then go through - * the list and make a table entry for each item. */ -char **inRow = inProject->rowList->row; -char *organTsv = inRow[organIx]; -struct dyString *csvScratch = dyStringNew(0); -char *organName; -while ((organName = csvParseNext(&organTsv, csvScratch)) != NULL) - { - char *outRow[2] = {organName, "NEEDS DESCRIPTION"}; - fieldedTableAdd(organTable, outRow, ArraySize(outRow), 0); - } - -/* Clean up and go home. */ -dyStringFree(&csvScratch); -return organTable; -} - void hcatTabUpdate(char *inDir, char *outDir) -/* hcatTabUpdate - Update the hcat database given a tab seperated input and output dir. */ +/* hcatTabUpdate - take the tabToTabDir result of the geo/sra import. + * Put results in an output dir in a format sqlUpdateRelated understands. */ { // We are actually just looking for specific files in inDir. */ /* Load up input projects table */ char *projectFile = "hcat_project.tsv"; char inPath[PATH_LEN]; safef(inPath, sizeof(inPath), "%s/%s", inDir, projectFile); char *projectRequired[] = {"short_name", "contact_name", "contributors"}; struct fieldedTable *inProject = fieldedTableFromTabFile(inPath, inPath, projectRequired, ArraySize(projectRequired)); /* Load up samples table */ char *sampleFile = "hcat_sample.tsv"; safef(inPath, sizeof(inPath), "%s/%s", inDir, sampleFile); char *sampleRequired[] = {"short_name",}; struct fieldedTable *inSample = fieldedTableFromTabFile(inPath, inPath, sampleRequired, ArraySize(sampleRequired)); /* Make sure inProject table makes sense by having exactly one row */ if (inProject->rowCount != 1) errAbort("Expected one row in %s, got %d\n", projectFile, inProject->rowCount); -struct fieldedTable *outContributor = makeContributors(inProject); -struct fieldedTable *outProject = makeProject(inProject, inSample); -struct fieldedTable *outLab = makeLab(inProject); -struct fieldedTable *outOrgan = makeOrgan(inProject); - /* Write output from lowest level to highest level tables. */ makeDirsOnPath(outDir); + + +/* Contributors table - it's always there */ +struct fieldedTable *outContributor = makeContributors(inProject); char outPath[PATH_LEN]; safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "contributor.tsv"); fieldedTableToTabFile(outContributor, outPath); -if (outOrgan != NULL) - { - safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "organ.tsv"); - fieldedTableToTabFile(outOrgan, outPath); - } - +/* Make lab table if there is a lab field */ +struct fieldedTable *outLab = makeLab(inProject); if (outLab != NULL) { safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "lab.tsv"); fieldedTableToTabFile(outLab, outPath); } +/* Make pubs table if there are pubs fields */ +struct fieldedTable *outPub = makePublication(inProject); +if (outPub != NULL) + { + safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "publication.tsv"); + fieldedTableToTabFile(outPub, outPath); + } + + +struct fieldedTable *outProject = makeProject(inProject, inSample, outDir); safef(outPath, sizeof(outPath), "%s/hcat_%s", outDir, "project.tsv"); fieldedTableToTabFile(outProject, outPath); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); hcatTabUpdate(argv[1], argv[2]); return 0; }