ca4919c8dda0799f1bc1b68de0741b6a38172381 kent Sun Sep 1 14:07:38 2019 -0700 Starting to process fields like organ and organ_part that come from sample diff --git src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c index ba0270a..7ae53e6 100644 --- src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c +++ src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c @@ -39,30 +39,69 @@ char *newName = oldName; char *prefix = "contact_"; if (startsWith(prefix, oldName)) { newName += strlen(prefix);; } int curCount = *pCurCount; if (curCount >= maxNewCount) errAbort("Too many fields in addIfRead on %s, %d max", oldName, curCount); newIxs[curCount] = fieldIx; newFields[curCount] = newName; *pCurCount = curCount+1; } } +struct slName *uniqVals(struct fieldedTable *table, char *field) +/* Return list of all unique values in the field */ +{ +struct dyString *scratch = dyStringNew(0); +int fieldIx = fieldedTableFindFieldIx(table, field); +if (fieldIx < 0) + return NULL; +struct hash *uniqHash = hashNew(0); +struct slName *valList = NULL; +struct fieldedRow *fr; +for (fr = table->rowList; fr != NULL; fr = fr->next) + { + char *inTsv = fr->row[fieldIx]; + char *val; + while ((val = csvParseNext(&inTsv, scratch)) != NULL) + { + if (hashLookup(uniqHash, val) == NULL) + { + hashAdd(uniqHash, val, NULL); + slNameAddHead(&valList, val); + } + } + } +slReverse(&valList); +hashFree(&uniqHash); +dyStringFree(&scratch); +return valList; +} + +char *slNameToCsv(struct slName *list) +/* Convert slNames to a long string */ +{ +struct dyString *dy = dyStringNew(0); +struct slName *el; +for (el = list; el != NULL; el = el->next) + csvEscapeAndAppend(dy, el->name); +return dyStringCannibalize(&dy); +} + struct fieldedTable *makeContributors(struct fieldedTable *inProject) /* Make a fielded table from project contact info and contributors list */ { char **projectRow = inProject->rowList->row; /* Make the contributors list in two pieces first off of the contact */ int contact_name = 
fieldedTableFindFieldIx(inProject, "contact_name"); int contact_email = fieldedTableFindFieldIx(inProject, "contact_email"); int contact_phone = fieldedTableFindFieldIx(inProject, "contact_phone"); int contact_department = fieldedTableFindFieldIx(inProject, "contact_department"); int contact_institute = fieldedTableFindFieldIx(inProject, "contact_institute"); int contact_address = fieldedTableFindFieldIx(inProject, "contact_address"); int contact_city = fieldedTableFindFieldIx(inProject, "contact_city"); int contact_country = fieldedTableFindFieldIx(inProject, "contact_country"); @@ -147,68 +186,96 @@ char *taxonsToSpecies(char *inVal, struct dyString *scratch) /* Convert a comma separated list of taxons in inVal to a * comma separated species list, using scratch. */ { struct dyString *result = dyStringNew(64); char *taxon; char *s = inVal; while ((taxon = csvParseNext(&s, scratch)) != NULL) { char *species = lookupSpecies(taxon); csvEscapeAndAppend(result, species); } return dyStringCannibalize(&result); } -struct fieldedTable *makeProject(struct fieldedTable *inProject) +void addListFieldIfNonempty(char *field, struct slName *list, + char *newFields[], char *newVals[], int maxNewCount,int *pCurCount) +/* Add field to newFields if list is non-empty, taking care not to go past end. */ +{ +if (list != NULL) + { + int curCount = *pCurCount; + if (curCount >= maxNewCount) + errAbort("Too many fields in addListFieldIfNonempty on %s, %d max", field, curCount); + newFields[curCount] = field; + newVals[curCount] = slNameToCsv(list); + *pCurCount = curCount+1; + } +} + + +struct fieldedTable *makeProject(struct fieldedTable *inProject, struct fieldedTable *inSample) /* Make output project table. This is the big one - 35 fields now * probably twice that by the time HCA is done. Fortunately we only need * to deal with some of the fields and it only has one row. 
*/ { char **inFields = inProject->fields; char **inRow = inProject->rowList->row; int inFieldCount = inProject->fieldCount; -int outFieldMax = inFieldCount + 16; // Mostly we remove fields but we do add a few +struct dyString *scratch = dyStringNew(0); + +int outFieldMax = inFieldCount + 16; // Mostly we remove fields but we do add a few. Gets checked char *outFields[outFieldMax]; char *outRow[outFieldMax]; int outFieldCount = 0; -struct dyString *scratch = dyStringNew(0); /* First we make up the basics of the outProject table. Mostly this is just * passing through from the inProject, but there's exceptions like contacts. */ int inIx; for (inIx=0; inIx<inFieldCount; ++inIx) { /* Fetch input name and value */ char *inName = inFields[inIx]; char *inVal = inRow[inIx]; /* Go through list of input fields we tweak slightly */ if (sameString("taxons", inName)) { inName = "species"; inVal = taxonsToSpecies(inVal, scratch); } /* Output all the ones we haven't dealt with already */ if (!startsWith("contact_", inName) && !sameString("contributors", inName)) { outFields[outFieldCount] = inName; outRow[outFieldCount] = inVal; ++outFieldCount; } } + +/* Add the fields we scan and merge from sample at end */ +struct slName *organList = uniqVals(inSample, "organ"); +struct slName *organPartList = uniqVals(inSample, "organ_part"); +struct slName *assayTypeList = uniqVals(inSample, "assay_type"); +struct slName *diseaseList = uniqVals(inSample, "disease"); +addListFieldIfNonempty("organ", organList, outFields, outRow, outFieldMax, &outFieldCount); +addListFieldIfNonempty("organ_part", organPartList, outFields, outRow, outFieldMax, &outFieldCount); +addListFieldIfNonempty("assay_type", assayTypeList, outFields, outRow, outFieldMax, &outFieldCount); +addListFieldIfNonempty("disease", diseaseList, outFields, outRow, outFieldMax, &outFieldCount); + struct fieldedTable *outTable = fieldedTableNew("project", outFields, outFieldCount); outTable->startsSharp = inProject->startsSharp; 
fieldedTableAdd(outTable, outRow, outFieldCount, 1); dyStringFree(&scratch); return outTable; } char *fieldedTableLookupNamedFieldInRow(struct fieldedTable *table, char *field, char **row) /* Look up field by name, not as efficient as by index but what the heck. */ { int ix = fieldedTableFindFieldIx(table, field); if (ix < 0) return NULL; return row[ix]; } @@ -258,31 +325,31 @@ /* Load up samples table */ char *sampleFile = "hcat_sample.tsv"; safef(inPath, sizeof(inPath), "%s/%s", inDir, sampleFile); char *sampleRequired[] = {"short_name",}; struct fieldedTable *inSample = fieldedTableFromTabFile(inPath, inPath, sampleRequired, ArraySize(sampleRequired)); uglyf("Got %d rows, %d columns from %s\n", inSample->rowCount, inSample->fieldCount, inSample->name); /* Make sure inProject table makes sense by having exactly one row */ if (inProject->rowCount != 1) errAbort("Expected one row in %s, got %d\n", projectFile, inProject->rowCount); struct fieldedTable *outContributor = makeContributors(inProject); -struct fieldedTable *outProject = makeProject(inProject); +struct fieldedTable *outProject = makeProject(inProject, inSample); struct fieldedTable *outLab = makeLab(inProject); /* Write output */ makeDirsOnPath(outDir); char outPath[PATH_LEN]; safef(outPath, sizeof(outPath), "%s/%s", outDir, "contributors.tsv"); fieldedTableToTabFile(outContributor, outPath); safef(outPath, sizeof(outPath), "%s/%s", outDir, "project.tsv"); fieldedTableToTabFile(outProject, outPath); if (outLab != NULL) { safef(outPath, sizeof(outPath), "%s/%s", outDir, "lab.tsv"); fieldedTableToTabFile(outLab, outPath); } }