src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c ca4919c8dda0799f1bc1b68de0741b6a38172381

ca4919c8dda0799f1bc1b68de0741b6a38172381
kent
  Sun Sep 1 14:07:38 2019 -0700
Starting to process fields like organ and organ_part that come from sample

diff --git src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c
index ba0270a..7ae53e6 100644
--- src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c
+++ src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c
@@ -39,30 +39,69 @@
     char *newName = oldName;
     char *prefix = "contact_";
     if (startsWith(prefix, oldName))
 	 {
          newName += strlen(prefix);;
 	 }
     int curCount = *pCurCount;
     if (curCount >= maxNewCount)
        errAbort("Too many fields in addIfRead on %s, %d max", oldName, curCount);
     newIxs[curCount] = fieldIx;
     newFields[curCount] = newName;
     *pCurCount = curCount+1;
     }
 }
 
+struct slName *uniqVals(struct fieldedTable *table, char *field)
+/* Return unique CSV values found in field across all rows, first seen first.  NULL if no such field. */
+{
+int fieldIx = fieldedTableFindFieldIx(table, field);
+if (fieldIx < 0)
+    return NULL;
+struct dyString *scratch = dyStringNew(0);  /* Allocated after the early return so it can't leak */
+struct hash *uniqHash = hashNew(0);
+struct slName *valList = NULL;
+struct fieldedRow *fr;
+for (fr = table->rowList; fr != NULL; fr = fr->next)
+    {
+    char *inTsv = fr->row[fieldIx];
+    char *val;
+    while ((val = csvParseNext(&inTsv, scratch)) != NULL)  /* A cell may hold several values */
+	{
+	if (hashLookup(uniqHash, val) == NULL)  /* Not seen in any earlier cell */
+	    {
+	    hashAdd(uniqHash, val, NULL);
+	    slNameAddHead(&valList, val);
+	    }
+	}
+    }
+slReverse(&valList);  /* Values were added at head, so flip back to first-seen order */
+hashFree(&uniqHash);
+dyStringFree(&scratch);
+return valList;
+}
+
+char *slNameToCsv(struct slName *list)
+/* Run every name in list through csvEscapeAndAppend and hand back the
+ * accumulated text as a single string. */
+{
+struct dyString *csv = dyStringNew(0);
+for ( ; list != NULL; list = list->next)
+    csvEscapeAndAppend(csv, list->name);
+return dyStringCannibalize(&csv);
+}
+
 
 struct fieldedTable *makeContributors(struct fieldedTable *inProject)
 /* Make a fielded table from project contact info and contributors list */
 {
 char **projectRow = inProject->rowList->row;
 
 /* Make the contributors list in two pieces first off of the contact */
 int contact_name = fieldedTableFindFieldIx(inProject, "contact_name");
 int contact_email = fieldedTableFindFieldIx(inProject, "contact_email");
 int contact_phone = fieldedTableFindFieldIx(inProject, "contact_phone");
 int contact_department = fieldedTableFindFieldIx(inProject, "contact_department");
 int contact_institute = fieldedTableFindFieldIx(inProject, "contact_institute");
 int contact_address = fieldedTableFindFieldIx(inProject, "contact_address");
 int contact_city = fieldedTableFindFieldIx(inProject, "contact_city");
 int contact_country = fieldedTableFindFieldIx(inProject, "contact_country");
@@ -147,68 +186,96 @@
 char *taxonsToSpecies(char *inVal, struct dyString *scratch)
 /* Convert a comma separated list of taxons in inVal to a
  * comma separated species list, using scratch. */
 {
 struct dyString *result = dyStringNew(64);
 char *taxon;
 char *s = inVal;
 while ((taxon = csvParseNext(&s, scratch)) != NULL)
     {
     char *species = lookupSpecies(taxon);
     csvEscapeAndAppend(result, species);
     }
 return dyStringCannibalize(&result);
 }
 
-struct fieldedTable *makeProject(struct fieldedTable *inProject)
+void addListFieldIfNonempty(char *field, struct slName *list,
+    char *newFields[], char *newVals[], int maxNewCount,int *pCurCount)
+/* If list is non-empty store field and the list's CSV form at slot *pCurCount of
+ * newFields/newVals and bump *pCurCount.  errAborts if that slot is >= maxNewCount. */
+{
+if (list == NULL)
+    return;
+int slot = *pCurCount;
+if (slot >= maxNewCount)
+   errAbort("Too many fields in addListFieldIfNonempty on %s, %d max", field, slot);
+newFields[slot] = field;
+newVals[slot] = slNameToCsv(list);
+*pCurCount = slot+1;
+}
+
+
+struct fieldedTable *makeProject(struct fieldedTable *inProject, struct fieldedTable *inSample)
 /* Make output project table.  This is the big one - 35 fields now
  * probably twice that by the time HCA is done.  Fortunately we only need
  * to deal with some of the fields and it only has one row. */
 {
 char **inFields = inProject->fields;
 char **inRow = inProject->rowList->row;
 int inFieldCount = inProject->fieldCount;
-int outFieldMax = inFieldCount + 16;  // Mostly we remove fields but we do add a few
+struct dyString *scratch = dyStringNew(0);
+
+int outFieldMax = inFieldCount + 16;  // Pass-through loop adds at most inFieldCount; addListFieldIfNonempty enforces the limit after that
 char *outFields[outFieldMax];  
 char *outRow[outFieldMax];
 int outFieldCount = 0;
-struct dyString *scratch = dyStringNew(0);
 
 /* First we make up the basics of the outProject table.  Mostly this is just
  * passing through from the inProject, but there's exceptions like contacts. */
 int inIx;
 for (inIx=0; inIx<inFieldCount; ++inIx)
     {
     /* Fetch input name and value */
     char *inName = inFields[inIx];
     char *inVal = inRow[inIx];
 
     /* Go through list of input fields we tweak slightly */
     if (sameString("taxons", inName))
         {
 	inName = "species";
 	inVal = taxonsToSpecies(inVal, scratch);
 	}
 
     /* Output all the ones we haven't dealt with already */
     if (!startsWith("contact_", inName) && !sameString("contributors", inName))
         {
 	outFields[outFieldCount] = inName;
 	outRow[outFieldCount] = inVal;
 	++outFieldCount;
 	}
     }
+
+/* Then add organ etc. as lists of the unique values across all sample rows.  Fields with no values are left out. */
+struct slName *organList = uniqVals(inSample, "organ");  // NOTE(review): these four lists are not freed in this function
+struct slName *organPartList = uniqVals(inSample, "organ_part");
+struct slName *assayTypeList = uniqVals(inSample, "assay_type");
+struct slName *diseaseList = uniqVals(inSample, "disease");
+addListFieldIfNonempty("organ", organList, outFields, outRow, outFieldMax, &outFieldCount);
+addListFieldIfNonempty("organ_part", organPartList, outFields, outRow, outFieldMax, &outFieldCount);
+addListFieldIfNonempty("assay_type", assayTypeList, outFields, outRow, outFieldMax, &outFieldCount);
+addListFieldIfNonempty("disease", diseaseList, outFields, outRow, outFieldMax, &outFieldCount);
+
 struct fieldedTable *outTable = fieldedTableNew("project", outFields, outFieldCount);
 outTable->startsSharp = inProject->startsSharp;
 fieldedTableAdd(outTable, outRow, outFieldCount, 1);
 dyStringFree(&scratch);
 return outTable;
 }
 
 char *fieldedTableLookupNamedFieldInRow(struct fieldedTable *table, char *field, char **row)
 /* Look up field by name,  not as effient as by index but what the heck. */
 {
 int ix = fieldedTableFindFieldIx(table, field);
 if (ix < 0)
     return NULL;
 return row[ix];
 }
@@ -258,31 +325,31 @@
 /* Load up samples table */
 char *sampleFile = "hcat_sample.tsv";
 safef(inPath, sizeof(inPath), "%s/%s", inDir, sampleFile);
 char *sampleRequired[] = {"short_name",};
 struct fieldedTable *inSample = fieldedTableFromTabFile(inPath, inPath, 
     sampleRequired, ArraySize(sampleRequired));
 uglyf("Got %d rows, %d columns from %s\n", 
     inSample->rowCount, inSample->fieldCount, inSample->name);
 
 
 /* Make sure inProject table makes sense by having exactly one row */
 if (inProject->rowCount != 1)
     errAbort("Expected one row in %s, got %d\n", projectFile, inProject->rowCount);
 
 struct fieldedTable *outContributor = makeContributors(inProject);
-struct fieldedTable *outProject = makeProject(inProject);
+struct fieldedTable *outProject = makeProject(inProject, inSample);
 struct fieldedTable *outLab = makeLab(inProject);
 
 /* Write output */
 makeDirsOnPath(outDir);
 char outPath[PATH_LEN];
 safef(outPath, sizeof(outPath), "%s/%s", outDir, "contributors.tsv");
 fieldedTableToTabFile(outContributor, outPath);
 safef(outPath, sizeof(outPath), "%s/%s", outDir, "project.tsv");
 fieldedTableToTabFile(outProject, outPath);
 if (outLab != NULL)
     {
     safef(outPath, sizeof(outPath), "%s/%s", outDir, "lab.tsv");
     fieldedTableToTabFile(outLab, outPath);
     }
 }