src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c 55f0ab3d708b3c9898830d1d962f010730b43cfd

55f0ab3d708b3c9898830d1d962f010730b43cfd
kent
  Sun Sep 1 16:55:17 2019 -0700
Mangling the field names of the foreign keys and multi/multis.

diff --git src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c
index 7ae53e6..ab11917 100644
--- src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c
+++ src/hca/hcat/hcatTabUpdate/hcatTabUpdate.c
@@ -17,30 +17,39 @@
   "hcatTabUpdate - take the tabToTabDir result of the geo/sra import\n"
   "and unpack a few fields that are just too hard in strex. \n"
   "Put results in an output dir.\n" 
   "usage:\n"
   "   hcatTabUpdate inDir outDir\n"
   "options:\n"
   "   -xxx=XXX\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
+char *fieldedTableLookupNamedFieldInRow(struct fieldedTable *table, char *field, char **row)
+/* Look up field by name and return its value from row, or NULL if there is no such field.  Not as efficient as by index but what the heck. */
+{
+int ix = fieldedTableFindFieldIx(table, field);
+if (ix < 0)
+    return NULL;
+return row[ix];
+}
+
 void addIfReal(int fieldIx, char *oldFieldNames[], 
     char *newFields[], int newIxs[], int maxNewCount,int *pCurCount)
 /* If fieldIx is positive we consider it real anc add it to the newFields */
 {
 if (fieldIx > 0)
     {
     char *oldName = cloneString(oldFieldNames[fieldIx]);
     char *newName = oldName;
     char *prefix = "contact_";
     if (startsWith(prefix, oldName))
 	 {
          newName += strlen(prefix);;
 	 }
     int curCount = *pCurCount;
     if (curCount >= maxNewCount)
@@ -85,48 +94,53 @@
 {
 struct dyString *dy = dyStringNew(0);
 struct slName *el;
 for (el = list; el != NULL; el = el->next)
     csvEscapeAndAppend(dy, el->name);
 return dyStringCannibalize(&dy);
 }
 
 
 struct fieldedTable *makeContributors(struct fieldedTable *inProject)
 /* Make a fielded table from project contact info and contributors list */
 {
 char **projectRow = inProject->rowList->row;
 
 /* Make the contributors list in two pieces first off of the contact */
-int contact_name = fieldedTableFindFieldIx(inProject, "contact_name");
 int contact_email = fieldedTableFindFieldIx(inProject, "contact_email");
 int contact_phone = fieldedTableFindFieldIx(inProject, "contact_phone");
 int contact_department = fieldedTableFindFieldIx(inProject, "contact_department");
 int contact_institute = fieldedTableFindFieldIx(inProject, "contact_institute");
 int contact_address = fieldedTableFindFieldIx(inProject, "contact_address");
 int contact_city = fieldedTableFindFieldIx(inProject, "contact_city");
 int contact_country = fieldedTableFindFieldIx(inProject, "contact_country");
 int contact_zip_postal_code = fieldedTableFindFieldIx(inProject, "contact_zip_postal_code");
 
 
 /* Figure out which contact data we actually have */
 const int maxContacts = 32;
 char *contactFields[maxContacts+1];  // An extra for the project_role
 int contactIx[maxContacts];
-int realFieldCount = 0;
 char **oldFields = inProject->fields;
-addIfReal(contact_name, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
+
+// Add contact name field separately from the rest.  We know it's there since it's a 
+// required field, and also we need to decorate its name.
+contactFields[0] = "?name";
+contactIx[0] = fieldedTableMustFindFieldIx(inProject, "contact_name");
+int realFieldCount = 1;
+
+// The rest of the contact pieces are added just conditionally
 addIfReal(contact_email, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_phone, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_department, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_institute, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_address, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_city, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_country, oldFields, contactFields, contactIx, maxContacts, &realFieldCount);
 addIfReal(contact_zip_postal_code, oldFields, 
     contactFields, contactIx, maxContacts, &realFieldCount);
 contactFields[realFieldCount] = "project_role";
 realFieldCount += 1;
 uglyf("Got %d real contact fields\n", realFieldCount);
 
 /* Make contributor output table.  The first row of it will be seeded with the contact.
  * We can fill out names, but not other info on the other contributors, who will make
@@ -141,31 +155,31 @@
 struct dyString *scratch = dyStringNew(0);
 for (outIx=0; outIx<realFieldCount-1; ++outIx)
     {
     char *inTsv = projectRow[contactIx[outIx]];
     char *inVal = emptyForNull(cloneString(csvParseNext(&inTsv, scratch)));
     outVals[outIx] = inVal;
     }
 outVals[outIx] = "lab contact";
 char *contactName = cloneString(outVals[0]);
 fieldedTableAdd(contributors, outVals, realFieldCount, 1);
 
 /* Unroll the contributors field  into further rows*/
 for (outIx=0; outIx<realFieldCount; ++outIx)
     outVals[outIx] = "";	// Empty out all rows.
 int inContribIx = fieldedTableMustFindFieldIx(inProject, "contributors");
-int outContribIx = fieldedTableMustFindFieldIx(contributors, "name");
+int outContribIx = fieldedTableMustFindFieldIx(contributors, "?name");
 char *inTsv = projectRow[inContribIx];
 char *oneVal;
 while ((oneVal = csvParseNext(&inTsv, scratch)) != NULL)
     {
     if (differentString(oneVal, contactName))  // We already got the contact as a contributor
 	{
 	outVals[outContribIx] = cloneString(oneVal);
 	outVals[realFieldCount-1] = "contributor";
 	fieldedTableAdd(contributors, outVals, realFieldCount, contributors->rowCount+1);
 	}
     }
 return contributors;
 }
 
 char *lookupSpecies(char *taxon)
@@ -195,31 +209,35 @@
     char *species = lookupSpecies(taxon);
     csvEscapeAndAppend(result, species);
     }
 return dyStringCannibalize(&result);
 }
 
 void addListFieldIfNonempty(char *field, struct slName *list,
     char *newFields[], char *newVals[], int maxNewCount,int *pCurCount)
 /* Add field to newFields if list is non-empty, taking care not to go past end. */
 {
 if (list != NULL)
     {
     int curCount = *pCurCount;
     if (curCount >= maxNewCount)
        errAbort("Too many fields in addListFieldIfNonempty on %s, %d max", field, curCount);
-    newFields[curCount] = field;
+    char fieldName[256];	// Scratch buffer for the decorated field name built from field
+    safef(fieldName, sizeof(fieldName), 
+	"@@%s@hcat_project_%s@project_id@%s_id@hcat_%s.short_name@id", 
+	field, field, field, field);
+    newFields[curCount] = cloneString(fieldName);  // Store a copy, fieldName itself is a local buffer
     newVals[curCount] = slNameToCsv(list);
     *pCurCount = curCount+1;
     }
 }
 
 
 struct fieldedTable *makeProject(struct fieldedTable *inProject, struct fieldedTable *inSample)
 /* Make output project table.  This is the big one - 35 fields now
  * probably twice that by the time HCA is done.  Fortunately we only need
  * to deal with some of the fields and it only has one row. */
 {
 char **inFields = inProject->fields;
 char **inRow = inProject->rowList->row;
 int inFieldCount = inProject->fieldCount;
 struct dyString *scratch = dyStringNew(0);
@@ -229,137 +247,150 @@
 char *outRow[outFieldMax];
 int outFieldCount = 0;
 
 /* First we make up the basics of the outProject table.  Mostly this is just
  * passing through from the inProject, but there's exceptions like contacts. */
 int inIx;
 for (inIx=0; inIx<inFieldCount; ++inIx)
     {
     /* Fetch input name and value */
     char *inName = inFields[inIx];
     char *inVal = inRow[inIx];
 
     /* Go through list of input fields we tweak slightly */
     if (sameString("taxons", inName))
         {
-	inName = "species";
+	// it's many-to-many, whoot!
+	inName = "@@species@hcat_project_species@project_id@species_id@hcat_species@common_name@id";  
 	inVal = taxonsToSpecies(inVal, scratch);
 	}
 
-    /* Output all the ones we haven't dealt with already */
+    /* We might modify names of some fields */
+    char nameBuf[128];
+    if (sameString("state_reached", inName) || sameString("cur_state", inName))
+        {
+	safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_project_state.state@id", inName);
+	inName = cloneString(nameBuf);
+	}
+    else if (sameString("consent", inName) || sameString("effort", inName))
+        {
+	safef(nameBuf, sizeof(nameBuf), "@%s_id@hcat_%s.short_name@id", inName, inName);
+	inName = cloneString(nameBuf);
+	}
+
+    /* Output all the ones we haven't dealt with already or will deal with later */
     if (!startsWith("contact_", inName) && !sameString("contributors", inName))
         {
 	outFields[outFieldCount] = inName;
 	outRow[outFieldCount] = inVal;
 	++outFieldCount;
 	}
     }
 
+/* Add in contributors as a multi to multi field.  The count must be bumped or the next field added overwrites this slot. */
+outFields[outFieldCount] = "@@contributors@hcat_project_contributors@project_id@contributor_id@hcat_contributor.name@id";
+outRow[outFieldCount] = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow);
+outFieldCount += 1;
 /* Add the fields we scan and merge from sample at end */
 struct slName *organList = uniqVals(inSample, "organ");
 struct slName *organPartList = uniqVals(inSample, "organ_part");
 struct slName *assayTypeList = uniqVals(inSample, "assay_type");
 struct slName *diseaseList = uniqVals(inSample, "disease");
 addListFieldIfNonempty("organ", organList, outFields, outRow, outFieldMax, &outFieldCount);
 addListFieldIfNonempty("organ_part", organPartList, outFields, outRow, outFieldMax, &outFieldCount);
 addListFieldIfNonempty("assay_type", assayTypeList, outFields, outRow, outFieldMax, &outFieldCount);
 addListFieldIfNonempty("disease", diseaseList, outFields, outRow, outFieldMax, &outFieldCount);
 
 struct fieldedTable *outTable = fieldedTableNew("project", outFields, outFieldCount);
 outTable->startsSharp = inProject->startsSharp;
 fieldedTableAdd(outTable, outRow, outFieldCount, 1);
 dyStringFree(&scratch);
 return outTable;
 }
 
-char *fieldedTableLookupNamedFieldInRow(struct fieldedTable *table, char *field, char **row)
-/* Look up field by name,  not as effient as by index but what the heck. */
-{
-int ix = fieldedTableFindFieldIx(table, field);
-if (ix < 0)
-    return NULL;
-return row[ix];
-}
-
 struct fieldedTable *makeLab(struct fieldedTable *inProject)
 /* If there's a lab field we make a lab table and seed it with the contacts. */
 {
 int labIx = fieldedTableFindFieldIx(inProject, "lab");
 if (labIx >= 0)
     {
     char **inRow = inProject->rowList->row;
     char *short_name = inRow[labIx];
     char *contact = fieldedTableLookupNamedFieldInRow(inProject, "contact_name", inRow);
+    char *contributors = fieldedTableLookupNamedFieldInRow(inProject, "contributors", inRow);  // Passed through unparsed
     char *institute = emptyForNull(fieldedTableLookupNamedFieldInRow(inProject, "contact_institute", inRow));
     char labName[256];
     if (strlen(short_name) < 20)  // Unlikely to be unique, may cause trouble
 	safef(labName, sizeof(labName), "%s %s", short_name, institute);
     else
         safef(labName, sizeof(labName), "%s", short_name);
     labName[50] = 0;  // not too long
 
-    char *outFields[3] = {"short_name", "institute", "contact"};
+    char *outFields[4] = {"?short_name", "institute", "@contact@hcat_contributor.name@id", 
+	"@@contributors@hcat_lab_contributors@lab_id@contributor_id@hcat_contributor.name@id"};  // Decorated output field names
     struct fieldedTable *labTable = fieldedTableNew("lab", outFields, ArraySize(outFields));
-    char *outRow[3] = {labName, institute, contact};
+    char *outRow[4] = {labName, institute, contact, contributors};  // Values in the same order as outFields
     fieldedTableAdd(labTable, outRow, ArraySize(outRow), 1);
     return labTable;
     }
 else
     return NULL;
 }
 
 void hcatTabUpdate(char *inDir, char *outDir)
 /* hcatTabUpdate - Update the hcat database given a tab seperated input and output dir. */
 {
 // We are actually just looking for specific files in inDir. */
 
 /* Load up input projects table */
 char *projectFile = "hcat_project.tsv";
 char inPath[PATH_LEN];
 safef(inPath, sizeof(inPath), "%s/%s", inDir, projectFile);
-char *projectRequired[] = {"short_name", "contact_name"};
+char *projectRequired[] = {"short_name", "contact_name", "contributors"};  // Required fields of the project file
 struct fieldedTable *inProject = fieldedTableFromTabFile(inPath, inPath, 
     projectRequired, ArraySize(projectRequired));
 uglyf("Got %d rows, %d columns from %s\n", 
     inProject->rowCount, inProject->fieldCount, inProject->name);
 
 /* Load up samples table */
 char *sampleFile = "hcat_sample.tsv";
 safef(inPath, sizeof(inPath), "%s/%s", inDir, sampleFile);
 char *sampleRequired[] = {"short_name",};
 struct fieldedTable *inSample = fieldedTableFromTabFile(inPath, inPath, 
     sampleRequired, ArraySize(sampleRequired));
 uglyf("Got %d rows, %d columns from %s\n", 
     inSample->rowCount, inSample->fieldCount, inSample->name);
 
 
 /* Make sure inProject table makes sense by having exactly one row */
 if (inProject->rowCount != 1)
     errAbort("Expected one row in %s, got %d\n", projectFile, inProject->rowCount);
 
 struct fieldedTable *outContributor = makeContributors(inProject);
 struct fieldedTable *outProject = makeProject(inProject, inSample);
 struct fieldedTable *outLab = makeLab(inProject);
 
-/* Write output */
+/* Write output from lowest level to highest level tables: contributors, then lab (if any), then project. */
 makeDirsOnPath(outDir);
 char outPath[PATH_LEN];
 safef(outPath, sizeof(outPath), "%s/%s", outDir, "contributors.tsv");
 fieldedTableToTabFile(outContributor, outPath);
-safef(outPath, sizeof(outPath), "%s/%s", outDir, "project.tsv");
-fieldedTableToTabFile(outProject, outPath);
+
 if (outLab != NULL)
     {
     safef(outPath, sizeof(outPath), "%s/%s", outDir, "lab.tsv");
     fieldedTableToTabFile(outLab, outPath);
     }
+
+safef(outPath, sizeof(outPath), "%s/%s", outDir, "project.tsv");
+fieldedTableToTabFile(outProject, outPath);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 hcatTabUpdate(argv[1], argv[2]);
 return 0;
 }