src/hg/sqlUpdateRelated/sqlUpdateRelated.c 6f32b1ba12ffa50bdffefd5cc88ea12141f1334a

6f32b1ba12ffa50bdffefd5cc88ea12141f1334a
kent
  Sat Sep 14 14:33:04 2019 -0700
Started to document how fields work in usage message.  Not complete.  My, this code could use a couple of functions broken up too - logic is a little tortured and long now.  Alas it's gonna be the sort of refactor where the functions are going to have 8 parameters each if I break up the big one.

diff --git src/hg/sqlUpdateRelated/sqlUpdateRelated.c src/hg/sqlUpdateRelated/sqlUpdateRelated.c
index 0828e96..e5475bb 100644
--- src/hg/sqlUpdateRelated/sqlUpdateRelated.c
+++ src/hg/sqlUpdateRelated/sqlUpdateRelated.c
@@ -1,432 +1,442 @@
 /* sqlUpdateRelated - Update a bunch of tables in a kind of careful way based out of tab separated 
  * files.  Handles foreign key and many-to-many relationships with a multitude of @ signs. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "fieldedTable.h"
 #include "csv.h"
 #include "jksql.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "sqlUpdateRelated - Update a bunch of tables in a kind of careful way based out of tab \n"
   "separated files.  Handles foreign key and many-to-many relationships with a multitude\n"
   "of @ signs.  Currently only works with mysql cause going through jksql\n"
   "usage:\n"
   "   sqlUpdateRelated database tableFiles\n"
   "options:\n"
   "   -missOk - if set, tableFiles mentioned that don't exist are skipped rather than erroring\n"
+  "The tableFiles are in a interesting and peculiar format.  The first line with the field name\n"
+  "ends up controlling this program.  If a field starts with just a regular letter all is as\n"
+  "you may expect,  the field just contains data to load.  However if the field starts with\n"
+  "a special char, special things happen.  In particular\n"
+  "   ? - indicates field is a conditional key field.  Record is only inserted if the value\n"
+  "       for this field is not already present in table\n"
+  "   ! - indicates this is update key field.  Record must already exist,  values in other fields\n"
+  "       are updated.\n"
+  "   @ - indicates a foreign key relationship - see source code until docs are in shape\n"
+  "   @@ - indicates a many-to-many relationship - see source code until docs are in shape"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"missOk", TRUE},
    {NULL, 0},
 };
 
 struct foreignRef
 /* What we need to handle a foreign key reference */
     {
     struct foreignRef *next;	    // Next in list
     char *nativeFieldName;	    // Name in the current table
     char *foreignTable;		    // Foreign table name
     char *foreignFindName;	    // Name to search in the destination table
     char *foreignKeyName;	    // Name of key we are fetching, usually just "id"
     char *outputVal;		    // Used as a place to hold the row value for later processing
     int nativeFieldIx;		    // Field index in native table
     char *foreignKey;		    // Actual foreign key - computed each row
     };
 
 struct multiRef
 /* What we need to handle a foreign key reference */
     {
     struct multiRef *next;	    // Next in list
     char *nativeFieldName;	    // Name in the current table
     char *nativeKeyName;	    // The name of the key in current table
     char *relationalTable;	    // Table that links the two together
     char *relationalNativeField;    // The field that specifies the record in current table
     char *relationalForeignField;   // The field that specifies the record in foreign table
     char *foreignTable;		    // Foreign table name
     char *foreignFindName;	    // Name to search in the destination table
     char *foreignKeyName;	    // Name of key we are fetching, usually just "id"
     int nativeFieldIx;		    // Field index in native table
     };
 
 void addMultiRelation(struct sqlConnection *conn, struct multiRef *mRef, struct fieldedRow *fr,
     int nativeId, char *tabFile, struct dyString *csvScratch)
 /* inCsv is a comma separated list of names that we should be able to locate in the foreign
  * table via mRef->foreignFindName.  We make up relationships for them in the relationalTable. */
 {
 char *inCsv = fr->row[mRef->nativeFieldIx];
 char *nativeVal;
 char *parsePos = inCsv;
 struct dyString *sql = dyStringNew(0);
 while ((nativeVal = csvParseNext(&parsePos, csvScratch)) != NULL)
     {
     char *escaped = sqlEscapeString(nativeVal);
     dyStringClear(sql);
     sqlDyStringPrintf(sql, "select %s from %s where %s=\"%s\"",
 	    mRef->foreignKeyName, mRef->foreignTable, 
 	    mRef->foreignFindName, escaped);
     char *foreignKey = sqlQuickString(conn, sql->string);
     verbose(2, "foreignKey for %s is %s\n", nativeVal, foreignKey);
     if (isEmpty(foreignKey))
 	errAbort("No %s in table %s referenced line %d of %s",  nativeVal, mRef->foreignTable,
 	    fr->id,  tabFile);
     
     // Alright, we got our native and foreign keys, let's insert a row in relationship table
     dyStringClear(sql);
     sqlDyStringPrintf(sql, "insert into %s (%s,%s) values (%d,%s)", mRef->relationalTable,
 	mRef->relationalNativeField, mRef->relationalForeignField,
 	nativeId, foreignKey);
     verbose(2, "relationship sql: %s\n", sql->string);
     sqlUpdate(conn, sql->string);
     freez(&escaped);
     }
 dyStringFree(&sql);
 }
 
 void checkFieldExists(struct sqlConnection *conn, char *table, char *field, char *attyField)
 /* Make sure field exists in table in database or print error message that includes
  * attyField */
 {
 if (sqlFieldIndex(conn, table, field) < 0)
     errAbort("No field %s in table %s used in %s", 
 	field, table, attyField);
 }
 
 void checkTableExists(struct sqlConnection *conn, char *table, char *attyField)
 /* Make sure table exists in database or print error message that includes
  * attyField */
 {
 if (!sqlTableExists(conn, table))
     errAbort("Table %s from %s doesn't exist", table, attyField);
 }
 
 void sqlUpdateViaTabFile(struct sqlConnection *conn, char *tabFile, char *tableName)
 /* Interpret one tab-separated file */
 {
 // Load the tabFile 
 struct fieldedTable *inTable = fieldedTableFromTabFile(tabFile, tabFile, NULL, 0);
 verbose(2, "%d fields and %d rows in %s\n",  inTable->fieldCount, inTable->rowCount, tabFile);
 char **inFields = inTable->fields;
 
 // Loop through the fields creating field set for output table and parsing
 // foreign and multi-multi fields into structures 
 char *conditionalField = NULL;  // We might have one of these
 int conditionalIx = -1;
 boolean updateCondition = FALSE;    
 struct foreignRef *foreignRefList = NULL;
 struct multiRef *multiRefList = NULL;
 int fieldIx;
 for (fieldIx=0; fieldIx<inTable->fieldCount; ++fieldIx)
     {
     char *field = inFields[fieldIx];
     char firstChar = field[0];
     if (firstChar == '?' || firstChar == '!')
         {
 	if (conditionalField != NULL)
 	    errAbort("Multiple fields starting with a '?' or '!', There can only be one\n"
 		"but both %s and %s exist\n", conditionalField, field+1);
 	conditionalField = field;
 	conditionalIx = fieldIx;
 	updateCondition = (firstChar == '!');
 	checkFieldExists(conn, tableName, field + 1, field);
 	verbose(2, "conditionalField = %s, ix = %d\n", field, conditionalIx);
 	}
     else if (firstChar == '@')  // Foreign keys are involved.  Will it get worse?
         {
 	char *chopTemp = cloneString(field);
 	if (field[1] == '@')  // Ugh, a multiRef.  Much to parse!
 	    {
 	    verbose(2, "multiRef field = %s\n", field);
 	    char *pos = chopTemp + 2;  // Set up to be past @
 	    int expectedCount = 8;
 	    char *parts[expectedCount+1];	// More than we need
 	    int partCount = chopByChar(pos, '@', parts, ArraySize(parts));
 	    if (partCount != expectedCount)
 	        {
 		errAbort("Expecting %d @ separated fields in %s, got %d\n",
 		    expectedCount, field, partCount);
 		}
 	    /* Makup up multiRef struct */
 	    struct multiRef *mRef;
 	    AllocVar(mRef);
 	    mRef->nativeFieldName = parts[0];	    
 	    mRef->nativeKeyName = parts[1];	    
 	    mRef->relationalTable = parts[2];	    
 	    mRef->relationalNativeField = parts[3];    
 	    mRef->relationalForeignField = parts[4];   
 	    mRef->foreignTable = parts[5];		    
 	    mRef->foreignFindName = parts[6];	    
 	    mRef->foreignKeyName = parts[7];	    
 	    mRef->nativeFieldIx = fieldIx;
 
 	    /* Check fields and tables exist */
 	    checkFieldExists(conn, tableName, mRef->nativeKeyName, field);
 	    checkTableExists(conn, mRef->relationalTable, field);
 	    checkFieldExists(conn, mRef->relationalTable, mRef->relationalNativeField, field);
 	    checkFieldExists(conn, mRef->relationalTable, mRef->relationalForeignField, field);
 	    checkTableExists(conn, mRef->foreignTable, field);
 	    checkFieldExists(conn, mRef->foreignTable, mRef->foreignFindName, field);
 	    checkFieldExists(conn, mRef->foreignTable, mRef->foreignKeyName, field);
 
 	    /* Everything checks out, add it to list */
 	    slAddTail(&multiRefList, mRef);
 	    }
 	else
 	    {
 	    verbose(2, "foreignRef field = %s\n", field);
 	    char *pos = chopTemp + 1;  // Set up to be past @
 	    int expectedCount = 4;
 	    char *parts[expectedCount+1];	// More than we need
 	    int partCount = chopByChar(pos, '@', parts, ArraySize(parts));
 	    if (partCount != expectedCount)
 	        {
 		errAbort("Expecting %d @ separated fields in %s, got %d\n",
 		    expectedCount, field, partCount);
 		}
 	    /* Make up a foreignRef */
 	    struct foreignRef *fRef;
 	    AllocVar(fRef);
 	    fRef->nativeFieldName = parts[0];
 	    fRef->foreignTable = parts[1];
 	    fRef->foreignFindName = parts[2];
 	    fRef->foreignKeyName = parts[3];
 	    fRef->nativeFieldIx = fieldIx;
 
 	    /* Make sure all tables and fields exist */
 	    checkFieldExists(conn, tableName, fRef->nativeFieldName, field);
 	    checkTableExists(conn, fRef->foreignTable, field);
 	    checkFieldExists(conn, fRef->foreignTable, fRef->foreignFindName, field);
 	    checkFieldExists(conn, fRef->foreignTable, fRef->foreignKeyName, field);
 
 	    slAddTail(&foreignRefList, fRef);
 	    }
 	}
     else
         {
 	checkFieldExists(conn, tableName, field, field);
 	}
     }
 verbose(2, "Got %s conditional, %d foreignRefs, %d multiRefs\n", 
 	naForNull(conditionalField), slCount(foreignRefList), slCount(multiRefList));
 
 if (updateCondition)  // In update mode we can't handle fancy stuff
     {
     if (foreignRefList != NULL || multiRefList != NULL)
         errAbort("Can't handle foreign keys or multi-multi relations when doing ! updates");
     if (inTable->fieldCount < 2)
         errAbort("Need at least two fields in update mode");
     }
 
 /* Now we loop through the input table and make the appropriate sql queries and inserts */
 struct fieldedRow *fr;
 struct dyString *sql = dyStringNew(0);
 struct dyString *csvScratch = dyStringNew(0);
 for (fr = inTable->rowList; fr != NULL; fr = fr->next)
     {
     char **row = fr->row;
 
     /* The case of the update (!) condition is special.  */
     if (updateCondition)
         {
 	/* Make sure that the record we are updating exists for better
 	 * error reporting.  There's a race condition that'll make a SQL error happen
 	 * instead once in a million years. */
 	dyStringClear(sql);
 	char *rawVal = row[conditionalIx];
 	char *uncsvVal = csvParseNext(&rawVal, csvScratch);
 	char *conditionalEscaped = sqlEscapeString(uncsvVal);
 	sqlDyStringPrintf(sql, "select count(*) from %s where %s='%s'",
 	    tableName, conditionalField+1, conditionalEscaped);
 	verbose(2, "%s\n", sql->string);
 	if (sqlQuickNum(conn, sql->string) == 0)
 	    errAbort("Trying to update %s in %s.%s, but it doesn't exist",
 		uncsvVal, tableName, conditionalField+1);
 
 	dyStringClear(sql);
 	sqlDyStringPrintf(sql, "update %s set", tableName);
 	boolean firstTime = TRUE;
 	for (fieldIx=0; fieldIx < inTable->fieldCount; ++fieldIx)
 	    {
 	    if (fieldIx != conditionalIx)
 		{
 		char *rawVal = row[fieldIx];
 		char *uncsvVal = csvParseNext(&rawVal, csvScratch);
 		char *escaped = sqlEscapeString(uncsvVal);
 		if (firstTime)
 		    firstTime = FALSE;
 		else
 		    sqlDyStringPrintf(sql, ",");
 		sqlDyStringPrintf(sql, " %s='%s'", inFields[fieldIx], escaped);
 		freez(&escaped);
 		}
 	    }
 	sqlDyStringPrintf(sql, " where %s='%s'", conditionalField+1, conditionalEscaped);
 	verbose(2, "%s\n", sql->string);
 	sqlUpdate(conn, sql->string);
 
 	continue;	// We are done,  the rest of the loop is for inserts not updates
 	}
 
     /* Deal with conditional field.  If we have one and the value we are trying to insert
      * already exists then just continue to next row. */
     if (conditionalField != NULL && sameString(conditionalField, inFields[conditionalIx]))
 	{
 	dyStringClear(sql);
 	char *rawVal = row[conditionalIx];
 	char *uncsvVal = csvParseNext(&rawVal, csvScratch);
 	// Before we do more we see if the record already exists
 	sqlDyStringPrintf(sql, "select count(*) from %s where %s='%s'",
 	    tableName, conditionalField+1, uncsvVal);
 	verbose(2, "%s\n", sql->string);
 	if (sqlQuickNum(conn, sql->string) > 0)
 	    continue;
 	}
 
     /* Cope with foreign keys */
     struct foreignRef *fRef;
     for (fRef = foreignRefList; fRef != NULL; fRef = fRef->next)
         {
 	char *origVal = row[fRef->nativeFieldIx];
 	char *val = emptyForNull(csvParseNext(&origVal, csvScratch));
 	char *escaped = sqlEscapeString(val);
 	dyStringPrintf(sql, "\"%s\"",  escaped);
 	dyStringClear(sql);
 	sqlDyStringPrintf(sql, "select %s from %s where %s=\"%s\"",
 	    fRef->foreignKeyName, fRef->foreignTable, 
 	    fRef->foreignFindName, escaped);
 	verbose(2, "query for foreignKey: %s\n", sql->string);
 	fRef->foreignKey = sqlQuickString(conn, sql->string);
 	if (isEmpty(fRef->foreignKey))
 	    errAbort("No %s in table %s referenced line %d of %s",  val, fRef->foreignTable,
 	        fr->id,  tabFile);
 	row[fRef->nativeFieldIx] = fRef->foreignKey;
 	freez(&escaped);
 	}
 
     dyStringClear(sql);
     sqlDyStringPrintf(sql, "insert into %s (", tableName);
     boolean firstTime = TRUE;
     for (fieldIx=0; fieldIx < inTable->fieldCount; ++fieldIx)
         {
 	char *field = inFields[fieldIx];
 	char firstChar = field[0];
 
 	if (firstChar == '@')
 	    {
 	    if (field[1] == '@')  // multi field
 		{
 		// Actually multi field variables don't get written, all lives in
 		// the relationship table which we handle after the insert into main 
 		// table.
 		continue;
 		}
 	    else
 		{
 		char *startField = field + 1;  // skip over '@'
 		char *endField = strchr(startField, '@');
 		// This is parsed out so we know it works until someone rearranged code
 		assert(endField != NULL);  
 		field = cloneStringZ(startField, endField-startField);
 		}
 	    }
 
 	// We already dealt with the question mark outside this loop
 	if (firstChar == '?')
 	    field += 1;
 
 
 	if (firstTime)
 	    firstTime = !firstTime;
 	else
 	    dyStringAppendC(sql, ',');
 	dyStringAppend(sql, field);
 	}
 
     /* Now generate the values bit */
     dyStringAppend(sql, ") values (");
     firstTime = TRUE;
     for (fieldIx=0; fieldIx < inTable->fieldCount; ++fieldIx)
         {
 	char *field = inFields[fieldIx];
 	char firstChar = field[0];
 
 	if (firstChar == '@')
 	    {
 	    if (field[1] == '@')  // multi field
 		{
 		continue;	// multi field output doesn't go into this table, just relationship
 		}
 	    else
 	        field += 1;	// We val with the foreign key here, just skip over '@'
 	    }
 	if (firstChar == '?')
 	    field += 1;
 	if (firstTime)
 	    firstTime = !firstTime;
 	else
 	    dyStringAppendC(sql, ',');
 	char *origVal = row[fieldIx];
 	char *val = emptyForNull(csvParseNext(&origVal, csvScratch));
 	char *escaped = sqlEscapeString(val);
 	dyStringPrintf(sql, "\"%s\"",  escaped);
 	freez(&escaped);
 	}
     dyStringAppendC(sql, ')');
     verbose(2, "update sql: %s\n", sql->string);
     sqlUpdate(conn, sql->string);
     int mainTableId = sqlLastAutoId(conn);
 
     /* Handle multi-multi stuff */
     struct multiRef *mRef;
     for (mRef = multiRefList; mRef != NULL; mRef = mRef->next)
         {
 	addMultiRelation(conn, mRef, fr, mainTableId, tabFile, csvScratch);
 	}
 
     /* Clean up strings allocated for field references */
     for (fRef = foreignRefList; fRef != NULL; fRef = fRef->next)
 	freez(&fRef->foreignKey);
     }
 dyStringFree(&sql);
 dyStringFree(&csvScratch);
 fieldedTableFree(&inTable);
 }
 
 void sqlUpdateRelated(char *database, char **inFiles, int inCount)
 /* sqlUpdateRelated - Update a bunch of tables in a kind of careful way based out of tab 
  * separated files.  Handles foreign key and many-to-many relationships with a multitude 
  * of @ signs.. */
 {
 struct sqlConnection *conn = sqlConnect(database);
 int fileIx;
 boolean missOk = optionExists("missOk");
 for (fileIx = 0; fileIx < inCount; ++fileIx)
     {
     char *inFile = inFiles[fileIx];
     if (missOk && !fileExists(inFile))
         continue;
     char *tableName = cloneString(inFile);
     chopSuffix(tableName);
     verbose(1, "Processing %s into %s table \n", inFile, tableName);
     sqlUpdateViaTabFile(conn, inFile, tableName);
     }
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 sqlUpdateRelated(argv[1], argv+2, argc-2);
 return 0;
 }