src/hg/sqlUpdateRelated/sqlUpdateRelated.c 0890e2c3c68649027aa95195fe82ca2e1101e82e

0890e2c3c68649027aa95195fe82ca2e1101e82e
kent
  Sun Sep 15 11:42:11 2019 -0700
Adding -uncsv as an option.  Actually it used to be the default so I had to throw it on in the test set.  The csv/array vs. normal commas in free text are a weak point of the system.  Between that and the SQL you can't escape escaping it seems!

diff --git src/hg/sqlUpdateRelated/sqlUpdateRelated.c src/hg/sqlUpdateRelated/sqlUpdateRelated.c
index e5475bb..e40a2cb 100644
--- src/hg/sqlUpdateRelated/sqlUpdateRelated.c
+++ src/hg/sqlUpdateRelated/sqlUpdateRelated.c
@@ -8,46 +8,49 @@
 #include "fieldedTable.h"
 #include "csv.h"
 #include "jksql.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "sqlUpdateRelated - Update a bunch of tables in a kind of careful way based out of tab \n"
   "separated files.  Handles foreign key and many-to-many relationships with a multitude\n"
   "of @ signs.  Currently only works with mysql cause going through jksql\n"
   "usage:\n"
   "   sqlUpdateRelated database tableFiles\n"
   "options:\n"
   "   -missOk - if set, tableFiles mentioned that don't exist are skipped rather than erroring\n"
+  "   -uncsv - if set, run uncsv and just take first value for each field, needed sometimes\n"
+  "            to deal with extra quotes from tagstorms and other sources\n"
   "The tableFiles are in a interesting and peculiar format.  The first line with the field name\n"
   "ends up controlling this program.  If a field starts with just a regular letter all is as\n"
   "you may expect,  the field just contains data to load.  However if the field starts with\n"
   "a special char, special things happen.  In particular\n"
   "   ? - indicates field is a conditional key field.  Record is only inserted if the value\n"
   "       for this field is not already present in table\n"
   "   ! - indicates this is update key field.  Record must already exist,  values in other fields\n"
   "       are updated.\n"
   "   @ - indicates a foreign key relationship - see source code until docs are in shape\n"
   "   @@ - indicates a many-to-many relationship - see source code until docs are in shape"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
-   {"missOk", TRUE},
+   {"missOk", OPTION_BOOLEAN},
+   {"uncsv", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct foreignRef
 /* What we need to handle a foreign key reference */
     {
     struct foreignRef *next;	    // Next in list
     char *nativeFieldName;	    // Name in the current table
     char *foreignTable;		    // Foreign table name
     char *foreignFindName;	    // Name to search in the destination table
     char *foreignKeyName;	    // Name of key we are fetching, usually just "id"
     char *outputVal;		    // Used as a place to hold the row value for later processing
     int nativeFieldIx;		    // Field index in native table
     char *foreignKey;		    // Actual foreign key - computed each row
     };
@@ -106,31 +109,31 @@
  * attyField */
 {
 if (sqlFieldIndex(conn, table, field) < 0)
     errAbort("No field %s in table %s used in %s", 
 	field, table, attyField);
 }
 
 void checkTableExists(struct sqlConnection *conn, char *table, char *attyField)
 /* Make sure table exists in database or print error message that includes
  * attyField */
 {
 if (!sqlTableExists(conn, table))
     errAbort("Table %s from %s doesn't exist", table, attyField);
 }
 
-void sqlUpdateViaTabFile(struct sqlConnection *conn, char *tabFile, char *tableName)
+void sqlUpdateViaTabFile(struct sqlConnection *conn, char *tabFile, char *tableName, boolean uncsv)
 /* Interpret one tab-separated file */
 {
 // Load the tabFile 
 struct fieldedTable *inTable = fieldedTableFromTabFile(tabFile, tabFile, NULL, 0);
 verbose(2, "%d fields and %d rows in %s\n",  inTable->fieldCount, inTable->rowCount, tabFile);
 char **inFields = inTable->fields;
 
 // Loop through the fields creating field set for output table and parsing
 // foreign and multi-multi fields into structures 
 char *conditionalField = NULL;  // We might have one of these
 int conditionalIx = -1;
 boolean updateCondition = FALSE;    
 struct foreignRef *foreignRefList = NULL;
 struct multiRef *multiRefList = NULL;
 int fieldIx;
@@ -239,85 +242,94 @@
 struct fieldedRow *fr;
 struct dyString *sql = dyStringNew(0);
 struct dyString *csvScratch = dyStringNew(0);
 for (fr = inTable->rowList; fr != NULL; fr = fr->next)
     {
     char **row = fr->row;
 
     /* The case of the update (!) condition is special.  */
     if (updateCondition)
         {
 	/* Make sure that the record we are updating exists for better
 	 * error reporting.  There's a race condition that'll make a SQL error happen
 	 * instead once in a million years. */
 	dyStringClear(sql);
 	char *rawVal = row[conditionalIx];
-	char *uncsvVal = csvParseNext(&rawVal, csvScratch);
+	char *uncsvVal = rawVal;
+	if (uncsv)
+	    uncsvVal = csvParseNext(&rawVal, csvScratch);
 	char *conditionalEscaped = sqlEscapeString(uncsvVal);
 	sqlDyStringPrintf(sql, "select count(*) from %s where %s='%s'",
 	    tableName, conditionalField+1, conditionalEscaped);
 	verbose(2, "%s\n", sql->string);
 	if (sqlQuickNum(conn, sql->string) == 0)
 	    errAbort("Trying to update %s in %s.%s, but it doesn't exist",
 		uncsvVal, tableName, conditionalField+1);
 
 	dyStringClear(sql);
 	sqlDyStringPrintf(sql, "update %s set", tableName);
 	boolean firstTime = TRUE;
 	for (fieldIx=0; fieldIx < inTable->fieldCount; ++fieldIx)
 	    {
 	    if (fieldIx != conditionalIx)
 		{
 		char *rawVal = row[fieldIx];
-		char *uncsvVal = csvParseNext(&rawVal, csvScratch);
+		char *uncsvVal = rawVal;
+		if (uncsv)
+		    uncsvVal = csvParseNext(&rawVal, csvScratch);
 		char *escaped = sqlEscapeString(uncsvVal);
 		if (firstTime)
 		    firstTime = FALSE;
 		else
 		    sqlDyStringPrintf(sql, ",");
 		sqlDyStringPrintf(sql, " %s='%s'", inFields[fieldIx], escaped);
 		freez(&escaped);
 		}
 	    }
 	sqlDyStringPrintf(sql, " where %s='%s'", conditionalField+1, conditionalEscaped);
 	verbose(2, "%s\n", sql->string);
 	sqlUpdate(conn, sql->string);
 
 	continue;	// We are done,  the rest of the loop is for inserts not updates
 	}
 
     /* Deal with conditional field.  If we have one and the value we are trying to insert
      * already exists then just continue to next row. */
     if (conditionalField != NULL && sameString(conditionalField, inFields[conditionalIx]))
 	{
 	dyStringClear(sql);
 	char *rawVal = row[conditionalIx];
-	char *uncsvVal = csvParseNext(&rawVal, csvScratch);
+	char *uncsvVal = rawVal;
+	if (uncsv)
+	    uncsvVal = csvParseNext(&rawVal, csvScratch);
 	// Before we do more we see if the record already exists
 	sqlDyStringPrintf(sql, "select count(*) from %s where %s='%s'",
 	    tableName, conditionalField+1, uncsvVal);
 	verbose(2, "%s\n", sql->string);
 	if (sqlQuickNum(conn, sql->string) > 0)
 	    continue;
 	}
 
     /* Cope with foreign keys */
     struct foreignRef *fRef;
     for (fRef = foreignRefList; fRef != NULL; fRef = fRef->next)
         {
-	char *origVal = row[fRef->nativeFieldIx];
-	char *val = emptyForNull(csvParseNext(&origVal, csvScratch));
+	char *rawVal = row[fRef->nativeFieldIx];
+	char *uncsvVal = rawVal;
+	if (uncsv)
+	    uncsvVal = csvParseNext(&rawVal, csvScratch);
+	char *val = emptyForNull(uncsvVal);
 	char *escaped = sqlEscapeString(val);
 	dyStringPrintf(sql, "\"%s\"",  escaped);
 	dyStringClear(sql);
 	sqlDyStringPrintf(sql, "select %s from %s where %s=\"%s\"",
 	    fRef->foreignKeyName, fRef->foreignTable, 
 	    fRef->foreignFindName, escaped);
 	verbose(2, "query for foreignKey: %s\n", sql->string);
 	fRef->foreignKey = sqlQuickString(conn, sql->string);
 	if (isEmpty(fRef->foreignKey))
 	    errAbort("No %s in table %s referenced line %d of %s",  val, fRef->foreignTable,
 	        fr->id,  tabFile);
 	row[fRef->nativeFieldIx] = fRef->foreignKey;
 	freez(&escaped);
 	}
 
@@ -371,32 +383,35 @@
 	if (firstChar == '@')
 	    {
 	    if (field[1] == '@')  // multi field
 		{
 		continue;	// multi field output doesn't go into this table, just relationship
 		}
 	    else
 	        field += 1;	// We val with the foreign key here, just skip over '@'
 	    }
 	if (firstChar == '?')
 	    field += 1;
 	if (firstTime)
 	    firstTime = !firstTime;
 	else
 	    dyStringAppendC(sql, ',');
-	char *origVal = row[fieldIx];
-	char *val = emptyForNull(csvParseNext(&origVal, csvScratch));
+	char *rawVal = row[fieldIx];
+	char *uncsvVal = rawVal;
+	if (uncsv)
+	    uncsvVal = csvParseNext(&rawVal, csvScratch);
+	char *val = emptyForNull(uncsvVal);
 	char *escaped = sqlEscapeString(val);
 	dyStringPrintf(sql, "\"%s\"",  escaped);
 	freez(&escaped);
 	}
     dyStringAppendC(sql, ')');
     verbose(2, "update sql: %s\n", sql->string);
     sqlUpdate(conn, sql->string);
     int mainTableId = sqlLastAutoId(conn);
 
     /* Handle multi-multi stuff */
     struct multiRef *mRef;
     for (mRef = multiRefList; mRef != NULL; mRef = mRef->next)
         {
 	addMultiRelation(conn, mRef, fr, mainTableId, tabFile, csvScratch);
 	}
@@ -406,37 +421,38 @@
 	freez(&fRef->foreignKey);
     }
 dyStringFree(&sql);
 dyStringFree(&csvScratch);
 fieldedTableFree(&inTable);
 }
 
 void sqlUpdateRelated(char *database, char **inFiles, int inCount)
 /* sqlUpdateRelated - Update a bunch of tables in a kind of careful way based out of tab 
  * separated files.  Handles foreign key and many-to-many relationships with a multitude 
  * of @ signs.. */
 {
 struct sqlConnection *conn = sqlConnect(database);
 int fileIx;
 boolean missOk = optionExists("missOk");
+boolean  uncsv = optionExists("uncsv");
 for (fileIx = 0; fileIx < inCount; ++fileIx)
     {
     char *inFile = inFiles[fileIx];
     if (missOk && !fileExists(inFile))
         continue;
     char *tableName = cloneString(inFile);
     chopSuffix(tableName);
     verbose(1, "Processing %s into %s table \n", inFile, tableName);
-    sqlUpdateViaTabFile(conn, inFile, tableName);
+    sqlUpdateViaTabFile(conn, inFile, tableName, uncsv);
     }
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 sqlUpdateRelated(argv[1], argv+2, argc-2);
 return 0;
 }