src/hg/cgilib/annoStreamDb.c 44ccfacbe3a3d4b300f80d48651c77837a4b571e

44ccfacbe3a3d4b300f80d48651c77837a4b571e
galt
  Tue Apr 26 11:12:02 2022 -0700
SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix.

diff --git src/hg/cgilib/annoStreamDb.c src/hg/cgilib/annoStreamDb.c
index 173a8bd..8a419c1 100644
--- src/hg/cgilib/annoStreamDb.c
+++ src/hg/cgilib/annoStreamDb.c
@@ -235,72 +235,72 @@
 /* Don't let mysql use a (chrom, chromEnd) index because that messes up sorting by chromStart. */
 {
 if (sameString(dbTable, self->trackTable) && self->endFieldIndexName != NULL)
     sqlDyStringPrintf(query, " IGNORE INDEX (%s) ", self->endFieldIndexName);
 }
 
 static void appendOneTable(struct annoStreamDb *self, struct joinerDtf *dt, struct dyString *query)
 /* Add the (db.)table string from dt to query; if dt is NULL or table is split then
  * use self->table. */
 {
 char dbTable[PATH_LEN];
 if (dt == NULL || useSplitTable(self, dt))
     safecpy(dbTable, sizeof(dbTable), self->table);
 else
     joinerDtfToSqlTableString(dt, self->db, dbTable, sizeof(dbTable));
-dyStringAppend(query, dbTable);
+sqlDyStringPrintf(query, "%s", dbTable);
 ignoreEndIndexIfNecessary(self, dbTable, query);
 }
 
 INLINE void splitOrDtfToSqlField(struct annoStreamDb *self, struct joinerDtf *dtf,
                                  char *fieldName, size_t fieldNameSize)
 /* Write [db].table.field into fieldName, where table may be split. */
 {
 if (useSplitTable(self, dtf))
     safef(fieldName, fieldNameSize, "%s.%s", self->table, dtf->field);
 else
     joinerDtfToSqlFieldString(dtf, self->db, fieldName, fieldNameSize);
 }
 
 static void appendJoin(struct annoStreamDb *self, struct joinerPair *routeList,
                        struct dyString *query)
 /* Append join statement(s) for a possibly tree-structured routeList. */
 {
 struct joinerPair *jp;
 for (jp = routeList;  jp != NULL;  jp = jp->next)
     {
     struct joinerField *jfB = joinerSetFindField(jp->identifier, jp->b);
     if (! jfB->full)
-        dyStringAppend(query, " left");
-    dyStringAppend(query, " join ");
+	sqlDyStringPrintf(query, " left");
+    sqlDyStringPrintf(query, " join ");
     if (jp->child)
         {
-        dyStringAppendC(query, '(');
+        sqlDyStringPrintf(query, "(");
         appendOneTable(self, jp->child->a, query);
         appendJoin(self, jp->child, query);
-        dyStringAppendC(query, ')');
+        sqlDyStringPrintf(query, ")");
         }
     else
         appendOneTable(self, jp->b, query);
     char fieldA[PATH_LEN], fieldB[PATH_LEN];
     splitOrDtfToSqlField(self, jp->a, fieldA, sizeof(fieldA));
     splitOrDtfToSqlField(self, jp->b, fieldB, sizeof(fieldB));
     struct joinerField *jfA = joinerSetFindField(jp->identifier, jp->a);
     if (sameOk(jfA->separator, ","))
-        dyStringPrintf(query, " on find_in_set(%s, %s)", fieldB, fieldA);
+        sqlDyStringPrintf(query, " on find_in_set(%s, %s)", fieldB, fieldA);
     else
-        dyStringPrintf(query, " on %s = %s", fieldA, fieldB);
+        sqlDyStringPrintf(query, " on %s = %s", fieldA, fieldB);
     }
 }
 
 static boolean appendTableList(struct annoStreamDb *self, struct dyString *query)
 /* Append SQL table list to query, including tables used for output, filtering and joining. */
 {
 boolean hasLeftJoin = FALSE;
 if (self->joinMixer == NULL || self->joinMixer->sqlRouteList == NULL)
     appendOneTable(self, NULL, query);
 else
     {
     // Use both a and b of the first pair and only b of each subsequent pair
     appendOneTable(self, self->joinMixer->sqlRouteList->a, query);
     appendJoin(self, self->joinMixer->sqlRouteList, query);
     hasLeftJoin = TRUE;
@@ -327,31 +327,33 @@
 if (isEmpty(joinerFile))
     joinerFile = JOINER_FILE;
 return joinerFile;
 }
 
 static void asdInitBaselineQuery(struct annoStreamDb *self)
 /* Build a dy SQL query with no position constraints (select ... from ...)
  * possibly including joins and filters if specified (where ...). */
 {
 if (self->relatedDtfList)
     {
     struct joinerDtf *outputFieldList = slCat(joinerDtfCloneList(self->mainTableDtfList),
                                               joinerDtfCloneList(self->relatedDtfList));
     if (self->joiner == NULL)
         self->joiner = joinerRead(joinerFilePath());
-    int expectedRows = sqlRowCount(self->conn, self->table);
+    char queryTblSafe[1024];
+    sqlSafef(queryTblSafe, sizeof queryTblSafe, "%s", self->table);
+    int expectedRows = sqlRowCount(self->conn, queryTblSafe);
     self->joinMixer = joinMixerNew(self->joiner, self->db, self->table, outputFieldList,
                                    expectedRows, self->naForMissing);
     joinerPairListToTree(self->joinMixer->sqlRouteList);
     self->sqlRowSize = slCount(self->joinMixer->sqlFieldList);
     self->bigRowSize = self->joinMixer->bigRowSize;
     joinerDtfFreeList(&outputFieldList);
     }
 else
     {
     self->sqlRowSize = slCount(self->mainTableDtfList);
     self->bigRowSize = self->sqlRowSize;
     }
 }
 
 static void asdUpdateBaselineQuery(struct annoStreamDb *self)
@@ -362,41 +364,33 @@
 appendFieldList(self, query);
 sqlDyStringPrintf(query, " from ");
 self->hasLeftJoin = appendTableList(self, query);
 boolean hasWhere = FALSE;
 self->baselineQuery = dyStringCannibalize(&query);
 self->baselineQueryHasWhere = hasWhere;
 // Don't free joiner; we need its storage of joinerFields.
 }
 
 static void addBinToQuery(struct annoStreamDb *self, uint start, uint end, struct dyString *query)
 /* If applicable, add bin range constraints to query with explicit table name, in case we're
  * joining with another table that has a bin column. */
 {
 if (self->hasBin)
     {
-    // Get the bin constraints with no table specification:
-    struct dyString *binConstraints = dyStringNew(0);
-    hAddBinToQuery(start, end, binConstraints);
-    // Swap in explicit table name for bin field:
     char tableDotBin[PATH_LEN];
     safef(tableDotBin, sizeof(tableDotBin), "%s.bin", self->table);
-    struct dyString *explicitBinConstraints = dyStringSub(binConstraints->string,
-                                                          "bin", tableDotBin);
-    dyStringAppend(query, explicitBinConstraints->string);
-    dyStringFree(&explicitBinConstraints);
-    dyStringFree(&binConstraints);
+    hAddBinToQueryGeneral(tableDotBin, start, end, query);
     }
 }
 
 static void addRangeToQuery(struct annoStreamDb *self, struct dyString *query,
                             char *chrom, uint start, uint end, boolean hasWhere)
 /* Add position constraints to query. */
 {
 if (hasWhere)
     sqlDyStringPrintf(query, " and ");
 else
     sqlDyStringPrintf(query, " where ");
 sqlDyStringPrintf(query, "%s.%s='%s'", self->table, self->chromField, chrom);
 uint chromSize = annoAssemblySeqSize(self->streamer.assembly, chrom);
 boolean addStartConstraint = (start > 0);
 boolean addEndConstraint = (end < chromSize);
@@ -424,54 +418,54 @@
         sqlDyStringPrintf(query, "(%s.%s < %u or (%s.%s = %s.%s and %s.%s = %u)) ",
                           self->table, self->startField, end,
                           self->table, self->startField, self->table, self->endField,
                           self->table, self->endField, end);
         }
     }
 }
 
 static void asdDoQuerySimple(struct annoStreamDb *self, char *minChrom, uint minEnd)
 /* Return a sqlResult for a query on table items in position range.
  * If doing a whole genome query. just select all rows from table. */
 // NOTE: it would be possible to implement filters at this level, as in hgTables.
 {
 struct annoStreamer *streamer = &(self->streamer);
 boolean hasWhere = self->baselineQueryHasWhere;
-struct dyString *query = dyStringCreate("%s", self->baselineQuery);
+struct dyString *query = sqlDyStringCreate("%-s", self->baselineQuery);
 if (!streamer->positionIsGenome)
     {
     if (minChrom && differentString(minChrom, streamer->chrom))
 	errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'",
 		 streamer->name, minChrom, streamer->chrom);
     if (self->hasBin)
 	{
 	// Results will be in bin order, but we can restore chromStart order by
 	// accumulating initial coarse-bin items and merge-sorting them with
 	// subsequent finest-bin items which will be in chromStart order.
 	resetMergeState(self);
         startMerging(self);
 	}
     addRangeToQuery(self, query, streamer->chrom, streamer->regionStart, streamer->regionEnd,
                     hasWhere);
     if (self->notSorted || self->hasLeftJoin)
 	sqlDyStringPrintf(query, " order by %s.%s", self->table, self->startField);
     }
 else if (self->notSorted || self->hasLeftJoin)
     sqlDyStringPrintf(query, " order by %s.%s,%s.%s",
                       self->table, self->chromField, self->table, self->startField);
 if (self->maxOutRows > 0)
-    dyStringPrintf(query, " limit %d", self->maxOutRows);
+    sqlDyStringPrintf(query, " limit %d", self->maxOutRows);
 struct sqlResult *sr = sqlGetResult(self->conn, query->string);
 dyStringFree(&query);
 self->sr = sr;
 self->needQuery = FALSE;
 }
 
 static void updateNextChunkState(struct annoStreamDb *self, int queryMaxItems)
 /* If the just-fetched interval list was limited to ASD_CHUNK_SIZE, set doNextChunk
  * and trim the last row(s) so that when we query the next chunk, we don't get
  * repeat rows due to querying a start coord that was already returned. */
 {
 struct rowBuf *rowBuf = &self->rowBuf;
 if (queryMaxItems == ASD_CHUNK_SIZE && rowBuf->size == ASD_CHUNK_SIZE)
     {
     self->doNextChunk = TRUE;
@@ -618,31 +612,31 @@
 if (self->notSorted || self->hasLeftJoin)
     sqlDyStringPrintf(query, " order by %s.%s", self->table, self->startField);
 sqlDyStringPrintf(query, " limit %d", maxItems);
 bufferRowsFromSqlQuery(self, query->string, maxItems);
 }
 
 static void asdDoQueryChunking(struct annoStreamDb *self, char *minChrom, uint minEnd)
 /* Get rows from mysql with a limit on the number of rows returned at one time (ASD_CHUNK_SIZE),
  * to avoid long delays for very large tables.  This will be called multiple times if
  * the number of rows in region is more than ASD_CHUNK_SIZE.  If doing a genome-wide query,
  * break it up into chrom-by-chrom queries because the code that merges large bin items
  * in with small bin items assumes that all rows are on the same chrom. */
 {
 struct annoStreamer *sSelf = &(self->streamer);
 boolean hasWhere = self->baselineQueryHasWhere;
-struct dyString *query = dyStringCreate("%s", self->baselineQuery);
+struct dyString *query = sqlDyStringCreate("%-s", self->baselineQuery);
 if (sSelf->chrom != NULL && self->rowBuf.size > 0 && !self->doNextChunk)
     {
     // We're doing a region query, we already got some rows, and don't need another chunk:
     resetRowBuf(&self->rowBuf);
     self->eof = TRUE;
     }
 if (self->useMaxOutRows)
     {
     self->maxOutRows -= self->rowBuf.size;
     if (self->maxOutRows <= 0)
 	self->eof = TRUE;
     }
 if (self->eof)
     return;
 int queryMaxItems = ASD_CHUNK_SIZE;
@@ -1217,31 +1211,31 @@
             asObj = asdAutoSqlFromTableFields(self, asObj);
             }
         }
     struct jsonElement *naForMissingEl = hashFindVal(config, "naForMissing");
     if (naForMissingEl != NULL)
         self->naForMissing = jsonBooleanVal(naForMissingEl, "naForMissing");
     }
 return asObj;
 }
 
 static char *sqlExplain(struct sqlConnection *conn, char *query)
 /* For now, just turn the values back into a multi-line "#"-comment string. */
 {
 char *trimmedQuery = query;
 if (startsWith(NOSQLINJ, trimmedQuery))
-    trimmedQuery = trimmedQuery + strlen(NOSQLINJ);
+    trimmedQuery = trimmedQuery + NOSQLINJ_SIZE;
 struct dyString *dy = dyStringCreate("# Output of 'explain %s':\n", trimmedQuery);
 char explainQuery[PATH_LEN*8];
 safef(explainQuery, sizeof(explainQuery), NOSQLINJ "explain %s", trimmedQuery);
 struct sqlResult *sr = sqlGetResult(conn, explainQuery);
 struct slName *fieldList = sqlResultFieldList(sr);
 int nColumns = slCount(fieldList);
 // Header:
 dyStringPrintf(dy, "# %s\n", slNameListToString(fieldList, '\t'));
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     dyStringAppend(dy, "# ");
     int i;
     for (i = 0;  i < nColumns;  i++)
         {
@@ -1250,34 +1244,39 @@
         if (row[i] == NULL)
             dyStringAppend(dy, "NULL");
         else
             dyStringAppend(dy, row[i]);
         }
     dyStringAppendC(dy, '\n');
     }
 return dyStringCannibalize(&dy);
 }
 
 static char *asdGetHeader(struct annoStreamer *sSelf)
 /* Return header with debug info. */
 {
 struct annoStreamDb *self = (struct annoStreamDb *)sSelf;
 // Add a fake constraint on chromField because a real one is added to baselineQuery.
-char queryWithChrom[PATH_LEN*4];
-safef(queryWithChrom, sizeof(queryWithChrom), "%s %s %s.%s = 'someValue'", self->baselineQuery,
-      (strstr(self->baselineQuery, "where") ? "and" : "where"), self->table, self->chromField);
-char *explanation = sqlExplain(self->conn, queryWithChrom);
+struct dyString *queryWithChrom = dyStringNew(1024);
+sqlDyStringPrintf(queryWithChrom, "%-s ", self->baselineQuery);
+if (strstr(self->baselineQuery, "where"))
+    sqlDyStringPrintf(queryWithChrom, "and ");
+else
+    sqlDyStringPrintf(queryWithChrom, "where ");
+sqlDyStringPrintf(queryWithChrom, "%s.%s = 'someValue'", self->table, self->chromField);
+char *explanation = sqlExplain(self->conn, dyStringContents(queryWithChrom));
+dyStringFree(&queryWithChrom);
 return explanation;
 }
 
 struct annoStreamer *annoStreamDbNew(char *db, char *table, struct annoAssembly *aa,
 				     int maxOutRows, struct jsonElement *configEl)
 /* Create an annoStreamer (subclass) object from a database table.
  * If config is NULL, then the streamer produces output from all fields
  * (except bin, unless table's autoSql includes bin).
  * Otherwise, config is a json object with a member 'relatedTables' that specifies
  * related tables and fields to join with table, for example:
  * config = { "relatedTables": [ { "table": "hg19.kgXref",
  *                                 "fields": ["geneSymbol", "description"] },
  *                               { "table": "hg19.knownCanonical",
  *                                 "fields": ["clusterId"] }
  *                             ] }