src/hg/near/hgClusterGenes/hgClusterGenes.c 44ccfacbe3a3d4b300f80d48651c77837a4b571e

44ccfacbe3a3d4b300f80d48651c77837a4b571e
galt
  Tue Apr 26 11:12:02 2022 -0700
SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix.

diff --git src/hg/near/hgClusterGenes/hgClusterGenes.c src/hg/near/hgClusterGenes/hgClusterGenes.c
index 6204846..12ddc81 100644
--- src/hg/near/hgClusterGenes/hgClusterGenes.c
+++ src/hg/near/hgClusterGenes/hgClusterGenes.c
@@ -1,422 +1,422 @@
 /* hgClusterGenes - Cluster overlapping gene predictions. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dlist.h"
 #include "jksql.h"
 #include "genePred.h"
 #include "hdb.h"
 #include "hgRelate.h"
 #include "binRange.h"
 #include "rbTree.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hgClusterGenes - Cluster overlapping gene predictions\n"
   "usage:\n"
   "   hgClusterGenes database geneTable clusterTable cannonicalTable\n"
   "where clusterTable pairs clusterIds and geneIds, and cannonicalTable\n"
   "pairs the longest protein isoform with the clusterId\n"
   "options:\n"
   "   -chrom=chrN - Just work on one chromosome\n"
   "   -verbose=N - Print copious debugging info. 0 for none, 3 for loads\n"
   "   -noProt - Skip protein field\n"
   "   -sangerLinks - Use sangerLinks table for protein\n"
   "   -protAccQuery='query' - Use query to retrieve gene->protein map\n"
   );
 }
 
 boolean noProt = FALSE;
 boolean sangerLinks = FALSE;
 char *protAccQuery = NULL;
 
 static struct optionSpec options[] = {
    {"chrom", OPTION_STRING},
    {"noProt", OPTION_BOOLEAN},
    {"sangerLinks", OPTION_BOOLEAN},
    {"protAccQuery", OPTION_STRING},
    {NULL, 0},
 };
 
 struct cluster
 /* A cluster of overlapping genes. */
     {
     struct cluster *next;	/* Next in list. */
     struct hash *geneHash;	/* Associated genes. */
     int start,end;		/* Range covered by cluster. */
     };
 
 void clusterDump(int verbosity, struct cluster *cluster)
 /* Dump contents of cluster to log. */
 {
 struct hashEl *helList = hashElListHash(cluster->geneHash);
 struct hashEl *hel;
 verbose(verbosity, "%d-%d", cluster->start, cluster->end);
 for (hel = helList; hel != NULL; hel = hel->next)
     {
     struct genePred *gp = hel->val;
     verbose(verbosity, " %s", gp->name);
     }
 verbose(verbosity, "\n");
 }
 
 struct cluster *clusterNew()
 /* Create new cluster. */
 {
 struct cluster *cluster;
 AllocVar(cluster);
 cluster->geneHash = hashNew(5);
 return cluster;
 }
 
 void clusterFree(struct cluster **pCluster)
 /* Free up a cluster. */
 {
 struct cluster *cluster = *pCluster;
 if (cluster == NULL)
     return;
 hashFree(&cluster->geneHash);
 freez(pCluster);
 }
 
 void clusterFreeList(struct cluster **pList)
 /* Free a list of dynamically allocated cluster's */
 {
 struct cluster *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     clusterFree(&el);
     }
 *pList = NULL;
 }
 
 int clusterCmp(const void *va, const void *vb)
 /* Compare to sort based on start. */
 {
 const struct cluster *a = *((struct cluster **)va);
 const struct cluster *b = *((struct cluster **)vb);
 return a->start - b->start;
 }
 
 
 
 void clusterAddExon(struct cluster *cluster,
 	int start, int end, struct genePred *gp)
 /* Add exon to cluster. */
 {
 if (!hashLookup(cluster->geneHash, gp->name))
     hashAdd(cluster->geneHash, gp->name, gp);
 if (cluster->start == cluster->end)
     {
     cluster->start = start;
     cluster->end = end;
     }
 else
     {
     if (start < cluster->start) cluster->start = start;
     if (cluster->end < end) cluster->end = end;
     }
 }
 
 void addExon(struct binKeeper *bk, struct dlNode *clusterNode,
 	int start, int end, struct genePred *gp)
 /* Add exon to cluster and binKeeper. */
 {
 clusterAddExon(clusterNode->val, start, end, gp);
 binKeeperAdd(bk, start, end, clusterNode);
 }
 
 void mergeClusters(struct binKeeper *bk, struct binElement *bkRest,
 	struct dlNode *aNode, struct dlNode *bNode)
 /* Move bNode into aNode. */
 {
 struct cluster *aCluster = aNode->val;
 struct cluster *bCluster = bNode->val;
 struct binElement *bkEl;
 struct hashEl *hEl, *hList;
 
 verbose(3, " a: ");
 clusterDump(3, aCluster);
 verbose(3, " b: ");
 clusterDump(3, bCluster);
 
 /* First change references to bNode. */
 binKeeperReplaceVal(bk, bCluster->start, bCluster->end, bNode, aNode);
 for (bkEl = bkRest; bkEl != NULL; bkEl = bkEl->next)
     if (bkEl->val == bNode) 
         bkEl->val = aNode;
 
 /* Add b's genes to a. */
 hList = hashElListHash(bCluster->geneHash);
 for (hEl = hList; hEl != NULL; hEl = hEl->next)
     hashAdd(aCluster->geneHash, hEl->name, hEl->val);
 
 /* Adjust start/end. */
 if (bCluster->start < aCluster->start) 
     aCluster->start = bCluster->start;
 if (aCluster->end < bCluster->end)
     aCluster->end = bCluster->end;
 
 /* Remove all traces of bNode. */
 dlRemove(bNode);
 clusterFree(&bCluster);
 verbose(3, " ab: ");
 clusterDump(3, aCluster);
 }
 
 int totalGeneCount = 0;
 int totalClusterCount = 0;
 
 struct cluster *makeCluster(struct genePred *geneList, int chromSize)
 /* Create clusters out of overlapping genes. */
 {
 struct binKeeper *bk = binKeeperNew(0, chromSize);
 struct dlList *clusters = newDlList();
 struct dlNode *node;
 struct cluster *clusterList = NULL, *cluster;
 struct genePred *gp;
 
 /* Build up cluster list with aid of binKeeper.
  * For each exon look to see if it overlaps an
  * existing cluster.  If so put it into existing
  * cluster, otherwise make a new cluster.  The hard
  * case is where part of the gene is already in one
  * cluster and the exon overlaps with a new
  * cluster.  In this case merge the new cluster into
  * the old one. */
 for (gp = geneList; gp != NULL; gp = gp->next)
     {
     int exonIx;
     struct dlNode *oldNode = NULL;
     verbose(2, "%s %d-%d\n", gp->name, gp->txStart, gp->txEnd);
     for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
         {
 	int exonStart = gp->exonStarts[exonIx];
 	int exonEnd = gp->exonEnds[exonIx];
 	struct binElement *bEl, *bList = binKeeperFind(bk, exonStart, exonEnd);
 	verbose(4, "  %d %d\n", exonIx, exonStart);
 	if (bList == NULL)
 	    {
 	    if (oldNode == NULL)
 		{
 		struct cluster *cluster = clusterNew();
 		oldNode = dlAddValTail(clusters, cluster);
 		}
 	    addExon(bk, oldNode, exonStart, exonEnd, gp);
 	    }
 	else
 	    {
 	    for (bEl = bList; bEl != NULL; bEl = bEl->next)
 		{
 		struct dlNode *newNode = bEl->val;
 		if (newNode != oldNode)
 		    {
 		    if (oldNode == NULL)
 			{
 			/* Add to existing cluster. */
 			oldNode = newNode;
 			}
 		    else
 			{
 			/* Merge new cluster into old one. */
 			verbose(3, "Merging %p %p\n", oldNode, newNode);
 			mergeClusters(bk, bEl->next, oldNode, newNode);
 			}
 		    }
 		}
 	    addExon(bk, oldNode, exonStart, exonEnd, gp);
 	    slFreeList(&bList);
 	    }
 	}
     }
 
 /* We build up the cluster list as a doubly-linked list
  * to make it faster to remove clusters that get merged
  * into another cluster.  At the end though we make
  * a singly-linked list out of it. */
 for (node = clusters->tail; !dlStart(node); node=node->prev)
     {
     cluster = node->val;
     slAddHead(&clusterList, cluster);
     }
 slSort(&clusterList, clusterCmp);
 
 /* Clean up and go home. */
 binKeeperFree(&bk);
 dlListFree(&clusters);
 return clusterList;
 }
 
 void createClusterTable(struct sqlConnection *conn, 
 	char *tableName, int longestName)
 /* Create cluster table. */
 {
-struct dyString *dy = newDyString(1024);
+struct dyString *dy = dyStringNew(1024);
 sqlDyStringPrintf(dy,
     "CREATE TABLE %s (\n"
     " clusterId int unsigned not null,\n"
     " transcript varchar(255) not null,\n"
     " INDEX(transcript(%d)),\n"
     " INDEX(clusterId))\n"
     , tableName, longestName);
 sqlRemakeTable(conn, tableName, dy->string);
 dyStringFree(&dy);
 }
 
 void createCannonicalTable(struct sqlConnection *conn, 
 	char *tableName, int longestName)
 /* Create cannonical representative of cluster table. */
 {
-struct dyString *dy = newDyString(1024);
+struct dyString *dy = dyStringNew(1024);
 sqlDyStringPrintf(dy,
     "CREATE TABLE %s (\n"
     " chrom varchar(255) not null,\n"
     " chromStart int not null,\n"
     " chromEnd int not null,\n"
     " clusterId int unsigned not null,\n"
     " transcript varchar(255) not null,\n"
     " protein varchar(255) not null,\n"
     " UNIQUE(clusterId),\n"
     " INDEX(transcript(%d)),\n"
     " INDEX(protein(8)))\n"
     , tableName, longestName);
 sqlRemakeTable(conn, tableName, dy->string);
 dyStringFree(&dy);
 }
 
 int clusterId = 0;	/* Assign a unique id to each cluster. */
 int longestName = 1;	/* Longest gene name. */
 
 void clusterGenesOnStrand(char *database, struct sqlConnection *conn,
 	char *geneTable, char *chrom, char strand, 
 	FILE *clusterFile, FILE *canFile)
 /* Scan through genes on this strand, cluster, and write clusters to file. */
 {
 struct sqlResult *sr;
 char **row;
 int rowOffset;
 struct genePred *gp, *gpList = NULL;
 char extraWhere[64], query[256];
 struct cluster *clusterList = NULL, *cluster;
 int nameLen;
 struct hash *protHash = NULL;
 
 verbose(1, "%s %c\n", chrom, strand);
 safef(extraWhere, sizeof(extraWhere), "strand = '%c'", strand);
 sr = hChromQuery(conn, geneTable, chrom, extraWhere, &rowOffset);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     gp = genePredLoad(row + rowOffset);
     nameLen = strlen(gp->name);
     slAddHead(&gpList, gp);
     if (nameLen > longestName)
         longestName = nameLen;
     ++totalGeneCount;
     }
 sqlFreeResult(&sr);
 
 /* Build hash to map between transcript names and protein IDs. */
 if (!noProt)
     {
     protHash = newHash(16);
     if (sangerLinks)
 	sqlSafef(query, sizeof(query), "select orfName,protName from sangerLinks");
     else if (isNotEmpty(protAccQuery))
 	// accepting special user query without checking
-	sqlSafef(query, sizeof(query), "%-s", protAccQuery);  
+	sqlSafef(query, sizeof(query), protAccQuery, NULL);  
     else
 	sqlSafef(query, sizeof(query), "select name, proteinId from %s", geneTable);
     sr = sqlGetResult(conn, query);
     while ((row = sqlNextRow(sr)) != NULL)
 	hashAdd(protHash, row[0], cloneString(row[1]));
     sqlFreeResult(&sr);
     }
 
 slReverse(&gpList);
 clusterList = makeCluster(gpList, hChromSize(database, chrom));
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     {
     struct hashEl *helList = hashElListHash(cluster->geneHash);
     struct hashEl *hel;
     struct genePred *cannonical = NULL;
     char *protName;
     int cannonicalSize = -1, size = 0;
     ++clusterId;
     for (hel = helList; hel != NULL; hel = hel->next)
         {
 	struct genePred *gp = hel->val;
 	fprintf(clusterFile, "%d\t%s\n", clusterId, gp->name);
 	size = genePredCodingBases(gp);
 	if (size > cannonicalSize)
 	    {
 	    cannonical = gp;
 	    cannonicalSize = size;
 	    }
 	}
     protName = cannonical->name;
     if (protHash != NULL)
 	{
 	char *newVal = hashFindVal(protHash, protName);
 	if (newVal != NULL)
 	    protName = newVal;
 	}
     fprintf(canFile, "%s\t%d\t%d\t%d\t%s\t%s\n", 
     	chrom, cannonical->txStart, cannonical->txEnd, clusterId, cannonical->name,
 	protName);
     ++totalClusterCount;
     }
 genePredFreeList(&gpList);
 freeHashAndVals(&protHash);
 }
 
 void hgClusterGenes(char *database, char *geneTable, char *clusterTable,
 	char *cannonicalTable)
 /* hgClusterGenes - Cluster overlapping gene predictions. */
 {
 struct slName *chromList, *chrom;
 FILE *clusterFile = hgCreateTabFile(".", clusterTable);
 FILE *canFile = hgCreateTabFile(".", cannonicalTable);
 struct sqlConnection *conn;
 
 if (optionExists("chrom"))
     chromList = slNameNew(optionVal("chrom", NULL));
 else
     chromList = hAllChromNames(database);
 conn = hAllocConn(database);
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     clusterGenesOnStrand(database, conn, geneTable, chrom->name, '+', 
     	clusterFile, canFile);
     clusterGenesOnStrand(database, conn, geneTable, chrom->name, '-', 
     	clusterFile, canFile);
     }
 createClusterTable(conn, clusterTable, longestName);
 createCannonicalTable(conn, cannonicalTable, longestName);
 verbose(1, "Loading database\n");
 hgLoadTabFile(conn, ".", clusterTable, &clusterFile);
 hgLoadTabFile(conn, ".", cannonicalTable, &canFile);
 verbose(1, "Got %d clusters, from %d genes in %d chromosomes\n",
 	totalClusterCount, totalGeneCount, slCount(chromList));
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 noProt = optionExists("noProt");
 sangerLinks = optionExists("sangerLinks");
 protAccQuery = optionVal("protAccQuery", protAccQuery);
 if (argc != 5)
     usage();
 hgClusterGenes(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }