src/hg/hgTables/vcf.c 44ccfacbe3a3d4b300f80d48651c77837a4b571e

44ccfacbe3a3d4b300f80d48651c77837a4b571e
galt
  Tue Apr 26 11:12:02 2022 -0700
SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix.

diff --git src/hg/hgTables/vcf.c src/hg/hgTables/vcf.c
index 4320053..ed59fff 100644
--- src/hg/hgTables/vcf.c
+++ src/hg/hgTables/vcf.c
@@ -1,550 +1,550 @@
 /* vcf - stuff to handle VCF stuff in table browser. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "hgTables.h"
 #include "asFilter.h"
 #include "hubConnect.h"
 #include "asParse.h"
 #include "hgBam.h"
 #include "linefile.h"
 #include "localmem.h"
 #include "obscure.h"
 #include "vcf.h"
 #include "web.h"
 #include "trackHub.h"
 
 #define VCFDATALINE_NUM_COLS 10
 
 boolean isVcfTable(char *table, boolean *retIsTabix)
 /* Return TRUE if table corresponds to a VCF file. 
  * If retIsTabix is non-NULL, set *retIsTabix to TRUE if this is vcfTabix (not just vcf). */
 {
 boolean isVcfTabix = FALSE, isVcf = FALSE;
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 if (tdb)
     {
     isVcfTabix = startsWithWord("vcfTabix", tdb->type) || startsWithWord("vcfPhasedTrio", tdb->type);
     isVcf = startsWithWord("vcf", tdb->type);
     }
 else
     {
     isVcfTabix = trackIsType(database, table, curTrack, "vcfTabix", ctLookupName);
     if (!isVcfTabix)
 	isVcf = trackIsType(database, table, curTrack, "vcf", ctLookupName);
     }
 if (retIsTabix)
     *retIsTabix = isVcfTabix;
 return (isVcfTabix || isVcf);
 }
 
 struct hTableInfo *vcfToHti(char *table, boolean isTabix)
 /* Get standard fields of VCF into hti structure. */
 {
 struct hTableInfo *hti;
 AllocVar(hti);
 hti->rootName = cloneString(table);
 hti->isPos= TRUE;
 strcpy(hti->chromField, "chrom");
 strcpy(hti->startField, "pos");
 strcpy(hti->nameField, "id");
 hti->type = cloneString(isTabix ? "vcfTabix" : "vcf");
 return hti;
 }
 
 struct slName *vcfGetFields()
 /* Get fields of vcf as simple name list. */
 {
 struct asObject *as = vcfAsObj();
 struct slName *names = asColNames(as);
 return names;
 }
 
 struct sqlFieldType *vcfListFieldsAndTypes()
 /* Get fields of bigBed as list of sqlFieldType. */
 {
 struct asObject *as = vcfAsObj();
 struct sqlFieldType *list = sqlFieldTypesFromAs(as);
 return list;
 }
 
 void dyJoin(struct dyString *dy, char *delim, char **words, int wordCount)
 /* Clear dy and append wordCount words interspersed with delim. */
 {
 dyStringClear(dy);
 int i;
 for (i = 0;  i < wordCount;  i++)
     {
     if (i > 0)
 	dyStringAppend(dy, delim);
     dyStringAppend(dy, words[i]);
     }
 }
 
 void vcfInfoElsToString(struct dyString *dy, struct vcfFile *vcff, struct vcfRecord *rec)
 /* Unpack rec's typed infoElements to semicolon-sep'd string in dy.*/
 {
 dyStringClear(dy);
 if (rec->infoCount == 0)
     dyStringAppendC(dy, '.');
 int i;
 for (i = 0;  i < rec->infoCount;  i++)
     {
     if (i > 0)
 	dyStringAppendC(dy, ';');
     const struct vcfInfoElement *el = &(rec->infoElements[i]);
     const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, el->key);
     enum vcfInfoType type = def ? def->type : vcfInfoString;
     dyStringAppend(dy, el->key);
     if (el->count > 0)
 	dyStringAppendC(dy, '=');
     int j;
     for (j = 0;  j < el->count;  j++)
 	{
 	if (j > 0)
 	    dyStringAppendC(dy, ',');
 	if (el->missingData[j])
 	    {
 	    dyStringAppend(dy, ".");
 	    continue;
 	    }
 	union vcfDatum dat = el->values[j];
 	switch (type)
 	    {
 	    case vcfInfoInteger:
 		dyStringPrintf(dy, "%d", dat.datInt);
 		break;
 	    case vcfInfoFloat:
 		{
 		// use big precision and erase trailing zeros:
 		char fbuf[64];
 		safef(fbuf, sizeof(fbuf), "%.16lf", dat.datFloat);
 		int i;
 		for (i = strlen(fbuf) - 1;  i > 0;  i--)
 		    if (fbuf[i] == '0')
 			fbuf[i] = '\0';
 		    else
 			break;
 		dyStringAppend(dy, fbuf);
 		}
 		break;
 	    case vcfInfoCharacter:
 		dyStringAppendC(dy, dat.datChar);
 		break;
 	    case vcfInfoFlag: // Flags could have values in older VCF
 	    case vcfInfoString:
 		dyStringAppend(dy, dat.datString);
 		break;
 	    default:
 		errAbort("Invalid vcfInfoType %d (how did this get past parser?", type);
 	    }
 	}
     }
 }
 
 #define VCF_NUM_BUF_SIZE 256
 
 void vcfRecordToRow(struct vcfRecord *rec, char *chrom, char *numBuf, struct dyString *dyAlt,
 		    struct dyString *dyFilter, struct dyString *dyInfo, struct dyString *dyGt,
 		    char *row[VCFDATALINE_NUM_COLS])
 /* Convert vcfRecord to array of strings, using numBuf to store ascii
  * versions of numbers and dyStrings to store stringified arrays. */
 {
 struct vcfFile *vcff = rec->file;
 char *numPt = numBuf;
 row[0] = chrom; // VCF files' chrom field often lacks "chr" -- use region->chrom from caller.
 row[1] = numPt;
 numPt += safef(numPt, VCF_NUM_BUF_SIZE - (numPt-numBuf), "%u", rec->chromStart+1);
 numPt++;
 row[2] = rec->name;
 row[3] = rec->alleles[0];
 dyJoin(dyAlt, ",", &(rec->alleles[1]), rec->alleleCount-1);
 row[4] = dyAlt->string;
 row[5] = rec->qual;
 dyJoin(dyFilter, ";", rec->filters, rec->filterCount);
 row[6] = dyFilter->string;
 vcfInfoElsToString(dyInfo, vcff, rec);
 row[7] = dyInfo->string;
 if (vcff->genotypeCount > 0)
     {
     row[8] = rec->format;
     dyJoin(dyGt, "\t", rec->genotypeUnparsedStrings, vcff->genotypeCount);
     row[9] = dyGt->string;
     }
 else
     row[8] = row[9] = ""; // compatible with localmem usage
 }
 
 static char *vcfFileName(struct sqlConnection *conn, char *table, char *chrom, boolean isTabix)
 // Look up the vcf or vcfTabix file name, using CUSTOM_TRASH if necessary; return NULL if not found.
 {
 char *fileName = bigFileNameFromCtOrHub(table, conn);
 struct trackDb *tdb = hashFindVal(fullTableToTdbHash, table);
 if (isCustomTrack(table) && ! isTabix)
     {
     // fileName is stored in the customTrash table
     struct customTrack *ct = ctLookupName(table);
     struct sqlConnection *conn = hAllocConn(CUSTOM_TRASH);
     char query[1024];
     sqlSafef(query, sizeof(query), "select fileName from %s", ct->dbTableName);
     fileName = sqlQuickString(conn, query);
     hFreeConn(&conn);
     }
 if (fileName == NULL)
     fileName = bbiNameFromSettingOrTableChrom(tdb, conn, table, chrom);
 return fileName;
 }
 
 static char *vcfMustFindFileName(struct sqlConnection *conn, char *table, char *chrom,
                                  boolean isTabix)
 /* Look up the vcf or vcfTabix file name; errAbort if not found. */
 {
 char *fileName = vcfFileName(conn, table, chrom, isTabix);
 if (fileName == NULL)
     errAbort("vcfMustFindFileName: can't find VCF file; chrom=%s, table=%s", chrom, table);
 return fileName;
 }
 
 void vcfTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f,
 	       boolean isTabix)
 /* Print out selected fields from VCF.  If fields is NULL, then print out all fields. */
 {
 struct hTableInfo *hti = NULL;
 hti = getHti(db, table, conn);
 struct hash *idHash = NULL;
 char *idField = getIdField(db, curTrack, table, hti);
 int idFieldNum = 0;
 
 /* if we know what field to use for the identifiers, get the hash of names */
 if (idField != NULL)
     idHash = identifierHash(db, table);
 
 if (f == NULL)
     f = stdout;
 
 /* Convert comma separated list of fields to array. */
 int fieldCount = chopByChar(fields, ',', NULL, 0);
 char **fieldArray;
 AllocArray(fieldArray, fieldCount);
 chopByChar(fields, ',', fieldArray, fieldCount);
 
 /* Get list of all fields in big bed and turn it into a hash of column indexes keyed by
  * column name. */
 struct hash *fieldHash = hashNew(0);
 struct slName *bb, *bbList = vcfGetFields();
 int i;
 for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i)
     {
     /* if we know the field for identifiers, save it away */
     if ((idField != NULL) && sameString(idField, bb->name))
 	idFieldNum = i;
     hashAddInt(fieldHash, bb->name, i);
     }
 
 /* Create an array of column indexes corresponding to the selected field list. */
 int *columnArray;
 AllocArray(columnArray, fieldCount);
 for (i=0; i<fieldCount; ++i)
     {
     columnArray[i] = hashIntVal(fieldHash, fieldArray[i]);
     }
 
 // If we are outputting a subset of fields, invalidate the VCF header.
 boolean allFields = (fieldCount == VCFDATALINE_NUM_COLS);
 if (!allFields)
     fprintf(f, "# Only selected columns are included below; output is not valid VCF.\n");
 
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = NULL;
 if (anyFilter())
     filter = asFilterFromCart(cart, db, table, as);
 
 /* Loop through outputting each region */
 struct region *region, *regionList = getRegions();
 int maxOut = bigFileMaxOutput();
 // Include the header, absolutely necessary for VCF parsing.
 boolean printedHeader = FALSE;
 // Temporary storage for row-ification:
-struct dyString *dyAlt = newDyString(1024);
-struct dyString *dyFilter = newDyString(1024);
-struct dyString *dyInfo = newDyString(1024);
-struct dyString *dyGt = newDyString(1024);
+struct dyString *dyAlt = dyStringNew(1024);
+struct dyString *dyFilter = dyStringNew(1024);
+struct dyString *dyInfo = dyStringNew(1024);
+struct dyString *dyGt = dyStringNew(1024);
 struct vcfRecord *rec;
 for (region = regionList; region != NULL && (maxOut > 0); region = region->next)
     {
     char *fileName = vcfFileName(conn, table, region->chrom, isTabix);
     if (fileName == NULL)
         continue;
     struct vcfFile *vcff;
     if (isTabix)
         {
         char *indexUrl = bigDataIndexFromCtOrHub(table, conn);
 	vcff = vcfTabixFileAndIndexMayOpen(fileName, indexUrl, region->chrom, region->start, region->end,
 				   100, maxOut);
         }
     else
 	vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end,
 			      100, maxOut, TRUE);
     if (vcff == NULL)
 	noWarnAbort();
     // If we are outputting all fields, but this VCF has no genotype info, omit the
     // genotype columns from output:
     if (allFields && vcff->genotypeCount == 0)
 	fieldCount = VCFDATALINE_NUM_COLS - 2;
     if (!printedHeader)
 	{
 	fprintf(f, "%s", vcff->headerString);
 	if (filter)
 	    fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
 	if (!allFields)
 	    {
 	    fprintf(f, "#%s", fieldArray[0]);
 	    for (i=1; i<fieldCount; ++i)
 		fprintf(f, "\t%s", fieldArray[i]);
 	    fprintf(f, "\n");
 	    }
 	printedHeader = TRUE;
 	}
     char *row[VCFDATALINE_NUM_COLS];
     char numBuf[VCF_NUM_BUF_SIZE];
     for (rec = vcff->records;  rec != NULL && (maxOut > 0);  rec = rec->next)
         {
 	vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
 	if (asFilterOnRow(filter, row))
 	    {
 	    /* if we're looking for identifiers, check if this matches */
 	    if ((idHash != NULL) && (hashLookup(idHash, row[idFieldNum]) == NULL))
 		continue;
 	    // All fields output: after asFilter'ing, preserve original VCF chrom
 	    if (allFields && !sameString(rec->chrom, region->chrom))
 		row[0] = rec->chrom;
 	    int i;
 	    fprintf(f, "%s", row[columnArray[0]]);
 	    for (i=1; i<fieldCount; ++i)
 		{
 		fprintf(f, "\t%s", row[columnArray[i]]);
 		}
 	    fprintf(f, "\n");
 	    maxOut --;
 	    }
 	}
     vcfFileFree(&vcff);
     freeMem(fileName);
     }
 if (!printedHeader)
     explainWhyNoResults(f);
 if (maxOut == 0)
     errAbort("Reached output limit of %d data values, please make region smaller,\n\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 /* Clean up and exit. */
 dyStringFree(&dyAlt);  dyStringFree(&dyFilter);  dyStringFree(&dyInfo);  dyStringFree(&dyGt);
 hashFree(&fieldHash);
 freeMem(fieldArray);
 freeMem(columnArray);
 }
 
 static void addFilteredBedsOnRegion(char *fileName, char *indexUrl, struct region *region, char *table,
 				    struct asFilter *filter, struct lm *bedLm,
 				    struct bed **pBedList, struct hash *idHash, int *pMaxOut,
 				    boolean isTabix)
 /* Add relevant beds in reverse order to pBedList */
 {
 struct vcfFile *vcff;
 if (isTabix)
     vcff = vcfTabixFileAndIndexMayOpen(fileName, indexUrl, region->chrom, region->start, region->end,
             100, *pMaxOut);
 else
     vcff = vcfFileMayOpen(fileName, region->chrom, region->start, region->end,
 			  100, *pMaxOut, TRUE);
 if (vcff == NULL)
     noWarnAbort();
 struct lm *lm = lmInit(0);
 char *row[VCFDATALINE_NUM_COLS];
 char numBuf[VCF_NUM_BUF_SIZE];
 // Temporary storage for row-ification:
-struct dyString *dyAlt = newDyString(1024);
-struct dyString *dyFilter = newDyString(1024);
-struct dyString *dyInfo = newDyString(1024);
-struct dyString *dyGt = newDyString(1024);
+struct dyString *dyAlt = dyStringNew(1024);
+struct dyString *dyFilter = dyStringNew(1024);
+struct dyString *dyInfo = dyStringNew(1024);
+struct dyString *dyGt = dyStringNew(1024);
 struct vcfRecord *rec;
 for (rec = vcff->records;  rec != NULL;  rec = rec->next)
     {
     vcfRecordToRow(rec, region->chrom, numBuf, dyAlt, dyFilter, dyInfo, dyGt, row);
     if (asFilterOnRow(filter, row))
         {
 	if ((idHash != NULL) && (hashLookup(idHash, rec->name) == NULL))
 	    continue;
 	struct bed *bed;
 	lmAllocVar(bedLm, bed);
 	bed->chrom = lmCloneString(bedLm, region->chrom);
 	bed->chromStart = rec->chromStart;
 	bed->chromEnd = rec->chromEnd;
 	bed->name = lmCloneString(bedLm, rec->name);
 	slAddHead(pBedList, bed);
 	}
     (*pMaxOut)--;
     if (*pMaxOut <= 0)
 	break;
     }
 dyStringFree(&dyAlt);  dyStringFree(&dyFilter);  dyStringFree(&dyInfo);  dyStringFree(&dyGt);
 lmCleanup(&lm);
 vcfFileFree(&vcff);
 }
 
 struct bed *vcfGetFilteredBedsOnRegions(struct sqlConnection *conn,
 	char *db, char *table, struct region *regionList, struct lm *lm,
 	int *retFieldCount, boolean isTabix)
 /* Get list of beds from VCF, in all regions, that pass filtering. */
 {
 int maxOut = bigFileMaxOutput();
 /* Figure out vcf file name get column info and filter. */
 struct asObject *as = vcfAsObj();
 struct asFilter *filter = asFilterFromCart(cart, db, table, as);
 struct hash *idHash = identifierHash(db, table);
 
 /* Get beds a region at a time. */
 struct bed *bedList = NULL;
 struct region *region;
 for (region = regionList; region != NULL; region = region->next)
     {
     char *fileName = vcfFileName(conn, table, region->chrom, isTabix);
     if (fileName == NULL)
 	continue;
     char *indexUrl = bigDataIndexFromCtOrHub(table, conn);
     addFilteredBedsOnRegion(fileName, indexUrl, region, table, filter, lm, &bedList, idHash, &maxOut,
 			    isTabix);
     freeMem(fileName);
     if (maxOut <= 0)
 	{
 	errAbort("Reached output limit of %d data values, please make region smaller,\n"
 	     "\tor set a higher output line limit with the filter settings.", bigFileMaxOutput());
 	}
     }
 slReverse(&bedList);
 return bedList;
 }
 
 struct slName *randomVcfIds(char *table, struct sqlConnection *conn, int count, boolean isTabix)
 /* Return some semi-random IDs from a VCF file. */
 {
 /* Read 10000 items from vcf file,  or if they ask for a big list, then 4x what they ask for. */
 char *fileName = vcfMustFindFileName(conn, table, hDefaultChrom(database), isTabix);
 char *indexUrl = bigDataIndexFromCtOrHub(table, conn);
 
 struct lineFile *lf = isTabix ? lineFileTabixAndIndexMayOpen(fileName, indexUrl, TRUE) :
 				lineFileMayOpen(fileName, TRUE);
 if (lf == NULL)
     noWarnAbort();
 int orderedCount = count * 4;
 if (orderedCount < 100)
     orderedCount = 100;
 struct slName *idList = NULL;
 char *words[4];
 int i;
 for (i = 0;  i < orderedCount && lineFileChop(lf, words); i++)
     {
     // compress runs of identical ID, in case most are placeholder
     if (i == 0 || !sameString(words[2], idList->name))
 	slAddHead(&idList, slNameNew(words[2]));
     }
 lineFileClose(&lf);
 /* Shuffle list and trim it to count if necessary. */
 shuffleList(&idList);
 struct slName *sl;
 for (sl = idList, i = 0; sl != NULL; sl = sl->next, i++)
     {
     if (i+1 >= count)
 	{
 	slNameFreeList(&(sl->next));
 	break;
 	}
     }
 freez(&fileName);
 return idList;
 }
 
 #define VCF_MAX_SCHEMA_COLS 20
 
 void showSchemaVcf(char *table, struct trackDb *tdb, boolean isTabix)
 /* Show schema on vcf. */
 {
 struct sqlConnection *conn = NULL;
 if (!trackHubDatabase(database))
     conn = hAllocConn(database);
 char *fileName = vcfMustFindFileName(conn, table, hDefaultChrom(database), isTabix);
 
 struct asObject *as = vcfAsObj();
 hPrintf("<B>Database:</B> %s", database);
 hPrintf("&nbsp;&nbsp;&nbsp;&nbsp;<B>Primary Table:</B> %s<br>", table);
 hPrintf("<B>VCF File:</B> %s", fileName);
 hPrintf("<BR>\n");
 hPrintf("<B>Format description:</B> %s<BR>", as->comment);
 hPrintf("See the <A HREF=\"%s\" target=_blank>Variant Call Format specification</A> for  more details<BR>\n",
 	"http://www.1000genomes.org/wiki/analysis/vcf4.0");
 
 /* Put up table that describes fields. */
 hTableStart();
 hPrintf("<TR><TH>field</TH>");
 hPrintf("<TH>description</TH> ");
 puts("</TR>\n");
 struct asColumn *col;
 int colCount = 0;
 for (col = as->columnList; col != NULL; col = col->next)
     {
     hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
     hPrintf("<TD>%s</TD></TR>", col->comment);
     ++colCount;
     }
 hTableEnd();
 
 /* Put up another section with sample rows. */
 webNewSection("Sample Rows");
 hTableStart();
 
 /* Fetch sample rows. */
 char *indexUrl = bigDataIndexFromCtOrHub(table, conn);
 struct lineFile *lf = isTabix ? lineFileTabixAndIndexMayOpen(fileName, indexUrl, TRUE) :
 				lineFileMayOpen(fileName, TRUE);
 if (lf == NULL)
     noWarnAbort();
 char *row[VCF_MAX_SCHEMA_COLS];
 int i;
 for (i = 0;  i < 10;  i++)
     {
     int colCount = lineFileChop(lf, row);
     int colIx;
     if (i == 0)
 	{
 	// Print field names as column headers, using colCount to compute genotype span
 	hPrintf("<TR>");
 	for (colIx = 0, col = as->columnList; col != NULL && colIx < colCount;
 	     colIx++, col = col->next)
 	    {
 
 	    if (sameString("genotypes", col->name) && colCount > colIx+1)
 		hPrintf("<TH colspan=%d>%s</TH>", colCount - colIx, col->name);
 	    else
 		hPrintf("<TH>%s</TH>", col->name);
 	    }
 	hPrintf("</TR>\n");
 	}
     hPrintf("<TR>");
     for (colIx=0; colIx < colCount; ++colIx)
 	{
 	if (colCount > VCFDATALINE_NUM_COLS && colIx == colCount - 1)
 	    hPrintf("<TD>...</TD>");
 	else
 	    writeHtmlCell(row[colIx]);
 	}
     hPrintf("</TR>\n");
     }
 hTableEnd();
 printTrackHtml(tdb);
 
 /* Clean up and go home. */
 lineFileClose(&lf);
 freeMem(fileName);
 hFreeConn(&conn);
 }