44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/txCds/txCdsGoodBed/txCdsGoodBed.c src/hg/txCds/txCdsGoodBed/txCdsGoodBed.c index eb13f4d..e27725d 100644 --- src/hg/txCds/txCdsGoodBed/txCdsGoodBed.c +++ src/hg/txCds/txCdsGoodBed/txCdsGoodBed.c @@ -1,163 +1,164 @@ /* txCdsGoodBed - Create positive example training set for SVM. This is based on * the refSeq reviewed genes, but we fragment a certain percentage of them so as * not to end up with a SVM that *requires* a complete transcript. */ /* Copyright (C) 2013 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "bed.h" #include "genePred.h" #include "obscure.h" #include "jksql.h" double frag = 0.15; void usage() /* Explain usage and exit. */ { errAbort( "txCdsGoodBed - Create positive example training set for SVM. This is \n" "based on the refSeq reviewed genes, but we fragment a certain percentage \n" "of them so as not to end up with a SVM that *requires* a complete \n" "transcript.\n" "usage:\n" " txCdsGoodBed database output.bed output.cds\n" "options:\n" " -frag=0.N - Amount to fragment. Default %g\n" , frag ); } static struct optionSpec options[] = { {"frag", OPTION_DOUBLE}, {NULL, 0}, }; void gpFragLimits(struct genePred *gp, double startRatio, double endRatio, int *retStart, int *retEnd) /* Convert floating point range from 0-1 representing the cDNA position * to genomic start/end. */ { int geneBases = genePredBases(gp); int cdnaStart = geneBases * startRatio; int cdnaEnd = geneBases * endRatio; int i; int cdnaPos = 0; if (cdnaStart <= 0) *retStart = gp->txStart; if (cdnaEnd >= geneBases) *retEnd = gp->txEnd; for (i=0; i<gp->exonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; int exonSize = exonEnd - exonStart; if (cdnaPos <= cdnaStart && cdnaStart < cdnaPos + exonSize) *retStart = exonStart + (cdnaStart - cdnaPos); if (cdnaPos < cdnaEnd && cdnaEnd <= cdnaPos + exonSize) *retEnd = exonStart + (cdnaEnd - cdnaPos); cdnaPos += exonSize; } } void gpPartOutAsCds(struct genePred *gp, int partStart, int partEnd, FILE *f, char *type, int id) /* Write out CDS region of this fragment of gp. It may in fact be empty, * which we'll represent as start=0, end=0 */ { /* Truncate cds to fit inside of the interval we're actually outputting. * If this results in us losing the CDS, just output a stub quickly. */ fprintf(f, "%s_%d_%s\t", type, id, gp->name); int genoCdsStart = max(partStart, gp->cdsStart); int genoCdsEnd = min(partEnd, gp->cdsEnd); if (genoCdsStart >= genoCdsEnd) { fprintf(f, "0\t0\n"); return; } /* Figure out amount of exonic sequence that is inside of our part * and also in the first UTR or the CDS. */ int i; int cdnaUtrUpSize = 0; int cdnaUtrDownSize = 0; int cdnaCdsSize = 0; for (i=0; i<gp->exonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; int utrUpStart = max(gp->txStart, exonStart); int utrUpEnd = min(gp->cdsStart, exonEnd); int utrDownStart = max(gp->cdsEnd, exonStart); int utrDownEnd = min(gp->txEnd, exonEnd); int cdsStart = max(gp->cdsStart, exonStart); int cdsEnd = min(gp->cdsEnd, exonEnd); cdnaUtrUpSize += positiveRangeIntersection(utrUpStart, utrUpEnd, partStart, partEnd); cdnaUtrDownSize += positiveRangeIntersection(utrDownStart, utrDownEnd, partStart, partEnd); cdnaCdsSize += positiveRangeIntersection(cdsStart, cdsEnd, partStart, partEnd); } /* Output CDS start/end. */ if (gp->strand[0] == '+') fprintf(f, "%d\t%d\n", cdnaUtrUpSize, cdnaUtrUpSize+cdnaCdsSize); else fprintf(f, "%d\t%d\n", cdnaUtrDownSize, cdnaUtrDownSize+cdnaCdsSize); } void txCdsGoodBed(char *database, char *outBed, char *outCds) /* txCdsGoodBed - Create positive example training set for SVM. This is based on * the refSeq reviewed genes, but we fragment a certain percentage of them so as * not to end up with a SVM that *requires* a complete transcript. */ { struct sqlConnection *conn = sqlConnect(database); char *refTrack = "refGene"; char *statusTable = "refSeqStatus"; if (!sqlTableExists(conn, refTrack)) errAbort("table %s doesn't exist in %s", refTrack, database); if (!sqlTableExists(conn, statusTable)) errAbort("table %s doesn't exist in %s", statusTable, database); FILE *fBed = mustOpen(outBed, "w"); FILE *fCds = mustOpen(outCds, "w"); -char *query = - NOSQLINJ "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds " - "from refGene r,refSeqStatus s where r.name=s.mrnaAcc and s.status='Reviewed'"; +char query[1024]; +sqlSafef(query, sizeof query, + "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds " + "from refGene r,refSeqStatus s where r.name=s.mrnaAcc and s.status='Reviewed'"); struct sqlResult *sr = sqlGetResult(conn, query); char **row; double randScale = 1.0/RAND_MAX; int id = 0; while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row); int start = gp->txStart, end = gp->txEnd; char *type = "refReviewed"; if (rand()*randScale < frag) { double midRatio = rand()*randScale; if (midRatio > 0.5) gpFragLimits(gp, 0, midRatio, &start, &end); else gpFragLimits(gp, midRatio, 1.0, &start, &end); type = "refFrag"; } gpPartOutAsBed(gp, start, end, fBed, type, ++id, 0); gpPartOutAsCds(gp, start, end, fCds, type, id); } carefulClose(&fBed); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); frag = optionDouble("frag", frag); txCdsGoodBed(argv[1], argv[2], argv[3]); return 0; }