src/hg/getRnaPred/getRnaPred.c 44ccfacbe3a3d4b300f80d48651c77837a4b571e

44ccfacbe3a3d4b300f80d48651c77837a4b571e
galt
  Tue Apr 26 11:12:02 2022 -0700
SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix.

diff --git src/hg/getRnaPred/getRnaPred.c src/hg/getRnaPred/getRnaPred.c
index 22bc045..fea038e 100644
--- src/hg/getRnaPred/getRnaPred.c
+++ src/hg/getRnaPred/getRnaPred.c
@@ -1,492 +1,494 @@
 /* getRnaPred - Get RNA for gene predictions. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "genePred.h"
 #include "psl.h"
 #include "fa.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "nibTwo.h"
 #include "dnautil.h"
 #include "options.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "getRnaPred - Get virtual RNA for gene predictions\n"
   "usage:\n"
   "   getRnaPred [options] database table chromosome output.fa\n"
   "table can be a table or a file.  Specify chromosome of 'all' to\n"
   "to process all chromosome\n"
   "\n"
   "options:\n"
   "   -weird - only get ones with weird splice sites\n"
   "   -cdsUpper - output CDS in upper case\n"
   "   -cdsOnly - only output CDS\n"
   "   -cdsOut=file - write CDS to this tab-separated file, in the form\n"
   "      acc  start  end\n"
   "    where start..end are genbank style, one-based coordinates\n"
   "   -keepMasking - un/masked in upper/lower case.\n"
   "   -pslOut=psl - output a PSLs for the virtual mRNAs.  Allows virtual\n"
   "    mRNA to be analyzed by tools that work on PSLs\n"
   "   -suffix=suf - append suffix to each id to avoid confusion with mRNAs\n"
   "    use to define the genes.\n"
   "   -peptides - out the translation of the CDS to a peptide sequence.\n"
   "    The newer program genePredToProt maybe produce better results in cases\n"
   "    were there are frame-shifting indels in the CDS.\n"
   "   -exonIndices - output indices of exon boundaries after sequence name,\n"
   "    e.g., \"103 243 290\" says positions 1-103 are from the first exon,\n"
   "    positions 104-243 are from the second exon, etc. \n"
   "   -maxSize=size - output a maximum of size characters.  Useful when\n"
   "    testing gene predictions by RT-PCR.\n"
   "   -genomeSeqs=spec - get genome sequences from the specified nib directory\n"
   "    or 2bit file instead of going though the path found in chromInfo.\n"
   "   -includeCoords - include the genomic coordinates as a comment in the\n"
   "    fasta header.  This is necessary when there are multiple genePreds\n"
   "    with the same name.\n"
   "   -genePredExt - (for use with -peptides) use extended genePred format,\n"
   "    and consider frame information when translating (Warning: only\n"
   "    considers offset at 5' end, not frameshifts between blocks)\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"weird", OPTION_BOOLEAN},
    {"cdsUpper", OPTION_BOOLEAN},
    {"cdsOut", OPTION_STRING},
    {"cdsOnly", OPTION_BOOLEAN},
    {"keepMasking", OPTION_BOOLEAN},
    {"pslOut", OPTION_STRING},
    {"suffix", OPTION_STRING},
    {"peptides", OPTION_BOOLEAN},
    {"includeCoords", OPTION_BOOLEAN},
    {"exonIndices", OPTION_BOOLEAN},
    {"maxSize", OPTION_INT},
    {"genePredExt", OPTION_BOOLEAN},
    {"genomeSeqs", OPTION_STRING},
    {NULL, 0}
 };
 
 
 /* parsed from command line */
 boolean weird, cdsUpper, cdsOnly, peptides, keepMasking, includeCoords, exonIndices,
   genePredExt;
 char *cdsOut = NULL;
 char *pslOut = NULL;
 char *suffix = "";
 int maxSize = -1;
 char *genomeSeqs = NULL;
 
 struct nibTwoCache *getNibTwoCacheFromDb(char *db)
 /* get the nib or two-bit cache from database */
 {
 struct sqlConnection *conn = hAllocConn(db);
 char nibTwoPath[PATH_LEN];
 struct nibTwoCache *nibTwoCache;
 
 /* grab the first chromsome file name, if it's a nib, convert to
  * directory name */
-sqlNeedQuickQuery(conn, NOSQLINJ "select fileName from chromInfo limit 1",
+char query[1024];
+sqlSafef(query, sizeof query, "select fileName from chromInfo limit 1");
+sqlNeedQuickQuery(conn, query,
                   nibTwoPath, sizeof(nibTwoPath));
 if (nibIsFile(nibTwoPath))
     {
     char *p = strrchr(nibTwoPath, '/');
     if (p != NULL)
         *p = '\0';
     else
         strcpy(nibTwoPath, "."); 
     }
 nibTwoCache = nibTwoCacheNew(nibTwoPath);
 hFreeConn(&conn);
 return nibTwoCache;
 }
 
 struct nibTwoCache *getNibTwoCache(char *db)
 /* get the nib or two-bit cache */
 {
 if (genomeSeqs != NULL)
     return nibTwoCacheNew(genomeSeqs);
 else
     return getNibTwoCacheFromDb(db);
 }
 
 struct dnaSeq *fetchDna(char *db, char *seqName, int start, int end, enum dnaCase dnaCase)
 /* Fetch DNA sequence, with caching of opens */
 {
 static struct nibTwoCache *nibTwoCache = NULL;
 struct dnaSeq *dna;
 if (nibTwoCache == NULL)
     nibTwoCache = getNibTwoCache(db);
 dna = nibTwoCacheSeqPart(nibTwoCache, seqName, start, (end-start), NULL);
 if (dnaCase == dnaLower)
     tolowers(dna->dna);
 return dna;
 }
 
 boolean hasWeirdSplice(char *db, struct genePred *gp)
 /* see if a gene has weird splice sites */
 {
 int i;
 boolean gotOdd = FALSE;
 for (i=1; (i<gp->exonCount) && !gotOdd; ++i)
     {
     int start = gp->exonEnds[i-1];
     int end = gp->exonStarts[i];
     int size = end - start;
     if (size > 0)
         {
         struct dnaSeq *seq = fetchDna(db, gp->chrom, start, end, dnaLower);
         DNA *s = seq->dna;
         DNA *e = seq->dna + seq->size;
         uglyf("%s %c%c/%c%c\n", gp->name, s[0], s[1], e[-2], e[-1]);
         if (gp->strand[0] == '-')
             reverseComplement(seq->dna, seq->size);
         if (s[0] != 'g' || (s[1] != 't' && s[1] != 'c') || e[-2] != 'a' || e[-1] != 'g')
             gotOdd = TRUE;
         freeDnaSeq(&seq);
         }
     }
 return gotOdd;
 }
 
 int findGeneOff(struct genePred *gp, int chrPos)
 /* find the mrna offset containing the specified position.*/
 {
 int iRna = 0, iExon;
 
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     {
     if (chrPos < gp->exonStarts[iExon])
         {
         /* intron before this exon */
         return iRna;
         }
     if (chrPos < gp->exonEnds[iExon])
         {
         return iRna + (chrPos - gp->exonStarts[iExon]);
         }
     iRna += (gp->exonEnds[iExon] - gp->exonStarts[iExon]);
     }
 return iRna-1;
 }
 
 void processCds(struct genePred *gp, struct dyString *dnaBuf, struct dyString *cdsBuf,
                 FILE* cdsFh)
 /* find the CDS bounds in the mRNA defined by the annotation.
  * output and/or update as requested.  If CDS buf is not NULL,
  * copy CDS to it.*/
 {
 int cdsStart = findGeneOff(gp, gp->cdsStart);
 int cdsEnd = findGeneOff(gp, gp->cdsEnd-1)+1;
 int rnaSize = genePredBases(gp);
 if (gp->strand[0] == '-')
     reverseIntRange(&cdsStart, &cdsEnd, rnaSize);
 assert(cdsStart <= cdsEnd);
 assert(cdsEnd <= rnaSize);
 
 if (cdsUpper)
     toUpperN(dnaBuf->string+cdsStart, (cdsEnd-cdsStart));
 
 if (cdsBuf != NULL)
     {
     dyStringClear(cdsBuf);
     dyStringAppendN(cdsBuf, dnaBuf->string+cdsStart, (cdsEnd-cdsStart));
     }
 
 if (cdsFh != NULL)
     fprintf(cdsFh,"%s\t%d\t%d\n", gp->name, cdsStart+1, cdsEnd);
 }
 
 void writePsl(char *db, struct genePred *gp, FILE* pslFh)
 /* create a PSL for the virtual mRNA */
 {
 int rnaSize = genePredBases(gp);
 int qStart = 0, iExon;
 struct psl psl;
 ZeroVar(&psl);
 
 psl.match = rnaSize;
 psl.tNumInsert = gp->exonCount-1;
 psl.strand[0] = gp->strand[0];
 psl.qName = gp->name;
 psl.qSize = rnaSize;
 psl.qStart = 0;
 psl.qEnd = rnaSize;
 psl.tName = gp->chrom;
 psl.tSize =  hChromSize(db, gp->chrom);
 psl.tStart = gp->txStart;
 psl.tEnd = gp->txEnd;
 
 /* fill in blocks in chrom order */
 AllocArray(psl.blockSizes, 3 * gp->exonCount);
 psl.qStarts = psl.blockSizes + gp->exonCount;
 psl.tStarts = psl.qStarts + gp->exonCount;
 
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     {
     int exonSize = gp->exonEnds[iExon] - gp->exonStarts[iExon];
     int qEnd = qStart + exonSize;
     psl.blockSizes[iExon] = exonSize;
     psl.qStarts[iExon] = qStart;
     psl.tStarts[iExon] = gp->exonStarts[iExon];
     if (iExon > 0)
         {
         /* count intron as bases inserted */
         psl.tBaseInsert += gp->exonStarts[iExon]-gp->exonEnds[iExon-1];
         }
     psl.blockCount++;
     qStart = qEnd;
     }
 assert(qStart == rnaSize);
 
 if (pslCheck("from genePred", stderr, &psl) > 0)
     {
     fprintf(stderr, "psl: ");
     pslTabOut(&psl, stderr);
     errAbort("invalid psl created");
     }
 pslTabOut(&psl, pslFh);
 freeMem(psl.blockSizes);
 }
 
 void outputPeptide(struct genePred *gp, char *name, struct dyString *cdsBuf, FILE* faFh)
 /* output the peptide sequence */
 {
 int offset = 0;
 
 /* get frame offset, if available and needed */
 if (gp->exonFrames != NULL) 
 {
     if (gp->strand[0] == '+' && gp->cdsStartStat != cdsComplete)
         offset = (3 - gp->exonFrames[0]) % 3;
     else if (gp->strand[0] == '-' && gp->cdsEndStat != cdsComplete)
         offset = (3 - gp->exonFrames[gp->exonCount-1]) % 3;
 }
 /* NOTE: this fix will not handle the case in which frame is shifted
  * internally or at multiple exons, as when frame-shift gaps occur in
  * an alignment of an mRNA to the genome.  */
 
 /* just overwrite the buffer with the peptide, which will stop at end of DNA
  * if no stop codon. Buffer size must allow for stop codon. */
 int ir, ip;
 boolean isChrM = isMito(gp->chrom);
 for (ir = offset, ip = 0; ir < cdsBuf->stringSize; ir += 3, ip++)
     {
     cdsBuf->string[ip] = (isChrM ? lookupMitoCodon(cdsBuf->string+ir)
                           : lookupCodon(cdsBuf->string+ir));
     }
 cdsBuf->string[ip] = '\0';
   
 faWriteNext(faFh, name, cdsBuf->string, strlen(cdsBuf->string));
 }
 
 void processGenePred(char *db, struct genePred *gp, struct dyString *dnaBuf, 
                      struct dyString *cdsBuf, struct dyString *nameBuf,
                      FILE* faFh, FILE* cdsFh, FILE* pslFh)
 /* output genePred DNA, check for weird splice sites if requested */
 {
 int i;
 int index = 0;
 
 /* Load exons one by one into dna string. */
 dyStringClear(dnaBuf);
 dyStringClear(nameBuf);
 dyStringAppend(nameBuf, gp->name);
 dyStringAppend(nameBuf, suffix);
 if (exonIndices || includeCoords)
     dyStringPrintf(nameBuf, " %s", db);  /* we'll also include the db */
 if (includeCoords)
     dyStringPrintf(nameBuf, " %s:%d-%d", gp->chrom, gp->txStart, gp->txEnd);
 for (i=0; i<gp->exonCount; ++i)
     {
     int start = gp->exonStarts[i];
     int end = gp->exonEnds[i];
     int size = end - start;
     if (size < 0)
         warn("%d sized exon in %s\n", size, gp->name);
     else
         {
         struct dnaSeq *seq = fetchDna(db, gp->chrom, start, end, (keepMasking ? dnaMixed : dnaLower));
         dyStringAppendN(dnaBuf, seq->dna, size);
         freeDnaSeq(&seq);
         }
     }
 
 /* Reverse complement if necessary */
 if (gp->strand[0] == '-')
     reverseComplement(dnaBuf->string, dnaBuf->stringSize);
 
 /* create list of exon indices, if necessary */
 if (exonIndices) 
     {
     for (i = (gp->strand[0] == '-' ? gp->exonCount-1 : 0); 
          i >= 0 && i < gp->exonCount; 
          i += (gp->strand[0] == '-' ? -1 : 1))
                                 /* use forward order if plus strand
                                    and reverse order if minus
                                    strand */
         {
         index += gp->exonEnds[i] - gp->exonStarts[i];
         if (maxSize != -1 && index > maxSize) index = maxSize;
         dyStringPrintf(nameBuf, " %d", index);
         if (index == maxSize) break; /* can only happen if maxSize != -1 */
         }
     } 
 
 if ((gp->cdsStart < gp->cdsEnd)
     && (cdsUpper || cdsOnly || peptides || (cdsFh != NULL)))
     processCds(gp, dnaBuf, cdsBuf, cdsFh);
 
 if (cdsOnly)
     faWriteNext(faFh, nameBuf->string, cdsBuf->string, 
                 (maxSize != -1 && cdsBuf->stringSize > maxSize) ? maxSize : 
                 cdsBuf->stringSize);
 else if (peptides)
     {
     if (gp->cdsStart < gp->cdsEnd)
 	outputPeptide(gp, nameBuf->string, cdsBuf, faFh);
     }
 else
     faWriteNext(faFh, nameBuf->string, dnaBuf->string, 
                 (maxSize != -1 && dnaBuf->stringSize > maxSize) ? maxSize : 
                 dnaBuf->stringSize);
 
 if (pslFh != NULL)
     writePsl(db, gp, pslFh);
 }
 
 void getRnaForTable(char *db, char *table, char *chrom, struct dyString *dnaBuf, 
                     struct dyString *cdsBuf, struct dyString *nameBuf,
                     FILE *faFh, FILE *cdsFh, FILE* pslFh)
 /* get RNA for a genePred table */
 {
 int rowOffset;
 char **row;
 struct sqlConnection *conn = hAllocConn(db);
 struct sqlResult *sr;
 
 if (!sqlTableExists(conn, table))
     errAbort("table or file \"%s\" does not exists", table);
 
 sr = hChromQuery(conn, table, chrom, NULL, &rowOffset);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     /* Load gene prediction from database. */
     struct genePred *gp;
     if (genePredExt)
       gp = genePredExtLoad(row+rowOffset, GENEPREDX_NUM_COLS);
     else
       gp = genePredLoad(row+rowOffset);
     if ((!weird) || hasWeirdSplice(db, gp))
         processGenePred(db, gp, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
     genePredFree(&gp);
     }
 sqlFreeResult(&sr);
 hFreeConn(&conn);
 }
 
 void getRnaForTables(char *db, char *table, char *chrom, struct dyString *dnaBuf, 
                      struct dyString *cdsBuf, struct dyString *nameBuf,
                      FILE *faFh, FILE *cdsFh, FILE* pslFh)
 /* get RNA for one for possibly splite genePred table */
 {
 struct slName* chroms = NULL, *chr;
 if (sameString(chrom, "all"))
     chroms = hAllChromNames(db);
 else
     chroms = slNameNew(chrom);
 for (chr = chroms; chr != NULL; chr = chr->next)
     getRnaForTable(db, table, chr->name, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
 }
 
 void getRnaForFile(char *db, char *table, char *chrom, struct dyString *dnaBuf, 
                    struct dyString *cdsBuf, struct dyString *nameBuf,
                    FILE *faFh, FILE* cdsFh, FILE* pslFh)
 /* get RNA for a genePred file */
 {
 boolean all = sameString(chrom, "all");
 struct lineFile *lf = lineFileOpen(table, TRUE);
 char *row[GENEPREDX_NUM_COLS];
 while (lineFileNextRowTab(lf, row, genePredExt ? 
                           GENEPREDX_NUM_COLS : GENEPRED_NUM_COLS))
     {
     struct genePred *gp;
     if (genePredExt) 
       gp = genePredExtLoad(row, GENEPREDX_NUM_COLS);
     else 
       gp = genePredLoad(row);
     if (all || sameString(gp->chrom, chrom))
         processGenePred(db, gp, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
     genePredFree(&gp);
     }
 lineFileClose(&lf); 
 }
 
 void getRnaPred(char *db, char *table, char *chrom, char *faOut)
 /* getRna - Get RNA for gene predictions and write to file. */
 {
 struct dyString *dnaBuf = dyStringNew(16*1024);
 struct dyString *cdsBuf = NULL;
 struct dyString *nameBuf = dyStringNew(64);
 FILE *faFh = mustOpen(faOut, "w");
 FILE *cdsFh = NULL;
 FILE *pslFh = NULL;
 if (cdsOnly || peptides)
     cdsBuf = dyStringNew(16*1024);
 dyStringNew(16*1024);
 if (cdsOut != NULL)
     cdsFh = mustOpen(cdsOut, "w");
 if (pslOut != NULL)
     pslFh = mustOpen(pslOut, "w");
 if (exonIndices)
    nameBuf = dyStringNew(512);
 
 if (fileExists(table))
     getRnaForFile(db, table, chrom, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
 else
     getRnaForTables(db, table, chrom, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
 
 dyStringFree(&dnaBuf);
 dyStringFree(&cdsBuf);
 dyStringFree(&nameBuf);
 carefulClose(&pslFh);
 carefulClose(&cdsFh);
 carefulClose(&faFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 dnaUtilOpen();
 weird = optionExists("weird");
 cdsUpper = optionExists("cdsUpper");
 cdsOnly = optionExists("cdsOnly");
 cdsOut = optionVal("cdsOut", NULL); 
 keepMasking = optionExists("keepMasking"); 
 pslOut = optionVal("pslOut", NULL); 
 suffix = optionVal("suffix", suffix); 
 peptides = optionExists("peptides");
 includeCoords = optionExists("includeCoords");
 exonIndices = optionExists("exonIndices");
 maxSize = optionInt("maxSize", -1);
 genePredExt = optionExists("genePredExt");
 if (cdsOnly && peptides)
     errAbort("can't specify both -cdsOnly and -peptides");
 if (cdsUpper && keepMasking)
     errAbort("can't specify both -cdsUpper and -keepMasking");
 if (exonIndices && (cdsOnly || peptides))
     errAbort("can't specify -exonIndices with -cdsOnly or -peptides");
 if (maxSize != -1 && peptides)
     errAbort("can't specify -maxSize with -peptides");
 genomeSeqs = optionVal("genomeSeqs", NULL);
 getRnaPred(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }