44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/poster/foldDb/foldDb.c src/hg/poster/foldDb/foldDb.c index 562d96a..d695271 100644 --- src/hg/poster/foldDb/foldDb.c +++ src/hg/poster/foldDb/foldDb.c @@ -1,1453 +1,1455 @@ /* foldDb - Search database info for making foldout. */ /* Copyright (C) 2013 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "hash.h" #include "linefile.h" #include "dlist.h" #include "obscure.h" #include "jksql.h" #include "hdb.h" #include "memgfx.h" #include "psl.h" #include "genePred.h" #include "est3.h" #include "gcPercent.h" #include "rmskOut.h" #include "agpFrag.h" #include "agpGap.h" #include "cpgIsland.h" #include "wabAli.h" #include "clonePos.h" #include "ctgPos.h" #include "glDbRep.h" #include "knownInfo.h" #include "exoFish.h" #include "rnaGene.h" #include "snp.h" #include "cytoBand.h" #include "knownMore.h" #include "hCommon.h" /* Coverage info file. */ char *covFile = "coverage.oct"; /* Disease genes file. */ char *diseaseFile = "disease.oct"; /* Resolved duplications file. */ char *bestDupFile = "bestDup.oct"; /* Rough location of Ensembl genes. */ char *ensTick = "/projects/hg/gs.5/oo.21/bed/ensTick/ensTick.bed"; /* Unresolved dupe output. */ FILE *dupeFile; char *dupeFileName = "dupes.out"; struct knownGene /* Enough info to draw a known gene. */ { struct knownGene *next; char *name; /* Name of gene. */ char omimIdString[16]; /* OMIM ID ascii representation. */ char *chrom; /* Name of chromosome. */ int start,end; /* Position in chromosome. */ boolean isDisease; /* is a disease gene? */ }; int knownGeneCmpPos(const void *va, const void *vb) /* Compare to sort based on start position. */ { const struct knownGene *a = *((struct knownGene **)va); const struct knownGene *b = *((struct knownGene **)vb); int diff = strcmp(a->chrom, b->chrom); if (diff == 0) diff = a->start - b->start; return diff; } /* -------------- Some Stuff for NT Patch on October. ---------------*/ /* Locations of NT contigs that need patching in oo21 assembly. */ char *ntPosFile = "ntPos.oct"; /* Locations of genes within NT contigs. */ char *ntGeneFile = "ntKnownGene.oct"; struct ntPos /* Position of an NT contig. */ { struct ntPos *next; char *name; /* Name of contig. */ char *chrom; /* Chromosome. */ int start,end; /* Range on chromosome. */ char strand; /* + or - */ struct ntGene *geneList; /* List of genes. */ }; struct ntGene /* Position of gene within an NT contig. */ { struct ntGene *next; struct ntPos *nt; /* Contig this is part of. */ int start,end; /* Position within contig. */ char *hugoName; /* Name of gene according to hugo. */ char *omimIdString; /* ASCII representation of OMIM id. */ }; int ntGeneCmpPos(const void *va, const void *vb) /* Compare to sort based on start position. */ { const struct ntGene *a = *((struct ntGene **)va); const struct ntGene *b = *((struct ntGene **)vb); return a->start - b->start; } struct hash *ntPosHash; /* Hash with ntPos values. */ struct ntPos *ntPosList; /* List of all the places to patch. */ struct hash *ntGeneNameHash; /* Hash of NT genes by name. */ struct hash *ntGeneOmimHash; /* Hash of NT genes by hugo ID. */ void loadNtPos() /* Read in NT pos file. */ { struct lineFile *lf = lineFileOpen(ntPosFile, TRUE); char *words[5]; struct ntPos *ntPos; ntPosHash = newHash(12); while (lineFileRow(lf, words)) { AllocVar(ntPos); slAddHead(&ntPosList, ntPos); hashAddSaveName(ntPosHash, words[3], ntPos, &ntPos->name); ntPos->chrom = hgOfficialChromName(words[0]); ntPos->start = lineFileNeedNum(lf, words, 1); ntPos->end = lineFileNeedNum(lf, words, 2); ntPos->strand = words[4][0]; } lineFileClose(&lf); slReverse(&ntPosList); printf("Loaded %d NT positions\n", slCount(ntPosList)); } void loadNtGenes() /* Read in knownGene table to appropriate ntPos. */ { struct lineFile *lf = lineFileOpen(ntGeneFile, TRUE); char *words[6]; struct ntPos *ntPos; struct ntGene *ntg; int count = 0; int skipped = 0; int duped = 0; ntGeneNameHash = newHash(12); ntGeneOmimHash = newHash(12); while (lineFileRow(lf, words)) { if ((ntPos = hashFindVal(ntPosHash, words[0])) == NULL) { ++skipped; continue; } if (hashLookup(ntGeneNameHash, words[4]) || (words[5][0] != '-' && hashLookup(ntGeneOmimHash,words[5]))) { ++duped; continue; } hashAdd(ntGeneNameHash, words[4], NULL); if (words[5][0] != '-') hashAdd(ntGeneOmimHash, words[5], NULL); AllocVar(ntg); slAddHead(&ntPos->geneList, ntg); ntg->start = lineFileNeedNum(lf, words, 1); ntg->end = lineFileNeedNum(lf, words, 2); ntg->hugoName = cloneString(words[4]); ntg->omimIdString = cloneString(words[5]); ++count; } lineFileClose(&lf); for (ntPos = ntPosList; ntPos != NULL; ntPos = ntPos->next) { if (ntPos->strand == '-') { int ntSize = ntPos->end - ntPos->start; int s, e; slReverse(&ntPos->geneList); for (ntg = ntPos->geneList; ntg != NULL; ntg = ntg->next) { s = ntSize - ntg->end; e = ntSize - ntg->start; //uglyAbort("%s in %s, ntSize %d, s %d, e %d, ntg->start %d, ntg->end %d\n", // ntg->hugoName, ntPos->name, ntSize, s, e, ntg->start, ntg->end); ntg->start = s; ntg->end = e; } } slSort(&ntPos->geneList, ntGeneCmpPos); } printf("Loaded %d NT genes, skipped %d, duped %d\n", count, skipped, duped); } void loadNtPatchFiles() /* Read in info on NT patches into ntPosHash/ntPosList. */ { loadNtPos(); loadNtGenes(); } void ntPatchChromosome(char *chromosome, struct dlList *geneList) /* Apply NT patches to geneList. */ { struct knownGene *kg; struct dlNode *node; struct ntPos *ntChromStart, *ntPos; struct ntGene *ntg; printf("Patching NT genes\n"); /* Find start of this chromosome in ntPosList. */ chromosome = hgOfficialChromName(chromosome); for (ntChromStart = ntPosList; ntChromStart != NULL; ntChromStart = ntChromStart->next) { if (ntChromStart->chrom == chromosome) break; } /* Get rid of any existing genes that share either space, name or OMIM ID * with the NT genes to be patched in. */ for (node = geneList->head; !dlEnd(node); node = node->next) { kg = node->val; if (hashLookup(ntGeneNameHash, kg->name) ) dlRemove(node); else if (isdigit(kg->omimIdString[0]) && hashLookup(ntGeneOmimHash, kg->omimIdString)) dlRemove(node); else { int genePos = (kg->start + kg->end)/2; for (ntPos = ntChromStart; ntPos != NULL && ntPos->chrom == chromosome; ntPos = ntPos->next) { if (ntPos->start <= genePos && genePos < ntPos->end) { dlRemove(node); break; } } } } /* Add in NT genes. */ for (ntPos = ntChromStart; ntPos != NULL && ntPos->chrom == chromosome; ntPos = ntPos->next) { for (ntg = ntPos->geneList; ntg != NULL; ntg = ntg->next) { AllocVar(kg); kg->name = ntg->hugoName; strcpy(kg->omimIdString, ntg->omimIdString); kg->chrom = chromosome; kg->start = ntg->start + ntPos->start; kg->end = ntg->end + ntPos->start; dlAddValTail(geneList, kg); } } dlSort(geneList, knownGeneCmpPos); } /* -------------- End of (most) Stuff for NT Patch on October. ---------------*/ /* Centromere/big gap info. Obsolete as of October 2000 assembly. */ //char *gapFile = "centro.tab"; /* Some colors. */ struct rgbColor rgbRed = {240,0,0}; struct rgbColor rgbRedOrange = {240, 100, 0}; struct rgbColor rgbYellow = {255, 255, 0}; static struct rgbColor blueGene = { 0, 0, 160}; void usage() /* Explain usage and exit. */ { errAbort( "foldDb - Search database info for making foldout\n" "usage:\n" " foldDb chrM chrN ...\n"); } struct chromGaps /* Placeholder for obsolete structure. */ { struct chromGaps *next; }; int gapOffset(struct chromGaps *gaps, int pos) /* Placefiller for old code that inserted gaps. (Now they are included in sequence itself.) */ { return 0; } void printTab(FILE *f, struct chromGaps *cg, char *chrom, int start, int end, char *track, char *type, int r, int g, int b, char *val) /* Print line to tab-delimited file. */ { static double colScale = 1.0/255.0; double red = r*colScale; double green = g*colScale; double blue = b*colScale; if (!sameString(track, "heterochromatin")) { int offset = gapOffset(cg, (start+end)/2); start += offset; end += offset; } fprintf(f, "%s\t%d\t%d\t%s\t%s\t%f,%f,%f\t%s\n", chrom, start, end, track, type, red, green, blue, val); } void printTabNum(FILE *f, struct chromGaps *cg, char *chrom, int start, int end, char *track, char *type, int r, int g, int b, double num) /* Print number-valued line to tab-delimited file. */ { char buf[32]; sprintf(buf, "%f", num); printTab(f, cg, chrom, start, end, track, type, r, g, b, buf); } void getHeterochromatin(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Print out centromere info (and other big gaps). */ { struct sqlResult *sr; char **row; char query[256]; struct cytoBand el; int r,g,b; char *stain; boolean inCen = FALSE, lastInCen = FALSE; boolean cenStart = BIGNUM, cenEnd = -BIGNUM; printf(" getting heterochromatin\n"); sqlSafef(query, sizeof query, "select * from cytoBand where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct rgbColor *col; cytoBandStaticLoad(row, &el); stain = el.gieStain; inCen = FALSE; if (startsWith("acen", stain)) { if (!lastInCen) cenStart = el.chromStart; cenEnd = el.chromEnd; inCen = TRUE; } if (lastInCen && !inCen) { col = &rgbRed; printTab(f, cg, chrom, cenStart, cenEnd, "heterochromatin", "box", col->r, col->g, col->b, "centromere"); } lastInCen = inCen; if (startsWith("hetero", stain)) { col = &rgbRedOrange; printTab(f, cg, chrom, el.chromStart, el.chromEnd, "heterochromatin", "box", col->r, col->g, col->b, "other"); } } sqlFreeResult(&sr); } struct resolvedDup /* A duplication that has been resolved by hand. */ { struct resolvedDup *next; /* Next in list. */ char *name; /* Name - allocated in hash. */ char *trueChrom; /* Real chromosome - allocated here. */ int trueStart; /* Start of real gene. */ int trueEnd; /* End of true gene. */ boolean used; /* True when used - can be used only once. */ }; void makeResolvedDupes(struct hash **retHash, struct resolvedDup **retList) /* Read in list of dupes that John resolved by hand. */ { struct hash *hash = newHash(6); struct lineFile *lf = lineFileOpen(bestDupFile, TRUE); char *words[32], *parts[4]; int wordCount, partCount; struct resolvedDup *rd, *rdList = NULL; char *bestDupFile = "bestDup.txt"; while ((wordCount = lineFileChop(lf, words)) > 0) { lineFileExpectWords(lf, 4, wordCount); AllocVar(rd); hashAddSaveName(hash, words[1], rd, &rd->name); partCount = chopString(words[3], ":-", parts, ArraySize(parts)); if (partCount != 3) errAbort("Misformed chromosome location line %d of %s", lf->lineIx, lf->fileName); rd->trueChrom = cloneString(parts[0]); rd->trueStart = atoi(parts[1]); rd->trueEnd = atoi(parts[2]); slAddHead(&rdList, rd); } slReverse(&rdList); *retHash = hash; *retList = rdList; uglyf("Got %d resolved duplications\n", slCount(rdList)); } struct omimInfo /* Info on OMIM gene. */ { struct omimInfo *next; /* Next in list. */ char *name; /* HUGO/OMIM name, allocated in hash. */ int uses; /* Number of times used. */ struct genePred *gpExtra; /* Extra info if it's on GenieAlt rather than GenieKnown table. */ }; void makeDiseaseHash(struct sqlConnection *conn, struct hash **retNameHash, struct hash **retOmimHash) /* Make hash of all disease genes. */ { struct hash *nameHash = newHash(12); struct hash *omimHash = newHash(12); struct lineFile *lf = lineFileOpen(diseaseFile, TRUE); char *row[3]; while (lineFileRow(lf, row)) { hashAdd(nameHash, row[1], NULL); hashAdd(omimHash, row[2], NULL); } lineFileClose(&lf); *retNameHash = nameHash; *retOmimHash = omimHash; } struct knownPos /* Position of a known gene. */ { struct knownPos *next; /* Next in list. */ char *name; /* Name. */ char *chrom; /* Chromosome. */ int chromStart; int chromEnd; int dupeCount; /* Number of times duplicated. */ }; int cmpGenePred(const void *va, const void *vb) /* Compare to sort based on chromosome, start. */ { const struct genePred *a = *((struct genePred **)va); const struct genePred *b = *((struct genePred **)vb); int dif; dif = strcmp(a->chrom, b->chrom); if (dif == 0) dif = a->txStart - b->txStart; return dif; } void getKnownGenes(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f, struct hash *hugoDupHash, struct hash *knownUniqHash, struct hash *resolvedDupHash, struct hash *diseaseNameHash, struct hash *diseaseOmimHash) /* Get info on known genes. */ { struct sqlResult *sr; char **row; char query[256]; struct genePred *gpList = NULL, *gp; char geneName[64]; static struct rgbColor diseaseColor = {255, 0, 0}; struct rgbColor *col; boolean keepGene; int lastStart = -BIGNUM; struct knownPos *kp; char omimIdAsString[16]; struct dlList *geneList = newDlList(); struct dlNode *node; struct knownGene *kg; bool isDisease; printf(" finding known genes for %s, chrom\n"); /* Get list of known genes. */ sqlSafef(query, sizeof query, "select * from genieKnown where chrom = '%s' order by cdsStart", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row); slAddHead(&gpList, gp); } sqlFreeResult(&sr); slSort(&gpList, cmpGenePred); /* Now find out name - either from disease gene hash or known table in database. */ for (gp = gpList; gp != NULL; gp = gp->next) { keepGene = FALSE; isDisease = FALSE; sqlSafef(query, sizeof query, "select * from knownMore where transId = '%s'", gp->name); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { struct knownMore km; knownMoreStaticLoad(row, &km); strcpy(geneName, km.name); if (km.omimId != 0) sprintf(omimIdAsString, "%d", km.omimId); else strcpy(omimIdAsString, "-"); if (!sameString(geneName, "n/a") ) { if (gp->cdsStart != lastStart) { keepGene = TRUE; lastStart = gp->cdsStart; } if (hashLookup(diseaseNameHash, geneName)) isDisease = TRUE; else if (hashLookup(diseaseOmimHash, omimIdAsString) || (km.omimName[0] != 0 && hashLookup(diseaseNameHash, km.omimName))) { if (km.omimName[0] != 0) strcpy(geneName, km.omimName); isDisease = TRUE; } } } sqlFreeResult(&sr); if (keepGene) { boolean showIt = FALSE; boolean doWarn = TRUE; struct resolvedDup *rd; if ((kp = hashFindVal(knownUniqHash, geneName)) == NULL) { showIt = TRUE; AllocVar(kp); hashAddSaveName(knownUniqHash, geneName, kp, &kp->name); kp->chrom = chrom; kp->chromStart = gp->cdsStart; kp->chromEnd = gp->cdsEnd; kp->dupeCount = 1; } else { kp->dupeCount += 1; } if ((rd = hashFindVal(resolvedDupHash, geneName)) != NULL) { if (sameString(chrom, rd->trueChrom) && gp->cdsStart == rd->trueStart && gp->cdsEnd == rd->trueEnd) showIt = TRUE; else showIt = FALSE; doWarn = FALSE; uglyf("Resolving dup %s (%d) at %s:%d-%d\n", geneName, showIt, chrom, gp->cdsStart, gp->cdsEnd); } if (kp != NULL && !showIt) { if (kp->chrom != chrom || kp->chromStart > gp->cdsStart + 300000 || kp->chromStart < gp->cdsStart - 300000) { if (doWarn) { fprintf(dupeFile, "%s\t%s\t%4.1f\n", geneName, chrom, 0.000001*kp->chromStart); warn("Duplicate %s at %s:%d-%d and %s:%d-%d, skipping all but first,", geneName, kp->chrom, kp->chromStart, kp->chromEnd, chrom, gp->cdsStart, gp->cdsEnd); } } } if (showIt) { AllocVar(kg); kg->name = cloneString(geneName); strcpy(kg->omimIdString, omimIdAsString); kg->chrom = hgOfficialChromName(chrom); kg->start = gp->cdsStart; kg->end = gp->cdsEnd; kg->isDisease = isDisease; dlAddValTail(geneList, kg); } } } ntPatchChromosome(chrom, geneList); for (node = geneList->head; !dlEnd(node); node = node->next) { int len; char *name; kg = node->val; name = kg->name; len = strlen(name); if (kg->isDisease || hashLookup(diseaseNameHash, name) || hashLookup(diseaseOmimHash, kg->omimIdString)) col = &diseaseColor; else col = &blueGene; if (len < 8 && !sameString(name, "pseudo") && !startsWith("DKFZ", name) && !startsWith("KIAA", name) && !(startsWith("LOC", name) && len > 5)) { printTab(f, cg, chrom, kg->start, kg->end, "hugo", "word", col->r, col->g, col->b, kg->name); } } freeDlList(&geneList); } struct tickPos /* A tick position */ { struct tickPos *next; int start, end; char *val; }; void getPredGenes(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f, char *table, int red, int green, int blue) /* Get predicted genes. */ { struct sqlResult *sr; char **row; char query[256]; struct genePred *gp; printf(" Getting %s predicted genes\n", table); sqlSafef(query, sizeof query, "select * from %s where chrom = '%s' order by cdsStart", table, chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { gp = genePredLoad(row); printTab(f, cg, chrom, gp->cdsStart, gp->cdsEnd, "genePred", "tick", red, green, blue, "."); genePredFree(&gp); } sqlFreeResult(&sr); } void getCpgIslands(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Get cpgIslands. */ { struct sqlResult *sr; char **row; char query[256]; struct cpgIsland el; sqlSafef(query, sizeof query, "select * from cpgIsland where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { cpgIslandStaticLoad(row, &el); printTab(f, cg, chrom, el.chromStart, el.chromEnd, "cpgIsland", "tick", 0, 120, 0, "."); } sqlFreeResult(&sr); } void getExofishHits(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Get Exofis stuff. */ { struct sqlResult *sr; char **row; char query[256]; struct exoFish el; sqlSafef(query, sizeof query, "select * from exoFish where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { exoFishStaticLoad(row, &el); printTab(f, cg, chrom, el.chromStart, el.chromEnd, "tetraodon", "tick", 0, 70, 120, "."); } sqlFreeResult(&sr); } void getRnaGenes(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Get RNA gene stuff. */ { struct sqlResult *sr; char **row; char query[256]; struct rnaGene el; int r,g,b; sqlSafef(query, sizeof query, "select * from rnaGene where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { rnaGeneStaticLoad(row, &el); if (el.isPsuedo) { r = 255; g = 200; b = 150; } else { r = 120; g = 60; b = 0; } printTab(f, cg, chrom, el.chromStart, el.chromEnd, "rnaGenes", "tick", r, g, b, el.name); } sqlFreeResult(&sr); } void getTetHits(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Get tetroadon hits. */ { struct sqlResult *sr; char **row; char query[256]; struct wabAli el; sqlSafef(query, sizeof query, "select * from waba_tet where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { wabAliStaticLoad(row, &el); printTab(f, cg, chrom, el.chromStart, el.chromEnd, "tetraodon", "tick", 0, 70, 120, "."); } sqlFreeResult(&sr); } void getEst3Ticks(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Get est 3' ends. */ { struct sqlResult *sr; char **row; char query[256]; struct est3 *el; sqlSafef(query, sizeof query, "select * from %s where chrom = '%s'", "est3", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { el = est3Load(row); printTab(f, cg, chrom, el->chromStart, el->chromEnd, "est3", "tick", 0, 0, 0, "."); est3Free(&el); } sqlFreeResult(&sr); } void getEstIntronTicks(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Get ests with introns. */ { struct sqlResult *sr; char **row; char query[256]; struct psl *psl; int lastEnd = -BIGNUM; int sepDistance = 20000; sqlSafef(query, sizeof query, "select * from %s_intronEst", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { psl = pslLoad(row); if (psl->tStart > lastEnd + sepDistance) printTab(f, cg, chrom, psl->tStart, psl->tStart+1, "est3", "tick", 0, 0, 0, "."); lastEnd = psl->tEnd; pslFree(&psl); } sqlFreeResult(&sr); } void getEstTicks(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) { if (sqlTableExists(conn, "est3")) getEst3Ticks(cg, chrom, conn, f); else getEstIntronTicks(cg,chrom,conn, f); } void getGc(struct chromGaps *cg, char *chrom, struct sqlConnection *conn, FILE *f) /* Write wiggle track for GC. */ { struct sqlResult *sr; char **row; char query[256]; struct gcPercent *el; printf(" getting GC wiggle\n"); sqlSafef(query, sizeof query, "select * from %s where chrom = '%s'", "gcPercent", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { el = gcPercentLoad(row); if (el->gcPpt != 0) { double scale; scale = (el->gcPpt - 300)*0.00300; if (scale > 1) scale = 1; if (scale < 0) scale = 0; printTabNum(f, cg, chrom, el->chromStart, el->chromEnd, "gcPercent", "wiggle", 0, 0, 0, scale); } gcPercentFree(&el); } sqlFreeResult(&sr); } void addSegTotals(int chromStart, int chromEnd, int *countArray, int windowSize, int chromSize) /* Store the total number of bases in each window between chromStart and chromEnd * in the appropriate countArray element. */ { int startWinIx = chromStart/windowSize; int endWinIx = (chromEnd-1)/windowSize; int winBoundary; int winIx; int size; if (chromStart < 0 || chromStart > chromSize) { warn("Clipping start %d (%d) in addSegTotals\n", chromStart, chromSize); return; } if (chromEnd < 0 || chromEnd > chromSize || chromStart >= chromEnd) { warn("Clipping end %d (%d) in addSegTotals\n", chromStart, chromSize); return; } if (startWinIx == endWinIx) { countArray[startWinIx] += (chromEnd - chromStart); return; } winBoundary = (startWinIx+1)*windowSize; size = winBoundary - chromStart; assert(size > 0); assert(size <= windowSize); countArray[startWinIx] += size; for (winIx = startWinIx+1; winIx < endWinIx; ++winIx) { countArray[winIx] += windowSize; } winBoundary = (endWinIx)*windowSize; size = chromEnd - winBoundary; assert(size > 0); assert(size <= windowSize); countArray[endWinIx] += size; } void getRepeatDensity(struct chromGaps *cg, char *chrom, int chromSize, struct sqlConnection *conn, FILE *f, char *repClass, int windowSize, double scale, int red, int green, int blue) /* Put out repeat density info. */ { struct sqlResult *sr; char **row; char query[256]; struct rmskOut rmsk; struct agpFrag frag; int winsPerChrom = (chromSize + windowSize-1)/windowSize; int *winBases; int *winRepBases; int i; int baseCount, repCount; double density; double minDen = 1.0, maxDen = 0.0; int winStart, winEnd; printf(" Getting %s repeats in %d windows of size %d\n", repClass, winsPerChrom, windowSize); fflush(stdout); /* Count up repeats in each window. */ AllocArray(winRepBases, winsPerChrom); sqlSafef(query, sizeof query, "select * from %s_rmsk where repClass = '%s'", chrom, repClass); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { rmskOutStaticLoad(row, &rmsk); addSegTotals(rmsk.genoStart, rmsk.genoEnd, winRepBases, windowSize, chromSize); } sqlFreeResult(&sr); /* Count up bases in each window. */ AllocArray(winBases, winsPerChrom); sqlSafef(query, sizeof query, "select * from %s_gold", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { agpFragStaticLoad(row, &frag); addSegTotals(frag.chromStart, frag.chromEnd, winBases, windowSize, chromSize); } sqlFreeResult(&sr); /* Output density. */ for (i=0; i windowSize/3) { repCount = winRepBases[i]; density = (double)repCount/(double)baseCount; if (density < minDen) minDen = density; if (density > maxDen) maxDen = density; density *= scale; if (density > 1.0) density = 1.0; winStart = i*windowSize; winEnd = winStart + windowSize; if (winEnd > chromSize) winEnd = chromSize; printTabNum(f, cg, chrom, winStart, winEnd, repClass, "wiggle", red, green, blue, density); } } printf(" %s minDen %f, maxDen %f\n", chrom, minDen, maxDen); freeMem(winBases); freeMem(winRepBases); } #define finCov 100 struct rgbColor draftCols[10]; void initDraftCols() /* Initialize draft colors with a range. */ { int i; int dif; int maxCol = ArraySize(draftCols)-1; double scale; for (i=0; i<= maxCol; ++i) { scale = (double)i/(double)maxCol; dif = rgbRedOrange.r - rgbYellow.r; draftCols[i].r = rgbYellow.r + round(dif*scale); dif = rgbRedOrange.g - rgbYellow.g; draftCols[i].g = rgbYellow.g + round(dif*scale); dif = rgbRedOrange.b - rgbYellow.b; draftCols[i].b = rgbYellow.b + round(dif*scale); } } void makeBackupCoverageHash(struct sqlConnection *conn, struct hash *hash) /* Make hash table that describe coverage of each clone just based on * whether it's draft/predraft/etc. Only do this if clone not already * in hash. */ { char query[512]; struct clonePos cp; struct sqlResult *sr; char **row; int coverage; char stage; sqlSafef(query, sizeof query, "select * from clonePos"); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { clonePosStaticLoad(row, &cp); stage = cp.stage[0]; if (stage == 'D') coverage = 5; else if (stage == 'F') coverage = finCov; else if (stage == 'P') coverage = 2; else errAbort("Unknown stage %c for clone %s", stage, cp.name); chopSuffix(cp.name); if (!hashLookup(hash, cp.name)) hashAdd(hash, cp.name, intToPt(coverage)); } sqlFreeResult(&sr); } void makeMainCoverageHash(char *fileName, struct hash *hash) /* Scan file for clone coverage. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *words[16]; int wordCount; double d; int rd; char *acc; while ((wordCount = lineFileChop(lf, words)) != 0) { lineFileExpectWords(lf, wordCount, 2); rd = round(atof(words[1])); if (rd > 20) rd = 20; hashAddUnique(hash, words[0], intToPt(rd)); } lineFileClose(&lf); } struct hash *makeCoverageHash(struct sqlConnection *conn) /* Get coverage hash, making it if necessary. */ { static struct hash *hash = NULL; if (hash == NULL) { hash = newHash(16); makeMainCoverageHash(covFile, hash); makeBackupCoverageHash(conn, hash); } return hash; } void covInWin(struct hash *covHash, struct chromGaps *cg, char *chrom, int start, int end, struct gl *glList, FILE *f) /* Process coverage in a window of chromosome. */ { int size = end-start; UBYTE *cov = needLargeZeroedMem(size+1); /* A byte for each base in window. */ struct gl *gl; int s, e; int coverage; char cloneName[64]; void *covAsPt; int i,c; int startIx, runSize; UBYTE val, lastVal; for (gl = glList; gl != NULL; gl = gl->next) { if ((s = gl->start) < end && (e = gl->end) > start) { if (s < start) s = start; if (e > end) e = end; assert(sfrag); chopSuffix(cloneName); covAsPt = hashMustFindVal(covHash, cloneName); coverage = ptToInt(covAsPt); for (i=s; i finCov) c = finCov; cov[i] = c; } } } /* Add sentinal value to simplify run-length encoding loop. */ cov[size] = finCov+1; startIx = 0; lastVal = cov[0]; for (i=1; i<=size; ++i) { if ((val = cov[i]) != lastVal) { if (lastVal != 0) { struct rgbColor *col; runSize = i-startIx; if (lastVal == finCov) col = &rgbRed; else if (lastVal >= ArraySize(draftCols)) col = &rgbRedOrange; else col = &draftCols[lastVal-1]; s = startIx + start; e = i + start; printTabNum(f, cg, chrom, s, e, "coverage", "box", col->r, col->g, col->b, lastVal); } startIx = i; lastVal = val; } } freeMem(cov); } void getCoverage(struct chromGaps *cg, char *chrom, int chromSize, struct sqlConnection *conn, FILE *f) /* Make coverage track. */ { struct hash *covHash = makeCoverageHash(conn); char query[512]; struct gl *glList = NULL, *gl; struct sqlResult *sr; char **row; int coverage; int start, end; int winSize = 1024*1024*64; printf(" getting coverage\n"); initDraftCols(); sqlSafef(query, sizeof query, "select * from %s_gl", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { gl = glLoad(row); slAddHead(&glList, gl); } sqlFreeResult(&sr); slReverse(&glList); for (start = 0; start < chromSize; start = end) { end = start + winSize; if (end > chromSize) end = chromSize; covInWin(covHash, cg, chrom, start, end, glList, f); } glFreeList(&glList); } void getEnsTicks(struct chromGaps *cg, char *chrom, int chromSize, FILE *f, char *tabName) /* Get Ensembl gene ticks from tab separated file. */ { struct lineFile *lf = lineFileOpen(tabName, TRUE); char *words[8]; int wordCount; int start; printf(" Getting Ensembl gene ticks\n"); while ((wordCount = lineFileChop(lf, words)) != 0) { lineFileExpectWords(lf, 4, wordCount); if (sameString(words[0], chrom)) { start = atoi(words[1]); printTab(f, cg, chrom, start, start+1, "genePred", "tick", 150, 0, 0, "."); } } lineFileClose(&lf); } double nRatio(struct sqlConnection *conn, char *chrom, int chromStart, int chromEnd) /* Return percentage of N's in range. */ { char query[256]; struct sqlResult *sr; char **row; struct agpGap gap; int baseTotal = chromEnd - chromStart; int nCount = 0, s, e, size; sqlSafef(query, sizeof query, "select * from %s_gap where chromStart < %d and chromEnd > %d", chrom, chromEnd, chromStart); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { agpGapStaticLoad(row, &gap); s = gap.chromStart; if (s < chromStart) s = chromStart; e = gap.chromEnd; if (e > chromEnd) e = chromEnd; size = e - s; nCount += size; } sqlFreeResult(&sr); return (double)nCount/(double)baseTotal; } int countSnps(struct sqlConnection *conn, char *chrom, int chromStart, int chromEnd) /* Count the number of SNPs in window. */ { char query[256]; sqlSafef(query, sizeof query, "select count(*) from snpTsc where chrom = '%s' and chromStart < %d and chromEnd > %d", chrom, chromEnd, chromStart); return sqlQuickNum(conn, query); } void slowGetSnpDensity(struct chromGaps *cg, char *chrom, int chromSize, struct sqlConnection *conn, FILE *f) /* Make SNP density track. */ { int windowSize = 50000; int s, e; int size; double ratio, totalRatio = 0; double maxCount = 0, minCount=windowSize*2, totalCount = 0; for (s = 0; s < chromSize; s += 50000) { e = s + windowSize; if (e > chromSize) e = chromSize; size = e - s; if (size > windowSize/2) { ratio = 1.0 - nRatio(conn, chrom, s, e); uglyf("%s:%d-%d ratio %f\n", chrom, s, e, ratio); if (ratio >= 0.5) { int rawCount = countSnps(conn, chrom, s, e); double count = rawCount/ratio; uglyf(" rawCount %d, count %f\n", rawCount, count); if (count < minCount) minCount = count; if (count > maxCount) maxCount = count; totalRatio += ratio; totalCount += count; } } } printf("%s SNP density in %d windows is between %f and %f, average %f\n", chrom, windowSize, minCount, maxCount, totalCount/totalRatio); } void getSnpDensity(struct chromGaps *cg, char *chrom, int chromSize, struct sqlConnection *conn, FILE *f) /* Put out SNP density info. */ { int windowSize = 50000; double scale = 500.0; struct sqlResult *sr; char **row; char query[256]; struct snp snp; struct agpFrag frag; int winsPerChrom = (chromSize + windowSize-1)/windowSize; int *winBases; int *winRepBases; int i; int baseCount, repCount; double density; double minDen = 1.0, maxDen = 0.0; int winStart, winEnd; printf(" Getting SNPs in %d windows of size %d\n", winsPerChrom, windowSize); fflush(stdout); /* Count up SNPs in each window. */ AllocArray(winRepBases, winsPerChrom); sqlSafef(query, sizeof query, "select * from snpTsc where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { snpStaticLoad(row, &snp); addSegTotals(snp.chromStart, snp.chromEnd, winRepBases, windowSize, chromSize); } sqlFreeResult(&sr); /* Count up bases in each window. */ AllocArray(winBases, winsPerChrom); sqlSafef(query, sizeof query, "select * from %s_gold", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { agpFragStaticLoad(row, &frag); addSegTotals(frag.chromStart, frag.chromEnd, winBases, windowSize, chromSize); } sqlFreeResult(&sr); /* Output density. */ for (i=0; i windowSize/2) { repCount = winRepBases[i]; density = (double)repCount/(double)baseCount; if (density < minDen) minDen = density; if (density > maxDen) maxDen = density; density *= scale; if (density > 1.0) density = 1.0; winStart = i*windowSize; winEnd = winStart + windowSize; if (winEnd > chromSize) winEnd = chromSize; printTabNum(f, cg, chrom, winStart, winEnd, "SNP", "wiggle", 128, 0, 128, density); } } printf(" %s minDen %f, maxDen %f\n", chrom, minDen, maxDen); freeMem(winBases); freeMem(winRepBases); } void getBands(struct chromGaps *cg, char *chrom, int chromSize, struct sqlConnection *conn, FILE *f) /* Get chromosome bands stuff. */ { struct sqlResult *sr; char **row; char query[256]; struct cytoBand el; int r,g,b; char *stain; printf(" getting bands\n"); sqlSafef(query, sizeof query, "select * from cytoBand where chrom = '%s'", chrom); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { cytoBandStaticLoad(row, &el); stain = el.gieStain; if (startsWith("gneg", stain)) { r = g = b = 220; } else if (startsWith ("gpos", stain)) { int percentage = 100; stain += 4; if (isdigit(stain[0])) percentage = atoi(stain); r = g = b = round((100.0-percentage)*200); } else if (startsWith("gvar", stain)) { r = 180; g = 180; b = 120; } else { r = 150; g = 50; b = 50; } printTab(f, cg, chrom, el.chromStart, el.chromEnd, "band", "box", r, g, b, el.name); } sqlFreeResult(&sr); } void oneChrom(char *chrom, struct sqlConnection *conn, struct hash *hugoDupHash, struct hash *knownUniqHash, struct hash *resolvedDupHash, struct hash *diseaseNameHash, struct hash *diseaseOmimHash, FILE *f) /* Get info for one chromosome. */ { int chromSize = hChromSize(chrom); struct chromGaps *cg = NULL; printf("Processing %s\n", chrom); getHeterochromatin(cg, chrom, conn, f); getBands(cg, chrom, chromSize, conn, f); getCoverage(cg, chrom, chromSize, conn, f); getGc(cg, chrom, conn, f); getRepeatDensity(cg, chrom, chromSize, conn, f, "SINE", 100000, 1.0/0.66, 255, 0, 0); getRepeatDensity(cg, chrom, chromSize, conn, f, "LINE", 100000, 1.0/0.66, 0, 0, 255); getSnpDensity(cg, chrom, chromSize, conn, f); getRnaGenes(cg, chrom, conn, f); getCpgIslands(cg, chrom, conn, f); getExofishHits(cg,chrom,conn,f); //getTetHits(cg, chrom, conn, f); /* WABA Tet alignments. */ getEstTicks(cg, chrom, conn, f); getPredGenes(cg, chrom, conn, f, "genieAlt", 150, 0, 0); //getPredGenes(cg, chrom, conn, f, "ensGene", 150, 0, 0); /* Ensemble full predictions. */ getEnsTicks(cg, chrom, chromSize, f, ensTick); getPredGenes(cg, chrom, conn, f, "genieKnown", blueGene.r, blueGene.g, blueGene.b); getKnownGenes(cg, chrom, conn, f, hugoDupHash, knownUniqHash, resolvedDupHash, diseaseNameHash, diseaseOmimHash); } void getHugoDupes(struct sqlConnection *conn, struct hash *hugoDupHash) /* Read in all hugo genes and save the duplicated ones in hugoDupHash */ { struct hash *hash = newHash(0); struct sqlResult *sr; char **row; char *name; struct slName *txIdList = NULL, *txId; char query[256]; char geneName[256]; printf("Looking for duplicated HUGO genes:\n"); -sr = sqlGetResult(conn, NOSQLINJ "select name from genieKnown"); +char query[1024]; +sqlSafef(query, sizeof query, "select name from genieKnown"); +sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { txId = newSlName(row[0]); slAddHead(&txIdList, txId); } sqlFreeResult(&sr); for (txId = txIdList; txId != NULL; txId = txId->next) { sqlSafef(query, sizeof query, "select name from knownMore where transId = '%s'", txId->name); if (sqlQuickQuery(conn, query, geneName, sizeof(geneName))) { if (hashLookup(hash, geneName) && !hashLookup(hugoDupHash, geneName)) { printf(" %s\n", geneName); hashStore(hugoDupHash, geneName); } else hashAdd(hash, geneName, NULL); } } freeHash(&hash); } void foldDb(int chromCount, char *chromNames[]) /* foldDb - Search database info for making foldout. */ { int i; char *database = "oo21"; struct sqlConnection *conn = sqlConnect(database); struct hash *hugoDupHash = newHash(0); struct hash *knownUniqHash = newHash(0); struct hash *resolvedDupHash = NULL; struct resolvedDup *rdList = NULL; struct hash *diseaseNameHash = NULL; struct hash *diseaseOmimHash = NULL; dupeFile = mustOpen(dupeFileName, "w"); hSetDb(database); loadNtPatchFiles(); // setupHugeGaps(conn); getHugoDupes(conn, hugoDupHash); makeDiseaseHash(conn, &diseaseNameHash, &diseaseOmimHash); makeResolvedDupes(&resolvedDupHash, &rdList); for (i=0; i