44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for SQL correct use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/featureBits/featureBits.c src/hg/featureBits/featureBits.c index 363ec5c..b658645 100644 --- src/hg/featureBits/featureBits.c +++ src/hg/featureBits/featureBits.c @@ -1,1034 +1,1039 @@ /* featureBits - Correlate tables via bitmap projections and booleans. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "jksql.h" #include "hdb.h" #include "fa.h" #include "bits.h" #include "bed.h" #include "psl.h" #include "portable.h" #include "featureBits.h" #include "agpGap.h" #include "chain.h" #include "chromInfo.h" static struct optionSpec optionSpecs[] = /* command line option specifications */ { {"bed", OPTION_STRING}, {"fa", OPTION_STRING}, {"faMerge", OPTION_BOOLEAN}, {"minSize", OPTION_INT}, {"chrom", OPTION_STRING}, {"or", OPTION_BOOLEAN}, {"not", OPTION_BOOLEAN}, {"countGaps", OPTION_BOOLEAN}, {"noRandom", OPTION_BOOLEAN}, {"noHap", OPTION_BOOLEAN}, {"primaryChroms", OPTION_BOOLEAN}, {"dots", OPTION_INT}, {"minFeatureSize", OPTION_INT}, {"enrichment", OPTION_BOOLEAN}, {"where", OPTION_STRING}, {"bin", OPTION_STRING}, {"chromSize", OPTION_STRING}, {"binSize", OPTION_INT}, {"binOverlap", OPTION_INT}, {"bedRegionIn", OPTION_STRING}, {"bedRegionOut", OPTION_STRING}, {"countBlocks", OPTION_BOOLEAN}, {NULL, 0} }; int minSize = 1; /* Minimum size of feature. */ char *clChrom = "all"; /* Which chromosome. */ boolean orLogic = FALSE; /* Do ors instead of ands? */ boolean notResults = FALSE; /* negate results? */ char *where = NULL; /* Extra selection info. */ char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ boolean countGaps = FALSE; /* Count gaps in denominator? */ boolean noRandom = FALSE; /* Exclude _random chromosomes? */ boolean noHap = FALSE; /* Exclude _hap|_alt chromosomes? */ boolean primaryChroms = FALSE; /* only include primary chroms */ int dots = 0; /* print dots every N chroms (scaffolds) processed */ boolean calcEnrichment = FALSE; /* Calculate coverage/enrichment? */ int binSize = 500000; /* Default bin size. */ int binOverlap = 250000; /* Default bin size. */ boolean countBlocks = FALSE; /* Count blocks in bed12 rather than extent. */ /* to process chroms without constantly looking up in chromInfo, create * this list of them from the chromInfo once. */ static struct chromInfo *chromInfoList = NULL; static struct hash *gapHash = NULL; void usage() /* Explain usage and exit. */ { errAbort( "featureBits - Correlate tables via bitmap projections. \n" "usage:\n" " featureBits database table(s)\n" "This will return the number of bits in all the tables bitwise ANDed together\n" "Pipe warning: output goes to stderr.\n" "Options:\n" " -bed=output.bed Put intersection into bed format. Can use stdout.\n" " -fa=output.fa Put sequence in intersection into .fa file\n" " -faMerge For fa output merge overlapping features.\n" " -minSize=N Minimum size to output (default 1)\n" " -chrom=chrN Restrict to one chromosome\n" " -chromSize=sizefile Read chrom sizes from file instead of database. \n" " (chromInfo three column format)\n" " -or Bitwise OR tables together instead of ANDing them.\n" " -not Output negation of resulting bit set.\n" " -countGaps Count gaps in denominator\n" " -countBlocks Count blocks in bed12 files rather than entire extent.\n" " -noRandom Don't include _random (or Un) chromosomes\n" " -noHap Don't include _hap|_alt chromosomes\n" " -primaryChroms Primary assembly (chroms without '_' in name)\n" " -dots=N Output dot every N chroms (scaffolds) processed\n" " -minFeatureSize=n Don't include bits of the track that are smaller than\n" " minFeatureSize, useful for differentiating between\n" " alignment gaps and introns.\n" " -bin=output.bin Put bin counts in output file\n" " -binSize=N Bin size for generating counts in bin file (default 500000)\n" " -binOverlap=N Bin overlap for generating counts in bin file (default 250000)\n" " -bedRegionIn=input.bed Read in a bed file for bin counts in specific regions \n" " and write to bedRegionsOut\n" " -bedRegionOut=output.bed Write a bed file of bin counts in specific regions \n" " from bedRegionIn\n" " -enrichment Calculates coverage and enrichment assuming first table\n" " is reference gene track and second track something else\n" " Enrichment is the amount of table1 that covers table2 vs. the\n" " amount of table1 that covers the genome. It's how much denser\n" " table1 is in table2 than it is genome-wide.\n" " '-where=some sql pattern' Restrict to features matching some sql pattern\n" "You can include a '!' before a table name to negate it.\n" " To prevent your shell from interpreting the '!' you will need\n" " to use the backslash \\!, for example the gap table: \\!gap\n" "Some table names can be followed by modifiers such as:\n" " :exon:N Break into exons and add N to each end of each exon\n" " :cds Break into coding exons\n" " :intron:N Break into introns, remove N from each end\n" " :utr5, :utr3 Break into 5' or 3' UTRs\n" " :upstream:N Consider the region of N bases before region\n" " :end:N Consider the region of N bases after region\n" " :score:N Consider records with score >= N \n" " :upstreamAll:N Like upstream, but doesn't filter out genes that \n" " have txStart==cdsStart or txEnd==cdsEnd\n" " :endAll:N Like end, but doesn't filter out genes that \n" " have txStart==cdsStart or txEnd==cdsEnd\n" "The tables can be bed, psl, or chain files, or a directory full of\n" "such files as well as actual database tables. To count the bits\n" "used in dir/chrN_something*.bed you'd do:\n" " featureBits database dir/_something.bed\n" "File types supported are BED, bigBed, PSL, and chain. The suffix of the file \n" "is used to determine the type and MUST be .bed, .bb, .psl, or .chain respectively.\n" "NB: by default, featureBits omits gap regions from its calculation of the total\n" "number of bases. This requires connecting to a database server using credentials\n" "from a .hg.conf file (or similar). If such a connection is not available, you will\n" "need to specify -countGaps (which skips the database connection) in addition to\n" "providing all tables as files or directories.\n" ); } static struct chromInfo *fbCreateChromInfoList(char *name, char *database) /* Load up all chromosome infos. */ { struct sqlConnection *conn = sqlConnect(database); struct sqlResult *sr = NULL; char **row; int loaded=0; struct chromInfo *ret = NULL; unsigned totalSize = 0; if (sameWord(name, "all")) - sr = sqlGetResult(conn, NOSQLINJ "select * from chromInfo"); + { + char query[1024]; + sqlSafef(query, sizeof query, "select * from chromInfo"); + sr = sqlGetResult(conn, query); + } else { char select[256]; sqlSafef(select, ArraySize(select), "select * from chromInfo where chrom='%s'", name); sr = sqlGetResult(conn, select); } while ((row = sqlNextRow(sr)) != NULL) { struct chromInfo *el; struct chromInfo *ci; AllocVar(ci); el = chromInfoLoad(row); ci->chrom = cloneString(el->chrom); ci->size = el->size; totalSize += el->size; slAddHead(&ret, ci); ++loaded; } if (loaded < 1) errAbort("ERROR: can not find chrom name: '%s'\n", name); slReverse(&ret); if (sameWord(name, "all")) verbose(2, "#\tloaded size info for %d chroms, total size: %u\n", loaded, totalSize); sqlFreeResult(&sr); sqlDisconnect(&conn); return ret; } static boolean hasSuffixCompress(char *file, char *suffix, char *compSuffix) /* determine if file ends with .suffix.compSuffix */ { char sbuf[64]; safef(sbuf, sizeof(sbuf), "%s.%s", suffix, compSuffix); return endsWith(file, sbuf); } boolean isFileType(char *file, char *suffix) /* determine if file ends with .suffix, or .suffix.{gz,Z,bz2} */ { char cleaned[PATH_LEN], *p; /* remove any : qualifiers */ strcpy(cleaned, file); p = strrchr (cleaned, '/'); if (p == NULL) p = cleaned; p = strchr(p, ':'); if (p != NULL) *p = '\0'; if (endsWith(cleaned, suffix)) return TRUE; else if (hasSuffixCompress(cleaned, suffix, "gz")) return TRUE; else if (hasSuffixCompress(cleaned, suffix, "Z")) return TRUE; else if (hasSuffixCompress(cleaned, suffix, "bz2")) return TRUE; else return FALSE; } bool inclChrom(char *name) /* check if a chromosome should be included */ { return !((noRandom && (endsWith(name, "_random") || startsWith("chrUn", name) || sameWord("chrNA", name) /* danRer */ || sameWord("chrU", name))) /* dm */ || (noHap && haplotype(name)) || (primaryChroms && (strchr(name, '_') != NULL))); } void bitsToBins(Bits *bits, char *chrom, int chromSize, FILE *binFile, int binSize, int binOverlap) /* Write out binned counts of bits. */ { int bin, count; if (!binFile) return; for (bin=0; bin+binSize<chromSize; bin=bin+binOverlap) { count = bitCountRange(bits, bin, binSize); fprintf(binFile, "%s\t%d\t%d\t%d\t%s.%d\n", chrom, bin, bin+binSize, count, chrom, bin/binOverlap+1); } count = bitCountRange(bits, bin, chromSize-bin); fprintf(binFile, "%s\t%d\t%d\t%d\t%s.%d\n", chrom, bin, chromSize, count, chrom, bin/binOverlap+1); } void bitsToRegions(Bits *bits, char *chrom, int chromSize, struct bed *bedList, FILE *bedOutFile) /* Write out counts of bits in regions defined by bed elements. */ { struct bed *bl=NULL; int count, i=0; if (!bedOutFile) return; for (bl=bedList; bl!=NULL; bl=bl->next) { if(differentString(bl->chrom,chrom)) continue; count = bitCountRange(bits, bl->chromStart, bl->chromEnd-bl->chromStart); fprintf(bedOutFile, "%s\t%d\t%d\t%d\t%s.%d\n", chrom, bl->chromStart, bl->chromEnd, count, chrom, ++i); } } void check(struct sqlConnection *conn, char *table) /* Check it's as planned. */ { char query[256], **row; struct sqlResult *sr; int lastEnd = -1, lastStart = -1, start, end; sqlSafef(query, sizeof query, "select chromStart,chromEnd from %s", table); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { start = atoi(row[0]); end = atoi(row[1]); if (start < lastStart) fprintf(stderr,"Out of order: %d,%d\n", lastStart, start); if (rangeIntersection(lastStart, lastEnd, start-1, end) > 0) fprintf(stderr,"Overlapping: (%d %d) (%d %d)\n", lastStart, lastEnd, start, end); lastStart = start; lastEnd = end; } sqlFreeResult(&sr); errAbort("All for now"); } char *chromFileName(char *track, char *chrom, char fileName[512]) /* Return chromosome specific version of file if it exists. */ { if (fileExists(track)) { strncpy(fileName, track, 512); } else { char dir[256], root[128], ext[65]; int len; splitPath(track, dir, root, ext); /* Chop trailing / off of dir. */ len = strlen(dir); if (len > 0 && dir[len-1] == '/') dir[len-1] = 0; safef(fileName, 512, "%s/%s%s%s", dir, chrom, root, ext); if (!fileExists(fileName)) { warn("Couldn't find %s or %s", track, fileName); strncpy(fileName, track, 512); } } return fileName; } void outOfRange(struct lineFile *lf, char *chrom, int chromSize) /* Complain that coordinate is out of range. */ { errAbort("Coordinate out of allowed range [%d,%d) for %s near line %d of %s", 0, chromSize, chrom, lf->lineIx, lf->fileName); } void setPslBits(struct lineFile *lf, Bits *bits, struct psl *psl, int winStart, int winEnd) /* Set bits that are in psl. */ { int i, s, e, w, blockCount = psl->blockCount; boolean isRev = (psl->strand[1] == '-'); for (i=0; i<blockCount; ++i) { s = psl->tStarts[i]; e = s + psl->blockSizes[i]; if (isRev) { /* Use w as a temp variable to reverse coordinates s&e. */ w = psl->tSize - e; e = psl->tSize - s; s = w; } /* Clip, and if anything left set it. */ if (s < winStart) outOfRange(lf, psl->tName, psl->tSize); if (e > winEnd) outOfRange(lf, psl->tName, psl->tSize); w = e - s; if (w > 0) bitSetRange(bits, s, w); } } void fbOrPsl(Bits *acc, char *track, char *chrom, int chromSize) /* Or in bits of psl file that correspond to chrom. */ { struct lineFile *lf; char fileName[512]; struct psl *psl; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; lf = pslFileOpen(fileName); while ((psl = pslNext(lf)) != NULL) { if (sameString(psl->tName, chrom)) setPslBits(lf, acc, psl, 0, chromSize); pslFree(&psl); } lineFileClose(&lf); } static int howManyFields(char *fileName) /* Figure out how many fields the first bed has in this file. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); char *line, *row[256]; unsigned numFields = 0; if (lineFileNextReal(lf, &line)) numFields = chopByWhite(line, row, ArraySize(row)); lineFileClose(&lf); return numFields; } static struct bed *cacheBeds(char *fileName) /* Read in the beds from this file and keep them in a cache. */ { static char *cachedFile; static struct bed *cachedBeds; if ((cachedFile == NULL) || differentString(cachedFile, fileName)) { if (cachedBeds) { bedFreeList(&cachedBeds); freez(&cachedFile); } cachedFile = cloneString(fileName); unsigned numFields = howManyFields(cachedFile); cachedBeds = bedLoadNAllChrom(fileName, numFields > 12 ? 12 : numFields, NULL); } return cachedBeds; } void fbOrBed(Bits *acc, char *track, char *chrom, int chromSize) /* Or in bits of bed file that correspond to chrom. */ { char fileName[512]; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; struct bed *bed, *beds = cacheBeds(fileName); for(bed = beds; bed; bed = bed->next) { if (sameString(bed->chrom, chrom)) { if (countBlocks && (bed->blockCount > 0)) { int ii; for(ii=0; ii < bed->blockCount; ii++) bitSetRange(acc, bed->chromStart + bed->chromStarts[ii], bed->blockSizes[ii]); } else { int w; w = bed->chromEnd - bed->chromStart; if (w > 0) bitSetRange(acc, bed->chromStart, w); } } } } void fbOrBigBed(Bits *acc, char *track, char *chrom, int chromSize) /* Or in a bigBed file. */ { char fileName[512]; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; struct lm *lm = lmInit(0); struct bbiFile *bbi = bigBedFileOpen(fileName); unsigned fieldCount = bbi->definedFieldCount; char *bedRow[fieldCount]; struct bigBedInterval *interval, *intervalList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (interval = intervalList; interval != NULL; interval = interval->next) { if (countBlocks && (fieldCount >= 12)) { char startBuf[16], endBuf[16]; bigBedIntervalToRow(interval, chrom, startBuf, endBuf, bedRow, ArraySize(bedRow)); struct bed *bed = bedLoadN(bedRow, fieldCount); int ii; for(ii=0; ii < bed->blockCount; ii++) bitSetRange(acc, bed->chromStart + bed->chromStarts[ii], bed->blockSizes[ii]); bedFree(&bed); } else { int w = interval->end - interval->start; if (w > 0) bitSetRange(acc, interval->start, w); } } lmCleanup(&lm); bbiFileClose(&bbi); } void fbOrChain(Bits *acc, char *track, char *chrom, int chromSize) /* Or in a chain file. */ { struct lineFile *lf; char fileName[512]; struct chain *chain; struct cBlock *b; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; lf = lineFileOpen(fileName, TRUE); while ((chain = chainRead(lf)) != NULL) { if (sameString(chrom, chain->tName)) { for (b = chain->blockList; b != NULL; b = b->next) { int s = b->tStart, e = b->tEnd; if (s < 0) outOfRange(lf, chrom, chromSize); if (e > chromSize) outOfRange(lf, chrom, chromSize); bitSetRange(acc, b->tStart, b->tEnd - b->tStart); } } chainFree(&chain); } lineFileClose(&lf); } void isolateTrackPartOfSpec(char *spec, char track[512]) /* Convert something like track:exon to just track */ { char *s; strncpy(track, spec, 512); s = strchr(track, ':'); if (s != NULL) *s = 0; } void orFile(Bits *acc, char *track, char *chrom, int chromSize) /* Or some sort of file into bits. */ { if (isFileType(track, "psl")) { fbOrPsl(acc, track, chrom, chromSize); } else if (isFileType(track, "bed")) { fbOrBed(acc, track, chrom, chromSize); } else if (isFileType(track, "chain")) { fbOrChain(acc, track, chrom, chromSize); } else if (isFileType(track, "bb")) { fbOrBigBed(acc, track, chrom, chromSize); } else errAbort("can't determine file type of: %s", track); } void orTable(char *database, Bits *acc, char *track, char *chrom, int chromSize, struct sqlConnection *conn) /* Or in table if it exists. Else do nothing. */ { char t[512], *s; char table[HDB_MAX_TABLE_STRING]; isolateTrackPartOfSpec(track, t); s = strrchr(t, '.'); if (s != NULL) { orFile(acc, track, chrom, chromSize); } else { boolean hasBin; int minFeatureSize = optionInt("minFeatureSize", 0); boolean isFound = hFindSplitTable(database, chrom, t, table, sizeof table, &hasBin); verbose(3,"orTable: db: %s isFound: %s %s %s %s\n", database, isFound ? "TRUE" : "FALSE", chrom, t, table ); if (isFound) fbOrTableBitsQueryMinSize(database, acc, track, chrom, chromSize, conn, where, TRUE, TRUE, minFeatureSize); } } void chromFeatureBits(struct sqlConnection *conn,char *database, char *chrom, int tableCount, char *tables[], FILE *bedFile, FILE *faFile, FILE *binFile, struct bed *bedRegionList, FILE *bedOutFile, int chromSize, int *retChromBits, int *retFirstTableBits, int *retSecondTableBits) /* featureBits - Correlate tables via bitmap projections and booleans * on one chromosome. */ { int i; Bits *acc = NULL; Bits *bits = NULL; char *table; acc = bitAlloc(chromSize); bits = bitAlloc(chromSize); for (i=0; i<tableCount; ++i) { boolean not = FALSE; table = tables[i]; if (table[0] == '!') { not = TRUE; ++table; } if (i == 0) { orTable(database, acc, table, chrom, chromSize, conn); if (not) bitNot(acc, chromSize); if (retFirstTableBits != NULL) *retFirstTableBits = bitCountRange(acc, 0, chromSize); } else { bitClear(bits, chromSize); orTable(database, bits, table, chrom, chromSize, conn); if (not) bitNot(bits, chromSize); if (i == 1 && retSecondTableBits != NULL) *retSecondTableBits = bitCountRange(bits, 0, chromSize); /* feature/bug - the above does not respect minSize */ if (orLogic) bitOr(acc, bits, chromSize); else bitAnd(acc, bits, chromSize); } } if (notResults) bitNot(acc, chromSize); *retChromBits = bitCountRange(acc, 0, chromSize); if (bedFile != NULL || faFile != NULL) { minSize = optionInt("minSize", minSize); bitsToBed(database, acc, chrom, chromSize, bedFile, faFile, minSize); } if (binFile != NULL) { binSize = optionInt("binSize", binSize); binOverlap = optionInt("binOverlap", binOverlap); bitsToBins(acc, chrom, chromSize, binFile, binSize, binOverlap); } if (bedOutFile != NULL) bitsToRegions(acc, chrom, chromSize, bedRegionList, bedOutFile); bitFree(&acc); bitFree(&bits); } void chromFeatureSeq(struct sqlConnection *conn, char *database, char *chrom, char *trackSpec, FILE *bedFile, FILE *faFile, int *retItemCount, int *retBaseCount) /* Write out sequence file for features from one chromosome. * This separate routine handles the non-merged case. It's * reason for being is so that the feature names get preserved. */ { boolean hasBin; char t[512], *s = NULL; char table[HDB_MAX_TABLE_STRING]; struct featureBits *fbList = NULL, *fb; if (trackSpec[0] == '!') errAbort("Sorry, '!' not available with fa output unless you use faMerge"); isolateTrackPartOfSpec(trackSpec, t); s = strchr(t, '.'); if (s != NULL) errAbort("Sorry, only database (not file) tracks allowed with " "fa output unless you use faMerge"); if (!hFindSplitTable(database, chrom, t, table, sizeof table, &hasBin)) errAbort("track %s not found", t); fbList = fbGetRangeQuery(database, trackSpec, chrom, 0, hChromSize(database, chrom), where, TRUE, TRUE); for (fb = fbList; fb != NULL; fb = fb->next) { int s = fb->start, e = fb->end; if (bedFile != NULL) { fprintf(bedFile, "%s\t%d\t%d\t%s", fb->chrom, fb->start, fb->end, fb->name); if (fb->strand != '?') fprintf(bedFile, "\t0\t%c", fb->strand); fprintf(bedFile, "\n"); } if (faFile != NULL) { struct dnaSeq *seq = hDnaFromSeq(database, chrom, s, e, dnaLower); if (fb->strand == '-') reverseComplement(seq->dna, seq->size); faWriteNext(faFile, fb->name, seq->dna, seq->size); freeDnaSeq(&seq); } } featureBitsFreeList(&fbList); } static struct hash *loadAllGaps(struct sqlConnection *conn, char *db) /* working on all chroms, fetch all per-chrom gap counts at once * returns hash by chrom name to gap counts for that chrom */ { struct chromInfo *cInfo; struct sqlResult *sr; char **row; struct hash *ret; int totalGapSize = 0; int gapCount = 0; ret = newHash(0); /* If not split, read in whole gulp, create per-chrom hash of sizes */ if (hTableExists(db, "gap")) { char *prevChrom = NULL; int totalGapsThisChrom = 0; - sr = sqlGetResult(conn, - NOSQLINJ "select chrom,chromStart,chromEnd from gap order by chrom"); + char query[1024]; + sqlSafef(query, sizeof query, "select chrom,chromStart,chromEnd from gap order by chrom"); + sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { int gapSize = sqlUnsigned(row[2]) - sqlUnsigned(row[1]); ++gapCount; if (prevChrom && sameWord(prevChrom,row[0])) { totalGapsThisChrom += gapSize; totalGapSize += gapSize; } else { if (prevChrom) { hashAddInt(ret, prevChrom, totalGapsThisChrom); freeMem(prevChrom); prevChrom = cloneString(row[0]); totalGapsThisChrom = gapSize; totalGapSize += gapSize; } else { prevChrom = cloneString(row[0]); totalGapsThisChrom = gapSize; totalGapSize += gapSize; } } } /* and the last one */ if (prevChrom && (totalGapsThisChrom > 0)) { hashAddInt(ret, prevChrom, totalGapsThisChrom); freeMem(prevChrom); } sqlFreeResult(&sr); } else { /* for each chrom name, fetch the gap count */ for (cInfo = chromInfoList; cInfo != NULL; cInfo = cInfo->next) { int rowOffset; int totalGapsThisChrom = 0; sr = hChromQuery(conn, "gap", cInfo->chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { int gapSize; struct agpGap gap; ++gapCount; agpGapStaticLoad(row+rowOffset, &gap); gapSize = gap.chromEnd - gap.chromStart; totalGapsThisChrom += gapSize; totalGapSize += gapSize; } sqlFreeResult(&sr); hashAddInt(ret, cInfo->chrom, totalGapsThisChrom); } } verbose(2,"#\tloaded %d gaps covering %d bases\n", gapCount, totalGapSize); return ret; } int countBases(struct sqlConnection *conn, char *chrom, int chromSize, char *database) /* Count bases, generally not including gaps, in chromosome. */ { static boolean gapsLoaded = FALSE; struct sqlResult *sr; int totalGaps = 0; char **row; int rowOffset; if (countGaps) return chromSize; /* If doing all chroms, then load up all the gaps and be done with * it instead of re-reading the gap table for every chrom */ if (sameWord(clChrom,"all")) { if (!gapsLoaded) gapHash = loadAllGaps(conn, database); gapsLoaded = TRUE; totalGaps = hashIntValDefault(gapHash, chrom, 0); } else { sr = hChromQuery(conn, "gap", chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { int gapSize; struct agpGap gap; agpGapStaticLoad(row+rowOffset, &gap); gapSize = gap.chromEnd - gap.chromStart; totalGaps += gapSize; } sqlFreeResult(&sr); } return chromSize - totalGaps; } void checkInputExists(struct sqlConnection *conn,char *database, struct chromInfo *chromInfoList, int tableCount, char *tables[]) /* check input tables/files exist, especially to handle split tables */ { char *track=NULL; int i = 0, missing=0; char t[512], *s=NULL; char table[HDB_MAX_TABLE_STRING]; char fileName[512]; boolean found = FALSE; for (i=0; i<tableCount; ++i) { struct chromInfo *cInfo; track = tables[i]; if (track[0] == '!') { ++track; } isolateTrackPartOfSpec(track, t); s = strrchr(t, '.'); if (s) { if (fileExists(t)) continue; } else { if (NULL == conn) conn = hAllocConn(database); if (sqlTableExists(conn, t)) continue; } found = FALSE; for (cInfo = chromInfoList; cInfo != NULL; cInfo = cInfo->next) { if (inclChrom(cInfo->chrom)) { if (s) { chromFileName(t, cInfo->chrom, fileName); if (fileExists(fileName)) { found = TRUE; break; } } else { boolean hasBin; if (hFindSplitTable(database, cInfo->chrom, t, table, sizeof table, &hasBin)) { found = TRUE; break; } } } } if (!found) { if (s) warn("file %s not found for any chroms", t); else warn("table %s not found for any chroms", t); ++missing; } } if (missing>0) errAbort("Error: %d input table(s)/file(s) do not exist for any of the chroms specified",missing); } void featureBits(char *database, int tableCount, char *tables[]) /* featureBits - Correlate tables via bitmap projections and booleans. */ { struct sqlConnection *conn = NULL; char *bedName = optionVal("bed", NULL), *faName = optionVal("fa", NULL); char *binName = optionVal("bin", NULL); char *bedRegionInName = optionVal("bedRegionIn", NULL); char *bedRegionOutName = optionVal("bedRegionOut", NULL); FILE *bedFile = NULL, *faFile = NULL, *binFile = NULL; FILE *bedRegionOutFile = NULL; struct bed *bedRegionList = NULL; boolean faIndependent = FALSE; struct chromInfo *cInfo; if (bedName) bedFile = mustOpen(bedName, "w"); if (binName) binFile = mustOpen(binName, "w"); if ((bedRegionInName && !bedRegionOutName) || (!bedRegionInName && bedRegionOutName)) errAbort("bedRegionIn and bedRegionOut must both be specified"); if (faName) { boolean faMerge = optionExists("faMerge"); faFile = mustOpen(faName, "w"); if (tableCount > 1) { if (!faMerge) errAbort("For fa output of multiple tables you must use the " "faMerge option"); } faIndependent = (!faMerge); } if (chromSizes != NULL) chromInfoList = chromInfoLoadAll(chromSizes); else chromInfoList = fbCreateChromInfoList(clChrom, database); if (!countGaps) conn = hAllocConn(database); checkInputExists(conn, database, chromInfoList, tableCount, tables); if (!faIndependent) { double totalBases = 0, totalBits = 0; int firstTableBits = 0, secondTableBits = 0; int *pFirstTableBits = NULL, *pSecondTableBits = NULL; double totalFirstBits = 0, totalSecondBits = 0; static int dotClock = 1; if (calcEnrichment) { pFirstTableBits = &firstTableBits; pSecondTableBits = &secondTableBits; } if (bedRegionInName) { struct lineFile *lf = lineFileOpen(bedRegionInName, TRUE); struct bed *bed; char *row[3]; bedRegionOutFile = mustOpen(bedRegionOutName, "w"); while (lineFileRow(lf, row)) { if (startsWith(row[0],"#")||startsWith(row[0],"chrom")) continue; bed = bedLoad3(row); slAddHead(&bedRegionList, bed); } lineFileClose(&lf); slReverse(&bedRegionList); } for (cInfo = chromInfoList; cInfo != NULL; cInfo = cInfo->next) { if (inclChrom(cInfo->chrom)) { int chromBitSize; int chromSize = cInfo->size; verbose(3,"chromFeatureBits(%s)\n", cInfo->chrom); chromFeatureBits(conn, database, cInfo->chrom, tableCount, tables, bedFile, faFile, binFile, bedRegionList, bedRegionOutFile, chromSize, &chromBitSize, pFirstTableBits, pSecondTableBits ); totalBases += countBases(conn, cInfo->chrom, chromSize, database); totalBits += chromBitSize; totalFirstBits += firstTableBits; totalSecondBits += secondTableBits; if (dots > 0) { if (--dotClock <= 0) { fputc('.', stdout); fflush(stdout); dotClock = dots; } } } } if (dots > 0) { fputc('\n', stdout); fflush(stdout); } if (calcEnrichment) fprintf(stderr,"%s %5.3f%%, %s %5.3f%%, both %5.3f%%, cover %4.2f%%, enrich %4.2fx\n", tables[0], 100.0 * totalFirstBits/totalBases, tables[1], 100.0 * totalSecondBits/totalBases, 100.0 * totalBits/totalBases, 100.0 * totalBits / totalFirstBits, (totalBits/totalSecondBits) / (totalFirstBits/totalBases) ); else fprintf(stderr,"%1.0f bases of %1.0f (%4.3f%%) in intersection\n", totalBits, totalBases, 100.0*totalBits/totalBases); } else { int totalItems = 0; double totalBases = 0; int itemCount, baseCount; for (cInfo = chromInfoList; cInfo != NULL; cInfo = cInfo->next) { if (inclChrom(cInfo->chrom)) { chromFeatureSeq(conn, database, cInfo->chrom, tables[0], bedFile, faFile, &itemCount, &baseCount); totalBases += countBases(conn, cInfo->chrom, baseCount, database); totalItems += itemCount; } } } hFreeConn(&conn); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc < 3) usage(); clChrom = optionVal("chrom", clChrom); chromSizes = optionVal("chromSize", NULL); orLogic = optionExists("or"); notResults = optionExists("not"); countGaps = optionExists("countGaps"); countBlocks = optionExists("countBlocks"); noRandom = optionExists("noRandom"); noHap = optionExists("noHap"); primaryChroms = optionExists("primaryChroms"); dots = optionInt("dots", dots); where = optionVal("where", NULL); calcEnrichment = optionExists("enrichment"); if (calcEnrichment && argc != 4) errAbort("You must specify two tables with -enrichment"); if (calcEnrichment && orLogic) errAbort("You can't use -or with -enrichment"); if (calcEnrichment && notResults) errAbort("You can't use -not with -enrichment"); if (notResults && optionExists("fa") && !optionExists("faMerge")) errAbort("Must specify -faMerge if using -not with -fa"); featureBits(argv[1], argc-2, argv+2); return 0; }