2306b0cbe3778fe985fa284367228b35fd9dae78 markd Tue Feb 23 11:27:38 2021 -0800 added featureBits option to only consider primary chromosomes using UCSC naming convention diff --git src/hg/featureBits/featureBits.c src/hg/featureBits/featureBits.c index c05d8e7..ca3a900 100644 --- src/hg/featureBits/featureBits.c +++ src/hg/featureBits/featureBits.c @@ -20,52 +20,54 @@ static struct optionSpec optionSpecs[] = /* command line option specifications */ { {"bed", OPTION_STRING}, {"fa", OPTION_STRING}, {"faMerge", OPTION_BOOLEAN}, {"minSize", OPTION_INT}, {"chrom", OPTION_STRING}, {"or", OPTION_BOOLEAN}, {"not", OPTION_BOOLEAN}, {"countGaps", OPTION_BOOLEAN}, {"noRandom", OPTION_BOOLEAN}, {"noHap", OPTION_BOOLEAN}, + {"primaryChroms", OPTION_BOOLEAN}, {"dots", OPTION_INT}, {"minFeatureSize", OPTION_INT}, {"enrichment", OPTION_BOOLEAN}, {"where", OPTION_STRING}, {"bin", OPTION_STRING}, {"chromSize", OPTION_STRING}, {"binSize", OPTION_INT}, {"binOverlap", OPTION_INT}, {"bedRegionIn", OPTION_STRING}, {"bedRegionOut", OPTION_STRING}, {NULL, 0} }; int minSize = 1; /* Minimum size of feature. */ char *clChrom = "all"; /* Which chromosome. */ boolean orLogic = FALSE; /* Do ors instead of ands? */ boolean notResults = FALSE; /* negate results? */ char *where = NULL; /* Extra selection info. */ char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ boolean countGaps = FALSE; /* Count gaps in denominator? */ boolean noRandom = FALSE; /* Exclude _random chromosomes? */ boolean noHap = FALSE; /* Exclude _hap|_alt chromosomes? */ +boolean primaryChroms = FALSE; /* only include primary chroms */ int dots = 0; /* print dots every N chroms (scaffolds) processed */ boolean calcEnrichment = FALSE; /* Calculate coverage/enrichment? */ int binSize = 500000; /* Default bin size. */ int binOverlap = 250000; /* Default bin size. */ /* to process chroms without constantly looking up in chromInfo, create * this list of them from the chromInfo once. */ static struct chromInfo *chromInfoList = NULL; static struct hash *gapHash = NULL; void usage() /* Explain usage and exit. */ { errAbort( @@ -75,30 +77,31 @@ "This will return the number of bits in all the tables anded together\n" "Pipe warning: output goes to stderr.\n" "Options:\n" " -bed=output.bed Put intersection into bed format. Can use stdout.\n" " -fa=output.fa Put sequence in intersection into .fa file\n" " -faMerge For fa output merge overlapping features.\n" " -minSize=N Minimum size to output (default 1)\n" " -chrom=chrN Restrict to one chromosome\n" " -chromSize=sizefile Read chrom sizes from file instead of database. \n" " (chromInfo three column format)\n" " -or Or tables together instead of anding them\n" " -not Output negation of resulting bit set.\n" " -countGaps Count gaps in denominator\n" " -noRandom Don't include _random (or Un) chromosomes\n" " -noHap Don't include _hap|_alt chromosomes\n" + " -primaryChroms Primary assembly (chroms without '_' in name)\n" " -dots=N Output dot every N chroms (scaffolds) processed\n" " -minFeatureSize=n Don't include bits of the track that are smaller than\n" " minFeatureSize, useful for differentiating between\n" " alignment gaps and introns.\n" " -bin=output.bin Put bin counts in output file\n" " -binSize=N Bin size for generating counts in bin file (default 500000)\n" " -binOverlap=N Bin overlap for generating counts in bin file (default 250000)\n" " -bedRegionIn=input.bed Read in a bed file for bin counts in specific regions \n" " and write to bedRegionsOut\n" " -bedRegionOut=output.bed Write a bed file of bin counts in specific regions \n" " from bedRegionIn\n" " -enrichment Calculates coverage and enrichment assuming first table\n" " is reference gene track and second track something else\n" " Enrichment is the amount of table1 that covers table2 vs. the\n" " amount of table1 that covers the genome. It's how much denser\n" @@ -204,31 +207,32 @@ else if (hasSuffixCompress(cleaned, suffix, "Z")) return TRUE; else if (hasSuffixCompress(cleaned, suffix, "bz2")) return TRUE; else return FALSE; } bool inclChrom(char *name) /* check if a chromosome should be included */ { return !((noRandom && (endsWith(name, "_random") || startsWith("chrUn", name) || sameWord("chrNA", name) /* danRer */ || sameWord("chrU", name))) /* dm */ - || (noHap && haplotype(name))); + || (noHap && haplotype(name)) + || (primaryChroms && (strchr(name, '_') != NULL))); } void bitsToBins(Bits *bits, char *chrom, int chromSize, FILE *binFile, int binSize, int binOverlap) /* Write out binned counts of bits. */ { int bin, count; if (!binFile) return; for (bin=0; bin+binSize<chromSize; bin=bin+binOverlap) { count = bitCountRange(bits, bin, binSize); fprintf(binFile, "%s\t%d\t%d\t%d\t%s.%d\n", chrom, bin, bin+binSize, count, chrom, bin/binOverlap+1); } count = bitCountRange(bits, bin, chromSize-bin); @@ -914,30 +918,31 @@ int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc < 3) usage(); clChrom = optionVal("chrom", clChrom); chromSizes = optionVal("chromSize", NULL); orLogic = optionExists("or"); notResults = optionExists("not"); countGaps = optionExists("countGaps"); noRandom = optionExists("noRandom"); noHap = optionExists("noHap"); +primaryChroms = optionExists("primaryChroms"); dots = optionInt("dots", dots); where = optionVal("where", NULL); calcEnrichment = optionExists("enrichment"); if (calcEnrichment && argc != 4) errAbort("You must specify two tables with -enrichment"); if (calcEnrichment && orLogic) errAbort("You can't use -or with -enrichment"); if (calcEnrichment && notResults) errAbort("You can't use -not with -enrichment"); if (notResults && optionExists("fa") && !optionExists("faMerge")) errAbort("Must specify -faMerge if using -not with -fa"); featureBits(argv[1], argc-2, argv+2); return 0; }