b40094812a7c35cef96e0d9b94fa421262947b13 hiram Tue Jul 11 14:43:23 2017 -0700 adding truncate option and tests to verify everything works refs #19514 diff --git src/utils/bedClip/bedClip.c src/utils/bedClip/bedClip.c index 1b67d59..9b72f4b 100644 --- src/utils/bedClip/bedClip.c +++ src/utils/bedClip/bedClip.c @@ -1,93 +1,115 @@ /* bedClip - Remove lines from bed file that refer to off-chromosome places.. */ /* Copyright (C) 2011 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "bbiFile.h" #include "sqlNum.h" #include "obscure.h" - void usage() /* Explain usage and exit. */ { errAbort( - "bedClip - Remove lines from bed file that refer to off-chromosome places.\n" + "bedClip - Remove lines from bed file that refer to off-chromosome locations.\n" "usage:\n" - " bedClip input.bed chrom.sizes output.bed\n" + " bedClip [options] input.bed chrom.sizes output.bed\n" "chrom.sizes is a two-column file/URL: \n" "If the assembly is hosted by UCSC, chrom.sizes can be a URL like\n" " http://hgdownload.cse.ucsc.edu/goldenPath//bigZips/.chrom.sizes\n" "or you may use the script fetchChromSizes to download the chrom.sizes file.\n" "If not hosted by UCSC, a chrom.sizes file can be generated by running\n" "twoBitInfo on the assembly .2bit file.\n" "options:\n" - " -verbose=2 - set to get list of lines clipped and why\n" + " -truncate - truncate items that span ends of chrom instead of the\n" + " default of dropping the items\n" + " -verbose=2 - set to get list of lines clipped and why" ); } static struct optionSpec options[] = { + {"truncate", OPTION_BOOLEAN}, {NULL, 0}, }; +static boolean trim = FALSE; // the name truncate is already taken + void bedClip(char *inFile, char *chromSizes, char *outFile) /* bedClip - Remove lines from bed file that refer to off-chromosome places.. */ { struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); char *line; while (lineFileNextReal(lf, &line)) { char *chrom = nextWord(&line); char *startString = nextWord(&line); char *endString = nextWord(&line); if (endString == NULL) errAbort("Need at least three fields line %d of %s", lf->lineIx, lf->fileName); if (startString[0] == '-') { - verbose(2, "Clipping negative line %d of %s\n", lf->lineIx, lf->fileName); + if (trim) + { + verbose(2, "Truncating negative start line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString); + startString = "0"; + } + else + { + verbose(2, "Clipping negative line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString); continue; // Clip off negatives } + } if (!isdigit(startString[0])) - errAbort("Expecting number got %s line %d of %s", startString, lf->lineIx, lf->fileName); + errAbort("Expecting number got %s line %d of %s: %s:%s-%s", startString, lf->lineIx, lf->fileName, chrom, startString, endString); if (!isdigit(endString[0])) - errAbort("Expecting number got %s line %d of %s", endString, lf->lineIx, lf->fileName); + errAbort("Expecting number got %s line %d of %s: %s:%s-%s", endString, lf->lineIx, lf->fileName, chrom, startString, endString); int start = sqlUnsigned(startString); int end = sqlUnsigned(endString); if (start >= end) { - verbose(2, "Clipping end <= start line %d of %s\n", lf->lineIx, lf->fileName); + verbose(2, "Clipping end <= start line %d of %s: %s:%s-%s\n", lf->lineIx, lf->fileName, chrom, startString, endString); continue; } struct hashEl *hel = hashLookup(chromSizesHash, chrom); if (hel == NULL) - errAbort("Chromosome %s isn't in %s line %d of %s\n", chrom, chromSizes, lf->lineIx, lf->fileName); + errAbort("Chromosome %s isn't in %s line %d of %s: %s:%s-%s\n", chrom, chromSizes, lf->lineIx, lf->fileName, chrom, startString, endString); int chromSize = ptToInt(hel->val); if (end > chromSize) { - verbose(2, "Clipping end > chromSize line %d of %s\n", lf->lineIx, lf->fileName); + if (trim) + { + end = chromSize; + verbose(2, "Truncating end > chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString); + } + else + { + verbose(2, "Clipping end > chromSize(%d) line %d of %s: %s:%s-%s\n", chromSize, lf->lineIx, lf->fileName, chrom, startString, endString); continue; } - fprintf(f, "%s\t%s\t%s", chrom, startString, endString); + } + fprintf(f, "%s\t%d\t%d", chrom, start, end); line = skipLeadingSpaces(line); if (line == NULL || line[0] == 0) fputc('\n', f); else fprintf(f, "\t%s\n", line); } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); +trim = optionExists("truncate"); + bedClip(argv[1], argv[2], argv[3]); return 0; }