2866dd07e3cc9012e19faf41c18529ea2f1d8e08 galt Fri Apr 13 17:08:01 2012 -0700 This is a squashed merge to make git-reports code-review simpler. The main thing is that there is a new shared validator routine in lib/basicBed.c which uses asParse.c to handle bedPlus. This validator is shared among validateFiles, bedToBigBed, hgLoadBed, and customTracks. Some effort has been made to standardize commandline options, and vf has been simplified a little by removing some debugging options. vf has also recently gained the ability to validate native bigBed format, via some new code in linefile.c for attaching to a bigBed. ct: use of the new validator is controlled by an hg.conf flag that can be turned on and if needed turned off again. It will be off by default for now. As soon as we are happy with the code and it has been established, we can remove the switch. Code has been added to compare .as files, and it is here used to compare against the library standard BED. As an experiment I am leaving in the list of squashed commit messages below: Squashed commit of the following: commit a55eb050055911c699120432cc98e33cefa5fffc Author: Galt Barber <galt@soe.ucsc.edu> Date: Fri Apr 13 17:00:30 2012 -0700 fixing freeMem bug; making better option-combination checking, fixing as, adding test bed6 commit ac4b98f41e89875bc100cd2f1c1fc3825cafa57d Author: Galt Barber <galt@soe.ucsc.edu> Date: Fri Apr 13 12:56:32 2012 -0700 unless we add back in everywhere the -zerosOk option, we must tolerate them for SNP type objects commit efa2c269a6df4ea0630084670d152b4d518e232b Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Apr 12 16:16:46 2012 -0700 adding bed15 example input commit f2b5af00e2786283e785ef501caa036e5a945dc7 Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Apr 12 11:09:18 2012 -0700 increasing maximum row length buffer automatically in lineFile on bigbed commit 1bb07ce21fcd4ce1d55752b0b61997ffb71a159e Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Apr 12 11:03:59 2012 -0700 
increasing maximum row length buffer automatically commit 59e831df690c5b8ac68afc9648443b3cb5dfd51b Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Apr 12 10:45:22 2012 -0700 increasing maximum row length buffer commit 711db7c2edf0a6f7b710fde1929af1ba388cb81d Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Apr 11 23:45:18 2012 -0700 adding lineFileOnBigBed, using it to add bigBed validation to validateFiles. commit 1b5e5e1eaba802c25c94a8a3dd000898d2fb3150 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Apr 11 17:01:49 2012 -0700 for consistency with basicBed.c commit ca8e7f93af179803f4a6fe2d073703f379e271a3 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Apr 11 16:55:26 2012 -0700 standardizing - have to call the field "reserved" so that .sql will contain the right name and existing trackhandlers will work commit fea0cb3bb7163353496c3d092d3716f1b5c30e53 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Apr 11 12:12:25 2012 -0700 renaming option -tabs to -tab to be consistent with hgLoadBed commit 6452f634c4cc7730b521869cdfb11f4253c59ff5 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 10 18:44:11 2012 -0700 trap aborts from weird errs reading as files commit 9800e8416b5959d7a80f3785e116987569e7273c Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 10 18:14:05 2012 -0700 oops commit f9f1b7f9d9e136864df4b908cd32773e1a04a260 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 10 18:08:13 2012 -0700 adding asCompare utility for comparing a given .as against many others commit 605f9a9da14b3a168821519eba737b7f4cd48163 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 10 15:17:11 2012 -0700 add parameter to return the number of columns that did match the given .as, even if the entire match might fail commit 86fb68307dfbefd2960c9173fbbae1c4d4c49b11 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 10 12:00:24 2012 -0700 adding a handy standard bed12 .as file for testing commit 79bd2364e383e133c886d034a9176c6e636181cc Author: Galt Barber 
<galt@soe.ucsc.edu> Date: Tue Apr 10 11:27:09 2012 -0700 added support for linked-Size in .as validation so that list sizes get validated commit 67ee8435b668f779fce28744519e5e76d6ae3433 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 10 09:53:19 2012 -0700 oops signed flag was backwards commit ebae2bef410150aefd39b7b01918fbc8c18e6785 Merge: ab0cee9 8f806f5 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 16:28:10 2012 -0700 Merge commit 'origin/master' into validateFiles commit ab0cee922869996045074c6076c38abd8487d7c2 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 16:15:53 2012 -0700 fixing rgb commit 4187a167aeeb69eb20ad9df8021db9c8f71d9193 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 14:42:27 2012 -0700 updated testing files commit 0ee0c51132cb09a4e9909caa9718bdf7aa5af9ff Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 14:33:42 2012 -0700 fix err msg bug where colors field had already been chopped up by the parser commit edeed9458c345d1d8629d1e1b5230eb6b9e424a8 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 14:16:27 2012 -0700 adding -tabs option but making whitespace the default. 
this is to make it like b2bb and hglb commit 239ca9b99d04b7ff6817867160b34efe8af1d25a Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 12:35:06 2012 -0700 more printf fixes %d ==> %u commit 57745b3096c096d73b02a651cba0060210acad07 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 12:19:46 2012 -0700 fixing some %d to %u for correct sign of bed struct members in printf commit 7c3f3f24402f01f2f152f6b839b9efb01fa9c5c1 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 09:33:30 2012 -0700 oops need to use FromDatabase with chromDb option commit 237ab4c1b707ba19ba12079483365269db3b6119 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Apr 9 09:26:43 2012 -0700 little fix removing unused option maxErrors commit 689c10edba73f49658f315a14201468864e5d83c Author: Galt Barber <galt@soe.ucsc.edu> Date: Sun Apr 8 12:07:04 2012 -0700 reducing redundancy by making allInts both polymorphic and fast; added checking of .as fields against BED standard for the first bedN columns commit c6bd5e6eec93c4ca6e947e7028907110e4e73d0e Merge: 4d09bfd b7b8f22 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Apr 4 12:58:34 2012 -0700 Merge commit 'origin/master' into validateFiles commit 4d09bfdd35087b6637fa638297be9c07b45cb450 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 3 14:31:32 2012 -0700 because tabs may be used, cannot confirm here that the strings are non-empty, because they might be, so removing the check commit cfaa756790ee029e480a045f05a51733ebef4e4b Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 3 14:25:03 2012 -0700 adding back the check for chrHash (chromDb or chromInfo) that was lost when I reverted the tabs option deletion commit 5531303fe2ee39d28cec7c9f4baa472ce0ab5eba Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 3 14:20:27 2012 -0700 Revert "cleaned up unneeded options, using chopByWhite instead of chopByTab." This reverts commit 458a52f976edade78177908bc9f5886c81e7b6ab. 
commit 0ea039f35541360ca1d64d754af3f9dec6621a38 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Apr 3 14:15:45 2012 -0700 resolved reversion of fa3b343f1ff1eb7a50df9029e71141484f260a22 commit 44e7cbc4dbf3b5a955f7fa1eea6a65a7d8570528 Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Mar 29 16:34:47 2012 -0700 removed unneeded errs variable commit 404c095271351b38233124e74cd35cbc57b8aad4 Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Mar 29 16:10:27 2012 -0700 removed line count variable commit 5561bd6a7b85e6115f8162eb2e597de0f7c2c284 Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Mar 29 16:06:03 2012 -0700 removed unneeded flags printFailLines printOkLines, and quick commit 5a6eae96483d91baa7690df436d4becc13cdbf2f Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Mar 29 15:10:13 2012 -0700 jk prefers brackets to curly-braces commit 7e431a76a215c1402c5a6a37816f6fc7ca234363 Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Mar 29 15:01:45 2012 -0700 adding version # for b2bb, by crickets request commit fa3b343f1ff1eb7a50df9029e71141484f260a22 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 16:43:33 2012 -0700 removed -tabs option, using chopByWhite instead of chopByTab because of bed definition according to JK commit 458a52f976edade78177908bc9f5886c81e7b6ab Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 16:40:10 2012 -0700 cleaned up unneeded options, using chopByWhite instead of chopByTab. 
commit 7e179aab9da9961221bf864fdd136ed8bbc0392b Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 16:13:30 2012 -0700 removing zeroSizeOk option commit 8c8ad8ca09239b7a7a8cc0de693b8a7c7d49a669 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 15:51:45 2012 -0700 improving the wording of help commit 1bf63d9d7e7fa89e060eab26a9751acd3c8ad8e4 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 12:37:37 2012 -0700 changing edge-case definition slightly for chromEnd commit 7e5c151234e12cd8b55d10e79042d7f92b3fcb5a Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 12:04:29 2012 -0700 incrementing version commit c0e129d917eaac888e6a9c521c4407bf9380b4b0 Merge: f6ce988 68a21eb Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Mar 28 11:16:45 2012 -0700 Merge commit 'origin/master' into validateFiles commit f6ce988fb6f76bb74439c77c4094483f26465c90 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Feb 8 15:39:54 2012 -0800 oops commit 3dfdebd293e93073354f9af486a7d9407f0b9471 Merge: cf104e3 75c9145 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Feb 8 14:59:58 2012 -0800 Merge commit 'origin/master' into validateFiles commit cf104e362325a5832911022865bb69241e1e472d Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Feb 8 14:55:23 2012 -0800 adding hg.conf switch to activate new validator use commit ef5806814a6e9730682ea56a0a0e45ab962a9ee1 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Feb 8 14:26:43 2012 -0800 various cleanup and consistency commit 3e4e92b25cf414129ba0339a91023380c2f8f974 Author: Galt Barber <galt@soe.ucsc.edu> Date: Fri Feb 3 17:21:51 2012 -0800 adding chromDb, and chromInfo options to hgLoadBed commit 05ed14c1e92684861789163e3c936bf07778acb2 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Feb 1 21:06:55 2012 -0800 added optional validation support for bed and bedPlus to hgLoadBed commit 7445f19ac4009eed47b989999de35f92b550fac5 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Feb 1 20:09:54 2012 -0800 tested bedPlus, and 
extended checking for more types/cases, e.g. string~ and unsigned numbers commit 31b44ec97ecd57101b3c65096288347592276d5d Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Jan 31 11:11:07 2012 -0800 adding support for context during array-list parsing commit 5ffcc5974b59f00551940f0dc20b56f1e7990f12 Author: Galt Barber <galt@soe.ucsc.edu> Date: Mon Jan 30 11:15:12 2012 -0800 re-working things, adding better checking commit 46e10b9ff5d32793656dfd1d136687f2a0733b4e Author: Galt Barber <galt@soe.ucsc.edu> Date: Fri Jan 20 17:28:41 2012 -0800 validateFiles now supports bed, bedPlus using the shared validator lib function commit d2adfda8e080b562c93084f956ddcdcf47f06974 Author: Galt Barber <galt@soe.ucsc.edu> Date: Fri Jan 20 02:33:09 2012 -0800 cleanup, handling ct differently than the others which care only for validation but not the actual bed results commit 1fe9aa5028374750a491a0a3f836f4657f9b9a43 Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Jan 19 16:54:12 2012 -0800 added support for bedPlus via .as object commit 7b32efb4152f57038ae56d3dca15fd700398ea5a Author: Galt Barber <galt@soe.ucsc.edu> Date: Thu Jan 19 16:13:40 2012 -0800 moved validation code from customFactory.c to basicBed.c, added validation support to b2bb commit 64b293690baeb50ce375f34e01493dbc9a50cbf5 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Jan 17 14:06:35 2012 -0800 ok, have bed 12 linked-features validation working commit f0a6d2c2a749fffedadc8189ff7ba86583003128 Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Jan 11 11:37:10 2012 -0800 adding README describing how some bed tests are made commit 46400cceb3a602423ae2a5656bc442d9969e63df Author: Galt Barber <galt@soe.ucsc.edu> Date: Wed Jan 11 11:26:13 2012 -0800 fixed bigwig tests; added bed12ok test commit 305e5c0e07210d278aa257d7c859bdd47abc77c5 Author: Galt Barber <galt@soe.ucsc.edu> Date: Tue Jan 10 17:05:13 2012 -0800 added support for bed files (not including bedPlus); also began working on re-organizing the make test cases diff 
--git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c index 3f870eb..e6b7111 100644 --- src/utils/bedToBigBed/bedToBigBed.c +++ src/utils/bedToBigBed/bedToBigBed.c @@ -1,182 +1,172 @@ /* bedToBigBed - Convert bed to bigBed.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "obscure.h" #include "asParse.h" #include "basicBed.h" #include "sig.h" #include "rangeTree.h" #include "zlibFace.h" #include "sqlNum.h" #include "bigBed.h" +char *version = "2.0"; int blockSize = 256; int itemsPerSlot = 512; -int bedFields = 0; -int rgbField = 0; +int bedN = 0; /* number of standard bed fields */ +int bedP = 0; /* number of bed plus fields */ char *as = NULL; static boolean doCompress = FALSE; static boolean tabSep = FALSE; void usage() /* Explain usage and exit. */ { errAbort( - "bedToBigBed v. %d - Convert bed file to bigBed.\n" + "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n" "usage:\n" " bedToBigBed in.bed chrom.sizes out.bb\n" "Where in.bed is in one of the ascii bed formats, but not including track lines\n" "and chrom.sizes is two column: <chromosome name> <size in bases>\n" "and out.bb is the output indexed big bed file.\n" "Use the script: fetchChromSizes to obtain the actual chrom.sizes information\n" "from UCSC, please do not make up a chrom sizes from your own information.\n" "The in.bed file must be sorted by chromosome,start,\n" " to sort a bed file, use the unix sort command:\n" " sort -k1,1 -k2,2n unsorted.bed > sorted.bed\n" "\n" "options:\n" + " -type=bedN[+[P]] - Bed N is between 3 and 15,\n" + " optional (+) if extra \"bedPlus\" fields, optional P specifies the number of extra fields \n" + " -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n" + " of each field in a row in AutoSql format here.\n" " -blockSize=N - Number of items to bundle in r-tree. 
Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n" - " -bedFields=N - Number of fields that fit standard bed definition. If undefined\n" - " assumes all fields in bed are defined.\n" - " -as=fields.as - If have non-standard fields, it's great to put a definition\n" - " of each field in a row in AutoSql format here.\n" - " -rgbField=N - the Nth field is a comma separated R,G,B triple.\n" - " For the usual itemRgb/reserved bed field this is 9.\n" " -unc - If set, do not use compression.\n" - " -tabs - If set, expect fields to be tab separated, normally\n" + " -tab - If set, expect fields to be tab separated, normally\n" " expects white space separator.\n" - , bbiCurrentVersion, blockSize, itemsPerSlot + , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, - {"bedFields", OPTION_INT}, - {"rgbField", OPTION_INT}, + {"type", OPTION_STRING}, {"as", OPTION_STRING}, {"unc", OPTION_BOOLEAN}, - {"tabs", OPTION_BOOLEAN}, + {"tab", OPTION_BOOLEAN}, {NULL, 0}, }; void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, - bits16 definedFieldCount, int itemsPerSlot, struct bbiBoundsArray *bounds, + int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, boolean doCompress, FILE *f, int resTryCount, int resScales[], int resSizes[], - bits16 *retFieldCount, bits16 *retDefinedFieldCount, bits32 *retMaxBlockSize) + bits16 *retFieldCount, bits32 *retMaxBlockSize) /* Read through lf, writing it in f. 
Save starting points of blocks (every itemsPerSlot) * to boundsArray */ { int maxBlockSize = 0; struct bbiChromUsage *usage = usageList; -char *line, **row = NULL; -int fieldCount = 0, fieldAlloc=0, lastField = 0; +char *line, *row[256]; // limit of 256 columns is arbitrary, but useful to catch pathological input +int fieldCount = 0, lastField = 0; int itemIx = 0, sectionIx = 0; bits64 blockOffset = 0; int startPos = 0, endPos = 0; bits32 chromId = 0; boolean allocedAs = FALSE; struct dyString *stream = dyStringNew(0); /* Will keep track of some things that help us determine how much to reduce. */ bits32 resEnds[resTryCount]; int resTry; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; boolean atEnd = FALSE, sameChrom = FALSE; bits32 start = 0, end = 0; char *chrom = NULL; +struct bed *bed; +AllocVar(bed); for (;;) { /* Get next line of input if any. */ if (lineFileNextReal(lf, &line)) { - /* First time through figure out the field count, and if not set, the defined field count. */ + /* First time through figure out the field count and if not set, the bedN. */ if (fieldCount == 0) { if (as == NULL) { if (tabSep) fieldCount = chopString(line, "\t", NULL, 0); else fieldCount = chopByWhite(line, NULL, 0); - if (definedFieldCount == 0) - definedFieldCount = fieldCount; - char *asText = bedAsDef(definedFieldCount, fieldCount); + if (bedN == 0) + bedN = fieldCount; + if (bedN > 15) + { + bedN = 15; + bedP = fieldCount - bedN; + } + char *asText = bedAsDef(bedN, fieldCount); as = asParseText(asText); allocedAs = TRUE; freeMem(asText); } else { fieldCount = slCount(as->columnList); + // if the .as is specified, the -type must be also so that the number of standard BED columns is known. bedN will be >0. 
+ asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard } - fieldAlloc = fieldCount + 1; + if (fieldCount > ArraySize(row)) + errAbort("Too many fields [%d], current maximum fields limit is %lu", fieldCount, ArraySize(row)); lastField = fieldCount - 1; - AllocArray(row, fieldAlloc); *retFieldCount = fieldCount; - *retDefinedFieldCount = definedFieldCount; + + if (bedP == -1) // user did not specify how many plus columns there are. + { + bedP = fieldCount - bedN; + if (bedP < 1) + errAbort("fieldCount input (%d) did not match the specification (%s)\n" + , fieldCount, optionVal("type", "")); + } + if (fieldCount != bedN + bedP) + errAbort("fieldCount input (%d) did not match the specification (%s)\n" + , fieldCount, optionVal("type", "")); } /* Chop up line and make sure the word count is right. */ int wordCount; if (tabSep) - wordCount = chopString(line, "\t", row, fieldAlloc); + wordCount = chopTabs(line, row); else - wordCount = chopByWhite(line, row, fieldAlloc); + wordCount = chopLine(line, row); lineFileExpectWords(lf, fieldCount, wordCount); - /* Parse out first three fields. */ - chrom = row[0]; - start = lineFileNeedNum(lf, row, 1); - end = lineFileNeedNum(lf, row, 2); + loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE); - /* Check remaining fields are formatted right. */ - if (fieldCount > 3) - { - /* Go through and check that numerical strings really are numerical. */ - struct asColumn *asCol = slElementFromIx(as->columnList, 3); - int i; - for (i=3; i<fieldCount; ++i) - { - enum asTypes type = asCol->lowType->type; - if (! 
(asCol->isList || asCol->isArray)) - { - if (rgbField == i + 1) - { - // we check for error, but save the R,G,B truple in - // the bigBed - if (-1 == bedParseRgb(row[i])) - errAbort("ERROR: expecting r,g,b specification, " - "found: '%s'", row[i]); - } - else if (asTypesIsInt(type)) - lineFileNeedFullNum(lf, row, i); - else if (asTypesIsFloating(type)) - lineFileNeedDouble(lf, row, i); - } - asCol = asCol->next; - } - } + chrom = bed->chrom; + start = bed->chromStart; + end = bed->chromEnd; sameChrom = sameString(chrom, usage->name); } else /* No next line */ { atEnd = TRUE; } /* Check conditions that would end block and save block info and advance to next if need be. */ if (atEnd || !sameChrom || itemIx >= itemsPerSlot) { /* Save stream to file, compressing if need be. */ if (stream->stringSize > maxBlockSize) maxBlockSize = stream->stringSize; @@ -270,31 +260,31 @@ { bits32 resEnd = resEnds[resTry]; if (start >= resEnd) { resSizes[resTry] += 1; resEnds[resTry] = resEnd = start + resScales[resTry]; } while (end > resEnd) { resSizes[resTry] += 1; resEnds[resTry] = resEnd = resEnd + resScales[resTry]; } } } assert(sectionIx == sectionCount); -freez(&row); +freez(&bed); if (allocedAs) asObjectFreeList(&as); *retMaxBlockSize = maxBlockSize; } struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom) /* Read lines from bed file as long as they match chrom. Return a rangeTree that * corresponds to the coverage. */ { struct rbTree *tree = rangeTreeNew(); char *line; while (lineFileNextReal(lf, &line)) { if (!startsWithWord(chrom, line)) { @@ -435,32 +425,30 @@ cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount, blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); slReverse(&twiceReducedList); return twiceReducedList; } void bbFileCreate( char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. 
*/ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */ int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */ - bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields - * are the defined bed ones. */ char *asFileName, /* If non-null points to a .as file that describes fields. */ boolean doCompress, /* If TRUE then compress data. */ char *outName) /* BigBed output file name. */ /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */ { /* Set up timing measures. */ verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); /* Load up as object if defined in file. */ struct asObject *as = NULL; if (asFileName != NULL) { /* Parse it and do sanity check. */ as = asParseFile(asFileName); @@ -521,32 +509,32 @@ { resSizes[resTry] = 0; resScales[resTry] = res; res *= resIncrement; } /* Write out primary full resolution data in sections, collect stats to use for reductions. */ bits64 dataOffset = ftell(f); writeOne(f, bedCount); bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot); struct bbiBoundsArray *boundsArray; AllocArray(boundsArray, blockCount); lineFileRewind(lf); bits16 fieldCount=0; bits32 maxBlockSize = 0; -writeBlocks(usageList, lf, as, definedFieldCount, itemsPerSlot, boundsArray, blockCount, doCompress, - f, resTryCount, resScales, resSizes, &fieldCount, &definedFieldCount, &maxBlockSize); +writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, + f, resTryCount, resScales, resSizes, &fieldCount, &maxBlockSize); verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", (long long)bedCount, fieldCount); /* Write out primary data index. 
*/ bits64 indexOffset = ftell(f); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); verboseTime(1, "index write"); /* Declare arrays and vars that track the zoom levels we actually output. */ bits32 zoomAmounts[bbiMaxZoomLevels]; bits64 zoomDataOffsets[bbiMaxZoomLevels]; bits64 zoomIndexOffsets[bbiMaxZoomLevels]; @@ -613,30 +601,32 @@ /* Figure out buffer size needed for uncompression if need be. */ if (doCompress) { int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk); uncompressBufSize = max(maxBlockSize, maxZoomUncompSize); } /* Go back and rewrite header. */ rewind(f); bits32 sig = bigBedSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = zoomLevels; bits32 reserved32 = 0; bits64 reserved64 = 0; +bits16 definedFieldCount = bedN; + /* Write fixed header */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); writeOne(f, chromTreeOffset); writeOne(f, dataOffset); writeOne(f, indexOffset); writeOne(f, fieldCount); writeOne(f, definedFieldCount); writeOne(f, asOffset); writeOne(f, totalSummaryOffset); writeOne(f, uncompressBufSize); int i; for (i=0; i<2; ++i) writeOne(f, reserved32); @@ -668,38 +658,64 @@ fseek(f, 0L, SEEK_END); writeOne(f, sig); /* Clean up. */ lineFileClose(&lf); carefulClose(&f); freeHash(&chromSizesHash); bbiChromUsageFreeList(&usageList); asObjectFreeList(&as); } void bedToBigBed(char *inName, char *chromSizes, char *outName) /* bedToBigBed - Convert bed file to bigBed.. */ { -bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, bedFields, as, +bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, as, doCompress, outName); } int main(int argc, char *argv[]) /* Process command line. 
*/ { optionInit(&argc, argv, options); blockSize = optionInt("blockSize", blockSize); itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot); -bedFields = optionInt("bedFields", bedFields); -rgbField = optionInt("rgbField", rgbField); as = optionVal("as", as); doCompress = !optionExists("unc"); -tabSep = optionExists("tabs"); +tabSep = optionExists("tab"); if (argc != 4) usage(); +if (optionExists("type")) + { + // parse type + char *btype = cloneString(optionVal("type", "")); + char *plus = strchr(btype, '+'); + if (plus) + { + *plus++ = 0; + if (isdigit(*plus)) + bedP = sqlUnsigned(plus); + else + bedP = -1; + } + if (!startsWith("bed", btype)) + errAbort("type must begin with \"bed\""); + btype +=3; + bedN = sqlUnsigned(btype); + if (bedN < 3) + errAbort("Bed must be 3 or higher, found %d\n", bedN); + if (bedN > 15) + errAbort("Bed must be 15 or lower, found %d\n", bedN); + } +else + { + if (as) + errAbort("If you specify the .as file, you must specify the -type as well so that the number of standard BED columns is known."); + } + bedToBigBed(argv[1], argv[2], argv[3]); optionFree(); if (verboseLevel() > 1) printVmPeak(); return 0; }