e70152e44cc66cc599ff6b699eb8adc07f3e656a kent Sat May 24 21:09:34 2014 -0700 Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment. diff --git src/hg/lib/wigAsciiToBinary.c src/hg/lib/wigAsciiToBinary.c index 914abc5..7c13e9d 100644 --- src/hg/lib/wigAsciiToBinary.c +++ src/hg/lib/wigAsciiToBinary.c @@ -1,700 +1,703 @@ +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + /* wigAsciiToBinary - convert ascii Wiggle data to binary file * plus, produces file to read for load into database * * The format of the incoming data is described at: * http://hgwdev.cse.ucsc.edu/encode/submission.html#WIG * * Incoming data points are chunked up into blocks. * Each block of data is described by an SQL row in the .wig file * output. The data values are converted to single bytes in a * binning scheme. These binary bytes are written to the .wib file. * * The binary data file is simply one unsigned char value per byte. * The suffix for this binary data file is: .wib - wiggle binary * * The second file produced is a bed-like format file, with chrom, * chromStart, chromEnd, etc. See hg/inc/wiggle.h for structure * definition. The important thing about this file is that it will * have the pointers into the binary data file and statistical * summaries of the data values in this block. * This file extension is: .wig - wiggle * * Holes in the data are allowed. Incoming data specifications * set the span of a data point == how many nucleotide * bases the value is representative of. * * There is a concept of a "bin" size which means how many values * should be maintained in a single table row entry. Holes in the input * data will be filled with WIG_NO_DATA values up to the size of the * current "bin". If the missing data continues past the next "bin" size * number of data points, that table row will be skipped. Thus, it * is only necessary to have table rows which contain valid data. * binsize is currently 1024, however it will need to be an input * parameter for the command line version of this function. * (there are several parameters that need to come in from the * command line version -- to be done). */ #include "common.h" #include "options.h" #include "linefile.h" #include "wiggle.h" /* This list of static variables is here because the several * subroutines in this source file need access to all this business * and since there are so many it is not convenient to pass them * around in a structure. Just make sure they are initialized * properly upon entry to this function. This used to be a static * command line program and thus these did not need to be set * except for this declaration, now they are all re-initialized * upon entry to the function. * Functions in this source file: * static void output_row(); * static void setDataSpan(char *spec); * static void setStepSize(char *spec); * static void setFixedStart(char *spec); * static void setChromName(char *spec); * static void setDataValue(double val); * void wigAsciiToBinary( char *wigAscii, char *wigFile, char *wibFile, * double *upperLimit, double *lowerLimit ); */ static unsigned long lineCount = 0; /* counting all input lines */ static long long add_offset = 0; /* to allow "lifting" of the data */ static boolean noOverlap = FALSE; /* check for overlapping data */ static boolean flagOverlapSpanData = FALSE;// verify span data doesn't overlap static long long binsize = 1024; /* # of data points per table row */ static long long dataSpan = 1; /* bases spanned per data point */ static unsigned long long bincount = 0; /* to count binsize for wig file */ static unsigned long long chromStart = 0; /* for table row data */ static unsigned long long fixedStart = 0; /* for fixedStep data */ static unsigned long long stepSize = 1; /* for fixedStep */ static unsigned long long chromEnd = 0; /* for table row data */ static unsigned long long bedChromStart = 0; /* for bed format */ static unsigned long long bedChromEnd = 0; /* for bed format */ static double bedDataValue = 0.0; /* for bed format */ static FILE *wigout = (FILE *)NULL; /* file handle for wiggle database file */ static FILE *binout = (FILE *)NULL; /* file handle for binary data file */ static char *chromName = (char *)NULL; /* pseudo bed-6 data to follow */ static char *featureName = (char *)NULL;/* the name of this feature */ static unsigned long long rowCount = 0; /* to count rows output */ static off_t fileOffsetBegin = 0; /* where this bin started in binary */ static unsigned long long Offset = 0; /* from data input */ static unsigned long long previousOffset = 0; /* for data missing detection */ static off_t fileOffset = 0; /* where are we in the binary data */ static double *data_values = (double *) NULL; /* all the data values read in for this one row */ static unsigned char * validData = (unsigned char *) NULL; /* array to keep track of what data is valid */ /* during wigAsciiToBinary, these starting limits are for the complete set * of data, they must change from these initial defaults during * processing */ /* limits for the complete set of data, they must change from these initial defaults during processing */ static double overallUpperLimit = wigEncodeStartingUpperLimit; static double overallLowerLimit = wigEncodeStartingLowerLimit; static char *wibFileName = (char *)NULL; /* for use in output_row() */ static long long wibSizeLimit = 0; /* governor on ct trash files */ static long long wibSize = 0; /* counting up to limit */ #if defined(NOT) /* not used at this time, perhaps in the future */ static int isIntF(float x) { float y = floorf(x); return (0.0 == (x - y)); } #endif static int isIntD(double x) { double y = floor(x); return (0.0 == (x - y)); } /* * The table rows need to be printed in a couple of different * places in the code. * Definition of this row format can be found in hg/inc/wiggle.h * and hg/lib/wiggle.as, etc ... * output_row() is called each time a block of data values are * complete, this produces one SQL line definition of a data block. */ static void output_row() { /* limits for the complete set of data, they must change from these initial defaults during processing */ double lowerLimit = wigEncodeStartingLowerLimit; double upperLimit = wigEncodeStartingUpperLimit; double dataRange = 0.0; double sumData = 0.0; double sumSquares = 0.0; unsigned long long i; unsigned long long validCount = 0; /* number of valid data points */ boolean allIntegers = FALSE; /* off until further notice */ /* with the allIntegers function on, the "Maximum" of the * data ends up being artifically high. */ if (bincount) { wibSize += bincount; /* counting up to check limit */ /* scan the data and determine stats for it */ for (i=0; i < bincount; ++i) { if (validData[i]) { ++validCount; if (data_values[i] > upperLimit) upperLimit = data_values[i]; if (data_values[i] < lowerLimit) lowerLimit = data_values[i]; sumData += data_values[i]; sumSquares += data_values[i] * data_values[i]; #ifdef NOT if (allIntegers && (! isIntD(data_values[i]))) allIntegers = FALSE; #endif } } if (validCount < 1) { errAbort("wigAsciiToBinary internal error: empty row being produced at row: %lld, line: %lu\n", rowCount, lineCount); } dataRange = upperLimit - lowerLimit; /* If all integer data and will fit in the 128 bins, * then do them as integer bins */ if (allIntegers && isIntD(dataRange) && isIntD(upperLimit) && isIntD(lowerLimit) && (dataRange < (double)MAX_WIG_VALUE) && (dataRange > 0.0)) { dataRange = MAX_WIG_VALUE; } if (upperLimit > overallUpperLimit) overallUpperLimit = upperLimit; if (lowerLimit < overallLowerLimit) overallLowerLimit = lowerLimit; /* With limits determined, can now scale and output the values * compressed down to byte data values for the binary file */ for (i=0; i < bincount; ++i) { int byteOutput; /* should be unsigned char, but fputc() wants int */ if (validData[i]) { if (dataRange > 0.0) { byteOutput = MAX_WIG_VALUE * ((data_values[i] - lowerLimit)/dataRange); } else { byteOutput = 0; } } else { byteOutput = WIG_NO_DATA; } fputc(byteOutput,binout); /* output a data byte */ } chromEnd = chromStart + (bincount * dataSpan); verbose(2, "row: %llu %llu-%llu total: %llu valid: %llu, miss: %llu, file offset: %llu\n", rowCount, chromStart+add_offset, chromEnd+add_offset, bincount, validCount, bincount-validCount, (unsigned long long)fileOffsetBegin); fprintf( wigout, "%s\t%llu\t%llu\t%s.%llu\t%llu\t%llu\t%llu\t%s\t%g\t%g\t%llu\t%g\t%g\n", chromName, chromStart+add_offset, chromEnd+add_offset, featureName, rowCount, dataSpan, bincount, (unsigned long long)fileOffsetBegin, wibFileName, lowerLimit, dataRange, validCount, sumData, sumSquares); ++rowCount; } bincount = 0; /* to count up to binsize */ for (i=0; i < binsize; ++i) /* reset valid indicator */ validData[i] = FALSE; chromStart = Offset + dataSpan; verbose(2, "next chromStart = %llu = %llu + %llu\n", chromStart, Offset, dataSpan); fileOffsetBegin = fileOffset; } /* static void output_row() */ static void setDataSpan(char *spec) /* given *spec: "dataSpan=N", set parse and dataSpan to N */ { char *wordPairs[2]; int wc; char *clone; clone = cloneString(spec); wc = chopByChar(spec, '=', wordPairs, 2); if (wc != 2) errAbort("Expecting span=N, no N found in %s at line %lu", clone, lineCount); dataSpan = sqlLongLong(wordPairs[1]); freeMem(clone); } static void setStepSize(char *spec) /* given *spec: "step=<position>", parse and set stepSize and dataSpan */ { char *wordPairs[2]; int wc; char *clone; clone = cloneString(spec); wc = chopByChar(spec, '=', wordPairs, 2); if (wc != 2) errAbort("Expecting step=<size>, no <size> found in %s at line %lu", clone, lineCount); stepSize=sqlLongLong(wordPairs[1]); freeMem(clone); } static void setFixedStart(char *spec) /* given *spec: "start=<position>", parse and set fixedStart */ { char *wordPairs[2]; int wc; char *clone; clone = cloneString(spec); wc = chopByChar(spec, '=', wordPairs, 2); if (wc != 2) errAbort("Expecting start=<position>, no <position> found in %s at line %lu", clone, lineCount); fixedStart=sqlLongLong(wordPairs[1]); if (fixedStart == 0) errAbort("Found start=0 at line %lu, the first chrom position is 1, not 0", lineCount); else fixedStart = BASE_0(fixedStart); /* zero relative half-open */ freeMem(clone); } static void setChromName(char *spec) /* given *spec: "chrom=<name>", parse and set chromName, featureName */ { char *wordPairs[2]; int wc; char *clone; clone = cloneString(spec); wc = chopByChar(spec, '=', wordPairs, 2); if (wc != 2) errAbort("Expecting chrom=<name>, no <name> found in %s at line %lu", clone, lineCount); freez(&chromName); chromName=cloneString(wordPairs[1]); freez(&featureName); featureName=cloneString(wordPairs[1]); freeMem(clone); } static void setDataValue(double val) /* add a data value to the accumulating block of data */ { data_values[bincount] = val; validData[bincount] = TRUE; ++fileOffset; ++bincount; /* count scores in this bin */ /* Is it time to output a row definition ? */ if (bincount >= binsize) output_row(); } /* The single externally visible routine. * Future improvements will need to add a couple more arguments to * satisify the needs of the command line version and its options. * Currently, this is used only in customTrack input parsing. */ void wigAsciiToBinary( char *wigAscii, char *wigFile, char *wibFile, double *upperLimit, double *lowerLimit, struct wigEncodeOptions *options) /* given the three file names, read the ascii wigAscii file and produce * the wigFile and wibFile outputs */ { struct lineFile *lf; /* for line file utilities */ char *line = (char *) NULL; /* to receive data input line */ char *words[10]; /* to split data input line */ int wordCount = 0; /* result of split */ int validLines = 0; /* counting only lines with data */ double dataValue = 0.0; /* from data input */ boolean bedData = FALSE; /* in bed format data */ boolean variableStep = FALSE; /* in variableStep data */ boolean fixedStep = FALSE; /* in fixedStep data */ char *prevChromName = (char *)NULL; /* to watch for chrom name changes */ int trackCount = 0; /* We abort if we see more than one track. */ if ((wigAscii == (char *)NULL) || (wigFile == (char *)NULL) || (wibFile == (char *)NULL)) errAbort("wigAsciiToBinary: missing data file names, ascii: %s, wig: %s, wib: %s", wigAscii, wigFile, wibFile); /* need to be careful here and initialize all the global variables */ freez(&wibFileName); /* send this name to the global */ wibFileName = cloneString(wibFile); /* variable for use in output_row() */ lineCount = 0; /* to count all lines */ add_offset = 0; /* to allow "lifting" of the data */ validLines = 0; /* to count only lines with data */ rowCount = 0; /* to count rows output */ bincount = 0; /* to count up to binsize */ binsize = 1024; /* # of data points per table row */ dataSpan = 1; /* default bases spanned per data point */ chromStart = 0; /* for table row data */ previousOffset = 0; /* for data missing detection */ fileOffset = 0; /* current location within binary data file */ fileOffsetBegin = 0;/* location in binary data file where this bin starts*/ freez(&data_values); freez(&validData); data_values = (double *) needMem( (size_t) (binsize * sizeof(double))); validData = (unsigned char *) needMem( (size_t) (binsize * sizeof(unsigned char))); if (options != NULL) { if (options->lift != 0) add_offset = options->lift; if (options->noOverlap) noOverlap = TRUE; if (options->flagOverlapSpanData) flagOverlapSpanData = TRUE; if (options->wibSizeLimit > 0) wibSizeLimit = options->wibSizeLimit; } /* limits for the complete set of data, they must change from these initial defaults during processing */ overallLowerLimit = wigEncodeStartingLowerLimit; overallUpperLimit = wigEncodeStartingUpperLimit; binout = mustOpen(wibFile,"w"); /* binary data file */ wigout = mustOpen(wigFile,"w"); /* table row definition file */ #if defined(DEBUG) /* dbg */ chmod(wibFile, 0666); chmod(wigFile, 0666); #endif lf = lineFileOpen(wigAscii, TRUE); /* input file */ while (lineFileNext(lf, &line, NULL)) { boolean readingFrameSlipped; ++lineCount; if ((wibSizeLimit > 0) && (wibSize >= wibSizeLimit)) errAbort("data size limit of %lld data values has been exceeded. This data can be efficiently displayed with the <A HREF='/goldenPath/help/bigWig.html' TARGET=_blank>bigWig file format</A> in a custom track, or in a <A HREF='/goldenPath/help/hgTrackHubHelp.html' TARGET=_blank>Track Hub</A> for multiple large datasets.", wibSizeLimit); line = skipLeadingSpaces(line); /* ignore blank or comment lines */ if ((line == (char *)NULL) || (line[0] == '\0') || (line[0] == '#')) continue; /* !!! go to next line of input */ wordCount = chopByWhite(line, words, ArraySize(words)); if (sameWord("track",words[0])) { /* Allow (and ignore) one track line, but no more. */ ++trackCount; if (trackCount > 1) errAbort("Multiple tracks seen, second at line %d of %s, can only handle one.", lf->lineIx, lf->fileName); continue; } else if (sameWord("browser", words[0])) { continue; /* ignore browser lines if present */ } else if (sameWord("variableStep",words[0])) { int i; boolean foundChrom = FALSE; /* safest thing to do if we were processing anything is to * output that previous block and start anew * Future improvement could get fancy here and decide if it * is really necessary to start over, although the concept * of a line between data points on one item may use this * block behavior later to define line segments, so don't * get too quick to be fancy here. This line behavior * implies that feature names will need to be specified to * identify the line segments that belong together. */ if (variableStep || bedData || fixedStep) { output_row(); validLines = 0; /* to cause reset for first offset */ } dataSpan = 1; /* default bases spanned per data point */ for(i = 1; i < wordCount; ++i) { if (startsWith("chrom",words[i])) { setChromName(words[i]); foundChrom = TRUE; } else if (startsWith("span",words[i])) setDataSpan(words[i]); else errAbort("illegal specification on variableStep at line %lu: %s", lineCount, words[i]); } if (!foundChrom) errAbort("missing chrom=<name> specification on variableStep declaration at line %lu", lineCount); variableStep = TRUE; bedData = FALSE; fixedStep = FALSE; freez(&prevChromName); prevChromName = cloneString(chromName); continue; /* !!! go to next input line */ } else if (sameWord("fixedStep",words[0])) { boolean foundChrom = FALSE; boolean foundStart = FALSE; int i; /* same comment as above */ if (variableStep || bedData || fixedStep) { output_row(); validLines = 0; /* to cause reset for first offset */ } stepSize = 1; /* default step size */ dataSpan = 0; /* this will match step size if not set*/ for(i = 1; i < wordCount; ++i) { if (startsWith("chrom",words[i])) { setChromName(words[i]); foundChrom = TRUE; } else if (startsWith("start",words[i])) { setFixedStart(words[i]); foundStart = TRUE; } else if (startsWith("step",words[i])) setStepSize(words[i]); else if (startsWith("span",words[i])) setDataSpan(words[i]); else errAbort("illegal specification on variableStep at line %lu: %s", lineCount, words[i]); } if (dataSpan == 0) dataSpan = stepSize; if (!foundChrom) errAbort("missing chrom=<name> specification on fixedStep declaration at line %lu", lineCount); if (!foundStart) errAbort("missing start=<position> specification on fixedStep declaration at line %lu", lineCount); if (noOverlap && validLines && prevChromName) { if (sameWord(prevChromName,chromName) && (fixedStart < chromStart)) errAbort("specified fixedStep chromStart %llu is less than expected next chromStart %llu", fixedStart, chromStart); } variableStep = FALSE; bedData = FALSE; fixedStep = TRUE; freez(&prevChromName); prevChromName = cloneString(chromName); continue; /* !!! go to next input line */ } else if (wordCount == 4) { /* while in bedData, we do not necessarily need to start a new * batch unless the chrom name is changing, since dataSpan * is always 1 for bedData. As above, this may change in * the future if each bed line specification is talking * about a different feature. */ if (variableStep || fixedStep || (bedData && ((prevChromName != (char *)NULL) && differentWord(prevChromName,words[0])))) { output_row(); validLines = 0; /* to cause reset for first offset */ } dataSpan = 1; /* default bases spanned per data point */ variableStep = FALSE; bedData = TRUE; fixedStep = FALSE; freez(&chromName); chromName=cloneString(words[0]); freez(&featureName); featureName=cloneString(words[0]); bedChromStart = sqlLongLong(words[1]); bedChromEnd = sqlLongLong(words[2]); bedDataValue = sqlDouble(words[3]); /* the bed format coordinate system is zero relative, half-open, * hence, no adjustment of bedChromStart is needed here, unlike the * fixed and variable step formats which will subtract one from the * incoming coordinate. */ if (bedChromStart >= bedChromEnd) errAbort("Found chromStart >= chromEnd at line %lu (%llu > %llu)", lineCount, bedChromStart, bedChromEnd); if (bedChromEnd > (bedChromStart + 10000000)) errAbort("Limit of 10,000,000 length specification for bed format at line %lu, found: %llu)", lineCount, bedChromEnd-bedChromStart); if ((validLines > 0) && (bedChromStart < previousOffset)) errAbort("chrom positions not in numerical order at line %lu. previous: %llu > %llu <-current (bed)", lineCount, previousOffset, bedChromStart); freez(&prevChromName); prevChromName = cloneString(chromName); } /* We must be in one of these data formats at this point */ if (!(variableStep || fixedStep || bedData)) errAbort("at the line beginning: %s, variableStep or fixedStep data declaration not found or BED data 4 column format not recognized.", words[0]); if (variableStep && (wordCount != 2)) errAbort("Expecting two words for variableStep data at line %lu, found %d", lineCount, wordCount); if (fixedStep && (wordCount != 1)) errAbort("Expecting one word for fixedStep data at line %lu, found %d", lineCount, wordCount); if (bedData && (wordCount != 4)) errAbort("Expecting four words for bed format data at line %lu, found %d", lineCount, wordCount); ++validLines; /* counting good lines of data input */ /* Offset is the incoming specified position for this value, * fixedStart has already been converted to zero * relative half open */ if (variableStep) { Offset = sqlLongLong(words[0]); Offset = BASE_0(Offset); /* zero relative half open */ dataValue = sqlDouble(words[1]); } else if (fixedStep) { Offset = fixedStart + (stepSize * (validLines - 1)); dataValue = sqlDouble(words[0]); } else if (bedData) { Offset = bedChromStart; dataValue = bedDataValue; } if (dataValue > overallUpperLimit) overallUpperLimit = dataValue; if (dataValue < overallLowerLimit) overallLowerLimit = dataValue; /* see if this is the first time through, establish chromStart */ if (validLines == 1) { chromStart = Offset; verbose(2, "first offset: %llu\n", chromStart); } else if ((validLines > 1) && (Offset <= previousOffset)) errAbort("chrom positions not in numerical order at line %lu. previous: %llu > %llu " "<-current (offset)", lineCount, BASE_1(previousOffset), BASE_1(Offset)); /* if we are working on a zoom level and the data is not exactly * spaced according to the span, then we need to put each value * in its own row in order to keep positioning correct for these * data values. The number of skipped bases has to be an even * multiple of dataSpan */ readingFrameSlipped = FALSE; if ((validLines > 1) && (dataSpan > 1)) { unsigned long long prevEnd = previousOffset + dataSpan; int skippedBases; int spansSkipped; skippedBases = Offset - previousOffset; if (flagOverlapSpanData && (prevEnd > Offset)) errAbort("ERROR: data points overlapping at input line %lu.\n" "previous data point position: %s:%llu-%llu overlaps current: %s:%llu-%llu", lineCount, chromName, BASE_1(previousOffset), prevEnd, chromName, BASE_1(Offset),Offset+dataSpan); spansSkipped = skippedBases / dataSpan; if ((spansSkipped * dataSpan) != skippedBases) readingFrameSlipped = TRUE; } if (readingFrameSlipped) { verbose(2, "data not spanning %llu bases, prev: %llu, this: %llu, at line: %lu\n", dataSpan, previousOffset, Offset, lineCount); output_row(); chromStart = Offset; /* a full reset here */ } /* Check to see if data is being skipped */ else if ( (validLines > 1) && (Offset > (previousOffset + dataSpan)) ) { unsigned long long off; unsigned long long fillSize; /* number of bytes */ verbose(2, "missing data offsets: %llu - %llu\n", BASE_1(previousOffset),BASE_0(Offset)); /* If we are just going to fill the rest of this bin with * no data, then may as well stop here. No need to fill * it with nothing. */ fillSize = (Offset - (previousOffset + dataSpan)) / dataSpan; verbose(2, "filling NO_DATA for %llu bytes\n", fillSize); if (fillSize + bincount >= binsize) { verbose(2, "completing a bin due to NO_DATA for %llu bytes, only %llu - %llu = %llu to go\n", fillSize, binsize, bincount, binsize - bincount); verbose(2, "Offset: %llu, previousOffset: %llu\n", Offset, previousOffset); output_row(); chromStart = Offset; /* a full reset here */ } else { fillSize = 0; /* fill missing data with NO_DATA indication */ for (off = previousOffset + dataSpan; off < Offset; off += dataSpan) { ++fillSize; ++fileOffset; ++bincount; /* count scores in this bin */ if (bincount >= binsize) break; } verbose(2, "filled NO_DATA for %llu bytes\n", fillSize); /* If that finished off this bin, output it * This most likely should not happen here. The * check above: if (fillSize + bincount >= binsize) * should have caught this case already. */ if (bincount >= binsize) { output_row(); chromStart = Offset; /* a full reset here */ } } } /* With perhaps the missing data taken care of, back to the * real data. */ if (bedData) { unsigned long long bedSize = bedChromEnd - bedChromStart; for ( ; bedSize > 0; --bedSize ) { setDataValue(bedDataValue); Offset += 1; } Offset -= 1; /* loop above increments this one too much. * This Offset is supposed to be the last * valid chrom position written, not the * next to be written */ } else { setDataValue(dataValue); } previousOffset = Offset; /* remember position for gap calculations */ } /* reading file input loop end */ /* Done with input file, any data points left in this bin ? */ if (bincount) output_row(); lineFileClose(&lf); fclose(binout); fclose(wigout); freez(&chromName); freez(&featureName); freez(&data_values); freez(&validData); freez(&wibFileName); /* return limits if pointers are given */ if (upperLimit) *upperLimit = overallUpperLimit; if (lowerLimit) *lowerLimit = overallLowerLimit; if (wibSizeLimit > 0) options->wibSizeLimit = wibSize; }