src/lib/basicBed.c 1.1
1.1 2009/04/17 22:02:51 kent
Separating out basicBed module from bed module, so that basicBed can live in src/lib and be clear that it has no database dependence.
Index: src/lib/basicBed.c
===================================================================
RCS file: src/lib/basicBed.c
diff -N src/lib/basicBed.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/lib/basicBed.c 17 Apr 2009 22:02:51 -0000 1.1
@@ -0,0 +1,1259 @@
+/* basicBed contains the basic code for Browser Extensible Data (bed) files and tables.
+ * The idea behind bed is that the first three fields are defined and required.
+ * A total of 15 fields are defined, and the file can contain any number of these.
+ * In addition after any number of defined fields there can be custom fields that
+ * are not defined in the bed spec.
+ *
+ * There's additional bed-related code in src/hg/inc/bed.h. This module contains the
+ * stuff that's independent of the database and other genomic structures. */
+
+
+#include "common.h"
+#include "hash.h"
+#include "linefile.h"
+#include "dystring.h"
+#include "sqlNum.h"
+#include "sqlList.h"
+#include "rangeTree.h"
+#include "binRange.h"
+#include "basicBed.h"
+
+static char const rcsid[] = "$Id$";
+
+void bedStaticLoad(char **row, struct bed *ret)
+/* Load a row from bed table into ret. The contents of ret will
+ * be replaced at the next call to this function. */
+{
+ret->chrom = row[0];
+ret->chromStart = sqlUnsigned(row[1]);
+ret->chromEnd = sqlUnsigned(row[2]);
+ret->name = row[3];
+}
+
+struct bed *bedLoad(char **row)
+/* Load a bed from row fetched with select * from bed
+ * from database. Dispose of this with bedFree(). */
+{
+struct bed *ret;
+AllocVar(ret);
+ret->chrom = cloneString(row[0]);
+ret->chromStart = sqlUnsigned(row[1]);
+ret->chromEnd = sqlUnsigned(row[2]);
+ret->name = cloneString(row[3]);
+return ret;
+}
+
+struct bed *bedCommaIn(char **pS, struct bed *ret)
+/* Create a bed out of a comma separated string.
+ * This will fill in ret if non-null, otherwise will
+ * return a new bed */
+{
+char *s = *pS;
+
+if (ret == NULL)
+ AllocVar(ret);
+ret->chrom = sqlStringComma(&s);
+ret->chromStart = sqlUnsignedComma(&s);
+ret->chromEnd = sqlUnsignedComma(&s);
+ret->name = sqlStringComma(&s);
+*pS = s;
+return ret;
+}
+
+void bedFree(struct bed **pEl)
+/* Free a single dynamically allocated bed such as created
+ * with bedLoad(). */
+{
+struct bed *el;
+
+if ((el = *pEl) == NULL) return;
+freeMem(el->chrom);
+freeMem(el->name);
+freeMem(el->blockSizes);
+freeMem(el->chromStarts);
+freeMem(el->expIds);
+freeMem(el->expScores);
+freez(pEl);
+}
+
+void bedFreeList(struct bed **pList)
+/* Free a list of dynamically allocated bed's */
+{
+struct bed *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ bedFree(&el);
+ }
+*pList = NULL;
+}
+
+void bedOutput(struct bed *el, FILE *f, char sep, char lastSep)
+/* Print out bed. Separate fields with sep. Follow last field with lastSep. */
+{
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->chrom);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+fprintf(f, "%u", el->chromStart);
+fputc(sep,f);
+fprintf(f, "%u", el->chromEnd);
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->name);
+if (sep == ',') fputc('"',f);
+fputc(lastSep,f);
+}
+
+/* --------------- End of AutoSQL generated code. --------------- */
+
+int bedCmp(const void *va, const void *vb)
+/* Compare to sort based on chrom,chromStart. */
+{
+const struct bed *a = *((struct bed **)va);
+const struct bed *b = *((struct bed **)vb);
+int dif;
+dif = strcmp(a->chrom, b->chrom);
+if (dif == 0)
+ dif = a->chromStart - b->chromStart;
+return dif;
+}
+
+int bedCmpEnd(const void *va, const void *vb)
+/* Compare to sort based on chrom,chromEnd. */
+{
+const struct bed *a = *((struct bed **)va);
+const struct bed *b = *((struct bed **)vb);
+int dif;
+dif = strcmp(a->chrom, b->chrom);
+if (dif == 0)
+ dif = a->chromEnd - b->chromEnd;
+return dif;
+}
+
+int bedCmpScore(const void *va, const void *vb)
+/* Compare to sort based on score - lowest first. */
+{
+const struct bed *a = *((struct bed **)va);
+const struct bed *b = *((struct bed **)vb);
+return a->score - b->score;
+}
+
+int bedCmpPlusScore(const void *va, const void *vb)
+/* Compare to sort based on chrom,chromStart. */
+{
+const struct bed *a = *((struct bed **)va);
+const struct bed *b = *((struct bed **)vb);
+int dif;
+dif = strcmp(a->chrom, b->chrom);
+if (dif == 0)
+ {
+ dif = (a->chromStart - b->chromStart) * 1000 +(a->score - b->score);
+ }
+return dif;
+}
+
+int bedCmpSize(const void *va, const void *vb)
+/* Compare to sort based on size of element (end-start == size) */
+{
+const struct bed *a = *((struct bed **)va);
+const struct bed *b = *((struct bed **)vb);
+int a_size = a->chromEnd - a->chromStart;
+int b_size = b->chromEnd - b->chromStart;
+return (a_size - b_size);
+}
+
+int bedCmpChromStrandStart(const void *va, const void *vb)
+/* Compare to sort based on chrom,strand,chromStart. */
+{
+const struct bed *a = *((struct bed **)va);
+const struct bed *b = *((struct bed **)vb);
+int dif;
+dif = strcmp(a->chrom, b->chrom);
+if (dif == 0)
+ dif = strcmp(a->strand, b->strand);
+if (dif == 0)
+ dif = a->chromStart - b->chromStart;
+return dif;
+}
+
+struct bedLine *bedLineNew(char *line)
+/* Create a new bedLine based on tab-separated string s. */
+{
+struct bedLine *bl;
+char *s, c;
+
+AllocVar(bl);
+bl->chrom = cloneString(line);
+s = strchr(bl->chrom, '\t');
+if (s == NULL)
+ errAbort("Expecting tab in bed line %s", line);
+*s++ = 0;
+c = *s;
+if (isdigit(c) || (c == '-' && isdigit(s[1])))
+ {
+ bl->chromStart = atoi(s);
+ bl->line = s;
+ return bl;
+ }
+else
+ {
+ errAbort("Expecting start position in second field of %s", line);
+ return NULL;
+ }
+}
+
+void bedLineFree(struct bedLine **pBl)
+/* Free up memory associated with bedLine. */
+{
+struct bedLine *bl;
+
+if ((bl = *pBl) != NULL)
+ {
+ freeMem(bl->chrom);
+ freez(pBl);
+ }
+}
+
+void bedLineFreeList(struct bedLine **pList)
+/* Free a list of dynamically allocated bedLine's */
+{
+struct bedLine *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ bedLineFree(&el);
+ }
+*pList = NULL;
+}
+
+
+int bedLineCmp(const void *va, const void *vb)
+/* Compare to sort based on query. */
+{
+const struct bedLine *a = *((struct bedLine **)va);
+const struct bedLine *b = *((struct bedLine **)vb);
+int dif;
+dif = strcmp(a->chrom, b->chrom);
+if (dif == 0)
+ dif = a->chromStart - b->chromStart;
+return dif;
+}
+
+
+void bedSortFile(char *inFile, char *outFile)
+/* Sort a bed file (in place, overwrites old file. */
+{
+struct lineFile *lf = NULL;
+FILE *f = NULL;
+struct bedLine *blList = NULL, *bl;
+char *line;
+int lineSize;
+
+verbose(2, "Reading %s\n", inFile);
+lf = lineFileOpen(inFile, TRUE);
+while (lineFileNext(lf, &line, &lineSize))
+ {
+ if (line[0] == '#')
+ continue;
+ bl = bedLineNew(line);
+ slAddHead(&blList, bl);
+ }
+lineFileClose(&lf);
+
+verbose(2, "Sorting\n");
+slSort(&blList, bedLineCmp);
+
+verbose(2, "Writing %s\n", outFile);
+f = mustOpen(outFile, "w");
+for (bl = blList; bl != NULL; bl = bl->next)
+ {
+ fprintf(f, "%s\t%s\n", bl->chrom, bl->line);
+ if (ferror(f))
+ {
+ perror("Writing error\n");
+ errAbort("%s is truncated, sorry.", outFile);
+ }
+ }
+fclose(f);
+}
+
+struct bed *bedLoad3(char **row)
+/* Load first three fields of bed. */
+{
+struct bed *ret;
+AllocVar(ret);
+ret->chrom = cloneString(row[0]);
+ret->chromStart = sqlUnsigned(row[1]);
+ret->chromEnd = sqlUnsigned(row[2]);
+return ret;
+}
+
+struct bed *bedLoad5(char **row)
+/* Load first five fields of bed. */
+{
+struct bed *ret;
+AllocVar(ret);
+ret->chrom = cloneString(row[0]);
+ret->chromStart = sqlUnsigned(row[1]);
+ret->chromEnd = sqlUnsigned(row[2]);
+ret->name = cloneString(row[3]);
+ret->score = sqlSigned(row[4]);
+return ret;
+}
+
+struct bed *bedLoad6(char **row)
+/* Load first six fields of bed. */
+{
+struct bed *ret = bedLoad5(row);
+safef(ret->strand, sizeof(ret->strand), "%s", row[5]);
+return ret;
+}
+
+/* it turns out that it isn't just hgLoadBed and custom tracks
+ * that may encounter the r,g,b specification. Any program that
+ * reads bed files may enconter them, so take care of them
+ * at any time. The strchr() function is very fast which will
+ * be a failure in the vast majority of cases parsing integers,
+ * therefore, this shouldn't be too severe a performance hit.
+ */
+static int itemRgbColumn(char *column9)
+{
+int itemRgb = 0;
+/* Allow comma separated list of rgb values here */
+char *comma = strchr(column9, ',');
+if (comma)
+ {
+ if (-1 == (itemRgb = bedParseRgb(column9)))
+ errAbort("ERROR: expecting r,g,b specification, "
+ "found: '%s'", column9);
+ }
+else
+ itemRgb = sqlUnsigned(column9);
+return itemRgb;
+}
+
+struct bed *bedLoad12(char **row)
+/* Load a bed from row fetched with select * from bed
+ * from database. Dispose of this with bedFree(). */
+{
+struct bed *ret;
+int sizeOne;
+
+AllocVar(ret);
+ret->blockCount = sqlSigned(row[9]);
+ret->chrom = cloneString(row[0]);
+ret->chromStart = sqlUnsigned(row[1]);
+ret->chromEnd = sqlUnsigned(row[2]);
+ret->name = cloneString(row[3]);
+ret->score = sqlSigned(row[4]);
+strcpy(ret->strand, row[5]);
+ret->thickStart = sqlUnsigned(row[6]);
+ret->thickEnd = sqlUnsigned(row[7]);
+ret->itemRgb = itemRgbColumn(row[8]);
+sqlSignedDynamicArray(row[10], &ret->blockSizes, &sizeOne);
+assert(sizeOne == ret->blockCount);
+sqlSignedDynamicArray(row[11], &ret->chromStarts, &sizeOne);
+assert(sizeOne == ret->blockCount);
+return ret;
+}
+
+struct bed *bedLoadN(char *row[], int wordCount)
+/* Convert a row of strings to a bed. */
+{
+struct bed * bed;
+int count;
+
+AllocVar(bed);
+bed->chrom = cloneString(row[0]);
+bed->chromStart = sqlUnsigned(row[1]);
+bed->chromEnd = sqlUnsigned(row[2]);
+if (wordCount > 3)
+ bed->name = cloneString(row[3]);
+if (wordCount > 4)
+ bed->score = sqlSigned(row[4]);
+if (wordCount > 5)
+ bed->strand[0] = row[5][0];
+if (wordCount > 6)
+ bed->thickStart = sqlUnsigned(row[6]);
+else
+ bed->thickStart = bed->chromStart;
+if (wordCount > 7)
+ bed->thickEnd = sqlUnsigned(row[7]);
+else
+ bed->thickEnd = bed->chromEnd;
+if (wordCount > 8)
+ bed->itemRgb = itemRgbColumn(row[8]);
+if (wordCount > 9)
+ bed->blockCount = sqlUnsigned(row[9]);
+if (wordCount > 10)
+ sqlSignedDynamicArray(row[10], &bed->blockSizes, &count);
+if (wordCount > 11)
+ sqlSignedDynamicArray(row[11], &bed->chromStarts, &count);
+if (wordCount > 12)
+ bed->expCount = sqlUnsigned(row[12]);
+if (wordCount > 13)
+ sqlSignedDynamicArray(row[13], &bed->expIds, &count);
+if (wordCount > 14)
+ sqlFloatDynamicArray(row[14], &bed->expScores, &count);
+return bed;
+}
+
+struct bed *bedLoadNAllChrom(char *fileName, int numFields, char* chrom)
+/* Load bed entries from a tab-separated file that have the given chrom.
+ * Dispose of this with bedFreeList(). */
+{
+struct bed *list = NULL, *el;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[numFields];
+
+while (lineFileRow(lf, row))
+ {
+ el = bedLoadN(row, numFields);
+ if(chrom == NULL || sameString(el->chrom, chrom))
+ slAddHead(&list, el);
+ else
+ bedFree(&el);
+ }
+lineFileClose(&lf);
+slReverse(&list);
+return list;
+}
+
+struct bed *bedLoadNAll(char *fileName, int numFields)
+/* Load all bed from a tab-separated file.
+ * Dispose of this with bedFreeList(). */
+{
+return bedLoadNAllChrom(fileName, numFields, NULL);
+}
+
+struct bed *bedLoadAll(char *fileName)
+/* Determines how many fields are in a bedFile and load all beds from
+ * a tab-separated file. Dispose of this with bedFreeList(). */
+{
+struct bed *list = NULL;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+int numFields = 0;
+char *line = NULL;
+/* First peek to see how many columns in the bed file. */
+lineFileNextReal(lf, &line);
+
+/* If there is something in the file then read it. If file
+ is empty return NULL. */
+if(line != NULL)
+ {
+ numFields = chopByWhite(line, NULL, 0);
+ lineFileClose(&lf);
+ if(numFields < 4) /* Minimum number of fields. */
+ errAbort("file %s doesn't appear to be in bed format. At least 4 fields required, got %d",
+ fileName, numFields);
+ /* Now load them up with that number of fields. */
+ list = bedLoadNAll(fileName, numFields);
+ }
+else
+ lineFileClose(&lf);
+
+return list;
+}
+
+static void bedOutputN_Opt(struct bed *el, int wordCount, FILE *f,
+ char sep, char lastSep, boolean useItemRgb)
+/* Write a bed of wordCount fields, optionally interpreting field nine
+ as R,G,B values. */
+{
+int i;
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->chrom);
+if (sep == ',') fputc('"',f);
+fputc(sep,f);
+fprintf(f, "%u", el->chromStart);
+fputc(sep,f);
+fprintf(f, "%u", el->chromEnd);
+if (wordCount <= 3)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->name);
+if (sep == ',') fputc('"',f);
+if (wordCount <= 4)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+fprintf(f, "%d", el->score);
+if (wordCount <= 5)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (sep == ',') fputc('"',f);
+fprintf(f, "%s", el->strand);
+if (sep == ',') fputc('"',f);
+if (wordCount <= 6)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+fprintf(f, "%u", el->thickStart);
+if (wordCount <= 7)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+fprintf(f, "%u", el->thickEnd);
+if (wordCount <= 8)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (useItemRgb)
+ fprintf(f, "%d,%d,%d", (el->itemRgb & 0xff0000) >> 16,
+ (el->itemRgb & 0xff00) >> 8, (el->itemRgb & 0xff));
+else
+ fprintf(f, "%u", el->itemRgb);
+if (wordCount <= 9)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+fprintf(f, "%d", el->blockCount);
+if (wordCount <= 10)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (sep == ',') fputc('{',f);
+for (i=0; i<el->blockCount; ++i)
+ {
+ fprintf(f, "%d", el->blockSizes[i]);
+ fputc(',', f);
+ }
+if (sep == ',') fputc('}',f);
+if (wordCount <= 11)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (sep == ',') fputc('{',f);
+for (i=0; i<el->blockCount; ++i)
+ {
+ fprintf(f, "%d", el->chromStarts[i]);
+ fputc(',', f);
+ }
+if (sep == ',') fputc('}',f);
+
+if (wordCount <= 12)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+fprintf(f, "%d", el->expCount);
+
+if (wordCount <= 13)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (sep == ',') fputc('{',f);
+for (i=0; i<el->expCount; ++i)
+ {
+ fprintf(f, "%d", el->expIds[i]);
+ fputc(',', f);
+ }
+if (sep == ',') fputc('}',f);
+
+
+if (wordCount <= 14)
+ {
+ fputc(lastSep, f);
+ return;
+ }
+fputc(sep,f);
+if (sep == ',') fputc('{',f);
+for (i=0; i<el->expCount; ++i)
+ {
+ fprintf(f, "%g", el->expScores[i]);
+ fputc(',', f);
+ }
+if (sep == ',') fputc('}',f);
+
+
+fputc(lastSep,f);
+}
+
+void bedOutputN(struct bed *el, int wordCount, FILE *f, char sep, char lastSep)
+/* Write a bed of wordCount fields. */
+{
+bedOutputN_Opt(el, wordCount, f, sep, lastSep, FALSE);
+}
+
+void bedOutputNitemRgb(struct bed *el, int wordCount, FILE *f,
+ char sep, char lastSep)
+/* Write a bed of wordCount fields, interpret column 9 as RGB. */
+{
+bedOutputN_Opt(el, wordCount, f, sep, lastSep, TRUE);
+}
+
+
+int bedTotalBlockSize(struct bed *bed)
+/* Return total size of all blocks. */
+{
+int total = 0;
+int i;
+if (bed->blockCount == 0)
+ return bed->chromEnd - bed->chromStart;
+for (i=0; i<bed->blockCount; ++i)
+ total += bed->blockSizes[i];
+return total;
+}
+
+int bedBlockSizeInRange(struct bed *bed, int rangeStart, int rangeEnd)
+/* Get size of all parts of all exons between rangeStart and rangeEnd. */
+{
+int total = 0;
+int i;
+for (i=0; i<bed->blockCount; ++i)
+ {
+ int start = bed->chromStart + bed->chromStarts[i];
+ int end = start + bed->blockSizes[i];
+ total += positiveRangeIntersection(start, end, rangeStart, rangeEnd);
+ }
+return total;
+}
+
+int bedTotalThickBlockSize(struct bed *bed)
+/* Return total size of all thick blocks. */
+{
+return bedBlockSizeInRange(bed, bed->thickStart, bed->thickEnd);
+}
+
+int bedStartThinSize(struct bed *bed)
+/* Return total size of all blocks before thick part. */
+{
+return bedBlockSizeInRange(bed, bed->chromStart, bed->thickStart);
+}
+
+int bedEndThinSize(struct bed *bed)
+/* Return total size of all blocks after thick part. */
+{
+return bedBlockSizeInRange(bed, bed->thickEnd, bed->chromEnd);
+}
+
+struct bed *bedFromPsl(struct psl *psl)
+/* Convert a single psl to a bed structure */
+{
+struct bed *bed;
+int i, blockCount, *chromStarts, chromStart;
+
+/* A tiny bit of error checking on the psl. */
+if (psl->qStart >= psl->qEnd || psl->qEnd > psl->qSize
+ || psl->tStart >= psl->tEnd || psl->tEnd > psl->tSize)
+ {
+ errAbort("mangled psl format for %s", psl->qName);
+ }
+
+/* Allocate bed and fill in from psl. */
+AllocVar(bed);
+bed->chrom = cloneString(psl->tName);
+bed->chromStart = bed->thickStart = chromStart = psl->tStart;
+bed->chromEnd = bed->thickEnd = psl->tEnd;
+bed->score = 1000 - 2*pslCalcMilliBad(psl, TRUE);
+if (bed->score < 0) bed->score = 0;
+strncpy(bed->strand, psl->strand, sizeof(bed->strand));
+bed->blockCount = blockCount = psl->blockCount;
+bed->blockSizes = (int *)cloneMem(psl->blockSizes,(sizeof(int)*psl->blockCount));
+bed->chromStarts = chromStarts = (int *)cloneMem(psl->tStarts, (sizeof(int)*psl->blockCount));
+bed->name = cloneString(psl->qName);
+
+/* Switch minus target strand to plus strand. */
+if (psl->strand[1] == '-')
+ {
+ int chromSize = psl->tSize;
+ reverseInts(bed->blockSizes, blockCount);
+ reverseInts(chromStarts, blockCount);
+ for (i=0; i<blockCount; ++i)
+ chromStarts[i] = chromSize - chromStarts[i];
+ }
+
+/* Convert coordinates to relative. */
+for (i=0; i<blockCount; ++i)
+ chromStarts[i] -= chromStart;
+return bed;
+}
+
+void makeItBed12(struct bed *bedList, int numFields)
+/* If it's less than bed 12, make it bed 12. The numFields */
+/* param is for how many fields the bed *currently* has. */
+{
+int i = 1;
+struct bed *cur;
+for (cur = bedList; cur != NULL; cur = cur->next)
+ {
+ /* it better be bigger than bed 3. */
+ if (numFields < 4)
+ {
+ char name[50];
+ safef(name, ArraySize(name), "item.%d", i+1);
+ cur->name = cloneString(name);
+ }
+ if (numFields < 5)
+ cur->score = 1000;
+ if (numFields < 6)
+ {
+ cur->strand[0] = '?';
+ cur->strand[1] = '\0';
+ }
+ if (numFields < 8)
+ {
+ cur->thickStart = cur->chromStart;
+ cur->thickEnd = cur->chromEnd;
+ }
+ if (numFields < 9)
+ cur->itemRgb = 0;
+ if (numFields < 12)
+ {
+ cur->blockSizes = needMem(sizeof(int));
+ cur->chromStarts = needMem(sizeof(int));
+ cur->blockCount = 1;
+ cur->chromStarts[0] = 0;
+ cur->blockSizes[0] = cur->chromEnd - cur->chromStart;
+ }
+ i++;
+ }
+}
+
+struct bed *lmCloneBed(struct bed *bed, struct lm *lm)
+/* Make a copy of bed in local memory. */
+{
+struct bed *newBed;
+if (bed == NULL)
+ return NULL;
+lmAllocVar(lm, newBed);
+newBed->chrom = lmCloneString(lm, bed->chrom);
+newBed->chromStart = bed->chromStart;
+newBed->chromEnd = bed->chromEnd;
+newBed->name = lmCloneString(lm, bed->name);
+newBed->score = bed->score;
+strncpy(newBed->strand, bed->strand, sizeof(bed->strand));
+newBed->thickStart = bed->thickStart;
+newBed->thickEnd = bed->thickEnd;
+newBed->itemRgb = bed->itemRgb;
+newBed->blockCount = bed->blockCount;
+if (bed->blockCount > 0)
+ {
+ newBed->blockSizes = lmCloneMem(lm, bed->blockSizes,
+ sizeof(bed->blockSizes[0]) * bed->blockCount);
+ newBed->chromStarts = lmCloneMem(lm, bed->chromStarts,
+ sizeof(bed->chromStarts[0]) * bed->blockCount);
+ }
+newBed->expCount = bed->expCount;
+if (bed->expCount > 0)
+ {
+ newBed->expIds = lmCloneMem(lm, bed->expIds,
+ sizeof(bed->expIds[0]) * bed->expCount);
+ newBed->expScores = lmCloneMem(lm, bed->expScores,
+ sizeof(bed->expScores[0]) * bed->expCount);
+ }
+return(newBed);
+}
+
+
+struct bed *cloneBed(struct bed *bed)
+/* Make an all-newly-allocated copy of a single bed record. */
+{
+struct bed *newBed;
+if (bed == NULL)
+ return NULL;
+AllocVar(newBed);
+newBed->chrom = cloneString(bed->chrom);
+newBed->chromStart = bed->chromStart;
+newBed->chromEnd = bed->chromEnd;
+newBed->name = cloneString(bed->name);
+newBed->score = bed->score;
+strncpy(newBed->strand, bed->strand, sizeof(bed->strand));
+newBed->thickStart = bed->thickStart;
+newBed->thickEnd = bed->thickEnd;
+newBed->itemRgb = bed->itemRgb;
+newBed->blockCount = bed->blockCount;
+if (bed->blockCount > 0)
+ {
+ newBed->blockSizes = needMem(sizeof(int) * bed->blockCount);
+ memcpy(newBed->blockSizes, bed->blockSizes,
+ sizeof(int) * bed->blockCount);
+ newBed->chromStarts = needMem(sizeof(int) * bed->blockCount);
+ memcpy(newBed->chromStarts, bed->chromStarts,
+ sizeof(int) * bed->blockCount);
+ }
+newBed->expCount = bed->expCount;
+if (bed->expCount > 0)
+ {
+ newBed->expIds = needMem(sizeof(int) * bed->expCount);
+ memcpy(newBed->expIds, bed->expIds,
+ sizeof(int) * bed->expCount);
+ newBed->expScores = needMem(sizeof(float) * bed->expCount);
+ memcpy(newBed->expScores, bed->expScores,
+ sizeof(float) * bed->expCount);
+ }
+return(newBed);
+}
+
+
+struct bed *cloneBedList(struct bed *bedList)
+/* Make an all-newly-allocated list copied from bed. */
+{
+struct bed *bedListOut = NULL, *bed=NULL;
+
+for (bed=bedList; bed != NULL; bed=bed->next)
+ {
+ struct bed *newBed = cloneBed(bed);
+ slAddHead(&bedListOut, newBed);
+ }
+
+slReverse(&bedListOut);
+return bedListOut;
+}
+
+struct bed *bedCommaInN(char **pS, struct bed *ret, int fieldCount)
+/* Create a bed out of a comma separated string looking for fieldCount
+ * fields. This will fill in ret if non-null, otherwise will return a
+ * new bed */
+{
+char *s = *pS;
+int i;
+
+if (ret == NULL)
+ AllocVar(ret);
+ret->chrom = sqlStringComma(&s);
+ret->chromStart = sqlUnsignedComma(&s);
+ret->chromEnd = sqlUnsignedComma(&s);
+if (fieldCount > 3)
+ ret->name = sqlStringComma(&s);
+if (fieldCount > 4)
+ ret->score = sqlUnsignedComma(&s);
+if (fieldCount > 5)
+ sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
+if (fieldCount > 6)
+ ret->thickStart = sqlUnsignedComma(&s);
+else
+ ret->thickStart = ret->chromStart;
+if (fieldCount > 7)
+ ret->thickEnd = sqlUnsignedComma(&s);
+else
+ ret->thickEnd = ret->chromEnd;
+if (fieldCount > 8)
+ ret->itemRgb = sqlUnsignedComma(&s);
+if (fieldCount > 9)
+ ret->blockCount = sqlUnsignedComma(&s);
+if (fieldCount > 10)
+ {
+ s = sqlEatChar(s, '{');
+ AllocArray(ret->blockSizes, ret->blockCount);
+ for (i=0; i<ret->blockCount; ++i)
+ {
+ ret->blockSizes[i] = sqlSignedComma(&s);
+ }
+ s = sqlEatChar(s, '}');
+ s = sqlEatChar(s, ',');
+ }
+if(fieldCount > 11)
+ {
+ s = sqlEatChar(s, '{');
+ AllocArray(ret->chromStarts, ret->blockCount);
+ for (i=0; i<ret->blockCount; ++i)
+ {
+ ret->chromStarts[i] = sqlSignedComma(&s);
+ }
+ s = sqlEatChar(s, '}');
+ s = sqlEatChar(s, ',');
+ }
+if (fieldCount > 12)
+ ret->expCount = sqlSignedComma(&s);
+if (fieldCount > 13)
+ {
+ s = sqlEatChar(s, '{');
+ AllocArray(ret->expIds, ret->expCount);
+ for (i=0; i<ret->expCount; ++i)
+ {
+ ret->expIds[i] = sqlSignedComma(&s);
+ }
+ s = sqlEatChar(s, '}');
+ s = sqlEatChar(s, ',');
+ }
+if (fieldCount > 14)
+ {
+ s = sqlEatChar(s, '{');
+ AllocArray(ret->expScores, ret->expCount);
+ for (i=0; i<ret->expCount; ++i)
+ {
+ ret->expScores[i] = sqlFloatComma(&s);
+ }
+ s = sqlEatChar(s, '}');
+ s = sqlEatChar(s, ',');
+ }
+*pS = s;
+return ret;
+}
+
+struct hash *readBedToBinKeeper(char *sizeFileName, char *bedFileName, int wordCount)
+/* Read a list of beds and return results in hash of binKeeper structure for fast query
+ * See also bedsIntoKeeperHash, which takes the beds read into a list already, but
+ * dispenses with the need for the sizeFile. */
+{
+struct binKeeper *bk;
+struct bed *bed;
+struct lineFile *lf = lineFileOpen(sizeFileName, TRUE);
+struct lineFile *bf = lineFileOpen(bedFileName , TRUE);
+struct hash *hash = newHash(0);
+char *chromRow[2];
+char *row[3] ;
+
+assert (wordCount == 3);
+while (lineFileRow(lf, chromRow))
+ {
+ char *name = chromRow[0];
+ int size = lineFileNeedNum(lf, chromRow, 1);
+
+ if (hashLookup(hash, name) != NULL)
+ warn("Duplicate %s, ignoring all but first\n", name);
+ else
+ {
+ bk = binKeeperNew(0, size);
+ assert(size > 1);
+ hashAdd(hash, name, bk);
+ }
+ }
+while (lineFileNextRow(bf, row, ArraySize(row)))
+ {
+ bed = bedLoadN(row, wordCount);
+ bk = hashMustFindVal(hash, bed->chrom);
+ binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
+ }
+lineFileClose(&bf);
+lineFileClose(&lf);
+return hash;
+}
+
+int bedParseRgb(char *itemRgb)
+/* parse a string: "r,g,b" into three unsigned char values
+ returned as 24 bit number, or -1 for failure */
+{
+char dupe[64];
+int wordCount;
+char *row[4];
+
+strncpy(dupe, itemRgb, sizeof(dupe));
+wordCount = chopString(dupe, ",", row, ArraySize(row));
+
+if ((wordCount != 3) || (!isdigit(row[0][0]) ||
+ !isdigit(row[1][0]) || !isdigit(row[2][0])))
+ return (-1);
+
+return ( ((atoi(row[0]) & 0xff) << 16) |
+ ((atoi(row[1]) & 0xff) << 8) |
+ (atoi(row[2]) & 0xff) );
+}
+
+long long bedTotalSize(struct bed *bedList)
+/* Add together sizes of all beds in list. */
+{
+long long total=0;
+struct bed *bed;
+for (bed = bedList; bed != NULL; bed = bed->next)
+ total += (bed->chromEnd - bed->chromStart);
+return total;
+}
+
+void bedIntoRangeTree(struct bed *bed, struct rbTree *rangeTree)
+/* Add all blocks in bed to range tree. For beds without blocks,
+ * add entire bed. */
+{
+int i;
+if (bed->blockCount == 0)
+ rangeTreeAdd(rangeTree, bed->chromStart, bed->chromEnd);
+else
+ {
+ for (i=0; i < bed->blockCount; ++i)
+ {
+ int start = bed->chromStart + bed->chromStarts[i];
+ int end = start + bed->blockSizes[i];
+ rangeTreeAdd(rangeTree, start, end);
+ }
+ }
+}
+
+struct rbTree *bedToRangeTree(struct bed *bed)
+/* Convert bed into a range tree. */
+{
+struct rbTree *rangeTree = rangeTreeNew();
+bedIntoRangeTree(bed, rangeTree);
+return rangeTree;
+}
+
+int bedRangeTreeOverlap(struct bed *bed, struct rbTree *rangeTree)
+/* Return number of bases bed overlaps with rangeTree. */
+{
+int totalOverlap = 0;
+if (bed->blockCount == 0)
+ totalOverlap = rangeTreeOverlapSize(rangeTree, bed->chromStart, bed->chromEnd);
+else
+ {
+ int i;
+ for (i=0; i < bed->blockCount; ++i)
+ {
+ int start = bed->chromStart + bed->chromStarts[i];
+ int end = start + bed->blockSizes[i];
+ totalOverlap += rangeTreeOverlapSize(rangeTree, start, end);
+ }
+ }
+return totalOverlap;
+}
+
+int bedSameStrandOverlap(struct bed *a, struct bed *b)
+/* Return amount of block-level overlap on same strand between a and b */
+{
+/* Make sure on same strand, chromosome, and that overlap
+ * at the non-block level. */
+if (a->strand[0] != b->strand[0])
+ return 0;
+if (!sameString(a->chrom, b->chrom))
+ return 0;
+int outerOverlap = rangeIntersection(a->chromStart, a->chromEnd,
+ b->chromStart, b->chromEnd);
+if (outerOverlap <= 0)
+ return 0;
+
+/* If both beds are non-blocked then we're pretty much done. */
+if (a->blockCount == 0 && b->blockCount == 0)
+ return outerOverlap;
+
+/* Otherwise make up a range tree containing regions covered by a,
+ * and figure out how much b overlaps it.. */
+struct rbTree *rangeTree = bedToRangeTree(a);
+int overlap = bedRangeTreeOverlap(b, rangeTree);
+
+/* Clean up and return result. */
+rangeTreeFree(&rangeTree);
+return overlap;
+}
+
+boolean bedExactMatch(struct bed *oldBed, struct bed *newBed)
+/* Return TRUE if it's an exact match. */
+{
+if (oldBed->blockCount != newBed->blockCount)
+ return FALSE;
+int oldSize = bedTotalBlockSize(oldBed);
+int newSize = bedTotalBlockSize(newBed);
+int overlap = bedSameStrandOverlap(oldBed, newBed);
+return (oldSize == newSize && oldSize == overlap);
+}
+
+boolean bedCompatibleExtension(struct bed *oldBed, struct bed *newBed)
+/* Return TRUE if newBed is a compatible extension of oldBed, meaning
+ * all internal exons and all introns of old bed are contained, in the
+ * same order in the new bed. */
+{
+/* New bed must have at least as many exons as old bed... */
+if (oldBed->blockCount > newBed->blockCount)
+ return FALSE;
+
+/* New bed must also must also encompass old bed. */
+if (newBed->chromStart > oldBed->chromStart)
+ return FALSE;
+if (newBed->chromEnd < oldBed->chromEnd)
+ return FALSE;
+
+/* Look for an exact match */
+int oldSize = bedTotalBlockSize(oldBed);
+int newSize = bedTotalBlockSize(newBed);
+int overlap = bedSameStrandOverlap(oldBed, newBed);
+if (oldSize == newSize && oldSize == overlap)
+ return TRUE;
+
+/* If overlap is smaller than old size then we can't be a superset. */
+if (overlap < oldSize)
+ return FALSE;
+
+/* If we're a single exon bed then we're done. */
+if (oldBed->blockCount <= 1)
+ return TRUE;
+
+/* Otherwise we look for first intron start in old bed, and then
+ * flip through new bed until we find an intron that starts at the
+ * same place. */
+int oldFirstIntronStart = oldBed->chromStart + oldBed->chromStarts[0] + oldBed->blockSizes[0];
+int newLastBlock = newBed->blockCount-1, oldLastBlock = oldBed->blockCount-1;
+int newIx, oldIx;
+for (newIx=0; newIx < newLastBlock; ++newIx)
+ {
+ int iStartNew = newBed->chromStart + newBed->chromStarts[newIx] + newBed->blockSizes[newIx];
+ if (iStartNew == oldFirstIntronStart)
+ break;
+ }
+if (newIx == newLastBlock)
+ return FALSE;
+
+/* Now we go through all introns in old bed, and make sure they match. */
+for (oldIx=0; oldIx < oldLastBlock; ++oldIx, ++newIx)
+ {
+ int iStartOld = oldBed->chromStart + oldBed->chromStarts[oldIx] + oldBed->blockSizes[oldIx];
+ int iEndOld = oldBed->chromStart + oldBed->chromStarts[oldIx+1];
+ int iStartNew = newBed->chromStart + newBed->chromStarts[newIx] + newBed->blockSizes[newIx];
+ int iEndNew = newBed->chromStart + newBed->chromStarts[newIx+1];
+ if (iStartOld != iStartNew || iEndOld != iEndNew)
+ return FALSE;
+ }
+return TRUE;
+}
+
+struct bed3 *bed3New(char *chrom, int start, int end)
+/* Make new bed3. */
+{
+struct bed3 *bed;
+AllocVar(bed);
+bed->chrom = cloneString(chrom);
+bed->chromStart = start;
+bed->chromEnd = end;
+return bed;
+}
+
+struct bed *bedThickOnly(struct bed *in)
+/* Return a bed that only has the thick part. (Which is usually the CDS). */
+{
+if (in->thickStart >= in->thickEnd)
+ return NULL;
+if (in->expCount != 0 || in->expIds != NULL || in->expScores != NULL)
+ errAbort("Sorry, bedThickOnly only works on beds with up to 12 fields.");
+
+/* Allocate output, and fill in simple fields. */
+struct bed *out;
+AllocVar(out);
+out->chrom = cloneString(in->chrom);
+out->chromStart = out->thickStart = in->thickStart;
+out->chromEnd = out->thickEnd = in->thickEnd;
+out->name = cloneString(in->name);
+out->strand[0] = in->strand[0];
+out->score = in->score;
+out->itemRgb = in->itemRgb;
+
+/* If need be fill in blocks. */
+if (in->blockCount > 0)
+ {
+ /* Count up blocks inside CDS. */
+ int i;
+ int outBlockCount = 0;
+ for (i=0; i<in->blockCount; ++i)
+ {
+ int start = in->chromStart + in->chromStarts[i];
+ int end = start + in->blockSizes[i];
+ if (start < in->thickStart) start = in->thickStart;
+ if (end > in->thickEnd) end = in->thickEnd;
+ if (start < end)
+ outBlockCount += 1;
+ }
+
+ /* This trivieal case shouldn't happen, but just in case, we deal with it. */
+ if (outBlockCount == 0)
+ {
+ freeMem(out);
+ return NULL;
+ }
+
+ /* Allocate block arrays for output. */
+ out->blockCount = outBlockCount;
+ AllocArray(out->chromStarts, outBlockCount);
+ AllocArray(out->blockSizes, outBlockCount);
+
+ /* Scan through input one more time, copying to out. */
+ int outBlockIx = 0;
+ for (i=0; i<in->blockCount; ++i)
+ {
+ int start = in->chromStart + in->chromStarts[i];
+ int end = start + in->blockSizes[i];
+ if (start < in->thickStart) start = in->thickStart;
+ if (end > in->thickEnd) end = in->thickEnd;
+ if (start < end)
+ {
+ out->chromStarts[outBlockIx] = start - out->chromStart;
+ out->blockSizes[outBlockIx] = end - start;
+ outBlockIx += 1;
+ }
+ }
+ }
+return out;
+}
+
+struct bed *bedThickOnlyList(struct bed *inList)
+/* Return a list of beds that only are the thick part of input. */
+{
+struct bed *outList = NULL, *out, *in;
+for (in = inList; in != NULL; in = in->next)
+ {
+ if ((out = bedThickOnly(in)) != NULL)
+ slAddHead(&outList, out);
+ }
+slReverse(&outList);
+return outList;
+}
+
+char *bedAsDef(int bedFieldCount, int totalFieldCount)
+/* Return an autoSql definition for a bed of given number of fields.
+ * Normally totalFieldCount is equal to bedFieldCount. If there are extra
+ * fields they are just given the names field16, field17, etc and type string. */
+{
+if (bedFieldCount < 3 || bedFieldCount > 15)
+ errAbort("bedFieldCount is %d, but must be between %d and %d in bedAsDef", bedFieldCount, 3, 15);
+struct dyString *dy = dyStringNew(0);
+dyStringAppend(dy,
+ "table bed\n"
+ "\"Browser Extensible Data\"\n"
+ " (\n"
+ " string chrom; \"Reference sequence chromosome or scaffold\"\n"
+ " uint chromStart; \"Start position in chromosome\"\n"
+ " uint chromEnd; \"End position in chromosome\"\n"
+ );
+if (bedFieldCount >= 4)
+ dyStringAppend(dy, " string name; \"Name of item.\"\n");
+if (bedFieldCount >= 5)
+ dyStringAppend(dy, " int score; \"Score (0-1000)\"\n");
+if (bedFieldCount >= 6)
+ dyStringAppend(dy, " char[1] strand; \"+ or - for strand\"\n");
+if (bedFieldCount >= 7)
+ dyStringAppend(dy, " uint thickStart; \"Start of where display should be thick (start codon)\"\n");
+if (bedFieldCount >= 8)
+ dyStringAppend(dy, " uint thickEnd; \"End of where display should be thick (stop codon)\"\n");
+if (bedFieldCount >= 9)
+ dyStringAppend(dy, " uint reserved; \"Used as itemRgb as of 2004-11-22\"\n");
+if (bedFieldCount >= 10)
+ dyStringAppend(dy, " int blockCount; \"Number of blocks\"\n");
+if (bedFieldCount >= 11)
+ dyStringAppend(dy, " int[blockCount] blockSizes; \"Comma separated list of block sizes\"\n");
+if (bedFieldCount >= 12)
+ dyStringAppend(dy, " int[blockCount] chromStarts; \"Start positions relative to chromStart\"\n");
+if (bedFieldCount >= 13)
+ dyStringAppend(dy, " int expCount; \"Experiment count\"\n");
+if (bedFieldCount >= 14)
+ dyStringAppend(dy, " int[expCount] expIds; \"Comma separated list of experiment ids. Always 0,1,2,3....\"\n");
+if (bedFieldCount >= 15)
+ dyStringAppend(dy, " float[expCount] expScores; \"Comma separated list of experiment scores.\"\n");
+int i;
+for (i=bedFieldCount+1; i<=totalFieldCount; ++i)
+ dyStringPrintf(dy, "string field%d; \"Undocumented field\"\n", i+1);
+dyStringAppend(dy, " )\n");
+return dyStringCannibalize(&dy);
+}
+