src/lib/basicBed.c 1.2
1.2 2009/06/29 20:29:44 kent
Changing reserved field to a string so it can handle the various color morphs.
Index: src/lib/basicBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/basicBed.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/lib/basicBed.c 17 Apr 2009 22:02:51 -0000 1.1
+++ src/lib/basicBed.c 29 Jun 2009 20:29:44 -0000 1.2
@@ -1,1259 +1,1259 @@
/* basicBed contains the basic code for Browser Extensible Data (bed) files and tables.
* The idea behind bed is that the first three fields are defined and required.
* A total of 15 fields are defined, and the file can contain any number of these.
* In addition after any number of defined fields there can be custom fields that
* are not defined in the bed spec.
*
* There's additional bed-related code in src/hg/inc/bed.h. This module contains the
* stuff that's independent of the database and other genomic structures. */
#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "dystring.h"
#include "sqlNum.h"
#include "sqlList.h"
#include "rangeTree.h"
#include "binRange.h"
#include "basicBed.h"
static char const rcsid[] = "$Id$";
void bedStaticLoad(char **row, struct bed *ret)
/* Load a row from bed table into ret. The contents of ret will
* be replaced at the next call to this function. */
{
ret->chrom = row[0];
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = row[3];
}
struct bed *bedLoad(char **row)
/* Load a bed from row fetched with select * from bed
* from database. Dispose of this with bedFree(). */
{
struct bed *ret;
AllocVar(ret);
ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = cloneString(row[3]);
return ret;
}
struct bed *bedCommaIn(char **pS, struct bed *ret)
/* Create a bed out of a comma separated string.
* This will fill in ret if non-null, otherwise will
* return a new bed */
{
char *s = *pS;
if (ret == NULL)
AllocVar(ret);
ret->chrom = sqlStringComma(&s);
ret->chromStart = sqlUnsignedComma(&s);
ret->chromEnd = sqlUnsignedComma(&s);
ret->name = sqlStringComma(&s);
*pS = s;
return ret;
}
void bedFree(struct bed **pEl)
/* Free a single dynamically allocated bed such as created
* with bedLoad(). */
{
struct bed *el;
if ((el = *pEl) == NULL) return;
freeMem(el->chrom);
freeMem(el->name);
freeMem(el->blockSizes);
freeMem(el->chromStarts);
freeMem(el->expIds);
freeMem(el->expScores);
freez(pEl);
}
void bedFreeList(struct bed **pList)
/* Free a list of dynamically allocated bed's */
{
struct bed *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
bedFree(&el);
}
*pList = NULL;
}
void bedOutput(struct bed *el, FILE *f, char sep, char lastSep)
/* Print out bed. Separate fields with sep. Follow last field with lastSep. */
{
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->chromStart);
fputc(sep,f);
fprintf(f, "%u", el->chromEnd);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(lastSep,f);
}
/* --------------- End of AutoSQL generated code. --------------- */
int bedCmp(const void *va, const void *vb)
/* Compare to sort based on chrom,chromStart. */
{
const struct bed *a = *((struct bed **)va);
const struct bed *b = *((struct bed **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
dif = a->chromStart - b->chromStart;
return dif;
}
int bedCmpEnd(const void *va, const void *vb)
/* Compare to sort based on chrom,chromEnd. */
{
const struct bed *a = *((struct bed **)va);
const struct bed *b = *((struct bed **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
dif = a->chromEnd - b->chromEnd;
return dif;
}
int bedCmpScore(const void *va, const void *vb)
/* Compare to sort based on score - lowest first. */
{
const struct bed *a = *((struct bed **)va);
const struct bed *b = *((struct bed **)vb);
return a->score - b->score;
}
int bedCmpPlusScore(const void *va, const void *vb)
/* Compare to sort based on chrom,chromStart. */
{
const struct bed *a = *((struct bed **)va);
const struct bed *b = *((struct bed **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
{
dif = (a->chromStart - b->chromStart) * 1000 +(a->score - b->score);
}
return dif;
}
int bedCmpSize(const void *va, const void *vb)
/* Compare to sort based on size of element (end-start == size) */
{
const struct bed *a = *((struct bed **)va);
const struct bed *b = *((struct bed **)vb);
int a_size = a->chromEnd - a->chromStart;
int b_size = b->chromEnd - b->chromStart;
return (a_size - b_size);
}
int bedCmpChromStrandStart(const void *va, const void *vb)
/* Compare to sort based on chrom,strand,chromStart. */
{
const struct bed *a = *((struct bed **)va);
const struct bed *b = *((struct bed **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
dif = strcmp(a->strand, b->strand);
if (dif == 0)
dif = a->chromStart - b->chromStart;
return dif;
}
struct bedLine *bedLineNew(char *line)
/* Create a new bedLine based on tab-separated string s. */
{
struct bedLine *bl;
char *s, c;
AllocVar(bl);
bl->chrom = cloneString(line);
s = strchr(bl->chrom, '\t');
if (s == NULL)
errAbort("Expecting tab in bed line %s", line);
*s++ = 0;
c = *s;
if (isdigit(c) || (c == '-' && isdigit(s[1])))
{
bl->chromStart = atoi(s);
bl->line = s;
return bl;
}
else
{
errAbort("Expecting start position in second field of %s", line);
return NULL;
}
}
void bedLineFree(struct bedLine **pBl)
/* Free up memory associated with bedLine. */
{
struct bedLine *bl;
if ((bl = *pBl) != NULL)
{
freeMem(bl->chrom);
freez(pBl);
}
}
void bedLineFreeList(struct bedLine **pList)
/* Free a list of dynamically allocated bedLine's */
{
struct bedLine *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
bedLineFree(&el);
}
*pList = NULL;
}
int bedLineCmp(const void *va, const void *vb)
/* Compare to sort based on query. */
{
const struct bedLine *a = *((struct bedLine **)va);
const struct bedLine *b = *((struct bedLine **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
dif = a->chromStart - b->chromStart;
return dif;
}
void bedSortFile(char *inFile, char *outFile)
/* Sort a bed file (in place, overwrites old file. */
{
struct lineFile *lf = NULL;
FILE *f = NULL;
struct bedLine *blList = NULL, *bl;
char *line;
int lineSize;
verbose(2, "Reading %s\n", inFile);
lf = lineFileOpen(inFile, TRUE);
while (lineFileNext(lf, &line, &lineSize))
{
if (line[0] == '#')
continue;
bl = bedLineNew(line);
slAddHead(&blList, bl);
}
lineFileClose(&lf);
verbose(2, "Sorting\n");
slSort(&blList, bedLineCmp);
verbose(2, "Writing %s\n", outFile);
f = mustOpen(outFile, "w");
for (bl = blList; bl != NULL; bl = bl->next)
{
fprintf(f, "%s\t%s\n", bl->chrom, bl->line);
if (ferror(f))
{
perror("Writing error\n");
errAbort("%s is truncated, sorry.", outFile);
}
}
fclose(f);
}
struct bed *bedLoad3(char **row)
/* Load first three fields of bed. */
{
struct bed *ret;
AllocVar(ret);
ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
return ret;
}
struct bed *bedLoad5(char **row)
/* Load first five fields of bed. */
{
struct bed *ret;
AllocVar(ret);
ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = cloneString(row[3]);
ret->score = sqlSigned(row[4]);
return ret;
}
struct bed *bedLoad6(char **row)
/* Load first six fields of bed. */
{
struct bed *ret = bedLoad5(row);
safef(ret->strand, sizeof(ret->strand), "%s", row[5]);
return ret;
}
/* it turns out that it isn't just hgLoadBed and custom tracks
* that may encounter the r,g,b specification. Any program that
* reads bed files may enconter them, so take care of them
* at any time. The strchr() function is very fast which will
* be a failure in the vast majority of cases parsing integers,
* therefore, this shouldn't be too severe a performance hit.
*/
static int itemRgbColumn(char *column9)
{
int itemRgb = 0;
/* Allow comma separated list of rgb values here */
char *comma = strchr(column9, ',');
if (comma)
{
if (-1 == (itemRgb = bedParseRgb(column9)))
errAbort("ERROR: expecting r,g,b specification, "
"found: '%s'", column9);
}
else
itemRgb = sqlUnsigned(column9);
return itemRgb;
}
struct bed *bedLoad12(char **row)
/* Load a bed from row fetched with select * from bed
* from database. Dispose of this with bedFree(). */
{
struct bed *ret;
int sizeOne;
AllocVar(ret);
ret->blockCount = sqlSigned(row[9]);
ret->chrom = cloneString(row[0]);
ret->chromStart = sqlUnsigned(row[1]);
ret->chromEnd = sqlUnsigned(row[2]);
ret->name = cloneString(row[3]);
ret->score = sqlSigned(row[4]);
strcpy(ret->strand, row[5]);
ret->thickStart = sqlUnsigned(row[6]);
ret->thickEnd = sqlUnsigned(row[7]);
ret->itemRgb = itemRgbColumn(row[8]);
sqlSignedDynamicArray(row[10], &ret->blockSizes, &sizeOne);
assert(sizeOne == ret->blockCount);
sqlSignedDynamicArray(row[11], &ret->chromStarts, &sizeOne);
assert(sizeOne == ret->blockCount);
return ret;
}
struct bed *bedLoadN(char *row[], int wordCount)
/* Convert a row of strings to a bed. */
{
struct bed * bed;
int count;
AllocVar(bed);
bed->chrom = cloneString(row[0]);
bed->chromStart = sqlUnsigned(row[1]);
bed->chromEnd = sqlUnsigned(row[2]);
if (wordCount > 3)
bed->name = cloneString(row[3]);
if (wordCount > 4)
bed->score = sqlSigned(row[4]);
if (wordCount > 5)
bed->strand[0] = row[5][0];
if (wordCount > 6)
bed->thickStart = sqlUnsigned(row[6]);
else
bed->thickStart = bed->chromStart;
if (wordCount > 7)
bed->thickEnd = sqlUnsigned(row[7]);
else
bed->thickEnd = bed->chromEnd;
if (wordCount > 8)
bed->itemRgb = itemRgbColumn(row[8]);
if (wordCount > 9)
bed->blockCount = sqlUnsigned(row[9]);
if (wordCount > 10)
sqlSignedDynamicArray(row[10], &bed->blockSizes, &count);
if (wordCount > 11)
sqlSignedDynamicArray(row[11], &bed->chromStarts, &count);
if (wordCount > 12)
bed->expCount = sqlUnsigned(row[12]);
if (wordCount > 13)
sqlSignedDynamicArray(row[13], &bed->expIds, &count);
if (wordCount > 14)
sqlFloatDynamicArray(row[14], &bed->expScores, &count);
return bed;
}
struct bed *bedLoadNAllChrom(char *fileName, int numFields, char* chrom)
/* Load bed entries from a tab-separated file that have the given chrom.
* Dispose of this with bedFreeList(). */
{
struct bed *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[numFields];
while (lineFileRow(lf, row))
{
el = bedLoadN(row, numFields);
if(chrom == NULL || sameString(el->chrom, chrom))
slAddHead(&list, el);
else
bedFree(&el);
}
lineFileClose(&lf);
slReverse(&list);
return list;
}
struct bed *bedLoadNAll(char *fileName, int numFields)
/* Load all bed from a tab-separated file.
* Dispose of this with bedFreeList(). */
{
return bedLoadNAllChrom(fileName, numFields, NULL);
}
struct bed *bedLoadAll(char *fileName)
/* Determines how many fields are in a bedFile and load all beds from
* a tab-separated file. Dispose of this with bedFreeList(). */
{
struct bed *list = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int numFields = 0;
char *line = NULL;
/* First peek to see how many columns in the bed file. */
lineFileNextReal(lf, &line);
/* If there is something in the file then read it. If file
is empty return NULL. */
if(line != NULL)
{
numFields = chopByWhite(line, NULL, 0);
lineFileClose(&lf);
if(numFields < 4) /* Minimum number of fields. */
errAbort("file %s doesn't appear to be in bed format. At least 4 fields required, got %d",
fileName, numFields);
/* Now load them up with that number of fields. */
list = bedLoadNAll(fileName, numFields);
}
else
lineFileClose(&lf);
return list;
}
static void bedOutputN_Opt(struct bed *el, int wordCount, FILE *f,
char sep, char lastSep, boolean useItemRgb)
/* Write a bed of wordCount fields, optionally interpreting field nine
as R,G,B values. */
{
int i;
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->chromStart);
fputc(sep,f);
fprintf(f, "%u", el->chromEnd);
if (wordCount <= 3)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
if (wordCount <= 4)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
fprintf(f, "%d", el->score);
if (wordCount <= 5)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
if (wordCount <= 6)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
fprintf(f, "%u", el->thickStart);
if (wordCount <= 7)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
fprintf(f, "%u", el->thickEnd);
if (wordCount <= 8)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (useItemRgb)
fprintf(f, "%d,%d,%d", (el->itemRgb & 0xff0000) >> 16,
(el->itemRgb & 0xff00) >> 8, (el->itemRgb & 0xff));
else
fprintf(f, "%u", el->itemRgb);
if (wordCount <= 9)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
fprintf(f, "%d", el->blockCount);
if (wordCount <= 10)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%d", el->blockSizes[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
if (wordCount <= 11)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->blockCount; ++i)
{
fprintf(f, "%d", el->chromStarts[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
if (wordCount <= 12)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
fprintf(f, "%d", el->expCount);
if (wordCount <= 13)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->expCount; ++i)
{
fprintf(f, "%d", el->expIds[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
if (wordCount <= 14)
{
fputc(lastSep, f);
return;
}
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->expCount; ++i)
{
fprintf(f, "%g", el->expScores[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
fputc(lastSep,f);
}
void bedOutputN(struct bed *el, int wordCount, FILE *f, char sep, char lastSep)
/* Write a bed of wordCount fields. */
{
bedOutputN_Opt(el, wordCount, f, sep, lastSep, FALSE);
}
void bedOutputNitemRgb(struct bed *el, int wordCount, FILE *f,
char sep, char lastSep)
/* Write a bed of wordCount fields, interpret column 9 as RGB. */
{
bedOutputN_Opt(el, wordCount, f, sep, lastSep, TRUE);
}
int bedTotalBlockSize(struct bed *bed)
/* Return total size of all blocks. */
{
int total = 0;
int i;
if (bed->blockCount == 0)
return bed->chromEnd - bed->chromStart;
for (i=0; i<bed->blockCount; ++i)
total += bed->blockSizes[i];
return total;
}
int bedBlockSizeInRange(struct bed *bed, int rangeStart, int rangeEnd)
/* Get size of all parts of all exons between rangeStart and rangeEnd. */
{
int total = 0;
int i;
for (i=0; i<bed->blockCount; ++i)
{
int start = bed->chromStart + bed->chromStarts[i];
int end = start + bed->blockSizes[i];
total += positiveRangeIntersection(start, end, rangeStart, rangeEnd);
}
return total;
}
int bedTotalThickBlockSize(struct bed *bed)
/* Return total size of all thick blocks. */
{
return bedBlockSizeInRange(bed, bed->thickStart, bed->thickEnd);
}
int bedStartThinSize(struct bed *bed)
/* Return total size of all blocks before thick part. */
{
return bedBlockSizeInRange(bed, bed->chromStart, bed->thickStart);
}
int bedEndThinSize(struct bed *bed)
/* Return total size of all blocks after thick part. */
{
return bedBlockSizeInRange(bed, bed->thickEnd, bed->chromEnd);
}
struct bed *bedFromPsl(struct psl *psl)
/* Convert a single psl to a bed structure */
{
struct bed *bed;
int i, blockCount, *chromStarts, chromStart;
/* A tiny bit of error checking on the psl. */
if (psl->qStart >= psl->qEnd || psl->qEnd > psl->qSize
|| psl->tStart >= psl->tEnd || psl->tEnd > psl->tSize)
{
errAbort("mangled psl format for %s", psl->qName);
}
/* Allocate bed and fill in from psl. */
AllocVar(bed);
bed->chrom = cloneString(psl->tName);
bed->chromStart = bed->thickStart = chromStart = psl->tStart;
bed->chromEnd = bed->thickEnd = psl->tEnd;
bed->score = 1000 - 2*pslCalcMilliBad(psl, TRUE);
if (bed->score < 0) bed->score = 0;
strncpy(bed->strand, psl->strand, sizeof(bed->strand));
bed->blockCount = blockCount = psl->blockCount;
bed->blockSizes = (int *)cloneMem(psl->blockSizes,(sizeof(int)*psl->blockCount));
bed->chromStarts = chromStarts = (int *)cloneMem(psl->tStarts, (sizeof(int)*psl->blockCount));
bed->name = cloneString(psl->qName);
/* Switch minus target strand to plus strand. */
if (psl->strand[1] == '-')
{
int chromSize = psl->tSize;
reverseInts(bed->blockSizes, blockCount);
reverseInts(chromStarts, blockCount);
for (i=0; i<blockCount; ++i)
chromStarts[i] = chromSize - chromStarts[i];
}
/* Convert coordinates to relative. */
for (i=0; i<blockCount; ++i)
chromStarts[i] -= chromStart;
return bed;
}
void makeItBed12(struct bed *bedList, int numFields)
/* If it's less than bed 12, make it bed 12. The numFields */
/* param is for how many fields the bed *currently* has. */
{
int i = 1;
struct bed *cur;
for (cur = bedList; cur != NULL; cur = cur->next)
{
/* it better be bigger than bed 3. */
if (numFields < 4)
{
char name[50];
safef(name, ArraySize(name), "item.%d", i+1);
cur->name = cloneString(name);
}
if (numFields < 5)
cur->score = 1000;
if (numFields < 6)
{
cur->strand[0] = '?';
cur->strand[1] = '\0';
}
if (numFields < 8)
{
cur->thickStart = cur->chromStart;
cur->thickEnd = cur->chromEnd;
}
if (numFields < 9)
cur->itemRgb = 0;
if (numFields < 12)
{
cur->blockSizes = needMem(sizeof(int));
cur->chromStarts = needMem(sizeof(int));
cur->blockCount = 1;
cur->chromStarts[0] = 0;
cur->blockSizes[0] = cur->chromEnd - cur->chromStart;
}
i++;
}
}
struct bed *lmCloneBed(struct bed *bed, struct lm *lm)
/* Make a copy of bed in local memory. */
{
struct bed *newBed;
if (bed == NULL)
return NULL;
lmAllocVar(lm, newBed);
newBed->chrom = lmCloneString(lm, bed->chrom);
newBed->chromStart = bed->chromStart;
newBed->chromEnd = bed->chromEnd;
newBed->name = lmCloneString(lm, bed->name);
newBed->score = bed->score;
strncpy(newBed->strand, bed->strand, sizeof(bed->strand));
newBed->thickStart = bed->thickStart;
newBed->thickEnd = bed->thickEnd;
newBed->itemRgb = bed->itemRgb;
newBed->blockCount = bed->blockCount;
if (bed->blockCount > 0)
{
newBed->blockSizes = lmCloneMem(lm, bed->blockSizes,
sizeof(bed->blockSizes[0]) * bed->blockCount);
newBed->chromStarts = lmCloneMem(lm, bed->chromStarts,
sizeof(bed->chromStarts[0]) * bed->blockCount);
}
newBed->expCount = bed->expCount;
if (bed->expCount > 0)
{
newBed->expIds = lmCloneMem(lm, bed->expIds,
sizeof(bed->expIds[0]) * bed->expCount);
newBed->expScores = lmCloneMem(lm, bed->expScores,
sizeof(bed->expScores[0]) * bed->expCount);
}
return(newBed);
}
struct bed *cloneBed(struct bed *bed)
/* Make an all-newly-allocated copy of a single bed record. */
{
struct bed *newBed;
if (bed == NULL)
return NULL;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = bed->chromStart;
newBed->chromEnd = bed->chromEnd;
newBed->name = cloneString(bed->name);
newBed->score = bed->score;
strncpy(newBed->strand, bed->strand, sizeof(bed->strand));
newBed->thickStart = bed->thickStart;
newBed->thickEnd = bed->thickEnd;
newBed->itemRgb = bed->itemRgb;
newBed->blockCount = bed->blockCount;
if (bed->blockCount > 0)
{
newBed->blockSizes = needMem(sizeof(int) * bed->blockCount);
memcpy(newBed->blockSizes, bed->blockSizes,
sizeof(int) * bed->blockCount);
newBed->chromStarts = needMem(sizeof(int) * bed->blockCount);
memcpy(newBed->chromStarts, bed->chromStarts,
sizeof(int) * bed->blockCount);
}
newBed->expCount = bed->expCount;
if (bed->expCount > 0)
{
newBed->expIds = needMem(sizeof(int) * bed->expCount);
memcpy(newBed->expIds, bed->expIds,
sizeof(int) * bed->expCount);
newBed->expScores = needMem(sizeof(float) * bed->expCount);
memcpy(newBed->expScores, bed->expScores,
sizeof(float) * bed->expCount);
}
return(newBed);
}
struct bed *cloneBedList(struct bed *bedList)
/* Make an all-newly-allocated list copied from bed. */
{
struct bed *bedListOut = NULL, *bed=NULL;
for (bed=bedList; bed != NULL; bed=bed->next)
{
struct bed *newBed = cloneBed(bed);
slAddHead(&bedListOut, newBed);
}
slReverse(&bedListOut);
return bedListOut;
}
struct bed *bedCommaInN(char **pS, struct bed *ret, int fieldCount)
/* Create a bed out of a comma separated string looking for fieldCount
* fields. This will fill in ret if non-null, otherwise will return a
* new bed */
{
char *s = *pS;
int i;
if (ret == NULL)
AllocVar(ret);
ret->chrom = sqlStringComma(&s);
ret->chromStart = sqlUnsignedComma(&s);
ret->chromEnd = sqlUnsignedComma(&s);
if (fieldCount > 3)
ret->name = sqlStringComma(&s);
if (fieldCount > 4)
ret->score = sqlUnsignedComma(&s);
if (fieldCount > 5)
sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
if (fieldCount > 6)
ret->thickStart = sqlUnsignedComma(&s);
else
ret->thickStart = ret->chromStart;
if (fieldCount > 7)
ret->thickEnd = sqlUnsignedComma(&s);
else
ret->thickEnd = ret->chromEnd;
if (fieldCount > 8)
ret->itemRgb = sqlUnsignedComma(&s);
if (fieldCount > 9)
ret->blockCount = sqlUnsignedComma(&s);
if (fieldCount > 10)
{
s = sqlEatChar(s, '{');
AllocArray(ret->blockSizes, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
{
ret->blockSizes[i] = sqlSignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
}
if(fieldCount > 11)
{
s = sqlEatChar(s, '{');
AllocArray(ret->chromStarts, ret->blockCount);
for (i=0; i<ret->blockCount; ++i)
{
ret->chromStarts[i] = sqlSignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
}
if (fieldCount > 12)
ret->expCount = sqlSignedComma(&s);
if (fieldCount > 13)
{
s = sqlEatChar(s, '{');
AllocArray(ret->expIds, ret->expCount);
for (i=0; i<ret->expCount; ++i)
{
ret->expIds[i] = sqlSignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
}
if (fieldCount > 14)
{
s = sqlEatChar(s, '{');
AllocArray(ret->expScores, ret->expCount);
for (i=0; i<ret->expCount; ++i)
{
ret->expScores[i] = sqlFloatComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
}
*pS = s;
return ret;
}
struct hash *readBedToBinKeeper(char *sizeFileName, char *bedFileName, int wordCount)
/* Read a list of beds and return results in hash of binKeeper structure for fast query
* See also bedsIntoKeeperHash, which takes the beds read into a list already, but
* dispenses with the need for the sizeFile. */
{
struct binKeeper *bk;
struct bed *bed;
struct lineFile *lf = lineFileOpen(sizeFileName, TRUE);
struct lineFile *bf = lineFileOpen(bedFileName , TRUE);
struct hash *hash = newHash(0);
char *chromRow[2];
char *row[3] ;
assert (wordCount == 3);
while (lineFileRow(lf, chromRow))
{
char *name = chromRow[0];
int size = lineFileNeedNum(lf, chromRow, 1);
if (hashLookup(hash, name) != NULL)
warn("Duplicate %s, ignoring all but first\n", name);
else
{
bk = binKeeperNew(0, size);
assert(size > 1);
hashAdd(hash, name, bk);
}
}
while (lineFileNextRow(bf, row, ArraySize(row)))
{
bed = bedLoadN(row, wordCount);
bk = hashMustFindVal(hash, bed->chrom);
binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed);
}
lineFileClose(&bf);
lineFileClose(&lf);
return hash;
}
int bedParseRgb(char *itemRgb)
/* parse a string: "r,g,b" into three unsigned char values
returned as 24 bit number, or -1 for failure */
{
char dupe[64];
int wordCount;
char *row[4];
strncpy(dupe, itemRgb, sizeof(dupe));
wordCount = chopString(dupe, ",", row, ArraySize(row));
if ((wordCount != 3) || (!isdigit(row[0][0]) ||
!isdigit(row[1][0]) || !isdigit(row[2][0])))
return (-1);
return ( ((atoi(row[0]) & 0xff) << 16) |
((atoi(row[1]) & 0xff) << 8) |
(atoi(row[2]) & 0xff) );
}
long long bedTotalSize(struct bed *bedList)
/* Add together sizes of all beds in list. */
{
long long total=0;
struct bed *bed;
for (bed = bedList; bed != NULL; bed = bed->next)
total += (bed->chromEnd - bed->chromStart);
return total;
}
void bedIntoRangeTree(struct bed *bed, struct rbTree *rangeTree)
/* Add all blocks in bed to range tree. For beds without blocks,
* add entire bed. */
{
int i;
if (bed->blockCount == 0)
rangeTreeAdd(rangeTree, bed->chromStart, bed->chromEnd);
else
{
for (i=0; i < bed->blockCount; ++i)
{
int start = bed->chromStart + bed->chromStarts[i];
int end = start + bed->blockSizes[i];
rangeTreeAdd(rangeTree, start, end);
}
}
}
struct rbTree *bedToRangeTree(struct bed *bed)
/* Convert bed into a range tree. */
{
struct rbTree *rangeTree = rangeTreeNew();
bedIntoRangeTree(bed, rangeTree);
return rangeTree;
}
int bedRangeTreeOverlap(struct bed *bed, struct rbTree *rangeTree)
/* Return number of bases bed overlaps with rangeTree. */
{
int totalOverlap = 0;
if (bed->blockCount == 0)
totalOverlap = rangeTreeOverlapSize(rangeTree, bed->chromStart, bed->chromEnd);
else
{
int i;
for (i=0; i < bed->blockCount; ++i)
{
int start = bed->chromStart + bed->chromStarts[i];
int end = start + bed->blockSizes[i];
totalOverlap += rangeTreeOverlapSize(rangeTree, start, end);
}
}
return totalOverlap;
}
int bedSameStrandOverlap(struct bed *a, struct bed *b)
/* Return amount of block-level overlap on same strand between a and b */
{
/* Make sure on same strand, chromosome, and that overlap
* at the non-block level. */
if (a->strand[0] != b->strand[0])
return 0;
if (!sameString(a->chrom, b->chrom))
return 0;
int outerOverlap = rangeIntersection(a->chromStart, a->chromEnd,
b->chromStart, b->chromEnd);
if (outerOverlap <= 0)
return 0;
/* If both beds are non-blocked then we're pretty much done. */
if (a->blockCount == 0 && b->blockCount == 0)
return outerOverlap;
/* Otherwise make up a range tree containing regions covered by a,
* and figure out how much b overlaps it.. */
struct rbTree *rangeTree = bedToRangeTree(a);
int overlap = bedRangeTreeOverlap(b, rangeTree);
/* Clean up and return result. */
rangeTreeFree(&rangeTree);
return overlap;
}
boolean bedExactMatch(struct bed *oldBed, struct bed *newBed)
/* Return TRUE if it's an exact match. */
{
if (oldBed->blockCount != newBed->blockCount)
return FALSE;
int oldSize = bedTotalBlockSize(oldBed);
int newSize = bedTotalBlockSize(newBed);
int overlap = bedSameStrandOverlap(oldBed, newBed);
return (oldSize == newSize && oldSize == overlap);
}
boolean bedCompatibleExtension(struct bed *oldBed, struct bed *newBed)
/* Return TRUE if newBed is a compatible extension of oldBed, meaning
* all internal exons and all introns of old bed are contained, in the
* same order in the new bed. */
{
/* New bed must have at least as many exons as old bed... */
if (oldBed->blockCount > newBed->blockCount)
return FALSE;
/* New bed must also must also encompass old bed. */
if (newBed->chromStart > oldBed->chromStart)
return FALSE;
if (newBed->chromEnd < oldBed->chromEnd)
return FALSE;
/* Look for an exact match */
int oldSize = bedTotalBlockSize(oldBed);
int newSize = bedTotalBlockSize(newBed);
int overlap = bedSameStrandOverlap(oldBed, newBed);
if (oldSize == newSize && oldSize == overlap)
return TRUE;
/* If overlap is smaller than old size then we can't be a superset. */
if (overlap < oldSize)
return FALSE;
/* If we're a single exon bed then we're done. */
if (oldBed->blockCount <= 1)
return TRUE;
/* Otherwise we look for first intron start in old bed, and then
* flip through new bed until we find an intron that starts at the
* same place. */
int oldFirstIntronStart = oldBed->chromStart + oldBed->chromStarts[0] + oldBed->blockSizes[0];
int newLastBlock = newBed->blockCount-1, oldLastBlock = oldBed->blockCount-1;
int newIx, oldIx;
for (newIx=0; newIx < newLastBlock; ++newIx)
{
int iStartNew = newBed->chromStart + newBed->chromStarts[newIx] + newBed->blockSizes[newIx];
if (iStartNew == oldFirstIntronStart)
break;
}
if (newIx == newLastBlock)
return FALSE;
/* Now we go through all introns in old bed, and make sure they match. */
for (oldIx=0; oldIx < oldLastBlock; ++oldIx, ++newIx)
{
int iStartOld = oldBed->chromStart + oldBed->chromStarts[oldIx] + oldBed->blockSizes[oldIx];
int iEndOld = oldBed->chromStart + oldBed->chromStarts[oldIx+1];
int iStartNew = newBed->chromStart + newBed->chromStarts[newIx] + newBed->blockSizes[newIx];
int iEndNew = newBed->chromStart + newBed->chromStarts[newIx+1];
if (iStartOld != iStartNew || iEndOld != iEndNew)
return FALSE;
}
return TRUE;
}
struct bed3 *bed3New(char *chrom, int start, int end)
/* Make new bed3. */
{
struct bed3 *bed;
AllocVar(bed);
bed->chrom = cloneString(chrom);
bed->chromStart = start;
bed->chromEnd = end;
return bed;
}
struct bed *bedThickOnly(struct bed *in)
/* Return a bed that only has the thick part. (Which is usually the CDS). */
{
if (in->thickStart >= in->thickEnd)
return NULL;
if (in->expCount != 0 || in->expIds != NULL || in->expScores != NULL)
errAbort("Sorry, bedThickOnly only works on beds with up to 12 fields.");
/* Allocate output, and fill in simple fields. */
struct bed *out;
AllocVar(out);
out->chrom = cloneString(in->chrom);
out->chromStart = out->thickStart = in->thickStart;
out->chromEnd = out->thickEnd = in->thickEnd;
out->name = cloneString(in->name);
out->strand[0] = in->strand[0];
out->score = in->score;
out->itemRgb = in->itemRgb;
/* If need be fill in blocks. */
if (in->blockCount > 0)
{
/* Count up blocks inside CDS. */
int i;
int outBlockCount = 0;
for (i=0; i<in->blockCount; ++i)
{
int start = in->chromStart + in->chromStarts[i];
int end = start + in->blockSizes[i];
if (start < in->thickStart) start = in->thickStart;
if (end > in->thickEnd) end = in->thickEnd;
if (start < end)
outBlockCount += 1;
}
/* This trivieal case shouldn't happen, but just in case, we deal with it. */
if (outBlockCount == 0)
{
freeMem(out);
return NULL;
}
/* Allocate block arrays for output. */
out->blockCount = outBlockCount;
AllocArray(out->chromStarts, outBlockCount);
AllocArray(out->blockSizes, outBlockCount);
/* Scan through input one more time, copying to out. */
int outBlockIx = 0;
for (i=0; i<in->blockCount; ++i)
{
int start = in->chromStart + in->chromStarts[i];
int end = start + in->blockSizes[i];
if (start < in->thickStart) start = in->thickStart;
if (end > in->thickEnd) end = in->thickEnd;
if (start < end)
{
out->chromStarts[outBlockIx] = start - out->chromStart;
out->blockSizes[outBlockIx] = end - start;
outBlockIx += 1;
}
}
}
return out;
}
struct bed *bedThickOnlyList(struct bed *inList)
/* Return a list of beds that only are the thick part of input. */
{
struct bed *outList = NULL, *out, *in;
for (in = inList; in != NULL; in = in->next)
{
if ((out = bedThickOnly(in)) != NULL)
slAddHead(&outList, out);
}
slReverse(&outList);
return outList;
}
char *bedAsDef(int bedFieldCount, int totalFieldCount)
/* Return an autoSql definition for a bed of given number of fields.
* Normally totalFieldCount is equal to bedFieldCount. If there are extra
* fields they are just given the names field16, field17, etc and type string. */
{
if (bedFieldCount < 3 || bedFieldCount > 15)
errAbort("bedFieldCount is %d, but must be between %d and %d in bedAsDef", bedFieldCount, 3, 15);
struct dyString *dy = dyStringNew(0);
dyStringAppend(dy,
"table bed\n"
"\"Browser Extensible Data\"\n"
" (\n"
" string chrom; \"Reference sequence chromosome or scaffold\"\n"
" uint chromStart; \"Start position in chromosome\"\n"
" uint chromEnd; \"End position in chromosome\"\n"
);
if (bedFieldCount >= 4)
dyStringAppend(dy, " string name; \"Name of item.\"\n");
if (bedFieldCount >= 5)
dyStringAppend(dy, " int score; \"Score (0-1000)\"\n");
if (bedFieldCount >= 6)
dyStringAppend(dy, " char[1] strand; \"+ or - for strand\"\n");
if (bedFieldCount >= 7)
dyStringAppend(dy, " uint thickStart; \"Start of where display should be thick (start codon)\"\n");
if (bedFieldCount >= 8)
dyStringAppend(dy, " uint thickEnd; \"End of where display should be thick (stop codon)\"\n");
if (bedFieldCount >= 9)
- dyStringAppend(dy, " uint reserved; \"Used as itemRgb as of 2004-11-22\"\n");
+ dyStringAppend(dy, " string reserved; \"Used as itemRgb as of 2004-11-22\"\n");
if (bedFieldCount >= 10)
dyStringAppend(dy, " int blockCount; \"Number of blocks\"\n");
if (bedFieldCount >= 11)
dyStringAppend(dy, " int[blockCount] blockSizes; \"Comma separated list of block sizes\"\n");
if (bedFieldCount >= 12)
dyStringAppend(dy, " int[blockCount] chromStarts; \"Start positions relative to chromStart\"\n");
if (bedFieldCount >= 13)
dyStringAppend(dy, " int expCount; \"Experiment count\"\n");
if (bedFieldCount >= 14)
dyStringAppend(dy, " int[expCount] expIds; \"Comma separated list of experiment ids. Always 0,1,2,3....\"\n");
if (bedFieldCount >= 15)
dyStringAppend(dy, " float[expCount] expScores; \"Comma separated list of experiment scores.\"\n");
int i;
for (i=bedFieldCount+1; i<=totalFieldCount; ++i)
dyStringPrintf(dy, "string field%d; \"Undocumented field\"\n", i+1);
dyStringAppend(dy, " )\n");
return dyStringCannibalize(&dy);
}