src/lib/bigBed.c bd1404d9126eb1575ed20098204038364ea90308

bd1404d9126eb1575ed20098204038364ea90308
kent
  Tue Mar 5 10:04:23 2013 -0800
Big bed extra indexes seem to work now.  Code still needs a little polish and testing.
diff --git src/lib/bigBed.c src/lib/bigBed.c
index 1108797..1bf3609 100644
--- src/lib/bigBed.c
+++ src/lib/bigBed.c
@@ -190,45 +190,30 @@
 	summarySize, summary);
 }
 
 boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end,
 	enum bbiSummaryType summaryType, int summarySize, double *summaryValues)
 /* Fill in summaryValues with  data from indicated chromosome range in bigBed file.
  * Be sure to initialize summaryValues to a default value, which will not be touched
  * for regions without data in file.  (Generally you want the default value to either
  * be 0.0 or nan("") depending on the application.)  Returns FALSE if no data
  * at that position. */
 {
 return bbiSummaryArray(bbi, chrom, start, end, bigBedCoverageIntervals,
 	summaryType, summarySize, summaryValues);
 }
 
-#ifdef OLD
-void bigBedAttachNameIndex(struct bbiFile *bbi)
-/* Attach name index part of bbiFile to bbi */
-{
-if (bbi->nameBpt == NULL)
-    {
-    if (bbi->nameIndexOffset == 0)
-	errAbort("%s has no name index", bbi->fileName);
-    udcSeek(bbi->udc, bbi->nameIndexOffset);
-    bbi->nameBpt = bptFileAttach(bbi->fileName, bbi->udc);
-    }
-uglyAbort("bigBedAttachNameIndex() - no can do");
-}
-#endif /* OLD */
-
 struct offsetSize 
 /* Simple file offset and file size. */
     {
     bits64 offset; 
     bits64 size;
     };
 
 static int cmpOffsetSizeRef(const void *va, const void *vb)
 /* Compare to sort slRef pointing to offsetSize.  Sort is kind of hokey,
  * but guarantees all items that are the same will be next to each other
  * at least, which is all we care about. */
 {
 const struct slRef *a = *((struct slRef **)va);
 const struct slRef *b = *((struct slRef **)vb);
 return memcmp(a->val, b->val, sizeof(struct offsetSize));
@@ -294,62 +279,83 @@
     {
     char *name = names[nameIx];
     struct slRef *oneList = bptFileFindMultiple(index, 
 	    name, strlen(name), sizeof(struct offsetSize));
     blockList = slCat(oneList, blockList);
     }
 
 /* Create nonredundant list of blocks. */
 struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped);
 
 /* Clean up and resturn result. */
 slRefFreeListAndVals(&blockList);
 return fosList;
 }
 
-typedef boolean (*BbFirstWordMatch)(char *line, void *target);
+typedef boolean (*BbFirstWordMatch)(char *line, int fieldIx, void *target);
 /* A function that returns TRUE if first word in tab-separated line matches target. */
 
+static void extractField(char *line, int fieldIx, char **retField, int *retFieldSize)
+/* Go through tab separated line and figure out start and size of given field. */
+{
+int i;
+fieldIx -= 3;	/* Skip over chrom/start/end, which are not in line. */
+for (i=0; i<fieldIx; ++i)
+    {
+    line = strchr(line, '\t');
+    if (line == NULL)
+        {
+	warn("Not enough fields in extractField of %s", line);
+	internalErr();
+	}
+    line += 1;
+    }
+char *end = strchr(line, '\t');
+if (end == NULL)
+    end = line + strlen(line);
+*retField = line;
+*retFieldSize = end - line;
+}
 
-static boolean bbFirstWordMatchesName(char *line, void *target)
+static boolean bbWordMatchesName(char *line, int fieldIx, void *target)
 /* Return true if first word of line is same as target, which is just a string. */
 {
 char *name = target;
-return startsWithWordByDelimiter(name, '\t', line);
+int fieldSize;
+char *field;
+extractField(line, fieldIx, &field, &fieldSize);
+return strlen(name) == fieldSize && memcmp(name, field, fieldSize) == 0;
 }
 
-static boolean bbFirstWordIsInHash(char *line, void *target)
+static boolean bbWordIsInHash(char *line, int fieldIx, void *target)
 /* Return true if first word of line is same as target, which is just a string. */
 {
-/* Isolate out first word. */
-int firstWordSize;
-char *tab = strchr(line, '\t');
-if (tab == NULL)
-    firstWordSize = strlen(line);
-else
-    firstWordSize = tab - line;
-char firstWord[firstWordSize+1];
-memcpy(firstWord, line, firstWordSize);
-firstWord[firstWordSize] = 0;
+int fieldSize;
+char *field;
+extractField(line, fieldIx, &field, &fieldSize);
+char fieldString[fieldSize+1];
+memcpy(fieldString, field, fieldSize);
+fieldString[fieldSize] = 0;
 
 /* Return boolean value that reflects whether we found it in hash */
 struct hash *hash = target;
-return hashLookup(hash, firstWord) != NULL;
+return hashLookup(hash, fieldString) != NULL;
 }
 
 static struct bigBedInterval *bigBedIntervalsMatchingName(struct bbiFile *bbi, 
-    struct fileOffsetSize *fosList, BbFirstWordMatch matcher, void *target, struct lm *lm)
+    struct fileOffsetSize *fosList, BbFirstWordMatch matcher, int fieldIx, 
+    void *target, struct lm *lm)
 /* Return list of intervals inside of sectors of bbiFile defined by fosList where the name 
  * matches target somehow. */
 {
 struct bigBedInterval *interval, *intervalList = NULL;
 struct fileOffsetSize *fos;
 boolean isSwapped = bbi->isSwapped;
 for (fos = fosList; fos != NULL; fos = fos->next)
     {
     /* Read in raw data */
     udcSeek(bbi->udc, fos->offset);
     char *rawData = needLargeMem(fos->size);
     udcRead(bbi->udc, rawData, fos->size);
 
     /* Optionally uncompress data, and set data pointer to uncompressed version. */
     char *uncompressedData = NULL;
@@ -373,80 +379,81 @@
 
     /* Read next record into local variables. */
     while (blockPt < blockEnd)
 	{
 	bits32 chromIx = memReadBits32(&blockPt, isSwapped);
 	bits32 s = memReadBits32(&blockPt, isSwapped);
 	bits32 e = memReadBits32(&blockPt, isSwapped);
 	int c;
 	dyStringClear(dy);
 	while ((c = *blockPt++) >= 0)
 	    {
 	    if (c == 0)
 		break;
 	    dyStringAppendC(dy, c);
 	    }
-	if ((*matcher)(dy->string, target))
+	if ((*matcher)(dy->string, fieldIx, target))
 	    {
 	    lmAllocVar(lm, interval);
 	    interval->start = s;
 	    interval->end = e;
 	    interval->rest = cloneString(dy->string);
 	    interval->chromId = chromIx;
 	    slAddHead(&intervalList, interval);
 	    }
 	}
 
     /* Clean up temporary buffers. */
     dyStringFree(&dy);
     freez(&uncompressedData);
     freez(&rawData);
     }
 slReverse(&intervalList);
 return intervalList;
 }
 
 
 
 struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, struct bptFile *index,
-    char *name, struct lm *lm)
+    int fieldIx, char *name, struct lm *lm)
 /* Return list of intervals matching file. These intervals will be allocated out of lm. */
 {
 struct fileOffsetSize *fosList = bigBedChunksMatchingName(bbi, index, name);
 struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, 
-    bbFirstWordMatchesName, name, lm);
+    bbWordMatchesName, fieldIx, name, lm);
 slFreeList(&fosList);
 return intervalList;
 }
 
 struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, struct bptFile *index,
-    char **names, int nameCount, struct lm *lm)
-/* Fetch all records matching any of the names. Return list is allocated out of lm. */
+    int fieldIx, char **names, int nameCount, struct lm *lm)
+/* Fetch all records matching any of the names. Using given index on given field.
+ * Return list is allocated out of lm. */
 {
 /* Set up name index and get list of chunks that match any of our names. */
 struct fileOffsetSize *fosList = bigBedChunksMatchingNames(bbi, index, names, nameCount);
 
 /* Create hash of all names. */
 struct hash *hash = newHash(0);
 int nameIx;
 for (nameIx=0; nameIx < nameCount; ++nameIx)
     hashAdd(hash, names[nameIx], NULL);
 
 
 /* Get intervals where name matches hash target. */
 struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, 
-    bbFirstWordIsInHash, hash, lm);
+    bbWordIsInHash, fieldIx, hash, lm);
 
 /* Clean up and return results. */
 slFreeList(&fosList);
 hashFree(&hash);
 return intervalList;
 }
 
 void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f)
 /* Write out big bed interval list to bed file, looking up chromosome. */
 {
 char chromName[bbi->chromBpt->keySize+1];
 int lastChromId = -1;
 struct bigBedInterval *interval;
 for (interval = intervalList; interval != NULL; interval = interval->next)
     {
@@ -475,50 +482,44 @@
 }
 
 char *bigBedAutoSqlText(struct bbiFile *bbi)
 /* Get autoSql text if any associated with file.  Do a freeMem of this when done. */
 {
 if (bbi->asOffset == 0)
     return NULL;
 struct udcFile *f = bbi->udc;
 udcSeek(f, bbi->asOffset);
 return udcReadStringAndZero(f);
 }
 
 struct asObject *bigBedAs(struct bbiFile *bbi)
 /* Get autoSql object definition if any associated with file. */
 {
-struct asObject *as = bbi->cachedAs;
-if (as != NULL)
-    return as;
 if (bbi->asOffset == 0)
     return NULL;
 char *asText = bigBedAutoSqlText(bbi);
-bbi->cachedAs = as = asParseText(asText);
+struct asObject *as = asParseText(asText);
 freeMem(asText);
 return as;
 }
 
 struct asObject *bigBedAsOrDefault(struct bbiFile *bbi)
 // Get asObject associated with bigBed - if none exists in file make it up from field counts.
 {
-struct asObject *as = bbi->cachedAs;
-if (as != NULL)
-    return as;
-as = bigBedAs(bbi);
+struct asObject *as = bigBedAs(bbi);
 if (as == NULL)
-    bbi->cachedAs = as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount));
+    as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount));
 return as;
 }
 
 struct asObject *bigBedFileAsObjOrDefault(char *fileName)
 // Get asObject associated with bigBed file, or the default.
 {
 struct bbiFile *bbi = bigBedFileOpen(fileName);
 if (bbi)
     {
     struct asObject *as = bigBedAsOrDefault(bbi);
     bbiFileClose(&bbi);
     return as;
     }
 return NULL;
 }
@@ -571,43 +572,48 @@
 struct asObject *as = bigBedAsOrDefault(bbi);
 
 /* Make list of field names out of list of field numbers */
 struct slName *nameList = NULL;
 for (intEl = intList; intEl != NULL; intEl = intEl->next)
     {
     struct asColumn *col = slElementFromIx(as->columnList, intEl->val);
     if (col == NULL)
 	{
         warn("Inconsistent bigBed file %s", bbi->fileName);
 	internalErr();
 	}
     slNameAddHead(&nameList, col->name);
     }
 
+asObjectFree(&as);
 return nameList;
 }
 
-struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName)
-/* Return index associated with fieldName.  Aborts if no such index. */
+struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName, int *retFieldIx)
+/* Return index associated with fieldName.  Aborts if no such index.  Optionally return
+ * index in a row of this field. */
 {
 struct udcFile *udc = bbi->udc;
 boolean isSwapped = bbi->isSwapped;
 struct asObject *as = bigBedAsOrDefault(bbi);
 struct asColumn *col = asColumnFind(as, fieldName);
 if (col == NULL)
     errAbort("No field %s in %s", fieldName, bbi->fileName);
 int colIx = slIxFromElement(as->columnList, col);
+if (retFieldIx != NULL)
+   *retFieldIx = colIx;
+asObjectFree(&as);
 
 /* See if we have any extra indexes, and if so seek to there. */
 bits64 offset = bbi->extraIndexListOffset;
 if (offset == 0)
    errAbort("%s has no indexes", bbi->fileName);
 udcSeek(udc, offset);
 
 /* Go through each extra index and see if it's a match */
 int i;
 for (i=0; i<bbi->extraIndexCount; ++i)
     {
     bits16 type = udcReadBits16(udc, isSwapped);
     bits16 fieldCount = udcReadBits16(udc, isSwapped);
     bits64 fileOffset = udcReadBits64(udc, isSwapped);
     udcSeekCur(udc, 4);    // skip over reserved bits