3c9f824a9764c5eed782ae3f07146aadecec74f0
kate
  Mon Nov 27 15:07:15 2017 -0800
Cleanup to ease some fixes to longTabix.

diff --git src/lib/annoStreamLongTabix.c src/lib/annoStreamLongTabix.c
new file mode 100644
index 0000000..4ca44a7
--- /dev/null
+++ src/lib/annoStreamLongTabix.c
@@ -0,0 +1,208 @@
+/* annoStreamLongTabix -- subclass of annoStreamer for longTabix files */
+
+/* Copyright (C) 2014 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
+#include "annoStreamLongTabix.h"
+#include "twoBit.h"
+#include "bedTabix.h"
+#include "sqlNum.h"
+
+struct annoStreamLongTabix
+/* annoStreamer subclass for longTabix files; members after 'streamer' are private. */
+    {
+    struct annoStreamer streamer;	// Parent class members & methods; self is cast to/from this
+    char *asWords[6];	          // Cloned copies of the current row's first 6 columns (owned here)
+    struct bedTabixFile *btf;		// Open longTabix file object; rows are read through btf->lf
+    int numFileCols;			// Number of columns expected on each line of the file.
+    int maxRecords;			// Maximum number of rows to read; <= 0 means no limit.
+    int recordCount;			// Number of rows read so far (counted before filtering).
+    boolean eof;			// True at end of file, at maxRecords, or if region not in index
+    };
+
+static void asxSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd)
+/* Record the new region in the parent streamer and reset eof for it.
+ * chrom may be NULL, in which case no tabix seek is attempted and eof is just cleared. */
+{
+struct annoStreamLongTabix *self = (struct annoStreamLongTabix *)vSelf;
+annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
+self->eof = FALSE;
+// No chrom means there is no region to seek to.
+if (chrom == NULL)
+    return;
+// A failed seek means the region is not in the longTabix index.  Raise eof in that case,
+// otherwise nextRow would go on returning rows from the previous file position.
+if (! lineFileSetTabixRegion(self->btf->lf, chrom, regionStart, regionEnd))
+    self->eof = TRUE;
+}
+
+static char *asxGetHeader(struct annoStreamer *vSelf)
+/* Header text for a formatter: NULL (no header) unless built with NOTNOW defined. */
+{
+#ifdef NOTNOW
+return cloneString(((struct annoStreamLongTabix *)vSelf)->btf->headerString);
+#else
+return NULL;
+#endif
+}
+
+static char **nextRowRaw(struct annoStreamLongTabix *self)
+/* Read one line of the longTabix file and keep private copies of its first 6 columns
+ * in self->asWords, releasing the previous row's copies as they are replaced.
+ * Returns self->asWords, or NULL if lineFileChop yields no words. */
+{
+char *fileWords[self->numFileCols];
+int numWords = lineFileChop(self->btf->lf, fileWords);
+if (numWords <= 0)
+    return NULL;
+lineFileExpectWords(self->btf->lf, self->numFileCols, numWords);
+int col = 0;
+while (col < 6)
+    {
+    freeMem(self->asWords[col]);
+    self->asWords[col] = cloneString(fileWords[col]);
+    col++;
+    }
+return self->asWords;
+}
+
+#ifdef NOTNOW
+static char *getProperChromName(struct annoStreamLongTabix *self, char *vcfChrom)
+/* Disabled under NOTNOW.  Tolerate chr-less chrom names ("1" for "chr1" etc): return the
+ * chr-ful name if the assembly 2bit has it, caching the answer in self->chromNameHash. */
+{
+char *name = hashFindVal(self->chromNameHash, vcfChrom);  // NOTE(review): struct has no such member yet
+if (name == NULL)
+    {
+    name = vcfChrom;
+    struct twoBitFile *tbf = self->streamer.assembly->tbf;
+    char buf[256];
+    if (! twoBitIsSequence(tbf, vcfChrom))
+	{
+	safef(buf, sizeof(buf), "chr%s", vcfChrom);
+	if (twoBitIsSequence(tbf, buf))
+	    name = buf;
+	}
+    name = lmCloneString(self->chromNameHash->lm, name);	// copy made while buf is still in scope
+    hashAdd(self->chromNameHash, vcfChrom, name);
+    }
+return name;
+}
+#endif
+
+static char **nextRowUnfiltered(struct annoStreamLongTabix *self, char *minChrom, uint minEnd)
+/* Get the next longTabix row (no filters applied), updating recordCount and eof.
+ * minChrom/minEnd: if minChrom is non-NULL, the caller has no use for rows before that
+ * position, so a row on an earlier chrom triggers a tabix seek ahead for later reads.
+ * Return pointer to self->asWords if we get a row, otherwise NULL. */
+{
+struct annoStreamer *sSelf = (struct annoStreamer *)self;
+char *regionChrom = sSelf->chrom;
+uint regionStart = sSelf->regionStart;
+if (minChrom != NULL)
+    {
+    if (regionChrom == NULL)
+	{
+	regionChrom = minChrom;
+	regionStart = minEnd;
+	}
+    else
+	regionStart = max(regionStart, minEnd);
+    }
+char **words = nextRowRaw(self);
+if (regionChrom != NULL && words != NULL)
+    {
+    // Borrow the chrom string held in self->asWords: a comparison needs no private copy.
+    char *rowChrom = words[0];
+    if (strcmp(rowChrom, regionChrom) < 0)
+	{
+	uint regionEnd = sSelf->regionEnd;
+	if (minChrom != NULL && sSelf->chrom == NULL)
+	    regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom);
+	// If lineFileSetTabixRegion fails, just keep the current file position
+	// -- hopefully we'll just be skipping to the next row after region{Chrom,Start,End}.
+	lineFileSetTabixRegion(self->btf->lf, regionChrom, regionStart, regionEnd);
+	}
+    }
+if (words != NULL)
+    self->recordCount++;
+if (words == NULL || (self->maxRecords > 0 && self->recordCount >= self->maxRecords))
+    self->eof = TRUE;
+return words;
+}
+
+static struct annoRow *nextRowFiltered(struct annoStreamLongTabix *self, char *minChrom, uint minEnd,
+				       struct lm *callerLm)
+/* Read rows until one passes the streamer's filters or fails them as a right-join
+ * failure, then return it as an annoRow built with callerLm (rightFail is passed along).
+ * Rows that fail in any other way are skipped.  Returns NULL when rows run out. */
+{
+struct annoStreamer *sSelf = (struct annoStreamer *)self;
+boolean rightFail = FALSE;
+char **row;
+for (;;)
+    {
+    row = nextRowUnfiltered(self, minChrom, minEnd);
+    if (row == NULL)
+	return NULL;
+    // Keep the row if it passed, or if its failure is a right-join failure.
+    if (! annoFilterRowFails(sSelf->filters, row, sSelf->numCols, &rightFail) || rightFail)
+	break;
+    }
+uint chromStart = sqlUnsigned(row[1]);
+uint chromEnd = sqlUnsigned(row[2]);
+return annoRowFromStringArray(row[0], chromStart, chromEnd, rightFail,
+			      row, sSelf->numCols, callerLm);
+}
+
+
+static struct annoRow *asxNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd,
+				  struct lm *callerLm)
+/* Return an annoRow for the next longTabix record that gets through the filters, or NULL
+ * if we are at eof or no such row is left.  minChrom/minEnd are passed down as a hint of
+ * the earliest position of interest.  It is an error (errAbort) for minChrom to differ
+ * from the chrom of a region that has been set on this streamer. */
+{
+struct annoStreamLongTabix *self = (struct annoStreamLongTabix *)sSelf;
+boolean chromMismatch = (minChrom != NULL && sSelf->chrom != NULL
+			 && differentString(minChrom, sSelf->chrom));
+if (chromMismatch)
+    errAbort("annoStreamLongTabix %s: nextRow minChrom='%s' but region chrom='%s'",
+	     sSelf->name, minChrom, sSelf->chrom);
+// eof covers end of file, the maxRecords limit, and a region missing from the index.
+if (self->eof)
+    return NULL;
+return nextRowFiltered(self, minChrom, minEnd, callerLm);
+}
+
+
+static void asxClose(struct annoStreamer **pVSelf)
+/* Close longTabix file, free the row strings cached in asWords, and free self.
+ * Does nothing if pVSelf or *pVSelf is NULL. */
+{
+if (pVSelf == NULL || *pVSelf == NULL)
+    return;
+struct annoStreamLongTabix *self = *(struct annoStreamLongTabix **)pVSelf;
+// asWords holds the clones made by nextRowRaw (the same 6 slots); the last row's
+// copies are not released anywhere else in this file.
+int i;
+for (i = 0;  i < 6;  i++)
+    freeMem(self->asWords[i]);
+bedTabixFileClose(&(self->btf));
+annoStreamerFree(pVSelf);
+}
+
+struct annoStreamer *annoStreamLongTabixNew(char *fileOrUrl,  struct annoAssembly *aa, int maxRecords)
+/* Create an annoStreamer (subclass) object from a longTabix indexed tab file.
+ * errAborts if fileOrUrl can't be opened.  maxRecords <= 0 means no limit on rows. */
+{
+struct bedTabixFile *btf = bedTabixFileMayOpen(fileOrUrl, NULL, 0, 0);
+if (btf == NULL)
+    errAbort("annoStreamLongTabixNew: unable to open longTabix file or URL '%s'", fileOrUrl);
+struct annoStreamLongTabix *self;
+AllocVar(self);
+annoStreamerInit(&(self->streamer), aa, longTabixAsObj(), fileOrUrl);
+self->streamer.rowType = arWords;
+self->streamer.setRegion = asxSetRegion;
+self->streamer.getHeader = asxGetHeader;
+self->streamer.nextRow = asxNextRow;
+self->streamer.close = asxClose;
+self->btf = btf;
+self->numFileCols = 6;
+self->maxRecords = maxRecords;
+return (struct annoStreamer *)self;
+}