7f5b0b5cf3ed7efc0f9e84aa5bdcc097473c7584
kent
  Thu Feb 3 21:18:16 2011 -0800
Some preliminary work putting proper bam support in table browser.  Still quite a ways to go, but have implemented a way to get a list of alignments in 'sam' format out of a BAM.  This I think contains all the info.
diff --git src/hg/lib/bamFile.c src/hg/lib/bamFile.c
index 45648f7..bb1543d 100644
--- src/hg/lib/bamFile.c
+++ src/hg/lib/bamFile.c
@@ -1,24 +1,25 @@
 /* bam -- interface to binary alignment format files using Heng Li's samtools lib. */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "htmshell.h"
 #include "hdb.h"
 #include "hgConfig.h"
 #include "udc.h"
+#include "samAlignment.h"
 #include "bamFile.h"
 
 static char const rcsid[] = "$Id: bamFile.c,v 1.22 2010/03/04 05:14:13 angie Exp $";
 
 char *bamFileNameFromTable(struct sqlConnection *conn, char *table, char *bamSeqName)
 /* Return file name from table.  If table has a seqName column, then grab the 
  * row associated with bamSeqName (which is not nec. in chromInfo, e.g. 
  * bam file might have '1' not 'chr1'). */
 {
 boolean checkSeqName = (sqlFieldIndex(conn, table, "seqName") >= 0);
 if (checkSeqName && bamSeqName == NULL)
     errAbort("bamFileNameFromTable: table %s has seqName column, but NULL seqName passed in",
 	     table);
 char query[512];
 if (checkSeqName)
@@ -185,55 +186,59 @@
 #ifndef KNETFILE_HOOKS
     setCurrentDir(runDir);
 #endif//ndef KNETFILE_HOOKS
     samclose(fh);
     if (idx == NULL)
 	{
 	warn("bamFileExists: failed to read index corresponding to %s", bamFileName);
 	return FALSE;
 	}
     free(idx); // Not freeMem, freez etc -- sam just uses malloc/calloc.
     return TRUE;
     }
 return FALSE;
 }
 
-void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData)
+void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData,
+	samfile_t **pSamFile)
 /* Open the .bam file, fetch items in the seq:start-end position range,
  * and call callbackFunc on each bam item retrieved from the file plus callbackData.
- * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. */
+ * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. 
+ * The pSamFile parameter is optional.  If non-NULL it will be filled in, just for
+ * the benefit of the callback function, with the open samFile.  */
 {
 char *bamFileName = samtoolsFileName(fileOrUrl);
 samfile_t *fh = samopen(bamFileName, "rb", NULL);
 if (fh == NULL)
     {
     boolean usingUrl = (strstr(fileOrUrl, "tp://") || strstr(fileOrUrl, "https://"));
     struct dyString *urlWarning = dyStringNew(0);
     if (usingUrl)
 	{
 	char *udcFuseRoot = cfgOption("udcFuse.mountPoint");
 	boolean usingUdc = (udcFuseRoot != NULL && startsWith(udcFuseRoot, bamFileName));
 	if (usingUdc)
 	    dyStringAppend(urlWarning, " (using udcFuse)");
 	dyStringAppend(urlWarning,
 		       ". If you are able to access the URL with your web browser, "
 		       "please try reloading this page.");
 	}
     warn("failed to open %s%s", fileOrUrl, urlWarning->string);
     return;
     }
-
+if (pSamFile != NULL)
+    *pSamFile = fh;
 int chromId, start, end;
 int ret = bam_parse_region(fh->header, position, &chromId, &start, &end);
 if (ret != 0 && startsWith("chr", position))
     ret = bam_parse_region(fh->header, position+strlen("chr"), &chromId, &start, &end);
 if (ret != 0)
     // If the bam file does not cover the current chromosome, OK
     return;
 
 #ifndef KNETFILE_HOOKS
 // Since samtools' url-handling code saves the .bai file to the current directory,
 // chdir to a trash directory before calling bam_index_load, then chdir back.
 char *runDir = getCurrentDir();
 char *samDir = getSamDir();
 setCurrentDir(samDir);
 #endif//ndef KNETFILE_HOOKS
@@ -269,80 +274,99 @@
 unsigned int *cigarPacked = bam1_cigar(bam);
 const bam1_core_t *core = &bam->core;
 char op;
 int n = bamUnpackCigarElement(cigarPacked[0], &op);
 int low = (op == 'S') ? n : 0;
 n = bamUnpackCigarElement(cigarPacked[core->n_cigar-1], &op);
 int high = (op == 'S') ? n : 0;
 if (retLow != NULL)
     *retLow = low;
 if (retHigh != NULL)
     *retHigh = high;
 if (retClippedQLen != NULL)
     *retClippedQLen = (core->l_qseq - low - high);
 }
 
-char *bamGetQuerySequence(const bam1_t *bam, boolean useStrand)
-/* Return the nucleotide sequence encoded in bam.  The BAM format 
+
+void bamUnpackQuerySequence(const bam1_t *bam, boolean useStrand, char *qSeq)
+/* Fill in qSeq with the nucleotide sequence encoded in bam.  The BAM format 
  * reverse-complements query sequence when the alignment is on the - strand,
  * so if useStrand is given we rev-comp it back to restore the original query 
  * sequence. */
 {
 const bam1_core_t *core = &bam->core;
 int qLen = core->l_qseq;
-char *qSeq = needMem(qLen+1);
 uint8_t *packedQSeq = bam1_seq(bam);
 int i;
 for (i = 0; i < qLen; i++)
     qSeq[i] = bam_nt16_rev_table[bam1_seqi(packedQSeq, i)];
 qSeq[i] = '\0';
 if (useStrand && bamIsRc(bam))
     reverseComplement(qSeq, qLen);
+}
+
+char *bamGetQuerySequence(const bam1_t *bam, boolean useStrand)
+/* Allocate and return the nucleotide sequence encoded in bam.  The BAM format 
+ * reverse-complements query sequence when the alignment is on the - strand,
+ * so if useStrand is given we rev-comp it back to restore the original query 
+ * sequence. */
+{
+const bam1_core_t *core = &bam->core;
+int qLen = core->l_qseq;
+char *qSeq = needMem(qLen+1);
+bamUnpackQuerySequence(bam, useStrand, qSeq);
 return qSeq;
 }
 
 UBYTE *bamGetQueryQuals(const bam1_t *bam, boolean useStrand)
 /* Return the base quality scores encoded in bam as an array of ubytes. */
 {
 const bam1_core_t *core = &bam->core;
 int qLen = core->l_qseq;
 UBYTE *arr = needMem(qLen);
 boolean isRc = useStrand && bamIsRc(bam);
 UBYTE *qualStr = bam1_qual(bam);
 int i;
 for (i = 0;  i < core->l_qseq;  i++)
     {
     int offset = isRc ? (qLen - 1 - i) : i;
     arr[i] = (qualStr[0] == 255) ? 255 : qualStr[offset];
     }
 return arr;
 }
 
-char *bamGetCigar(const bam1_t *bam)
-/* Return a BAM-enhanced CIGAR string, decoded from the packed encoding in bam. */
+static void bamUnpackCigar(const bam1_t *bam, struct dyString *dyCigar)
+/* Unpack CIGAR string into dynamic string */
 {
 unsigned int *cigarPacked = bam1_cigar(bam);
 const bam1_core_t *core = &bam->core;
-struct dyString *dyCigar = dyStringNew(min(8, core->n_cigar*4));
 int i;
 for (i = 0;  i < core->n_cigar;  i++)
     {
     char op;
     int n = bamUnpackCigarElement(cigarPacked[i], &op);
     dyStringPrintf(dyCigar, "%d", n);
     dyStringAppendC(dyCigar, op);
     }
+}
+
+char *bamGetCigar(const bam1_t *bam)
+/* Return a BAM-enhanced CIGAR string, decoded from the packed encoding in bam. */
+{
+const bam1_core_t *core = &bam->core;
+struct dyString *dyCigar = dyStringNew(min(8, core->n_cigar*4));
+bamUnpackCigar(bam, dyCigar);
 return dyStringCannibalize(&dyCigar);
 }
 
 void bamShowCigarEnglish(const bam1_t *bam)
 /* Print out cigar in English e.g. "20 (mis)Match, 1 Deletion, 3 (mis)Match" */
 {
 unsigned int *cigarPacked = bam1_cigar(bam);
 const bam1_core_t *core = &bam->core;
 int i;
 for (i = 0;  i < core->n_cigar;  i++)
     {
     char op;
     int n = bamUnpackCigarElement(cigarPacked[i], &op);
     if (i > 0)
 	printf(", ");
@@ -575,16 +599,146 @@
     else
 	{
 	if (type == 'A' || type == 'C' || type == 'c') { ++s; }
 	else if (type == 'S' || type == 's') { s += 2; }
 	else if (type == 'I' || type == 'i' || type == 'f') { s += 4; }
 	else if (type == 'd') { s += 8; }
 	else if (type == 'Z' || type == 'H')
 	    {
 	    while (*s++);
 	    }
 	}
     }
 return val;
 }
 
+void bamUnpackAux(const bam1_t *bam, struct dyString *dy)
+/* Unpack the tag:type:val part of bam into dy */
+{
+// adapted from part of bam.c bam_format1:
+uint8_t *s = bam1_aux(bam);
+boolean firstTime = TRUE;
+while (s < bam->data + bam->data_len)
+    {
+    if (firstTime)
+        firstTime = FALSE;
+    else
+        dyStringAppendC(dy, '\t');
+    dyStringAppendC(dy, *s++);
+    dyStringAppendC(dy, *s++);
+    dyStringAppendC(dy, ':');
+    dyStringAppendC(dy, s[0]);
+    dyStringAppendC(dy, ':');
+    uint8_t type = *s++;
+    if (type == 'A') { dyStringPrintf(dy, "%c", *s); ++s; }
+    else if (type == 'C') { dyStringPrintf(dy, "%u", *s); ++s; }
+    else if (type == 'c') { dyStringPrintf(dy, "%d", *s); ++s; }
+    else if (type == 'S') { dyStringPrintf(dy, "%u", *(uint16_t*)s); s += 2; }
+    else if (type == 's') { dyStringPrintf(dy, "%d", *(int16_t*)s); s += 2; }
+    else if (type == 'I') { dyStringPrintf(dy, "%u", *(uint32_t*)s); s += 4; }
+    else if (type == 'i') { dyStringPrintf(dy, "%d", *(int32_t*)s); s += 4; }
+    else if (type == 'f') { dyStringPrintf(dy, "%g", *(float*)s); s += 4; }
+    else if (type == 'd') { dyStringPrintf(dy, "%lg", *(double*)s); s += 8; }
+    else if (type == 'Z' || type == 'H')
+	{
+	dyStringAppend(dy, (char *)s);
+	s += strlen((char *)s) + 1;
+	}
+    }
+}
+
+
+struct bamToSamHelper
+/* Helper structure to help get sam alignments out of a BAM file */
+    {
+    struct lm *lm;	/* Local memory pool to allocate into */
+    char *chrom;	/* Chromosome name */
+    struct dyString *dy;	/* Dynamic temporary string */
+    samfile_t *samFile;	/* Open sam/bam file header. */
+    struct samAlignment *samList;	/* List of alignments. */
+    };
+
+static void addToChars(char *s, int size, char amount)
+/* Add amount to each char in s of given size. */
+{
+int i;
+for (i=0; i<size; ++i)
+    s[i] += amount;
+}
+
+static boolean isAllSameChar(char *s, int size, char c)
+/* Return TRUE if first size chars of s are 0 */
+{
+int i;
+for (i=0; i<size; ++i)
+    if (s[i] != c)
+	return FALSE;
+return TRUE;
+}
+
+
+int bamAddOneSamAlignment(const bam1_t *bam, void *data)
+/* bam_fetch() calls this on each bam alignment retrieved.  Translate each bam
+ * into a linkedFeaturesSeries item, and either store it until we find its mate
+ * or add it to tg->items. */
+{
+struct bamToSamHelper *helper = (struct bamToSamHelper *)data;
+struct lm *lm = helper->lm;
+struct samAlignment *sam;
+lmAllocVar(lm, sam);
+const bam1_core_t *core = &bam->core;
+struct dyString *dy = helper->dy;
+sam->qName = lmCloneString(lm, bam1_qname(bam));
+sam->flag = core->flag;
+sam->rName = helper->chrom;
+sam->pos = core->pos + 1;
+sam->mapQ = core->qual;
+dyStringClear(dy);
+bamUnpackCigar(bam, dy);
+sam->cigar = lmCloneStringZ(lm, dy->string, dy->stringSize);
+if (core->mtid >= 0)
+    {
+    if (core->tid == core->mtid)
+	sam->rNext = "=";
+    else
+	sam->rNext = lmCloneString(lm, helper->samFile->header->target_name[core->mtid]);
+    }
+else
+    sam->rNext = "*";
+sam->pNext = core->mpos + 1;
+sam->tLen = core->isize;
+sam->seq = lmAlloc(lm, core->l_qseq + 1);
+bamUnpackQuerySequence(bam, FALSE, sam->seq);
+char *bamQual = (char *)bam1_qual(bam);
+if (isAllSameChar(bamQual, core->l_qseq, -1))
+    sam->qual = "*";
+else
+    {
+    sam->qual = lmCloneStringZ(lm, bamQual, core->l_qseq);
+    addToChars(sam->qual, core->l_qseq, 33);
+    }
+dyStringClear(dy);
+bamUnpackAux(bam, dy);
+sam->tagTypeVals = lmCloneStringZ(lm, dy->string, dy->stringSize);
+slAddHead(&helper->samList, sam);
+return 0;
+}
+
+struct samAlignment *bamFetchSamAlignment(char *fileOrUrl, char *chrom, int start, int end,
+	struct lm *lm)
+/* Fetch region as a list of samAlignments - which is more or less an unpacked
+ * bam record.  Results is allocated out of lm, since it tends to be large... */
+{
+struct bamToSamHelper helper;
+helper.lm = lm;
+helper.chrom = chrom;
+helper.dy = dyStringNew(0);
+helper.samList = NULL;
+char posForBam[256];
+safef(posForBam, sizeof(posForBam), "%s:%d-%d", chrom, start, end);
+bamFetch(fileOrUrl, posForBam, bamAddOneSamAlignment, &helper, &helper.samFile);
+dyStringFree(&helper.dy);
+slReverse(&helper.samList);
+return helper.samList;
+}
+
 #endif//def USE_BAM