src/hg/lib/bamFile.c 1.2

1.2 2009/08/03 22:00:24 angie
Libified more commonly used BAM code from hgc and hgTracks.
Index: src/hg/lib/bamFile.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/lib/bamFile.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/lib/bamFile.c	27 Jul 2009 21:52:08 -0000	1.1
+++ src/hg/lib/bamFile.c	3 Aug 2009 22:00:24 -0000	1.2
@@ -48,5 +48,70 @@
     errAbort("bam_fetch(%s, %s (chromId=%d) failed (%d)", bamFileName, position, chromId, ret);
 samclose(fh);
 }
 
+char *bamGetQuerySequence(const bam1_t *bam)
+/* Return the nucleotide sequence encoded in bam as a newly allocated string of
+ * core.l_qseq bases, reverse-complemented when the BAM_FREVERSE flag bit is set. */
+{
+int i, qLen = bam->core.l_qseq;
+char *qSeq = needMem(qLen + 1); // +1 for the terminator; assumes needMem zero-fills -- confirm
+uint8_t *s = bam1_seq(bam);
+for (i = 0; i < qLen; i++)
+    qSeq[i] = bam_nt16_rev_table[bam1_seqi(s, i)]; // 4-bit packed base code -> letter
+if (bam->core.flag & BAM_FREVERSE) // test for non-zero: a single masked flag bit never equals 1
+    reverseComplement(qSeq, qLen);
+return qSeq;
+}
+
+char *bamGetCigar(const bam1_t *bam)
+/* Return a BAM-enhanced CIGAR string, decoded from the packed encoding in bam:
+ * for each of the core.n_cigar elements, its length followed by its op character. */
+{
+unsigned int *cigarPacked = bam1_cigar(bam);
+int i, nCigar = bam->core.n_cigar;
+struct dyString *dyCigar = dyStringNew(nCigar*4 > 8 ? nCigar*4 : 8); // capacity hint, floor of 8
+for (i = 0;  i < nCigar;  i++)
+    {
+    char op;
+    int n = bamUnpackCigarElement(cigarPacked[i], &op);
+    dyStringPrintf(dyCigar, "%d", n);
+    dyStringAppendC(dyCigar, op);
+    }
+return dyStringCannibalize(&dyCigar);
+}
+
+int bamGetTargetLength(const bam1_t *bam)
+/* Tally up the alignment's length on the reference sequence from
+ * bam's packed-int CIGAR representation.  Only ops that consume reference
+ * bases add to the total; errAborts on a CIGAR op not listed below. */
+{
+unsigned int *cigarPacked = bam1_cigar(bam);
+const bam1_core_t *core = &bam->core;
+int tLength=0;
+int i;
+for (i = 0;  i < core->n_cigar;  i++)
+    {
+    char op;
+    int n = bamUnpackCigarElement(cigarPacked[i], &op);
+    switch (op)
+	{
+	case 'M': // match or mismatch (gapless aligned block)
+	case '=': // sequence match (SAM spec)
+	case 'X': // sequence mismatch (SAM spec)
+	case 'D': // deleted from query
+	case 'N': // long deletion from query (intron as opposed to small del)
+	    tLength += n;
+	    break;
+	case 'I': // inserted in query
+	case 'S': // skipped query bases at beginning or end ("soft clipping")
+	case 'H': // skipped query bases not stored in record's query sequence ("hard clipping")
+	case 'P': // P="silent deletion from padded reference sequence" -- ignore these.
+	    break;
+	default:
+	    errAbort("bamGetTargetLength: unrecognized CIGAR op %c -- update me", op);
+	}
+    }
+return tLength;
+}
+
 #endif//def USE_BAM