src/hg/hgTables/bam.c 0b52ad3ec2c6113bfdd692a58fee812b7dd0a7b2

0b52ad3ec2c6113bfdd692a58fee812b7dd0a7b2
kent
  Fri Feb 4 20:47:27 2011 -0800
Getting Table Browser to handle BAM files.  This is ready for testing by folks other than me now. Should work for both built-in BAM files and BAM files from track data hubs. I've tested it more though via the hubs. Works mostly by converting bam->sam->array-of-strings and then slotting into the same code bigBed processing uses.
diff --git src/hg/hgTables/bam.c src/hg/hgTables/bam.c
index cbfc448..23bdb39 100644
--- src/hg/hgTables/bam.c
+++ src/hg/hgTables/bam.c
@@ -10,133 +10,319 @@
 #include "jksql.h"
 #include "cheapcgi.h"
 #include "cart.h"
 #include "web.h"
 #include "bed.h"
 #include "hdb.h"
 #include "trackDb.h"
 #include "obscure.h"
 #include "hmmstats.h"
 #include "correlate.h"
 #include "asParse.h"
 #include "bbiFile.h"
 #include "bigBed.h"
 #include "hubConnect.h"
 #include "hgTables.h"
+#include "asFilter.h"
 #include "bamFile.h"
+#include "samAlignment.h"
 #if (defined USE_BAM && defined KNETFILE_HOOKS)
 #include "knetUdc.h"
 #include "udc.h"
 #endif//def USE_BAM && KNETFILE_HOOKS
 
 
 boolean isBamTable(char *table)
 /* Return TRUE if table corresponds to a BAM file. */
 {
 if (isHubTrack(table))
     {
     struct trackDb *tdb = hashFindVal(fullTrackAndSubtrackHash, table);
     return startsWithWord("bam", tdb->type);
     }
 else
     return trackIsType(database, table, curTrack, "bam", ctLookupName);
 }
 
 char *bamFileName(char *table, struct sqlConnection *conn)
 /* Return file name associated with BAM.  This handles differences whether it's
  * a custom or built-in track.  Do a freeMem on returned string when done. */
 {
 /* Implementation is same as bigWig. */
 return bigWigFileName(table, conn);
 }
 
 char *bamAsDef = 
-"";
+"table samAlignment\n"
+"\"The fields of a SAM short read alignment, the text version of BAM.\"\n"
+"    (\n"
+"    string qName;	\"Query template name - name of a read\"\n"
+"    ushort flag;	\"Flags.  0x10 set for reverse complement.  See SAM docs for others.\"\n"
+"    string rName;	\"Reference sequence name (often a chromosome)\"\n"
+"    uint pos;		\"1 based position\"\n"
+"    ubyte mapQ;		\"Mapping quality 0-255, 255 is best\"\n"
+"    string cigar;	\"CIGAR encoded alignment string.\"\n"
+"    string rNext;	\"Ref sequence for next (mate) read. '=' if same as rName, '*' if no mate\"\n"
+"    int pNext;		\"Position (1-based) of next (mate) sequence. May be -1 or 0 if no mate\"\n"
+"    int tLen;	        \"Size of DNA template for mated pairs.  -size for one of mate pairs\"\n"
+"    string seq;		\"Query template sequence\"\n"
+"    string qual;	\"ASCII of Phred-scaled base QUALity+33.  Just '*' if no quality scores\"\n"
+"    string tagTypeVals; \"Tab-delimited list of tag:type:value optional extra fields\"\n"
+"    )\n";
+
+struct asObject *bamAsObj()
+/* Return asObject describing fields of BAM */
+{
+return asParseText(bamAsDef);
+}
+
+struct hTableInfo *bamToHti(char *table)
+/* Get standard fields of BAM into hti structure. */
+{
+struct hTableInfo *hti;
+AllocVar(hti);
+hti->rootName = cloneString(table);
+hti->isPos= TRUE;
+strcpy(hti->chromField, "rName");
+strcpy(hti->startField, "pos");
+strcpy(hti->nameField, "qName");
+hti->type = cloneString("bam");
+return hti;
+}
+
+struct slName *bamGetFields(char *table)
+/* Get fields of bam as simple name list. */
+{
+struct asObject *as = bamAsObj();
+struct slName *names = asColNames(as);
+return names;
+}
+
+struct sqlFieldType *bamListFieldsAndTypes()
+/* Get fields of bigBed as list of sqlFieldType. */
+{
+struct asObject *as = bamAsObj();
+struct sqlFieldType *list = sqlFieldTypesFromAs(as);
+return list;
+}
+
+#define BAM_NUM_BUF_SIZE 256
+
+void samAlignmentToRow(struct samAlignment *sam, char *numBuf, char *row[SAMALIGNMENT_NUM_COLS])
+/* Convert samAlignment data structure to an array of strings, using numBuf to store
+ * ascii versions of numbers temporarily */
+{
+char *numPt = numBuf;
+char *numBufEnd = numBuf + BAM_NUM_BUF_SIZE;
+
+row[0] = sam->qName;
+row[1] = numPt; numPt += sprintf(numPt, "%u", sam->flag); numPt += 1;
+row[2] = sam->rName;
+row[3] = numPt; numPt += sprintf(numPt, "%u", sam->pos); numPt += 1;
+row[4] = numPt; numPt += sprintf(numPt, "%u", sam->mapQ); numPt += 1;
+row[5] = sam->cigar;
+row[6] = sam->rNext;
+row[7] = numPt; numPt += sprintf(numPt, "%d", sam->pNext); numPt += 1;
+row[8] = numPt; numPt += sprintf(numPt, "%d", sam->tLen); numPt += 1;
+row[9] = sam->seq;
+row[10] = sam->qual;
+row[11] = sam->tagTypeVals;
+assert(numPt < numBufEnd);
+}
+
+void bamTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f)
+/* Print out selected fields from BAM.  If fields is NULL, then print out all fields. */
+{
+if (f == NULL) 
+    f = stdout;
+
+/* Convert comma separated list of fields to array. */
+int fieldCount = chopByChar(fields, ',', NULL, 0);
+char **fieldArray;
+AllocArray(fieldArray, fieldCount);
+chopByChar(fields, ',', fieldArray, fieldCount);
+
+/* Get list of all fields in big bed and turn it into a hash of column indexes keyed by
+ * column name. */
+struct hash *fieldHash = hashNew(0);
+struct slName *bb, *bbList = bamGetFields(table);
+int i;
+for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i)
+    hashAddInt(fieldHash, bb->name, i);
+
+/* Create an array of column indexes corresponding to the selected field list. */
+int *columnArray;
+AllocArray(columnArray, fieldCount);
+for (i=0; i<fieldCount; ++i)
+    {
+    columnArray[i] = hashIntVal(fieldHash, fieldArray[i]);
+    }
+
+/* Output row of labels */
+fprintf(f, "#%s", fieldArray[0]);
+for (i=1; i<fieldCount; ++i)
+    fprintf(f, "\t%s", fieldArray[i]);
+fprintf(f, "\n");
+
+char *fileName = bamFileName(table, conn);
+struct asObject *as = bamAsObj();
+struct asFilter *filter = NULL;
+
+if (anyFilter())
+    {
+    filter = asFilterFromCart(cart, db, table, as);
+    if (filter)
+        {
+	fprintf(f, "# Filtering on %d columns\n", slCount(filter->columnList));
+	}
+    }
+
+/* Loop through outputting each region */
+struct region *region, *regionList = getRegions();
+for (region = regionList; region != NULL; region = region->next)
+    {
+    struct lm *lm = lmInit(0);
+    struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom,
+    	region->start, region->end, lm);
+    char *row[SAMALIGNMENT_NUM_COLS];
+    char numBuf[BAM_NUM_BUF_SIZE];
+    for (sam = samList; sam != NULL; sam = sam->next)
+        {
+	samAlignmentToRow(sam, numBuf, row);
+	if (asFilterOnRow(filter, row))
+	    {
+	    int i;
+	    fprintf(f, "%s", row[columnArray[0]]);
+	    for (i=1; i<fieldCount; ++i)
+		fprintf(f, "\t%s", row[columnArray[i]]);
+	    fprintf(f, "\n");
+	    }
+	}
+    lmCleanup(&lm);
+    }
+
+/* Clean up and exit. */
+hashFree(&fieldHash);
+freeMem(fieldArray);
+freeMem(columnArray);
+}
+
+int cigarWidth(char *cigar, int cigarSize)
+/* Return width of alignment as encoded in cigar format string. */
+{
+int tLength=0;
+char *s, *end = cigar + cigarSize;
+s = cigar;
+while (s < end)
+    {
+    int digCount = countLeadingDigits(s);
+    if (digCount <= 0)
+        errAbort("expecting number got %s in cigarWidth", s);
+    int n = atoi(s);
+    s += digCount;
+    char op = *s++;
+    switch (op)
+	{
+	case 'M': // match or mismatch (gapless aligned block)
+	    tLength += n;
+	    break;
+	case 'I': // inserted in query
+	    break;
+	case 'D': // deleted from query
+	case 'N': // long deletion from query (intron as opposed to small del)
+	    tLength += n;
+	    break;
+	case 'S': // skipped query bases at beginning or end ("soft clipping")
+	case 'H': // skipped query bases not stored in record's query sequence ("hard clipping")
+	case 'P': // P="silent deletion from padded reference sequence" -- ignore these.
+	    break;
+	default:
+	    errAbort("cigarWidth: unrecognized CIGAR op %c -- update me", op);
+	}
+    }
+return tLength;
+}
+
+static void addFilteredBedsOnRegion(char *fileName, struct region *region, 
+	char *table, struct asFilter *filter, struct lm *bedLm, struct bed **pBedList)
+/* Add relevant beds in reverse order to pBedList */
+{
+struct lm *lm = lmInit(0);
+struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom,
+    	region->start, region->end, lm);
+char *row[SAMALIGNMENT_NUM_COLS];
+char numBuf[BAM_NUM_BUF_SIZE];
+for (sam = samList; sam != NULL; sam = sam->next)
+    {
+    samAlignmentToRow(sam, numBuf, row);
+    if (asFilterOnRow(filter, row))
+        {
+	struct bed *bed;
+	lmAllocVar(bedLm, bed);
+	bed->chrom = lmCloneString(bedLm, sam->rName);
+	bed->chromStart = sam->pos - 1;
+	bed->chromEnd = bed->chromStart + cigarWidth(sam->cigar, strlen(sam->cigar));
+	bed->name = lmCloneString(bedLm, sam->qName);
+	slAddHead(pBedList, bed);
+	}
+    }
+lmCleanup(&lm);
+}
+
+struct bed *bamGetFilteredBedsOnRegions(struct sqlConnection *conn, 
+	char *db, char *table, struct region *regionList, struct lm *lm, 
+	int *retFieldCount)
+/* Get list of beds from BAM, in all regions, that pass filtering. */
+{
+/* Figure out bam file name get column info and filter. */
+char *fileName = bamFileName(table, conn);
+struct asObject *as = bamAsObj();
+struct asFilter *filter = asFilterFromCart(cart, db, table, as);
+
+/* Get beds a region at a time. */
+struct bed *bedList = NULL;
+struct region *region;
+for (region = regionList; region != NULL; region = region->next)
+    addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList);
+slReverse(&bedList);
+return bedList;
+}
 
 void showSchemaBam(char *table)
 /* Show schema on bam. */
 {
-/* Get description of columns, making it up from BED records if need be. */
-struct asObject *as = bigBedAs(bbi);
-if (as == NULL)
-    as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount));
+struct sqlConnection *conn = hAllocConn(database);
+char *fileName = bamFileName(table, conn);
 
+struct asObject *as = bamAsObj();
 hPrintf("<B>Database:</B> %s", database);
 hPrintf("&nbsp;&nbsp;&nbsp;&nbsp;<B>Primary Table:</B> %s<br>", table);
-hPrintf("<B>Big Bed File:</B> %s", fileName);
-if (bbi->version >= 2)
-    {
-    hPrintf("&nbsp;&nbsp;&nbsp;&nbsp;<B>Item Count:</B> ");
-    printLongWithCommas(stdout, bigBedItemCount(bbi));
-    }
+hPrintf("<B>BAM File:</B> %s", fileName);
 hPrintf("<BR>\n");
 hPrintf("<B>Format description:</B> %s<BR>", as->comment);
+hPrintf("See the <A HREF=\"%s\" target=_blank>SAM Format Specification</A> for  more details<BR>\n",
+	"http://samtools.sourceforge.net/SAM-1.3.pdf");
 
 /* Put up table that describes fields. */
 hTableStart();
 hPrintf("<TR><TH>field</TH>");
-hPrintf("<TH>example</TH>");
 hPrintf("<TH>description</TH> ");
 puts("</TR>\n");
 struct asColumn *col;
 int colCount = 0;
-char *row[bbi->fieldCount];
-char startBuf[16], endBuf[16];
-char *dupeRest = lmCloneString(lm, ivList->rest);	/* Manage rest-stomping side-effect */
-bigBedIntervalToRow(ivList, chromList->name, startBuf, endBuf, row, bbi->fieldCount);
-ivList->rest = dupeRest;
 for (col = as->columnList; col != NULL; col = col->next)
     {
     hPrintf("<TR><TD><TT>%s</TT></TD>", col->name);
-    hPrintf("<TD>%s</TD>", row[colCount]);
     hPrintf("<TD>%s</TD></TR>", col->comment);
     ++colCount;
     }
-
-/* If more fields than descriptions put up minimally helpful info (at least has example). */
-for ( ; colCount < bbi->fieldCount; ++colCount)
-    {
-    hPrintf("<TR><TD><TT>column%d</TT></TD>", colCount+1);
-    hPrintf("<TD>%s</TD>", row[colCount]);
-    hPrintf("<TD>n/a</TD></TR>\n");
-    }
 hTableEnd();
 
-
-/* Put up another section with sample rows. */
-webNewSection("Sample Rows");
-hTableStart();
-
-/* Print field names as column headers for example */
-hPrintf("<TR>");
-int colIx = 0;
-for (col = as->columnList; col != NULL; col = col->next)
-    {
-    hPrintf("<TH>%s</TH>", col->name);
-    ++colIx;
-    }
-for (; colIx < colCount; ++colIx)
-    hPrintf("<TH>column%d</TH>", colIx+1);
-hPrintf("</TR>\n");
-
-/* Print sample lines. */
-struct bigBedInterval *iv;
-for (iv=ivList; iv != NULL; iv = iv->next)
-    {
-    bigBedIntervalToRow(iv, chromList->name, startBuf, endBuf, row, bbi->fieldCount);
-    hPrintf("<TR>");
-    for (colIx=0; colIx<colCount; ++colIx)
-        {
-	writeHtmlCell(row[colIx]);
-	}
-    hPrintf("</TR>\n");
-    }
-hTableEnd();
+/* In a perfect world would print sample rows here.  Maybe later.... */
 
 /* Clean up and go home. */
-lmCleanup(&lm);
-bbiFileClose(&bbi);
 freeMem(fileName);
 hFreeConn(&conn);
 }
+
 #endif /* USE_BAM */