0b52ad3ec2c6113bfdd692a58fee812b7dd0a7b2 kent Fri Feb 4 20:47:27 2011 -0800 Getting Table Browser to handle BAM files. This is ready for testing by folks other than me now. Should work for both built-in BAM files and BAM files from track data hubs. I've tested it more though via the hubs. Works mostly by converting bam->sam->array-of-strings and then slotting into the same code bigBed processing uses. diff --git src/hg/hgTables/bam.c src/hg/hgTables/bam.c index cbfc448..23bdb39 100644 --- src/hg/hgTables/bam.c +++ src/hg/hgTables/bam.c @@ -10,133 +10,319 @@ #include "jksql.h" #include "cheapcgi.h" #include "cart.h" #include "web.h" #include "bed.h" #include "hdb.h" #include "trackDb.h" #include "obscure.h" #include "hmmstats.h" #include "correlate.h" #include "asParse.h" #include "bbiFile.h" #include "bigBed.h" #include "hubConnect.h" #include "hgTables.h" +#include "asFilter.h" #include "bamFile.h" +#include "samAlignment.h" #if (defined USE_BAM && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_BAM && KNETFILE_HOOKS boolean isBamTable(char *table) /* Return TRUE if table corresponds to a BAM file. */ { if (isHubTrack(table)) { struct trackDb *tdb = hashFindVal(fullTrackAndSubtrackHash, table); return startsWithWord("bam", tdb->type); } else return trackIsType(database, table, curTrack, "bam", ctLookupName); } char *bamFileName(char *table, struct sqlConnection *conn) /* Return file name associated with BAM. This handles differences whether it's * a custom or built-in track. Do a freeMem on returned string when done. */ { /* Implementation is same as bigWig. */ return bigWigFileName(table, conn); } char *bamAsDef = -""; +"table samAlignment\n" +"\"The fields of a SAM short read alignment, the text version of BAM.\"\n" +" (\n" +" string qName; \"Query template name - name of a read\"\n" +" ushort flag; \"Flags. 0x10 set for reverse complement. See SAM docs for others.\"\n" +" string rName; \"Reference sequence name (often a chromosome)\"\n" +" uint pos; \"1 based position\"\n" +" ubyte mapQ; \"Mapping quality 0-255, 255 is best\"\n" +" string cigar; \"CIGAR encoded alignment string.\"\n" +" string rNext; \"Ref sequence for next (mate) read. '=' if same as rName, '*' if no mate\"\n" +" int pNext; \"Position (1-based) of next (mate) sequence. May be -1 or 0 if no mate\"\n" +" int tLen; \"Size of DNA template for mated pairs. -size for one of mate pairs\"\n" +" string seq; \"Query template sequence\"\n" +" string qual; \"ASCII of Phred-scaled base QUALity+33. Just '*' if no quality scores\"\n" +" string tagTypeVals; \"Tab-delimited list of tag:type:value optional extra fields\"\n" +" )\n"; + +struct asObject *bamAsObj() +/* Return asObject describing fields of BAM */ +{ +return asParseText(bamAsDef); +} + +struct hTableInfo *bamToHti(char *table) +/* Get standard fields of BAM into hti structure. */ +{ +struct hTableInfo *hti; +AllocVar(hti); +hti->rootName = cloneString(table); +hti->isPos= TRUE; +strcpy(hti->chromField, "rName"); +strcpy(hti->startField, "pos"); +strcpy(hti->nameField, "qName"); +hti->type = cloneString("bam"); +return hti; +} + +struct slName *bamGetFields(char *table) +/* Get fields of bam as simple name list. */ +{ +struct asObject *as = bamAsObj(); +struct slName *names = asColNames(as); +return names; +} + +struct sqlFieldType *bamListFieldsAndTypes() +/* Get fields of bigBed as list of sqlFieldType. */ +{ +struct asObject *as = bamAsObj(); +struct sqlFieldType *list = sqlFieldTypesFromAs(as); +return list; +} + +#define BAM_NUM_BUF_SIZE 256 + +void samAlignmentToRow(struct samAlignment *sam, char *numBuf, char *row[SAMALIGNMENT_NUM_COLS]) +/* Convert samAlignment data structure to an array of strings, using numBuf to store + * ascii versions of numbers temporarily */ +{ +char *numPt = numBuf; +char *numBufEnd = numBuf + BAM_NUM_BUF_SIZE; + +row[0] = sam->qName; +row[1] = numPt; numPt += sprintf(numPt, "%u", sam->flag); numPt += 1; +row[2] = sam->rName; +row[3] = numPt; numPt += sprintf(numPt, "%u", sam->pos); numPt += 1; +row[4] = numPt; numPt += sprintf(numPt, "%u", sam->mapQ); numPt += 1; +row[5] = sam->cigar; +row[6] = sam->rNext; +row[7] = numPt; numPt += sprintf(numPt, "%d", sam->pNext); numPt += 1; +row[8] = numPt; numPt += sprintf(numPt, "%d", sam->tLen); numPt += 1; +row[9] = sam->seq; +row[10] = sam->qual; +row[11] = sam->tagTypeVals; +assert(numPt < numBufEnd); +} + +void bamTabOut(char *db, char *table, struct sqlConnection *conn, char *fields, FILE *f) +/* Print out selected fields from BAM. If fields is NULL, then print out all fields. */ +{ +if (f == NULL) + f = stdout; + +/* Convert comma separated list of fields to array. */ +int fieldCount = chopByChar(fields, ',', NULL, 0); +char **fieldArray; +AllocArray(fieldArray, fieldCount); +chopByChar(fields, ',', fieldArray, fieldCount); + +/* Get list of all fields in big bed and turn it into a hash of column indexes keyed by + * column name. */ +struct hash *fieldHash = hashNew(0); +struct slName *bb, *bbList = bamGetFields(table); +int i; +for (bb = bbList, i=0; bb != NULL; bb = bb->next, ++i) + hashAddInt(fieldHash, bb->name, i); + +/* Create an array of column indexes corresponding to the selected field list. */ +int *columnArray; +AllocArray(columnArray, fieldCount); +for (i=0; icolumnList)); + } + } + +/* Loop through outputting each region */ +struct region *region, *regionList = getRegions(); +for (region = regionList; region != NULL; region = region->next) + { + struct lm *lm = lmInit(0); + struct samAlignment *sam, *samList = bamFetchSamAlignment(fileName, region->chrom, + region->start, region->end, lm); + char *row[SAMALIGNMENT_NUM_COLS]; + char numBuf[BAM_NUM_BUF_SIZE]; + for (sam = samList; sam != NULL; sam = sam->next) + { + samAlignmentToRow(sam, numBuf, row); + if (asFilterOnRow(filter, row)) + { + int i; + fprintf(f, "%s", row[columnArray[0]]); + for (i=1; ichrom, + region->start, region->end, lm); +char *row[SAMALIGNMENT_NUM_COLS]; +char numBuf[BAM_NUM_BUF_SIZE]; +for (sam = samList; sam != NULL; sam = sam->next) + { + samAlignmentToRow(sam, numBuf, row); + if (asFilterOnRow(filter, row)) + { + struct bed *bed; + lmAllocVar(bedLm, bed); + bed->chrom = lmCloneString(bedLm, sam->rName); + bed->chromStart = sam->pos - 1; + bed->chromEnd = bed->chromStart + cigarWidth(sam->cigar, strlen(sam->cigar)); + bed->name = lmCloneString(bedLm, sam->qName); + slAddHead(pBedList, bed); + } + } +lmCleanup(&lm); +} + +struct bed *bamGetFilteredBedsOnRegions(struct sqlConnection *conn, + char *db, char *table, struct region *regionList, struct lm *lm, + int *retFieldCount) +/* Get list of beds from BAM, in all regions, that pass filtering. */ +{ +/* Figure out bam file name get column info and filter. */ +char *fileName = bamFileName(table, conn); +struct asObject *as = bamAsObj(); +struct asFilter *filter = asFilterFromCart(cart, db, table, as); + +/* Get beds a region at a time. */ +struct bed *bedList = NULL; +struct region *region; +for (region = regionList; region != NULL; region = region->next) + addFilteredBedsOnRegion(fileName, region, table, filter, lm, &bedList); +slReverse(&bedList); +return bedList; +} void showSchemaBam(char *table) /* Show schema on bam. */ { -/* Get description of columns, making it up from BED records if need be. */ -struct asObject *as = bigBedAs(bbi); -if (as == NULL) - as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount)); +struct sqlConnection *conn = hAllocConn(database); +char *fileName = bamFileName(table, conn); +struct asObject *as = bamAsObj(); hPrintf("Database: %s", database); hPrintf("    Primary Table: %s
", table); -hPrintf("Big Bed File: %s", fileName); -if (bbi->version >= 2) - { - hPrintf("    Item Count: "); - printLongWithCommas(stdout, bigBedItemCount(bbi)); - } +hPrintf("BAM File: %s", fileName); hPrintf("
\n"); hPrintf("Format description: %s
", as->comment); +hPrintf("See the SAM Format Specification for more details
\n", + "http://samtools.sourceforge.net/SAM-1.3.pdf"); /* Put up table that describes fields. */ hTableStart(); hPrintf("field"); -hPrintf("example"); hPrintf("description "); puts("\n"); struct asColumn *col; int colCount = 0; -char *row[bbi->fieldCount]; -char startBuf[16], endBuf[16]; -char *dupeRest = lmCloneString(lm, ivList->rest); /* Manage rest-stomping side-effect */ -bigBedIntervalToRow(ivList, chromList->name, startBuf, endBuf, row, bbi->fieldCount); -ivList->rest = dupeRest; for (col = as->columnList; col != NULL; col = col->next) { hPrintf("%s", col->name); - hPrintf("%s", row[colCount]); hPrintf("%s", col->comment); ++colCount; } - -/* If more fields than descriptions put up minimally helpful info (at least has example). */ -for ( ; colCount < bbi->fieldCount; ++colCount) - { - hPrintf("column%d", colCount+1); - hPrintf("%s", row[colCount]); - hPrintf("n/a\n"); - } hTableEnd(); - -/* Put up another section with sample rows. */ -webNewSection("Sample Rows"); -hTableStart(); - -/* Print field names as column headers for example */ -hPrintf(""); -int colIx = 0; -for (col = as->columnList; col != NULL; col = col->next) - { - hPrintf("%s", col->name); - ++colIx; - } -for (; colIx < colCount; ++colIx) - hPrintf("column%d", colIx+1); -hPrintf("\n"); - -/* Print sample lines. */ -struct bigBedInterval *iv; -for (iv=ivList; iv != NULL; iv = iv->next) - { - bigBedIntervalToRow(iv, chromList->name, startBuf, endBuf, row, bbi->fieldCount); - hPrintf(""); - for (colIx=0; colIx\n"); - } -hTableEnd(); +/* In a perfect world would print sample rows here. Maybe later.... */ /* Clean up and go home. */ -lmCleanup(&lm); -bbiFileClose(&bbi); freeMem(fileName); hFreeConn(&conn); } + #endif /* USE_BAM */