e020a087201d536e5ffe573fa6fb1ad1be68127c
angie
Wed Dec 18 16:54:56 2013 -0800
In MLQ #12367, the user got a blank screen when trying to view a CTusing a Mac-mangled BAM file. Of course we need to show an error message
instead of SEGV/blank screen; but hgCustom also shouldn't have accepted
the file as a custom track. It turns out that the samtools lib function
samopen() may return a non-null file handler with a null header, when the
header is invalid. So we need to check not only for null result from
samopen, but also null header inside non-null result. Meanwhile, I noticed
that hgCustom's error reporting was actually masking more detailed error
messages from lib code, because of its use of errCatch around bamFileExists.
errCatch only lets errAbort messages propagate, not warnings; bamFileExists
has a warn() that is lost in transmission. I added bamFileAndIndexMustExist
which uses errAbort so that errCatch in callers will get the more informative
error instead of having to make up one of its own. trackHub checking code
also needed this stronger form of checking. I also moved some duplicated
(or triplicated) code in bamFile.c into functions.
refs #12367
diff --git src/inc/bamFile.h src/inc/bamFile.h
index c0392b9..94aa717 100644
--- src/inc/bamFile.h
+++ src/inc/bamFile.h
@@ -1,154 +1,158 @@
/* bamFile -- interface to binary alignment format files using Heng Li's samtools lib. */
#ifndef BAMFILE_H
#define BAMFILE_H
#include "dnaseq.h"
#include "dystring.h"
#ifdef USE_BAM
// bam.h is incomplete without _IOLIB set to 1, 2 or 3. 2 is used by Makefile.generic:
#ifndef _IOLIB
#define _IOLIB 2
#endif
#include "bam.h"
#include "sam.h"
#else // no USE_BAM
typedef struct { } bam1_t;
typedef struct { } bam_index_t;
typedef struct { } samfile_t;
typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
#define COMPILE_WITH_SAMTOOLS "%s: in order to use this functionality you must " \
"install the samtools library (http://samtools.sourceforge.net) and recompile kent/src with " \
"USE_BAM=1 in your environment " \
"(see http://genomewiki.ucsc.edu/index.php/Build_Environment_Variables)."
#endif // USE_BAM
struct bamChromInfo
{
struct bamChromInfo *next;
char *name; /* Chromosome name */
bits32 size; /* Chromosome size in bases */
};
boolean bamFileExists(char *bamFileName);
/* Return TRUE if we can successfully open the bam file and its index file. */
+void bamFileAndIndexMustExist(char *fileOrUrl);
+/* Open both a bam file and its accompanying index or errAbort; this is what it
+ * takes for diagnostic info to propagate up through errCatches in calling code. */
+
samfile_t *bamOpen(char *fileOrUrl, char **retBamFileName);
/* Return an open bam file as well as the filename of the bam. */
void bamFetchAlreadyOpen(samfile_t *samfile, bam_index_t *idx, char *bamFileName,
char *position, bam_fetch_f callbackFunc, void *callbackData);
/* With the open bam file, return items the same way with the callbacks as with bamFetch() */
/* except in this case use an already-open bam file and index (use bam_index_load and free() for */
/* the index). It seems a little strange to pass the filename in with the open bam, but */
/* it's just used to report errors. */
void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData,
samfile_t **pSamFile);
/* Open the .bam file, fetch items in the seq:start-end position range,
* and call callbackFunc on each bam item retrieved from the file plus callbackData.
* This handles BAM files with "chr"-less sequence names, e.g. from Ensembl.
* The pSamFile parameter is optional. If non-NULL it will be filled in, just for
* the benefit of the callback function, with the open samFile. */
void bamClose(samfile_t **pSamFile);
/* Close down a samefile_t */
boolean bamIsRc(const bam1_t *bam);
/* Return TRUE if alignment is on - strand. */
INLINE int bamUnpackCigarElement(unsigned int packed, char *retOp)
/* Given an unsigned int containing a number of bases and an offset into an
* array of BAM-enhanced-CIGAR ASCII characters (operations), store operation
* char into *retOp (retOp must not be NULL) and return the number of bases. */
{
#ifdef USE_BAM
// decoding lifted from samtools bam.c bam_format1_core(), long may it remain stable:
#define BAM_DOT_C_OPCODE_STRING "MIDNSHP=X"
int n = packed>>BAM_CIGAR_SHIFT;
int opcode = packed & BAM_CIGAR_MASK;
if (opcode >= strlen(BAM_DOT_C_OPCODE_STRING))
errAbort("bamUnpackCigarElement: unrecognized opcode %d. "
"(I only recognize 0..%lu [" BAM_DOT_C_OPCODE_STRING "]) "
"Perhaps samtools bam.c's bam_format1 encoding changed? If so, update me.",
opcode, (unsigned long)(strlen(BAM_DOT_C_OPCODE_STRING)-1));
*retOp = BAM_DOT_C_OPCODE_STRING[opcode];
return n;
#else // no USE_BAM
errAbort(COMPILE_WITH_SAMTOOLS, "bamUnpackCigarElement");
return 0;
#endif// USE_BAM
}
void bamGetSoftClipping(const bam1_t *bam, int *retLow, int *retHigh, int *retClippedQLen);
/* If retLow is non-NULL, set it to the number of "soft-clipped" (skipped) bases at
* the beginning of the query sequence and quality; likewise for retHigh at end.
* For convenience, retClippedQLen is the original query length minus soft clipping
* (and the length of the query sequence that will be returned). */
void bamUnpackQuerySequence(const bam1_t *bam, boolean useStrand, char *qSeq);
/* Fill in qSeq with the nucleotide sequence encoded in bam. The BAM format
* reverse-complements query sequence when the alignment is on the - strand,
* so if useStrand is given we rev-comp it back to restore the original query
* sequence. */
char *bamGetQuerySequence(const bam1_t *bam, boolean useStrand);
/* Return the nucleotide sequence encoded in bam. The BAM format
* reverse-complements query sequence when the alignment is on the - strand,
* so if useStrand is given we rev-comp it back to restore the original query
* sequence. */
UBYTE *bamGetQueryQuals(const bam1_t *bam, boolean useStrand);
/* Return the base quality scores encoded in bam as an array of ubytes. */
void bamUnpackCigar(const bam1_t *bam, struct dyString *dyCigar);
/* Unpack CIGAR string into dynamic string */
char *bamGetCigar(const bam1_t *bam);
/* Return a BAM-enhanced CIGAR string, decoded from the packed encoding in bam. */
void bamShowCigarEnglish(const bam1_t *bam);
/* Print out cigar in English e.g. "20 (mis)Match, 1 Deletion, 3 (mis)Match" */
void bamShowFlagsEnglish(const bam1_t *bam);
/* Print out flags in English, e.g. "Mate is on '-' strand; Properly paired". */
int bamGetTargetLength(const bam1_t *bam);
/* Tally up the alignment's length on the reference sequence from
* bam's packed-int CIGAR representation. */
bam1_t *bamClone(const bam1_t *bam);
/* Return a newly allocated copy of bam. */
void bamShowTags(const bam1_t *bam);
/* Print out tags in HTML: bold key, no type indicator for brevity. */
char *bamGetTagString(const bam1_t *bam, char *tag, char *buf, size_t bufSize);
/* If bam's tags include the given 2-character tag, place the value into
* buf (zero-terminated, trunc'd if nec) and return a pointer to buf,
* or NULL if tag is not present. */
void bamUnpackAux(const bam1_t *bam, struct dyString *dy);
/* Unpack the tag:type:val part of bam into dy */
struct bamChromInfo *bamChromList(samfile_t *fh);
/* Return list of chromosomes from bam header. We make no attempty to normalize chromosome names to UCSC format,
so list may contain things like "1" for "chr1", "I" for "chrI", "MT" for "chrM" etc. */
void bamChromInfoFreeList(struct bamChromInfo **pList);
/* Free a list of dynamically allocated bamChromInfo's */
void samToBed(char *samIn, char *bedOut);
/* samToBed - Convert SAM file to a pretty simple minded bed file.. */
void samToOpenBed(char *samIn, FILE *f);
/* Like samToOpenBed, but the output is the already open file f. */
#endif//ndef BAMFILE_H