src/hg/lib/bamFile.c 1.17

1.17 2009/12/10 15:02:12 angie
Got rid of bogus bamIgnoreStrand() -- make useStrand explicit. Added auto-detect of missing 'chr' in sequence names, so stripPrefix setting is no longer necessary. Better message for samopen failures.
Index: src/hg/lib/bamFile.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/lib/bamFile.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 4 -r1.16 -r1.17
--- src/hg/lib/bamFile.c	30 Nov 2009 23:46:52 -0000	1.16
+++ src/hg/lib/bamFile.c	10 Dec 2009 15:02:12 -0000	1.17
@@ -10,17 +10,8 @@
 #include "bamFile.h"
 
 static char const rcsid[] = "$Id$";
 
-static boolean ignoreStrand = FALSE;
-
-void bamIgnoreStrand()
-/* Change the behavior of this lib to disregard item strand. 
- * If called, this should be called before any other bam functions. */
-{
-ignoreStrand = TRUE;
-}
-
 char *bamFileNameFromTable(char *db, char *table, char *bamSeqName)
 /* Return file name from table.  If table has a seqName column, then grab the 
  * row associated with bamSeqName (which is not nec. in chromInfo, e.g. 
  * bam file might have '1' not 'chr1'). */
@@ -36,8 +27,18 @@
 	  table, bamSeqName);
 else
     safef(query, sizeof(query), "select fileName from %s", table);
 char *fileName = sqlQuickString(conn, query);
+if (fileName == NULL && checkSeqName)
+    {
+    if (startsWith("chr", bamSeqName))
+	safef(query, sizeof(query), "select fileName from %s where seqName = '%s'",
+	      table, bamSeqName+strlen("chr"));
+    else
+	safef(query, sizeof(query), "select fileName from %s where seqName = 'chr%s'",
+	      table, bamSeqName);
+    fileName = sqlQuickString(conn, query);
+    }
 if (fileName == NULL)
     {
     if (checkSeqName)
 	errAbort("Missing fileName for seqName '%s' in %s table", bamSeqName, table);
@@ -184,18 +185,34 @@
 
 void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData)
 /* Open the .bam file, fetch items in the seq:start-end position range,
  * and call callbackFunc on each bam item retrieved from the file plus callbackData. 
- * Note: if sequences in .bam file don't begin with "chr" but cart position does, pass in 
- * cart position + strlen("chr") to match the .bam file sequence names. */
+ * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. */
 {
 char *bamFileName = samtoolsFileName(fileOrUrl);
 samfile_t *fh = samopen(bamFileName, "rb", NULL);
 if (fh == NULL)
-    errAbort("samopen(%s, \"rb\") returned NULL", bamFileName);
+    {
+    boolean usingUrl = (strstr(fileOrUrl, "tp://") || strstr(fileOrUrl, "https://"));
+    struct dyString *urlWarning = dyStringNew(0);
+    if (usingUrl)
+	{
+	char *udcFuseRoot = cfgOption("udcFuse.mountPoint");
+	boolean usingUdc = (udcFuseRoot != NULL && startsWith(udcFuseRoot, bamFileName));
+	if (usingUdc)
+	    dyStringAppend(urlWarning, " (using udcFuse)");
+	dyStringAppend(urlWarning,
+		       ". If you are able to access the URL with your web browser, "
+		       "please try reloading this page.");
+	}
+    warn("failed to open %s%s", fileOrUrl, urlWarning->string);
+    return;
+    }
 
 int chromId, start, end;
 int ret = bam_parse_region(fh->header, position, &chromId, &start, &end);
+if (ret != 0 && startsWith("chr", position))
+    ret = bam_parse_region(fh->header, position+strlen("chr"), &chromId, &start, &end);
 if (ret != 0)
     // If the bam file does not cover the current chromosome, OK
     return;
 
@@ -215,38 +232,38 @@
 samclose(fh);
 }
 
 boolean bamIsRc(const bam1_t *bam)
-/* Return TRUE if alignment is on - strand.  If bamIgnoreStrand has been called,
- * then this always returns FALSE. */
+/* Return TRUE if alignment is on - strand. */
 {
 const bam1_core_t *core = &bam->core;
-return (core->flag & BAM_FREVERSE) && !ignoreStrand;
+return (core->flag & BAM_FREVERSE);
 }
 
-char *bamGetQuerySequence(const bam1_t *bam)
+char *bamGetQuerySequence(const bam1_t *bam, boolean useStrand)
 /* Return the nucleotide sequence encoded in bam.  The BAM format 
  * reverse-complements query sequence when the alignment is on the - strand,
- * so here we rev-comp it back to restore the original query sequence. */
+ * so if useStrand is given we rev-comp it back to restore the original query 
+ * sequence. */
 {
 const bam1_core_t *core = &bam->core;
 char *qSeq = needMem(core->l_qseq + 1);
 uint8_t *s = bam1_seq(bam);
 int i;
 for (i = 0; i < core->l_qseq; i++)
     qSeq[i] = bam_nt16_rev_table[bam1_seqi(s, i)];
-if (bamIsRc(bam))
+if (useStrand && bamIsRc(bam))
     reverseComplement(qSeq, core->l_qseq);
 return qSeq;
 }
 
-UBYTE *bamGetQueryQuals(const bam1_t *bam)
+UBYTE *bamGetQueryQuals(const bam1_t *bam, boolean useStrand)
 /* Return the base quality scores encoded in bam as an array of ubytes. */
 {
 const bam1_core_t *core = &bam->core;
 int qLen = core->l_qseq;
 UBYTE *arr = needMem(qLen);
-boolean isRc = bamIsRc(bam);
+boolean isRc = useStrand && bamIsRc(bam);
 UBYTE *qualStr = bam1_qual(bam);
 int i;
 for (i = 0;  i < qLen;  i++)
     {
@@ -382,15 +399,16 @@
     }
 return tLength;
 }
 
-struct ffAli *bamToFfAli(const bam1_t *bam, struct dnaSeq *target, int targetOffset)
+struct ffAli *bamToFfAli(const bam1_t *bam, struct dnaSeq *target, int targetOffset,
+			 boolean useStrand)
 /* Convert from bam to ffAli format.  (Adapted from psl.c's pslToFfAli.) */
 {
 struct ffAli *ffList = NULL, *ff;
 const bam1_core_t *core = &bam->core;
-boolean isRc = bamIsRc(bam);
-DNA *needle = (DNA *)bamGetQuerySequence(bam);
+boolean isRc = useStrand && bamIsRc(bam);
+DNA *needle = (DNA *)bamGetQuerySequence(bam, useStrand);
 if (isRc)
     reverseComplement(target->dna, target->size);
 DNA *haystack = target->dna;
 unsigned int *cigarPacked = bam1_cigar(bam);