src/hg/tcga/bamBam/bamBam.c 1.5

1.5 2009/09/26 18:29:47 jsanborn
updated
Index: src/hg/tcga/bamBam/bamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamBam/bamBam.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/tcga/bamBam/bamBam.c	25 Sep 2009 21:16:13 -0000	1.4
+++ src/hg/tcga/bamBam/bamBam.c	26 Sep 2009 18:29:47 -0000	1.5
@@ -1,108 +1,228 @@
 /* bamBam -- discovering variants from BAM file
  *   code adapted from Heng Li's Samtools library functions
  */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "hCommon.h"
 #include "linefile.h"
 #include "hash.h"
 #include "hdb.h"
 #include "options.h"
 #include "jksql.h"
 
 #define _IOLIB 2
 #include "bam.h"
 #include "sam.h"
 #include "bam_dual_pileup.h"
 
 static char const rcsid[] = "$Id";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bamBam - Perform analysis on two BAM files.\n"
   "usage:\n"
   "   bamBam [options] <left.bam> <right.bam>\n"
   "options:\n"
+  "   -minSupport=INT - Minimum Support (i.e. num reads) to call a variant. [2]\n"
+  "   -minQ=INT       - Minimum acceptable base quality. [0]\n"
+  "   -minMapQ=INT    - Minimum acceptable mapping quality. [0]\n"
   "\n"
   );
 }
 
 static struct optionSpec options[] = {
+    {"minSupport", OPTION_INT},
+    {"minQ", OPTION_INT},
+    {"minMapQ", OPTION_INT},
     {NULL, 0}
 };
 
 /* Option variables */
 char *position = NULL;
+int minSupport = 2;
+int minQ = 0;
+int minMapQ = 0;
 
 typedef struct {
-    int minQual;
-    int minMapQual;
+    int minQ;
+    int minMapQ;
+    int minSupp;
 } analysis_params;
 
-static void perform_analysis(void *data)
+static int32_t votesL[4];
+static int32_t countsL[4];
+static int32_t votesR[4];
+static int32_t countsR[4];
+
+static int nuc_index(char c)
 {
-pileup_data_t *d = (pileup_data_t*)data;
-analysis_params *ap = (analysis_params*)d->func_data;
+switch (c)
+    {
+    case 'A':
+	return 0;
+    case 'G':
+	return 1;
+    case 'C':
+	return 2;
+    case 'T':
+	return 3;
+    default:
+	return -1;
+    }
+return -1;
+}
 
-printf("%d\t%d\t%d\t%d/%d\t", d->tidL, d->posL, d->nL, ap->minQual, ap->minMapQual);
+#define NUCS "AGCT"
+
+static int consensus_snp_variant(pileup_data_t *d, analysis_params *ap, 
+				 char *callL, char *callR)
+{
+int i, index;
+for (i = 0; i < 4; i++)
+    { // sum up the qualities of each base
+    votesL[i]  = 0;
+    countsL[i] = 0;
+    votesR[i]  = 0; 
+    countsR[i] = 0;
+    }
 
-int i;
 const bam_pileup1_t *p;
 for (i = 0; i < d->nL; ++i)
     {
     p = d->puL + i;
+
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+
     int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
-    putchar(c);
+    int qual = bam1_qual(p->b)[p->qpos];    
+    if (qual < ap->minQ)
+	continue;
+    
+    index = nuc_index(c);
+    if ((index = nuc_index(c)) < 0)
+	continue;
+
+    votesL[index] += qual;
+    countsL[index]++;
     }
-putchar('\t');
-printf("%d\t", d->nR);
 
 for (i = 0; i < d->nR; ++i)
     {
     p = d->puR + i;
+
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+
     int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
-    putchar(c);
+    int qual = bam1_qual(p->b)[p->qpos];    
+    if (qual < ap->minQ)
+	continue;
+
+    index = nuc_index(c);
+    if ((index = nuc_index(c)) < 0)
+	continue;
+
+    votesR[index] += qual;
+    countsR[index]++;
     }
-putchar('\n');
+
+int iL = -1, maxL = -1, countL = -1;
+int iR = -1, maxR = -1, countR = -1;
+for (i = 0; i < 4; i++)
+    {
+    if (votesL[i] > maxL)
+	{
+	iL = i;
+	maxL = votesL[i];
+	countL = countsL[i];
+	}
+    if (votesR[i] > maxR)
+	{
+	iR = i;
+	maxR = votesR[i];
+	countR = countsR[i];
+	}
+    }
+	   
+if (iL == -1 || iR == -1 || (iL == iR))  // either no good data or matching consensus.
+    return 0;
+if (countL < ap->minSupp || countR < ap->minSupp)
+    return 0;
+
+*callL = NUCS[iL];
+*callR = NUCS[iR];
+    
+return 1;
 }
 
 
-void bamBam(char *inbamL, char *inbamR)
+static void perform_analysis(void *data)
 {
+pileup_data_t *d = (pileup_data_t*)data;
+analysis_params *ap = (analysis_params*)d->func_data;
+
+char callL, callR;
 
+if (!consensus_snp_variant(d, ap, &callL, &callR))
+    return;
+
+// remember, BAM data is starts at base zero, not base one like browser.
+printf("chr%s\t%d\t%d\t%c,%c\n", 
+       d->hL->target_name[d->tidL], d->posL, d->posL + 1, callL, callR);
+fflush(stdout);
+}
+
+void bamBam(char *inbamL, char *inbamR)
+{
 analysis_params aps;
-aps.minQual = 50;
-aps.minMapQual = 25;
+aps.minQ    = minQ;
+aps.minMapQ = minMapQ;
+aps.minSupp = minSupport;
 
 bam_bam_file(inbamL, inbamR, perform_analysis, &aps);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
+int i;
+putchar('#');
+for (i = 0; i < argc; i++)
+    printf(" %s", argv[i]);
+putchar('\n');
+
+
 optionInit(&argc, argv, options);
-if (argc != 3)
+if (argc < 3)
     usage();
 
+if (optionExists("minSupport"))
+    minSupport = optionInt("minSupport", 2);
+if (optionExists("minQ"))
+    minQ = optionInt("minQ", 0);
+if (optionExists("minMapQ"))
+    minMapQ = optionInt("minMapQ", 0);
+
 char *inbamL = argv[1];
 char *inbamR = argv[2];
 bamBam(inbamL, inbamR);
 
 return 0;
 }
 
 #else // USE_BAM not defined
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 printf("BAM not installed");
 
 return 0;
 }
 #endif