src/hg/tcga/pBamBam/pBamBamMuts.c 1.3

1.3 2010/03/05 19:53:24 jsanborn
initial commit
Index: src/hg/tcga/pBamBam/pBamBamMuts.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamBamMuts.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/tcga/pBamBam/pBamBamMuts.c	4 Mar 2010 04:03:51 -0000	1.2
+++ src/hg/tcga/pBamBam/pBamBamMuts.c	5 Mar 2010 19:53:24 -0000	1.3
@@ -1,711 +1,701 @@
 /* bamBam -- discovering variants from BAM file
  *   code adapted from Heng Li's Samtools library functions
  */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "hCommon.h"
 #include "linefile.h"
 #include "hash.h"
 #include "hdb.h"
 #include "options.h"
 #include "jksql.h"
 
 #define _IOLIB 2
 #include "bam.h"
 #include "sam.h"
 #include "bam_helper.h"
 
 static char const rcsid[] = "$Id";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bamBam - Perform analysis on two BAM files.\n"
   "usage:\n"
   "   pBamBam [options] <left.bam> <right.bam>\n"
   "options:\n"
   "   -position=chrX:1-100 - Position to do analysis. [None]\n" 
   "   -minSupport=INT - Minimum Support (i.e. num reads) to call a variant. [2]\n"
   "   -minQ=INT       - Minimum acceptable base quality. [0]\n"
   "   -minMapQ=INT    - Minimum acceptable mapping quality. [0]\n"
   "   -avgMapQ=INT    - Minimum acceptable avg mapping quality. [0]\n"
   "   -minChiSq=INT   - Minimum acceptable chi square of nucleotide freqs. [0]\n"
   "   -maxISize=INT   - Maximum Insert Size. [10000]\n"
   "   -minOutput      - Minimal output, no read data\n"
   "\n"
   );
 }
 
 static struct optionSpec options[] = {
     {"position", OPTION_STRING},
     {"minSupport", OPTION_INT},
     {"minQ", OPTION_INT},
     {"minMapQ", OPTION_INT},
     {"avgMapQ", OPTION_INT},
     {"minChiSq", OPTION_INT},
     {"maxISize", OPTION_INT},
     {"minOutput", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 /* Option variables */
 char *position = NULL;
 int minSupport = 2;
 int minQ     = 0;
 int minMapQ  = 0;
 int avgMapQ  = 0;
 int minChiSq = 0;
 int maxISize = 10000;
 boolean minOutput = FALSE;
 
 typedef struct {
     int minQ;
     int minMapQ;
     int avgMapQ;
     int minSupp;
     int minChiSq;
     int sites;
 } analysis_params_t;
 
 static int32_t countsL1[4];
 static int32_t countsL2[4];
 static int32_t countsL3[4];
 
 static int32_t countsR1[4];
 static int32_t countsR2[4];
 static int32_t countsR3[4];
 
 static int32_t muts[4];
 
 static int32_t votesL[4];
 static int32_t countsL[4];
 static int32_t votesR[4];
 static int32_t countsR[4];
 
 static int32_t insL, delL, insR, delR;
 
 typedef struct {
     char nuc;
     int count;
     int vote;
 } variant_t;
 
 variant_t minorL, majorL;
 variant_t minorR, majorR;
 
 static int nuc_index(char c)
 {
 switch (c)
     {
     case 'A':
 	return 0;
     case 'G':
 	return 1;
     case 'C':
 	return 2;
     case 'T':
 	return 3;
     default:
 	return -1;
     }
 return -1;
 }
 
 #define NUCS "AGCT"
 
 static inline void nuc_counts(dual_pileup_t *d, analysis_params_t *ap)
 { 
 int i, index;
 for (i = 0; i < 4; i++)
     { // sum up the qualities of each base
     votesL[i]  = 0;
     countsL[i] = 0;
     votesR[i]  = 0; 
     countsR[i] = 0;
 
     countsL1[i] = 0;
     countsL2[i] = 0;
     countsL3[i] = 0;
     countsR1[i] = 0;
     countsR2[i] = 0;
     countsR3[i] = 0;
     }
 insL = 0;
 insR = 0;
 delL = 0;
 delR = 0;
 
 const bam_pileup1_t *p;
 for (i = 0; i < d->nL; ++i)
     { 
     p = d->puL + i;
 
     if (p->b->core.qual < ap->minMapQ)
 	continue;
 
     int qual = bam1_qual(p->b)[p->qpos];    
     if (qual < ap->minQ)
 	continue;
     if (!p->is_del && p->indel == 0)
 	{
 	double lseq = (double) p->b->core.l_qseq;
 	int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);    
 	if ((index = nuc_index(c)) < 0)
 	    continue;
 
 	votesL[index] += qual;
 	countsL[index]++;
 
         if (p->qpos < round(0.33 * lseq))
 	    countsL1[index]++;
 	else if (p->qpos < round(0.67 * lseq))
 	    countsL2[index]++;
 	else
 	    countsL3[index]++;
 	}
     }
 
 for (i = 0; i < d->nR; ++i)
     {
     p = d->puR + i;
 
     if (p->b->core.qual < ap->minMapQ)
 	continue;
 
     int qual = bam1_qual(p->b)[p->qpos];    
     if (qual < ap->minQ)
 	continue;
 
     if (!p->is_del && p->indel == 0)
 	{
 	double lseq = (double) p->b->core.l_qseq;
 	int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
 	if ((index = nuc_index(c)) < 0)
 	    continue;
 	votesR[index] += qual;
 	countsR[index]++;
 
 	if (p->qpos < round(0.33 * lseq))
 	    countsR1[index]++;
 	else if (p->qpos < round(0.67 * lseq))
 	    countsR2[index]++;
 	else
 	    countsR3[index]++;
 	}
     }
 
 int iL1 = -1, iL2 = -1, maxL1 = 0, maxL2 = 0; 
 int iR1 = -1, iR2 = -1, maxR1 = 0, maxR2 = 0;
 int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
 for (i = 0; i < 4; i++)
     {
     if (countsL[i] > countL1)
 	{
 	iL2     = iL1;     // set prev max to max2
 	maxL2   = maxL1;
 	countL2 = countL1;
 
 	iL1     = i;
 	maxL1   = votesL[i];
 	countL1 = countsL[i];
 	}
     else if (countsL[i] > countL2)
 	{
 	iL2     = i;
 	maxL2   = votesL[i];
 	countL2 = countsL[i];
 	}
 
     if (countsR[i] > countR1)
 	{
 	iR2     = iR1;
 	maxR2   = maxR1;
 	countR2 = countR1;
 
 	iR1     = i;
 	maxR1   = votesR[i];
 	countR1 = countsR[i];
 	}
     else if (countsR[i] > countR2)
 	{
 	iR2     = i;
 	maxR2   = votesR[i];
 	countR2 = countsR[i];
 	}
     }
 
 majorL.nuc   = 'N';
 majorL.vote  = 0;
 majorL.count = 0;
 if (iL1 != -1 && countL1 >= ap->minSupp)
     {
     majorL.nuc   = NUCS[iL1];
     majorL.vote  = maxL1;
     majorL.count = countL1;
     }
 
 minorL.nuc   = 'N';
 minorL.vote  = 0;
 minorL.count = 0;
 if (iL2 != -1 && countL2 >= ap->minSupp)
     {
     minorL.nuc   = NUCS[iL2];
     minorL.vote  = maxL2;
     minorL.count = countL2;
     }
 
 majorR.nuc   = 'N';
 majorR.vote  = 0;
 majorR.count = 0;
 if (iR1 != -1 && countR1 >= ap->minSupp)
     {
     majorR.nuc   = NUCS[iR1];
     majorR.vote  = maxR1;
     majorR.count = countR1;
     }
 
 minorR.nuc   = 'N';
 minorR.vote  = 0;
 minorR.count = 0;
 if (iR2 != -1 && countR2 >= ap->minSupp)
     {
     minorR.nuc   = NUCS[iR2];
     minorR.vote  = maxR2;
     minorR.count = countR2;
     }
 }
 
 
 double chi_square(int obs1, int obs2, double p)  // p = binomial prob of getting 1
 {
 double pseudo = 1.0;
 int count = obs1 + obs2;
 
 double exp1 = p * ((double) count) + pseudo;
 double exp2 = (1.0 - p) * ((double) count) + pseudo;
 
 // apply Yates correction... will often have small observed values.
 return pow(abs((double)obs1 - exp1) - 0.5, 2.0)/exp1 + pow(abs((double)obs2 - exp2) - 0.5, 2.0)/exp2;
 }
 
 static int sig_diff(dual_pileup_t *d, analysis_params_t *ap)
 {
 int i;
 double pseudo = 1.0;
 int totalL = 0, totalR = 0;
 
 for (i = 0; i < 4; i++)
     {
     totalL += countsL[i];
     totalR += countsR[i];
     }
 
 double obs, exp, chi_sq = 0.0;
 for (i = 0; i < 4; i++)
     {
     obs = ((double) countsL[i] * 100.0) / (double) totalL + pseudo;
     exp = ((double) countsR[i] * 100.0) / (double) totalR + pseudo;
     chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
     }
 
 return chi_sq;
 }
 
 
-int which_model(double homo, double het)
-{
-if (homo <= het)
-    return 1;
-else if (homo > het)
-    return 2;
-return 0;
-}
-
-
 static int model_select(dual_pileup_t *d, analysis_params_t *ap, 
 			char *callL1, char *callL2, char *callR1, char *callR2, 
 			float *hetL, float *hetR, int *stateL, int *stateR, int *sites)
 {
 *sites += 1;
 
 if (majorR.vote == 0 || majorL.vote == 0)
     return 0;  // most likely a bad quality region.
 
 if (sig_diff(d, ap) < ap->minChiSq)
     return 0;
 
 int i = 0;
 int rCount = 0;
 for (i = 0; i < 4; i++)
     rCount += countsR1[i] + countsR2[i] + countsR3[i];
 
 if (rCount < 10)
     return 0;
 
 int retVal1 = 0, retVal2 = 0;
 for (i = 0; i < 4; i++)
     {
     if (NUCS[i] != majorL.nuc && NUCS[i] != minorL.nuc)
 	continue;
     
     if (NUCS[i] == majorL.nuc && majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
 	{
 //	if (countsL1[i] >= 2 && countsL2[i] >= 1)
 //	    retVal1 = 1;
 //	else if (countsL1[i] > 0 && countsL2[i] > 0 && countsL3[i] > 0)
 //	    retVal1 = 1;
 
 	if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
 	    retVal1 = 1;
 	else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
 	    retVal1 = 1;
 	else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
 	    retVal1 = 1;
 
 	if (retVal1)
 	    {
 	    if (rCount >= 30 && countsR[i] <= 1) 
 		retVal1 = 1;
 	    else if (rCount < 30 && countsR[i] == 0)
 		retVal1 = 1;
 	    else
 		retVal1 = 0;
 	    }
 	}
 
 
     if (NUCS[i] == minorL.nuc && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
 	{
 //	if (countsL1[i] >= 2 && countsL2[i] >= 1)
 //	    retVal2 = 1;
 //	else if (countsL1[i] > 0 && countsL2[i] > 0 && countsL3[i] > 0)
 //	    retVal2 = 1;
 
 	if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
 	    retVal2 = 1;
 	else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
 	    retVal2 = 1;
 	else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
 	    retVal2 = 1;
 
 	if (retVal2) 
 	    {
 	    if (rCount >= 30 && countsR[i] <= 1) 
 		retVal2 = 1;
 	    else if (rCount < 30 && countsR[i] == 0)
 		retVal2 = 1;
 	    else
 		retVal2 = 0;
 	    }
 	}
     }    
 
 if (!retVal1 && !retVal2)
     return 0;
 
 
 float freqL = ((float) minorL.count) / ((float) majorL.count + (float) minorL.count);
 float freqR = ((float) minorR.count) / ((float) majorR.count + (float) minorR.count);
 
 *hetL = freqL;
 *hetR = freqR;
 
 *callL1 = majorL.nuc;
 *callL2 = minorL.nuc;
 *callR1 = majorR.nuc;
 *callR2 = minorR.nuc;
 
 if (majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
     return 1;
 if (minorL.nuc != 'N' && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
     return 1;
 
 return 0;
 }
 
 static int is_mutation(dual_pileup_t *d, analysis_params_t *ap,
 		       int *sites)
 /* Styled this new, simple caller based on Pleasance et al. Nature 2010:
  * Requires > 10X coverage in Germline.
  *
  * Substitution in Tumor must be seen:
  *   - At least once in each of three bins
  *                   OR
  *   - Twice in first bin, and once in second bin
  *
  * This attempts to avoid false-positives due to indels.
  *
  * Substitution can be seen in Germline:
  *    - 0 times if Coverage < 30
  *    - 1 time  if Coverage >= 30
  */
 {
 *sites += 1;
 
 int i = 0;
 int32_t lCount = 0, rCount = 0;
 
 for (i = 0; i < 4; i++)
     {
     muts[i] = 0;
 
     lCount += countsL1[i];
     lCount += countsL2[i];
     lCount += countsL3[i];
 
     countsL[i] = 0;
     if (countsL1[i] >= 1 && countsL2[i] >= 1 && countsL3[i] >= 1)
 	countsL[i] = countsL1[i] + countsL2[i] + countsL3[i];
     else if (countsL1[i] >= 2 && countsL2[i] >= 1)
 	countsL[i] = countsL1[i] + countsL2[i] + countsL3[i];
 
     rCount += countsR1[i];
     rCount += countsR2[i];
     rCount += countsR3[i];
 
     //countsR[i] = 0;
     //if (countsR1[i] >= 1 && countsR2[i] >= 1 && countsR3[i] >= 1)
     countsR[i] = countsR1[i] + countsR2[i] + countsR3[i];
     //else if (countsR1[i] >= 2 && countsR2[i] >= 1)
     //countsR[i] = countsR1[i] + countsR2[i] + countsR3[i];
     }
 
 if (rCount < 10)
     return 0;   // insufficient cover
 
 int retVal = 0;
 for (i = 0; i < 4; i++)
     {
     if (countsL[i] > 0)
 	{
 	if ( (rCount < 30 && countsR[i] == 0) || (rCount >= 30 && countsR[i] == 1) )
 	    {  // potential mutation
 	    muts[i] = 1;
 	    retVal = 1;
 	    }
 	}
     }
 
 return retVal;
 }
 
 
 static void print_seq(uint32_t tid, uint32_t pos, int n, 
 		      const bam_pileup1_t *pu, dual_pileup_t *d)
 {
 if (minOutput)
     return;
 
 int i, j, rb;
 rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
 for (i = 0; i < n; ++i)
     {
     const bam_pileup1_t *p = pu + i;
     if (p->b->core.qual < minMapQ)
 	continue;
 
     int qual = bam1_qual(p->b)[p->qpos];    
     if (qual < minQ)
 	continue;
 
     if (p->is_head)
 	printf("^%c", p->b->core.qual > 93 ? 126 : p->b->core.qual + 33);
     if (!p->is_del)
 	{
 	int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
 	if (c == '=' || toupper(c) == toupper(rb))
 	    c = bam1_strand(p->b) ? ',' : '.';
 	else
 	    c = bam1_strand(p->b) ? tolower(c) : toupper(c);
 	putchar(c);
 	if (p->indel > 0)
 	    {
 	    printf("+%d", p->indel);
 	    for (j = 1; j <= p->indel; ++j)
 		{
 		c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
 		putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
 		}
 	    }
 	else if (p->indel < 0)
 	    {
 	    printf("%d", p->indel);
 	    for (j = 1; j <= -p->indel; ++j)
 		{
 		c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
 		putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
 		}
 	    }
 	}
     else
 	putchar('*');
     if (p->is_tail)
 	putchar('$');
     }
 putchar('\t');
 
 // print base quality
 for (i = 0; i < n; ++i)
     {
     const bam_pileup1_t *p = pu + i;
     if (p->b->core.qual < minMapQ)
 	continue;
 
     int qual = bam1_qual(p->b)[p->qpos];    
     if (qual < minQ)
 	continue;
 
     int c = qual + 33;
     if (c > 126)
 	c = 126;
     putchar(c);
     }
 putchar('\t');
 
 // print mapping quality
 for (i = 0; i < n; ++i)
     {
     const bam_pileup1_t *p = pu + i;
     int qual = p->b->core.qual;
     if (qual < minMapQ)
 	continue;
 
     if (bam1_qual(p->b)[p->qpos] < minQ)
 	continue;
 
     int c = qual + 33;
     if (c > 126)
 	c = 126;
     putchar(c);
     }
 }
 
 static void perform_analysis(dual_pileup_t *d, void *func_data)
 {
 analysis_params_t *ap = (analysis_params_t *)func_data;
 
 int stateL, stateR;
 char callL1, callL2, callR1, callR2;
 float hetL, hetR;
 
 nuc_counts(d, ap);
 
 if (model_select(d, ap, &callL1, &callL2, &callR1, &callR2, 
 		 &hetL, &hetR, &stateL, &stateR, &ap->sites))
     {
     printf("%s\t%d\t%d\t", 
 	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
 
     if (callR2 == 'N')
 	printf("%c>", callR1);
     else
 	printf("%c/%c>", callR1, callR2);
 
     if (callL2 == 'N')
 	printf("%c\t", callL1);
     else
 	printf("%c/%c\t", callL1, callL2);
     
     double chi_sq = sig_diff(d, ap);
     printf("%f\t", chi_sq);
 
     print_seq(d->tidR, d->posR, d->nR, d->puR, d);
     putchar('\t');
     print_seq(d->tidL, d->posL, d->nL, d->puL, d);
     putchar('\n');
 
     fflush(stdout);
     }
 
 if (0 && is_mutation(d, ap, &ap->sites))
     {
     printf("%s\t%d\t%d\t",
 	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
 
     int i;
     for (i = 0; i < 4; i++)
 	{
 	if (countsR[i])
 	    putchar(NUCS[i]);
 	}
     putchar('\t');
     for (i = 0; i < 4; i++)
 	{
 	if (countsL[i])
 	    putchar(NUCS[i]);
 	}
     putchar('\t');
     for (i = 0; i < 4; i++)
 	{
 	if (muts[i])
 	    putchar(NUCS[i]);
 	}
     putchar('\t');
     print_seq(d->tidR, d->posR, d->nR, d->puR, d);
     putchar('\t');
     print_seq(d->tidL, d->posL, d->nL, d->puL, d);
     putchar('\n');
 
     fflush(stdout);
     }
 }
 
 void bamBam(char *inbamL, char *inbamR, int mask)
 {
 analysis_params_t ap;
 ap.minQ    = minQ;
 ap.minMapQ = minMapQ;
 ap.avgMapQ = avgMapQ;
 ap.minSupp = minSupport;
 ap.minChiSq = minChiSq;
 ap.sites   = 0;
 
 bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
 fprintf(stderr, "# qualified sites = %d\n", ap.sites);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 
 if (optionExists("minSupport"))
     minSupport = optionInt("minSupport", 2);
 if (optionExists("minQ"))
     minQ = optionInt("minQ", 0);
 if (optionExists("minMapQ"))
     minMapQ = optionInt("minMapQ", 0);
 if (optionExists("avgMapQ"))
     avgMapQ = optionInt("avgMapQ", 0);
 if (optionExists("minChiSq"))
     minChiSq = optionInt("minChiSq", 0);
 if (optionExists("maxISize"))
     maxISize = optionInt("maxISize", 10000);
 if (optionExists("position"))
     position = optionVal("position", NULL);
 if (optionExists("minOutput"))
     minOutput = TRUE;
 
 char *inbamL = argv[1];
 char *inbamR = argv[2];
 bamBam(inbamL, inbamR, BAM_DEF_MASK);
 
 return 0;
 }
 
 #else // USE_BAM not defined
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 printf("BAM not installed");
 
 return 0;
 }
 #endif