src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c 1.23

1.23 2009/08/18 22:31:32 baertsch
fix bug with non pass thru mode. alignments are now removed if there are multiple hits that are less than minDiff cutoff. removed commented code and tweaked logging messages.
Index: src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 4 -r1.22 -r1.23
--- src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c	15 Aug 2009 17:59:49 -0000	1.22
+++ src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c	18 Aug 2009 22:31:32 -0000	1.23
@@ -22,9 +22,8 @@
 //#include "twoBit.h"
 #include "nibTwo.h"
 #include "bed.h"
 #include "snp125.h"
-//#include "chainToAxt.h"
 #include "pipeline.h"
 #define MINDIFF 3
 #define MAXLOCI 2048
 #define NOVALUE 10000  /* loci index when there is no genome base for that mrna position */
@@ -474,9 +473,9 @@
     {
     /* retrieve snp and complement to agree with strand of mrna  */
     struct snp125 *snp = el->val;
     char *snpDna = NULL;
-    verbose(3, "%s %s:%d \n", snp->name, snp->chrom, snp->chromStart);
+    verbose(4, "%s %s:%d \n", snp->name, snp->chrom, snp->chromStart);
     snpDna = replaceChars(snp->observed, "/",".");
     if (genomeStrand != snp->strand[0])
         reverseComplement(snpDna, strlen(snp->observed));
     slAddHead(&snpList, snp);
@@ -678,9 +677,9 @@
 int *snpCount;             /* count of cases where the loci are snps*/
 int indel = 0;             /* count of gaps in mrna alignment */
 int maxScore = -100;       /* max scoring alignment for this mrna */
 int nextBestScore = -100;       /* 2nd best scoring alignment for this mrna */
-int maxCount = 0;       /* number of alignments with max score */
+int maxHits = 0;       /* number of alignments with max score */
 int maxIndex = -1;      /* index in loci list of best aligment */
 int qNameCount = 0;     /* count of filtered psls for this qName */
 char *chrom;           /* best alignment */
 int chromStart;
@@ -735,9 +734,9 @@
             }
         if (mrnaMisMatch->snpCount > 0 && mrnaMisMatch->snps[index] != NULL)
             {
             snpCount[index]++;
-            verbose(3,"found snp %s count[%d] %d %s\n", mrnaMisMatch->snps[index], index, snpCount[index], mrnaMisMatch->chroms[index]);
+            verbose(4,"found snp %s count[%d] %d %s\n", mrnaMisMatch->snps[index], index, snpCount[index], mrnaMisMatch->chroms[index]);
             }
         if (mrnaMisMatch->bases[i] == '-' && atLeastOneMatch)
             indel++;
         if (!lociDifferent || !atLeastOneMatch)
@@ -784,33 +783,31 @@
 for (l = lociList ; l != NULL; l=l->next)
     {
     int z = l->index;
     int score = goodCount[z]-missCount[z];
-    verbose(5, "score %d nextBest %d max %d\n",
-            score, nextBestScore, maxScore);
+    verbose(3, "%s score %d next %d max %d\n",
+            name, score, nextBestScore, maxScore);
     if (maxScore == score)
         {
-        maxCount ++;
-        //nextBestScore = score;
-        verbose(5,"score == maxScore %s score %d nextBestScore %d maxScore %d\n",
-                name, score, nextBestScore, maxScore);
+        maxHits ++;
+        verbose(3,"%s score %d == maxScore %d next %d \n",
+                name, score, maxScore, nextBestScore);
         }
     else 
         {
         if (score > maxScore)
             {
-            maxCount = 1;
+            maxHits = 1;
             maxIndex = z;
-            if ((score - maxScore ) > minDiff)
                 nextBestScore = maxScore;
-            verbose(5,"score > maxScore %s score %d nextBestScore %d maxScore %d\n",
-                    name, score, nextBestScore, maxScore);
+            verbose(3,"%s score %d > maxScore %d nextBestScore %d \n",
+                    name, score, maxScore, nextBestScore);
             }
         else if (score > nextBestScore && (maxScore - score) > minDiff)
             {
             nextBestScore = score;
-            verbose(5,"score > nextBestScore %s score %d nextBestScore %d maxScore %d\n",
-                    name, score, nextBestScore, maxScore);
+            verbose(3,"%s score %d > nextBestScore %d (maxScore %d -score %d) > minDiff %d\n",
+                    name, score, nextBestScore, maxScore, score, minDiff);
             }
     }
     maxScore = max(maxScore, score); 
     }
@@ -823,34 +820,34 @@
     bool posOk = getLociPosition(lociList, l->index, &chrom, &chromStart, &chromEnd, &psl);
     assert(psl != NULL);
     if (scoreFile != NULL)
         fprintf(scoreFile, "##   %s score %d %s:%d [%d] mismatch %d good %d neither %d indel %d \
- gaps %d snpCount[%d] %d total %d nextBestScore %d diff %d index %d maxCnt %d \n",
+ gaps %d snpCount[%d] %d total %d nextBestScore %d diff %d index %d maxHITS %d \n",
             name, score , l->chrom, l->chromStart, z, missCount[z], goodCount[z], neither[z], indel,
             missCount[z]+ goodCount[z]+ neither[z]+ indel, gapCount[z], z, snpCount[z],
-            nextBestScore, diff ,l->index, maxCount);
+            nextBestScore, diff ,l->index, maxHits);
     verbose(3, "%s %s:%d [%d] mismatch %d good %d neither %d indel %d \
- gaps %d snpCount[%d] %d total %d score %d nextBestScore %d diff %d index %d maxCnt %d\n",
+ gaps %d snpCount[%d] %d total %d score %d nextBestScore %d diff %d index %d maxHITS %d\n",
             name, l->chrom, l->chromStart, z, missCount[z], goodCount[z], neither[z], indel,
             missCount[z]+ goodCount[z]+ neither[z]+ indel, gapCount[z], z, snpCount[z],
-            score, nextBestScore, diff, l->index, maxCount);
-    if (posOk && diff >= minDiff && spread >= minDiff /* z >= 0 *&& maxCount == 1 && */)
+            score, nextBestScore, diff, l->index, maxHits);
+    if (posOk && diff >= minDiff && spread >= minDiff /* z >= 0 *&& maxHits == 1 && */)
         {
-        verbose(2, "%s bestHit score %d %s:%d-%d [%d] mismatch %d good %d neither %d indel %d sum %d\
- gaps %d snps %d seqCnt-loci %d maxScore %d maxCount %d 2nd best %d diff %d\n",
-                name,  score, psl->tName , chromStart, chromEnd, z, 
+        verbose(2, "%s bestHit score %d. diff %d and spread %d >= %d. %s:%d-%d [%d] mismatch %d good %d neither %d indel %d sum %d\
+ gaps %d snps %d seqCnt-loci %d maxScore %d maxHITS %d 2nd best %d diff %d\n",
+                name, score, diff , spread, minDiff , psl->tName , chromStart, chromEnd, z, 
                 missCount[z], goodCount[z], neither[z], indel,
                 missCount[z]+ goodCount[z]+ neither[z]+ indel, 
                 gapCount[z], snpCount[z],
-                seqCount - slCount(lociList), maxScore, maxCount, nextBestScore, diff);
+                seqCount - slCount(lociList), maxScore, maxHits, nextBestScore, diff);
         if (scoreFile)
             fprintf(scoreFile, "##>> %s score %d %s:%d-%d [%d] mismatch %d good %d neither %d indel %d \
- gaps %d snps %d total %d diff %d maxScore %d maxCount %d 2nd best %d diff %d\n",
+ gaps %d snps %d total %d diff %d maxScore %d maxHITS %d 2nd best %d diff %d\n",
                 name,  score, psl->tName , chromStart, chromEnd, z, 
                 missCount[z], goodCount[z], neither[z], indel,
                 missCount[z]+ goodCount[z]+ neither[z]+ indel, 
                 gapCount[z], snpCount[z],
-                seqCount - slCount(lociList), maxScore, maxCount, nextBestScore, diff);
+                seqCount - slCount(lociList), maxScore, maxHits, nextBestScore, diff);
         if (psl->strand[0] == '+' && psl->strand[1] == '-')
             pslRc(psl);
         pslTabOut(psl, outFile);
         outputCount++;
@@ -866,16 +863,13 @@
         */
         }
     else
         {
-        verbose(2, "%s NO score %d maxScore %d nextBestScore %d maxCount %d index %d pos %s:%d-%d diff %d < %d\n", 
-                name, score , maxScore, nextBestScore, maxCount, z, 
+        verbose(2, "%s NO score %d diff %d spread %d maxScore %d nextBestScore %d maxHits %d index %d posok %d pos %s:%d-%d diff %d < %d\n", 
+                name, score , diff, spread,  maxScore, nextBestScore, maxHits, z,  posOk,
                 psl->tName , psl->tStart, chromEnd, diff, minDiff);
         }
     }
-//else
-//    verbose(2, "%s noLoci bestScore %d nextBestScore %d maxCount %d index %d no loci\n", 
-//            name, maxScore, nextBestScore, maxCount, maxIndex);
 freez(&missCount);
 freez(&goodCount);
 freez(&gapCount);
 freez(&neither);
@@ -1030,14 +1024,14 @@
                 offset = (tSeq->size - offset)-1;
             snpBase = toupper(tSeq->dna[offset]);
            if (snpBase != t)
                {
-               verbose(3,"snp mismatch genome %c snpbase %c mrna %c ts %d snp %d valid %s\n",
+               verbose(4,"snp mismatch genome %c snpbase %c mrna %c ts %d snp %d valid %s\n",
                        t, snpBase, q, ts, snp->chromStart, snp->valid);
 //                        assert(snpBase == t);
                }
            else
-            verbose(3,"       [%d] snp match %s %s %s:%d %s %s offset %d gs %c ts %d genomic%c %c valid %s\n",
+            verbose(4,"       [%d] snp match %s %s %s:%d %s %s offset %d gs %c ts %d genomic%c %c valid %s\n",
                     getLoci(lociList, psl->tName, psl->tStart), snp->name, psl->qName, snp->chrom, snp->chromStart, snp->observed, 
                     snp->strand, offset, genomeStrand, ts, genomeStrand, t, snp->valid);
             }
         }
@@ -1234,13 +1228,8 @@
         {
         char t = toupper(tSeq->dna[i]);
         char q = toupper(qSeq->dna[i]);
         
-//        if (sameString(psl->qName, "AB063721"))
-//            {
-//            verbose(5,"te %d i %d\n",te,i);
-//            verbose(5,"hit\n");
-//            }
         if (t != q)
             /* add mismatch to list found between mRNA and genome */
             {
             int genomeStart = ts+i;
@@ -1375,18 +1364,18 @@
     if (misMatchList != NULL)
         {
         /* sort list by mrna position */
         slSort(&misMatchList, misMatchCmpMrnaLoc);
+        verbose(2, "sort on mrna loc compile %s mismatchList %d lociList %d of %d alist %d\n",
+                name, slCount(misMatchList), seqCount, slCount(lociList), slCount(alignList));
         for (align = alignList; align != NULL ; align = align->next)
             {
             int index = getLoci(lociList, align->psl->tName, align->psl->tStart);
             /* check for matches or indels in other loci */
             verbose(4, "CALL FILLINMATCHES() %s:%d index %d\n",
                     align->psl->tName, align->psl->tStart, index);
             fillinMatches(&misMatchList, align, lociList);
             }
-        verbose(4, "compile %s mismatchList %d lociList %d of %d alist %d\n",
-                name, slCount(misMatchList), seqCount, slCount(lociList), slCount(alignList));
         }
     /* compute mismatches and dump input alignments if nothing found */
     if (!compileOutput(name, misMatchList, seqCount, lociList) &&  passthru)
         for (align = alignList; align != NULL ; align = align->next)
@@ -1429,11 +1418,10 @@
         verbose(5, "REVERSE %s %s \n",psl->qName, psl->tName);
         }
     if (differentString(lastName, psl->qName) && subList != NULL && lastName != NULL)
 	{
-//        slReverse(&subList);
         slSort(&subList, pslCmpMatch);
-        verbose(5, "2 pslList %d subList %d last %s\n",slCount(pslList), slCount(subList), lastName);
+        verbose(2, "sort on match pslList %d subList %d last %s\n",slCount(pslList), slCount(subList), lastName);
 	doOneMrna(lastName, subList);
 	alignFreeList(&subList);
         verbose(5, "3 pslList %d\n",slCount(pslList));
 	safef(lastName, sizeof(lastName), "%s", psl->qName);
@@ -1452,9 +1440,9 @@
         verbose(5, "add %s %s\n",align->psl->qName, align->psl->tName);
         }
     }
 verbose(5,"last doOneMrna\n");
-slReverse(&subList);
+slSort(&subList, pslCmpMatch);
 doOneMrna(lastName, subList);
 alignFreeList(&subList);
 //pslFreeList(&pslList);
 verbose(1,"Wrote %d alignments out of %d \n", outputCount, aliCount);