src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c 1.27

1.27 2009/12/01 06:46:18 baertsch
add -notAlignPenalty to reduce score for non-aligning bases. Fix strand bug with -passthru
Index: src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c,v
retrieving revision 1.26
retrieving revision 1.27
diff -b -B -U 4 -r1.26 -r1.27
--- src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c	19 Aug 2009 00:13:19 -0000	1.26
+++ src/hg/pslCDnaGenomeMatch/pslCDnaGenomeMatch.c	1 Dec 2009 06:46:18 -0000	1.27
@@ -23,19 +23,20 @@
 #include "nibTwo.h"
 #include "bed.h"
 #include "snp125.h"
 #include "pipeline.h"
-#define MINDIFF 3
+#define MINDIFF 5
 #define MAXLOCI 2048
 #define NOVALUE 10000  /* loci index when there is no genome base for that mrna position */
 #include "mrnaMisMatch.h"
-
+#define NOTALIGNEDFACTOR 8 /* unaligned bases divided by this factor reduce score */
 //static char const rcsid[] = "$Id$";
 static char na[3] = "NA";
 struct axtScoreScheme *ss = NULL; /* blastz scoring matrix */
 struct hash *snpHash = NULL, *mrnaHash = NULL, *faHash = NULL, *tHash = NULL, *species1Hash = NULL, *species2Hash = NULL;
 int maxGap = 100;
 int minDiff = MINDIFF;
+int notAlignPenalty = NOTALIGNEDFACTOR;
 int aliCount = 0; /* number of alignments read */
 int mrnaCount = 0; /* number of mrna read */
 int filterCount = 0; /* number of mrna where best hit can be determined  */
 int outputCount = 0; /* number of alignments written */
@@ -73,8 +74,9 @@
     {"bedOut", OPTION_STRING},
     {"score", OPTION_STRING},
     {"snp", OPTION_STRING},
     {"minDiff", OPTION_INT},
+    {"notAlignPenalty", OPTION_INT},
     {"passthru", OPTION_BOOLEAN},
     {"computeSS", OPTION_BOOLEAN},
     {NULL, 0}
 };
@@ -85,8 +87,9 @@
     char *chrom ;
     int chromStart;
     int chromEnd;
     int index ;
+    int score ;     /* score for the loci - count of mismatches relative to other loci*/ 
     struct psl *psl; /* alignment to mrna for this loci */
     };
 
 struct misMatch
@@ -148,28 +151,29 @@
 {
 errAbort(
     "pslCDnaGenomeMatch - check if retroGene aligns better to parent or retroGene \n"
     "usage:\n"
-    "    pslCDnaGenomeMatch input.psl chrom.sizes cdna.2bit nibDir output.psl\n\n"
-    "where \n"
+    "    pslCDnaGenomeMatch input.psl chrom.sizes cdna.2bit nibDir output.psl\n"
     "input.psl contains input mRNA/EST alignment psl file SORTED by qName\n"
     "chrom.sizes is a list of chromosome followed by size\n"
     "cdna.2bit contains fasta sequences of all mrnas/EST \n"
     "directory containing nibs of genome\n"
-    "output.psl contains filtered alignments for best matches and cases where no filtering occurred.\n"
+    "output.psl contains filtered alignments for best matches and cases where no filtering occurred.\n\n"
+    "Options: \n"
     "    -score=output.tab  is output containing mismatch info\n" 
     "    -computeSS compute sufficient statistics instead of filtering\n"
     "    -passthru  if best hit cannot be decided, then pass through all alignments\n"
-    "    -minDiff=N minimum difference in score to filter out 2nd best hit (default 5)\n"
+    "    -minDiff=N minimum difference in score to filter out 2nd best hit (default %d)\n"
+    "    -notAlignPenalty=N score non-aligning bases with 1/N (default %d)\n"
     "    -bedOut=bed output file of mismatches.\n"
     "    -species1=psl file with alignment of mrna/EST to other species.\n"
     "    -species2=psl file with alignment of mrna/EST to other species.\n"
     "    -snp=snp.tab.gz contains snps with or without bin field in snp125 format\n"
     "    -nibdir1=sequence of species 1 \n"
     "    -nibdir2=sequence of species 2 \n"
     "    -mrna1=fasta file with sequence of mrna/EST used in alignment 1\n"
-    "    -mrna2=fasta file with sequence of mrna/EST used in alignment 2\n"
-    );
+    "    -mrna2=fasta file with sequence of mrna/EST used in alignment 2\n",
+    minDiff, notAlignPenalty);
 }
 
 void alignFree(struct alignment  **pEl)
 /* Free a single dynamically allocated alignment */
@@ -662,9 +666,19 @@
 const struct misMatch *b = *((struct misMatch **)vb);
 return a->mrnaLoc - b->mrnaLoc;
 }
 
+int lociCmpSort(const void *va, const void *vb)
+/* Compare to sort based on score desc. */
+{
+const struct loci *a = *((struct loci **)va);
+const struct loci *b = *((struct loci **)vb);
+return b->score - a->score;
+}
 bool compileOutput(char *name, struct misMatch *misMatchList, int seqCount, struct loci *lociList)
+/* score predigested alignments  in misMatchList/lociList and output alignments to outFile 
+   return false if list is empty , else true
+   */
 {
 struct mrnaMisMatch *mrnaMm = NULL;
 struct mrnaMisMatch *mrnaMisMatch = NULL;
 struct bed *bed = NULL;
@@ -675,20 +689,18 @@
 int *neither;             /* count of cases where the loci all have the same base */
 int *gapCount;             /* count of cases where the loci are gaps*/
 int *snpCount;             /* count of cases where the loci are snps*/
 int indel = 0;             /* count of gaps in mrna alignment */
-int maxScore = -100;       /* max scoring alignment for this mrna */
-int nextBestScore = -100;       /* 2nd best scoring alignment for this mrna */
+int maxScore = -99999;       /* max scoring alignment for this mrna */
+int nextBestScore = -99999;       /* 2nd best scoring alignment for this mrna */
 int maxHits = 0;       /* number of alignments with max score */
-int maxIndex = -1;      /* index in loci list of best aligment */
+//int maxIndex = -1;      /* index in loci list of best aligment */
 int qNameCount = 0;     /* count of filtered psls for this qName */
-char *chrom;           /* best alignment */
-int chromStart;
-int chromEnd;
 //int i = 0;
 //int prevLoc = misMatchList->mrnaLoc;
 if (misMatchList == NULL) 
     return FALSE;
+//AllocArray(matchCount,seqCount);
 AllocArray(missCount,seqCount);
 AllocArray(goodCount,seqCount);
 AllocArray(gapCount,seqCount);
 AllocArray(neither,seqCount);
@@ -747,12 +759,22 @@
         if (atLeastOneMatch && lociDifferent && 
                 mrnaMisMatch->bases[index] != mrnaMisMatch->mrnaBase && mrnaMisMatch->bases[i] != '-')
             {
             if (toupper(mrnaMisMatch->bases[index]) != 'N')
+                {
                 missCount[index]++;
+                verbose(5,"missCount[%d]++ %d %c/%c\n",index, 
+                        missCount[index], mrnaMisMatch->bases[index] ,mrnaMisMatch->mrnaBase );
+                }
             else
+                {
                 gapCount[index]++;
+                verbose(5,"gapCount[%d]++ %d\n",index, gapCount[index]);
             }
+            }
+        else
+            verbose(9,"match not counted [%d] i=%d %c/%c/%c\n",
+                    index, i, mrnaMisMatch->bases[index], mrnaMisMatch->mrnaBase, mrnaMisMatch->bases[i]);
         if (mrnaMisMatch->chroms[i] != NULL && differentString(mrnaMisMatch->chroms[i], "NA") &&
             (lociDifferent && mrnaMisMatch->bases[index] != mrnaMisMatch->mrnaBase))
             {
             safef(chrom, sizeof(chrom), "%s", mrnaMisMatch->chroms[i]);
@@ -778,29 +800,45 @@
                 verbose(6,"snp index %d loc %d \n", index, mrnaMisMatch->mrnaLoc);
         }
     }
 mrnaMisMatchFreeList(&mrnaMm);
-/* now grap the highest scoring alignment and output it if there are no others that score within minDiff points */
+
+/* calculate scores for each loci*/
 for (l = lociList ; l != NULL; l=l->next)
     {
     int z = l->index;
-    int score = goodCount[z]-missCount[z];
-    verbose(3, "%s score %d next %d max %d\n",
-            name, score, nextBestScore, maxScore);
+    //bool posOk = getLociPosition(lociList, l->index, &chrom, &chromStart, &chromEnd, &psl);
+    psl = l->psl;
+    int notAligned = psl->qSize-goodCount[z]-missCount[z]-indel-(psl->match);
+    if (notAligned < 0) notAligned = 0;
+    int score = goodCount[z]-missCount[z] - notAligned/notAlignPenalty;
+    l->score = score;
+    verbose(3, "%s %s:%d [%d] miss %d good %d neither %d indel %d total %d\
+ gaps %d snp %d score %d unalg %d (-%d) match %d\n",
+            name, l->chrom, l->chromStart, z, missCount[z], goodCount[z], neither[z], indel,
+            missCount[z]+ goodCount[z]+ neither[z]+ indel, gapCount[z], snpCount[z],
+            score, notAligned, notAligned/notAlignPenalty, psl->match);
+    }
+
+/* sort by score */
+slSort(&lociList, lociCmpSort);
+
+/* now grap the highest scoring alignment and output it if there are no others that score within minDiff points */
+for (l = lociList ; l != NULL; l=l->next)
+    {
+    int score = l->score;
     if (maxScore == score)
         {
         maxHits ++;
-        nextBestScore = score;
-        verbose(3,"%s score %d == maxScore %d next %d \n",
-                name, score, maxScore, nextBestScore);
+        //        nextBestScore = score;
+        verbose(3,"%s score %d == maxScore %d next %d maxHits %d\n",
+                name, score, maxScore, nextBestScore, maxHits);
         }
     else 
         {
         if (score > maxScore)
             {
             maxHits = 1;
-            maxIndex = z;
-            nextBestScore = maxScore;
             verbose(3,"%s score %d > maxScore %d nextBestScore %d \n",
                     name, score, maxScore, nextBestScore);
             }
         else if (score > nextBestScore ) 
@@ -811,28 +849,30 @@
             }
     }
     maxScore = max(maxScore, score); 
     }
+
+/* output results */
 for (l = lociList ; l != NULL; l=l->next)
     {
+    char *chrom;           /* best alignment */
+    int chromStart;
+    int chromEnd;
     int z = l->index;
-    int score = goodCount[z]-missCount[z];
-    int diff =  score - nextBestScore;
     int spread = maxScore - nextBestScore;
     bool posOk = getLociPosition(lociList, l->index, &chrom, &chromStart, &chromEnd, &psl);
+    int score = l->score;
+    int diff =  score - nextBestScore;
+    psl = l->psl;
     assert(psl != NULL);
+    assert(l->index >= 0);
     if (scoreFile != NULL)
         fprintf(scoreFile, "##   %s score %d %s:%d [%d] mismatch %d good %d neither %d indel %d \
- gaps %d snpCount[%d] %d total %d nextBestScore %d diff %d index %d maxHITS %d \n",
+ gaps %d snpCount[%d] %d total %d nextBestScore %d diff %d index %d maxHITS %d posOk %d\n",
             name, score , l->chrom, l->chromStart, z, missCount[z], goodCount[z], neither[z], indel,
             missCount[z]+ goodCount[z]+ neither[z]+ indel, gapCount[z], z, snpCount[z],
-            nextBestScore, diff ,l->index, maxHits);
-    verbose(3, "%s %s:%d [%d] mismatch %d good %d neither %d indel %d \
- gaps %d snpCount[%d] %d total %d score %d nextBestScore %d diff %d index %d maxHITS %d\n",
-            name, l->chrom, l->chromStart, z, missCount[z], goodCount[z], neither[z], indel,
-            missCount[z]+ goodCount[z]+ neither[z]+ indel, gapCount[z], z, snpCount[z],
-            score, nextBestScore, diff, l->index, maxHits);
-    if (posOk && diff >= minDiff && spread >= minDiff /* z >= 0 *&& maxHits == 1 && */)
+            nextBestScore, diff ,l->index, maxHits, posOk);
+    if (diff >= minDiff && spread >= minDiff /* z >= 0 *&& maxHits == 1 && */)
         {
         verbose(2, "%s bestHit score %d. diff %d and spread %d >= %d. %s:%d-%d [%d] mismatch %d good %d neither %d indel %d sum %d\
  gaps %d snps %d seqCnt-loci %d maxScore %d maxHITS %d 2nd best %d diff %d\n",
                 name, score, diff , spread, minDiff , psl->tName , chromStart, chromEnd, z, 
@@ -849,8 +889,9 @@
                 gapCount[z], snpCount[z],
                 seqCount - slCount(lociList), maxScore, maxHits, nextBestScore, diff);
         if (psl->strand[0] == '+' && psl->strand[1] == '-')
             pslRc(psl);
+        if (psl->strand[1] == '+' || psl->strand[1] == '-') psl->strand[1] = '\0';
         pslTabOut(psl, outFile);
         outputCount++;
         filterCount++;
         qNameCount++;
@@ -864,11 +905,11 @@
         */
         }
     else
         {
-        verbose(2, "%s NO score %d diff %d spread %d maxScore %d nextBestScore %d maxHits %d index %d posok %d pos %s:%d-%d diff %d < %d\n", 
-                name, score , diff, spread,  maxScore, nextBestScore, maxHits, z,  posOk,
-                psl->tName , psl->tStart, chromEnd, diff, minDiff);
+        verbose(2, "%s NO score %d diff %d spread %d max %d next %d maxHits %d index %d pos %s:%d diff %d < %d\n", 
+                name, score , diff, spread,  maxScore, nextBestScore, maxHits, z, 
+                psl->tName , psl->tStart, diff, minDiff);
         }
     }
 freez(&missCount);
 freez(&goodCount);
@@ -1123,9 +1164,12 @@
             int i = mm->mrnaLoc - qs;
             int genomeStart = ts+i;
             if (genomeStrand == '-')
                 genomeStart = te-i-1;
-            if (sameString(mm->chrom , na) && index == mm->loci &&
+            /* if genomic chr not filled in yet (only mismatches are filled in ), and we are within the block, lookup genomic loc and  base*/
+            if (index == mm->loci)
+                {
+                if (sameString(mm->chrom , na) && 
                     mm->mrnaLoc >= qs && mm->mrnaLoc < qe && i >= 0)
                 {
                 char t = toupper(tSeq->dna[i]);
                 char q = toupper(qSeq->dna[i]);
@@ -1133,8 +1177,9 @@
                 if (t=='\0') t='-';
                 if (i > 0 || i <= qe-qs) 
                     verbose(6," i %d qs %d qe %d\n",i,qs,qe);
                 assert (i > 0 || i <= qe-qs) ;
+//                    verbose(6, "WAS %s not %s\n",mm->chrom, psl->tName);
                 mm->chrom = cloneString(psl->tName);
                 mm->chromStart = genomeStart;
                 mm->genomeBase = t;
                 mm->strand = genomeStrand;
@@ -1142,11 +1187,15 @@
                     {
                     verbose(2, "mismatch %s %s q %c != mmBase %c offset %d\n",psl->qName, psl->tName,q,(mm->mrnaBase), i );
                     assert(q==(mm->mrnaBase));
                     }
-                verbose(5,"   fillinMatches() %s %c/%c t %s:%d q %d i %d qs %d qe %d %s %c loci %d block %d\n",
+                    verbose(5,"   fM() FILL %s %c/%c t %s:%d q %d i %d qs %d qe %d %s %c loci %d BLK %d ",
                         psl->qName, t, q, psl->tName, 
                         genomeStart, mm->mrnaLoc, i, qs, qe ,psl->strand, genomeStrand, mm->loci, blockIx);
+                    if (t==q)
+                        verbose(5,"GOOD\n");
+                    else
+                        verbose(5,"MISMATCH\n");
                 }
     /*        else if (i < 0)
                 {
                 mm->chrom = cloneString(psl->tName);
@@ -1154,20 +1203,22 @@
                 mm->genomeBase = '-';
                 mm->strand = genomeStrand;
                 verbose(2, "negative index %s %s mrnaLoc %d qs %d \n",psl->qName, psl->tName, mm->mrnaLoc , qs);
                 }*/
-            else if (sameString(mm->chrom , na) && index == mm->loci )
+                /* if we are outside the block, assume an indel */
+                else if (mm->mrnaLoc >= qs && mm->mrnaLoc < qe)
                 {
                 mm->genomeBase = '-';
-                verbose(5,"   fillinMatches() skipped mismatch %s t %s:%d mrnaLoc %d qs %d %s loci %d i=mrnLoc-qs %d blk %d\n",
+                    verbose(5,"   fM() INDEL %s t %s:%d mrnaLoc %d i %d not in %d-%d %s loci %d BLK %d\n",
                         psl->qName, psl->tName, 
-                        psl->tStart, mm->mrnaLoc, qs , psl->strand, mm->loci, i, blockIx);
+                            psl->tStart, mm->mrnaLoc, i, qs, qe , psl->strand, mm->loci, blockIx);
                 }
-            else 
-                {
-                verbose(5,"   fillinMatches() fallthru  mismatch %s mmchrom %s t %s:%d mrnaLoc %d qs %d %s loci %d <> index %d i=mrnLoc-qs %d\n",
-                        psl->qName, mm->chrom, psl->tName, 
-                        psl->tStart, mm->mrnaLoc, qs , psl->strand, mm->loci, index, i);
+//                else 
+//                    {
+//                    verbose(5,"   fM() nop %s chr %s t %s:%d mrnaLoc %d i %d not in %d-%d %s loci %d <> %d BLK %d\n",
+//                            psl->qName, mm->chrom, psl->tName, 
+//                            psl->tStart, mm->mrnaLoc, i, qs, qe, psl->strand, mm->loci, index, blockIx);
+//                    }
                 }
             }
         /*
         snpList = getSnpList(psl->tName, ts, te, genomeStrand) ;
@@ -1198,8 +1249,9 @@
 int misMatchCount = 0;
 int transitionCount = 0;
 int transversionCount = 0;
 char genomeStrand = psl->strand[1] == '-' ? '-' : '+';
+int matchCount = 0;
 //if (genomeStrand == '-')
 //    reverseIntRange(&tStart, &tEnd, psl->tSize);
 for (blockIx=0; blockIx < psl->blockCount; ++blockIx)
     /* for each alignment block get sequence for both strands */
@@ -1270,10 +1322,14 @@
                        t, q, ts, genomeStart, i, mrnaLoc);
             newMisMatches(misMatchList, psl->qName, i ,t, q, psl->tName, 
                     genomeStart, mrnaLoc, genomeStrand, lociList, snpList);
             }
+        else
+            {
+            matchCount++;
+            }
         }
-    /* add indel for portions of mrna that do not align at all */
+//    /* add indel for portions of mrna that do not align at all */
 //    for (i = 0 ; i < psl->qStarts[0] ; i++)
 //        {
 //        char q = toupper(qSeq->dna[i]);
 //
@@ -1302,11 +1358,11 @@
         */
     freeDnaSeq(&tSeq);
     freeDnaSeq(&qSeq);
     }
-    verbose(4,"final score %s:%d-%d %s %s misMatch %d \n",
+    verbose(4,"buildMismatches final score %s:%d-%d %s %s #match %d misMatch %d \n",
             psl->tName, tStart, tEnd, psl->strand,
-            psl->qName, misMatchCount);
+            psl->qName, matchCount, misMatchCount);
 //    fprintf(scoreFile,"%s\t%d\t%d\t%s\t%d\n",
 //            psl->tName, psl->tStart, psl->tEnd, psl->qName, 
 //            misMatchCount);
 }
@@ -1328,8 +1384,9 @@
     if (psl == NULL)
         return;
     if (psl->strand[0] == '+' && psl->strand[1] == '-')
         pslRc(psl);
+    if (psl->strand[1] == '+' || psl->strand[1] == '-') psl->strand[1] = '\0';
     pslTabOut(psl, outFile);
     outputCount++;
     filterCount++;
     return;
@@ -1348,13 +1405,13 @@
 
 //addOtherAlignments(&alignList, species1Hash, name, nibDir1, mrna1);
 //addOtherAlignments(&alignList, species2Hash, name, nibDir2, mrna2);
 if (alignList != NULL) 
-    verbose(3,"buildMisMatch %s aList count %d \n",name, slCount(alignList));
+    verbose(5,"buildMisMatch %s aList count %d \n",name, slCount(alignList));
 for (align = alignList; align != NULL ; align = align->next)
     {
     /* for each alignment, build mismatch list */
-    verbose(3,"buildMisMatch %s tName %s:%d-%d\n",
+    verbose(5,"buildMisMatch %s tName %s:%d-%d\n",
             name, align->psl->tName, align->psl->tStart, align->psl->tEnd);
     if (computeSS)
         computeSuffStats(&misMatchList, align, lociList);
     else
@@ -1372,21 +1429,27 @@
         for (align = alignList; align != NULL ; align = align->next)
             {
             int index = getLoci(lociList, align->psl->tName, align->psl->tStart);
             /* check for matches or indels in other loci */
-            verbose(4, "CALL FILLINMATCHES() %s:%d index %d\n",
+            verbose(6, "CALL FILLINMATCHES() %s:%d index %d\n",
                     align->psl->tName, align->psl->tStart, index);
             fillinMatches(&misMatchList, align, lociList);
             }
         }
-    /* compute mismatches and dump input alignments if nothing found */
+    /* compute mismatches and output alignments,
+       if best hit not found and passthru is true,  copy input to output */
     if (!compileOutput(name, misMatchList, seqCount, lociList) &&  passthru)
+        {
         for (align = alignList; align != NULL ; align = align->next)
             {
+            if (align->psl->strand[0] == '+' && align->psl->strand[1] == '-')
+                pslRc(align->psl);
+            if (align->psl->strand[1] == '+' || align->psl->strand[1] == '-') align->psl->strand[1] = '\0';
             pslTabOut(align->psl, outFile);
             outputCount++;
             }
-        ;
+        filterCount++;;
+        }
     misMatchFreeList(&misMatchList);
     }
 else if (computeSS)
     {
@@ -1447,9 +1510,9 @@
 doOneMrna(lastName, subList);
 alignFreeList(&subList);
 //pslFreeList(&pslList);
 verbose(1,"Wrote %d alignments out of %d \n", outputCount, aliCount);
-verbose(1,"Kept %d out of %d mRNAs \n", filterCount, mrnaCount);
+verbose(1,"%d out of %d mRNAs passed strict criteria (pass thru not counted)\n", filterCount, mrnaCount);
 if (computeSS)
     {
     fprintf(outFile, "%8d %8d %8d %8d  %8d %8d %8d %8d  %8d %8d %8d %8d  %8d %8d %8d %8d \n",
             getHistogram('a','a'), getHistogram('a','c'), getHistogram('a','g'), getHistogram('a','t'), 
@@ -1511,8 +1574,9 @@
 outFile = fopen(argv[5],"w");
 
 initSS();
 minDiff = optionInt("minDiff", MINDIFF);
+notAlignPenalty = optionInt("notAlignPenalty", NOTALIGNEDFACTOR);
 snpFile = optionVal("snp", NULL);
 if (snpFile != NULL)
     {
     verbose(1,"Reading snps from %s\n",snpFile);