6e5ee11ca95cd971984038cf65bae00d9c898707
galt
  Wed Jun 4 15:40:02 2014 -0700
Since we have git, it is easy to rename errabort.c to errAbort.c without losing any history.
diff --git src/cdnaAli/refineAli/refineAli.c src/cdnaAli/refineAli/refineAli.c
index 3f55938..d696e74 100644
--- src/cdnaAli/refineAli/refineAli.c
+++ src/cdnaAli/refineAli/refineAli.c
@@ -1,1567 +1,1567 @@
 /* RefineAli.c - Stuff to check and refine alignments.
  *
  * This program checks alignments and refines alignments.
  *      
  * The alignments are checked at the edges, and if at least decent in
  * the middle, but not as good on the edges, the alignment is done
  * again against a larger chunk of genomic DNA on the side(s) that don't
  * match up.
  *
  * After any reallignment, full data about decent or better alignments 
  * is written to a file.  This info is:
  *      cdnaName
  *          score, targetChromosome, targetStrand
  *                length, cdnaOffset, targetOffset
  *                length, cdnaOffset, targetOffset
  *          score, targetChromosome, targetStrand
  *                length, cdnaOffset, targetOffset
  *                length, cdnaOffset, targetOffset
  * Poor and none alignments are written to another file as simply
  *      cdnaName
  * with the intention of eventually blasting these, or running them
  * with an exonAli with smaller tile size or something. */   
                 
 #include "common.h"
 #include "portable.h"
 #include "hash.h"
 #include "obscure.h"
-#include "errabort.h"
+#include "errAbort.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "snof.h"
 #include "fa.h"
 #include "nt4.h"
 #include "wormdna.h"
 #include "fuzzyFind.h"
 #include "htmshell.h"
 #include "cheapcgi.h"
 #include "cdnaAli.h"
 
 
 void loadGenome(char *nt4Dir, struct nt4Seq ***retNt4Seq, char ***retChromNames, int *retNt4Count)
 /* Load up entire packed  genome into memory. */
 {
 struct slName *nameList, *name;
 int count;
 int i;
 char **chromNames;
 struct nt4Seq **chroms;
 char fileName[512];
 char chromName[256];
 
 nameList = listDir(nt4Dir, "*.nt4");
 count = slCount(nameList);
 chroms = needMem(count*sizeof(chroms[0]));
 chromNames = needMem(count*sizeof(chromNames[0]));
 
 for (name = nameList, i=0; name != NULL; name = name->next, i+=1)
     {
     char *end;
 
     sprintf(fileName, "%s/%s", nt4Dir, name->name);
     
     /* Cut off .nt4 suffix. */
     strcpy(chromName, name->name);
     end = strrchr(chromName, '.');
     assert(end != NULL);
     *end = 0;
     
     chroms[i] = loadNt4(fileName, chromName);
     chromNames[i] = chroms[i]->name;
     }
 slFreeList(&nameList);
 *retNt4Seq = chroms;
 *retChromNames = chromNames;
 *retNt4Count = count;
 }
 
 void freeGenome(struct nt4Seq ***pNt4Seq, char ***pChromNames, int chromCount)
 /* Free up genome. */
 {
 int i;
 struct nt4Seq **nt4s = *pNt4Seq;
 for (i=0; i<chromCount; ++i)
     {
     freeNt4(&nt4s[i]);
     }
 freez(pNt4Seq);
 freez(pChromNames);
 }
 
 
 
 struct fineAli
     /* A (hopefully) refined alignment. */
     {
     struct fineAli *next;
     boolean isDupe;
     boolean isGood;
     int chromIx;
     boolean isRc, isBackwards;
     int score;
     struct ffAli *blocks;
     DNA *virtNeedle;    /* Not referenced, just used for arithmetic. */
     DNA *virtHaystack;  /* Also not referenced. */
     long nStart, nEnd, hStart, hEnd;
     char *geneName;
     long geneStart, geneEnd;
     };
 
 struct roughAli
     /* A rough alignment .*/
     {
     struct roughAli *next;
     boolean isDupe;
     int chromIx;
     int score;
     int cStart, cEnd;   /* Boundaries on cDNA. */
     int gStart, gEnd;   /* Boundaries on genome. */
     };
 
 struct cdnaInfo
     /* Accumulated info about one cDNA. */
     {
     struct cdnaInfo *next;
     struct cdnaInfo *crcHashNext;
     int ix;
     char *name;
     int baseCount;
     bits32 baseCrc;
     boolean finished;
     boolean isDupe;
     boolean isBackwards;
     int fineScore;
     int roughScore;
     struct roughAli *roughAli;
     struct fineAli *fineAli;
     boolean isEmbryonic;
     };
 
 
 char *inName, *goodLogName, *badLogName, *unusualName, *errName;
 FILE *inFile, *goodLogFile, *badLogFile, *unusualFile, *errFile;
 char *cdnaName, *chromDir, *c2gName = NULL;
 struct nt4Seq **chroms;
 int chromCount;
 
 struct hash *redoHash;
 
 void startRedoHash()
 {
 redoHash = newHash(12);
 }
 
 void addRedoHash(struct cdnaInfo *ci, char *reason)
 {
 struct hashEl *hel;
 if ((hel = hashLookup(redoHash, ci->name)) == NULL)
     hashAdd(redoHash, ci->name, reason);
 }
 
 #ifdef OLD
 void endRedoHash()
 {
 char *redoName = "redo.txt";
 FILE *redoFile = mustOpen(redoName, "w");
 struct hashEl *hel;
 int i;
 
 for (i=0; i<redoHash->size; ++i)
     {
     for (hel = redoHash->table[i]; hel != NULL; hel = hel->next)
         {
         fprintf(redoFile, "%s %s\n", hel->name, hel->val);
         }
     }
 freeHash(&redoHash);
 }
 #endif
 
 boolean anyCdnaSeq(char *name, struct dnaSeq **retDna, struct wormCdnaInfo *retInfo)
 /* Get a single  cDNA sequence. Optionally (if retInfo is non-null) get additional
  * info about the sequence. */
 {
 static FILE *cdnaFa;
 static struct snof *cdnaSnof = NULL;
 long offset;
 char *faComment;
 char **pFaComment = (retInfo == NULL ? NULL : &faComment);
 
 if (cdnaSnof == NULL)
     {
     char buf[512];
 
     cdnaSnof = snofMustOpen(cdnaName);
     sprintf(buf, "%s%s", cdnaName, ".fa");
     cdnaFa = mustOpen(buf, "rb");
     }
 if (!snofFindOffset(cdnaSnof, name, &offset))
     return FALSE;
 fseek(cdnaFa, offset, SEEK_SET);
 if (!faReadNext(cdnaFa, name, TRUE, pFaComment, retDna))
     return FALSE;
 if (retInfo != NULL)
     {
     /* Kludge - only look up info if format is more or less right. */
     int fieldCount = countChars(faComment, '|');
     if (fieldCount >= 8)
         wormFaCommentIntoInfo(faComment, retInfo);
     else
         zeroBytes(retInfo, sizeof(*retInfo));
     }
 return TRUE;
 }
 
 boolean weAreWeb()
 /* Return true if we're executing as a CGI app from a browser. */
 {
 static boolean checked = FALSE;
 static boolean result = FALSE;
 
 if (!checked)
     {
     checked = TRUE;
     if (getenv("QUERY_STRING") != NULL)
         result = TRUE;
     }
 return result;
 }
 
 char *lineBreak()
 /* Return line break string. */
 {
 if (weAreWeb())
     return "<BR>\n";
 else    
     return "\n";
 }
 
 void vaReportFile(char *format, va_list args, FILE *file, char *pre, char *post)
 /* Write printf formatted report to file, prepending extra if non-NULL */
 {
 if (pre != NULL)
     fprintf(file, "%s", pre);
 vfprintf(file, format, args);
 if (post != NULL)
     fprintf(file, "%s", post);
 }
 
 void reportGood(char *format, ...)
 /* Printf to good log file and console. */
 {
 va_list args;
 va_start(args, format);
 //vaReportFile(format, args, stdout, NULL, NULL);
 vaReportFile(format, args, goodLogFile, NULL, NULL);
 va_end(args);
 }
 
 void reportBad(char *format, ...)
 /* Printf to bad log file and console. */
 {
 va_list args;
 va_start(args, format);
 //vaReportFile(format, args, stdout, NULL, NULL);
 vaReportFile(format, args, badLogFile, NULL, NULL);
 va_end(args);
 }
 
 void reportUnusual(char *format, ...)
 /* Printf to unusual log file. */
 {
 va_list args;
 va_start(args, format);
 vaReportFile(format, args, stdout, NULL, NULL);
 vaReportFile(format, args, unusualFile, NULL, NULL);
 va_end(args);
 }
 
 void reportWarning(char *format, va_list args)
 /* Print warning to console and also to log files. */
 {
 vaReportFile(format, args, stdout, "###", lineBreak());
 vaReportFile(format, args, errFile, "###", "\n");
 fflush(errFile);
 }
 
 void usageErr()
 /* Print usage info and bail */
 {
 errAbort(
 "This program turns rough alignments into fine ones.\n"
 "usage:\n"
 "      refineAli roughInputFile cdnaBase chromDir goodAlignFile badAlignFile coolFile errorFile startIx endIx [c2gFile]\n"
 "example:\n"
 "      refineAli ea\\all.out cDNA\\allcdna chrom ra\\good.txt ra\\bad.txt ra\\cool.txt ra\\err.txt 0 100000 features\\c2g\n");
 }
 
 char **chromNames;
 
 
 bits32 dnaCrc(DNA *dna, int size)
 /* Return a number that is fairly unique for a particular
  * piece of DNA. */
 {
 bits32 shiftAcc = 0;
 bits32 b;
 while (--size >= 0)
     {
     b = *dna++;
     if (shiftAcc >= 0x80000000u)
         {
         shiftAcc <<= 1;
         shiftAcc |= 1;
         shiftAcc += b;
         }
     else
         {
         shiftAcc <<= 1;
         shiftAcc += b;
         }
     }
 return shiftAcc;
 }
 
 
 void filterDupeCdna(struct cdnaInfo *ci, struct dnaSeq *cdnaSeq)
 /* Flag duplicated cDNA that is named differently. */
 {
 #define crcHashSize (1<<14)
 #define crcHashMask (crcHashSize-1)
 static struct cdnaInfo *crcHash[crcHashSize];
 int hashVal = (ci->baseCrc&crcHashMask);
 struct cdnaInfo *hel = crcHash[hashVal];
 struct dnaSeq *otherSeq;
 while (hel != NULL)
     {
     if (hel->baseCount == ci->baseCount && hel->baseCrc == ci->baseCrc)
         {
         if (!anyCdnaSeq(hel->name, &otherSeq, NULL))
             errAbort("Can't find other cDNA %s", hel->name);
         if (memcmp(cdnaSeq->dna, otherSeq->dna, cdnaSeq->size) == 0)
             {
             warn("%s Duplicate cDNA (%d %s and %d %s)", 
                 ci->name, ci->ix, ci->name, hel->ix, hel->name);
             ci->isDupe = TRUE;
             break;
             }
         else
             {
             warn("%s Misleading CRC match (%d %s and %d %s)",
                 ci->name, ci->ix, ci->name, hel->ix, hel->name);
             }
         }
     hel = hel->crcHashNext;
     }
 ci->crcHashNext = crcHash[hashVal];
 crcHash[hashVal] = ci;
 }
 
 struct cdnaInfo *lookupInfo(struct hash *hash, char *name)
 {
 struct hashEl *el = hashLookup(hash, name);
 if (el == NULL)
     return NULL;
 return (struct cdnaInfo *)(el->val);
 }
 
 
 void findAliEnds(struct ffAli *ali, DNA *needle, DNA *hay,
     long *retNeedleStart, long *retNeedleEnd,
     long *retHayStart, long *retHayEnd)
 {
 DNA *hStart = NULL;
 DNA *hEnd = NULL;
 DNA *nStart = NULL;
 DNA *nEnd = NULL;
 boolean first = TRUE;
 
 while (ali->left) ali = ali->left;
 for (;ali != NULL; ali = ali->right)
     {
     if (first)
         {
         hStart = ali->hStart;
         hEnd = ali->hEnd;
         nStart = ali->nStart;
         nEnd = ali->nEnd;
         first = FALSE;
         }
     else
         {
         if (ali->hStart < hStart)
             hStart = ali->hStart;
         if (ali->hEnd > hEnd)
             hEnd = ali->hEnd;
         if (ali->nStart < nStart)
             nStart = ali->nStart;
         if (ali->nEnd > nEnd)
             nEnd = ali->nEnd;
         }
     }
 *retHayStart = hStart - hay;
 *retNeedleStart = nStart - needle;
 *retHayEnd = hEnd - hay;
 *retNeedleEnd = nEnd - needle;
 }
 
 void fetchUnpacked(int chromIx, int gStart, int gEnd, int extraOnEnds,
     DNA **retDna, int *retStart, int *retEnd)
 /* This routine fetches DNA for a segment of chromosome. 
  * Because of packing issues it may fetch a bit more than
  * requested.  retStart and retEnd are the beginning and
  * end offsets of what's actually fetched. */
 {
 struct nt4Seq *nt4 = chroms[chromIx];
 DNA *unpacked;
 
 /* Adjust endpoints to include extra clipped to fit inside chromosome. */
 gStart -= extraOnEnds;
 if (gStart < 0)
     gStart = 0;
 gEnd += extraOnEnds;
 if (gEnd > nt4->baseCount)
     gEnd = nt4->baseCount;
 
 unpacked = nt4Unpack(nt4, gStart, gEnd-gStart);
 
 /* Set return variables. */
 *retDna = unpacked;
 *retStart = gStart;
 *retEnd = gEnd;
 }
 
 int goodNts(DNA *dna, int size)
 /* Count up number of non-N nucleotides. */
 {
 int count = 0;
 while (--size >= 0)
     {
     if (ntVal[(int)(*dna++)] >= 0)
         ++count;
     }
 return count;
 }
 
 #define minGoodSize 12
 
 int leftFlakySize(struct ffAli *ali, DNA *needle, int needleSize)
 /* Returns # of unalligned or flakily aligned bases on left side
  * of needle. */
 {
 int aliSize;
 DNA *firstGood = NULL;
 
 while (ali->left)   ali = ali->left;
 while (ali != NULL)
     {
     aliSize = ali->nEnd - ali->nStart;
     if (ffScoreMatch(ali->nStart, ali->hStart, aliSize) >= minGoodSize)
         {
         firstGood = ali->nStart;
         break;
         }
     ali = ali->right;
     }
 if (firstGood == NULL)
     return needleSize;
 else
     return firstGood - needle;
 }
 
 int rightFlakySize(struct ffAli *ali, DNA *needle, int needleSize)
 /* Returns # of unalligned or flakily aligned bases on right side
  * of needle. */
 {
 int aliSize;
 DNA *firstGood = NULL;
 while (ali->right) ali = ali->right;
 
 while (ali != NULL)
     {
     aliSize = ali->nEnd - ali->nStart;
     if (ffScoreMatch(ali->nStart, ali->hStart, aliSize) >= minGoodSize)
         {
         firstGood = ali->nEnd;
         break;
         }
     ali = ali->left;
     }
 if (firstGood == NULL)
     return needleSize;
 else
     return (needle + needleSize) - firstGood;
 }
 
 int polyaSize(DNA *cdna, int cdnaSize)
 /* Return size of polyA tail. */
 {
 int count=0;
 while (--cdnaSize >= 0)
     {
     if (cdna[cdnaSize] != 'a')
         break;
     ++count;
     }
 if (count < 4)  /* Not really poly ... */
     return 0;
 return count;
 }
 
 int gapPenalty(struct ffAli *left, struct ffAli *ali)
 /* Calculate gap penalty using exon scoring. */
 {
 int nGap, hGap;
 int minGap;
 
 nGap = ali->nStart - left->nEnd;
 assert(nGap >= 0);
 hGap = ali->hStart - left->hStart;
 if (hGap < 0)       
     {
     hGap = 0;
     nGap -= hGap;
     }
 minGap = (nGap < hGap ? nGap : hGap);
 return (2 + digitsBaseTwo(hGap+minGap) + (nGap-minGap));
 }
     
 int scoreExonAli(struct ffAli *ali)
 /* Score alignment using assumptions that it may have introns. */
 {
 struct ffAli *left;
 int oneScore;
 int onePenalty;
 int score;
 score = ffScoreMatch(ali->nStart, ali->hStart, ali->nEnd - ali->nStart);
 for (;;)
     {
     left = ali;
     if ((ali = ali->right) == NULL)
         break;
     onePenalty = gapPenalty(left, ali);
     score -= onePenalty;
     oneScore = ffScoreMatch(ali->nStart, ali->hStart, ali->nEnd - ali->nStart);
     score += oneScore;
     }
 return score;
 }
 
 void clipEnds(int leftMin, int *left, int *right, int rightMax)
 /* Make sure left and right are within bounds. */
 {
 if (*left < leftMin)
     *left = leftMin;
 if (*right > rightMax)
     *right = rightMax;
 }
 
 
 
 int cmpRoughAli(const void *va, const void *vb)
 /* Compare two rough alis by score. */
 {
 struct roughAli **pA = (struct roughAli **)va;
 struct roughAli **pB = (struct roughAli **)vb;
 struct roughAli *a = *pA, *b = *pB;
 
 return b->score - a->score;
 }
 
 
 void sortRoughAlis(struct roughAli **pAli)
 {
 slSort(pAli, cmpRoughAli);
 }
 
 
 int cmpFineAli(const void *va, const void *vb)
 /* Compare two rough alis by score. */
 {
 struct fineAli **pA = (struct fineAli **)va;
 struct fineAli **pB = (struct fineAli **)vb;
 struct fineAli *a = *pA, *b = *pB;
 
 return b->score - a->score;
 }
 
 
 void sortFineAlis(struct fineAli **pAli)
 {
 slSort(pAli, cmpFineAli);
 }
 
 
 void flagOneRough(struct roughAli *list, struct roughAli *suspect)
 /* Check list from beginning to suspect for things that 
  * completely overlap with suspect. */
 {
 struct roughAli *better;
 for (better = list; better != suspect; better = better->next)
     {
     if (better->chromIx == suspect->chromIx)
         {
         if ((better->gStart <= suspect->gStart && better->gEnd >= suspect->gEnd)
          || (better->gStart >= suspect->gStart && better->gEnd <= suspect->gEnd)) 
             {
             suspect->isDupe = TRUE;
             break;
             }
         }
     }
 }
 
 void flagDupeRoughAlis(struct roughAli *raList)
 /* Get rid of lower or equal scoring roughAlis that
  * cover same region. raList should be sorted. */
 {
 int count = slCount(raList);
 struct roughAli *ra;
 if (count <= 1)
     return;
 for (ra = raList->next; ra != NULL; ra = ra->next)
     {
     flagOneRough(raList, ra);
     }
 }
 
 void flagOneFine(struct fineAli *list, struct fineAli *suspect)
 /* Check list from beginning to suspect for things that 
  * completely overlap with suspect. */
 {
 struct fineAli *better;
 for (better = list; better != suspect; better = better->next)
     {
     if (better->chromIx == suspect->chromIx && better->isRc == suspect->isRc)
         {
         long snStart, snEnd, shStart, shEnd, bnStart, bnEnd, bhStart, bhEnd;
         findAliEnds(suspect->blocks, suspect->virtNeedle, suspect->virtHaystack,
             &snStart, &snEnd, &shStart, &shEnd);
         findAliEnds(better->blocks, better->virtNeedle, better->virtHaystack,
             &bnStart, &bnEnd, &bhStart, &bhEnd);
         if ((bhStart <= shStart && bhEnd >= shEnd) ||
             (bhStart >= shStart && bhEnd <= shEnd))
             {
             suspect->isDupe = TRUE;
             break;
             }
         }
     }
 }
 
 
 void flagDupeFineAlis(struct fineAli *faList)
 /* Get rid of duplicate fineAlis
  */
 {
 int count = slCount(faList);
 struct fineAli *fa;
 if (count <= 1)
     return;
 for (fa = faList->next; fa != NULL; fa = fa->next)
     {
     flagOneFine(faList, fa);
     }
 }
 
 
 struct cr2g
     {
     struct cr2g *next;
     char strand;
     int start, end;
     char geneName[32];
     };
 
 int findChromIx(char *name)
 {
 int i;
 for (i=0; i<chromCount; ++i)
     {
     if (!differentWord(chromNames[i], name))
         return i;
     }
 errAbort("Unknown chromosome %s", name);
 return -1;
 }
 
 
 void loadCr2g(char *fileName, struct cr2g **chromLists)
 {
 char buf[512];
 struct cr2g *el;
 FILE *f = mustOpen(fileName, "r");
 char *words[8];
 int wordCount;
 char *parts[3];
 int partCount;
 int chromIx;
 
 uglyf("Loading %s<BR>\n", fileName);
 while (fgets(buf, sizeof(buf), f) != NULL)
     {
     wordCount = chopString(buf, whiteSpaceChopper, words, ArraySize(words));
     if (wordCount != 3)
         errAbort("Bad cr2g file\n");
     partCount = chopString(words[0], ":-", parts, ArraySize(parts));
     if (partCount != 3)
         errAbort("Bad cr2g file\n");
     AllocVar(el);
     chromIx = findChromIx(parts[0]);
     el->strand = words[1][0];
     el->start = atoi(parts[1]);
     el->end = atoi(parts[2]);
     strncpy(el->geneName, words[2], sizeof(el->geneName));
 
     el->next = chromLists[chromIx];
     chromLists[chromIx] = el;
     }
 uglyf("Done loading %s<BR>\n", fileName);
 fclose(f);
 }
 
 
 static boolean isLoadedCtg = FALSE;
 static struct cr2g *cr2gs[8];
 
 void findClosestGene(char *chromName, int chromStart, int chromEnd, int strand,
     char **retGeneName, long *retGeneStart, long *retGeneEnd)
 {
 struct cr2g *cr2g;
 int chromIx;
 int closestDistance = 0x7fffffff;
 struct cr2g *closestGene = NULL;
 int distance;
 int okOutsideDistance = GENE_SPACE;
 
 if (c2gName == NULL)
     {
     char buf[64];
     sprintf(buf, "%s:%d-%d", chromName, chromStart, chromEnd);
     *retGeneName = cloneString(buf);
     *retGeneStart = chromStart;
     *retGeneEnd = chromEnd;
     return;
     }
 if (!isLoadedCtg)
     {
     loadCr2g(c2gName, cr2gs);
     isLoadedCtg = TRUE;
     }
 chromIx = findChromIx(chromName);
 for (cr2g = cr2gs[chromIx]; cr2g != NULL; cr2g = cr2g->next)
     {
     if (chromStart > cr2g->end)
         distance = chromStart - cr2g->end;
     else if (chromEnd < cr2g->start)
         distance = cr2g->start - chromEnd;
     else
         {
         /* Overlaps */
         distance = 0;
         }
     if (distance < closestDistance)
         {
         closestGene = cr2g;
         closestDistance = distance;
         }
     }
 if (closestDistance <= okOutsideDistance)
     {
     *retGeneName = closestGene->geneName;
     *retGeneStart = closestGene->start;
     *retGeneEnd = closestGene->end;
     }
 else
     {
     char buf[64];
     sprintf(buf, "%s:%d-%d", chromName, chromStart, chromEnd);
     *retGeneName = cloneString(buf);
     *retGeneStart = chromStart;
     *retGeneEnd = chromEnd;
     }
 }
 
 void hyperReportAlis(struct cdnaInfo *ci)
 /* Report alignments in hypertext format. */
 {
 struct fineAli *fa;
 printf("%d <A HREF=\"../cgi-bin/getgene.exe?geneName=%s\">%s</A> ",
     ci->ix, ci->name, ci->name);
 printf("%d bases %d rough alignments. Fine alignments:\n", 
     ci->baseCount, slCount(ci->roughAli));
 for (fa = ci->fineAli; fa != NULL; fa = fa->next)
     {
     if (fa->isDupe)
         continue;
     printf("%s %ld-%ld hits ", ci->name, fa->nStart, fa->nEnd);
     printf("<A HREF=\"../cgi-bin/fuzzyFind.exe?cDNA=%s&gene=%s+%ld-%ld\">",
         ci->name, chromNames[fa->chromIx], fa->hStart-100, fa->hEnd+100);
     printf("chromosome %s %ld-%ld</A>", chromNames[fa->chromIx], fa->hStart, fa->hEnd);
     printf(" gene <A HREF=\"../cgi-bin/intronerator.exe?gene=%s&withCdna=On\">%s</A>", 
         fa->geneName, fa->geneName);
     printf(" score %f\n", (double)fa->score / (double)ci->baseCount);
     }
 htmlHorizontalLine();
 }
 
 boolean correctIsBackwards(boolean putativeIsReversed, boolean isRc, struct ffAli *ffAli, char *cdnaName)
 /* Look for introns to double check orientation */
 {
 int fCount = 0;
 int rCount = 0;
 struct ffAli *left, *right;
 int nGap, hGap;
 int leftSize, rightSize;
 boolean isReversed;
 
 right = ffAli;
 for (;;)
     {
     left = right;
     if ((right = left->right) == NULL)
         break;
     leftSize = left->nEnd - left->nStart;
     rightSize = right->nEnd - right->nStart;
     nGap = right->nStart - left->nEnd;
     hGap = right->hStart - left->hEnd;
     if (nGap == 0 && hGap >= 16 && leftSize >= 5 && rightSize >= 5)
         {
         DNA *iStart = left->hEnd;
         DNA *iEnd = right->hStart;
         if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g')
             ++fCount;
         else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c')
             ++rCount;
         }
     }
 if (fCount == rCount)
     return putativeIsReversed;
 if (fCount != 0 && rCount != 0)
     return putativeIsReversed;
 isReversed = (rCount > fCount);
 if (isRc)
     isReversed = !isReversed;
 if (isReversed != putativeIsReversed)
     {
     warn("Correcting backwards EST %s", cdnaName);
     }
 return isReversed;
 }
 
 void refineAlis(struct cdnaInfo *ci, struct dnaSeq *cdnaSeq)
 /* Turn ci->roughAli into ci->fineAli.  Refine alignment. */
 {
 struct roughAli *ra;
 struct fineAli *fa;
 struct ffAli *ffAli = NULL;
 DNA *unpacked;
 int outerStart, outerEnd;
 int score;
 boolean isRc;
 boolean leftCruddyCount;
 boolean rightCruddyCount;
 DNA *hayStart;
 int hayLen;
 DNA *hayEnd;
 int gStart, gEnd;
 boolean badFind;
 
 sortRoughAlis(&ci->roughAli);
 flagDupeRoughAlis(ci->roughAli);
 
 for (ra = ci->roughAli; ra != NULL; ra = ra->next)
     {
     int bestScore = -0x7fffffff;
     int oldBestScore = -0x7fffffff;
     int oldScore;
     struct ffAli *bestAli = NULL;
     boolean bestIsRc = FALSE;
 
     if (ra->isDupe)
         continue;
     /* If score is less than 1/8 of cdna size, don't bother
      * with further processing. */
     if (ra->score < ci->baseCount/8)
         {
         continue;
         }
 
     gStart = ra->gStart - 16;
     gEnd = ra->gEnd + 16;
 
     /* Unpack dna, including extra at either end. */
     fetchUnpacked(ra->chromIx, ra->gStart, ra->gEnd, 15250, 
         &unpacked, &outerStart, &outerEnd);
 
     badFind = FALSE;    
     for (;;)
         {
         clipEnds(outerStart, &gStart, &gEnd, outerEnd);
         hayStart = unpacked + gStart-outerStart;
         hayLen = gEnd - gStart;
         hayEnd = hayStart + hayLen;
         if (!ffFindEitherStrandN(cdnaSeq->dna, cdnaSeq->size, hayStart, hayLen,
             ffCdna, &ffAli, &isRc))
             {
             if (ra->score > 20 && ra->score > oldBestScore + 5 && ra == ci->roughAli)
                 {
                 if (badFind)
                     {
                     warn("%s - still couldn't ffFind after expansion",
                         ci->name);
                     break;
                     }
                 warn("%s Couldn't ffFind %s (%d bases %d score %d bestScore) in chromosome %s %d-%d", 
                     ci->name, ci->name, ci->baseCount, ra->score, oldBestScore, chromNames[ra->chromIx], gStart, gEnd);
                 addRedoHash(ci, "ffFind");
                 }
             else
                 break;
             badFind = TRUE;
             gStart -= 500;
             gEnd += 500;
             continue;
             }
         if (isRc)
             reverseComplement(cdnaSeq->dna, cdnaSeq->size);
         score = scoreExonAli(ffAli);
         oldScore = ffScoreCdna(ffAli);
         leftCruddyCount = leftFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size);
         rightCruddyCount = rightFlakySize(ffAli, cdnaSeq->dna, cdnaSeq->size) -
             polyaSize(cdnaSeq->dna, cdnaSeq->size);
         if (isRc)
             reverseComplement(cdnaSeq->dna, cdnaSeq->size);
         if (score <= bestScore)
             {
             ffFreeAli(&ffAli);
             break;
             }
         bestScore = score;
         oldBestScore = oldScore;
         ffFreeAli(&bestAli);
         bestAli = ffAli;
         bestIsRc = isRc;
         if (leftCruddyCount <= 0 && rightCruddyCount <= 0)
             break;
         if (gStart == outerStart || gEnd == outerEnd)
             break;
         if (leftCruddyCount < 16)
             gStart -= 2*leftCruddyCount;
         else
             gStart -= 5000;
         if (rightCruddyCount > 0)
             {
             if (rightCruddyCount < 16)
                 gEnd += 2*rightCruddyCount;
             else
                 gEnd += 5000;
             }
         }
     if (bestAli != NULL)
         {
         AllocVar(fa);
         fa->chromIx = ra->chromIx;
         fa->isRc = bestIsRc;
         fa->score = bestScore;
         fa->blocks = bestAli;
         fa->virtNeedle = cdnaSeq->dna;
         fa->virtHaystack = unpacked - outerStart;
         findAliEnds(fa->blocks, fa->virtNeedle, fa->virtHaystack,
             &fa->nStart, &fa->nEnd, &fa->hStart, &fa->hEnd);
         findClosestGene(chromNames[fa->chromIx], fa->hStart, fa->hEnd,
             (fa->isRc ? '-' : '+'), &fa->geneName, &fa->geneStart, &fa->geneEnd);
         fa->isBackwards = correctIsBackwards(ci->isBackwards, fa->isRc, fa->blocks, cdnaSeq->name);
         fa->next = ci->fineAli;
         ci->fineAli = fa;
         }
     freez(&unpacked);
     }
 slReverse(&ci->fineAli);
 sortFineAlis(&ci->fineAli);
 flagDupeFineAlis(ci->fineAli);
 if (weAreWeb())
     hyperReportAlis(ci);
 else
     printf("%d %s\n", ci->ix, ci->name);
 slFreeList(&ci->roughAli);
 }
 
 int bestFineScore(struct fineAli *fines)
 /* Return best score in a list of fineAlis. */
 {
 int bestScore = -0x7fffffff;
 struct fineAli *fa;
 for (fa = fines; fa != NULL; fa = fa->next)
     {
     if (fa->score > bestScore)
         bestScore = fa->score;
     }
 return bestScore;
 }
 
 int bestRoughScore(struct roughAli *roughs)
 {
 int bestScore = -0x7fffffff;
 struct roughAli *ra;
 for (ra = roughs; ra != NULL; ra = ra->next)
     {
     if (ra->score > bestScore)
         bestScore = ra->score;
     }
 return bestScore;
 }
 
 struct textLine
     {
     struct textLine *next;
     char line[1];   /* This is allocated to be bigger. */
     };
 
 struct textLine *loadLines(char *fileName)
 /* Load in each line of file into a textLine structure. */
 {
 char buf[512];
 struct textLine *list = NULL;
 struct textLine *tl;
 int textSize;
 FILE *f = mustOpen(fileName, "r");
 
 while (fgets(buf, sizeof(buf), f) != NULL)
     {
     textSize = strlen(buf);
     tl = needMem(sizeof(*tl) + textSize);
     strcpy(tl->line, buf);
     tl->next = list;
     list = tl;
     }
 slReverse(&list);
 fclose(f);
 return list;
 }
 
 void doGoodBad(struct cdnaInfo *ciList)
 {
 struct cdnaInfo *ci;
 int goodCount = 0;
 int badCount = 0;
 int poorCount = 0;
 struct fineAli *fa;
 int bestScore;
 
 for (ci=ciList; ci!=NULL; ci = ci->next)
     {
     if (ci->isDupe)
         continue;
     if (ci->fineAli == NULL)
         {
         reportBad("%s score %d\n", ci->name, 0);
         if (ci->roughScore > 32 && ci->roughScore >  ci->baseCount/8)
             addRedoHash(ci, "dropped");
         ++badCount;
         continue;
         }
     bestScore = ci->fineScore;
     if (bestScore < ci->baseCount/2)
         {
         fa = ci->fineAli;
         reportBad("%s score %f at chromosome %s:%ld-%ld gene %s\n", ci->name, (double)bestScore / (double)ci->baseCount,
             chromNames[fa->chromIx], fa->hStart, fa->hEnd, fa->geneName);
         addRedoHash(ci, "poor");
         ++poorCount;
         continue;
         }
     reportGood("cDNA %s size %d %s\n", 
         ci->name, ci->baseCount, (ci->isEmbryonic ? "embryo" : "unknown"));
     for(fa = ci->fineAli;fa!=NULL; fa = fa->next)
         {
         if (fa->isDupe)
             continue;
         if (fa->score >= bestScore)
             {
             char *chromName = chromNames[fa->chromIx];
             struct ffAli *ali, *lastAli = NULL;
 
             fa->isGood = TRUE;
             reportGood("\tscore %f chromosome %s strand %c %c gene %s\n",
                 (double)fa->score / (double)ci->baseCount, chromName, 
                 (fa->isRc ? '-' : '+'), (fa->isBackwards ? '-' : '+'), fa->geneName);
             for (ali=fa->blocks; ali != NULL; ali = ali->right)
                 {
                 reportGood("\t\t%3d %3d | %d %d size %3d goodEnds %2d %2d", 
                     ali->nStart - fa->virtNeedle, ali->nEnd - fa->virtNeedle,
                     ali->hStart - fa->virtHaystack, ali->hEnd - fa->virtHaystack,
                     ali->nEnd - ali->nStart, ali->startGood, ali->endGood);
                 if (lastAli != NULL)
                     reportGood(" gap %d | %d", 
                         ali->nStart - lastAli->nEnd, ali->hStart - lastAli->hEnd);
                 reportGood("\n");
                 lastAli = ali;
                 }
             }
         }
     ++goodCount;
     }
 printf("%d Good %d Poor %d Bad\n", goodCount, poorCount, badCount);
 }
 
 void reportBackSkippers(struct cdnaInfo *ci, struct fineAli *fa)
 /* Report cases where the alignment has one exon behind another. */
 {
 struct ffAli *ali = fa->blocks; 
 struct ffAli *right = ali->right;
 int skip;
 
 while (right != NULL)
     {
     skip = right->hStart - ali->hEnd;
     if (skip < 0)
         {
         reportUnusual("Skips %s %s:%d-%d alignment skips %d\n",
            ci->name, chromNames[fa->chromIx], fa->hStart, fa->hEnd, skip);
         addRedoHash(ci, "skipsBack");
         }
     ali = right;
     right = right->right;
     }
 }
 
 
 void doUnusual(struct cdnaInfo *cdnaList)
 /* Report unusual things. */
 {
 struct cdnaInfo *ci;
 struct fineAli *fa;
 int bestScore;
 int greatCount;
 int closeCount;
 int decentCount;
 int greatCutoff;
 int closeCutoff;
 int decentCutoff;
 char *comment;
 
 for (ci = cdnaList; ci != NULL; ci = ci->next)
     {
     if (ci->isDupe || ci->fineAli == NULL)
         continue;
     if (ci->baseCount < 100)
         reportUnusual("%s is short - %d bases\n", ci->name, ci->baseCount);
 
     bestScore = ci->fineScore;
     greatCutoff = ci->baseCount * 9 / 10;
     closeCutoff = 9*bestScore/10;
     decentCutoff = ci->baseCount/2;
     
     greatCount = closeCount = decentCount = 0;
     comment = NULL;
 
     for (fa = ci->fineAli; fa != NULL; fa = fa->next)
         {
         if (fa->isDupe)
             continue;
         if (fa->isGood)
             {
             ++decentCount;
             reportBackSkippers(ci, fa);
             }
         if (fa->score >= closeCutoff)
             ++closeCount;
         if (fa->score >= greatCutoff)
             ++greatCount;
         }
     if (greatCount > 1)
         comment = "great";
     else if (closeCount > 1)
         comment = "similar";
     else if (decentCount > 1)
         comment = "decent";
 
     if (comment != NULL)
         {
         reportUnusual("Multiple %s hits for %s\n", comment, ci->name);
         for (fa = ci->fineAli; fa != NULL; fa = fa->next)
             {
             if (fa->isDupe)
                 continue;
             if (fa->score >= decentCutoff || fa->score >= closeCutoff)
                 {
                 long hStart, hEnd, nStart, nEnd;
                 findAliEnds(fa->blocks, fa->virtNeedle, fa->virtHaystack,
                    &nStart, &nEnd, &hStart, &hEnd);
 
                 reportUnusual("\t%s:%d-%d score %f\n", 
                     chromNames[fa->chromIx], hStart, hEnd, 
                     (double)fa->score/ci->baseCount);
                 }
             }
         }
     }
 }
 
 struct geneHit
     {
     struct geneHit *next;
     char *cdnaName;
     int chromOffset;
     };
 
 struct geneHitList
     {
     struct geneHitList *next;
     char *geneName;
     struct geneHit *hits;
     };
 
 int cmpGhlName(const void *va, const void *vb)
 /* Compare two geneHitLists by name. */
 {
 struct geneHitList **pA = (struct geneHitList **)va;
 struct geneHitList **pB = (struct geneHitList **)vb;
 struct geneHitList *a = *pA, *b = *pB;
 
 return strcmp(a->geneName, b->geneName);
 }
 
 int cmpGhOffset(const void *va, const void *vb)
 /* Compare two geneHits by chromosome offset. */
 {
 struct geneHit **pA = (struct geneHit **)va;
 struct geneHit **pB = (struct geneHit **)vb;
 struct geneHit *a = *pA, *b = *pB;
 
 return a->chromOffset - b->chromOffset;
 }
 
 
 #ifdef OLD
 void makeCdnaToGene(struct cdnaInfo *cdnaList)
 /* Make cdna to gene translation file. */
 {
 struct hash *hash = newHash(12);
 struct cdnaInfo *ci;
 struct fineAli *fa;
 struct geneHit *gh;
 struct geneHitList *geneHitList = NULL;
 struct geneHitList *ghl;
 struct hashEl *he;
 
 uglyf("Making cdnaToGene file<BR>\n");
 
 for (ci = cdnaList; ci != NULL; ci = ci->next)
     {
     if (ci->isDupe)
         continue;
     for (fa = ci->fineAli; fa != NULL; fa = fa->next)
         {
         if (fa->isDupe || !fa->isGood) 
             continue;
         if ((he = hashLookup(hash, fa->geneName)) == NULL)
             {
             AllocVar(ghl);
             ghl->geneName = fa->geneName;
             ghl->next = geneHitList;
             geneHitList = ghl;
             he = hashAdd(hash, fa->geneName, ghl);
             }
         ghl = (struct geneHitList *)(he->val);
         AllocVar(gh);
         gh->cdnaName = ci->name;
         gh->chromOffset = fa->hStart;
         gh->next = ghl->hits;
         ghl->hits = gh;
         }
     }
 slSort(&geneHitList, cmpGhlName);
 
 for (ghl=geneHitList; ghl!=NULL; ghl = ghl->next)
     {
     slReverse(&ghl->hits);
     slSort(&ghl->hits, cmpGhOffset);
     fprintf(cdnaToGeneFile, "%s ", ghl->geneName);
     for (gh = ghl->hits; gh != NULL; gh = gh->next)
         fprintf(cdnaToGeneFile, "%s ", gh->cdnaName);
     fprintf(cdnaToGeneFile, "\n");
     }
 
 freeHash(&hash);
 slFreeList(&geneHitList);
 uglyf("Done making cdnaToGene file<BR>\n");
 }
 #endif /* OLD */
 
 
 
 void hitLine(struct cdnaInfo *ci, int lineCount, char *cdnaName, char *cdnaRange,
     char *mitoChrom, char *chromName, char *genomeRange, char *scoreString)
 /* Process line with hit info. */
 {
 char *s;
 struct roughAli *ra;
 
 if (ci == NULL || differentWord(cdnaName, ci->name))
     {
     errAbort("Got hit line %d without corresponding blasting for cDNA %s",
         lineCount, cdnaName);
     }
 /* Allocate structure to hold data for line. */
 ra = needMem(sizeof(*ra));
 /* Grab range of bases covered in cDNA */
 if (cdnaRange != NULL)
     {
     s = cdnaRange;
     ra->cStart = atoi(s);
     s = strchr(s, '-')+1;
     ra->cEnd = atoi(s)+1;
     if (ra->cStart >= ra->cEnd)
         {
         int temp;
         warn("%s line %d cdna %s: cStart %d > cEnd %d. Swapping.",
             cdnaName, lineCount, cdnaName, ra->cStart, ra->cEnd);
         addRedoHash(ci, "swappedCdna");
         temp = ra->cStart;
         ra->cStart = ra->cEnd;
         ra->cEnd = ra->cStart;
         }
     }
 /* Grab range of bases covered in genome. */
 s = genomeRange;
 ra->gStart = atoi(s);
 s = strchr(s, '-')+1;
 ra->gEnd = atoi(s)+1;
 if (ra->gStart >= ra->gEnd)
     {
     int temp;
     warn("%s line %d cdna %s: gStart %d > gEnd %d. Swapping.",
         cdnaName, lineCount, cdnaName, ra->gStart, ra->gEnd);
     addRedoHash(ci, "swappedGenome");
     temp = ra->gStart;
     ra->gStart = ra->gEnd;
     ra->gEnd = ra->gStart;
     }
 /* Figure out chromosome it's on. */
 if (!differentWord(mitoChrom, "Mitochondrial"))
     ra->chromIx = 6;
 else
     {
     int i;
     for (i=0; i<chromCount; ++i)
         {
         if (!differentWord(chromName, chromNames[i]))
             {
             ra->chromIx = i;
             break;
             }
         }
     }
 /* Get alignment score. */
 ra->score = atoi(scoreString);
 /* Hang it on list. */
 ra->next = ci->roughAli;
 ci->roughAli = ra;
 }
 
 void analyse(int start, int stop)
 {
 struct hash *hash;
 char line[512];
 int lineCount = 0;
 char *words[32];
 int wordCount;
 struct cdnaInfo *cdnaList = NULL;
 struct cdnaInfo *ci = NULL;
 int cdnaCount;
 int maxCdnaCount = stop - start;
 
 cdnaCount = 1;
 if (start > 1)
     {
     for (;;)
         {
         if (!fgets(line, sizeof(line), inFile))
             errAbort("Not %d cDNAs in file, only %d\n", start, cdnaCount);
         ++lineCount;
         if (line[0] == '#') /* Skip comments. */
             continue;
         wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words));
         if (wordCount <= 0) /* Skip empty lines. */
             continue;
         if (!differentWord(words[1], "alignments"))
             {
             ++cdnaCount;
             if (cdnaCount >= start)
                 break;
             }
         }
     }
 cdnaCount = 0;
 hash = newHash(14); /* Hash table with 16k entries. */
 for (;;)
     {
     if (!fgets(line, sizeof(line), inFile))
         break;
     ++lineCount;
     if (line[0] == '#') /* Skip comments. */
         continue;
     wordCount = chopString(line, whiteSpaceChopper, words, ArraySize(words));
     if (wordCount <= 0) /* Skip empty lines. */
         continue;
     if (wordCount < 4)  /* Everyone else has at least four words. */
         {
         errAbort("Short line %d:\n", lineCount);
         }
     if (sameWord(words[1], "Blasting"))
         {
         char *cdnaName = words[2];
         if ((ci = lookupInfo(hash, cdnaName)) == NULL)
             {
             struct hashEl *hel;
             ci = needMem(sizeof(*ci));
             hel = hashAdd(hash, cdnaName, ci);
             ci->next = cdnaList;
             cdnaList = ci;
             ci->ix = atoi(words[0]);
             ci->name = hel->name;
             }
         }
     else if (sameWord(words[2], "hits"))
         {
         /* Newer style - includes cDNA matching range. */
         if (ci == NULL)
             continue;
         hitLine(ci, lineCount, words[0], words[1], words[3], words[4], words[5], words[9]);
         }
     else if (sameWord(words[1], "hits"))
         /* Older style - no cDNA matching range. */
         {
         if (ci == NULL)
             continue;
         hitLine(ci, lineCount, words[0],     NULL, words[2], words[3], words[4], words[8]);
         }
    else if (sameWord(words[1], "alignments"))
         {
         struct dnaSeq *cdnaSeq;
         struct wormCdnaInfo info;
         if (ci == NULL)
             continue;
         if (differentWord(ci->name, words[3]))
             errAbort("Line %d - %s is not %s", lineCount, words[3], ci->name);
         if (!ci->finished)
             {
             if (!anyCdnaSeq(ci->name, &cdnaSeq, &info))
                 {
                 warn("Can't find cDNA %s", ci->name);
                 ci->isDupe = TRUE;
                 }
             else
                 {
                 ci->baseCount = cdnaSeq->size;
                 ci->baseCrc = dnaCrc(cdnaSeq->dna, cdnaSeq->size);
                 slReverse(&ci->roughAli);
                 ci->roughScore = bestRoughScore(ci->roughAli);
                 filterDupeCdna(ci, cdnaSeq);
                 ci->isBackwards = (info.orientation == '-');
                 refineAlis(ci, cdnaSeq);
                 ci->fineScore = bestFineScore(ci->fineAli);
                 ci->isEmbryonic = info.isEmbryonic;  
                 ci->finished = TRUE;  
                 freeDnaSeq(&cdnaSeq);
                 ++cdnaCount;
                 if (cdnaCount >= maxCdnaCount)
                     break;
                 }
             }
         }
     else
         {
         errAbort("Can't deal with line %d\n", lineCount);
         }
     }
 
 slReverse(&cdnaList);
 
 doGoodBad(cdnaList);
 doUnusual(cdnaList);
 //makeCdnaToGene(cdnaList);
 
 /* Clean up. */
 
 /* These two are slow and not really necessary. */
 #ifdef FASTIDIOUS
 slFreeList(&cdnaList);
 freeHash(&hash);
 #endif
 
 uglyf("Done analyse\n");
 }
 
 int test(int argc, char *argv[])
 {
 int start, stop;
 
 if (argc != 9 && argc != 10)
     usageErr();
 inName = argv[0];
 cdnaName = argv[1];
 chromDir = argv[2];
 goodLogName = argv[3];
 badLogName = argv[4];
 unusualName = argv[5];
 errName = argv[6];
 start = atoi(argv[7]);
 stop = atoi(argv[8]);
 if (start >= stop)
     usageErr();
 if (argc == 10)
     c2gName = argv[9];
 
 inFile = mustOpen(inName, "r");
 goodLogFile = mustOpen(goodLogName, "w");
 badLogFile = mustOpen(badLogName, "w");
 unusualFile = mustOpen(unusualName, "w");
 errFile = mustOpen(errName, "w");
 pushWarnHandler(reportWarning);
 
 dnaUtilOpen();
 
 printf("Loading chromosomes\n");
 loadGenome(chromDir, &chroms, &chromNames, &chromCount);
 startRedoHash();
 
 printf("Analysing %s\n", inName);
 if (weAreWeb())
     htmlHorizontalLine();
 
 analyse(start, stop);
 
 //endRedoHash();
 freeGenome(&chroms, &chromNames, chromCount);
 popWarnHandler();
 fclose(inFile);
 fclose(goodLogFile);
 fclose(badLogFile);
 fclose(unusualFile);
 fclose(errFile);
 return 0;
 }
 
 void doMiddle()
 {
 char *commandLine = cgiString("commandLine");
 char *words[50];
 int wordCount;
 wordCount = chopString(commandLine, whiteSpaceChopper, words, ArraySize(words));
 printf("<PRE>");
 test(wordCount, words);
 printf("</PRE>");
 }
 
 int main(int argc, char *argv[])
 {
 if (weAreWeb())
     htmShell("refineAli Output", doMiddle, "QUERY");
 else
     test(argc-1, argv+1);
 return 0;
 }