e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/hg/genePredToMafFrames/mkMafFrames.c src/hg/genePredToMafFrames/mkMafFrames.c
index fe3d29a..eea520c 100644
--- src/hg/genePredToMafFrames/mkMafFrames.c
+++ src/hg/genePredToMafFrames/mkMafFrames.c
@@ -1,345 +1,348 @@
 /* mkMafFrames - build mafFrames objects for exons, this does not handle
  * issues of linking mafFrames. */
+
+/* Copyright (C) 2006 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "genePred.h"
 #include "frameIncr.h"
 #include "maf.h"
 #include "dnautil.h"
 #include "orgGenes.h"
 #include "chromBins.h"
 #include "binRange.h"
 #include "verbose.h"
 #include "localmem.h"
 
 /* assert for a frame being valid */
 #define assertFrame(fr) assert(((fr) >= 0) && ((fr) <= 2))
 
 /* data structure used to scan a mafComp object */
 struct compCursor {
     struct mafComp *comp;   /* component object */
     int pos;                /* corresponding position in the sequence; it at a
                              * gap in the alignment, this is the next position
                              * in the sequence*/
     char *ptr;              /* pointer into text */
 };
 
 /* data structure used to scan a pair of mafComp objects */
 struct scanCursor {
     struct compCursor query;   /* cursor for gene (query) */
     struct compCursor target;  /* cursor for target (genome) */
 };
 
 static struct compCursor compCursorNew(struct mafComp *comp)
 /* construct a new compCursor.  Note: struct is copied, it is not dynamic */
 {
 struct compCursor cc;
 cc.comp = comp;
 cc.pos = comp->start;
 cc.ptr = comp->text;
 return cc;
 }
 
 static boolean compCursorAtBase(struct compCursor *cc)
 /* Check if a position contains a base, this uses a quick check that really
  * doesn't test for a base, just >= A, so '.', '-', and '\0' fail */
 {
 return (*cc->ptr >= 'A');
 }
 
 static struct scanCursor scanCursorNew(struct mafComp *geneComp, struct mafComp *targetComp)
 /* construct a new scanCursor.  Note: struct is copied, it is not dynamic */
 {
 struct scanCursor sc;
 sc.query = compCursorNew(geneComp);
 sc.target = compCursorNew(targetComp);
 return sc;
 }
 
 static boolean scanCursorIncr(struct scanCursor *sc)
 /* increment scan cursor one position, return FALSE at end */
 {
 /* the < 'A' tests for '.', '-', '\0' so that we advance past the last position
  * in at the end of the string. */
 if (compCursorAtBase(&sc->query))
     sc->query.pos++;
 sc->query.ptr++;
 if (compCursorAtBase(&sc->target))
     sc->target.pos++;
 sc->target.ptr++;
 return (*sc->query.ptr != '\0');
 }
 
 static boolean scanCursorAdvToQueryPos(struct scanCursor *sc, int qPos)
 /* advance cursor to a position in the gene's component space */
 {
 while (sc->query.pos < qPos)
     {
     if (!scanCursorIncr(sc))
         return FALSE;
     }
 return TRUE;
 }
 
 struct scanInfo
 /* object that holds state while scanning the alignment for an exon */
 {
     struct scanCursor *sc;  /* cursor into the alignment */
     struct cdsExon *exon;   /* exon being processed */
     int exonQStart;         /* exon coords mapped query seq strand */
     int exonQEnd;
     int blkQStart;          /* exon coords, mapped and adjust to block bounds */
     int blkQEnd;
     int frame;              /* current frame */
     int frScanDir;          /* frame scanning direction (+1|-1) relative to transcription */
     int subQStart;          /* query and target start of current subrange of exon,
                              * -1 for none */
     int subTStart;
     int subStartFrame;      /* frame frame subrange of exon, start scan start */
     int subEndFrame;
 };
 
 static void traceFrameDef(int level, struct scanInfo *si, struct exonFrames *ef)
 /* verbose trace to indicate why a mafFrames record was defined */
 {
 verbose(level, "exon %s[%d] %s:%d-%d [%d] %c fm: %d; ",
         si->exon->gene->name, si->exon->exonNum,
         si->exon->gene->chrom, si->exon->chromStart, si->exon->chromEnd,
         (si->exon->chromEnd - si->exon->chromStart),
         si->exon->gene->strand, si->exon->frame);
 verbose(level, "subQuery %s:%d-%d [%d] %c fm: %d-%d; ",
         si->sc->query.comp->src, si->subQStart, si->sc->query.pos,
         (si->sc->query.pos - si->subQStart),
         si->sc->query.comp->strand, si->subStartFrame, si->subEndFrame);
 verbose(level, "subTarget %s:%d-%d [%d] %c fm: %d\n",
         si->sc->target.comp->src, si->subTStart, si->sc->target.pos, 
         (si->sc->target.pos - si->subTStart), 
         si->sc->target.comp->strand, ef->mf.frame);
 }
 
 static void addMafFrame(struct scanInfo *si)
 /* add an MAF frame record for the current subrange to the exon. Assumes that
  * the cursor has gone past the last query column to include in record. Resets
  * the state.*/
 {
 struct exonFrames *ef;
 char src[256], *tName, strand, *dot, frame;
 int cdsOff;
 
 /* target excludes organism */
 tName = (strchr(si->sc->target.comp->src, '.')+1);
 
 /* frame is for first base in direction of transcription */
 if (si->frScanDir > 0)
     frame = si->subStartFrame;
 else
     frame = si->subEndFrame;
 
 /* project strand to target */
 strand = (si->exon->gene->strand == si->sc->query.comp->strand) ? '+' : '-';
 if (si->sc->target.comp->strand == '-')
     strand = (strand == '+') ? '-' : '+';
 
 /* get src name, removing sequence id */
 safef(src, sizeof(src), "%s", si->sc->query.comp->src);
 dot = strchr(src, '.');
 if (dot != NULL)
     *dot = '\0';
 
 /* compute offset within CDS */
 if (si->frScanDir > 0)
     cdsOff = si->exon->cdsOff + (si->subQStart - si->exonQStart);
 else
     cdsOff = si->exon->cdsOff + (si->exonQEnd - si->sc->query.pos);
 
 /* create and link exon */
 ef = cdsExonAddFrames(si->exon, si->subQStart, si->sc->query.pos, si->sc->query.comp->strand,
                       tName, si->subTStart, si->sc->target.pos,
                       frame, strand, cdsOff);
 
 if (verboseLevel() >= 2)
     traceFrameDef(2, si, ef);
 assert((ef->srcEnd-ef->srcStart) == (ef->mf.chromEnd-ef->mf.chromStart));
 
 /* reset state */
 si->subQStart = -1;
 si->subTStart = -1;
 si->subStartFrame = -1;
 si->subEndFrame = -1;
 }
 
 static struct scanInfo scanInfoInit(struct scanCursor *sc, struct cdsExon *exon)
 /* initialize scanInfo object for an exon.  This doesn't advance to the first
  * aligned position in the exon */
 {
 struct scanInfo si;
 int queryEnd = sc->query.comp->start + sc->query.comp->size;
 int frameStart, frameEnd;
 
 ZeroVar(&si);
 si.sc = sc;
 si.exon = exon;
 si.subTStart = -1;
 si.subStartFrame = -1;
 si.subEndFrame = -1;
 
 /* direction frame will increment */
 assert(sc->target.comp->strand == '+');
 si.frScanDir = (exon->gene->strand == sc->query.comp->strand) ? 1 : -1;
 
 /* get coordinates and frame at each end of the exon */
 si.exonQStart = exon->chromStart;
 si.exonQEnd = exon->chromEnd;
 frameStart = (exon->gene->strand == '+')
     ? exon->frame : frameIncr(exon->frame, (si.exonQEnd - si.exonQStart)-1);
 frameEnd = (exon->gene->strand == '+')
     ? frameIncr(exon->frame, (si.exonQEnd - si.exonQStart)-1) : exon->frame;
 
 /* genePred coordinates are always on the positive strand, we need to reverse
  * them to search the strand-specific MAF coords if the component is on the
  * negative strand */
 if (sc->query.comp->strand == '-')
     {
     int hold = frameStart;
     frameStart = frameEnd;
     frameEnd = hold;
     reverseIntRange(&si.exonQStart, &si.exonQEnd, sc->query.comp->srcSize);
     }
 
 /* adjust to bounds of query in maf block */
 si.blkQStart = si.exonQStart;
 si.blkQEnd = si.exonQEnd;
 if (si.blkQStart < sc->query.comp->start)
     {
     int delta = sc->query.comp->start - si.blkQStart;
     si.blkQStart = sc->query.comp->start;
     frameStart = frameIncr(frameStart, si.frScanDir*delta);
     }
 if (si.blkQEnd > queryEnd)
     {
     int delta = si.blkQEnd - queryEnd;
     si.blkQEnd = queryEnd;
     frameEnd = frameIncr(frameEnd, -1*si.frScanDir*delta);
     }
 si.frame = frameStart;
 
 /* Ugly side-affect: record src chrom size for gene the first time it's
  * encounter, since it was available when gene is read in */
 if (exon->gene->chromSize == 0)
     exon->gene->chromSize = sc->query.comp->srcSize;
 else if (exon->gene->chromSize != sc->query.comp->srcSize)
     errAbort("srcSize for %s differes in MAF; previous found as, %d, now %d",
              sc->query.comp->src, exon->gene->chromSize, sc->query.comp->srcSize);
 return si;
 }
 
 static void processColumn(struct scanInfo *si)
 /* process one column of the alignment for an exon */
 {
 if (compCursorAtBase(&si->sc->query))
     {
     /* column contains query */
     if (compCursorAtBase(&si->sc->target))
         {
         /* target also aligns position */
         if (si->subTStart < 0)
             {
             /* remember start */
             si->subQStart = si->sc->query.pos;
             si->subTStart = si->sc->target.pos;
             si->subStartFrame = si->frame;
             }
         si->subEndFrame = si->frame;
         }
     else
         {
         /* target does not align position */
         if (si->subTStart >= 0)
             {
             addMafFrame(si);
             }
         }
     /* advance frame */
     si->frame = frameIncr(si->frame, si->frScanDir); 
     }
 else
     {
     /* query not aligned, if target is aligned, output current frame */
     if (compCursorAtBase(&si->sc->target) && (si->subTStart >= 0))
         {
         addMafFrame(si);
         }
     }
 }
 
 static void mkCompExonFrames(struct scanCursor *sc, struct cdsExon *exon)
 /* create mafFrames objects for a mafComp and an exon. */
 {
 struct scanInfo si = scanInfoInit(sc, exon);
 
 /* advance to start of exon in component, which must be there due
  * to the bin search */
 if (!scanCursorAdvToQueryPos(sc, si.blkQStart))
     errAbort("BUG: should have found exon in this component");
 si.frame = frameIncr(si.frame, si.frScanDir*(sc->query.pos - si.blkQStart));
 
 /* scan columns of alignment overlapping exon */
 while (sc->query.pos < si.blkQEnd)
     {
     processColumn(&si);
     if (!scanCursorIncr(si.sc))
         break;
     }
 if (si.subTStart >= 0)
     addMafFrame(&si);
 }
 
 static void mkCompFrames(struct orgGenes *genes,
                          struct mafComp *queryComp,
                          struct mafComp *targetComp)
 /* create mafFrames objects for an mafComp */
 {
 struct scanCursor sc = scanCursorNew(queryComp, targetComp);
 
 /* n.b. the order of scanning is very important here or will miss some exons
  * if they share a block */
 int sortDir =  (queryComp->strand == targetComp->strand) ? 1 : -1;
 struct binElement *exonRefs = orgGenesFind(genes, queryComp, sortDir);
 struct binElement *exonRef;
 
 for (exonRef = exonRefs; exonRef != NULL; exonRef = exonRef->next)
     mkCompExonFrames(&sc, (struct cdsExon*)exonRef->val);
 
 slFreeList(&exonRefs);
 }
 
 static void mkFramesForOrg(struct mafAli *ali, struct mafComp *targetComp,
                            struct orgGenes *genes)
 /* create frames for an organism and a maf alignment */
 {
 struct mafComp *queryComp = mafMayFindComponentDb(ali, genes->srcDb);
 if (queryComp != NULL)
     mkCompFrames(genes, queryComp, targetComp);
 }
 
 void mkMafFramesForMaf(char *targetDb, struct orgGenes *orgs,
                        char *mafFilePath)
 /* create mafFrames objects from an MAF file */
 {
 struct mafFile *mafFile = mafOpen(mafFilePath);
 struct mafAli *ali;
 struct orgGenes *genes;
 
 while ((ali = mafNext(mafFile)) != NULL)
     {
     struct mafComp *targetComp = mafMayFindComponentDb(ali, targetDb);
     if (targetComp != NULL)
         {
         if (targetComp->strand == '-')
             mafFlipStrand(ali);  /* code requires target strand is + */
         for (genes = orgs; genes != NULL; genes = genes->next)
             mkFramesForOrg(ali, targetComp, genes);
         }
     mafAliFree(&ali);
     }
 mafFileFree(&mafFile);
 }