e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/hg/liftAcross/liftAcross.c src/hg/liftAcross/liftAcross.c
index 8fc6e72..6715011 100644
--- src/hg/liftAcross/liftAcross.c
+++ src/hg/liftAcross/liftAcross.c
@@ -1,561 +1,564 @@
 /* liftAcross - convert one coordinate system to another, no overlapping items. */
+
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
 #include "genePred.h"
 #include "sqlList.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "liftAcross - convert one coordinate system to another, no overlapping items\n"
   "usage:\n"
   "   liftAcross [options] liftAcross.lft srcFile.gp dstOut.gp\n"
   "required arguments:\n"
   "   liftAcross.lft - six column file relating src to destination\n"
   "   srcFile.gp - genePred input file to be lifted\n"
   "   dstOut.gp - genePred output file result\n"
   "options:\n"
   "   -bedOut=<regionMap.bed> - mappings from liftAcross.lft in bed format\n"
   "	          - this is a type bed 6+ file, where thickStart and thickEnd\n"
   "               - are the GeneScaffold coordinates, unrelated to the\n"
   "               - scaffold target coordinates."
   "   -warn - warnings only on broken items, not the default error exit\n"
   "The six columns in the liftAcross.lft file are:\n"
   "srcName  srcStart  srcEnd  dstName  dstStart dstStrand\n"
   "    the destination regions are the same size as the source regions.\n"
   "This only works with items that are fully contained in a\n"
   "   single source region.  First incarnation only lifts genePred files.\n"
   "Items not found in the liftAcross.lft file are merely passed along\n"
   "   unchanged."
   );
 }
 
 char *bedOut = NULL;	/*	default NULL == not happening */
 
 static struct optionSpec options[] = {
    {"bedOut", OPTION_STRING},
    {"warn", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct liftSpec
 /* define one region lift */
     {
     struct liftSpec *next;
     int start;
     int end;
     char *dstName;
     int dstStart;
     char strand;
     };
 
 struct liftedItem
 /* the result of lifting a single item */
     {
     struct liftedItem *next;
     char *name;
     int start;
     int end;
     char strand;
     int frame;
     boolean outOfOrder;
     };
 
 static void freeLiftedItem(struct liftedItem **pLi)
 /* free up the space for a single item */
 {
 struct liftedItem *li = *pLi;
 if (NULL == li)
     return;
 freeMem(li->name);
 freez(pLi);
 }
 
 static void freeLiftedItemList(struct liftedItem **pLi)
 /* free up the space for all items on the list */
 {
 struct liftedItem *liList = *pLi;
 if (NULL == liList)
     return;
 struct liftedItem *li;
 struct liftedItem *next;
 for (li = liList; li; li = next)
     {
     next = li->next;
     freeLiftedItem(&li);
     }
 }
 
 static int lsStartCmp(const void *va, const void *vb)
 /* Compare a sort based on start for a liftSpec item. */
 {
 const struct liftSpec *a = *((struct liftSpec **)va);
 const struct liftSpec *b = *((struct liftSpec **)vb);
 int dif;
 dif = a->start - b->start;
 return dif;
 }
 
 static int liStartCmp(const void *va, const void *vb)
 /* Compare a sort based on start for a liftedItem. */
 {
 const struct liftedItem *a = *((struct liftedItem **)va);
 const struct liftedItem *b = *((struct liftedItem **)vb);
 int dif;
 dif = a->start - b->start;
 return dif;
 }
 
 static struct hash *readLift(char *liftAcross)
 /* read in liftAcross file, create hash of srcName as hash key,
  *	hash elements are simple lists of coordinate relationships
  *	return them all sorted by start position
  */
 {
 char *row[6];
 struct hash *result = newHash(8);
 struct hashEl *hel = NULL;
 struct lineFile *lf = lineFileOpen(liftAcross, TRUE);
 while (lineFileNextRow(lf, row, ArraySize(row)))
     {
     struct liftSpec *liftSpec;
     hel = hashStore(result, row[0]);		/* srcName hash	*/
     AllocVar(liftSpec);
     liftSpec->start = sqlUnsigned(row[1]);	/* src start	*/
     liftSpec->end = sqlUnsigned(row[2]);	/* src end	*/
     liftSpec->dstName = cloneString(row[3]);	/* dstName	*/
     liftSpec->dstStart = sqlUnsigned(row[4]);	/* dst start	*/
     liftSpec->strand = '+';			/* dst strand	*/
     if ('-' == *row[5])
 	liftSpec->strand = '-';
     /* accumulate list of lift specs under the srcName hash	*/
     slAddHead(&(hel->val), liftSpec);
     }
 
 /*	Go through each srcName in the hash, and sort the list there by
  *	the start coordinate of each item.  The searching will expect
  *	them to be in order.
  */
 struct hashCookie cookie = hashFirst(result);
 while ((hel = hashNext(&cookie)) != NULL)
     {
     slSort(&(hel->val), lsStartCmp);
     if (verboseLevel() > 2)
 	{
 	struct liftSpec *ls;
 	for (ls = hel->val; ls != NULL; ls = ls->next)
 	    verbose(3, "# %s\t%d\t%d\t%s\t%d\t%c\n", hel->name, ls->start,
 		ls->end, ls->dstName, ls->dstStart, ls->strand);
 	}
     }
 
 return result;
 }
 
 static int verifyCoordinates(int start1, int end1, int start2, int end2,
     char *name1, char *name2)
 /* verify start1-end1 is completely contained in start2-end2
  *	return (when 'warn'):
  *	-1 == no overlap at all, completely disjoint
  *	1 == partially overlap
  *	0 == perfectly OK
  * name1 and name2 are along for the ride only for printout
  */
 {
 if ((end1 <= start2) || (start1 >= end2))
     {
     if (optionExists("warn"))
 	warn("item %s:%d-%d can not be found in %s:%d-%d",
 	    name1, start1, end1, name2, start2, end2);
     else
 	errAbort("item %s:%d-%d can not be found in %s:%d-%d",
 	    name1, start1, end1, name2, start2, end2);
     return -1;
     }
 if ((start1 < start2) || (end1 > end2))
     {
     if (optionExists("warn"))
 	warn("item %s:%d-%d not fully contained in %s:%d-%d",
 	    name1, start1, end1, name2, start2, end2);
     else
 	errAbort("item %s:%d-%d not fully contained in %s:%d-%d",
 	    name1, start1, end1, name2, start2, end2);
     return 1;
     }
 return 0;
 }
 
 static struct liftSpec *findLiftSpec(struct liftSpec *lsList, char *srcName,
     int start, int end)
 /* see if the given item: start-end can be found in the liftSpec lsList
  *	if it can, return the liftSpec for that item
  *	if not found, return NULL
  * Verify sanity of lifting this item.  It needs to fall fully within
  *	the liftSpec found.  If not, it will not lift.
  * The srcName is used here merely for debug and error output
  *	information.  The given liftSpec is the liftSpec that belongs
  *	to this srcName as already found by the caller to findLiftSpec.
  */
 {
 struct liftSpec *ret = NULL;
 struct liftSpec *ls = NULL;
 for (ls = lsList; ls != NULL; ls = ls->next)
     {
     if (start < ls->end)	/* lift found when start < end */
 	{
 	if (verifyCoordinates(start, end, ls->start, ls->end,
 		srcName, ls->dstName))
 	    break;	/* skip any illegal ones */
 	ret = ls;	/* this one is it, remember it and */
 	break;		/* get out of here */
 	}
     }
 return (ret);
 }
 
 struct liftedItem *liftOne(struct liftSpec *lift,
 	char *srcName, int start, int end)
 /* Given an item: start-end
  * lift it via the given liftSpec to a liftedItem
  *	return NULL if not found in the liftSpec
  * The srcName is used here merely for debug and error output
  *	information.  The given liftSpec is the liftSpec that belongs
  *	to this srcName as already found by the caller to liftOne.
  */
 {
 struct liftedItem *result = NULL;
 struct liftSpec *ls = findLiftSpec(lift, srcName, start, end);
 if (ls)
     {
     AllocVar(result);
     result->name = cloneString(ls->dstName);
     result->strand = ls->strand;
     result->outOfOrder = FALSE;	/* can only be determined later */
     if ('+' == ls->strand)
 	{
 	result->start = start - ls->start + ls->dstStart;
 	result->end = end - ls->start + ls->dstStart;
 	}
     else
 	{
 	result->start = ls->end - end + ls->dstStart;
 	result->end = ls->end - start + ls->dstStart;
 	}
     verbose(3,"#\t%s:%d-%d -> %s:%d-%d %c\n", srcName, start, end,
 	result->name, result->start, result->end, result->strand);
     }
 return result;
 }
 
 static struct genePred *liftGenePred(struct genePred *gp, struct liftSpec *ls)
 /* lift the specified genePred gp via the given liftSpec ls
  *	result is a list of genePred items on various other scaffolds
  */
 {
 struct genePred *result = NULL;
 struct hash *items = newHash(8);
 struct hash *prevLift = newHash(8);
 struct hashEl *hel = NULL;
 int exonCount = gp->exonCount;
 struct liftedItem *cdsStart = liftOne(ls, gp->chrom, gp->cdsStart,
 	gp->cdsStart+1);
 struct liftedItem *cdsEnd = liftOne(ls, gp->chrom, gp->cdsEnd-1, gp->cdsEnd);
 int i;
 boolean frames = FALSE;
 boolean noCds = FALSE;
 
 if (((0 == gp->cdsStart) && (0 == gp->cdsEnd)) || (gp->cdsStart == gp->cdsEnd))
     noCds = TRUE;
 else if (NULL == cdsStart)
     warn("#\tWARNING: missing cdsStart for %s:%d\n", gp->chrom, gp->cdsStart);
 
 if (((0 == gp->cdsStart) && (0 == gp->cdsEnd)) || (gp->cdsStart == gp->cdsEnd))
     noCds = TRUE;
 else if (NULL == cdsEnd)
     warn("#\tWARNING: missing cdsEnd for %s:%d\n", gp->chrom, gp->cdsEnd);
 
 if (gp->optFields & genePredExonFramesFld)
     frames = TRUE;
 
 /* lift each exon.  Some do not lift, their lift specification may be
  * missing, ignore them.  For each lifted exon, keep it on a hash list by the
  *	lifted scaffold name
  */
 for (i = 0; i < exonCount; ++i)
     {
     struct liftedItem *itemLift = liftOne(ls, gp->chrom,
 	gp->exonStarts[i], gp->exonEnds[i]);
     if (itemLift)
 	{
 	if (frames)
 	    itemLift->frame = gp->exonFrames[i];
 	hel = hashStore(items, itemLift->name);
 	slAddHead(&(hel->val), itemLift);
 	/* see if exons are getting mixed up, compare result with previous lift
 	 *	on this scaffold */
 	struct hashEl *prev = hashStore(prevLift, itemLift->name);
 	if (prev->val)
 	    {
 	    struct liftedItem *prevLift = prev->val;
 	    /* things are out of order if the strands are different, or
 	     *	the items are not marching along in order.  - strand
 	     *	results reverse the sense of "marching along"
 	     */
 	    if ( (prevLift->strand != itemLift->strand) ||
 		(('-' == itemLift->strand) ?
 			(itemLift->end > prevLift->start) :
 			(prevLift->end > itemLift->start) ) )
 		{
     		prevLift->outOfOrder = itemLift->outOfOrder = TRUE;
 		}
 	    }
 	prev->val = itemLift;
 	}
     }
 freeHash(&prevLift);
 
 /* Each scaffold name in the hash becomes a new genePred item all by itself */
 struct hashCookie cookie = hashFirst(items);
 while ((hel = hashNext(&cookie)) != NULL)
     {
     struct liftedItem *li = NULL;
     struct liftedItem *itemList = (struct liftedItem *)hel->val;
     struct genePred *gpItem = NULL;
     unsigned *exonStarts = NULL;
     unsigned *exonEnds = NULL;
     int itemExonCount = slCount(itemList);
 
     slSort(&itemList, liStartCmp);	/* sort the exons by start coord */
     AllocArray(exonStarts, itemExonCount);
     AllocArray(exonEnds, itemExonCount);
     AllocVar(gpItem);	/*	start this new genePred item	*/
     if (frames)
 	AllocArray(gpItem->exonFrames, itemExonCount);
     gpItem->name = cloneString(gp->name);	/* gene name stays the same */
     gpItem->chrom = cloneString(hel->name);	/* new scaffold name */
     gpItem->exonCount = itemExonCount;		/* this many exons in this gp */
     gpItem->exonStarts = exonStarts;		/* an array of unsigned ints */
     gpItem->exonEnds = exonEnds;		/* an array of unsigned ints */
     gpItem->strand[0] = gp->strand[0];		/* assume lifted to + result */
     /* set result strand properly, lifted to - may reverse it */
     if ('-' == itemList->strand)		/* assume all on same strand */
 	gpItem->strand[0] = ('+' == gp->strand[0]) ? '-' : '+';
 
     /* no need to set gpItem->strand[1]=0, is already zero from the AllocVar */
 
     int i = 0;
     boolean exonsOutOfOrder = FALSE;
     /* copy each new exon start and end to the new gpItem */
     for (li = itemList; li != NULL; li = li->next)
 	{
 	if (frames)
 	    gpItem->exonFrames[i] = li->frame;
 	exonStarts[i] = li->start;
 	exonEnds[i++] = li->end;
 	if (li->outOfOrder)
 	    exonsOutOfOrder = TRUE;
 	}
     /*	assign cdsStart,End appropriately.  txStart/End are always first
      *	and last exon extents no matter what.
      *	First, assume this scaffold is not the one with the cds bits, thus
      *	the first and last exon define these cds starts and ends.
      *	If this scaffold has one of those, then apply the actual cds bit.
      *	A negative strand lift result turns everything on its head.
      */
     gpItem->txStart = exonStarts[0];  /* assume first exon */
     gpItem->txEnd = exonEnds[itemExonCount-1];  /* assume last exon */
     if ('-' == itemList->strand)	/* assume all on same strand */
 	{
 	if (noCds)
 	    {
 	    gpItem->cdsStart = gpItem->txEnd;
 	    gpItem->cdsEnd = gpItem->txEnd;
 	    }
 	else
 	    {
 	    gpItem->cdsStart = exonStarts[0];/* assume first exon */
 	    gpItem->cdsEnd = exonEnds[itemExonCount-1];/* assume last exon */
 	    if (cdsStart && sameWord(cdsStart->name, hel->name))
 		gpItem->cdsEnd = cdsStart->start+1; /* this has the cdsEnd */
 	    if (cdsEnd && sameWord(cdsEnd->name, hel->name))
 		gpItem->cdsStart = cdsEnd->start;   /* this has the cdsStart */
 	    /*	fix unusual (mixed up) mappings onto a scaffold */
 	    if (exonsOutOfOrder)
 		{
 		gpItem->cdsStart = min(gpItem->cdsStart, exonStarts[0]);
 		gpItem->cdsEnd = max(gpItem->cdsEnd,exonEnds[itemExonCount-1]);
 		}
 	    }
 	}
     else
 	{
 	if (noCds)
 	    {
 	    gpItem->cdsEnd = gpItem->txEnd;
 	    gpItem->cdsStart = gpItem->txEnd;
 	    }
 	else
 	    {
 	    gpItem->cdsStart = exonStarts[0];  /* assume first exon */
 	    gpItem->cdsEnd = exonEnds[itemExonCount-1];/* assume last exon */
 	    if (cdsStart && sameWord(cdsStart->name, hel->name))
 		gpItem->cdsStart = cdsStart->start; /* this has the cdsStart */
 	    if (cdsEnd && sameWord(cdsEnd->name, hel->name))
 		gpItem->cdsEnd = cdsEnd->end;   /* this has the cdsEnd */
 	    /*	fix unusual (mixed up) mappings onto a scaffold */
 	    if (exonsOutOfOrder)
 		{
 		gpItem->cdsStart = min(gpItem->cdsStart, exonStarts[0]);
 		gpItem->cdsEnd = max(gpItem->cdsEnd, exonEnds[itemExonCount-1]);
 		}
 	    }
 	}
 
     /*	if optional fields present, carry them along to the new item */
     gpItem->optFields = gp->optFields;
     if (gpItem->optFields & genePredScoreFld)
 	gpItem->score = gp->score;
     if (gpItem->optFields & genePredName2Fld)
 	{
 	if (gp->name2)
 	    gpItem->name2 = cloneString(gp->name2);
 	else
 	    gpItem->name2 = cloneString("");
 	}
     if (gpItem->optFields & genePredCdsStatFld)
 	{
 	gpItem->cdsStartStat = gp->cdsStartStat;
 	gpItem->cdsEndStat = gp->cdsEndStat;
 	}
 
     /* accumulating a list of genePred items for the final result */
     slAddHead(&result, gpItem);
     freeLiftedItemList((struct liftedItem **)(&hel->val));
     }	/*	while ((hel = hashNext(&cookie)) != NULL)	*/
 
 freeHash(&items);
 
 return (result);
 }	/*	static struct genePred *liftGenePred()	*/
 
 static void bedRegionOutput(struct hash *lftHash)
 /* write the lift spec out as a bed file to map the regions lifted
  *	scores of the items are set at 1000 when they are in order,
  *	or 200 when they are out of order.
 */
 {
 FILE *bedFile = mustOpen(bedOut, "w");
 struct hashEl *hel = NULL;
 struct hashCookie cookie = hashFirst(lftHash);
 while ((hel = hashNext(&cookie)) != NULL)
     {
     struct liftSpec *lsList = hel->val;
     struct liftSpec *ls = NULL;
     int itemCount = 0;
     struct liftSpec *prevLs = lsList;
     struct liftSpec *nextLs = prevLs->next;
     int size = prevLs->end - prevLs->start;
     int prevStart = prevLs->dstStart;
     int prevEnd = prevStart + size;
     char prevStrand = prevLs->strand;
     char *prevDstName = NULL;
     int nextStart = prevStart;
     int nextEnd = prevEnd;
     char nextStrand = prevStrand;
     char *nextDstName = NULL;
 
     for (ls = lsList; ls != NULL; ls = ls->next)
 	{
 	size = ls->end - ls->start;
 	int dstEnd = ls->dstStart+size;
 	int score = 1000;	/* assume everything is in order */
 	nextLs = ls->next;
 	if (nextLs)		/* if there is a next item */
 	    {
 	    size = nextLs->end - nextLs->start;
 	    nextStart = nextLs->dstStart;
 	    nextEnd = nextStart + size;
 	    nextStrand = nextLs->strand;
 	    nextDstName = nextLs->dstName;
 	    }
 	/* verify we are properly in order from previous item 
 	 *	strands must be the same, and start/end in order
 	 *	negative strand reverses meaning of "in order"
 	 */
 	if ( ! differentStringNullOk(prevDstName,ls->dstName) &&
 		( (prevStrand != ls->strand) ||
 	    (('-' == ls->strand) ? (dstEnd > prevStart) :
 				    (prevEnd > ls->dstStart) ) ) )
 	    {
 	    score = 200;
 	    }
 	/* when possible, verify next item properly follows this item */
 	if (nextLs)
 	    {
 	    if ( ! differentStringNullOk(nextDstName,ls->dstName) &&
 		    ( (nextStrand != ls->strand) ||
 		(('-' == ls->strand) ? (nextEnd > ls->dstStart) :
 					(dstEnd > nextStart) ) ) )
 		{
 		score = 200;
 		}
 	    }
 	fprintf(bedFile, "%s\t%d\t%d\t%s.%d\t%d\t%c\t%d\t%d\n",
 	    ls->dstName, ls->dstStart, dstEnd, hel->name,
 	    ++itemCount, score, ls->strand, ls->start, ls->end);
 	prevStart = ls->dstStart;
 	prevEnd = dstEnd;
 	prevStrand = ls->strand;
 	prevDstName = ls->dstName;
 	}
     }
 }
 
 void liftAcross(char *liftAcross, char *srcFile, char *dstOut)
 /* liftAcross - convert one coordinate system to another, no overlapping items. */
 {
 struct hash *lftHash = readLift(liftAcross);
 struct genePred *gpList = genePredExtLoadAll(srcFile);
 struct genePred *gp = NULL;
 FILE *out = mustOpen(dstOut, "w");
 
 if (bedOut)
     bedRegionOutput(lftHash);
 
 int genePredItemCount = 0;
 for (gp = gpList; gp != NULL; gp = gp->next)
     {
     struct liftSpec *lsFound = hashFindVal(lftHash, gp->chrom);
     if (lsFound)
 	{
 	struct genePred *gpLifted = liftGenePred(gp, lsFound);
 	struct genePred *gpl;
 	for (gpl = gpLifted; gpl != NULL; gpl = gpl->next)
 	    genePredTabOut(gpl, out);
 	genePredFreeList(&gpLifted);
 	}
     else
 	{
 	genePredTabOut(gp, out);
 	}
     ++genePredItemCount;
     }
 /* lftHash and gpList are left allocated to disappear at exit */
 verbose(2,"#\tgene pred item count: %d\n", genePredItemCount);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 bedOut = optionVal("bedOut", bedOut);
 liftAcross(argv[1], argv[2], argv[3]);
 return 0;
 }