src/hg/txGene/txGeneExplainUpdate2/txGeneExplainUpdate2.c 1.2

1.2 2009/10/03 02:15:22 kent
Adding -unmapped and -oldAsm options.
Index: src/hg/txGene/txGeneExplainUpdate2/txGeneExplainUpdate2.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/txGene/txGeneExplainUpdate2/txGeneExplainUpdate2.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/txGene/txGeneExplainUpdate2/txGeneExplainUpdate2.c	25 Aug 2008 18:18:22 -0000	1.1
+++ src/hg/txGene/txGeneExplainUpdate2/txGeneExplainUpdate2.c	3 Oct 2009 02:15:22 -0000	1.2
@@ -1,212 +1,233 @@
 /* txGeneExplainUpdate2 - Make table explaining correspondence between versions of UCSC genes. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "bed.h"
 #include "binRange.h"
 
 
 static char const rcsid[] = "$Id$";
 
+char *unmapped = NULL;
+char *oldAsm = NULL;
+
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txGeneExplainUpdate2 - Make table explaining correspondence between versions of UCSC genes.\n"
   "usage:\n"
   "   txGeneExplainUpdate2 oldKg.bed newKg.bed out.tab\n"
   "options:\n"
-  "   -xxx=XXX\n"
+  "   -unmapped=unmapped.bed - file containing beds from previous assembly that didn't map well\n"
+  "   -oldAsm=hg18 - name of old database where unmapped ones used to live.\n"
   );
 }
 
 static struct optionSpec options[] = {
+   {"unmapped", OPTION_STRING},
+   {"oldAsm", OPTION_STRING},
    {NULL, 0},
 };
 
 struct bed *findExact(struct bed *oldBed, struct hash *newKeeperHash)
 /* Try and find a new bed identical with old bed. */
 {
 struct binKeeper *bk = hashFindVal(newKeeperHash, oldBed->chrom);
 if (bk == NULL)
     return NULL;
 struct bed *matchingBed = NULL;
 struct binElement *bin, *binList = binKeeperFind(bk, oldBed->chromStart, oldBed->chromEnd);
 for (bin = binList; bin != NULL; bin = bin->next)
     {
     struct bed *newBed = bin->val;
     if (oldBed->strand[0] == newBed->strand[0])
         {
 	if (bedExactMatch(oldBed, newBed))
 	    {
 	    matchingBed = newBed;
 	    break;
 	    }
 	}
     }
 slFreeList(&binList);
 return matchingBed;
 }
 
 struct bed *findCompatible(struct bed *oldBed, struct hash *newAccHash)
 /* Find bed in newHash with same accession (but maybe different version) */
 {
 char accOnly[64];
 safef(accOnly, sizeof(accOnly), oldBed->name);
 chopSuffix(accOnly);
 return hashFindVal(newAccHash, accOnly);
 }
 
 struct bed *findMostOverlapping(struct bed *bed, struct hash *keeperHash)
 /* Try find most overlapping thing to bed in keeper hash. */
 {
 struct bed *bestBed = NULL;
 int bestOverlap = 0;
 struct binKeeper *bk = hashFindVal(keeperHash, bed->chrom);
 if (bk == NULL)
     return NULL;
 struct binElement *bin, *binList = binKeeperFind(bk, bed->chromStart, bed->chromEnd);
 for (bin = binList; bin != NULL; bin = bin->next)
     {
     struct bed *bed2 = bin->val;
     if (bed2->strand[0] == bed->strand[0])
 	{
 	int overlap = bedSameStrandOverlap(bed2, bed);
 	if (overlap > bestOverlap)
 	    {
 	    bestOverlap = overlap;
 	    bestBed = bed2;
 	    }
 	}
     }
 slFreeList(&binList);
 return bestBed;
 }
 
 void explainMissing(struct bed *oldBed, char *type, FILE *f)
 /* Write out a line explaining why old gene has gone missing. */
 {
 char *newName = "";
 char *note = ""; 	/* Some day would be nice to elaborate on note. */
 fprintf(f, "%s\t%s\t%d\t%d\t%s\t%s\t%s\n", oldBed->name, oldBed->chrom, 
 	oldBed->chromStart, oldBed->chromEnd, newName, type, note);
 }
 
 boolean intronsDifferent(struct bed *a, struct bed *b)
 /* Return TRUE if introns of a and b are same. */
 {
 int blockCount = a->blockCount;
 if (b->blockCount != blockCount)
     return TRUE;
 int i;
 for (i=1; i<blockCount; ++i)
     {
     int aStart = a->chromStart + a->chromStarts[i-1] + a->blockSizes[i-1];
     int bStart = b->chromStart + b->chromStarts[i-1] + b->blockSizes[i-1];
     int aEnd = a->chromStart + a->chromStarts[i];
     int bEnd = b->chromStart + b->chromStarts[i];
     if (aStart != bStart || aEnd != bEnd)
         return TRUE;
     }
 return FALSE;
 }
 
 char *diffNote(struct bed *oldBed, struct bed *newBed)
 /* Return note explaining difference. */
 {
 char *note = "";
 boolean oldIsCoding = (oldBed->thickStart < oldBed->thickEnd);
 boolean newIsCoding = (newBed->thickStart < newBed->thickEnd);
 boolean bothCoding = (oldIsCoding && newIsCoding);
 
 if (oldIsCoding != newIsCoding)
     {
     if (oldIsCoding)
         note = "No longer considered protein coding";
     else
         note = "Now considered protein coding";
     }
 else if (oldBed->blockCount != newBed->blockCount)
     note = "Different number of exons";
 else if (bothCoding && (oldBed->thickStart < newBed->thickStart || oldBed->thickEnd > newBed->thickEnd))
     note = "Bases removed from coding region";
 else if (bothCoding && (oldBed->thickStart > newBed->thickStart || oldBed->thickEnd < newBed->thickEnd))
     note = "Bases added to coding region";
 else if (intronsDifferent(oldBed, newBed))
     note = "Intron boundaries have changed";
 else if (oldBed->chromStart < newBed->chromStart || oldBed->chromEnd > newBed->chromEnd)
     note = "Bases removed from UTR";
 else if (oldBed->chromStart > newBed->chromStart || oldBed->chromEnd < newBed->chromEnd)
     note = "Bases added to UTR";
 return note;
 }
 
 void explainOverlap(struct bed *oldBed, struct bed *newBed, char *type, FILE *f)
 /* Write out a line explaining why old gene has gone missing. */
 {
 char *note = diffNote(oldBed, newBed);
 fprintf(f, "%s\t%s\t%d\t%d\t%s\t%s\t%s\n", oldBed->name, oldBed->chrom, 
 	oldBed->chromStart, oldBed->chromEnd, newBed->name, type, note);
 }
 
 struct hash *makeAccHash(struct bed *list)
 /* Make hash keyed by accession (without version) with beds for values. */
 {
 struct bed *bed;
 struct hash *hash = hashNew(16);
 for (bed = list; bed != NULL; bed = bed->next)
     {
     char accOnly[64];
     safef(accOnly, sizeof(accOnly), bed->name);
     chopSuffix(accOnly);
     hashAdd(hash, accOnly, bed);
     }
 return hash;
 }
 
 struct hash *hashBedList(struct bed *list)
 /* Make hash keyed by bed->name with bed values. */
 {
 struct bed *bed;
 struct hash *hash = hashNew(16);
 for (bed = list; bed != NULL; bed = bed->next)
     hashAdd(hash, bed->name, bed);
 return hash;
 }
 
 void txGeneExplainUpdate2(char *oldKgFile, char *newKgFile, char *outFile)
 /* txGeneExplainUpdate2 - Make table explaining correspondence between versions of UCSC genes. */
 {
+struct bed *unmappedList = NULL;
 struct bed *oldBed, *oldList = bedLoadNAll(oldKgFile, 12);
 struct bed *bed, *newList = bedLoadNAll(newKgFile, 12);
 struct hash *newAccHash = makeAccHash(newList);
 struct hash *newAccVerHash = hashBedList(newList);
 struct hash *newKeeperHash = bedsIntoKeeperHash(newList);
+if (unmapped)
+    unmappedList = bedLoadNAll(unmapped, 12);
 FILE *f = mustOpen(outFile, "w");
 
 for (oldBed = oldList; oldBed != NULL; oldBed = oldBed->next)
     {
     if ((bed = hashFindVal(newAccVerHash, oldBed->name)) != NULL)
         fprintf(f, "%s\t%s\t%d\t%d\t%s\texact\t\n", oldBed->name, oldBed->chrom,
 		oldBed->chromStart, oldBed->chromEnd, bed->name);
     else if ((bed = findCompatible(oldBed, newAccHash)) != NULL)
         fprintf(f, "%s\t%s\t%d\t%d\t%s\tcompatible\t%s\n", oldBed->name, oldBed->chrom,
 		oldBed->chromStart, oldBed->chromEnd, bed->name, diffNote(oldBed, bed));
     else if ((bed = findMostOverlapping(oldBed, newKeeperHash)) != NULL)
 	explainOverlap(oldBed, bed, "overlap", f);
     else
         explainMissing(oldBed, "none", f);
     }
+for (oldBed = unmappedList; oldBed != NULL; oldBed = oldBed->next)
+    {
+    fprintf(f, "%s\t%s\t%d\t%d\t%s\tunmapped\tunmapped from %s\n", oldBed->name, oldBed->chrom,
+	    oldBed->chromStart, oldBed->chromEnd, "", oldAsm);
+    }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
+unmapped = optionVal("unmapped", NULL);
+oldAsm = optionVal("oldAsm", NULL);
+if (unmapped != NULL)
+    {
+    if (oldAsm == NULL)
+        errAbort("You have to specify oldAsm option to say what database unmapped come from.");
+    }
 txGeneExplainUpdate2(argv[1], argv[2], argv[3]);
 return 0;
 }