src/hg/utils/gapToLift/gapToLift.c 1.8

1.8 2009/08/10 20:58:59 hiram
Properly output lift and bed lines for chroms that have no gaps
Index: src/hg/utils/gapToLift/gapToLift.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/gapToLift/gapToLift.c,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/utils/gapToLift/gapToLift.c	9 Mar 2009 16:43:40 -0000	1.7
+++ src/hg/utils/gapToLift/gapToLift.c	10 Aug 2009 20:58:59 -0000	1.8
@@ -1,260 +1,277 @@
 /* gapToLift - create lift file from gap table(s). */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "chromInfo.h"
 #include "agpGap.h"
 #include "hdb.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "gapToLift - create lift file from gap table(s)\n"
   "usage:\n"
   "   gapToLift [options] db liftFile.lft\n"
   "       uses gap table(s) from specified db.  Writes to liftFile.lft\n"
   "       generates lift file segements separated by non-bridged gaps.\n"
   "options:\n"
   "   -chr=chrN - work only on given chrom\n"
   "   -insane - do *not* perform coordinate sanity checks on gaps\n"
-  "   -bedFile=fileName.bed - output segments to fileName.bed"
+  "   -bedFile=fileName.bed - output segments to fileName.bed\n"
+  "   -verbose=N - N > 1 see more information about procedure"
   );
 }
 
 /* options */
 static char *workChr = NULL;	/* work only on this given chrom name */
 static boolean insane = FALSE;	/* TRUE do not perform sanity checks on gaps */
 static FILE *bedFile = NULL; /* when requested, output segments to bed file */
 static char *bedFileName = NULL; /* output to bedFileName name */
 
 
 static struct optionSpec options[] = {
    {"chr", OPTION_STRING},
    {"insane", OPTION_BOOLEAN},
    {"bedFile", OPTION_STRING},
    {NULL, 0},
 };
 
 static struct hash *cInfoHash = NULL;
 
 static struct chromInfo *loadChromInfo(struct sqlConnection *conn)
 {
 struct chromInfo *ret = NULL;
 char **row;
 char query[256];
 int chromCount = 0;
 
 cInfoHash = newHash(0);
 
 if (workChr)
     safef(query, ArraySize(query), "SELECT * FROM chromInfo WHERE "
 	"chrom='%s' ORDER BY chrom DESC", workChr);
 else
     safef(query, ArraySize(query),
 	"SELECT * FROM chromInfo ORDER BY chrom DESC");
 
 struct sqlResult *sr = sqlGetResult(conn, query);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct chromInfo *el;
     struct chromInfo *ci;
     AllocVar(ci);
     el = chromInfoLoad(row);
     ci->chrom = cloneString(el->chrom);
     ci->size = el->size;
     slAddHead(&ret, ci);
     hashAddInt(cInfoHash, el->chrom, el->size);
     ++chromCount;
     }
 sqlFreeResult(&sr);
 verbose(2,"#\tchrom count: %d\n", chromCount);
 return (ret);
 }
 
 static void gapSanityCheck(struct agpGap *gapList)
 {
 int prevEnd = 0;
 int prevStart = 0;
 char *prevChr = NULL;
 char *prevType = NULL;
 struct agpGap *gap;
 
 for (gap = gapList; gap; gap = gap->next)
     {
     int chrSize = hashIntVal(cInfoHash, gap->chrom);
     if (gap->chromStart < 0)
 	verbose(1, "WARNING: gap chromStart < 0 at %s:%d-%d\n",
 		gap->chrom, gap->chromStart, gap->chromEnd);
     if (gap->chromEnd > chrSize)
 	verbose(1, "WARNING: gap chromEnd > chromSize(%d) "
 	    "at %s:%d-%d\n", chrSize, gap->chrom,
 		gap->chromStart, gap->chromEnd);
     if (gap->chromEnd == chrSize && differentString(gap->type, "telomere"))
 	verbose(1, "WARNING: gap at end of chromosome not telomere "
 	    "at %s:%d-%d, type: %s\n", gap->chrom,
 		gap->chromStart, gap->chromEnd, gap->type);
     if (gap->chromStart >= gap->chromEnd)
 	verbose(1, "WARNING: gap chromStart >= chromEnd at %s:%d-%d\n",
 	    gap->chrom, gap->chromStart, gap->chromEnd);
     if (prevEnd > 0)
 	{
 	if (sameWord(prevChr, gap->chrom) &&
 		(prevEnd >= gap->chromStart))
 	    verbose(1,"WARNING: overlapping gap at "
 		"%s:%d-%d(%s) and %s:%d-%d(%s)\n",
 		    gap->chrom, prevStart, prevEnd, prevType, gap->chrom,
 			gap->chromStart, gap->chromEnd, gap->type);
 	}
     else
 	{
 	prevStart = gap->chromStart;
 	prevEnd = gap->chromEnd;
 	prevType = gap->type;
 	}
     if (isNotEmpty(prevChr))
 	{
 	if (differentWord(prevChr, gap->chrom))
 	    {
 	    freeMem(prevChr);
 	    prevChr = cloneString(gap->chrom);
 	    }
 	}
     else
 	prevChr = cloneString(gap->chrom);
     prevStart = gap->chromStart;
     prevEnd = gap->chromEnd;
     }
 }
 
 static struct agpGap *loadAllGaps(struct sqlConnection *conn,
 	char *db, struct chromInfo *cInfoList)
 /*	fetch all gaps, returns list of gaps */
 { 
 struct agpGap *gapList = NULL;
 struct chromInfo *cInfo;
 int gapCount = 0;
 
 for (cInfo = cInfoList; cInfo; cInfo = cInfo->next)
     {
     char **row;
     int rowOffset;
     struct sqlResult *sr = hRangeQuery(conn, "gap", cInfo->chrom, 0,
 	cInfo->size, NULL, &rowOffset);
     while ((row = sqlNextRow(sr)) != NULL)
 	{
 	struct agpGap *gap = agpGapLoad(row+rowOffset);
 	slAddHead(&gapList, gap);
 	++gapCount;
 	}
     sqlFreeResult(&sr);
     }
 slSort(&gapList, bedCmp);
 if (! insane)
     gapSanityCheck(gapList);
 verbose(2,"#\tfound %d gaps\n", gapCount);
 return (gapList);
 }
 
 static int liftOutLine(FILE *out, char *chr, int start, int end,
     int count, int chrSize)
 {
 if ((end-start) > 0)
     {
     fprintf(out, "%d\t%s.%02d\t%d\t%s\t%d\n", start, chr,
 	count, end-start, chr, chrSize);
     if (bedFile)
 	fprintf(bedFile, "%s\t%d\t%d\t%s.%02d\n", chr, start, end, chr, count);
     count += 1;
     }
 return count;
 }
 
 static void gapToLift(char *db, char *outFile)
 /* gapToLift - create lift file from gap table(s). */
 {
 FILE *out = mustOpen(outFile, "w");
 struct sqlConnection *conn = sqlConnect(db);
 struct chromInfo *cInfoList = loadChromInfo(conn);
 struct agpGap *gapList = loadAllGaps(conn, db, cInfoList);
 struct agpGap *gap;
 int start = 0;
 int end = 0;
 char *prevChr = NULL;
 int liftCount = 0;
 int chrSize = 0;
+static struct hash *chrDone = NULL;
+
+chrDone = newHash(0);
 
 if (isNotEmpty(bedFileName))
     {
     bedFile = mustOpen(bedFileName, "w");
     verbose(2,"#\tbed output requested to %s\n", bedFileName);
     }
 
 for (gap = gapList; gap; gap = gap->next)
     {
     verbose(3,"#\t%s\t%d\t%d\t%s\n", gap->chrom, gap->chromStart,
 	gap->chromEnd, gap->bridge);
 
     if (prevChr && sameWord(prevChr, gap->chrom))
 	{	/* continuing same segment, check for gap break,
 		 *	or gap at end of chrom */
 	if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize))
 	    {
 	    end = gap->chromStart;
 	    liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize);
 	    start = gap->chromEnd;
 	    end = start;
 	    }
 	else
 	    end = gap->chromEnd;
 	}
     else	/* new chrom encountered */
 	{ /* output last segment of previous chrom when necessary */
 	if (prevChr && differentWord(prevChr, gap->chrom))
 	    {
 	    if (end < chrSize)
 		liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize);
 	    }
 	liftCount = 0;
 	chrSize = hashIntVal(cInfoHash, gap->chrom);
+	hashAddInt(chrDone, gap->chrom, 1);
 	if (gap->chromStart > 0)
 	    {	/* starting first segment at position 0 */
 	    start = 0;
 	    end = gap->chromStart;
 	    /* does the first gap break it ?  Or gap goes to end of chrom. */
 	    if (sameWord("no",gap->bridge) || (gap->chromEnd == chrSize))
 		{
 		liftCount = liftOutLine(out, gap->chrom, start, end, liftCount, chrSize);
 		start = gap->chromEnd;
 		end = start;
 		}
 	    }
 	else	/* first gap is actually the beginning of the chrom */
 	    {	/* thus, first segment starts after this first gap */
 	    start = gap->chromEnd;
 	    end = start;
 	    }
 	}
     prevChr = gap->chrom;	/* remember prev chrom to detect next chrom */
     }
 /* potentially a last one */
 if (end < chrSize)
     liftCount = liftOutLine(out, prevChr, start, chrSize, liftCount, chrSize);
+/* check that all chroms have been used */
+struct hashCookie cookie = hashFirst(cInfoHash);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    if (NULL == hashLookup(chrDone, hel->name))
+	{
+	chrSize = hashIntVal(cInfoHash, hel->name);
+	verbose(2, "#\tno gaps on chrom: %s, size: %d\n", hel->name, chrSize);
+	liftCount = liftOutLine(out, hel->name, 0, chrSize, 0, chrSize);
+	}
+    }
 carefulClose(&out);
 sqlDisconnect(&conn);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 workChr = optionVal("chr", NULL);
 bedFileName = optionVal("bedFile", NULL);
 insane = optionExists("insane");
 gapToLift(argv[1], argv[2]);
 return 0;
 }