src/hg/txGraph/txBedToGraph/txBedToGraph.c be4311c07e14feb728abc6425ee606ffaa611a58

be4311c07e14feb728abc6425ee606ffaa611a58
markd
  Fri Jan 22 06:46:58 2021 -0800
merge with master

diff --git src/hg/txGraph/txBedToGraph/txBedToGraph.c src/hg/txGraph/txBedToGraph/txBedToGraph.c
index c80aa9a..0500697 100644
--- src/hg/txGraph/txBedToGraph/txBedToGraph.c
+++ src/hg/txGraph/txBedToGraph/txBedToGraph.c
@@ -1,384 +1,385 @@
 /* txBedToGraph - Cluster together beds from txPslToBed. Make transcript graphs out of clusters.
  * The overall flow  of the algorithm is:
  *   Read in input, and separate it groups by strand and chromosome.
  *   For each chromosome strand:
  *       Cluster input that overlaps at the exon level.
  *       Turn each cluster into a graph.
  * This module handles i/o and clustering.  The makeGraph module
  * handles the graph building. */
 
 /* Copyright (C) 2008 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "ggTypes.h"
 #include "bed.h"
 #include "dnautil.h"
 #include "rangeTree.h"
 #include "binRange.h"
 #include "txGraph.h"
 #include "nibTwo.h"
 #include "txBedToGraph.h"
 
 int maxJoinSize = 70000;	/* This excludes most of the chr14 IG mess */
 boolean forceRefSeqJoin = TRUE;
 int maxBleedOver = 6;
 int maxUncheckedBleed = 3;
 struct nibTwoCache *seqCache = NULL;
 char *prefix = "a";
 double singleExonMaxOverlap = 0.60;
 
 boolean trustedSource(char *sourceType)
 /* Return TRUE source type is trusted (refSeq or something similar). */ 
 {
 return sameString(sourceType, "refSeq") || sameString(sourceType, "ccds");
 }
 
 boolean noCutSource(char *sourceType)
 /* Return TRUE if source is not to be truncated during snap to operation. */
 {
 return sameString(sourceType, "ccds");
 }
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "txBedToGraph - Cluster together beds from txPslToBed. Make transcript graphs out of clusters.\n"
   "   txBedToGraph in1.bed in1Type [in2.bed in2type ...]  out.txg\n"
   "options:\n"
   "    -maxJoinSize=N - Maximum size of gap to join for introns with\n"
   "                    noisy ends. Default %d.\n"
   "    -noForceRefSeqJoin - If set don't force joins above maxJoinSize\n"
   "                    on refSeq type\n"
   "    -maxBleedOver=N - Maximum amount of exon that can be lost when snapping\n"
   "                    soft to hard edges. Default %d\n"
   "    -maxUncheckedBleed=N - Maximum amount of exon that can be lost when\n"
   "                    snapping soft to hard edges without checking nucleotide\n"
   "                    match. Only used checkSeq is set.  Default %d\n"
   "    -checkSeq=file.2bit - If true check sequence when snapping off ends. Can use nib\n"
   "                    dir or two bit file.\n"
   "    -prefix=xyz - Use the given prefix for the graph names, default %s\n"
   "    -singleExonMaxOverlap=0.N - Maximum ratio of single exon that can overlap\n"
   "                                a multi-exon gene.  Default %g\n"
   , maxJoinSize, maxBleedOver, maxUncheckedBleed, prefix, singleExonMaxOverlap
   );
 }
 
 static struct optionSpec options[] = {
    {"maxJoinSize", OPTION_INT},
    {"noForceRefSeqJoin", OPTION_BOOLEAN},
    {"maxBleedOver", OPTION_INT},
    {"maxUncheckedBleed", OPTION_INT},
    {"checkSeq", OPTION_STRING},
    {"prefix", OPTION_STRING},
    {"singleExonMaxOverlap", OPTION_FLOAT},
    {NULL, 0},
 };
 
 
 int linkedBedsCmpChromAndStrand(const void *va, const void *vb)
 /* Compare to sort based on chrom,strand,chromStart. */
 {
 const struct linkedBeds *a = *((struct linkedBeds **)va);
 const struct linkedBeds *b = *((struct linkedBeds **)vb);
 struct bed *aBed = a->bedList, *bBed = b->bedList;
 int dif;
 dif = strcmp(aBed->chrom, bBed->chrom);
 if (dif == 0)
     dif = strcmp(aBed->strand, bBed->strand);
 if (dif == 0)
     dif = a->chromStart - b->chromStart;
 return dif;
 }
 
 struct linkedBeds *linkedBedsLoad(char *fileName, char *sourceType)
 /* Read a bed file and turn it into a list of linked beds. */
 {
 struct bed *bedList = bedLoadNAll(fileName, 12);
 struct bed *startBed, *bed, *endBed;
 struct linkedBeds *lb, *lbList = NULL;
 boolean bigGapOk = (sameString(sourceType, "refSeq") && forceRefSeqJoin);
 for (startBed = bedList; startBed != NULL; startBed = endBed)
     {
     /* Figure out first bed with different name, or that otherwise can't be
      * a continuation of previous bed. */
     struct bed *lastBed = startBed;
     for (bed = startBed->next; bed != NULL; bed = bed->next)
         {
 	if ( lastBed->chromEnd > bed->chromStart || !sameString(lastBed->name, bed->name) 
 		|| bed->strand[0] != lastBed->strand[0] 
 		|| !sameString(bed->chrom, lastBed->chrom) )
 	    {
 	    break;
 	    }
 	if (!bigGapOk)
 	    {
 	    break;
 	    }
 	lastBed = bed;
 	}
     lastBed->next = NULL;
     endBed = bed;
 
     /* Create linkedBed and hang it on list. */
     AllocVar(lb);
     lb->bedList = startBed;
     lb->sourceType = sourceType;
     lb->chromStart = startBed->chromStart;
     lb->chromEnd = lastBed->chromEnd;
     slAddHead(&lbList, lb);
     }
 slReverse(&lbList);
 return lbList;
 }
 
 struct lbCluster
 /* A cluster of overlapping (at the block level on the same strand)
  * linkedBeds. */
     {
     struct lbCluster *next;
     int chromStart,chromEnd;	/* Bounds of cluster. */
     struct linkedBeds *lbList;	/* Contents of cluster. */
     struct rbTree *exonTree;	/* Tree just used during creation. */
     };
 
 void lbClusterFree(struct lbCluster **pCluster)
 /* Free up memory associated with cluster. */
 {
 /* Note this doesn't free up the linkedBeds. The program
  * has to have all the linkedBeds in memory at one point
  * anyway, and I'm a little lazy right now. */
 struct lbCluster *cluster = *pCluster;
 if (cluster != NULL)
     {
     freeMem(cluster->exonTree);
     freez(pCluster);
     }
 }
 
 struct lbCluster *lbClusterNew(struct linkedBeds *lb, 
 	struct lm *lm, struct rbTreeNode *rbStack[128])
 /* Create cluster around a single alignment. */
 {
 /* Allocate and fill in basic fields. */
 struct lbCluster *cluster;
 AllocVar(cluster);
 cluster->chromStart = lb->chromStart;
 cluster->chromEnd = lb->chromEnd;
 cluster->lbList = lb;
 lb->next = NULL;
 
 /* Fill in range tree with exons. */
 cluster->exonTree = rangeTreeNewDetailed(lm, rbStack);
 struct bed *bed;
 for (bed = lb->bedList; bed != NULL; bed = bed->next)
     {
     int block, blockCount = bed->blockCount;
     for (block = 0; block < blockCount; ++block)
         {
 	int start = bed->chromStart + bed->chromStarts[block];
 	int end = start + bed->blockSizes[block];
 	rangeTreeAdd(cluster->exonTree, start, end);
 	}
     }
 return cluster;
 }
 
 void lbClusterMerge(struct lbCluster *a, struct lbCluster **pB)
 /* Merge b into a.  Destroys b. */
 {
 struct lbCluster *b = *pB;
 a->lbList = slCat(a->lbList, b->lbList);
 b->lbList = NULL;
 a->chromStart = min(a->chromStart, b->chromStart);
 a->chromEnd = max(a->chromEnd, b->chromEnd);
 struct range *range;
 for (range = rangeTreeList(b->exonTree); range != NULL; range = range->next)
     rangeTreeAdd(a->exonTree, range->start, range->end);
 lbClusterFree(pB);
 }
 
 boolean lbIntersectsCluster(struct lbCluster *cluster, struct linkedBeds *lb)
 /* Return TRUE if any block in psl intersects with cluster. */
 {
 struct bed *bed;
 for (bed = lb->bedList; bed != NULL; bed = bed->next)
     {
     int block, blockCount = bed->blockCount;
     for (block = 0; block < blockCount; ++block)
 	{
 	struct range tempR;
 	int start = bed->chromStarts[block] + bed->chromStart;
 	int end = start + bed->blockSizes[block];
 	tempR.start = start;
 	tempR.end = end;
 	if (rbTreeFind(cluster->exonTree, &tempR))
 	    return TRUE;
 	}
     }
 return FALSE;
 }
 
 void lbIntoBinsOfClusters(struct linkedBeds *lb, struct binKeeper *bins, 
 	struct lm *lm, struct rbTreeNode *rbStack[128])
 /* Add lb into a binKeeper full of lbClusters.  This will create and merge 
  * clusters as need be. */
 {
 /* Create new cluster around lb. */
 struct lbCluster *newCluster = lbClusterNew(lb, lm, rbStack);
 
 /* Merge in any existing overlapping clusters.. */
 struct binElement *bel, *belList = binKeeperFind(bins, lb->chromStart, lb->chromEnd);
 for (bel = belList; bel != NULL; bel = bel->next)
     {
     struct lbCluster *oldCluster = bel->val;
     if (lbIntersectsCluster(oldCluster, lb))
 	{
 	binKeeperRemove(bins, oldCluster->chromStart, oldCluster->chromEnd, oldCluster);
 	lbClusterMerge(oldCluster, &newCluster);
 	newCluster = oldCluster;
 	}
     }
 slFreeList(&belList);
 
 /* Add to binKeeper. */
 binKeeperAdd(bins, newCluster->chromStart, newCluster->chromEnd, newCluster);
 }
 
 struct lbCluster *clusterOneStrand(struct linkedBeds *lbList, int strandSizeLimit)
 /* Make clusters of overlapping beds on a single chromosome. 
  * Return a list of such clusters. */
 {
 /* Set up local memory pool and stack for all the rangeTrees we'll be using. */
 struct lm *lm = lmInit(0);
 struct rbTreeNode *rbStack[128];
 
 /* Create binKeeper full of clusters. */
 struct binKeeper *bins = binKeeperNew(0, strandSizeLimit);
 struct linkedBeds *lb, *nextLb;
 for (lb = lbList; lb != NULL; lb = nextLb)
     {
     nextLb = lb->next;
     lbIntoBinsOfClusters(lb, bins, lm, rbStack);
     }
 
 /* Convert from binKeeper of clusters to simple list of clusters. */
 struct lbCluster *clusterList = NULL;
 struct binElement *binList, *bin;
 binList = binKeeperFindAll(bins);
 for (bin = binList; bin != NULL; bin = bin->next)
     {
     struct lbCluster *cluster = bin->val;
     slAddHead(&clusterList, cluster);
     }
 slReverse(&clusterList);
 
 /* Clean up and go home. */
 struct lbCluster *cluster;
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     freez(&cluster->exonTree);
 lmCleanup(&lm);
 binKeeperFree(&bins);
 return clusterList;
 }
 
 struct txGraph *graphOneStrand(struct linkedBeds *lbList, int strandSizeLimit)
 /* Create a list of graphs based on lbList, which is already
  * sorted and restricted to a single strand of a single chromosome. 
  * The interval 0 to strandSizeLimit needs to encompass all the 
  * exons. */
 {
 struct lbCluster *clusterList = clusterOneStrand(lbList, strandSizeLimit);
 struct lbCluster *cluster, *nextCluster;
 struct txGraph *graphList = NULL;
 for (cluster = clusterList; cluster != NULL; cluster = nextCluster)
     {
     char name[128];
     static int id=0;
     nextCluster = cluster->next;
     safef(name, sizeof(name), "%s%d", prefix, ++id);
     verbose(2, "Got cluster of %d called %s.\n", slCount(cluster->lbList), name);
     struct txGraph *graph = makeGraph(cluster->lbList, maxBleedOver, maxUncheckedBleed, 
     	seqCache, singleExonMaxOverlap, name);
+    if (graph != NULL)
 	slAddHead(&graphList, graph);
     lbClusterFree(&cluster);
     }
 slReverse(&graphList);
 return graphList;
 }
 
 void txBedToGraph(char *outGraph, char **inPairs, int inCount)
 /* txBedToGraph - Cluster together beds from txPslToBed. Make transcript graphs out of clusters.. */
 {
 /* Load all bed files and wrap them in sourcedBeds. */
 struct linkedBeds *lbList = NULL;
 int i;
 for (i=0; i<inCount; ++i)
     {
     char *inBed = *inPairs++;
     char *type = *inPairs++;
     verbose(2, "Loading %s\n", inBed);
     lbList = slCat(lbList, linkedBedsLoad(inBed, type));
     }
 verbose(2, "Created %d linkedBeds\n", slCount(lbList));
 
 /* Sort and set up loop that processes all input on the same strand
  * of the same chromosome at once. */
 slSort(&lbList, linkedBedsCmpChromAndStrand);
 struct linkedBeds *strandStart, *strandEnd = NULL;
 FILE *f = mustOpen(outGraph, "w");
 for (strandStart = lbList; strandStart != NULL; strandStart = strandEnd)
     {
     /* Find chromosome end. */
     char *chromName = strandStart->bedList->chrom;
     char strand = strandStart->bedList->strand[0];
     struct linkedBeds *lb, *dummy, **endAddress = &dummy;
     int strandSizeLimit = 0;
     for (lb = strandStart; lb != NULL; lb = lb->next)
         {
 	if (!sameString(lb->bedList->chrom, chromName))
 	    break;
 	if (lb->bedList->strand[0] != strand)
 	    break;
 	endAddress = &lb->next;
 	strandSizeLimit = max(strandSizeLimit, lb->chromEnd);
 	}
     strandEnd = lb;
 
     /* Terminate list before next chromosome */
     *endAddress = NULL;
 
     /* Create and write and free graphs. */
     verbose(2, "creating graphs on %s %s\n", 
     	strandStart->bedList->chrom, strandStart->bedList->strand); 
     struct txGraph *graph, *graphList = graphOneStrand(strandStart, strandSizeLimit);
     for (graph = graphList; graph != NULL; graph = graph->next)
         txGraphTabOut(graph, f);
     txGraphFreeList(&graphList);
     }
 carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 // pushCarefulMemHandler(500000000);
 dnaUtilOpen();
 optionInit(&argc, argv, options);
 maxJoinSize = optionInt("maxJoinSize", maxJoinSize);
 forceRefSeqJoin = !optionExists("noForceRefSeqJoin");
 maxBleedOver = optionInt("maxBleedOver", maxBleedOver);
 singleExonMaxOverlap = optionDouble("singleExonMaxOverlap", singleExonMaxOverlap);
 maxUncheckedBleed = optionInt("maxUncheckedBleed", maxUncheckedBleed);
 if (optionExists("checkSeq"))
    seqCache = nibTwoCacheNew(optionVal("checkSeq", NULL));
 prefix = optionVal("prefix", prefix);
 if (argc < 4 || argc%2 != 0)
     usage();
 txBedToGraph(argv[argc-1], argv+1, argc/2-1);
 return 0;
 }