7687cf920f88accdb12ab70c718f3add9bf88131
markd
  Wed Feb 12 09:16:51 2020 -0800
added -minOverlappingBases option to set a threshold for merging transcripts into a gene cluster

diff --git src/hg/geneBounds/clusterGenes/clusterGenes.c src/hg/geneBounds/clusterGenes/clusterGenes.c
index 9c6b1f6..a2c7b4e 100644
--- src/hg/geneBounds/clusterGenes/clusterGenes.c
+++ src/hg/geneBounds/clusterGenes/clusterGenes.c
@@ -44,57 +44,62 @@
   "    names.\n"
   "   -ignoreStrand - cluster postive and negative strand together\n"
   "   -clusterBed=bed - output BED file for each cluster.  If -cds is specified,\n"
   "    this only contains bounds based on CDS\n"
   "   -clusterTxBed=bed - output BED file for each cluster.  If -cds is specified,\n"
   "    this contains bounds based on full transcript range of genes, not just CDS\n"
   "   -flatBed=bed - output BED file that contains the exons of all genes\n"
   "    flattned into a single record. If -cds is specified, this only contains\n"
   "    bounds based on CDS\n"
   "   -joinContained - join genes that are contained within a larger loci\n"
   "    into that loci. Intended as a way to handled fragments and exon-level\n"
   "    predictsions, as genes-in-introns on the same strand are very rare.\n"
   "   -conflicted - detect conflicted loci. Conflicted loci are loci that\n"
   "    contain genes that share not sequence.  This option greatly increases\n"
   "    size of output file.\n"
+  "   -minOverlappingBases=N - require at least N bases to overlap to merge two\n"
+  "    transcripts.  This is done on a per-exon basis, not the total overlap.\n"
+  "    The default is 1.\n"
   "\n"
   "The cdsConflicts and exonConflicts columns contains `y' if the cluster has\n"
   "conficts. A conflict is a cluster where all of the genes don't share exons. \n"
   "Conflicts maybe either internal to a table or between tables.\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"chrom", OPTION_STRING|OPTION_MULTI},
    {"cds", OPTION_BOOLEAN},
    {"trackNames", OPTION_BOOLEAN},
    {"clusterBed", OPTION_STRING},
    {"clusterTxBed", OPTION_STRING},
    {"flatBed", OPTION_STRING},
    {"cds", OPTION_BOOLEAN},
    {"joinContained", OPTION_BOOLEAN},
    {"conflicted", OPTION_BOOLEAN},
    {"ignoreStrand", OPTION_BOOLEAN},
+   {"minOverlappingBases", OPTION_INT},
    {NULL, 0},
 };
 
 /* from command line  */
-boolean gUseCds;
-boolean gTrackNames;
-boolean gJoinContained = FALSE;
-boolean gDetectConflicted = FALSE;
-boolean gIgnoreStrand = FALSE;
+static boolean gUseCds;
+static boolean gTrackNames;
+static boolean gJoinContained = FALSE;
+static boolean gDetectConflicted = FALSE;
+static boolean gIgnoreStrand = FALSE;
+static int gMinOverlappingBases = 1;
 
 struct track
 /*  Object representing a track. */
 {
     struct track *next;
     char *name;             /* name to use */
     char *table;            /* table or file */
     boolean isDb;           /* is this a database table or file? */
     struct genePred *genes; /* genes read from a file, sorted by chrom and strand */
 };
 
 static int genePredStandCmp(const void *va, const void *vb)
 /* Compare to sort based on chromosome, strand, txStart, txEnd, and name.  The
  * location and name are just included to help make tests consistent. */
 {
@@ -660,100 +665,111 @@
 
 if (gDetectConflicted)
     getConflicts(clusterList);
 
 /* assign ids */
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     cluster->id = nextClusterId++;
 
 /* Clean up and go home. */
 binKeeperFree(&cm->bk);
 dlListFree(&cm->clusters);
 freez(pCm);
 return clusterList;
 }
 
-void clusterMakerAddExon(struct clusterMaker *cm, struct track *track, struct genePred *gp,
-                         int exonStart, int exonEnd, struct dlNode **oldNodePtr)
-/* Add a gene exon to clusterMaker. */
-{
-struct dlNode *oldNode = *oldNodePtr;
-struct binElement *bEl, *bList = binKeeperFind(cm->bk, exonStart, exonEnd);
-verbose(4, "  %s %d-%d\n", track->name, exonStart, exonEnd);
-if (bList == NULL)
-    {
-    if (oldNode == NULL)
+static bool overlapingBases(int exonStart, int exonEnd, struct dlNode *node)
+/* check if the overlap with cluster is over threshold  */
 {
-        struct cluster *cluster = clusterNew();
-        oldNode = dlAddValTail(cm->clusters, cluster);
-        }
-    addExon(cm->bk, oldNode, exonStart, exonEnd, track, gp);
+struct cluster *cluster = node->val;
+int maxStart = max(exonStart, cluster->txStart);
+int minEnd = min(exonEnd, cluster->txEnd);
+return (minEnd - maxStart) >= gMinOverlappingBases;
 }
-else
+
+static struct dlNode *clusterMakerAddExonMerge(struct clusterMaker *cm, int exonStart, int exonEnd, struct dlNode *currentNode)
+/* merge notes that overlap exon over threshold, return node */
 {
+struct binElement *bEl, *bList = binKeeperFind(cm->bk, exonStart, exonEnd);
 for (bEl = bList; bEl != NULL; bEl = bEl->next)
     {
     struct dlNode *newNode = bEl->val;
-        if (newNode != oldNode)
+    if (newNode != currentNode)
         {
-            if (oldNode == NULL)
+        if (currentNode == NULL)
             {
             /* Add to existing cluster. */
-                oldNode = newNode;
+            currentNode = newNode;
             }
-            else
+        else if (overlapingBases(exonStart, exonEnd, currentNode))
             {
             /* Merge new cluster into old one. */
-                verbose(3, "Merging %p %p\n", oldNode, newNode);
-                mergeClusters(cm->bk, bEl->next, oldNode, newNode);
+            verbose(3, "Merging %p %p\n", currentNode, newNode);
+            mergeClusters(cm->bk, bEl->next, currentNode, newNode);
             }
         }
     }
-    addExon(cm->bk, oldNode, exonStart, exonEnd, track, gp);
 slFreeList(&bList);
+return currentNode;
+}
+
+static void clusterMakerAddExon(struct clusterMaker *cm, struct track *track, struct genePred *gp,
+                                int exonStart, int exonEnd, struct dlNode **currentNodePtr)
+/* Add a gene exon to clusterMaker. */
+{
+verbose(4, "  %s %d-%d\n", track->name, exonStart, exonEnd);
+struct dlNode *currentNode = clusterMakerAddExonMerge(cm, exonStart, exonEnd, *currentNodePtr);
+
+if ((currentNode == NULL) || !overlapingBases(exonStart, exonEnd, currentNode))
+    {
+    struct cluster *cluster = clusterNew();
+    currentNode = dlAddValTail(cm->clusters, cluster);
+    addExon(cm->bk, currentNode, exonStart, exonEnd, track, gp);
     }
-*oldNodePtr = oldNode;
+else
+    addExon(cm->bk, currentNode, exonStart, exonEnd, track, gp);
+*currentNodePtr = currentNode;
 }
 
 void clusterMakerAdd(struct clusterMaker *cm, struct track *track, struct genePred *gp)
 /* Add gene to clusterMaker. */
 {
 int exonIx;
-struct dlNode *oldNode = NULL;
+struct dlNode *currentNode = NULL;
 
 /* Build up cluster list with aid of binKeeper.  For each exon look to see if
  * it overlaps an existing cluster.  If so put it into existing cluster,
  * otherwise make a new cluster.  The hard case is where part of the gene is
  * already in one cluster and the exon overlaps with a new cluster.  In this
  * case merge the new cluster into the old one.  If we are joining contained
  * genes, the only gene range is added as if it was a single exon. */
 
 verbose(2, "%s %s %d-%d\n", track->name, gp->name, gp->txStart, gp->txEnd);
 if (gJoinContained)
     {
     int start =(gUseCds) ? gp->cdsStart : gp->txStart;
     int end =(gUseCds) ? gp->cdsEnd : gp->txEnd;
     if (end > start) 
-        clusterMakerAddExon(cm, track, gp, start, end, &oldNode);
+        clusterMakerAddExon(cm, track, gp, start, end, &currentNode);
     }
 else
     {
     for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
         {
         int exonStart, exonEnd;
         if (gpGetExon(gp, exonIx, gUseCds, &exonStart, &exonEnd))
-            clusterMakerAddExon(cm, track, gp, exonStart, exonEnd, &oldNode);
+            clusterMakerAddExon(cm, track, gp, exonStart, exonEnd, &currentNode);
         }
     }
 }
 
 void loadGenes(struct clusterMaker *cm, struct sqlConnection *conn,
                struct track* track, char *chrom, char strand,
                struct genePred **allGenes)
 /* load genes into cluster from a table or file */
 {
 struct genePred *genes, *gp;
 verbose(2, "%s %s %c\n", track->table, chrom, strand);
 
 if (track->isDb)
     genes = trackTableGetGenes(track, conn, chrom, strand);
 else
@@ -982,28 +998,29 @@
 carefulClose(&clBedFh);
 carefulClose(&clTxBedFh);
 carefulClose(&flatBedFh);
 carefulClose(&outFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 gUseCds = optionExists("cds");
 gTrackNames = optionExists("trackNames");
 gJoinContained = optionExists("joinContained");
 gDetectConflicted = optionExists("conflicted");
 gIgnoreStrand = optionExists("ignoreStrand");
+gMinOverlappingBases = optionInt("minOverlappingBases", gMinOverlappingBases);
 if (!gTrackNames)
     {
     if (argc < 4)
         usage();
     }
 else
     {
     if ((argc < 5) || !(argc & 1))
         usage();
     }
 clusterGenes(argv[1], argv[2], argc-3, argv+3);
 return 0;
 }