dddc3258aec53ba33f27a2233816cc02a42b940d
markd
  Wed Sep 30 12:41:38 2020 -0700
refactor to simplified code in prep for fixing minOverlapping functionality

diff --git src/hg/geneBounds/clusterGenes/clusterGenes.c src/hg/geneBounds/clusterGenes/clusterGenes.c
index a2c7b4e..34c2efc 100644
--- src/hg/geneBounds/clusterGenes/clusterGenes.c
+++ src/hg/geneBounds/clusterGenes/clusterGenes.c
@@ -26,80 +26,82 @@
 {
 errAbort(
   "clusterGenes - Cluster genes from genePred tracks\n"
   "usage:\n"
   "   clusterGenes [options] outputFile database table1 ... tableN\n"
   "   clusterGenes [options] -trackNames outputFile database track1 table1 ... trackN tableN\n"
   "\n"
   "Where outputFile is a tab-separated file describing the clustering,\n"
   "database is a genome database such as mm4 or hg16,\n"
   "and the table parameters are either tables in genePred format in that\n"
   "database or genePred tab seperated files. If the input is all from files, the argument\n"
   "can be `no'.\n"
   "options:\n"
   "   -verbose=N - Print copious debugging info. 0 for none, 3 for loads\n"
   "   -chrom=chrN - Just work this chromosome, maybe repeated.\n"
-  "   -cds - cluster only on CDS exons\n"
+  "   -cds - cluster only on CDS exons, Non-coding genes are dropped.\n"
   "   -trackNames - If specified, input are pairs of track names and files.\n"
   "    This is useful when the file names don't reflact the desired track\n"
   "    names.\n"
   "   -ignoreStrand - cluster postive and negative strand together\n"
   "   -clusterBed=bed - output BED file for each cluster.  If -cds is specified,\n"
   "    this only contains bounds based on CDS\n"
   "   -clusterTxBed=bed - output BED file for each cluster.  If -cds is specified,\n"
   "    this contains bounds based on full transcript range of genes, not just CDS\n"
   "   -flatBed=bed - output BED file that contains the exons of all genes\n"
   "    flattned into a single record. If -cds is specified, this only contains\n"
   "    bounds based on CDS\n"
   "   -joinContained - join genes that are contained within a larger loci\n"
   "    into that loci. Intended as a way to handled fragments and exon-level\n"
   "    predictsions, as genes-in-introns on the same strand are very rare.\n"
   "   -conflicted - detect conflicted loci. Conflicted loci are loci that\n"
   "    contain genes that share not sequence.  This option greatly increases\n"
   "    size of output file.\n"
-  "   -minOverlappingBases=N - require at least N bases to overlap to merge two\n"
-  "    transcripts.  This is done on a per-exon basis, not the total overlap.\n"
-  "    The default is 1.\n"
+  "   -ignoreEndBases=N - ignore this many based to the start and end of each\n"
+  "    transcript when determine overlap.  This prevents small amounts of overlap\n"
+  "    from merging transcripts.  If -cds is specified, this amount of the CDS.\n"
+  "    is ignored. The default is 0.\n"
   "\n"
   "The cdsConflicts and exonConflicts columns contains `y' if the cluster has\n"
   "conficts. A conflict is a cluster where all of the genes don't share exons. \n"
   "Conflicts maybe either internal to a table or between tables.\n"
   );
 }
 
 static struct optionSpec options[] = {
    {"chrom", OPTION_STRING|OPTION_MULTI},
    {"cds", OPTION_BOOLEAN},
    {"trackNames", OPTION_BOOLEAN},
    {"clusterBed", OPTION_STRING},
    {"clusterTxBed", OPTION_STRING},
    {"flatBed", OPTION_STRING},
    {"cds", OPTION_BOOLEAN},
    {"joinContained", OPTION_BOOLEAN},
    {"conflicted", OPTION_BOOLEAN},
    {"ignoreStrand", OPTION_BOOLEAN},
-   {"minOverlappingBases", OPTION_INT},
+   {"minOverlappingBases", OPTION_INT},  /* deprecated, use ignoreEndBases */
+   {"ignoreEndBases", OPTION_INT},
    {NULL, 0},
 };
 
 /* from command line  */
 static boolean gUseCds;
 static boolean gTrackNames;
 static boolean gJoinContained = FALSE;
 static boolean gDetectConflicted = FALSE;
 static boolean gIgnoreStrand = FALSE;
-static int gMinOverlappingBases = 1;
+static int gIgnoreEndBases = 1;
 
 struct track
 /*  Object representing a track. */
 {
     struct track *next;
     char *name;             /* name to use */
     char *table;            /* table or file */
     boolean isDb;           /* is this a database table or file? */
     struct genePred *genes; /* genes read from a file, sorted by chrom and strand */
 };
 
 static int genePredStandCmp(const void *va, const void *vb)
 /* Compare to sort based on chromosome, strand, txStart, txEnd, and name.  The
  * location and name are just included to help make tests consistent. */
 {
@@ -317,43 +319,76 @@
         exonStart = gp->cdsStart;
     if (exonEnd > gp->cdsEnd)
         exonEnd = gp->cdsEnd;
     }
 *exonStartRet = exonStart;
 *exonEndRet = exonEnd;
 return exonStart < exonEnd;
 }
 
 struct clusterGene
 /* A gene in a cluster */
 {
     struct clusterGene* next;
     struct track* track;         /* aka table for gene */
     struct genePred* gp;
+    struct cluster* cluster;  /* cluster containing gene */
+    int effectiveStart;     /* start/end based on -cds and -ignoreEndBases */
+    int effectiveEnd;
     struct slRef *exonConflicts; /* list of genes in cluster that don't share
                                   * exons with this gene */
     struct slRef *cdsConflicts; /* list of genes in cluster that don't share
                                  * CDS with this gene */
 };
 
+#if 0
+static int findStartExon(struct genePred* gp)
+/* find start exon based on using cds or not */
+{
+int start = gUseCds ? gp->cdsStart : gp->txStart;
+int iExon;
+for (iExon = 0; iExon < gp->exonCount; iExon++)
+    {
+    if (start < gp->exonEnds[iExon))
+        return iExon;
+    }
+errAbort("BUG: findStartExon failed: %s", gp->name);
+return 0;
+}
+
+static int calcEffectiveStart(struct genePred* gp)
+/* calculate the genomic start position to use for overlaps */
+{
+int iExon = findStartExon(gp);
+for (iExon = 0; iExon < gp->exonCount; iExon++)
+    {
+    int possibleStart =;
+    }
+}
+#endif
+
 struct clusterGene* clusterGeneNew(struct track* track, struct genePred* gp)
 /* create a new clusterGene. */
 {
 struct clusterGene* cg;
 AllocVar(cg);
 cg->track = track;
 cg->gp = gp;
+#if 0
+cg->effectiveStart = calcEffectiveStart(gp);
+cg->effectiveEnd = calcEffectiveEnd(gp);
+#endif
 return cg;
 }
 
 void clusterGeneFree(struct clusterGene** cgp)
 /* Free a clusterGene. */
 {
 struct clusterGene* cg = *cgp;
 if (cg != NULL)
     {
     slFreeList(&cg->exonConflicts);
     slFreeList(&cg->cdsConflicts);
     freez(cgp);
     }
 }
 
@@ -438,106 +473,92 @@
     }
 *pList = NULL;
 }
 
 int clusterCmp(const void *va, const void *vb)
 /* Compare to sort based on start and end. assumes they are on the chrom */
 {
 const struct cluster *a = *((struct cluster **)va);
 const struct cluster *b = *((struct cluster **)vb);
 int dif = a->start - b->start;
 if (dif == 0)
     dif = a->end - b->end;
 return dif;
 }
 
-struct clusterGene *clusterFindGene(struct cluster *cluster, struct track *track, struct genePred *gp)
-/* search for a gene in a cluster.  */
-{
-struct clusterGene *cg;
-
-for (cg = cluster->genes; cg != NULL; cg = cg->next)
-    {
-    if ((cg->gp == gp) && (cg->track == track))
-        return cg;
-    }
-return NULL;
-}
-
 void clusterAddExon(struct cluster *cluster,
-	int start, int end, struct track *track, struct genePred *gp)
+	int start, int end, struct clusterGene *cg)
 /* Add exon to cluster. */
 {
-struct clusterGene *cg = clusterFindGene(cluster, track, gp);
-if (cg == NULL)
+assert((cg->cluster == NULL) || (cg->cluster == cluster));
+if (cg->cluster == NULL)
     {
-    cg = clusterGeneNew(track, gp);
+    cg->cluster = cluster;
     slSafeAddHead(&cluster->genes, cg);
     }
 if (cluster->start == cluster->end)
     {
-    cluster->chrom = gp->chrom;
+    cluster->chrom = cg->gp->chrom;
     cluster->start = start;
     cluster->end = end;
-    cluster->txStart = gp->txStart;
-    cluster->txEnd = gp->txEnd;
+    cluster->txStart = cg->gp->txStart;
+    cluster->txEnd = cg->gp->txEnd;
     }
 else
     {
     cluster->start = min(cluster->start, start);
     cluster->end = max(cluster->end, end);
-    cluster->txStart = min(cluster->txStart, gp->txStart);
-    cluster->txEnd = max(cluster->txEnd, gp->txEnd);
+    cluster->txStart = min(cluster->txStart, cg->gp->txStart);
+    cluster->txEnd = max(cluster->txEnd, cg->gp->txEnd);
     }
 }
 
 void addExon(struct binKeeper *bk, struct dlNode *clusterNode,
-	int start, int end, struct track *track, struct genePred *gp)
+             int start, int end, struct clusterGene *cg)
 /* Add exon to cluster and binKeeper. */
 {
-clusterAddExon(clusterNode->val, start, end, track, gp);
+clusterAddExon(clusterNode->val, start, end, cg);
 binKeeperAdd(bk, start, end, clusterNode);
 }
 
 void mergeClusters(struct binKeeper *bk, struct binElement *bkRest,
 	struct dlNode *aNode, struct dlNode *bNode)
 /* Move bNode into aNode. */
 {
 struct cluster *aCluster = aNode->val;
 struct cluster *bCluster = bNode->val;
 struct binElement *bkEl;
 
 if (verboseLevel() >= 3) 
     {
     fprintf(stderr, " a: ");
     clusterDump(aCluster);
     fprintf(stderr, " b: ");
     clusterDump(bCluster);
     }
 
 /* First change references to bNode. */
 binKeeperReplaceVal(bk, bCluster->start, bCluster->end, bNode, aNode);
 for (bkEl = bkRest; bkEl != NULL; bkEl = bkEl->next)
     if (bkEl->val == bNode) 
         bkEl->val = aNode;
 
 /* Add b's genes to a. */
-while (bCluster->genes != NULL)
-    {
 struct clusterGene *cg = bCluster->genes;
-    bCluster->genes = bCluster->genes->next;
-    cg->next = NULL;
+while ((cg = slPopHead(&bCluster->genes)) != NULL)
+    {
+    cg->cluster = aCluster;
     slSafeAddHead(&aCluster->genes, cg);
     }
 
 /* Adjust start/end. */
 if (bCluster->start < aCluster->start) 
     aCluster->start = bCluster->start;
 if (aCluster->end < bCluster->end)
     aCluster->end = bCluster->end;
 
 /* Remove all traces of bNode. */
 dlDelete(&bNode);
 clusterFree(&bCluster);
 if (verboseLevel() >= 3) 
     {
     fprintf(stderr, " ab: ");
@@ -665,135 +686,146 @@
 
 if (gDetectConflicted)
     getConflicts(clusterList);
 
 /* assign ids */
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     cluster->id = nextClusterId++;
 
 /* Clean up and go home. */
 binKeeperFree(&cm->bk);
 dlListFree(&cm->clusters);
 freez(pCm);
 return clusterList;
 }
 
-static bool overlapingBases(int exonStart, int exonEnd, struct dlNode *node)
+static bool isOverlapped(int exon1Start, int exon1End,
+                         int exon2Start, int exon2End)
+/* check if two ranges are overlapped by minimum overlap */
+{
+int maxStart = max(exon1Start, exon2Start);
+int minEnd = min(exon1End, exon2End);
+return (minEnd - maxStart) >= gIgnoreEndBases;
+}
+
+static bool overlappingCluster(int exonStart, int exonEnd, struct dlNode *node)
 /* check if the overlap with cluster is over threshold  */
 {
 struct cluster *cluster = node->val;
-int maxStart = max(exonStart, cluster->txStart);
-int minEnd = min(exonEnd, cluster->txEnd);
-return (minEnd - maxStart) >= gMinOverlappingBases;
+return isOverlapped(exonStart, exonEnd, cluster->txStart, cluster->txEnd);
 }
 
 static struct dlNode *clusterMakerAddExonMerge(struct clusterMaker *cm, int exonStart, int exonEnd, struct dlNode *currentNode)
 /* merge notes that overlap exon over threshold, return node */
 {
 struct binElement *bEl, *bList = binKeeperFind(cm->bk, exonStart, exonEnd);
 for (bEl = bList; bEl != NULL; bEl = bEl->next)
     {
     struct dlNode *newNode = bEl->val;
     if (newNode != currentNode)
         {
         if (currentNode == NULL)
             {
             /* Add to existing cluster. */
             currentNode = newNode;
             }
-        else if (overlapingBases(exonStart, exonEnd, currentNode))
+        else if (overlappingCluster(exonStart, exonEnd, currentNode))
             {
             /* Merge new cluster into old one. */
             verbose(3, "Merging %p %p\n", currentNode, newNode);
             mergeClusters(cm->bk, bEl->next, currentNode, newNode);
             }
         }
     }
 slFreeList(&bList);
 return currentNode;
 }
 
-static void clusterMakerAddExon(struct clusterMaker *cm, struct track *track, struct genePred *gp,
+static void clusterMakerAddExon(struct clusterMaker *cm, struct clusterGene *cg,
                                 int exonStart, int exonEnd, struct dlNode **currentNodePtr)
 /* Add a gene exon to clusterMaker. */
 {
-verbose(4, "  %s %d-%d\n", track->name, exonStart, exonEnd);
+verbose(4, "  %s %d-%d\n", cg->track->name, exonStart, exonEnd);
 struct dlNode *currentNode = clusterMakerAddExonMerge(cm, exonStart, exonEnd, *currentNodePtr);
 
-if ((currentNode == NULL) || !overlapingBases(exonStart, exonEnd, currentNode))
+if ((currentNode == NULL) || !overlappingCluster(exonStart, exonEnd, currentNode))
     {
     struct cluster *cluster = clusterNew();
     currentNode = dlAddValTail(cm->clusters, cluster);
-    addExon(cm->bk, currentNode, exonStart, exonEnd, track, gp);
+    addExon(cm->bk, currentNode, exonStart, exonEnd, cg);
     }
 else
-    addExon(cm->bk, currentNode, exonStart, exonEnd, track, gp);
+    addExon(cm->bk, currentNode, exonStart, exonEnd, cg);
 *currentNodePtr = currentNode;
 }
 
-void clusterMakerAdd(struct clusterMaker *cm, struct track *track, struct genePred *gp)
+void clusterMakerAdd(struct clusterMaker *cm, struct clusterGene *cg)
 /* Add gene to clusterMaker. */
 {
 int exonIx;
 struct dlNode *currentNode = NULL;
 
 /* Build up cluster list with aid of binKeeper.  For each exon look to see if
  * it overlaps an existing cluster.  If so put it into existing cluster,
  * otherwise make a new cluster.  The hard case is where part of the gene is
  * already in one cluster and the exon overlaps with a new cluster.  In this
  * case merge the new cluster into the old one.  If we are joining contained
  * genes, the only gene range is added as if it was a single exon. */
 
-verbose(2, "%s %s %d-%d\n", track->name, gp->name, gp->txStart, gp->txEnd);
+verbose(2, "%s %s %d-%d\n", cg->track->name, cg->gp->name, cg->gp->txStart, cg->gp->txEnd);
 if (gJoinContained)
     {
-    int start =(gUseCds) ? gp->cdsStart : gp->txStart;
-    int end =(gUseCds) ? gp->cdsEnd : gp->txEnd;
+    int start =(gUseCds) ? cg->gp->cdsStart : cg->gp->txStart;
+    int end =(gUseCds) ? cg->gp->cdsEnd : cg->gp->txEnd;
     if (end > start) 
-        clusterMakerAddExon(cm, track, gp, start, end, &currentNode);
+        clusterMakerAddExon(cm, cg, start, end, &currentNode);
     }
 else
     {
-    for (exonIx = 0; exonIx < gp->exonCount; ++exonIx)
+    for (exonIx = 0; exonIx < cg->gp->exonCount; ++exonIx)
         {
         int exonStart, exonEnd;
-        if (gpGetExon(gp, exonIx, gUseCds, &exonStart, &exonEnd))
-            clusterMakerAddExon(cm, track, gp, exonStart, exonEnd, &currentNode);
+        if (gpGetExon(cg->gp, exonIx, gUseCds, &exonStart, &exonEnd))
+            clusterMakerAddExon(cm, cg, exonStart, exonEnd, &currentNode);
         }
     }
 }
 
 void loadGenes(struct clusterMaker *cm, struct sqlConnection *conn,
-               struct track* track, char *chrom, char strand,
-               struct genePred **allGenes)
+               struct track* track, char *chrom, char strand)
 /* load genes into cluster from a table or file */
 {
 struct genePred *genes, *gp;
 verbose(2, "%s %s %c\n", track->table, chrom, strand);
 
 if (track->isDb)
     genes = trackTableGetGenes(track, conn, chrom, strand);
 else
     genes = trackFileGetGenes(track, chrom, strand);
 
 /* add to cluster and deletion list */
 while ((gp = slPopHead(&genes)) != NULL)
     {
-    slAddHead(allGenes, gp);
-    clusterMakerAdd(cm, track, gp);
+    if ((!gUseCds) || (gp->cdsStart < gp->cdsEnd))
+        {
+        struct clusterGene* cg = clusterGeneNew(track, gp);
+        clusterMakerAdd(cm, cg);
         ++totalGeneCount;
         }
+    else
+        genePredFree(&gp);
+    }
 }
 
 void prConflicts(FILE *f, struct slRef* conflicts)
 /* print list of conflicts as comma-seperated list */
 {
 struct slRef* cl;
 fprintf(f, "\t");
 for (cl = conflicts; cl != NULL; cl = cl->next)
     {
     struct clusterGene *cg = cl->val;
     fprintf(f, "%s:%s,", cg->track->name, cg->gp->name);
     }
 }
 
 void prGene(FILE *f, struct cluster *cluster, struct clusterGene *cg)
@@ -907,43 +939,41 @@
         if (clBedFh != NULL)
             outputBed(cluster, cluster->start, cluster->end, strand, clBedFh);
         if (clTxBedFh != NULL)
             outputBed(cluster, cluster->txStart, cluster->txEnd, strand, clTxBedFh);
         if (flatBedFh != NULL)
             outputFlatBed(cluster, strand, flatBedFh);
         }
     }
 }
 
 void clusterGenesOnStrand(struct sqlConnection *conn, struct track* tracks,
                           char *chrom, char strand, FILE *outFh,
                           FILE *clBedFh, FILE *clTxBedFh, FILE *flatBedFh)
 /* Scan through genes on this strand, cluster, and write clusters to file. */
 {
-struct genePred *gpList = NULL;
 struct cluster *clusterList = NULL;
 struct track *tr;
 int chromSize = (conn != NULL) ? hChromSize(sqlGetDatabase(conn), chrom) : 1000000000;
 struct clusterMaker *cm = clusterMakerStart(chromSize);
 
 for (tr = tracks; tr != NULL; tr = tr->next)
-    loadGenes(cm, conn, tr, chrom, strand, &gpList);
+    loadGenes(cm, conn, tr, chrom, strand);
 
 clusterList = clusterMakerFinish(&cm);
 outputClusters(clusterList, strand, outFh, clBedFh, clTxBedFh, flatBedFh);
 
-genePredFreeList(&gpList);
 clusterFreeList(&clusterList);
 }
 
 FILE *openOutput(char *outFile)
 /* open the output file and write the header */
 {
 FILE *f = mustOpen(outFile, "w");
 fputs("#"
       "cluster\t"
       "table\t"
       "gene\t"
       "chrom\t"
       "txStart\t"
       "txEnd\t"
       "strand\t"
@@ -998,29 +1028,32 @@
 carefulClose(&clBedFh);
 carefulClose(&clTxBedFh);
 carefulClose(&flatBedFh);
 carefulClose(&outFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 gUseCds = optionExists("cds");
 gTrackNames = optionExists("trackNames");
 gJoinContained = optionExists("joinContained");
 gDetectConflicted = optionExists("conflicted");
 gIgnoreStrand = optionExists("ignoreStrand");
-gMinOverlappingBases = optionInt("minOverlappingBases", gMinOverlappingBases);
+if (optionExists("ignoreEndBases"))
+    gIgnoreEndBases = optionInt("ignoreEndBases", gIgnoreEndBases);
+else
+    gIgnoreEndBases = optionInt("minOverlappingBases", gIgnoreEndBases);  /* deprecated */
 if (!gTrackNames)
     {
     if (argc < 4)
         usage();
     }
 else
     {
     if ((argc < 5) || !(argc & 1))
         usage();
     }
 clusterGenes(argv[1], argv[2], argc-3, argv+3);
 return 0;
 }