src/hg/geneBounds/clusterGenes/clusterGenes.c 83657216000f3e3ae3673d8a2a7d04feaf8cd147

83657216000f3e3ae3673d8a2a7d04feaf8cd147
markd
  Tue Dec 15 13:21:22 2020 -0800
fixed bug were clusterGenes -ignoreBases would drop small transcripts

diff --git src/hg/geneBounds/clusterGenes/clusterGenes.c src/hg/geneBounds/clusterGenes/clusterGenes.c
index 2df0e5b..442a31f 100644
--- src/hg/geneBounds/clusterGenes/clusterGenes.c
+++ src/hg/geneBounds/clusterGenes/clusterGenes.c
@@ -671,30 +671,31 @@
 /* search for conflicst in clusters and fill in the data structs */
 {
 struct cluster *cl;
 for (cl = clusters; cl != NULL; cl = cl->next)
     getClusterConflicts(cl);
 }
 
 int totalGeneCount = 0;
 int totalClusterCount = 0;
 
 struct clusterMaker
 /* Something that helps us make clusters. */
     {
     struct clusterMaker *next;	/* Next in list */
     struct dlList *clusters;	/* Doubly linked list of clusters. */
+    struct cluster *orphans;    /* not added to clusters due to gIgnoreBases */
     struct binKeeper *bk;	/* Bin-keeper that tracks exons. */
     };
 
 struct clusterMaker *clusterMakerStart(int chromSize)
 /* Allocate a new clusterMaker */
 {
 struct clusterMaker *cm;
 AllocVar(cm);
 cm->bk = binKeeperNew(0, chromSize);
 cm->clusters = dlListNew();
 return cm;
 }
 
 int nextClusterId = 1;  /* next cluster id to assign */
 
@@ -705,30 +706,31 @@
 struct cluster *clusterList = NULL, *cluster;
 struct clusterMaker *cm = *pCm;
 
 if (cm == NULL)
     errAbort("Null cluster in clusterMakerFinish");
 
 /* We build up the cluster list as a doubly-linked list
  * to make it faster to remove clusters that get merged
  * into another cluster.  At the end though we make
  * a singly-linked list out of it. */
 for (node = cm->clusters->tail; !dlStart(node); node=node->prev)
     {
     cluster = node->val;
     slAddHead(&clusterList, cluster);
     }
+clusterList = slCat(clusterList, cm->orphans);
 slSort(&clusterList, clusterCmp);
 
 if (gDetectConflicted)
     getConflicts(clusterList);
 
 /* assign ids */
 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
     cluster->id = nextClusterId++;
 
 /* Clean up and go home. */
 binKeeperFree(&cm->bk);
 dlListFree(&cm->clusters);
 freez(pCm);
 return clusterList;
 }
@@ -775,78 +777,102 @@
 return currentNode;
 }
 
 static void clusterMakerAddExon(struct clusterMaker *cm, struct clusterGene *cg,
                                 int exonStart, int exonEnd, struct dlNode **currentNodePtr)
 /* Add a gene exon to clusterMaker. */
 {
 verbose(4, "  exon %d-%d\n", exonStart, exonEnd);
 struct binElement *bList = binKeeperFind(cm->bk, exonStart, exonEnd);
 if (bList == NULL)
     *currentNodePtr = clusterMakerAddExonNoOverlap(cm, cg, exonStart, exonEnd, *currentNodePtr);
 else
     *currentNodePtr = clusterMakerAddExonOverlap(cm, cg, exonStart, exonEnd, bList, *currentNodePtr);
 }
 
-void clusterMakerAdd(struct clusterMaker *cm, struct clusterGene *cg)
-/* Add gene to clusterMaker. */
+bool clusterMakerAdd(struct clusterMaker *cm, struct clusterGene *cg)
+/* Add gene to clusterMaker, return false if not added due to gIgnoreBases */
 {
 int exonIx;
 struct dlNode *currentNode = NULL;
 
 /* Build up cluster list with aid of binKeeper.  For each exon look to see if
  * it overlaps an existing cluster.  If so put it into existing cluster,
  * otherwise make a new cluster.  The hard case is where part of the gene is
  * already in one cluster and the exon overlaps with a new cluster.  In this
  * case merge the new cluster into the old one.  If we are joining contained
  * genes, the only gene range is added as if it was a single exon. */
 
 verbose(2, "add gene: [%s] %s %d-%d\n", cg->track->name, cg->gp->name, cg->gp->txStart, cg->gp->txEnd);
+bool added = FALSE;
 if (gJoinContained)
     {
+    if (cg->effectiveStart < cg->effectiveEnd)
+        {
         clusterMakerAddExon(cm, cg, cg->effectiveStart, cg->effectiveEnd, &currentNode);
+        added = TRUE;
+        }
     }
 else
     {
     for (exonIx = 0; exonIx < cg->gp->exonCount; ++exonIx)
         {
         int exonStart, exonEnd;
         if (gpGetExon(cg, exonIx, FALSE, &exonStart, &exonEnd))
+            {
             clusterMakerAddExon(cm, cg, exonStart, exonEnd, &currentNode);
+            added = TRUE;
+            }
         }
     }
+return added;
+}
+
+void addOrphan(struct clusterMaker *cm, struct clusterGene* cg)
+/* create an orphan cluster and added it to the orphans */
+{
+struct cluster *cluster = clusterNew();
+cg->cluster = cluster;
+slSafeAddHead(&cluster->genes, cg);
+cluster->chrom = cg->gp->chrom;
+cluster->start = cg->gp->txStart;
+cluster->end = cg->gp->txEnd;
+cluster->txStart = cg->gp->txStart;
+cluster->txEnd = cg->gp->txEnd;
+slSafeAddHead(&cm->orphans, cluster);
 }
 
 void loadGenes(struct clusterMaker *cm, struct sqlConnection *conn,
                struct track* track, char *chrom, char strand)
 /* load genes into cluster from a table or file */
 {
 struct genePred *genes, *gp;
 verbose(2, "loadGenes: [%s] %s %c\n", track->name, chrom, strand);
 
 if (track->isDb)
     genes = trackTableGetGenes(track, conn, chrom, strand);
 else
     genes = trackFileGetGenes(track, chrom, strand);
 
 /* add to cluster and deletion list */
 while ((gp = slPopHead(&genes)) != NULL)
     {
     if ((!gUseCds) || (gp->cdsStart < gp->cdsEnd))
         {
         struct clusterGene* cg = clusterGeneNew(track, gp);
-        clusterMakerAdd(cm, cg);
+        if (!clusterMakerAdd(cm, cg))
+            addOrphan(cm, cg);
         ++totalGeneCount;
         }
     else
         genePredFree(&gp);
     }
 }
 
 void prConflicts(FILE *f, struct slRef* conflicts)
 /* print list of conflicts as comma-seperated list */
 {
 struct slRef* cl;
 fprintf(f, "\t");
 for (cl = conflicts; cl != NULL; cl = cl->next)
     {
     struct clusterGene *cg = cl->val;