83657216000f3e3ae3673d8a2a7d04feaf8cd147 markd Tue Dec 15 13:21:22 2020 -0800 fixed bug were clusterGenes -ignoreBases would drop small transcripts diff --git src/hg/geneBounds/clusterGenes/clusterGenes.c src/hg/geneBounds/clusterGenes/clusterGenes.c index 2df0e5b..442a31f 100644 --- src/hg/geneBounds/clusterGenes/clusterGenes.c +++ src/hg/geneBounds/clusterGenes/clusterGenes.c @@ -671,30 +671,31 @@ /* search for conflicst in clusters and fill in the data structs */ { struct cluster *cl; for (cl = clusters; cl != NULL; cl = cl->next) getClusterConflicts(cl); } int totalGeneCount = 0; int totalClusterCount = 0; struct clusterMaker /* Something that helps us make clusters. */ { struct clusterMaker *next; /* Next in list */ struct dlList *clusters; /* Doubly linked list of clusters. */ + struct cluster *orphans; /* not added to clusters due to gIgnoreBases */ struct binKeeper *bk; /* Bin-keeper that tracks exons. */ }; struct clusterMaker *clusterMakerStart(int chromSize) /* Allocate a new clusterMaker */ { struct clusterMaker *cm; AllocVar(cm); cm->bk = binKeeperNew(0, chromSize); cm->clusters = dlListNew(); return cm; } int nextClusterId = 1; /* next cluster id to assign */ @@ -705,30 +706,31 @@ struct cluster *clusterList = NULL, *cluster; struct clusterMaker *cm = *pCm; if (cm == NULL) errAbort("Null cluster in clusterMakerFinish"); /* We build up the cluster list as a doubly-linked list * to make it faster to remove clusters that get merged * into another cluster. At the end though we make * a singly-linked list out of it. */ for (node = cm->clusters->tail; !dlStart(node); node=node->prev) { cluster = node->val; slAddHead(&clusterList, cluster); } +clusterList = slCat(clusterList, cm->orphans); slSort(&clusterList, clusterCmp); if (gDetectConflicted) getConflicts(clusterList); /* assign ids */ for (cluster = clusterList; cluster != NULL; cluster = cluster->next) cluster->id = nextClusterId++; /* Clean up and go home. */ binKeeperFree(&cm->bk); dlListFree(&cm->clusters); freez(pCm); return clusterList; } @@ -775,78 +777,102 @@ return currentNode; } static void clusterMakerAddExon(struct clusterMaker *cm, struct clusterGene *cg, int exonStart, int exonEnd, struct dlNode **currentNodePtr) /* Add a gene exon to clusterMaker. */ { verbose(4, " exon %d-%d\n", exonStart, exonEnd); struct binElement *bList = binKeeperFind(cm->bk, exonStart, exonEnd); if (bList == NULL) *currentNodePtr = clusterMakerAddExonNoOverlap(cm, cg, exonStart, exonEnd, *currentNodePtr); else *currentNodePtr = clusterMakerAddExonOverlap(cm, cg, exonStart, exonEnd, bList, *currentNodePtr); } -void clusterMakerAdd(struct clusterMaker *cm, struct clusterGene *cg) -/* Add gene to clusterMaker. */ +bool clusterMakerAdd(struct clusterMaker *cm, struct clusterGene *cg) +/* Add gene to clusterMaker, return false if not added due to gIgnoreBases */ { int exonIx; struct dlNode *currentNode = NULL; /* Build up cluster list with aid of binKeeper. For each exon look to see if * it overlaps an existing cluster. If so put it into existing cluster, * otherwise make a new cluster. The hard case is where part of the gene is * already in one cluster and the exon overlaps with a new cluster. In this * case merge the new cluster into the old one. If we are joining contained * genes, the only gene range is added as if it was a single exon. */ verbose(2, "add gene: [%s] %s %d-%d\n", cg->track->name, cg->gp->name, cg->gp->txStart, cg->gp->txEnd); +bool added = FALSE; if (gJoinContained) { + if (cg->effectiveStart < cg->effectiveEnd) + { clusterMakerAddExon(cm, cg, cg->effectiveStart, cg->effectiveEnd, ¤tNode); + added = TRUE; + } } else { for (exonIx = 0; exonIx < cg->gp->exonCount; ++exonIx) { int exonStart, exonEnd; if (gpGetExon(cg, exonIx, FALSE, &exonStart, &exonEnd)) + { clusterMakerAddExon(cm, cg, exonStart, exonEnd, ¤tNode); + added = TRUE; + } } } +return added; +} + +void addOrphan(struct clusterMaker *cm, struct clusterGene* cg) +/* create an orphan cluster and added it to the orphans */ +{ +struct cluster *cluster = clusterNew(); +cg->cluster = cluster; +slSafeAddHead(&cluster->genes, cg); +cluster->chrom = cg->gp->chrom; +cluster->start = cg->gp->txStart; +cluster->end = cg->gp->txEnd; +cluster->txStart = cg->gp->txStart; +cluster->txEnd = cg->gp->txEnd; +slSafeAddHead(&cm->orphans, cluster); } void loadGenes(struct clusterMaker *cm, struct sqlConnection *conn, struct track* track, char *chrom, char strand) /* load genes into cluster from a table or file */ { struct genePred *genes, *gp; verbose(2, "loadGenes: [%s] %s %c\n", track->name, chrom, strand); if (track->isDb) genes = trackTableGetGenes(track, conn, chrom, strand); else genes = trackFileGetGenes(track, chrom, strand); /* add to cluster and deletion list */ while ((gp = slPopHead(&genes)) != NULL) { if ((!gUseCds) || (gp->cdsStart < gp->cdsEnd)) { struct clusterGene* cg = clusterGeneNew(track, gp); - clusterMakerAdd(cm, cg); + if (!clusterMakerAdd(cm, cg)) + addOrphan(cm, cg); ++totalGeneCount; } else genePredFree(&gp); } } void prConflicts(FILE *f, struct slRef* conflicts) /* print list of conflicts as comma-seperated list */ { struct slRef* cl; fprintf(f, "\t"); for (cl = conflicts; cl != NULL; cl = cl->next) { struct clusterGene *cg = cl->val;