7687cf920f88accdb12ab70c718f3add9bf88131 markd Wed Feb 12 09:16:51 2020 -0800 added -minOverlappingBases option to set a threshold for merging transcripts into a gene cluster diff --git src/hg/geneBounds/clusterGenes/clusterGenes.c src/hg/geneBounds/clusterGenes/clusterGenes.c index 9c6b1f6..a2c7b4e 100644 --- src/hg/geneBounds/clusterGenes/clusterGenes.c +++ src/hg/geneBounds/clusterGenes/clusterGenes.c @@ -44,57 +44,62 @@ " names.\n" " -ignoreStrand - cluster postive and negative strand together\n" " -clusterBed=bed - output BED file for each cluster. If -cds is specified,\n" " this only contains bounds based on CDS\n" " -clusterTxBed=bed - output BED file for each cluster. If -cds is specified,\n" " this contains bounds based on full transcript range of genes, not just CDS\n" " -flatBed=bed - output BED file that contains the exons of all genes\n" " flattned into a single record. If -cds is specified, this only contains\n" " bounds based on CDS\n" " -joinContained - join genes that are contained within a larger loci\n" " into that loci. Intended as a way to handled fragments and exon-level\n" " predictsions, as genes-in-introns on the same strand are very rare.\n" " -conflicted - detect conflicted loci. Conflicted loci are loci that\n" " contain genes that share not sequence. This option greatly increases\n" " size of output file.\n" + " -minOverlappingBases=N - require at least N bases to overlap to merge two\n" + " transcripts. This is done on a per-exon basis, not the total overlap.\n" + " The default is 1.\n" "\n" "The cdsConflicts and exonConflicts columns contains `y' if the cluster has\n" "conficts. A conflict is a cluster where all of the genes don't share exons. \n" "Conflicts maybe either internal to a table or between tables.\n" ); } static struct optionSpec options[] = { {"chrom", OPTION_STRING|OPTION_MULTI}, {"cds", OPTION_BOOLEAN}, {"trackNames", OPTION_BOOLEAN}, {"clusterBed", OPTION_STRING}, {"clusterTxBed", OPTION_STRING}, {"flatBed", OPTION_STRING}, {"cds", OPTION_BOOLEAN}, {"joinContained", OPTION_BOOLEAN}, {"conflicted", OPTION_BOOLEAN}, {"ignoreStrand", OPTION_BOOLEAN}, + {"minOverlappingBases", OPTION_INT}, {NULL, 0}, }; /* from command line */ -boolean gUseCds; -boolean gTrackNames; -boolean gJoinContained = FALSE; -boolean gDetectConflicted = FALSE; -boolean gIgnoreStrand = FALSE; +static boolean gUseCds; +static boolean gTrackNames; +static boolean gJoinContained = FALSE; +static boolean gDetectConflicted = FALSE; +static boolean gIgnoreStrand = FALSE; +static int gMinOverlappingBases = 1; struct track /* Object representing a track. */ { struct track *next; char *name; /* name to use */ char *table; /* table or file */ boolean isDb; /* is this a database table or file? */ struct genePred *genes; /* genes read from a file, sorted by chrom and strand */ }; static int genePredStandCmp(const void *va, const void *vb) /* Compare to sort based on chromosome, strand, txStart, txEnd, and name. The * location and name are just included to help make tests consistent. */ { @@ -660,100 +665,111 @@ if (gDetectConflicted) getConflicts(clusterList); /* assign ids */ for (cluster = clusterList; cluster != NULL; cluster = cluster->next) cluster->id = nextClusterId++; /* Clean up and go home. */ binKeeperFree(&cm->bk); dlListFree(&cm->clusters); freez(pCm); return clusterList; } -void clusterMakerAddExon(struct clusterMaker *cm, struct track *track, struct genePred *gp, - int exonStart, int exonEnd, struct dlNode **oldNodePtr) -/* Add a gene exon to clusterMaker. */ -{ -struct dlNode *oldNode = *oldNodePtr; -struct binElement *bEl, *bList = binKeeperFind(cm->bk, exonStart, exonEnd); -verbose(4, " %s %d-%d\n", track->name, exonStart, exonEnd); -if (bList == NULL) - { - if (oldNode == NULL) +static bool overlapingBases(int exonStart, int exonEnd, struct dlNode *node) +/* check if the overlap with cluster is over threshold */ { - struct cluster *cluster = clusterNew(); - oldNode = dlAddValTail(cm->clusters, cluster); - } - addExon(cm->bk, oldNode, exonStart, exonEnd, track, gp); +struct cluster *cluster = node->val; +int maxStart = max(exonStart, cluster->txStart); +int minEnd = min(exonEnd, cluster->txEnd); +return (minEnd - maxStart) >= gMinOverlappingBases; } -else + +static struct dlNode *clusterMakerAddExonMerge(struct clusterMaker *cm, int exonStart, int exonEnd, struct dlNode *currentNode) +/* merge notes that overlap exon over threshold, return node */ { +struct binElement *bEl, *bList = binKeeperFind(cm->bk, exonStart, exonEnd); for (bEl = bList; bEl != NULL; bEl = bEl->next) { struct dlNode *newNode = bEl->val; - if (newNode != oldNode) + if (newNode != currentNode) { - if (oldNode == NULL) + if (currentNode == NULL) { /* Add to existing cluster. */ - oldNode = newNode; + currentNode = newNode; } - else + else if (overlapingBases(exonStart, exonEnd, currentNode)) { /* Merge new cluster into old one. */ - verbose(3, "Merging %p %p\n", oldNode, newNode); - mergeClusters(cm->bk, bEl->next, oldNode, newNode); + verbose(3, "Merging %p %p\n", currentNode, newNode); + mergeClusters(cm->bk, bEl->next, currentNode, newNode); } } } - addExon(cm->bk, oldNode, exonStart, exonEnd, track, gp); slFreeList(&bList); +return currentNode; +} + +static void clusterMakerAddExon(struct clusterMaker *cm, struct track *track, struct genePred *gp, + int exonStart, int exonEnd, struct dlNode **currentNodePtr) +/* Add a gene exon to clusterMaker. */ +{ +verbose(4, " %s %d-%d\n", track->name, exonStart, exonEnd); +struct dlNode *currentNode = clusterMakerAddExonMerge(cm, exonStart, exonEnd, *currentNodePtr); + +if ((currentNode == NULL) || !overlapingBases(exonStart, exonEnd, currentNode)) + { + struct cluster *cluster = clusterNew(); + currentNode = dlAddValTail(cm->clusters, cluster); + addExon(cm->bk, currentNode, exonStart, exonEnd, track, gp); } -*oldNodePtr = oldNode; +else + addExon(cm->bk, currentNode, exonStart, exonEnd, track, gp); +*currentNodePtr = currentNode; } void clusterMakerAdd(struct clusterMaker *cm, struct track *track, struct genePred *gp) /* Add gene to clusterMaker. */ { int exonIx; -struct dlNode *oldNode = NULL; +struct dlNode *currentNode = NULL; /* Build up cluster list with aid of binKeeper. For each exon look to see if * it overlaps an existing cluster. If so put it into existing cluster, * otherwise make a new cluster. The hard case is where part of the gene is * already in one cluster and the exon overlaps with a new cluster. In this * case merge the new cluster into the old one. If we are joining contained * genes, the only gene range is added as if it was a single exon. */ verbose(2, "%s %s %d-%d\n", track->name, gp->name, gp->txStart, gp->txEnd); if (gJoinContained) { int start =(gUseCds) ? gp->cdsStart : gp->txStart; int end =(gUseCds) ? gp->cdsEnd : gp->txEnd; if (end > start) - clusterMakerAddExon(cm, track, gp, start, end, &oldNode); + clusterMakerAddExon(cm, track, gp, start, end, ¤tNode); } else { for (exonIx = 0; exonIx < gp->exonCount; ++exonIx) { int exonStart, exonEnd; if (gpGetExon(gp, exonIx, gUseCds, &exonStart, &exonEnd)) - clusterMakerAddExon(cm, track, gp, exonStart, exonEnd, &oldNode); + clusterMakerAddExon(cm, track, gp, exonStart, exonEnd, ¤tNode); } } } void loadGenes(struct clusterMaker *cm, struct sqlConnection *conn, struct track* track, char *chrom, char strand, struct genePred **allGenes) /* load genes into cluster from a table or file */ { struct genePred *genes, *gp; verbose(2, "%s %s %c\n", track->table, chrom, strand); if (track->isDb) genes = trackTableGetGenes(track, conn, chrom, strand); else @@ -982,28 +998,29 @@ carefulClose(&clBedFh); carefulClose(&clTxBedFh); carefulClose(&flatBedFh); carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); gUseCds = optionExists("cds"); gTrackNames = optionExists("trackNames"); gJoinContained = optionExists("joinContained"); gDetectConflicted = optionExists("conflicted"); gIgnoreStrand = optionExists("ignoreStrand"); +gMinOverlappingBases = optionInt("minOverlappingBases", gMinOverlappingBases); if (!gTrackNames) { if (argc < 4) usage(); } else { if ((argc < 5) || !(argc & 1)) usage(); } clusterGenes(argv[1], argv[2], argc-3, argv+3); return 0; }