f0b2eab01fa6d080c92c0444ef84d482b75afec9 markd Mon Oct 5 17:31:06 2020 -0700 replaced the -minOverlappingBases option with -ignoreBases; -minOverlappingBases did not work correctly and would have requred major restructing of the code. The -ignoreBases directly address the issue of minor overlap at the ends of transcripts diff --git src/hg/geneBounds/clusterGenes/clusterGenes.c src/hg/geneBounds/clusterGenes/clusterGenes.c index 34c2efc..e4291ae 100644 --- src/hg/geneBounds/clusterGenes/clusterGenes.c +++ src/hg/geneBounds/clusterGenes/clusterGenes.c @@ -42,66 +42,67 @@ " -trackNames - If specified, input are pairs of track names and files.\n" " This is useful when the file names don't reflact the desired track\n" " names.\n" " -ignoreStrand - cluster postive and negative strand together\n" " -clusterBed=bed - output BED file for each cluster. If -cds is specified,\n" " this only contains bounds based on CDS\n" " -clusterTxBed=bed - output BED file for each cluster. If -cds is specified,\n" " this contains bounds based on full transcript range of genes, not just CDS\n" " -flatBed=bed - output BED file that contains the exons of all genes\n" " flattned into a single record. If -cds is specified, this only contains\n" " bounds based on CDS\n" " -joinContained - join genes that are contained within a larger loci\n" " into that loci. Intended as a way to handled fragments and exon-level\n" " predictsions, as genes-in-introns on the same strand are very rare.\n" " -conflicted - detect conflicted loci. Conflicted loci are loci that\n" - " contain genes that share not sequence. This option greatly increases\n" + " contain genes that share no sequence. This option greatly increases\n" " size of output file.\n" - " -ignoreEndBases=N - ignore this many based to the start and end of each\n" + " -ignoreBases=N - ignore this many based to the start and end of each\n" " transcript when determine overlap. This prevents small amounts of overlap\n" " from merging transcripts. If -cds is specified, this amount of the CDS.\n" " is ignored. The default is 0.\n" + " -minOverlappingBases=N - mapped to ignoreBases, did not work correctly.\n" "\n" "The cdsConflicts and exonConflicts columns contains `y' if the cluster has\n" "conficts. A conflict is a cluster where all of the genes don't share exons. \n" "Conflicts maybe either internal to a table or between tables.\n" ); } static struct optionSpec options[] = { {"chrom", OPTION_STRING|OPTION_MULTI}, {"cds", OPTION_BOOLEAN}, {"trackNames", OPTION_BOOLEAN}, {"clusterBed", OPTION_STRING}, {"clusterTxBed", OPTION_STRING}, {"flatBed", OPTION_STRING}, {"cds", OPTION_BOOLEAN}, {"joinContained", OPTION_BOOLEAN}, {"conflicted", OPTION_BOOLEAN}, {"ignoreStrand", OPTION_BOOLEAN}, - {"minOverlappingBases", OPTION_INT}, /* deprecated, use ignoreEndBases */ - {"ignoreEndBases", OPTION_INT}, + {"minOverlappingBases", OPTION_INT}, /* deprecated, use ignoreBases */ + {"ignoreBases", OPTION_INT}, {NULL, 0}, }; /* from command line */ static boolean gUseCds; static boolean gTrackNames; static boolean gJoinContained = FALSE; static boolean gDetectConflicted = FALSE; static boolean gIgnoreStrand = FALSE; -static int gIgnoreEndBases = 1; +static int gIgnoreBases = 0; struct track /* Object representing a track. */ { struct track *next; char *name; /* name to use */ char *table; /* table or file */ boolean isDb; /* is this a database table or file? */ struct genePred *genes; /* genes read from a file, sorted by chrom and strand */ }; static int genePredStandCmp(const void *va, const void *vb) /* Compare to sort based on chromosome, strand, txStart, txEnd, and name. The * location and name are just included to help make tests consistent. */ { @@ -294,101 +295,113 @@ else { /* if there are any tracks from the databases, include all chroms? */ for (tr = tracks; tr != NULL; tr = tr->next) if (tr->isDb) anyDb = TRUE; if (anyDb) chroms = slNameCloneList(hAllChromNames(sqlGetDatabase(conn))); else chroms = getFileTracksChroms(tracks); } slNameSort(&chroms); return chroms; } -boolean gpGetExon(struct genePred* gp, int exonIx, boolean cdsOnly, - int *exonStartRet, int *exonEndRet) -/* Get the start and end of an exon, adjusting if we are only examining CDS. - * Return false if exon should not be used. */ -{ -int exonStart = gp->exonStarts[exonIx]; -int exonEnd = gp->exonEnds[exonIx]; -if (cdsOnly) - { - if (exonStart < gp->cdsStart) - exonStart = gp->cdsStart; - if (exonEnd > gp->cdsEnd) - exonEnd = gp->cdsEnd; - } -*exonStartRet = exonStart; -*exonEndRet = exonEnd; -return exonStart < exonEnd; -} - struct clusterGene /* A gene in a cluster */ { struct clusterGene* next; struct track* track; /* aka table for gene */ struct genePred* gp; struct cluster* cluster; /* cluster containing gene */ - int effectiveStart; /* start/end based on -cds and -ignoreEndBases */ + int effectiveStart; /* start/end based on -cds and -ignoreBases */ int effectiveEnd; struct slRef *exonConflicts; /* list of genes in cluster that don't share * exons with this gene */ struct slRef *cdsConflicts; /* list of genes in cluster that don't share * CDS with this gene */ }; -#if 0 -static int findStartExon(struct genePred* gp) +static int findStartExon(struct genePred* gp, + int minStart) /* find start exon based on using cds or not */ { -int start = gUseCds ? gp->cdsStart : gp->txStart; -int iExon; -for (iExon = 0; iExon < gp->exonCount; iExon++) +int exonIx; +for (exonIx = 0; exonIx < gp->exonCount; exonIx++) { - if (start < gp->exonEnds[iExon)) - return iExon; + if (minStart >= gp->exonStarts[exonIx]) + return exonIx; } errAbort("BUG: findStartExon failed: %s", gp->name); return 0; } static int calcEffectiveStart(struct genePred* gp) /* calculate the genomic start position to use for overlaps */ { -int iExon = findStartExon(gp); -for (iExon = 0; iExon < gp->exonCount; iExon++) +int minStart = gUseCds ? gp->cdsStart : gp->txStart; +int exonIx = findStartExon(gp, minStart); +for (; exonIx < gp->exonCount; exonIx++) { - int possibleStart =; + int possibleStart = minStart + gIgnoreBases; + if (possibleStart >= gp->exonStarts[exonIx]) + return possibleStart; } +errAbort("BUG: calcEffectiveStart failed: %s", gp->name); +return 0; } -#endif + +static int findEndExon(struct genePred* gp, + int maxEnd) +/* find end exon based on using cds or not */ +{ +int exonIx; +for (exonIx = gp->exonCount-1; exonIx >= 0; exonIx--) + { + if (maxEnd <= gp->exonEnds[exonIx]) + return exonIx; + } +errAbort("BUG: findEndExon failed: %s", gp->name); +return 0; +} + +static int calcEffectiveEnd(struct genePred* gp) +/* calculate the genomic end position to use for overlaps */ +{ +int maxEnd = gUseCds ? gp->cdsEnd : gp->txEnd; +int exonIx = findEndExon(gp, maxEnd); +for (; exonIx >= 0; exonIx--) + { + int possibleEnd = maxEnd - gIgnoreBases; + if (possibleEnd <= gp->exonEnds[exonIx]) + return possibleEnd; + } +errAbort("BUG: calcEffectiveEnd failed: %s", gp->name); +return 0; +} + struct clusterGene* clusterGeneNew(struct track* track, struct genePred* gp) /* create a new clusterGene. */ { struct clusterGene* cg; AllocVar(cg); cg->track = track; cg->gp = gp; -#if 0 cg->effectiveStart = calcEffectiveStart(gp); cg->effectiveEnd = calcEffectiveEnd(gp); -#endif return cg; } void clusterGeneFree(struct clusterGene** cgp) /* Free a clusterGene. */ { struct clusterGene* cg = *cgp; if (cg != NULL) { slFreeList(&cg->exonConflicts); slFreeList(&cg->cdsConflicts); freez(cgp); } } @@ -399,31 +412,31 @@ struct clusterGene *cg2 = (*((struct slRef**)clr2))->val; int cmp = strcmp(cg1->track->name, cg2->track->name); if (cmp == 0) cmp = strcmp(cg1->gp->name, cg2->gp->name); return cmp; } struct cluster /* A cluster of overlapping genes. */ { struct cluster *next; /* Next in list. */ int id; /* id assigned to cluster */ struct clusterGene *genes; /* Associated genes. */ char *chrom; /* chrom, memory not owned */ int start,end; /* Range covered by cluster. */ - int txStart,txEnd; /* txStart/txEnd range, diff if -cds */ + int txStart,txEnd; /* txStart/txEnd range, diff from start/end if effective bounds differ */ boolean hasExonConflicts; /* does this cluster have conflicts? */ boolean hasCdsConflicts; }; boolean clusterHaveTrack(struct cluster *cluster, struct track *track) /* check if the cluster has a track */ { struct clusterGene *gene; for (gene = cluster->genes; gene != NULL; gene = gene->next) if (gene->track == track) return TRUE; return FALSE; } @@ -473,30 +486,54 @@ } *pList = NULL; } int clusterCmp(const void *va, const void *vb) /* Compare to sort based on start and end. assumes they are on the chrom */ { const struct cluster *a = *((struct cluster **)va); const struct cluster *b = *((struct cluster **)vb); int dif = a->start - b->start; if (dif == 0) dif = a->end - b->end; return dif; } +static boolean gpGetExon(struct clusterGene *cg, int exonIx, boolean cdsOnly, + int *exonStartRet, int *exonEndRet) +/* Get the start and end of an exon, adjusting if we are only examining CDS, + * Return false if exon should not be used. */ +{ +int exonStart = cg->gp->exonStarts[exonIx]; +int exonEnd = cg->gp->exonEnds[exonIx]; +if (exonStart < cg->effectiveStart) + exonStart = cg->effectiveStart; +if (exonEnd > cg->effectiveEnd) + exonEnd = cg->effectiveEnd; +// still need to check CDS for CDS conflict check +if (cdsOnly) + { + if (exonStart < cg->gp->cdsStart) + exonStart = cg->gp->cdsStart; + if (exonEnd > cg->gp->cdsEnd) + exonEnd = cg->gp->cdsEnd; + } +*exonStartRet = exonStart; +*exonEndRet = exonEnd; +return exonStart < exonEnd; +} + void clusterAddExon(struct cluster *cluster, int start, int end, struct clusterGene *cg) /* Add exon to cluster. */ { assert((cg->cluster == NULL) || (cg->cluster == cluster)); if (cg->cluster == NULL) { cg->cluster = cluster; slSafeAddHead(&cluster->genes, cg); } if (cluster->start == cluster->end) { cluster->chrom = cg->gp->chrom; cluster->start = start; cluster->end = end; @@ -554,68 +591,68 @@ if (bCluster->start < aCluster->start) aCluster->start = bCluster->start; if (aCluster->end < bCluster->end) aCluster->end = bCluster->end; /* Remove all traces of bNode. */ dlDelete(&bNode); clusterFree(&bCluster); if (verboseLevel() >= 3) { fprintf(stderr, " ab: "); clusterDump(aCluster); } } -boolean shareExons(struct genePred *gp1, struct genePred *gp2, boolean cdsOnly) +boolean shareExons(struct clusterGene *cg1, struct clusterGene *cg2, boolean cdsOnly) /* determine if two genes share exons or CDS exons */ { int exonIx1, exonStart1, exonEnd1; int exonIx2, exonStart2, exonEnd2; -for (exonIx1 = 0; exonIx1 < gp1->exonCount; exonIx1++) +for (exonIx1 = 0; exonIx1 < cg1->gp->exonCount; exonIx1++) { - if (gpGetExon(gp1, exonIx1, cdsOnly, &exonStart1, &exonEnd1)) + if (gpGetExon(cg1, exonIx1, cdsOnly, &exonStart1, &exonEnd1)) { /* exonStart2 >= exon1End indicates there can't be overlap on this * exon */ for (exonIx2 = 0, exonStart2 = 0; - (exonIx2 < gp2->exonCount) && (exonStart2 < exonEnd1); + (exonIx2 < cg2->gp->exonCount) && (exonStart2 < exonEnd1); exonIx2++) { - if (gpGetExon(gp2, exonIx2, cdsOnly, &exonStart2, &exonEnd2)) + if (gpGetExon(cg2, exonIx2, cdsOnly, &exonStart2, &exonEnd2)) { if ((exonStart2 < exonEnd1) && (exonEnd2 > exonStart1)) return TRUE; /* overlaps */ } } } } return FALSE; } struct slRef *getGeneConflicts(struct cluster *cluster, struct clusterGene *gene, boolean cdsOnly) /* get list of genes in this cluster that don't share exons/CDS */ { struct slRef *conflicts = NULL; struct clusterGene *cg; /* check all other genes */ for (cg = cluster->genes; cg != NULL; cg = cg->next) { - if ((cg != gene) && !shareExons(cg->gp, gene->gp, cdsOnly)) + if ((cg != gene) && !shareExons(cg, gene, cdsOnly)) refAdd(&conflicts, cg); } return conflicts; } void getClusterConflicts(struct cluster *cluster) /* determine if the cluster has conflicts and fill in clusterGene * list of conflicts */ { struct clusterGene *cg; for (cg = cluster->genes; cg != NULL; cg = cg->next) { cg->exonConflicts = getGeneConflicts(cluster, cg, FALSE); if (cg->exonConflicts != NULL) @@ -686,128 +723,119 @@ if (gDetectConflicted) getConflicts(clusterList); /* assign ids */ for (cluster = clusterList; cluster != NULL; cluster = cluster->next) cluster->id = nextClusterId++; /* Clean up and go home. */ binKeeperFree(&cm->bk); dlListFree(&cm->clusters); freez(pCm); return clusterList; } -static bool isOverlapped(int exon1Start, int exon1End, - int exon2Start, int exon2End) -/* check if two ranges are overlapped by minimum overlap */ +static struct dlNode *clusterMakerAddExonNoOverlap(struct clusterMaker *cm, struct clusterGene *cg, + int exonStart, int exonEnd, struct dlNode *currentNode) +/* add an exon to a cluster when it doesn't overlap any existing exons, return node */ { -int maxStart = max(exon1Start, exon2Start); -int minEnd = min(exon1End, exon2End); -return (minEnd - maxStart) >= gIgnoreEndBases; -} - -static bool overlappingCluster(int exonStart, int exonEnd, struct dlNode *node) -/* check if the overlap with cluster is over threshold */ +if (currentNode == NULL) { -struct cluster *cluster = node->val; -return isOverlapped(exonStart, exonEnd, cluster->txStart, cluster->txEnd); + struct cluster *cluster = clusterNew(); + currentNode = dlAddValTail(cm->clusters, cluster); + } +addExon(cm->bk, currentNode, exonStart, exonEnd, cg); +return currentNode; } -static struct dlNode *clusterMakerAddExonMerge(struct clusterMaker *cm, int exonStart, int exonEnd, struct dlNode *currentNode) -/* merge notes that overlap exon over threshold, return node */ +static struct dlNode *clusterMakerAddExonOverlap(struct clusterMaker *cm, struct clusterGene *cg, + int exonStart, int exonEnd, struct binElement *bList, + struct dlNode *currentNode) +/* add an exon that overlap exons in clusters, return node */ { -struct binElement *bEl, *bList = binKeeperFind(cm->bk, exonStart, exonEnd); +struct binElement *bEl; for (bEl = bList; bEl != NULL; bEl = bEl->next) { struct dlNode *newNode = bEl->val; if (newNode != currentNode) { if (currentNode == NULL) { /* Add to existing cluster. */ currentNode = newNode; } - else if (overlappingCluster(exonStart, exonEnd, currentNode)) + else { /* Merge new cluster into old one. */ verbose(3, "Merging %p %p\n", currentNode, newNode); mergeClusters(cm->bk, bEl->next, currentNode, newNode); } } } +addExon(cm->bk, currentNode, exonStart, exonEnd, cg); slFreeList(&bList); return currentNode; } static void clusterMakerAddExon(struct clusterMaker *cm, struct clusterGene *cg, int exonStart, int exonEnd, struct dlNode **currentNodePtr) /* Add a gene exon to clusterMaker. */ { -verbose(4, " %s %d-%d\n", cg->track->name, exonStart, exonEnd); -struct dlNode *currentNode = clusterMakerAddExonMerge(cm, exonStart, exonEnd, *currentNodePtr); - -if ((currentNode == NULL) || !overlappingCluster(exonStart, exonEnd, currentNode)) - { - struct cluster *cluster = clusterNew(); - currentNode = dlAddValTail(cm->clusters, cluster); - addExon(cm->bk, currentNode, exonStart, exonEnd, cg); - } +verbose(4, " exon %d-%d\n", exonStart, exonEnd); +struct binElement *bList = binKeeperFind(cm->bk, exonStart, exonEnd); +if (bList == NULL) + *currentNodePtr = clusterMakerAddExonNoOverlap(cm, cg, exonStart, exonEnd, *currentNodePtr); else - addExon(cm->bk, currentNode, exonStart, exonEnd, cg); -*currentNodePtr = currentNode; + *currentNodePtr = clusterMakerAddExonOverlap(cm, cg, exonStart, exonEnd, bList, *currentNodePtr); } void clusterMakerAdd(struct clusterMaker *cm, struct clusterGene *cg) /* Add gene to clusterMaker. */ { int exonIx; struct dlNode *currentNode = NULL; /* Build up cluster list with aid of binKeeper. For each exon look to see if * it overlaps an existing cluster. If so put it into existing cluster, * otherwise make a new cluster. The hard case is where part of the gene is * already in one cluster and the exon overlaps with a new cluster. In this * case merge the new cluster into the old one. If we are joining contained * genes, the only gene range is added as if it was a single exon. */ -verbose(2, "%s %s %d-%d\n", cg->track->name, cg->gp->name, cg->gp->txStart, cg->gp->txEnd); +verbose(2, "add gene: [%s] %s %d-%d\n", cg->track->name, cg->gp->name, cg->gp->txStart, cg->gp->txEnd); if (gJoinContained) { - int start =(gUseCds) ? cg->gp->cdsStart : cg->gp->txStart; - int end =(gUseCds) ? cg->gp->cdsEnd : cg->gp->txEnd; - if (end > start) - clusterMakerAddExon(cm, cg, start, end, ¤tNode); + clusterMakerAddExon(cm, cg, cg->effectiveStart, cg->effectiveEnd, ¤tNode); } else { for (exonIx = 0; exonIx < cg->gp->exonCount; ++exonIx) { int exonStart, exonEnd; - if (gpGetExon(cg->gp, exonIx, gUseCds, &exonStart, &exonEnd)) + if (gpGetExon(cg, exonIx, FALSE, &exonStart, &exonEnd)) clusterMakerAddExon(cm, cg, exonStart, exonEnd, ¤tNode); } } } void loadGenes(struct clusterMaker *cm, struct sqlConnection *conn, struct track* track, char *chrom, char strand) /* load genes into cluster from a table or file */ { struct genePred *genes, *gp; -verbose(2, "%s %s %c\n", track->table, chrom, strand); +verbose(2, "loadGenes: [%s] %s %c\n", track->name, chrom, strand); if (track->isDb) genes = trackTableGetGenes(track, conn, chrom, strand); else genes = trackFileGetGenes(track, chrom, strand); /* add to cluster and deletion list */ while ((gp = slPopHead(&genes)) != NULL) { if ((!gUseCds) || (gp->cdsStart < gp->cdsEnd)) { struct clusterGene* cg = clusterGeneNew(track, gp); clusterMakerAdd(cm, cg); ++totalGeneCount; } @@ -837,36 +865,36 @@ { fprintf(f, "\t%c\t%c", (cluster->hasExonConflicts ? 'y' : 'n'), (cluster->hasCdsConflicts ? 'y' : 'n')); prConflicts(f, cg->exonConflicts); prConflicts(f, cg->cdsConflicts); } fprintf(f, "\n"); } Bits* mkClusterMap(struct cluster *cluster) /* make a bit map of the exons in a cluster */ { int len = (cluster->end - cluster->start); Bits *map = bitAlloc(len); struct clusterGene *cg; -int exonStart, exonEnd, iExon; +int exonStart, exonEnd, exonIx; for (cg = cluster->genes; cg != NULL; cg = cg->next) { - for (iExon = 0; iExon < cg->gp->exonCount; iExon++) - if (gpGetExon(cg->gp, iExon, gUseCds, &exonStart, &exonEnd)) + for (exonIx = 0; exonIx < cg->gp->exonCount; exonIx++) + if (gpGetExon(cg, exonIx, FALSE, &exonStart, &exonEnd)) bitSetRange(map, (exonStart-cluster->start), (exonEnd - exonStart)); } return map; } void outputFlatBed(struct cluster *cluster, char strand, FILE *flatBedFh) /* output a clusters as a single bed record, with blocks */ { static struct bed bed; /* bed buffer */ static int capacity = 0; int len = (cluster->end - cluster->start); Bits *map = mkClusterMap(cluster); int startIdx = 0; char nameBuf[64]; @@ -1028,32 +1056,35 @@ carefulClose(&clBedFh); carefulClose(&clTxBedFh); carefulClose(&flatBedFh); carefulClose(&outFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); gUseCds = optionExists("cds"); gTrackNames = optionExists("trackNames"); gJoinContained = optionExists("joinContained"); gDetectConflicted = optionExists("conflicted"); gIgnoreStrand = optionExists("ignoreStrand"); -if (optionExists("ignoreEndBases")) - gIgnoreEndBases = optionInt("ignoreEndBases", gIgnoreEndBases); -else - gIgnoreEndBases = optionInt("minOverlappingBases", gIgnoreEndBases); /* deprecated */ +if (optionExists("ignoreBases")) + gIgnoreBases = optionInt("ignoreBases", gIgnoreBases); +else if (optionExists("minOverlappingBases")) + { + fprintf(stderr, "WARNING: -minOverlappingBases is deprecated and may not work as expected\n"); + gIgnoreBases = optionInt("minOverlappingBases", gIgnoreBases); /* deprecated */ + } if (!gTrackNames) { if (argc < 4) usage(); } else { if ((argc < 5) || !(argc & 1)) usage(); } clusterGenes(argv[1], argv[2], argc-3, argv+3); return 0; }