4d80894351b62c4daa432279691a9ef73717d86b braney Wed Oct 29 08:57:35 2025 -0700 allow mafGene to use bigMaf and twoBit instead of database diff --git src/hg/lib/mafGene.c src/hg/lib/mafGene.c index 517592d3537..e322d514304 100644 --- src/hg/lib/mafGene.c +++ src/hg/lib/mafGene.c @@ -1,28 +1,31 @@ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "hdb.h" #include "maf.h" #include "obscure.h" #include "genePred.h" +#include "dnaseq.h" #include "mafGene.h" +#include "chromAlias.h" +#include "hgMaf.h" struct exonInfo { struct exonInfo *next; struct mafAli *ali; int exonStart; int exonSize; int chromStart; int chromEnd; int frame; char strand; char *name; }; @@ -103,135 +106,169 @@ { if (stop) break; else aa = 'Z'; } *pep++ = aa; ++actualSize; } *pep = 0; assert(actualSize <= inSize/3+1); seq->size = actualSize; return seq; } -static struct mafAli *getRefAli(char *database, char *chrom, int start, int end) +static DNA *getTwoBitSeq(struct mafFileCache *mafFileCache, char *chrom, int start, int end) +{ +if (!sameOk(mafFileCache->chrom, chrom)) + { + freeMem(mafFileCache->chrom); + mafFileCache->chrom = cloneString(chrom); + + freeDnaSeq(&mafFileCache->dnaSeq); + + int size; + struct dnaSeq *dnaSeq = twoBitReadSeqFragExt(mafFileCache->tbf, chrom, 0, 0, FALSE, &size); + mafFileCache->dnaSeq = dnaSeq; + } + +// we copy this because it gets freed later by the maf code +int length = end - start; +char *text = cloneMem(mafFileCache->dnaSeq->dna + start, length+1); +text[length] = 0; + +return text; +} + +static struct mafAli *getRefAli(char *database, char *chrom, int start, int end, struct mafFileCache *mafFileCache) { struct mafAli *ali; struct mafComp *comp; char buffer[1024]; AllocVar(ali); AllocVar(comp); ali->components = comp; ali->textSize = end - start; safef(buffer, sizeof buffer, "%s.%s", database, chrom); comp->src = cloneString(buffer); comp->start = start; comp->strand = '+'; comp->size = end - start; -struct dnaSeq *seq = hChromSeqMixed(database, chrom, start , end); +struct dnaSeq *seq = NULL; +if (mafFileCache && mafFileCache->tbf) + comp->text = getTwoBitSeq(mafFileCache, chrom, start , end); +else + { + seq = hChromSeqMixed(database, chrom, start , end); comp->text = cloneString(seq->dna); freeDnaSeq(&seq); + } return ali; } /* make sure we have the whole range even if * there isn't a maf loaded in this region */ static struct mafAli *padOutAli(struct mafAli *list, char *database, - char *chrom, int start, int end) + char *chrom, int start, int end, struct mafFileCache *mafFileCache) { if (list == NULL) { - struct mafAli *ali = getRefAli(database, chrom, start, end); + struct mafAli *ali = getRefAli(database, chrom, start, end, mafFileCache); return ali; } int aliStart = list->components->start; if (start != aliStart) { - struct mafAli *ali = getRefAli(database, chrom, start, aliStart); + struct mafAli *ali = getRefAli(database, chrom, start, aliStart, mafFileCache); slAddHead(&list, ali); } struct mafAli *next, *last = list; for(; last->next; last = last->next) { next=last->next; int aliEnd = last->components->start + last->components->size; int nextStart = next->components->start ; if (aliEnd != nextStart) { - struct mafAli *ali = getRefAli(database, chrom, aliEnd, nextStart); + struct mafAli *ali = getRefAli(database, chrom, aliEnd, nextStart, mafFileCache); ali->next = next; last->next = ali; } } int aliEnd = last->components->start + last->components->size; if (end != aliEnd) { - struct mafAli *ali = getRefAli(database, chrom, aliEnd, end); + struct mafAli *ali = getRefAli(database, chrom, aliEnd, end, mafFileCache); slAddTail(&list, ali); } return list; } static struct mafAli *getAliForRange(char *database, char *mafTable, - char *chrom, int start, int end) + char *chrom, int start, int end, struct mafFileCache *mafFileCache) +{ +struct mafAli *aliAll = NULL; +if (mafFileCache && (mafFileCache->bbi != NULL)) + { + aliAll = bigMafLoadInRegion(mafFileCache->bbi, chrom, start, end); + } +else { struct sqlConnection *conn = hAllocConn(database); -struct mafAli *aliAll = mafLoadInRegion(conn, mafTable, - chrom, start, end); + aliAll = mafLoadInRegion(conn, mafTable, chrom, start, end); + hFreeConn(&conn); + } struct mafAli *ali; struct mafAli *list = NULL; struct mafAli *nextAli; -hFreeConn(&conn); - for(ali = aliAll; ali; ali = nextAli) { nextAli = ali->next; ali->next = NULL; char *masterSrc = ali->components->src; struct mafAli *subAli = NULL; if (mafNeedSubset(ali, masterSrc, start, end)) { subAli = mafSubset( ali, masterSrc, start, end); if (subAli == NULL) continue; } if (subAli) { slAddHead(&list, subAli); mafAliFree(&ali); } else slAddHead(&list, ali); } slReverse(&list); -list = padOutAli(list, database, chrom, start, end); +list = padOutAli(list, database, chrom, start, end, mafFileCache); return list; } /* allocate space for the nuc and aa sequence for each species */ static struct speciesInfo *getSpeciesInfo(struct exonInfo *giList, struct slName *speciesNames, struct hash *siHash) { struct exonInfo *gi; int size = 0; struct speciesInfo *siList = NULL; for(gi = giList ; gi ; gi = gi->next) size += gi->exonSize; @@ -307,31 +344,31 @@ ptr += gi->exonSize - 1; break; } int lastFrame = (gi->frame + gi->exonSize) % 3; if (lastFrame == 1) /* delete the last nucleotide */ --ptr; else if (lastFrame == 2) /* add one more nucleotide from * the next exon */ *ptr++ = siTemp->nucSequence[gi->exonStart + gi->exonSize]; *ptr++ = 0; /* null terminate */ thisSeq.dna = exonBuffer; thisSeq.size = ptr - exonBuffer; outSeq = doTranslate(&thisSeq, 0, 0, FALSE, doUniq); - char buffer[10 * 1024]; + char buffer[1024 * 1024]; safef(buffer, sizeof buffer, "%s_%s_%d_%d %d %d %d %s", gi->name, siTemp->name, exonNum, exonCount, outSeq->size, gi->frame, lastFrame, siTemp->curPosString->name); if ((outSeq->size > 0) && (doBlank || !allDashes(outSeq->dna))) { if (doTable) { if (numCols == -1) fprintf(f, "%s ", buffer); else @@ -538,31 +575,31 @@ } else fprintf(f, ">%s\n", buffer); fprintf(f, "%s\n", si->aaSequence); } } fprintf(f, "\n\n"); } } static void flushPosString(struct speciesInfo *si) { if (si->chrom != NULL) { - char buffer[10*1024]; + char buffer[1024*1024]; char strand = '+'; if (si->strand != si->frameStrand) strand = '-'; if (si->posString == NULL) { safef(buffer, sizeof buffer, "%s:%d-%d%c", si->chrom, si->start+1, si->end, strand); } else { safef(buffer, sizeof buffer, "%s;%s:%d-%d%c", si->posString, si->chrom, si->start+1, si->end, strand); freez(&si->posString); @@ -782,31 +819,31 @@ /* free exonInfo list */ static void freeGIList(struct exonInfo *list) { struct exonInfo *giNext; for(; list ; list = giNext) { giNext = list->next; mafAliFreeList(&list->ali); } } static struct exonInfo *buildGIList(char *database, struct genePred *pred, - char *mafTable, unsigned options) + char *mafTable, unsigned options, struct mafFileCache *mafFileCache) { struct exonInfo *giList = NULL; unsigned *exonStart = pred->exonStarts; unsigned *lastStart = &exonStart[pred->exonCount]; unsigned *exonEnd = pred->exonEnds; int *frames = pred->exonFrames; boolean includeUtr = options & MAFGENE_INCLUDEUTR; if (frames == NULL) { genePredAddExonFrames(pred); frames = pred->exonFrames; } @@ -841,68 +878,77 @@ if (thisStart < pred->cdsStart) thisStart = pred->cdsStart; if (thisEnd > pred->cdsEnd) thisEnd = pred->cdsEnd; } int thisSize = thisEnd - thisStart; if (!includeUtr) verbose(3, "in %d %d cds %d %d\n",*exonStart,*exonEnd, thisStart, thisEnd); AllocVar(gi); gi->frame = *frames; gi->name = pred->name; gi->ali = getAliForRange(database, mafTable, pred->chrom, - thisStart, thisEnd); + thisStart, thisEnd, mafFileCache); gi->chromStart = thisStart; gi->chromEnd = thisEnd; gi->exonStart = start; gi->exonSize = thisSize; verbose(3, "exon size %d\n", thisSize); gi->strand = pred->strand[0]; start += gi->exonSize; slAddHead(&giList, gi); if (!includeUtr) { if (thisEnd == pred->cdsEnd) break; } } slReverse(&giList); return giList; } -void mafGeneOutPred(FILE *f, struct genePred *pred, char *dbName, + +void mafGeneOutPredExt(FILE *f, struct genePred *pred, char *dbName, char *mafTable, struct slName *speciesNameList, unsigned options, - int numCols) + int numCols, struct mafFileCache *mafFileCache) { boolean inExons = options & MAFGENE_EXONS; if (pred->cdsStart == pred->cdsEnd) return; if (numCols < -1) errAbort("Number of columns must be zero or greater."); -struct exonInfo *giList = buildGIList(dbName, pred, mafTable, options); + +struct exonInfo *giList = buildGIList(dbName, pred, mafTable, options, mafFileCache); if (giList == NULL) return; struct hash *speciesInfoHash = newHash(5); struct speciesInfo *speciesList = getSpeciesInfo(giList, speciesNameList, speciesInfoHash); copyMafs(speciesInfoHash, &giList, inExons); struct speciesInfo *si = speciesList; for(; si ; si = si->next) si->curPosString = si->posStrings; writeOutSpecies(f, dbName, speciesList, giList, options, numCols); freeSpeciesInfo(speciesList); freeGIList(giList); } + +void mafGeneOutPred(FILE *f, struct genePred *pred, char *dbName, + char *mafTable, struct slName *speciesNameList, unsigned options, + int numCols) +{ +mafGeneOutPredExt(f, pred, dbName, mafTable, speciesNameList, options, numCols, NULL) ; +}