4d80894351b62c4daa432279691a9ef73717d86b
braney
  Wed Oct 29 08:57:35 2025 -0700
allow mafGene to use bigMaf and twoBit instead of database

diff --git src/hg/lib/mafGene.c src/hg/lib/mafGene.c
index 517592d3537..e322d514304 100644
--- src/hg/lib/mafGene.c
+++ src/hg/lib/mafGene.c
@@ -1,28 +1,31 @@
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "hdb.h"
 #include "maf.h"
 #include "obscure.h"
 #include "genePred.h"
+#include "dnaseq.h"
 #include "mafGene.h"
+#include "chromAlias.h"
+#include "hgMaf.h"
 
 
 struct exonInfo
 {
     struct exonInfo *next;
     struct mafAli *ali;
     int exonStart;
     int exonSize;
     int chromStart;
     int chromEnd;
     int frame;
     char strand;
     char *name;
 };
 
@@ -103,135 +106,169 @@
 	{
         if (stop)
 	    break;
 	else
 	    aa = 'Z';
 	}
     *pep++ = aa;
     ++actualSize;
     }
 *pep = 0;
 assert(actualSize <= inSize/3+1);
 seq->size = actualSize;
 return seq;
 }
 
-static struct mafAli *getRefAli(char *database, char *chrom, int start, int end)
+static DNA *getTwoBitSeq(struct mafFileCache *mafFileCache, char *chrom, int start, int end)
+{
+if (!sameOk(mafFileCache->chrom, chrom))
+    {
+    freeMem(mafFileCache->chrom);
+    mafFileCache->chrom = cloneString(chrom);
+
+    freeDnaSeq(&mafFileCache->dnaSeq);
+
+    int size;
+    struct dnaSeq *dnaSeq = twoBitReadSeqFragExt(mafFileCache->tbf, chrom, 0, 0, FALSE, &size);
+    mafFileCache->dnaSeq = dnaSeq;
+    }
+
+// we copy this because it gets freed later by the maf code
+int length = end - start;
+char *text = cloneMem(mafFileCache->dnaSeq->dna + start, length+1); 
+text[length] = 0;
+
+return text;
+}
+
+static struct mafAli *getRefAli(char *database, char *chrom, int start, int end, struct mafFileCache *mafFileCache)
 {
 struct mafAli *ali;
 struct mafComp *comp;
 char buffer[1024];
 
 AllocVar(ali);
 AllocVar(comp);
 ali->components = comp;
 ali->textSize = end - start;
 
 safef(buffer, sizeof buffer, "%s.%s", database, chrom);
 comp->src = cloneString(buffer);
 comp->start = start;
 comp->strand = '+';
 comp->size = end - start;
-struct dnaSeq *seq = hChromSeqMixed(database, chrom, start , end);
+struct dnaSeq *seq = NULL;
+if (mafFileCache && mafFileCache->tbf)
+    comp->text = getTwoBitSeq(mafFileCache, chrom, start , end);
+else
+    {
+    seq = hChromSeqMixed(database, chrom, start , end);
     comp->text = cloneString(seq->dna);
     freeDnaSeq(&seq);
+    }
 
 return ali;
 }
 
 /* make sure we have the whole range even if
  * there isn't a maf loaded in this region
  */
 static struct mafAli *padOutAli(struct mafAli *list, char *database, 
-    char *chrom, int start, int end)
+    char *chrom, int start, int end, struct mafFileCache *mafFileCache)
 {
 if (list == NULL)
     {
-    struct mafAli *ali = getRefAli(database, chrom, start, end);
+    struct mafAli *ali = getRefAli(database, chrom, start, end, mafFileCache);
     return ali;
     }
 
 int aliStart = list->components->start;
 if (start != aliStart)
     {
-    struct mafAli *ali = getRefAli(database, chrom, start, aliStart);
+    struct mafAli *ali = getRefAli(database, chrom, start, aliStart, mafFileCache);
     slAddHead(&list, ali);
     }
 
 struct mafAli *next, *last = list;
 for(; last->next; last = last->next)
     {
     next=last->next;
     int aliEnd = last->components->start + last->components->size;
     int nextStart = next->components->start ;
 
     if (aliEnd != nextStart)
 	{
-	struct mafAli *ali = getRefAli(database, chrom, aliEnd, nextStart);
+	struct mafAli *ali = getRefAli(database, chrom, aliEnd, nextStart, mafFileCache);
 	ali->next = next;
 	last->next = ali;
 	}
     }
 
 int aliEnd = last->components->start + last->components->size;
 if (end != aliEnd)
     {
-    struct mafAli *ali = getRefAli(database, chrom, aliEnd, end);
+    struct mafAli *ali = getRefAli(database, chrom, aliEnd, end, mafFileCache);
     slAddTail(&list, ali);
     }
 
 return list;
 }
 
 static struct mafAli *getAliForRange(char *database, char *mafTable, 
-    char *chrom, int start, int end)
+    char *chrom, int start, int end, struct mafFileCache *mafFileCache)
+{
+struct mafAli *aliAll = NULL;
+if (mafFileCache && (mafFileCache->bbi != NULL))
+    {
+    aliAll = bigMafLoadInRegion(mafFileCache->bbi, chrom, start, end);
+    }
+else
     {
     struct sqlConnection *conn = hAllocConn(database);
-struct mafAli *aliAll = mafLoadInRegion(conn, mafTable,
-	chrom, start, end);
+    aliAll = mafLoadInRegion(conn, mafTable, chrom, start, end);
+    hFreeConn(&conn);
+    }
 struct mafAli *ali;
 struct mafAli *list = NULL;
 struct mafAli *nextAli;
 
-hFreeConn(&conn);
-
 for(ali = aliAll; ali; ali = nextAli)
     {
     nextAli = ali->next;
     ali->next = NULL;
 
     char *masterSrc = ali->components->src;
     struct mafAli *subAli = NULL;
 
     if (mafNeedSubset(ali, masterSrc, start, end))
 	{
 	subAli = mafSubset( ali, masterSrc, start, end);
 	if (subAli == NULL)
 	    continue;
 	}
 
     if (subAli)
 	{
 	slAddHead(&list, subAli);
 	mafAliFree(&ali);
 	}
     else
 	slAddHead(&list, ali);
     }
 slReverse(&list);
 
-list = padOutAli(list, database, chrom, start, end);
+list = padOutAli(list, database, chrom, start, end, mafFileCache);
 
 return list;
 }
 
 
 /* allocate space for the nuc and aa sequence for each species */
 static struct speciesInfo *getSpeciesInfo(struct exonInfo *giList, 
     struct slName *speciesNames, struct hash *siHash)
 {
 struct exonInfo *gi;
 int size = 0;
 struct speciesInfo *siList = NULL;
 
 for(gi = giList ; gi ; gi = gi->next)
     size += gi->exonSize;
@@ -307,31 +344,31 @@
 		ptr += gi->exonSize - 1;
 		break;
 	    }
 
 	int lastFrame = (gi->frame + gi->exonSize) % 3;
 	if (lastFrame == 1) /* delete the last nucleotide */
 	    --ptr;
 	else if (lastFrame == 2) /* add one more nucleotide from
 	                          * the next exon */
 	    *ptr++ = siTemp->nucSequence[gi->exonStart + gi->exonSize];
 	*ptr++ = 0;   /* null terminate */
 
 	thisSeq.dna = exonBuffer;
 	thisSeq.size = ptr - exonBuffer;
 	outSeq =  doTranslate(&thisSeq, 0,  0, FALSE, doUniq);
-	char buffer[10 * 1024];
+	char buffer[1024 * 1024];
 
 	safef(buffer, sizeof buffer,  "%s_%s_%d_%d %d %d %d %s",
 	    gi->name, 
 	    siTemp->name, exonNum, exonCount, 
 	    outSeq->size,
 	    gi->frame, lastFrame,
 	    siTemp->curPosString->name);
 
 	if ((outSeq->size > 0) && (doBlank || !allDashes(outSeq->dna)))
 	    {
 	    if (doTable)
 		{
 		if (numCols == -1)
 		    fprintf(f, "%s ", buffer);
 		else
@@ -538,31 +575,31 @@
 		}
 	    else
 		fprintf(f, ">%s\n", buffer);
 
 	    fprintf(f, "%s\n", si->aaSequence);
 	    }
 	}
     fprintf(f, "\n\n");
     }
 }
 
 static void flushPosString(struct speciesInfo *si)
 {
 if (si->chrom != NULL)
     {
-    char buffer[10*1024];
+    char buffer[1024*1024];
     char strand = '+';
 
     if (si->strand != si->frameStrand)
 	strand = '-';
 
     if (si->posString == NULL)
 	{
 	safef(buffer, sizeof buffer, "%s:%d-%d%c", si->chrom,
 	    si->start+1, si->end, strand);
 	}
     else
 	{
 	safef(buffer, sizeof buffer, "%s;%s:%d-%d%c", si->posString,
 	    si->chrom, si->start+1, si->end, strand);
 	freez(&si->posString);
@@ -782,31 +819,31 @@
 
 /* free exonInfo list */
 static void freeGIList(struct exonInfo *list)
 {
 struct exonInfo *giNext;
 
 for(; list ; list = giNext)
     {
     giNext = list->next;
 
     mafAliFreeList(&list->ali);
     }
 }
  
 static struct exonInfo *buildGIList(char *database, struct genePred *pred, 
-    char *mafTable, unsigned options)
+    char *mafTable, unsigned options, struct mafFileCache *mafFileCache)
 {
 struct exonInfo *giList = NULL;
 unsigned *exonStart = pred->exonStarts;
 unsigned *lastStart = &exonStart[pred->exonCount];
 unsigned *exonEnd = pred->exonEnds;
 int *frames = pred->exonFrames;
 
 boolean includeUtr = options & MAFGENE_INCLUDEUTR;
 
 if (frames == NULL)
     {
     genePredAddExonFrames(pred);
     frames = pred->exonFrames;
     }
 
@@ -841,68 +878,77 @@
         if (thisStart < pred->cdsStart)
 	    thisStart = pred->cdsStart;
 
 
         if (thisEnd > pred->cdsEnd)
 	    thisEnd = pred->cdsEnd;
         
 	}
     int thisSize = thisEnd - thisStart;
     if (!includeUtr)
         verbose(3, "in %d %d cds %d %d\n",*exonStart,*exonEnd, thisStart, thisEnd);
     AllocVar(gi);
     gi->frame = *frames;
     gi->name = pred->name;
     gi->ali = getAliForRange(database, mafTable, pred->chrom, 
-	thisStart, thisEnd);
+	thisStart, thisEnd, mafFileCache);
     gi->chromStart = thisStart;
     gi->chromEnd = thisEnd;
     gi->exonStart = start;
     gi->exonSize = thisSize;
     verbose(3, "exon size %d\n", thisSize);
     gi->strand = pred->strand[0];
     start += gi->exonSize;
     slAddHead(&giList, gi);
  
     if (!includeUtr) 
         {
         if (thisEnd == pred->cdsEnd)
 	    break;
         }
     }
 slReverse(&giList);
 
 return giList;
 }
 
-void mafGeneOutPred(FILE *f, struct genePred *pred, char *dbName, 
+
+void mafGeneOutPredExt(FILE *f, struct genePred *pred, char *dbName, 
     char *mafTable,  struct slName *speciesNameList, unsigned options,
-    int numCols) 
+    int numCols, struct mafFileCache *mafFileCache) 
 {
 boolean inExons = options & MAFGENE_EXONS;
 
 if (pred->cdsStart == pred->cdsEnd)
     return;
 
 if (numCols < -1)
     errAbort("Number of columns must be zero or greater.");
 
-struct exonInfo *giList = buildGIList(dbName, pred, mafTable, options);
+
+struct exonInfo *giList = buildGIList(dbName, pred, mafTable, options, mafFileCache);
 
 if (giList == NULL)
     return;
 
 struct hash *speciesInfoHash = newHash(5);
 struct speciesInfo *speciesList = getSpeciesInfo(giList, speciesNameList, 
     speciesInfoHash);
 
 copyMafs(speciesInfoHash, &giList, inExons);
 
 struct speciesInfo *si = speciesList;
 for(; si ; si = si->next)
     si->curPosString = si->posStrings;
 
 writeOutSpecies(f, dbName, speciesList, giList, options, numCols);
 
 freeSpeciesInfo(speciesList);
 freeGIList(giList);
 }
+
+void mafGeneOutPred(FILE *f, struct genePred *pred, char *dbName, 
+    char *mafTable,  struct slName *speciesNameList, unsigned options,
+    int numCols) 
+{
+mafGeneOutPredExt(f, pred, dbName, mafTable,  speciesNameList, options, numCols, NULL) ;
+}