790660dbd19f460e345fad829ce38ea978e29c7b hiram Sun Apr 5 19:01:24 2026 -0700 claude author rewrite of mafAddIRows for 5X to 10X improvement in peak memory usage, and 100s of time faster for VGP 577 way alignments diff --git src/hg/ratStuff/mafAddIRowsStream/mafAddIRowsStream.c src/hg/ratStuff/mafAddIRowsStream/mafAddIRowsStream.c new file mode 100644 index 00000000000..8a35f52f3a6 --- /dev/null +++ src/hg/ratStuff/mafAddIRowsStream/mafAddIRowsStream.c @@ -0,0 +1,924 @@ +/* mafAddIRowsStream - add 'i' rows to a maf, two-pass streaming version. + * + * Memory-efficient rewrite of mafAddIRows.c. Two passes over the MAF: + * Pass 1: reads each block, extracts status metadata into a compact + * flat array (~12 bytes/component), builds linkBlock chains, + * then frees the mafAli immediately. Runs chainStrands on + * linkBlocks and bridgeSpecies on the compact array. + * Pass 2: re-reads the MAF one block at a time, applies pre-computed + * statuses, runs fillHoles logic, writes and frees each block. + * + * Peak memory: ~40GB for 900M components (vs ~157GB with lightweight list + * approach, vs entire-file-in-RAM for the original). + */ + +/* Copyright (C) 2011 The Regents of the University of California + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ +#include "common.h" +#include "errAbort.h" +#include "linefile.h" +#include "hash.h" +#include "chain.h" +#include "options.h" +#include "maf.h" +#include "bed.h" +#include "twoBit.h" +#include "binRange.h" + + +char *masterSpecies; +char *masterChrom; +struct hash *speciesHash; +struct subSpecies *speciesList; +struct strandHead *strandHeads; + +boolean addN = FALSE; +boolean addDash = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "mafAddIRowsStream - add 'i' rows to a maf (streaming, low memory)\n" + "usage:\n" + " mafAddIRowsStream mafIn twoBitFile mafOut\n" + "WARNING: requires a maf with only a single target sequence\n" + "options:\n" + " -nBeds=listOfBedFiles\n" + " reads in list of bed files, one per species, with N locations\n" + " -addN\n" + " adds rows of N's into maf blocks (rather than just annotating them)\n" + " -addDash\n" + " adds rows of -'s into maf blocks (rather than just annotating them)\n" + "NOTE: as of November 2025 - can manage GenArk assembly names GCA_...\n" + " and GCF_... with their .n extensions. Can only work with such\n" + " such names that begin with GC." + ); +} + +static struct optionSpec options[] = { + {"nBeds", OPTION_STRING}, + {"addN", OPTION_BOOLEAN}, + {"addDash", OPTION_BOOLEAN}, + {NULL, 0}, +}; + +struct bedHead +{ + struct bed *list; +}; + +/* Compact per-component data. Replaces keeping full mafComp structs + * in memory during pass 1. 12 bytes vs ~110 bytes (struct+src+malloc + * overhead) per component. */ +struct compInfo +{ + int leftLen; + int rightLen; + char leftStatus; + char rightStatus; + short speciesIdx; /* index into speciesByIdx[], -1 for master */ +}; + +/* Block boundary tracking for the compInfo flat array. */ +struct blockIndex +{ + int nBlocks; + long totalComps; + int *compCounts; /* number of components per block */ + long *offsets; /* offsets[i] = cumulative comp count before block i */ +}; + +struct blockStatus +/* Per-species tracking state for fillHoles logic. + * Owns its src string (cloned). */ +{ + int masterStart, masterEnd; + boolean active; + char *src; + int srcSize; + char strand; + int start; + int size; + char rightStatus; + int rightLen; +}; + +struct subSpecies +{ + struct subSpecies *next; + char *name; + int idx; /* index for speciesByIdx[] reverse lookup */ + struct hash *hash; + struct blockStatus blockStatus; + /* Per-species state for block-major bridgeSpecies */ + int bridgePushState; + int bridgeLeftLen; + /* Per-block species lookup cache */ + boolean seenInBlock; + struct mafComp *blockMc; +}; + +struct linkBlock +{ + struct linkBlock *next; + struct cBlock cb; + long compIdx; /* index into compInfo array (not a pointer, + * survives realloc) */ +}; + +struct strandHead +{ + struct strandHead *next; + char strand; + char *name; + char *qName; + int qSize; + char *species; + struct linkBlock *links; +}; + +static char *chromFromSrc(char *src) +/* get chrom name from . + returned pointer should be on the . separator */ +{ +char *p; +if ((p = strchr(src, '.')) == NULL) + errAbort("Can't find chrom in MAF component src: %s\n", src); +char *skipDot = p; +++skipDot; /* skip the dot to the word following */ +if (startsWith("GC", src)) + { + char *nextDot = strchr(skipDot,'.'); + if (nextDot) + { + p = nextDot; /* new answer */ + } + } /* else: no next dot, leave p it where it is */ +return p; +} + +static void parseSpeciesFromSrc(char *src, char *buf, int bufSize) +/* Extract species name from "species.chrom" src string into buf. */ +{ +safef(buf, bufSize, "%s", src); +char *dot = chromFromSrc(buf); +*dot = 0; +} + +static void updateBlockStatus(struct blockStatus *bs, struct mafComp *mc) +/* Copy metadata fields from mc into blockStatus, cloning src. */ +{ +freez(&bs->src); +bs->src = cloneString(mc->src); +bs->srcSize = mc->srcSize; +bs->strand = mc->strand; +bs->start = mc->start; +bs->size = mc->size; +bs->rightStatus = mc->rightStatus; +bs->rightLen = mc->rightLen; +bs->active = TRUE; +} + +static void freeAllLinkBlocks(struct strandHead *sh) +/* Free all linkBlock chains from strandHeads. */ +{ +for (; sh; sh = sh->next) + { + struct linkBlock *lb, *next; + for (lb = sh->links; lb; lb = next) + { + next = lb->next; + freeMem(lb); + } + sh->links = NULL; + } +} + +static void freeBlockIndex(struct blockIndex *bi) +{ +freeMem(bi->compCounts); +freeMem(bi->offsets); +} + +/* ---- Pass 1: compact read ---- */ + +static int nextSpeciesIdx = 0; +static struct subSpecies **speciesByIdx = NULL; +static int speciesByIdxCapacity = 0; + +void readMafsCompact(struct mafFile *mf, + struct compInfo **retCompArray, long *retCompCount, + struct blockIndex *blockIdx) +/* Read all maf blocks. For each block, extract component metadata into + * the compInfo flat array, build linkBlock chains for strandHeads, then + * free the mafAli immediately. No mafComp structs are kept in memory. + * Peak memory: compInfo array (~12 bytes/comp) + linkBlocks (~32 bytes/comp). */ +{ +struct mafAli *maf; +char buffer[2048]; +char buffer2[2048]; +struct strandHead *strandHead; +char *ourChrom = NULL; + +/* Dynamic compInfo array */ +long compCapacity = 4 * 1024 * 1024; +struct compInfo *compArray = needLargeMem(compCapacity * sizeof(struct compInfo)); +long compCount = 0; + +/* Dynamic block index arrays */ +int blockCapacity = 1024 * 1024; +int *blockCompCounts = needLargeMem(blockCapacity * sizeof(int)); +long *blockOffsets = needLargeMem(blockCapacity * sizeof(long)); +int nBlocks = 0; + +/* Species by index array */ +speciesByIdxCapacity = 256; +speciesByIdx = needLargeMem(speciesByIdxCapacity * sizeof(struct subSpecies *)); + +while((maf = mafNext(mf)) != NULL) + { + struct mafComp *mc, *masterMc = maf->components; + char *species = buffer; + char *chrom; + + if (ourChrom == NULL) + ourChrom = cloneString(masterMc->src); + else + { + if (differentString(masterMc->src, ourChrom)) + errAbort("ERROR: mafAddIrows requires maf have only one target sequence.\n" + "Use mafSplit -byTarget -useFullSequenceName to split maf"); + } + + strcpy(species, masterMc->src); + chrom = chromFromSrc(species); + if (chrom) + *chrom++ = 0; + else + errAbort("reference species has no chrom name\n"); + + if (masterSpecies == NULL) + { + masterSpecies = cloneString(species); + masterChrom = cloneString(chrom); + } + else + { + if (!sameString(masterSpecies, species)) + errAbort("first species (%s) not master species (%s)\n",species,masterSpecies); + } + + /* Grow block index if needed */ + if (nBlocks >= blockCapacity) + { + blockCapacity *= 2; + blockCompCounts = needLargeMemResize(blockCompCounts, + blockCapacity * sizeof(int)); + blockOffsets = needLargeMemResize(blockOffsets, + blockCapacity * sizeof(long)); + } + blockOffsets[nBlocks] = compCount; + int blockCompStart = compCount; + + /* Store master component in compInfo (speciesIdx = -1) */ + if (compCount >= compCapacity) + { + compCapacity *= 2; + compArray = needLargeMemResize(compArray, + compCapacity * sizeof(struct compInfo)); + } + compArray[compCount].leftStatus = 0; + compArray[compCount].rightStatus = 0; + compArray[compCount].leftLen = 0; + compArray[compCount].rightLen = 0; + compArray[compCount].speciesIdx = -1; + compCount++; + + for(mc = masterMc->next; mc; mc = mc->next) + { + struct linkBlock *linkBlock; + struct subSpecies *subSpecies = NULL; + + strcpy(species, mc->src); + chrom = chromFromSrc(species); + *chrom++ = 0; + + if ((subSpecies = hashFindVal(speciesHash, species)) == NULL) + { + AllocVar(subSpecies); + subSpecies->name = cloneString(species); + subSpecies->idx = nextSpeciesIdx++; + subSpecies->hash = newHash(6); + subSpecies->blockStatus.masterStart = masterMc->start; + slAddHead(&speciesList, subSpecies); + hashAdd(speciesHash, species, subSpecies); + /* Grow speciesByIdx if needed */ + if (subSpecies->idx >= speciesByIdxCapacity) + { + speciesByIdxCapacity *= 2; + speciesByIdx = needLargeMemResize(speciesByIdx, + speciesByIdxCapacity * sizeof(struct subSpecies *)); + } + speciesByIdx[subSpecies->idx] = subSpecies; + } + subSpecies->blockStatus.masterEnd = masterMc->start + masterMc->size; + + sprintf(buffer2, "%s%c%s", masterChrom, mc->strand, chrom); + if ((strandHead = hashFindVal(subSpecies->hash, buffer2)) == NULL) + { + AllocVar(strandHead); + hashAdd(subSpecies->hash, buffer2, strandHead); + strandHead->name = cloneString(buffer2); + strandHead->species = cloneString(species); + strandHead->qName = cloneString(chrom); + strandHead->qSize = mc->srcSize; + strandHead->strand = mc->strand; + slAddHead(&strandHeads, strandHead); + } + + /* Store component in compInfo */ + if (compCount >= compCapacity) + { + compCapacity *= 2; + compArray = needLargeMemResize(compArray, + compCapacity * sizeof(struct compInfo)); + } + compArray[compCount].leftStatus = 0; + compArray[compCount].rightStatus = 0; + compArray[compCount].leftLen = 0; + compArray[compCount].rightLen = 0; + compArray[compCount].speciesIdx = subSpecies->idx; + + /* Build linkBlock with index into compArray (not mc pointer) */ + AllocVar(linkBlock); + linkBlock->compIdx = compCount; + linkBlock->cb.qStart = mc->start; + linkBlock->cb.qEnd = mc->start + mc->size; + linkBlock->cb.tStart = masterMc->start; + linkBlock->cb.tEnd = masterMc->start + masterMc->size; + slAddHead(&strandHead->links, linkBlock); + + compCount++; + } + + blockCompCounts[nBlocks] = compCount - blockCompStart; + nBlocks++; + + /* Free the entire mafAli immediately -- we've extracted everything + * we need into compInfo and linkBlocks. */ + mafAliFree(&maf); + } + +/* Trim arrays to exact size */ +compArray = needLargeMemResize(compArray, compCount * sizeof(struct compInfo)); +blockCompCounts = needLargeMemResize(blockCompCounts, nBlocks * sizeof(int)); +blockOffsets = needLargeMemResize(blockOffsets, nBlocks * sizeof(long)); + +*retCompArray = compArray; +*retCompCount = compCount; +blockIdx->nBlocks = nBlocks; +blockIdx->totalComps = compCount; +blockIdx->compCounts = blockCompCounts; +blockIdx->offsets = blockOffsets; +} + + +void chainStrands(struct strandHead *strandHead, struct hash *bedHash, + struct compInfo *compArray) +/* Compute i-row statuses by walking consecutive link pairs per strand chain. + * Writes statuses into compArray via linkBlock->compIdx. */ +{ +for(; strandHead ; strandHead = strandHead->next) + { + struct linkBlock *link, *prevLink; + struct hashEl *hel = hashLookup(bedHash, strandHead->species); + struct hash *chromHash = (hel != NULL) ? hel->val : NULL; + struct bedHead *bedHead = (chromHash != NULL) ? + hashFindVal(chromHash, strandHead->qName): NULL; + + slReverse(&strandHead->links); + + prevLink = strandHead->links; + compArray[prevLink->compIdx].leftStatus = MAF_NEW_STATUS; + for(link = prevLink->next; link; prevLink = link, link = link->next) + { + int tDiff = link->cb.tStart - prevLink->cb.tEnd; + int qDiff = link->cb.qStart - prevLink->cb.qEnd; + int nCount = 0; + int nStart, nEnd; + struct bed *bed; + + if (strandHead->strand == '+') + { + nStart = prevLink->cb.qEnd; + nEnd = link->cb.qStart; + } + else + { + nEnd = strandHead->qSize - prevLink->cb.qEnd; + nStart = strandHead->qSize - link->cb.qStart; + } + + /* a very inefficient search for an N bed */ + if ((nEnd - nStart > 0) && (bedHead)) + { + for(bed = bedHead->list; bed; bed = bed->next) + { + if (bed->chromStart >= nEnd) + break; + + if ( bed->chromEnd > nStart) + { + nCount += positiveRangeIntersection(nStart, nEnd, + bed->chromStart, bed->chromEnd); + } + } + } + + + if ((qDiff && (100 * nCount / qDiff > 95)) + && (tDiff && (100 * nCount / tDiff > 10))) + { + compArray[prevLink->compIdx].rightStatus = MAF_MISSING_STATUS; + compArray[prevLink->compIdx].rightLen = nCount; + compArray[link->compIdx].leftStatus = MAF_MISSING_STATUS; + compArray[link->compIdx].leftLen = nCount; + } + else if ((tDiff > 100000) || + (qDiff > 100000) || (qDiff < -100000)) + { + compArray[prevLink->compIdx].rightStatus = MAF_NEW_STATUS; + compArray[prevLink->compIdx].rightLen = 0; + compArray[link->compIdx].leftStatus = MAF_NEW_STATUS; + compArray[link->compIdx].leftLen = 0; + } + else if (qDiff < 0) + { + compArray[prevLink->compIdx].rightStatus = MAF_TANDEM_STATUS; + compArray[prevLink->compIdx].rightLen = -qDiff; + compArray[link->compIdx].leftStatus = MAF_TANDEM_STATUS; + compArray[link->compIdx].leftLen = -qDiff; + } + else if (qDiff == 0) + { + compArray[prevLink->compIdx].rightStatus = MAF_CONTIG_STATUS; + compArray[prevLink->compIdx].rightLen = 0; + compArray[link->compIdx].leftStatus = MAF_CONTIG_STATUS; + compArray[link->compIdx].leftLen = 0; + } + else + { + compArray[prevLink->compIdx].rightStatus = MAF_INSERT_STATUS; + compArray[prevLink->compIdx].rightLen = qDiff; + compArray[link->compIdx].leftStatus = MAF_INSERT_STATUS; + compArray[link->compIdx].leftLen = qDiff; + } + } + compArray[prevLink->compIdx].rightStatus = MAF_NEW_STATUS; + } +} + +void bridgeSpecies(struct compInfo *compArray, struct blockIndex *blockIdx, + struct subSpecies *speciesListHead) +/* Block-major bridgeSpecies. Iterates compArray using block offsets. + * Uses speciesIdx for O(1) species lookup. */ +{ +struct subSpecies *sp; +int bi; + +for (sp = speciesListHead; sp; sp = sp->next) + { + sp->bridgePushState = 0; + sp->bridgeLeftLen = 0; + } + +for (bi = 0; bi < blockIdx->nBlocks; bi++) + { + long offset = blockIdx->offsets[bi]; + int count = blockIdx->compCounts[bi]; + int i; + + for (sp = speciesListHead; sp; sp = sp->next) + sp->seenInBlock = FALSE; + + /* Skip component 0 (master, speciesIdx == -1) */ + for (i = 1; i < count; i++) + { + struct compInfo *ci = &compArray[offset + i]; + struct subSpecies *sub; + + if (ci->speciesIdx < 0) + continue; + sub = speciesByIdx[ci->speciesIdx]; + if (sub->seenInBlock) + continue; + sub->seenInBlock = TRUE; + + if (ci->leftStatus == 0) + errAbort("zero left status in block %d, component %d\n", bi, i); + + if (ci->leftStatus == MAF_NEW_STATUS) + { + if (sub->bridgePushState) + { + ci->leftStatus = MAF_NEW_NESTED_STATUS; + ci->leftLen = sub->bridgeLeftLen; + } + sub->bridgePushState++; + } + + if (ci->rightStatus == 0) + { + errAbort("zero right status in block %d, component %d\n", bi, i); + } + else if (isContigOrTandem(ci->rightStatus) || (ci->rightStatus == MAF_INSERT_STATUS)) + sub->bridgeLeftLen = ci->rightLen; + else if (ci->rightStatus == MAF_NEW_STATUS) + { + sub->bridgePushState--; + if (sub->bridgePushState) + { + ci->rightStatus = MAF_NEW_NESTED_STATUS; + ci->rightLen = sub->bridgeLeftLen; + } + } + } + } +} + + +/* ---- Pass 2: re-read, apply statuses, fill holes, write ---- */ + +static void applyStatuses(struct compInfo *compArray, struct blockIndex *blockIdx, + int blockNum, struct mafAli *maf) +/* Copy pre-computed statuses from compArray onto freshly-read maf block. */ +{ +long offset = blockIdx->offsets[blockNum]; +int count = blockIdx->compCounts[blockNum]; +struct mafComp *mc; +int i; +for (mc = maf->components, i = 0; mc && i < count; mc = mc->next, i++) + { + mc->leftStatus = compArray[offset + i].leftStatus; + mc->leftLen = compArray[offset + i].leftLen; + mc->rightStatus = compArray[offset + i].rightStatus; + mc->rightLen = compArray[offset + i].rightLen; + } +} + +void streamFillAndWrite(char *mafIn, struct compInfo *compArray, + struct blockIndex *blockIdx, + struct subSpecies *speciesList, struct twoBitFile *twoBit, FILE *f) +/* Pass 2: re-read the MAF one block at a time, apply statuses, fill holes, + * write and free. */ +{ +struct mafFile *mf2 = mafOpen(mafIn); +struct mafAli *maf; +int lastEnd = 100000000; +struct subSpecies *species; +int blockNum = 0; + +while ((maf = mafNext(mf2)) != NULL) + { + struct mafComp *mc = NULL, *masterMc, *lastMc = NULL; + struct blockStatus *blockStatus; + + applyStatuses(compArray, blockIdx, blockNum, maf); + + masterMc = maf->components; + + /* SECTION A: fill gap between previous block and this one */ + if (masterMc->start > lastEnd) + { + struct mafAli *gapMaf = NULL; + struct mafComp *lastGapMc = NULL; + + for(species = speciesList; species; species = species->next) + { + mc = NULL; + blockStatus = &species->blockStatus; + if (blockStatus->active) + { + switch (blockStatus->rightStatus) + { + case MAF_MISSING_STATUS: + case MAF_NEW_NESTED_STATUS: + case MAF_MAYBE_NEW_NESTED_STATUS: + case MAF_CONTIG_STATUS: + case MAF_TANDEM_STATUS: + case MAF_INSERT_STATUS: + AllocVar(mc); + mc->rightStatus = mc->leftStatus = blockStatus->rightStatus; + mc->rightLen = mc->leftLen = blockStatus->rightLen; + mc->src = cloneString(blockStatus->src); + mc->srcSize = blockStatus->srcSize; + mc->strand = blockStatus->strand; + mc->start = blockStatus->start + blockStatus->size; + if (lastGapMc == NULL) + { + struct mafComp *miniMasterMc = NULL; + char *seqName; + struct dnaSeq *seq; + + AllocVar(miniMasterMc); + miniMasterMc->next = mc; + miniMasterMc->strand = '+'; + miniMasterMc->srcSize = masterMc->srcSize; + miniMasterMc->src = cloneString(masterMc->src); + miniMasterMc->start = lastEnd; + miniMasterMc->size = masterMc->start - lastEnd; + + if ((seqName = chromFromSrc(miniMasterMc->src)) != NULL) + seqName++; + else + seqName = miniMasterMc->src; + + seq = twoBitReadSeqFrag(twoBit, seqName, lastEnd, masterMc->start); + miniMasterMc->text = seq->dna; + + AllocVar(gapMaf); + gapMaf->textSize = maf->textSize; + gapMaf->components = miniMasterMc; + } + else + { + lastGapMc->next = mc; + } + lastGapMc = mc; + if (blockStatus->rightStatus == MAF_MISSING_STATUS) + { + if (addN) + { + char buffer[256]; + + safef(buffer, sizeof(buffer), "%s.N",species->name); + freez(&mc->src); + mc->src = cloneString(buffer); + mc->start = 0; + mc->srcSize = 200000; + mc->size = masterMc->start - lastEnd; + mc->text = needMem(mc->size + 1); + memset(mc->text, 'N', mc->size); + } + } + else + { + if (addDash) + { + mc->size = masterMc->size; + mc->srcSize = blockStatus->srcSize; + mc->text = needMem(mc->size + 1); + memset(mc->text, '-', mc->size); + mc->text[mc->size] = 0; + if (mc->size == 0) + errAbort("bad dash add"); + mc->size = 0; + } + } + break; + } + } + } + + if (gapMaf) + { + mafWrite(f, gapMaf); + mafAliFree(&gapMaf); + } + } + + lastEnd = masterMc->start + masterMc->size; + for(lastMc = masterMc; lastMc->next; lastMc = lastMc->next) + ; + + /* Pre-scan: build per-species component lookup for this block. */ + { + char speciesBuf[2048]; + struct mafComp *scanMc; + for (species = speciesList; species; species = species->next) + { + species->seenInBlock = FALSE; + species->blockMc = NULL; + } + for (scanMc = masterMc->next; scanMc; scanMc = scanMc->next) + { + struct subSpecies *sub; + parseSpeciesFromSrc(scanMc->src, speciesBuf, sizeof(speciesBuf)); + sub = hashFindVal(speciesHash, speciesBuf); + if (sub && !sub->seenInBlock) + { + sub->blockMc = scanMc; + sub->seenInBlock = TRUE; + } + } + } + + /* SECTION B: add missing species to current block */ + for(species = speciesList; species; species = species->next) + { + blockStatus = &species->blockStatus; + + mc = NULL; + if ((blockStatus->masterStart <= masterMc->start) && + (blockStatus->masterEnd > masterMc->start)) + { + mc = species->blockMc; + } + if ((blockStatus->masterStart <= masterMc->start) && + (blockStatus->masterEnd > masterMc->start) && + (mc == NULL)) + { + if (blockStatus->active) + { + switch (blockStatus->rightStatus) + { + case MAF_MISSING_STATUS: + case MAF_CONTIG_STATUS: + case MAF_TANDEM_STATUS: + case MAF_INSERT_STATUS: + case MAF_NEW_NESTED_STATUS: + case MAF_MAYBE_NEW_NESTED_STATUS: + AllocVar(mc); + mc->rightStatus = mc->leftStatus = blockStatus->rightStatus; + if (mc->rightStatus == MAF_NEW_NESTED_STATUS) + mc->rightStatus = MAF_INSERT_STATUS; + if (mc->leftStatus == MAF_NEW_NESTED_STATUS) + mc->leftStatus = MAF_INSERT_STATUS; + mc->rightLen = mc->leftLen = blockStatus->rightLen; + mc->src = cloneString(blockStatus->src); + mc->strand = blockStatus->strand; + mc->srcSize = blockStatus->srcSize; + mc->start = blockStatus->start + blockStatus->size ; + lastMc->next = mc; + lastMc = mc; + if (addN && (blockStatus->rightStatus == MAF_MISSING_STATUS)) + { + char buffer[256]; + + safef(buffer, sizeof(buffer), "%s.N",species->name); + freez(&mc->src); + mc->src = cloneString(buffer); + mc->start = 0; + mc->srcSize = 200000; + mc->size = maf->textSize; + mc->text = needMem(mc->size + 1); + memset(mc->text, 'N', mc->size); + } + else if (addDash) + { + mc->size = masterMc->size; + mc->text = needMem(mc->size + 1); + if (mc->size == 0) + errAbort("bad dash add"); + memset(mc->text, '-', mc->size); + mc->text[mc->size] = 0; + mc->size = 0; + } + break; + default: + break; + } + } + } + if (mc) + { + updateBlockStatus(blockStatus, mc); + } + } + + mafWrite(f, maf); + mafAliFree(&maf); + blockNum++; + } + +mafFileFree(&mf2); +} + + +struct hash *readBed(char *fileName) +/* Read bed and return it as a hash keyed by chromName */ +{ +char *row[3]; +struct lineFile *lf = lineFileOpen(fileName, TRUE); +struct bedHead *bedHead = NULL; +struct hash *hash = newHash(6); +struct hashEl *hel, *lastHel = NULL; +struct bed3 *bed; + +while (lineFileRow(lf, row)) + { + hel = hashLookup(hash, row[0]); + if ((lastHel) && (hel != lastHel)) + { + assert(bedHead != NULL); + slReverse(&bedHead->list); + } + + if (hel == NULL) + { + char *ptr; + + AllocVar(bedHead); + ptr = row[0]; + hel = hashAdd(hash, ptr, bedHead); + } + bedHead = hel->val; + AllocVar(bed); + bed->chrom = hel->name; + bed->chromStart = lineFileNeedNum(lf, row, 1); + bed->chromEnd = lineFileNeedNum(lf, row, 2); + if (bed->chromStart > bed->chromEnd) + errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); + slAddHead(&bedHead->list, (struct bed *)bed); + lastHel = hel; + } + +if (bedHead != NULL) + slReverse(&bedHead->list); +lineFileClose(&lf); +return hash; +} + +void addBed(char *file, struct hash *fileHash) +{ +char name[128]; + +if (!endsWith(file, ".bed")) + errAbort("filenames in bed list must end in '.bed'"); + +splitPath(file, NULL, name, NULL); +hashAdd(fileHash, name, readBed(file)); +} + +void mafAddIRows(char *mafIn, char *twoBitIn, char *mafOut, char *nBedFile) +/* mafAddIRows - add 'i' rows to a maf, streaming two-pass version. */ +{ +FILE *f = mustOpen(mafOut, "w"); +struct twoBitFile *twoBit = twoBitOpen(twoBitIn); +struct mafFile *mf = mafOpen(mafIn); +struct hash *bedHash = newHash(6); +char *scoring; +struct compInfo *compArray; +long compCount; +struct blockIndex blockIdx; + +if (nBedFile != NULL) + { + struct lineFile *lf = lineFileOpen(nBedFile, TRUE); + char *row[1]; + while (lineFileRow(lf, row)) + { + addBed(row[0], bedHash); + } + lineFileClose(&lf); + } + +speciesHash = newHash(6); + +/* Pass 1: compact read into flat compInfo array + linkBlock chains */ +readMafsCompact(mf, &compArray, &compCount, &blockIdx); +scoring = cloneString(mf->scoring); +mafFileFree(&mf); + +verbose(1, "# pass 1 read: %d blocks, %ld components, %d species\n", + blockIdx.nBlocks, compCount, nextSpeciesIdx); + +/* Compute i-row statuses on compArray via linkBlock chain walks */ +chainStrands(strandHeads, bedHash, compArray); + +/* Free linkBlocks -- no longer needed. Reclaims ~32 bytes/component. */ +freeAllLinkBlocks(strandHeads); + +verbose(1, "# chainStrands done, linkBlocks freed\n"); + +/* Compute bridge/nested statuses directly on compArray */ +bridgeSpecies(compArray, &blockIdx, speciesList); + +verbose(1, "# bridgeSpecies done, starting pass 2\n"); + +/* Pass 2: re-read MAF with text, apply statuses, fill holes, write */ +mafWriteStart(f, scoring); +streamFillAndWrite(mafIn, compArray, &blockIdx, speciesList, twoBit, f); + +freeMem(compArray); +freeBlockIndex(&blockIdx); +freez(&scoring); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +char *nBedFile; + +optionInit(&argc, argv, options); +if (argc != 4) + usage(); + +nBedFile = optionVal("nBeds", NULL); +addN = optionExists("addN"); +addDash = optionExists("addDash"); + +mafAddIRows(argv[1], argv[2], argv[3], nBedFile); +return 0; +}