4898794edd81be5285ea6e544acbedeaeb31bf78 max Tue Nov 23 08:10:57 2021 -0800 Fixing pointers to README file for license in all source code files. refs #27614 diff --git src/hg/mouseStuff/axtChain/axtChain.c src/hg/mouseStuff/axtChain/axtChain.c index 873b263..2875cad 100644 --- src/hg/mouseStuff/axtChain/axtChain.c +++ src/hg/mouseStuff/axtChain/axtChain.c @@ -1,510 +1,510 @@ /* axtChain - Chain together axt alignments.. */ /* Copyright (C) 2013 The Regents of the University of California - * See README in this or parent directory for licensing information. */ + * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "dnaseq.h" #include "nib.h" #include "twoBit.h" #include "fa.h" #include "axt.h" #include "psl.h" #include "boxClump.h" #include "portable.h" #include "chainBlock.h" #include "gapCalc.h" #include "chainConnect.h" /* Variables set via command line. */ int minScore = 1000; char *detailsName = NULL; char *gapFileName = NULL; struct gapCalc *gapCalc = NULL; /* Gap scoring scheme to use. */ struct axtScoreScheme *scoreScheme = NULL; void usage() /* Explain usage and exit. */ { errAbort( "axtChain - Chain together axt alignments.\n" "usage:\n" " axtChain [options] -linearGap=loose in.axt tNibDir qNibDir out.chain\n" "Where tNibDir/qNibDir are either directories full of nib files, the name\n" "of a .2bit file, or a single fasta file with additional -faQ or -faT options.\n" "options:\n" " -psl Use psl instead of axt format for input\n" " -faQ The specified qNibDir is a fasta file with multiple sequences for query\n" " -faT The specified tNibDir is a fasta file with multiple sequences for target\n" " NOTE: will not work with gzipped fasta files\n" " -minScore=N Minimum score for chain, default %d\n" " -details=fileName Output some additional chain details\n" " -scoreScheme=fileName Read the scoring matrix from a blastz-format file\n" " -linearGap=<medium|loose|filename> Specify type of linearGap to use.\n" " *Must* specify this argument to one of these choices.\n" " loose is chicken/human linear gap costs.\n" " medium is mouse/human linear gap costs.\n" " Or specify a piecewise linearGap tab delimited file.\n" " sample linearGap file (loose)\n" "%s" , minScore, gapCalcSampleFileContents() ); } struct seqPair /* Pair of sequences. */ { struct seqPair *next; char *name; /* Allocated in hash */ char *qName; /* Name of query sequence. */ char *tName; /* Name of target sequence. */ char qStrand; /* Strand of query sequence. */ struct cBlock *blockList; /* List of alignments. */ int axtCount; /* Count of axt's that make this up (just for stats) */ }; int seqPairCmp(const void *va, const void *vb) /* Compare to sort based on tName,qName. */ { const struct seqPair *a = *((struct seqPair **)va); const struct seqPair *b = *((struct seqPair **)vb); int dif; dif = strcmp(a->tName, b->tName); if (dif == 0) dif = strcmp(a->qName, b->qName); if (dif == 0) dif = (int)a->qStrand - (int)b->qStrand; return dif; } void addPslBlocks(struct cBlock **pList, struct psl *psl) /* Add blocks (gapless subalignments) from psl to block list. */ { int i; for (i=0; i<psl->blockCount; ++i) { struct cBlock *b; int size; AllocVar(b); size = psl->blockSizes[i]; b->qStart = b->qEnd = psl->qStarts[i]; b->qEnd += size; b->tStart = b->tEnd = psl->tStarts[i]; b->tEnd += size; slAddHead(pList, b); } } struct twoBitFile *twoBitOpenCached(char *path) /* Return open two bit file associated with path. */ { static struct hash *hash = NULL; struct twoBitFile *tbf; if (hash == NULL) hash = newHash(8); tbf = hashFindVal(hash, path); if (tbf == NULL) { tbf = twoBitOpen(path); hashAdd(hash, path, tbf); } return tbf; } void loadIfNewSeq(char *seqPath, boolean isTwoBit, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand) /* Load sequence unless it is already loaded. Reverse complement * if necessary. */ { struct dnaSeq *seq; if (sameString(newName, *pName)) { if (strand != *pStrand) { seq = *pSeq; reverseComplement(seq->dna, seq->size); *pStrand = strand; } } else { char fileName[512]; freeDnaSeq(pSeq); if (isTwoBit) { struct twoBitFile *tbf = twoBitOpenCached(seqPath); *pSeq = seq = twoBitReadSeqFrag(tbf, newName, 0, 0); verbose(1, "Loaded %d bases of %s from %s\n", seq->size, newName, seqPath); } else { snprintf(fileName, sizeof(fileName), "%s/%s.nib", seqPath, newName); *pSeq = seq = nibLoadAllMasked(NIB_MASK_MIXED, fileName); verbose(1, "Loaded %d bases in %s\n", seq->size, fileName); } *pName = newName; *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); } } static void loadFaSeq(struct hash *faHash, char *newName, char strand, char **pName, struct dnaSeq **pSeq, char *pStrand, char *fastaFileName) /* retrieve sequence from hash. Reverse complement * if necessary. */ { struct dnaSeq *seq; if (sameString(newName, *pName)) { if (strand != *pStrand) { seq = *pSeq; reverseComplement(seq->dna, seq->size); *pStrand = strand; } } else { *pName = newName; *pSeq = seq = hashFindVal(faHash, newName); if (NULL == seq) errAbort("ERROR: can not find sequence name '%s' from fasta file '%s'\n", newName, fastaFileName); *pStrand = strand; if (strand == '-') reverseComplement(seq->dna, seq->size); verbose(1, "Loaded %d bases from %s fa\n", seq->size, newName); } } void removeExactOverlaps(struct cBlock **pBoxList) /* Remove from list blocks that start in exactly the same * place on both coordinates. */ { struct cBlock *newList = NULL, *b, *next, *last = NULL; slSort(pBoxList, cBlockCmpBoth); for (b = *pBoxList; b != NULL; b = next) { next = b->next; if (last != NULL && b->qStart == last->qStart && b->tStart == last->tStart) { /* Fold this block into previous one. */ if (last->qEnd < b->qEnd) last->qEnd = b->qEnd; if (last->tEnd < b->tEnd) last->tEnd = b->tEnd; freeMem(b); } else { slAddHead(&newList, b); last = b; } } slReverse(&newList); *pBoxList = newList; } #ifdef TESTONLY void abortChain(struct chain *chain, char *message) /* Report chain problem and abort. */ { errAbort("%s tName %s, tStart %d, tEnd %d, qStrand %c, qName %s, qStart %d, qEnd %d", message, chain->tName, chain->tStart, chain->tEnd, chain->qStrand, chain->qName, chain->qStart, chain->qEnd); } void checkChainScore(struct chain *chain, struct dnaSeq *qSeq, struct dnaSeq *tSeq) /* Check that chain score is reasonable. */ { struct cBlock *b; int totalBases = 0; double maxPerBase = 100, maxScore; int gapCount = 0; int size = 0; int gaplessScore = 0; int oneScore = 0; for (b = chain->blockList; b != NULL; b = b->next) { size = b->qEnd - b->qStart; if (size != b->tEnd - b->tStart) abortChain(chain, "q/t size mismatch"); totalBases += b->qEnd - b->qStart; ++gapCount; } maxScore = totalBases * maxPerBase; if (maxScore < chain->score) { verbose(1, "maxScore %f, chainScore %f\n", maxScore, chain->score); for (b = chain->blockList; b != NULL; b = b->next) { size = b->qEnd - b->qStart; oneScore = chainScoreBlock(qSeq->dna + b->qStart, tSeq->dna + b->tStart, size, scoreData.ss->matrix); verbose(1, " q %d, t %d, size %d, score %d\n", b->qStart, b->tStart, size, oneScore); gaplessScore += oneScore; } verbose(1, "gaplessScore %d\n", gaplessScore); abortChain(chain, "score too big"); } } #endif /* TESTONLY */ static void checkBlockRange(char *what, struct dnaSeq *seq, int start, int end) /* check block is in range of sequence */ { if (end > seq->size) errAbort("%s %s block %d-%d exceeds sequence length %d", what, seq->name, start, end, seq->size); } void chainPair(struct seqPair *sp, struct dnaSeq *qSeq, struct dnaSeq *tSeq, struct chain **pChainList, FILE *details) /* Chain up blocks and output. */ { struct chain *chainList, *chain, *next; struct cBlock *b; long startTime, dt; int size = 0; struct chainConnect cc; verbose(1, "chainPair %s\n", sp->name); /* Set up info for connect function. */ ZeroVar(&cc); cc.query = qSeq; cc.target = tSeq; cc.ss = scoreScheme; cc.gapCalc = gapCalc; /* Score blocks. */ for (b = sp->blockList; b != NULL; b = b->next) { size = b->qEnd - b->qStart; checkBlockRange("query", qSeq, b->qStart, b->qEnd); checkBlockRange("target", tSeq, b->tStart, b->tEnd); b->score = axtScoreUngapped(scoreScheme, qSeq->dna + b->qStart, tSeq->dna + b->tStart, size); } /* Get chain list and clean it up a little. */ startTime = clock1000(); chainList = chainBlocks(sp->qName, qSeq->size, sp->qStrand, sp->tName, tSeq->size, &sp->blockList, (ConnectCost)chainConnectCost, (GapCost)chainConnectGapCost, &cc, details); dt = clock1000() - startTime; verbose(1, "Main chaining step done in %ld milliseconds\n", dt); for (chain = chainList; chain != NULL; chain = chain->next) { chainRemovePartialOverlaps(chain, qSeq, tSeq, scoreScheme->matrix); chainMergeAbutting(chain); chain->score = chainCalcScore(chain, scoreScheme, gapCalc, qSeq, tSeq); } /* Move chains scoring over threshold to master list. */ for (chain = chainList; chain != NULL; chain = next) { next = chain->next; if (chain->score >= minScore) { slAddHead(pChainList, chain); } else { chainFree(&chain); } } } struct seqPair *readAxtBlocks(char *fileName, struct hash *pairHash, FILE *f) /* Read in axt file and parse blocks into pairHash */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct dyString *dy = newDyString(512); struct axt *axt; struct seqPair *spList = NULL, *sp; lineFileSetMetaDataOutput(lf, f); lineFileSetUniqueMetaData(lf); while ((axt = axtRead(lf)) != NULL) { dyStringClear(dy); dyStringPrintf(dy, "%s%c%s", axt->qName, axt->qStrand, axt->tName); sp = hashFindVal(pairHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(pairHash, dy->string, sp, &sp->name); sp->qName = cloneString(axt->qName); sp->tName = cloneString(axt->tName); sp->qStrand = axt->qStrand; } axtAddBlocksToBoxInList(&sp->blockList, axt); sp->axtCount += 1; axtFree(&axt); } lineFileClose(&lf); dyStringFree(&dy); slSort(&spList, seqPairCmp); return spList; } struct seqPair *readPslBlocks(char *fileName, struct hash *pairHash, FILE *f) /* Read in psl file and parse blocks into pairHash */ { struct seqPair *spList = NULL, *sp; struct lineFile *lf = pslFileOpenWithUniqueMeta(fileName, f); struct dyString *dy = newDyString(512); struct psl *psl; while ((psl = pslNext(lf)) != NULL) { if (psl->strand[1] != '\0') errAbort("requires PSLs to have implicit positive strand, found `%s'", psl->strand); dyStringClear(dy); dyStringPrintf(dy, "%s%s%s", psl->qName, psl->strand, psl->tName); sp = hashFindVal(pairHash, dy->string); if (sp == NULL) { AllocVar(sp); slAddHead(&spList, sp); hashAddSaveName(pairHash, dy->string, sp, &sp->name); sp->qName = cloneString(psl->qName); sp->tName = cloneString(psl->tName); sp->qStrand = psl->strand[0]; } addPslBlocks(&sp->blockList, psl); sp->axtCount += 1; pslFree(&psl); } lineFileClose(&lf); dyStringFree(&dy); return spList; } void axtChain(char *axtIn, char *tNibDir, char *qNibDir, char *chainOut) /* axtChain - Chain together axt alignments.. */ { struct hash *pairHash = newHash(0); /* Hash keyed by qSeq<strand>tSeq */ struct seqPair *spList = NULL, *sp; FILE *f = mustOpen(chainOut, "w"); char *qName = "", *tName = ""; struct dnaSeq *qSeq = NULL, *tSeq = NULL; char qStrand = 0, tStrand = 0; struct chain *chainList = NULL, *chain; FILE *details = NULL; struct dnaSeq *seq = NULL; struct hash *qFaHash = newHash(0); struct hash *tFaHash = newHash(0); FILE *faF; boolean qIsTwoBit = twoBitIsFile(qNibDir); boolean tIsTwoBit = twoBitIsFile(tNibDir); axtScoreSchemeDnaWrite(scoreScheme, f, "axtChain"); if (detailsName != NULL) details = mustOpen(detailsName, "w"); /* Read input file and divide alignments into various parts. */ if (optionExists("psl")) spList = readPslBlocks(axtIn, pairHash, f); else spList = readAxtBlocks(axtIn, pairHash, f); if (optionExists("faQ")) { faF = mustOpen(qNibDir, "r"); verbose(1, "reading query fasta sequence from '%s'\n", qNibDir); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) hashAdd(qFaHash, seq->name, seq); fclose(faF); } if (optionExists("faT")) { faF = mustOpen(tNibDir, "r"); verbose(1, "reading target fasta sequence from '%s'\n", tNibDir); while ( faReadMixedNext(faF, TRUE, NULL, TRUE, NULL, &seq)) hashAdd(tFaHash, seq->name, seq); fclose(faF); } for (sp = spList; sp != NULL; sp = sp->next) { slReverse(&sp->blockList); removeExactOverlaps(&sp->blockList); verbose(1, "%d blocks after duplicate removal\n", slCount(sp->blockList)); if (optionExists("faQ")) { assert (qFaHash != NULL); loadFaSeq(qFaHash, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand, qNibDir); } else { if (! qIsTwoBit && ! isDirectory(qNibDir)) errAbort("given qNibDir argument: '%s' is not a 2bit file or a directory\n", qNibDir); loadIfNewSeq(qNibDir, qIsTwoBit, sp->qName, sp->qStrand, &qName, &qSeq, &qStrand); } if (optionExists("faT")) { assert (tFaHash != NULL); loadFaSeq(tFaHash, sp->tName, '+', &tName, &tSeq, &tStrand, tNibDir); } else { if (! tIsTwoBit && ! isDirectory(tNibDir)) errAbort("given tNibDir argument: '%s' is not a 2bit file or a directory\n", tNibDir); loadIfNewSeq(tNibDir, tIsTwoBit, sp->tName, '+', &tName, &tSeq, &tStrand); } chainPair(sp, qSeq, tSeq, &chainList, details); } slSort(&chainList, chainCmpScore); for (chain = chainList; chain != NULL; chain = chain->next) { assert(chain->qStart == chain->blockList->qStart && chain->tStart == chain->blockList->tStart); chainWrite(chain, f); } carefulClose(&f); } struct gapCalc *gapCalcReadOrDefault(char *fileName) /* Return gaps from file, or default if fileName is NULL. */ { if (fileName == NULL) errAbort("Must specify linear gap costs. Use 'loose' or 'medium' for defaults\n"); return gapCalcFromFile(fileName); } int main(int argc, char *argv[]) /* Process command line. */ { char *scoreSchemeName = NULL; optionHash(&argc, argv); minScore = optionInt("minScore", minScore); detailsName = optionVal("details", NULL); gapFileName = optionVal("linearGap", NULL); scoreSchemeName = optionVal("scoreScheme", NULL); if (argc != 5) usage(); if (scoreSchemeName != NULL) { verbose(1, "Reading scoring matrix from %s\n", scoreSchemeName); scoreScheme = axtScoreSchemeRead(scoreSchemeName); } else scoreScheme = axtScoreSchemeDefault(); dnaUtilOpen(); gapCalc = gapCalcReadOrDefault(gapFileName); /* testGaps(); */ axtChain(argv[1], argv[2], argv[3], argv[4]); return 0; }