02336754147822f5aa61ba13277123b2cc629001 markd Thu May 20 08:38:55 2021 -0700 Moved pslMap, pslMapPostChain, pslRc, pslSwap to src/utils, as they do not have hg/lib dependencies. diff --git src/utils/pslMap/pslMap.c src/utils/pslMap/pslMap.c new file mode 100644 index 0000000..509cc1d --- /dev/null +++ src/utils/pslMap/pslMap.c @@ -0,0 +1,405 @@ +/* pslMap - map PSLs alignments to new targets using alignments of the old + * target to the new target. */ + +/* Copyright (C) 2014 The Regents of the University of California + * See README in this or parent directory for licensing information. */ +#include "common.h" +#include "pslTransMap.h" +#include "options.h" +#include "linefile.h" +#include "genomeRangeTree.h" +#include "dystring.h" +#include "psl.h" +#include "dnautil.h" +#include "chain.h" +#include "verbose.h" + + +/* command line option specifications */ +static struct optionSpec optionSpecs[] = { + {"suffix", OPTION_STRING}, + {"keepTranslated", OPTION_BOOLEAN}, + {"mapFileWithInQName", OPTION_BOOLEAN}, + {"chainMapFile", OPTION_BOOLEAN}, + {"swapMap", OPTION_BOOLEAN}, + {"swapIn", OPTION_BOOLEAN}, + {"mapInfo", OPTION_STRING}, + {"mappingPsls", OPTION_STRING}, + {"simplifyMappingIds", OPTION_BOOLEAN}, + {NULL, 0} +}; + +/* Values parsed from command line */ +static char* suffix = NULL; +static unsigned mapOpts = pslTransMapNoOpts; +static boolean mapFileWithInQName = FALSE; +static boolean chainMapFile = FALSE; +static boolean swapMap = FALSE; +static boolean swapIn = FALSE; +static boolean simplifyMappingIds = FALSE; +static char* mapInfoFile = NULL; +static char* mappingPslFile = NULL; + +static char *mapInfoHdr = + "#srcQName\t" "srcQStart\t" "srcQEnd\t" "srcQSize\t" + "srcTName\t" "srcTStart\t" "srcTEnd\t" + "srcStrand\t" "srcAligned\t" + "mappingQName\t" "mappingQStart\t" "mappingQEnd\t" + "mappingTName\t" "mappingTStart\t" "mappingTEnd\t" + "mappingStrand\t" "mappingId\t" + "mappedQName\t" "mappedQStart\t" "mappedQEnd\t" + "mappedTName\t" "mappedTStart\t" "mappedTEnd\t" + "mappedStrand\t" + "mappedAligned\t" "qStartTrunc\t" "qEndTrunc\n"; + +static void usage() +/* usage msg and exit */ +{ +/* message got huge, so it's in a generate file */ +static char *usageMsg = +#include "usage.msg" + ; +errAbort("%s", usageMsg); +} + +static void verbosePslNl(int verbosity, char *msg, struct psl *psl) +/* Verbose logging of msg, if not null, followed by a psl if not null, followed by a new line */ +{ +if (verboseLevel() >= verbosity) + { + if (msg != NULL) + verbose(verbosity, "%s ", msg); + if (psl != NULL) + pslTabOut(psl, verboseLogFile()); + else + verbose(verbosity, "NULL\n"); + } +} + +struct mapAln +/* Mapping alignment, psl plus additional information. */ +{ + struct mapAln *next; /* list mapping to the genomeRangeTree node */ + struct psl *psl; /* psl, maybe created from a chain */ + int id; /* chain id, or psl file row */ + char *qNameMatch; /* option qName to match with input file */ +}; + +static struct mapAln *mapAlnNew(struct psl *psl, int id, char *qNameMatch) +/* construct a new mapAln object */ +{ +struct mapAln *mapAln; +AllocVar(mapAln); +mapAln->psl = psl; +mapAln->id = id; +mapAln->qNameMatch = cloneString(qNameMatch); // maybe NULL +return mapAln; +} + +static char *getMappingId(char *id, struct dyString **idBufPtr) +/* get the mapping id, optionally simplified. idBuf will be + * allocated if NULL and is used to store the result */ +{ +if (!simplifyMappingIds) + return id; +if (*idBufPtr == NULL) + *idBufPtr = dyStringNew(128); +struct dyString *idBuf = *idBufPtr; +dyStringClear(idBuf); +dyStringAppend(idBuf, id); +char *p = strrchr(idBuf->string, '-'); +if (p != NULL) + *p = '\0'; +p = strrchr(idBuf->string, '.'); +if (p != NULL) + *p = '\0'; +idBuf->stringSize = strlen(idBuf->string); +return idBuf->string; +} + +static void *slCatReversed(void *va, void *vb) +/* slCat that reverses parameter order, as the first list in rangeTreeAddVal + * mergeVals function tends to be larger in degenerate cases of a huge number + * of chains */ +{ +return slCat(vb, va); +} + +static void mapAlnsAdd(struct genomeRangeTree* mapAlns, + char *mappingId, + struct mapAln *mapAln) +/* add a map align object to the genomeRangeTree */ +{ +genomeRangeTreeAddVal(mapAlns, mappingId, mapAln->psl->qStart, mapAln->psl->qEnd, mapAln, slCatReversed); +} + +static struct mapAln *chainToPsl(struct chain *ch) +/* convert a chain to a psl, ignoring match counts, etc */ +{ +struct psl *psl; +struct cBlock *cBlk; +int iBlk; +int qStart = ch->qStart, qEnd = ch->qEnd; +char strand[2]; +strand[0] = ch->qStrand; +strand[1] = '\0'; +if (ch->qStrand == '-') + reverseIntRange(&qStart, &qEnd, ch->qSize); + +psl = pslNew(ch->qName, ch->qSize, qStart, qEnd, + ch->tName, ch->tSize, ch->tStart, ch->tEnd, + strand, slCount(ch->blockList), 0); +for (cBlk = ch->blockList, iBlk = 0; cBlk != NULL; cBlk = cBlk->next, iBlk++) + { + psl->blockSizes[iBlk] = (cBlk->tEnd - cBlk->tStart); + psl->qStarts[iBlk] = cBlk->qStart; + psl->tStarts[iBlk] = cBlk->tStart; + psl->match += psl->blockSizes[iBlk]; + psl->blockCount++; + } +pslComputeInsertCounts(psl); +if (swapMap) + pslSwap(psl, FALSE); +return mapAlnNew(psl, ch->id, NULL); +} + +static struct genomeRangeTree* loadMapChains(char *chainFile) +/* read a chain file, convert to mapAln object and genomeRangeTree by query locations. */ +{ +struct genomeRangeTree* mapAlns = genomeRangeTreeNew(); +struct chain *ch; +struct lineFile *chLf = lineFileOpen(chainFile, TRUE); +while ((ch = chainRead(chLf)) != NULL) + { + struct mapAln* mapAln = chainToPsl(ch); + mapAlnsAdd(mapAlns, mapAln->psl->qName, mapAln); + chainFree(&ch); + } +lineFileClose(&chLf); +return mapAlns; +} + +static struct mapAln* readPslMapAln(struct lineFile *pslLf, int id) +/* read the next mapping PSL, handling optional qName. Return NULL on EOF */ +{ +char *words[32]; +int wordCount = lineFileChopNextTab(pslLf, words, ArraySize(words)); +int pslOffset = mapFileWithInQName ? 1 : 0; // first column is qName? +if (wordCount == 0) + return NULL; + +struct psl *psl = NULL; +if (wordCount-pslOffset == 21) + psl = pslLoad(words+pslOffset); +else if (wordCount-pslOffset == 23) + psl = pslxLoad(words+pslOffset); +else + errAbort("Bad PSL line %d of %s has is %d columns instead of %d or %d%s", pslLf->lineIx, pslLf->fileName, + wordCount, 21+pslOffset, 23+pslOffset, + (mapFileWithInQName ? " (including -mapFileWithInQName extra column)" : "")); +if (swapMap) + pslSwap(psl, FALSE); +return mapAlnNew(psl, id, mapFileWithInQName ? words[0] : NULL); +} + +static struct genomeRangeTree* loadMapPsls(char *pslFile) +/* read a psl file and genomeRangeTree by query, linking multiple PSLs for the + * same query.*/ +{ +struct dyString* idBuf = NULL; +struct genomeRangeTree* mapAlns = genomeRangeTreeNew(); +int id = 0; +struct mapAln* mapAln; +struct lineFile *pslLf = lineFileOpen(pslFile, TRUE); +while ((mapAln = readPslMapAln(pslLf, id)) != NULL) + { + mapAlnsAdd(mapAlns, getMappingId(mapAln->psl->qName, &idBuf), mapAln); + id++; + } +lineFileClose(&pslLf); +dyStringFree(&idBuf); +return mapAlns; +} + +static int pslAlignedBases(struct psl *psl) +/* get number of aligned bases in a psl. Need because we can't set the stats + * in the mapped psl */ +{ +int i, cnt = 0; +for (i = 0; i < psl->blockCount; i++) + cnt += psl->blockSizes[i]; +return cnt; +} + +static void writeMapInfo(FILE* fh, struct psl *inPsl, struct mapAln *mapAln, + struct psl* mappedPsl) +/* write mapInfo row. mapAln and mappedPsl are NULL in not mapped */ +{ +/* srcQName srcQStart srcQEnd srcQSize */ +fprintf(fh, "%s\t%d\t%d\t%d\t", + inPsl->qName, inPsl->qStart, inPsl->qEnd, inPsl->qSize); +/* srcTName srcTStart srcTEnd srcStrand srcAligned */ +fprintf(fh, "%s\t%d\t%d\t%s\t%d\t", + inPsl->tName, inPsl->tStart, inPsl->tEnd, inPsl->strand, + pslAlignedBases(inPsl)); +if (mapAln != NULL) + { + /* mappingQName mappingQStart mappingQEnd */ + fprintf(fh, "%s\t%d\t%d", + mapAln->psl->qName, mapAln->psl->qStart, mapAln->psl->qEnd); + /* mappingTName mappingTStart mappingTEnd mappingStrand mappingId */ + fprintf(fh, "\t%s\t%d\t%d\t%s\t%d\t", + mapAln->psl->tName, mapAln->psl->tStart, mapAln->psl->tEnd, + mapAln->psl->strand, mapAln->id); + /* mappedQName mappedQStart mappedQEnd mappedTName mappedTStart mappedTEnd mappedStrand */ + fprintf(fh, "%s\t%d\t%d\t%s\t%d\t%d\t%s\t", + mappedPsl->qName, mappedPsl->qStart, mappedPsl->qEnd, + mappedPsl->tName, mappedPsl->tStart, mappedPsl->tEnd, + mappedPsl->strand); + /* mappedAligned qStartTrunc qEndTrunc */ + fprintf(fh, "%d\t%d\t%d\n", + pslAlignedBases(mappedPsl), + (mappedPsl->qStart-inPsl->qStart), (inPsl->qEnd-mappedPsl->qEnd)); + } +else + { + int i; + for (i = 9; i < 26; i++) + fputc('\t', fh); + fputc('\n', fh); + } +} + +static void addQNameSuffix(struct psl *psl) +/* add the suffix to the mapped psl qName */ +{ +char *oldQName = psl->qName; +char *newQName = needMem(strlen(psl->qName)+strlen(suffix)+1); +strcpy(newQName, oldQName); +strcat(newQName, suffix); +psl->qName = newQName; +freeMem(oldQName); +} + +static void mappedPslOutput(struct psl *inPsl, struct mapAln *mapAln, struct psl* mappedPsl, + FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh) +/* output one mapped psl */ +{ +if (suffix != NULL) + addQNameSuffix(mappedPsl); +pslTabOut(mappedPsl, outPslFh); +if (mapInfoFh != NULL) + writeMapInfo(mapInfoFh, inPsl, mapAln, mappedPsl); +if (mappingPslFh != NULL) + pslTabOut(mapAln->psl, mappingPslFh); +} + +static boolean mapPslPair(struct psl *inPsl, struct mapAln *mapAln, + FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh) +/* map one pair of query and target PSL */ +{ +struct psl* mappedPsl; +if (inPsl->tSize != mapAln->psl->qSize) + errAbort("Error: inPsl %s tSize (%d) != mapping alignment %s qSize (%d) (perhaps you need to specify -swapMap?)\n", + inPsl->tName, inPsl->tSize, mapAln->psl->qName, mapAln->psl->qSize); +verbosePslNl(2, "inAln", inPsl); +verbosePslNl(2, "mapAln", mapAln->psl); + +mappedPsl = pslTransMap(mapOpts, inPsl, mapAln->psl); + +verbosePslNl(2, "mappedAln", mappedPsl); + +/* only output if blocks were actually mapped */ +boolean wasMapped = mappedPsl != NULL; +if (wasMapped) + mappedPslOutput(inPsl, mapAln, mappedPsl, outPslFh, mapInfoFh, mappingPslFh); +pslFree(&mappedPsl); +return wasMapped; +} + +static void mapQueryPsl(struct psl* inPsl, struct genomeRangeTree *mapAlns, + FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh) +/* map a query psl to all targets */ +{ +static struct dyString *idBuf = NULL; +struct range *overMapAlnNodes = genomeRangeTreeAllOverlapping(mapAlns, getMappingId(inPsl->tName, &idBuf), inPsl->tStart, inPsl->tEnd); +struct range *overMapAlnNode; +struct mapAln *overMapAln; +boolean wasMapped = FALSE; +for (overMapAlnNode = overMapAlnNodes; overMapAlnNode != NULL; overMapAlnNode = overMapAlnNode->next) + { + for (overMapAln = overMapAlnNode->val; overMapAln != NULL; overMapAln = overMapAln->next) + { + if ((overMapAln->qNameMatch == NULL) || sameString(inPsl->qName, overMapAln->qNameMatch)) + { + if (mapPslPair(inPsl, overMapAln, outPslFh, mapInfoFh, mappingPslFh)) + wasMapped = TRUE; + } + } + } +if ((mapInfoFh != NULL) && !wasMapped) + writeMapInfo(mapInfoFh, inPsl, NULL, NULL); +} + +static void pslMap(char* inPslFile, char *mapFile, char *outPslFile) +/* project inPsl query through mapFile query to mapFile target */ +{ +struct genomeRangeTree *mapAlns; +struct psl* inPsl; +struct lineFile* inPslLf = pslFileOpen(inPslFile); +FILE *outPslFh, *mapInfoFh = NULL, *mappingPslFh = NULL; + +if (chainMapFile) + mapAlns = loadMapChains(mapFile); +else + mapAlns = loadMapPsls(mapFile); + +outPslFh = mustOpen(outPslFile, "w"); +if (mapInfoFile != NULL) + { + mapInfoFh = mustOpen(mapInfoFile, "w"); + fputs(mapInfoHdr, mapInfoFh); + } +if (mappingPslFile != NULL) + mappingPslFh = mustOpen(mappingPslFile, "w"); +while ((inPsl = pslNext(inPslLf)) != NULL) + { + if (swapIn) + pslSwap(inPsl, FALSE); + mapQueryPsl(inPsl, mapAlns, outPslFh, mapInfoFh, mappingPslFh); + pslFree(&inPsl); + } +carefulClose(&mappingPslFh); +carefulClose(&mapInfoFh); +carefulClose(&outPslFh); +lineFileClose(&inPslLf); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, optionSpecs); +if (argc != 4) + usage(); +suffix = optionVal("suffix", NULL); +if (optionExists("keepTranslated")) + mapOpts |= pslTransMapKeepTrans; +mapFileWithInQName = optionExists("mapFileWithInQName"); +chainMapFile = optionExists("chainMapFile"); +if (mapFileWithInQName && chainMapFile) + errAbort("can't specify -mapFileWithInQName with -chainMapFile"); +swapMap = optionExists("swapMap"); +swapIn = optionExists("swapIn"); +simplifyMappingIds = optionExists("simplifyMappingIds"); +mapInfoFile = optionVal("mapInfo", NULL); +mappingPslFile = optionVal("mappingPsls", NULL); +pslMap(argv[1], argv[2], argv[3]); + +return 0; +} +/* + * Local Variables: + * c-file-style: "jkent-c" + * End: + */ +