02336754147822f5aa61ba13277123b2cc629001 markd Thu May 20 08:38:55 2021 -0700 Moved pslMap, pslMapPostChain, pslRc, pslSwap to src/utils, as they do not have hg/lib dependencies. diff --git src/hg/utils/pslMap/pslMap.c src/hg/utils/pslMap/pslMap.c deleted file mode 100644 index 509cc1d..0000000 --- src/hg/utils/pslMap/pslMap.c +++ /dev/null @@ -1,405 +0,0 @@ -/* pslMap - map PSLs alignments to new targets using alignments of the old - * target to the new target. */ - -/* Copyright (C) 2014 The Regents of the University of California - * See README in this or parent directory for licensing information. */ -#include "common.h" -#include "pslTransMap.h" -#include "options.h" -#include "linefile.h" -#include "genomeRangeTree.h" -#include "dystring.h" -#include "psl.h" -#include "dnautil.h" -#include "chain.h" -#include "verbose.h" - - -/* command line option specifications */ -static struct optionSpec optionSpecs[] = { - {"suffix", OPTION_STRING}, - {"keepTranslated", OPTION_BOOLEAN}, - {"mapFileWithInQName", OPTION_BOOLEAN}, - {"chainMapFile", OPTION_BOOLEAN}, - {"swapMap", OPTION_BOOLEAN}, - {"swapIn", OPTION_BOOLEAN}, - {"mapInfo", OPTION_STRING}, - {"mappingPsls", OPTION_STRING}, - {"simplifyMappingIds", OPTION_BOOLEAN}, - {NULL, 0} -}; - -/* Values parsed from command line */ -static char* suffix = NULL; -static unsigned mapOpts = pslTransMapNoOpts; -static boolean mapFileWithInQName = FALSE; -static boolean chainMapFile = FALSE; -static boolean swapMap = FALSE; -static boolean swapIn = FALSE; -static boolean simplifyMappingIds = FALSE; -static char* mapInfoFile = NULL; -static char* mappingPslFile = NULL; - -static char *mapInfoHdr = - "#srcQName\t" "srcQStart\t" "srcQEnd\t" "srcQSize\t" - "srcTName\t" "srcTStart\t" "srcTEnd\t" - "srcStrand\t" "srcAligned\t" - "mappingQName\t" "mappingQStart\t" "mappingQEnd\t" - "mappingTName\t" "mappingTStart\t" "mappingTEnd\t" - "mappingStrand\t" "mappingId\t" - "mappedQName\t" "mappedQStart\t" "mappedQEnd\t" - "mappedTName\t" "mappedTStart\t" "mappedTEnd\t" - "mappedStrand\t" - "mappedAligned\t" "qStartTrunc\t" "qEndTrunc\n"; - -static void usage() -/* usage msg and exit */ -{ -/* message got huge, so it's in a generate file */ -static char *usageMsg = -#include "usage.msg" - ; -errAbort("%s", usageMsg); -} - -static void verbosePslNl(int verbosity, char *msg, struct psl *psl) -/* Verbose logging of msg, if not null, followed by a psl if not null, followed by a new line */ -{ -if (verboseLevel() >= verbosity) - { - if (msg != NULL) - verbose(verbosity, "%s ", msg); - if (psl != NULL) - pslTabOut(psl, verboseLogFile()); - else - verbose(verbosity, "NULL\n"); - } -} - -struct mapAln -/* Mapping alignment, psl plus additional information. */ -{ - struct mapAln *next; /* list mapping to the genomeRangeTree node */ - struct psl *psl; /* psl, maybe created from a chain */ - int id; /* chain id, or psl file row */ - char *qNameMatch; /* option qName to match with input file */ -}; - -static struct mapAln *mapAlnNew(struct psl *psl, int id, char *qNameMatch) -/* construct a new mapAln object */ -{ -struct mapAln *mapAln; -AllocVar(mapAln); -mapAln->psl = psl; -mapAln->id = id; -mapAln->qNameMatch = cloneString(qNameMatch); // maybe NULL -return mapAln; -} - -static char *getMappingId(char *id, struct dyString **idBufPtr) -/* get the mapping id, optionally simplified. idBuf will be - * allocated if NULL and is used to store the result */ -{ -if (!simplifyMappingIds) - return id; -if (*idBufPtr == NULL) - *idBufPtr = dyStringNew(128); -struct dyString *idBuf = *idBufPtr; -dyStringClear(idBuf); -dyStringAppend(idBuf, id); -char *p = strrchr(idBuf->string, '-'); -if (p != NULL) - *p = '\0'; -p = strrchr(idBuf->string, '.'); -if (p != NULL) - *p = '\0'; -idBuf->stringSize = strlen(idBuf->string); -return idBuf->string; -} - -static void *slCatReversed(void *va, void *vb) -/* slCat that reverses parameter order, as the first list in rangeTreeAddVal - * mergeVals function tends to be larger in degenerate cases of a huge number - * of chains */ -{ -return slCat(vb, va); -} - -static void mapAlnsAdd(struct genomeRangeTree* mapAlns, - char *mappingId, - struct mapAln *mapAln) -/* add a map align object to the genomeRangeTree */ -{ -genomeRangeTreeAddVal(mapAlns, mappingId, mapAln->psl->qStart, mapAln->psl->qEnd, mapAln, slCatReversed); -} - -static struct mapAln *chainToPsl(struct chain *ch) -/* convert a chain to a psl, ignoring match counts, etc */ -{ -struct psl *psl; -struct cBlock *cBlk; -int iBlk; -int qStart = ch->qStart, qEnd = ch->qEnd; -char strand[2]; -strand[0] = ch->qStrand; -strand[1] = '\0'; -if (ch->qStrand == '-') - reverseIntRange(&qStart, &qEnd, ch->qSize); - -psl = pslNew(ch->qName, ch->qSize, qStart, qEnd, - ch->tName, ch->tSize, ch->tStart, ch->tEnd, - strand, slCount(ch->blockList), 0); -for (cBlk = ch->blockList, iBlk = 0; cBlk != NULL; cBlk = cBlk->next, iBlk++) - { - psl->blockSizes[iBlk] = (cBlk->tEnd - cBlk->tStart); - psl->qStarts[iBlk] = cBlk->qStart; - psl->tStarts[iBlk] = cBlk->tStart; - psl->match += psl->blockSizes[iBlk]; - psl->blockCount++; - } -pslComputeInsertCounts(psl); -if (swapMap) - pslSwap(psl, FALSE); -return mapAlnNew(psl, ch->id, NULL); -} - -static struct genomeRangeTree* loadMapChains(char *chainFile) -/* read a chain file, convert to mapAln object and genomeRangeTree by query locations. */ -{ -struct genomeRangeTree* mapAlns = genomeRangeTreeNew(); -struct chain *ch; -struct lineFile *chLf = lineFileOpen(chainFile, TRUE); -while ((ch = chainRead(chLf)) != NULL) - { - struct mapAln* mapAln = chainToPsl(ch); - mapAlnsAdd(mapAlns, mapAln->psl->qName, mapAln); - chainFree(&ch); - } -lineFileClose(&chLf); -return mapAlns; -} - -static struct mapAln* readPslMapAln(struct lineFile *pslLf, int id) -/* read the next mapping PSL, handling optional qName. Return NULL on EOF */ -{ -char *words[32]; -int wordCount = lineFileChopNextTab(pslLf, words, ArraySize(words)); -int pslOffset = mapFileWithInQName ? 1 : 0; // first column is qName? -if (wordCount == 0) - return NULL; - -struct psl *psl = NULL; -if (wordCount-pslOffset == 21) - psl = pslLoad(words+pslOffset); -else if (wordCount-pslOffset == 23) - psl = pslxLoad(words+pslOffset); -else - errAbort("Bad PSL line %d of %s has is %d columns instead of %d or %d%s", pslLf->lineIx, pslLf->fileName, - wordCount, 21+pslOffset, 23+pslOffset, - (mapFileWithInQName ? " (including -mapFileWithInQName extra column)" : "")); -if (swapMap) - pslSwap(psl, FALSE); -return mapAlnNew(psl, id, mapFileWithInQName ? words[0] : NULL); -} - -static struct genomeRangeTree* loadMapPsls(char *pslFile) -/* read a psl file and genomeRangeTree by query, linking multiple PSLs for the - * same query.*/ -{ -struct dyString* idBuf = NULL; -struct genomeRangeTree* mapAlns = genomeRangeTreeNew(); -int id = 0; -struct mapAln* mapAln; -struct lineFile *pslLf = lineFileOpen(pslFile, TRUE); -while ((mapAln = readPslMapAln(pslLf, id)) != NULL) - { - mapAlnsAdd(mapAlns, getMappingId(mapAln->psl->qName, &idBuf), mapAln); - id++; - } -lineFileClose(&pslLf); -dyStringFree(&idBuf); -return mapAlns; -} - -static int pslAlignedBases(struct psl *psl) -/* get number of aligned bases in a psl. Need because we can't set the stats - * in the mapped psl */ -{ -int i, cnt = 0; -for (i = 0; i < psl->blockCount; i++) - cnt += psl->blockSizes[i]; -return cnt; -} - -static void writeMapInfo(FILE* fh, struct psl *inPsl, struct mapAln *mapAln, - struct psl* mappedPsl) -/* write mapInfo row. mapAln and mappedPsl are NULL in not mapped */ -{ -/* srcQName srcQStart srcQEnd srcQSize */ -fprintf(fh, "%s\t%d\t%d\t%d\t", - inPsl->qName, inPsl->qStart, inPsl->qEnd, inPsl->qSize); -/* srcTName srcTStart srcTEnd srcStrand srcAligned */ -fprintf(fh, "%s\t%d\t%d\t%s\t%d\t", - inPsl->tName, inPsl->tStart, inPsl->tEnd, inPsl->strand, - pslAlignedBases(inPsl)); -if (mapAln != NULL) - { - /* mappingQName mappingQStart mappingQEnd */ - fprintf(fh, "%s\t%d\t%d", - mapAln->psl->qName, mapAln->psl->qStart, mapAln->psl->qEnd); - /* mappingTName mappingTStart mappingTEnd mappingStrand mappingId */ - fprintf(fh, "\t%s\t%d\t%d\t%s\t%d\t", - mapAln->psl->tName, mapAln->psl->tStart, mapAln->psl->tEnd, - mapAln->psl->strand, mapAln->id); - /* mappedQName mappedQStart mappedQEnd mappedTName mappedTStart mappedTEnd mappedStrand */ - fprintf(fh, "%s\t%d\t%d\t%s\t%d\t%d\t%s\t", - mappedPsl->qName, mappedPsl->qStart, mappedPsl->qEnd, - mappedPsl->tName, mappedPsl->tStart, mappedPsl->tEnd, - mappedPsl->strand); - /* mappedAligned qStartTrunc qEndTrunc */ - fprintf(fh, "%d\t%d\t%d\n", - pslAlignedBases(mappedPsl), - (mappedPsl->qStart-inPsl->qStart), (inPsl->qEnd-mappedPsl->qEnd)); - } -else - { - int i; - for (i = 9; i < 26; i++) - fputc('\t', fh); - fputc('\n', fh); - } -} - -static void addQNameSuffix(struct psl *psl) -/* add the suffix to the mapped psl qName */ -{ -char *oldQName = psl->qName; -char *newQName = needMem(strlen(psl->qName)+strlen(suffix)+1); -strcpy(newQName, oldQName); -strcat(newQName, suffix); -psl->qName = newQName; -freeMem(oldQName); -} - -static void mappedPslOutput(struct psl *inPsl, struct mapAln *mapAln, struct psl* mappedPsl, - FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh) -/* output one mapped psl */ -{ -if (suffix != NULL) - addQNameSuffix(mappedPsl); -pslTabOut(mappedPsl, outPslFh); -if (mapInfoFh != NULL) - writeMapInfo(mapInfoFh, inPsl, mapAln, mappedPsl); -if (mappingPslFh != NULL) - pslTabOut(mapAln->psl, mappingPslFh); -} - -static boolean mapPslPair(struct psl *inPsl, struct mapAln *mapAln, - FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh) -/* map one pair of query and target PSL */ -{ -struct psl* mappedPsl; -if (inPsl->tSize != mapAln->psl->qSize) - errAbort("Error: inPsl %s tSize (%d) != mapping alignment %s qSize (%d) (perhaps you need to specify -swapMap?)\n", - inPsl->tName, inPsl->tSize, mapAln->psl->qName, mapAln->psl->qSize); -verbosePslNl(2, "inAln", inPsl); -verbosePslNl(2, "mapAln", mapAln->psl); - -mappedPsl = pslTransMap(mapOpts, inPsl, mapAln->psl); - -verbosePslNl(2, "mappedAln", mappedPsl); - -/* only output if blocks were actually mapped */ -boolean wasMapped = mappedPsl != NULL; -if (wasMapped) - mappedPslOutput(inPsl, mapAln, mappedPsl, outPslFh, mapInfoFh, mappingPslFh); -pslFree(&mappedPsl); -return wasMapped; -} - -static void mapQueryPsl(struct psl* inPsl, struct genomeRangeTree *mapAlns, - FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh) -/* map a query psl to all targets */ -{ -static struct dyString *idBuf = NULL; -struct range *overMapAlnNodes = genomeRangeTreeAllOverlapping(mapAlns, getMappingId(inPsl->tName, &idBuf), inPsl->tStart, inPsl->tEnd); -struct range *overMapAlnNode; -struct mapAln *overMapAln; -boolean wasMapped = FALSE; -for (overMapAlnNode = overMapAlnNodes; overMapAlnNode != NULL; overMapAlnNode = overMapAlnNode->next) - { - for (overMapAln = overMapAlnNode->val; overMapAln != NULL; overMapAln = overMapAln->next) - { - if ((overMapAln->qNameMatch == NULL) || sameString(inPsl->qName, overMapAln->qNameMatch)) - { - if (mapPslPair(inPsl, overMapAln, outPslFh, mapInfoFh, mappingPslFh)) - wasMapped = TRUE; - } - } - } -if ((mapInfoFh != NULL) && !wasMapped) - writeMapInfo(mapInfoFh, inPsl, NULL, NULL); -} - -static void pslMap(char* inPslFile, char *mapFile, char *outPslFile) -/* project inPsl query through mapFile query to mapFile target */ -{ -struct genomeRangeTree *mapAlns; -struct psl* inPsl; -struct lineFile* inPslLf = pslFileOpen(inPslFile); -FILE *outPslFh, *mapInfoFh = NULL, *mappingPslFh = NULL; - -if (chainMapFile) - mapAlns = loadMapChains(mapFile); -else - mapAlns = loadMapPsls(mapFile); - -outPslFh = mustOpen(outPslFile, "w"); -if (mapInfoFile != NULL) - { - mapInfoFh = mustOpen(mapInfoFile, "w"); - fputs(mapInfoHdr, mapInfoFh); - } -if (mappingPslFile != NULL) - mappingPslFh = mustOpen(mappingPslFile, "w"); -while ((inPsl = pslNext(inPslLf)) != NULL) - { - if (swapIn) - pslSwap(inPsl, FALSE); - mapQueryPsl(inPsl, mapAlns, outPslFh, mapInfoFh, mappingPslFh); - pslFree(&inPsl); - } -carefulClose(&mappingPslFh); -carefulClose(&mapInfoFh); -carefulClose(&outPslFh); -lineFileClose(&inPslLf); -} - -int main(int argc, char *argv[]) -/* Process command line. */ -{ -optionInit(&argc, argv, optionSpecs); -if (argc != 4) - usage(); -suffix = optionVal("suffix", NULL); -if (optionExists("keepTranslated")) - mapOpts |= pslTransMapKeepTrans; -mapFileWithInQName = optionExists("mapFileWithInQName"); -chainMapFile = optionExists("chainMapFile"); -if (mapFileWithInQName && chainMapFile) - errAbort("can't specify -mapFileWithInQName with -chainMapFile"); -swapMap = optionExists("swapMap"); -swapIn = optionExists("swapIn"); -simplifyMappingIds = optionExists("simplifyMappingIds"); -mapInfoFile = optionVal("mapInfo", NULL); -mappingPslFile = optionVal("mappingPsls", NULL); -pslMap(argv[1], argv[2], argv[3]); - -return 0; -} -/* - * Local Variables: - * c-file-style: "jkent-c" - * End: - */ -