87f40c4dbf6ae4e07fe0ecd8b4cb75dec6e3c0a5 markd Thu Jul 25 20:21:01 2024 -0700 add option to pslMap to output TSVs for mapInfo files rather than autoSql tab format diff --git src/utils/pslMap/pslMap.c src/utils/pslMap/pslMap.c index 026a0f9..0f3d122 100644 --- src/utils/pslMap/pslMap.c +++ src/utils/pslMap/pslMap.c @@ -1,440 +1,445 @@ /* pslMap - map PSLs alignments to new targets using alignments of the old * target to the new target. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "pslTransMap.h" #include "options.h" #include "linefile.h" #include "genomeRangeTree.h" #include "dystring.h" #include "psl.h" #include "dnautil.h" #include "chain.h" #include "verbose.h" /* command line option specifications */ static struct optionSpec optionSpecs[] = { {"suffix", OPTION_STRING}, {"keepTranslated", OPTION_BOOLEAN}, {"mapFileWithInQName", OPTION_BOOLEAN}, {"check", OPTION_BOOLEAN}, {"chainMapFile", OPTION_BOOLEAN}, {"swapMap", OPTION_BOOLEAN}, {"swapIn", OPTION_BOOLEAN}, {"mapInfo", OPTION_STRING}, + {"tsv", OPTION_BOOLEAN}, {"mappingPsls", OPTION_STRING}, {"inType", OPTION_STRING}, {"mapType", OPTION_STRING}, {"simplifyMappingIds", OPTION_BOOLEAN}, {NULL, 0} }; /* Values parsed from command line */ static char* suffix = NULL; static unsigned mapOpts = pslTransMapNoOpts; static boolean mapFileWithInQName = FALSE; static boolean chainMapFile = FALSE; static boolean swapMap = FALSE; static boolean swapIn = FALSE; static boolean check = FALSE; static boolean simplifyMappingIds = FALSE; static char* mapInfoFile = NULL; +static boolean tsv = FALSE; static char* mappingPslFile = NULL; static enum pslType inPslType = pslTypeUnspecified; static enum pslType mapPslType = pslTypeUnspecified; static char *mapInfoHdr = - "#srcQName\t" "srcQStart\t" "srcQEnd\t" "srcQSize\t" + "srcQName\t" "srcQStart\t" "srcQEnd\t" "srcQSize\t" "srcTName\t" "srcTStart\t" "srcTEnd\t" "srcStrand\t" "srcAligned\t" "mappingQName\t" "mappingQStart\t" "mappingQEnd\t" "mappingTName\t" "mappingTStart\t" "mappingTEnd\t" "mappingStrand\t" "mappingId\t" "mappedQName\t" "mappedQStart\t" "mappedQEnd\t" "mappedTName\t" "mappedTStart\t" "mappedTEnd\t" "mappedStrand\t" "mappedAligned\t" "qStartTrunc\t" "qEndTrunc\t" "mappedPslLine\n"; static void usage() /* usage msg and exit */ { /* message got huge, so it's in a generate file */ static char *usageMsg = #include "usage.msg" ; errAbort("%s", usageMsg); } static enum pslType parsePslType(char *typeStr) /* parse argument value of pslType */ { if (sameString(typeStr, "prot_prot")) return pslTypeProtProt; else if (sameString(typeStr, "prot_na")) return pslTypeProtNa; else if (sameString(typeStr, "na_na")) return pslTypeNaNa; else errAbort("invalid value for PSL type '%s', expected 'prot_prot', 'prot_na', or 'na_na'", typeStr); return pslTypeUnspecified; } static void verbosePslNl(int verbosity, char *msg, struct psl *psl) /* Verbose logging of msg, if not null, followed by a psl if not null, followed by a new line */ { if (verboseLevel() >= verbosity) { if (msg != NULL) verbose(verbosity, "%s ", msg); if (psl != NULL) pslTabOut(psl, verboseLogFile()); else verbose(verbosity, "NULL\n"); } } struct mapAln /* Mapping alignment, psl plus additional information. */ { struct mapAln *next; /* list mapping to the genomeRangeTree node */ struct psl *psl; /* psl, maybe created from a chain */ int id; /* chain id, or psl file row */ char *qNameMatch; /* option qName to match with input file */ }; static struct mapAln *mapAlnNew(struct psl *psl, int id, char *qNameMatch) /* construct a new mapAln object */ { struct mapAln *mapAln; AllocVar(mapAln); mapAln->psl = psl; mapAln->id = id; mapAln->qNameMatch = cloneString(qNameMatch); // maybe NULL return mapAln; } static char *getMappingId(char *id, struct dyString **idBufPtr) /* get the mapping id, optionally simplified. idBuf will be * allocated if NULL and is used to store the result */ { if (!simplifyMappingIds) return id; if (*idBufPtr == NULL) *idBufPtr = dyStringNew(128); struct dyString *idBuf = *idBufPtr; dyStringClear(idBuf); dyStringAppend(idBuf, id); char *p = strrchr(idBuf->string, '-'); if (p != NULL) *p = '\0'; p = strrchr(idBuf->string, '.'); if (p != NULL) *p = '\0'; idBuf->stringSize = strlen(idBuf->string); return idBuf->string; } static void *slCatReversed(void *va, void *vb) /* slCat that reverses parameter order, as the first list in rangeTreeAddVal * mergeVals function tends to be larger in degenerate cases of a huge number * of chains */ { return slCat(vb, va); } static void mapAlnsAdd(struct genomeRangeTree* mapAlns, char *mappingId, struct mapAln *mapAln) /* add a map align object to the genomeRangeTree */ { genomeRangeTreeAddVal(mapAlns, mappingId, mapAln->psl->qStart, mapAln->psl->qEnd, mapAln, slCatReversed); } static struct mapAln *chainToPsl(struct chain *ch) /* convert a chain to a psl, ignoring match counts, etc */ { struct psl *psl; struct cBlock *cBlk; int iBlk; int qStart = ch->qStart, qEnd = ch->qEnd; char strand[2]; strand[0] = ch->qStrand; strand[1] = '\0'; if (ch->qStrand == '-') reverseIntRange(&qStart, &qEnd, ch->qSize); psl = pslNew(ch->qName, ch->qSize, qStart, qEnd, ch->tName, ch->tSize, ch->tStart, ch->tEnd, strand, slCount(ch->blockList), 0); for (cBlk = ch->blockList, iBlk = 0; cBlk != NULL; cBlk = cBlk->next, iBlk++) { psl->blockSizes[iBlk] = (cBlk->tEnd - cBlk->tStart); psl->qStarts[iBlk] = cBlk->qStart; psl->tStarts[iBlk] = cBlk->tStart; psl->match += psl->blockSizes[iBlk]; psl->blockCount++; } pslComputeInsertCounts(psl); if (swapMap) pslSwap(psl, FALSE); return mapAlnNew(psl, ch->id, NULL); } static struct genomeRangeTree* loadMapChains(char *chainFile) /* read a chain file, convert to mapAln object and genomeRangeTree by query locations. */ { struct genomeRangeTree* mapAlns = genomeRangeTreeNew(); struct chain *ch; struct lineFile *chLf = lineFileOpen(chainFile, TRUE); while ((ch = chainRead(chLf)) != NULL) { struct mapAln* mapAln = chainToPsl(ch); mapAlnsAdd(mapAlns, mapAln->psl->qName, mapAln); chainFree(&ch); } lineFileClose(&chLf); return mapAlns; } static struct mapAln* readPslMapAln(struct lineFile *pslLf, int id) /* read the next mapping PSL, handling optional qName. Return NULL on EOF */ { char *words[32]; int wordCount = lineFileChopNextTab(pslLf, words, ArraySize(words)); int pslOffset = mapFileWithInQName ? 1 : 0; // first column is qName? if (wordCount == 0) return NULL; struct psl *psl = NULL; if (wordCount-pslOffset == 21) psl = pslLoad(words+pslOffset); else if (wordCount-pslOffset == 23) psl = pslxLoad(words+pslOffset); else errAbort("Bad PSL line %d of %s has is %d columns instead of %d or %d%s", pslLf->lineIx, pslLf->fileName, wordCount, 21+pslOffset, 23+pslOffset, (mapFileWithInQName ? " (including -mapFileWithInQName extra column)" : "")); if (swapMap) pslSwap(psl, FALSE); return mapAlnNew(psl, id, mapFileWithInQName ? words[0] : NULL); } static struct genomeRangeTree* loadMapPsls(char *pslFile) /* read a psl file and genomeRangeTree by query, linking multiple PSLs for the * same query.*/ { struct dyString* idBuf = NULL; struct genomeRangeTree* mapAlns = genomeRangeTreeNew(); int id = 0; struct mapAln* mapAln; struct lineFile *pslLf = lineFileOpen(pslFile, TRUE); while ((mapAln = readPslMapAln(pslLf, id)) != NULL) { mapAlnsAdd(mapAlns, getMappingId(mapAln->psl->qName, &idBuf), mapAln); id++; } lineFileClose(&pslLf); dyStringFree(&idBuf); return mapAlns; } static int pslAlignedBases(struct psl *psl) /* get number of aligned bases in a psl. Need because we can't set the stats * in the mapped psl */ { int i, cnt = 0; for (i = 0; i < psl->blockCount; i++) cnt += psl->blockSizes[i]; return cnt; } static void writeMapInfo(FILE* fh, struct psl *inPsl, struct mapAln *mapAln, struct psl* mappedPsl, unsigned outPslLine) /* write mapInfo row. mapAln and mappedPsl are NULL in not mapped */ { /* srcQName srcQStart srcQEnd srcQSize */ fprintf(fh, "%s\t%d\t%d\t%d\t", inPsl->qName, inPsl->qStart, inPsl->qEnd, inPsl->qSize); /* srcTName srcTStart srcTEnd srcStrand srcAligned */ fprintf(fh, "%s\t%d\t%d\t%s\t%d\t", inPsl->tName, inPsl->tStart, inPsl->tEnd, inPsl->strand, pslAlignedBases(inPsl)); if (mapAln != NULL) { /* mappingQName mappingQStart mappingQEnd */ fprintf(fh, "%s\t%d\t%d", mapAln->psl->qName, mapAln->psl->qStart, mapAln->psl->qEnd); /* mappingTName mappingTStart mappingTEnd mappingStrand mappingId */ fprintf(fh, "\t%s\t%d\t%d\t%s\t%d\t", mapAln->psl->tName, mapAln->psl->tStart, mapAln->psl->tEnd, mapAln->psl->strand, mapAln->id); /* mappedQName mappedQStart mappedQEnd mappedTName mappedTStart mappedTEnd mappedStrand */ fprintf(fh, "%s\t%d\t%d\t%s\t%d\t%d\t%s\t", mappedPsl->qName, mappedPsl->qStart, mappedPsl->qEnd, mappedPsl->tName, mappedPsl->tStart, mappedPsl->tEnd, mappedPsl->strand); /* mappedAligned qStartTrunc qEndTrunc */ fprintf(fh, "%d\t%d\t%d\t", pslAlignedBases(mappedPsl), (mappedPsl->qStart-inPsl->qStart), (inPsl->qEnd-mappedPsl->qEnd)); /* mappedPslLine */ fprintf(fh, "%u", outPslLine); } else { int i; for (i = 9; i < 27; i++) fputc('\t', fh); } fputc('\n', fh); } static void addQNameSuffix(struct psl *psl) /* add the suffix to the mapped psl qName */ { char *oldQName = psl->qName; char *newQName = needMem(strlen(psl->qName)+strlen(suffix)+1); strcpy(newQName, oldQName); strcat(newQName, suffix); psl->qName = newQName; freeMem(oldQName); } static void mappedPslOutput(struct psl *inPsl, struct mapAln *mapAln, struct psl* mappedPsl, FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh, unsigned outPslLine) /* output one mapped psl */ { if (suffix != NULL) addQNameSuffix(mappedPsl); pslTabOut(mappedPsl, outPslFh); if (mapInfoFh != NULL) writeMapInfo(mapInfoFh, inPsl, mapAln, mappedPsl, outPslLine); if (mappingPslFh != NULL) pslTabOut(mapAln->psl, mappingPslFh); } static boolean mapPslPair(struct psl *inPsl, struct mapAln *mapAln, FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh, unsigned* outPslLineRef) /* map one pair of query and target PSL */ { verbosePslNl(2, "inAln", inPsl); verbosePslNl(2, "mapAln", mapAln->psl); if (check && (pslCheck("input PSL", stderr, inPsl) > 0)) errAbort("BUG: invalid input PSL, run with -verbose=2 for more details"); if (check && (pslCheck("mapping PSL", stderr, mapAln->psl) > 0)) errAbort("BUG: invalid mapping PSL, run with -verbose=2 for more details"); struct psl* mappedPsl = pslTransMap(mapOpts, inPsl, inPslType, mapAln->psl, mapPslType); verbosePslNl(2, "mappedPsl", mappedPsl); /* only output if blocks were actually mapped */ boolean wasMapped = mappedPsl != NULL; if (wasMapped) { if (check && (pslCheck("mapped psl", stderr, mappedPsl) > 0)) errAbort("BUG: invalid mapped PSL created, run with -verbose=2 for more details"); mappedPslOutput(inPsl, mapAln, mappedPsl, outPslFh, mapInfoFh, mappingPslFh, *outPslLineRef); (*outPslLineRef)++; } pslFree(&mappedPsl); return wasMapped; } static void mapQueryPsl(struct psl* inPsl, struct genomeRangeTree *mapAlns, FILE* outPslFh, FILE *mapInfoFh, FILE *mappingPslFh, unsigned* outPslLineRef) /* map a query psl to all targets */ { static struct dyString *idBuf = NULL; struct range *overMapAlnNodes = genomeRangeTreeAllOverlapping(mapAlns, getMappingId(inPsl->tName, &idBuf), inPsl->tStart, inPsl->tEnd); struct range *overMapAlnNode; struct mapAln *overMapAln; boolean wasMapped = FALSE; for (overMapAlnNode = overMapAlnNodes; overMapAlnNode != NULL; overMapAlnNode = overMapAlnNode->next) { for (overMapAln = overMapAlnNode->val; overMapAln != NULL; overMapAln = overMapAln->next) { if ((overMapAln->qNameMatch == NULL) || sameString(inPsl->qName, overMapAln->qNameMatch)) { if (mapPslPair(inPsl, overMapAln, outPslFh, mapInfoFh, mappingPslFh, outPslLineRef)) wasMapped = TRUE; } } } if ((mapInfoFh != NULL) && !wasMapped) writeMapInfo(mapInfoFh, inPsl, NULL, NULL, 0); } static void pslMap(char* inPslFile, char *mapFile, char *outPslFile) /* project inPsl query through mapFile query to mapFile target */ { struct genomeRangeTree *mapAlns; struct psl* inPsl; struct lineFile* inPslLf = pslFileOpen(inPslFile); FILE *outPslFh, *mapInfoFh = NULL, *mappingPslFh = NULL; unsigned outPslLine = 0; if (chainMapFile) mapAlns = loadMapChains(mapFile); else mapAlns = loadMapPsls(mapFile); outPslFh = mustOpen(outPslFile, "w"); if (mapInfoFile != NULL) { mapInfoFh = mustOpen(mapInfoFile, "w"); + if (!tsv) + fputc('#', mapInfoFh); fputs(mapInfoHdr, mapInfoFh); } if (mappingPslFile != NULL) mappingPslFh = mustOpen(mappingPslFile, "w"); while ((inPsl = pslNext(inPslLf)) != NULL) { if (swapIn) pslSwap(inPsl, FALSE); mapQueryPsl(inPsl, mapAlns, outPslFh, mapInfoFh, mappingPslFh, &outPslLine); pslFree(&inPsl); } carefulClose(&mappingPslFh); carefulClose(&mapInfoFh); carefulClose(&outPslFh); lineFileClose(&inPslLf); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 4) usage(); suffix = optionVal("suffix", NULL); if (optionExists("keepTranslated")) mapOpts |= pslTransMapKeepTrans; mapFileWithInQName = optionExists("mapFileWithInQName"); chainMapFile = optionExists("chainMapFile"); if (mapFileWithInQName && chainMapFile) errAbort("can't specify -mapFileWithInQName with -chainMapFile"); swapMap = optionExists("swapMap"); swapIn = optionExists("swapIn"); check = optionExists("check"); simplifyMappingIds = optionExists("simplifyMappingIds"); char *typeStr; if ((typeStr = optionVal("inType", NULL)) != NULL) inPslType = parsePslType(typeStr); if ((typeStr = optionVal("mapType", NULL)) != NULL) mapPslType = parsePslType(typeStr); mapInfoFile = optionVal("mapInfo", NULL); +tsv = optionExists("tsv"); mappingPslFile = optionVal("mappingPsls", NULL); pslMap(argv[1], argv[2], argv[3]); return 0; } /* * Local Variables: * c-file-style: "jkent-c" * End: */