9ab015136bbc854a5dc03b71d5d465771539f6c3 angie Thu Sep 7 11:31:20 2017 -0700 pslRecalcMatch: new option -ignoreQMissing for when we have fasta for only a subset of query sequences. diff --git src/hg/pslRecalcMatch/pslRecalcMatch.c src/hg/pslRecalcMatch/pslRecalcMatch.c index 9def72c..631e33a 100644 --- src/hg/pslRecalcMatch/pslRecalcMatch.c +++ src/hg/pslRecalcMatch/pslRecalcMatch.c @@ -23,38 +23,41 @@ /* Explain usage and exit. */ { errAbort( "pslRecalcMatch - Recalculate match,mismatch,repMatch columns in psl file.\n" "This can be useful if the psl went through pslMap, or if you've added \n" "lower-case repeat masking after the fact\n" "usage:\n" " pslRecalcMatch in.psl targetSeq querySeq out.psl\n" "where targetSeq is either a nib directory or a two bit file\n" "and querySeq is a fasta file, nib file, two bit file, or list\n" "of such files. The psl's should be simple non-translated ones.\n" "This will work faster if the in.psl is sorted on target.\n" "options:\n" " -ignoreQUniq - ignore everything after the last `-' in the qName field, that\n" " is sometimes used to generate a unique identifier\n" + " -ignoreQMissing - pass through the record if querySeq doesn't include qName\n" ); } static struct optionSpec options[] = { {"ignoreQUniq", OPTION_BOOLEAN}, + {"ignoreQMissing", OPTION_BOOLEAN}, {NULL, 0}, }; static boolean ignoreQUniq = FALSE; +static boolean ignoreQMissing = FALSE; static char *getQName(char *qName) /* get query name, optionally dropping trailing unique identifier. * WARNING: static return */ { static struct dyString *buf = NULL; if (ignoreQUniq) { if (buf == NULL) buf = dyStringNew(2*strlen(qName)); dyStringClear(buf); char *dash = strrchr(qName, '-'); if (dash == NULL) return qName; @@ -118,34 +121,39 @@ * This can be useful if the psl went through pslMap, or if you've added * lower-case repeat masking after the fact. */ { struct nibTwoCache *tCache = nibTwoCacheNew(targetName); struct dnaSeq *qSeqList = dnaLoadAll(queryName); struct hash *qHash = dnaSeqHash(qSeqList); struct psl *psl; struct lineFile *lf = pslFileOpen(inName); FILE *f = mustOpen(outName, "w"); while ((psl = pslNext(lf)) != NULL) { int tSize; struct dnaSeq *tSeqPart = nibTwoCacheSeqPart(tCache, psl->tName, psl->tStart, psl->tEnd - psl->tStart, &tSize); - struct dnaSeq *qSeq = hashMustFindVal(qHash, getQName(psl->qName)); + char *qName = getQName(psl->qName); + struct dnaSeq *qSeq = hashFindVal(qHash, qName); + if (!ignoreQMissing && qSeq == NULL) + errAbort("Can't find sequence for qName '%s'", qName); + else if (qSeq) recalcMatches(psl, tSeqPart, psl->tStart, qSeq); pslTabOut(psl, f); dnaSeqFree(&tSeqPart); } carefulClose(&f); lineFileClose(&lf); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); ignoreQUniq = optionExists("ignoreQUniq"); +ignoreQMissing = optionExists("ignoreQMissing"); pslRecalcMatch(argv[1], argv[2], argv[3], argv[4]); return 0; }