afafa0301ea4b14fbe1fbd5aa379c5351ecd640d angie Tue Aug 2 19:41:44 2022 -0700 Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub). * Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name. * Add a menu input to the main page for switching between supported genomes if there are more than one. * Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings. * Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions. * Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV). * also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output. TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 2b2e578..6e06e530 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -1,55 +1,49 @@ /* Align user sequence to the reference genome and write VCF */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "fa.h" #include "genoFind.h" #include "iupac.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "psl.h" #include "trashDir.h" -// Globals -extern int chromSize; - -// wuhCor1-specific: -int minSeqSize = 10000; -int maxSeqSize = 35000; double maxNs = 0.5; // Using blat defaults for these: int gfRepMatch = 1024*4; char *gfOoc = NULL; int gfStepSize = gfTileSize; int gfOutMinIdentityPpt = 900; // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way // higher, expecting very similar sequences. -- But watch out for chunking effects in // gfLongDnaInMem. ... would be nice if gfLongDnaInMem scaled minMatch for the last little // chunk. int gfMinScore = 50; // Normally this would be 12 for a genome with size 29903 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250))) // but we expect a really good alignment so we can crank it up. (Again modulo chunking) int gfIMinMatch = 30; static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames, - struct slPair **retFailedSeqs) + int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs) /* Return a list of sequences that pass basic QC checks (appropriate size etc). * If any sequences have names that are already in the tree, add a prefix so usher doesn't * reject them. * Set retFailedSeqs to the list of sequences that failed checks. */ { struct seqInfo *filteredSeqs = NULL; struct hash *uniqNames = hashNew(0); *retFailedSeqs = NULL; struct dnaSeq *seq, *nextSeq; for (seq = seqs; seq != NULL; seq = nextSeq) { boolean passes = TRUE; nextSeq = seq->next; if (seq->size < minSeqSize) { @@ -136,31 +130,31 @@ si->nCountStart = nCountStart; si->nCountMiddle = nCountMiddle; si->nCountEnd = nCountEnd; si->ambigCount = ambigCount; hashAdd(uniqNames, seq->name, NULL); slAddHead(&filteredSeqs, si); } } } slReverse(&filteredSeqs); slReverse(retFailedSeqs); hashFree(&uniqNames); return filteredSeqs; } -static struct psl *alignSequences(char *db, struct dnaSeq *refGenome, struct seqInfo *seqs, +static struct psl *alignSequences(struct dnaSeq *refGenome, struct seqInfo *seqs, int *pStartTime) /* Use BLAT server to align seqs to reference; keep only the best alignment for each seq. * (In normal usage, there should be one very good alignment per seq.) */ { struct psl *alignments = NULL; struct genoFind *gf = gfIndexSeq(refGenome, gfIMinMatch, gfMaxGap, gfTileSize, gfRepMatch, gfOoc, FALSE, FALSE, FALSE, gfStepSize, FALSE); reportTiming(pStartTime, "gfIndexSeq"); struct tempName pslTn; trashDirFile(&pslTn, "ct", "pp", ".psl"); FILE *f = mustOpen(pslTn.forCgi, "w"); struct gfOutput *gvo = gfOutputPsl(gfOutMinIdentityPpt, FALSE, FALSE, f, FALSE, TRUE); struct seqInfo *si; for (si = seqs; si != NULL; si = si->next) { @@ -427,31 +421,31 @@ } if (blkIx < psl->blockCount - 1) { // Add no-calls for unaligned bases between this block and the next for (; tStart < psl->tStarts[blkIx+1]; tStart++) { if (informativeBases[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } } } // Add no-calls for unaligned bases after last block - for (tStart = psl->tEnd; tStart < chromSize; tStart++) + for (tStart = psl->tEnd; tStart < ref->size; tStart++) { if (informativeBases[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } slReverse(&qSeq->sncList); } } static void writeVcfSnv(struct snvInfo **snvsByPos, int gtCount, char *chrom, int chromStart, FILE *f) /* If snvsByPos[chromStart] is not NULL, write out a VCF data row with genotypes. */ { @@ -588,35 +582,48 @@ } struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, boolean *informativeBases, struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); -struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, retFailedSeqs); +int minSeqSize = 0, maxSeqSize = 0; +// Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. +char *minSeqSizeSetting = phyloPlaceDbSetting(db, "minSeqSize"); +if (isEmpty(minSeqSizeSetting)) + minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000; +else + minSeqSize = atoi(minSeqSizeSetting); +char *maxSeqSizeSetting = phyloPlaceDbSetting(db, "maxSeqSize"); +if (isEmpty(maxSeqSizeSetting)) + maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000; +else + maxSeqSize = atoi(maxSeqSizeSetting); +struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize, + retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { - struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime); + struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName); analyzeGaps(filteredSeqs, refGenome); slReverse(&sampleIds); reportTiming(pStartTime, "write VCF for uploaded FASTA"); }