da3772f3615ae3aa407af6aa2215a1b2ff480183 angie Mon Apr 29 13:40:41 2024 -0700 If configured, use nextclade to align sequences (better for more diverged viruses like influenza). Also fix some buttons that the last change broke. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 37d1b12..919c5ed 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -1,25 +1,26 @@ /* Align user sequence to the reference genome and write VCF */ /* Copyright (C) 2020-2024 The Regents of the University of California */ #include "common.h" #include "fa.h" #include "genoFind.h" #include "iupac.h" #include "parsimonyProto.h" #include "phyloPlace.h" +#include "pipeline.h" #include "psl.h" #include "trashDir.h" double maxNs = 0.5; // Using blat defaults for these: int gfRepMatch = 1024*4; char *gfOoc = NULL; int gfStepSize = gfTileSize; int gfOutMinIdentityPpt = 900; // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way // higher, expecting very similar sequences. -- But watch out for chunking effects in // gfLongDnaInMem. ... would be nice if gfLongDnaInMem scaled minMatch for the last little // chunk. @@ -593,65 +594,413 @@ touppers(delSeq); dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq); } else dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); } } si->insBases = insBases; si->delBases = delBases; si->insRanges = dyStringCannibalize(&dyIns); si->delRanges = dyStringCannibalize(&dyDel); } } } +static struct tempName *alignWithBlat(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome, + struct slName **maskSites, struct slName **retSampleIds, + struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, + struct slPair **retFailedPsls, int *pStartTime) +/* Use blat gf code to align filteredSeqs to refGenome, extract SNVs from alignment, and + * save as VCF with sample genotype columns. */ +{ +struct tempName *tn = NULL; +struct slName *sampleIds = NULL; +struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); +struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); +removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); +if (filteredAlignments) + { + AllocVar(tn); + trashDirFile(tn, "ct", "pp", ".vcf"); + pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites); + struct psl *psl; + for (psl = filteredAlignments; psl != NULL; psl = psl->next) + slNameAddHead(&sampleIds, psl->qName); + analyzeGaps(filteredSeqs, refGenome); + slReverse(&sampleIds); + reportTiming(pStartTime, "write VCF for uploaded FASTA"); + } +*retSampleIds = sampleIds; +*retSeqInfo = filteredSeqs; +return tn; +} + +char *runNextcladeRun(char *seqFile, char *nextcladeDataset) +/* Run the nextclade run command on uploaded seqs, return output TSV file name. */ +{ +char *nextcladePath = getNextcladePath(); +struct tempName tnNextcladeOut; +trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_run", ".tsv"); +#define NEXTCLADE_NUM_THREADS "4" +char *cmd[] = { nextcladePath, "run", + "--input-dataset", nextcladeDataset, + "--output-tsv", tnNextcladeOut.forCgi, + "--output-columns-selection", + "alignmentStart,alignmentEnd,substitutions,deletions,insertions,missing,nonACGTNs", + "-j", NEXTCLADE_NUM_THREADS, + seqFile, NULL }; +char **cmds[] = { cmd, NULL }; +struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, "/dev/stderr", 0); +pipelineClose(&pl); +return cloneString(tnNextcladeOut.forCgi); +} + +static int countNextcladeAlignedSeqs(char *nextcladeOut, struct seqInfo **pFilteredSeqs, + struct slPair **retFailedSeqs) +/* Before we can start building genotype arrays for VCF we need to know the number of aligned + * sequences, and to remove unaligned sequences from pFilteredSeqs (add them to retFailedSeqs). */ +{ +struct hash *alignedSeqNames = hashNew(0); +struct lineFile *lf = lineFileOpen(nextcladeOut, TRUE); +// First line is header; get column indexes +char *row[100]; +int headerColCount = lineFileChopTab(lf, row); +if (headerColCount <= 0) + errAbort("nextclade run output is empty"); +int seqNameIx = stringArrayIx("seqName", row, headerColCount); +int alStartIx = stringArrayIx("alignmentStart", row, headerColCount); +int gtCount = 0; +int colCount = 0; +while ((colCount = lineFileChopTab(lf, row)) > 0) + { + char *seqName = row[seqNameIx]; + if (hashLookup(alignedSeqNames, seqName) != NULL) + errAbort("Duplicate seqName '%s' in nextclade output", seqName); + // Nextclade still includes unaligned sequences in its output, just with empty alignment columns + if (isNotEmpty(row[alStartIx])) + { + hashAdd(alignedSeqNames, seqName, NULL); + gtCount++; + } + } +lineFileClose(&lf); +// Set pFilteredSeqs and retFailedSeqs according to which seqs could be aligned or not. +struct seqInfo *alignedSeqs = NULL, *nextSi = NULL; +struct slPair *newFailedSeqs = NULL; +struct seqInfo *si; +for (si = *pFilteredSeqs; si != NULL; si = nextSi) + { + nextSi = si->next; + if (hashLookup(alignedSeqNames, si->seq->name) != NULL) + slAddHead(&alignedSeqs, si); + else + { + struct dyString *dy = dyStringCreate("Sequence %s could not be aligned to the reference; " + "skipping", si->seq->name); + slPairAdd(&newFailedSeqs, dyStringCannibalize(&dy), si); + } + } +slReverse(&alignedSeqs); +slReverse(&newFailedSeqs); +*pFilteredSeqs = alignedSeqs; +*retFailedSeqs = slCat(*retFailedSeqs, newFailedSeqs); +hashFree(&alignedSeqNames); +return gtCount; +} + +static void parseNextcladeSubstitutions(char *substStr, struct seqInfo *si, struct dnaSeq *ref, + struct slName **maskSites, int gtIx, int gtCount, + struct snvInfo **snvsByPos) +/* Add genotypes for substitutions; also build up si fields for substs and masked substs */ +{ +int substCount = chopByChar(substStr, ',', NULL, 0); +char *substArr[substCount]; +chopCommas(substStr, substArr); +int i; +for (i = 0; i < substCount; i++) + { + char *subst = substArr[i]; + int t = atol(subst+1) - 1; + char altBase = subst[strlen(subst)-1]; + int gt = addCall(snvsByPos, t, ref->dna[t], tolower(altBase), gtIx, gtCount); + if (gt > 0) + { + struct singleNucChange *snc = sncNew(t, ref->dna[t], '\0', altBase); + struct slName *maskedReasons = maskSites[t]; + if (maskedReasons) + { + slAddHead(&si->maskedSncList, snc); + slAddHead(&si->maskedReasonsList, slRefNew(maskedReasons)); + } + else + slAddHead(&si->sncList, snc); + } + } +} + +static void parseNextcladeMissing(char *nRangeStr, struct dnaSeq *ref, struct slName **maskSites, + int gtIx, int gtCount, struct snvInfo **snvsByPos) +/* Add no-calls to snvsByPos for ranges of N bases */ +{ +int nRangeCount = chopByChar(nRangeStr, ',', NULL, 0); +char *nRangeArr[nRangeCount]; +chopCommas(nRangeStr, nRangeArr); +int i; +for (i = 0; i < nRangeCount; i++) + { + char *nRange = nRangeArr[i]; + int nStart = atol(nRange) - 1; + int nEnd = nStart + 1; + char *p = strchr(nRange, '-'); + if (p) + nEnd = atol(p+1); + int t; + for (t = nStart; t < nEnd; t++) + if (!maskSites[t]) + addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); + } +} + +static void parseNextcladeAmbig(char *ambigStr, struct dnaSeq *ref, + int gtIx, int gtCount, struct snvInfo **snvsByPos) +/* Add calls to snvsByPos for non-N ambiguous bases */ +{ +int ambigCount = chopByChar(ambigStr, ',', NULL, 0); +char *ambigArr[ambigCount]; +chopCommas(ambigStr, ambigArr); +int i; +for (i = 0; i < ambigCount; i++) + { + char *ambig = ambigArr[i]; + char *p = strchr(ambig, ':'); + if (! p) + errAbort("missing ':' separator in ambig '%s'", ambig); + int aStart = atol(p+1) - 1; + int aEnd = aStart+1; + p = strchr(ambig, '-'); + if (p != NULL) + aEnd = atol(p+1); + int t; + for (t = aStart; t < aEnd; t++) + addCall(snvsByPos, t, ref->dna[t], ambig[0], gtIx, gtCount); + } +} + +static void parseNextcladeDeletions(char *delStr, struct seqInfo *si, struct dnaSeq *ref) +/* We don't add calls for insertions or deletions, but we do describe them in seqInfo */ +{ +int delRangeCount = chopByChar(delStr, ',', NULL, 0); +char *delRangeArr[delRangeCount]; +chopCommas(delStr, delRangeArr); +int delBases = 0; +struct dyString *dy = dyStringNew(0); +int i; +for (i = 0; i < delRangeCount; i++) + { + char *delRange = delRangeArr[i]; + int delStart = atol(delRange) - 1; + int delEnd = delStart + 1; + char *p = strchr(delRange, '-'); + if (p) + delEnd = atol(p+1); + int delLen = delEnd - delStart; + delBases += delLen; + if (isNotEmpty(dy->string)) + dyStringAppend(dy, ", "); + if (delLen <= 12) + { + char delSeq[delLen+1]; + safencpy(delSeq, sizeof delSeq, ref->dna + delStart, delLen); + touppers(delSeq); + dyStringPrintf(dy, "%d-%d:%s", delStart + 1, delEnd, delSeq); + } + else + dyStringPrintf(dy, "%d-%d:%d bases", delStart + 1, delEnd, delLen); + } +si->delBases = delBases; +si->delRanges = dyStringCannibalize(&dy); +} + +static void parseNextcladeInsertions(char *insStr, struct seqInfo *si, struct dnaSeq *ref) +/* We don't add calls for insertions or deletions, but we do describe them in seqInfo */ +{ +int insCount = chopByChar(insStr, ',', NULL, 0); +char *insArr[insCount]; +chopCommas(insStr, insArr); +int insBases = 0; +struct dyString *dy = dyStringNew(0); +int i; +for (i = 0; i < insCount; i++) + { + char *ins = insArr[i]; + int t = atol(ins); + char *p = strchr(ins, ':'); + if (! p) + errAbort("missing ':' separator in insertion '%s'", ins); + char *insSeq = p+1; + int insLen = strlen(insSeq); + insBases += insLen; + if (isNotEmpty(dy->string)) + dyStringAppend(dy, ", "); + if (insLen <= 12) + dyStringPrintf(dy, "%d-%d:%s", t, t+1, insSeq); + else + dyStringPrintf(dy, "%d-%d:%d bases", t, t+1, insLen); + } +si->insBases = insBases; +si->insRanges = dyStringCannibalize(&dy); +} + +static struct snvInfo **parseNextcladeRun(char *nextcladeOut, struct seqInfo **pFilteredSeqs, + struct dnaSeq *ref, struct slName **maskSites, + struct slPair **retFailedSeqs) +/* Parse the TSV output from nextcladeRun into a per-base array of snvInfo for making VCF, + * while filling in alignment-related seqInfo fields. + * Append any unaligned sequences to retFailedSeqs and set pFilteredSeqs to aligned seqs. */ +{ +struct snvInfo **snvsByPos = NULL; +AllocArray(snvsByPos, ref->size); +int gtCount = countNextcladeAlignedSeqs(nextcladeOut, pFilteredSeqs, retFailedSeqs); +struct lineFile *lf = lineFileOpen(nextcladeOut, TRUE); +// First line is header; get column indexes +char *row[100]; +int headerColCount = lineFileChopTab(lf, row); +if (headerColCount <= 0) + errAbort("nextclade run output is empty"); +int seqNameIx = stringArrayIx("seqName", row, headerColCount); +int alStartIx = stringArrayIx("alignmentStart", row, headerColCount); +int alEndIx = stringArrayIx("alignmentEnd", row, headerColCount); +int substIx = stringArrayIx("substitutions", row, headerColCount); +int delIx = stringArrayIx("deletions", row, headerColCount); +int insIx = stringArrayIx("insertions", row, headerColCount); +int nIx = stringArrayIx("missing", row, headerColCount); +int ambigIx = stringArrayIx("nonACGTNs", row, headerColCount); +if (seqNameIx == -1 || alStartIx == -1 || alEndIx == -1 || substIx == -1 || delIx == -1 || + insIx == -1 || nIx == -1 || ambigIx == -1) + errAbort("nextclade run output header line does not contain all expected columns"); +int colCount = 0; +while ((colCount = lineFileChopTab(lf, row)) > 0) + { + char *seqName = row[seqNameIx]; + // Nextclade still includes unaligned sequences in its output, just with empty alignment columns + if (isEmpty(row[alStartIx])) + continue; + int gtIx; + struct seqInfo *si = mustFindSeqAndIx(seqName, *pFilteredSeqs, >Ix); + int tStart = atol(row[alStartIx]) - 1; + // Add no-calls for unaligned bases before alignmentStart + int t; + for (t = 0; t < tStart; t++) + { + if (!maskSites[t]) + addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); + } + int tEnd = atol(row[alEndIx]); + // Add no-calls for unaligned bases after alignmentEnd + for (t = tEnd; t < ref->size; t++) + { + if (!maskSites[t]) + addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); + } + parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos); + parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos); + parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos); + parseNextcladeDeletions(row[delIx], si, ref); + parseNextcladeInsertions(row[insIx], si, ref); + } +lineFileClose(&lf); +return snvsByPos; +} + +static struct tempName *alignWithNextclade(struct seqInfo *filteredSeqs, char *nextcladeDataset, + struct dnaSeq *ref, struct slName **maskSites, + struct slName **retSampleIds, + struct seqInfo **retSeqInfo, + struct slPair **retFailedSeqs, int *pStartTime) +/* Use nextclade to align filteredSeqs to ref, extract SNVs from alignment, and + * save as VCF with sample genotype columns. */ +{ +struct tempName *tn = NULL; +struct slName *sampleIds = NULL; +// Dump filteredSeqs to fasta file for nextclade +struct tempName tnSeqFile; +trashDirFile(&tnSeqFile, "ct", "usher_nextclade_input", ".txt"); +FILE *f = mustOpen(tnSeqFile.forCgi, "w"); +struct seqInfo *si; +for (si = filteredSeqs; si != NULL; si = si->next) + faWriteNext(f, si->seq->name, si->seq->dna, si->seq->size); +carefulClose(&f); +reportTiming(pStartTime, "dump sequences to file"); +char *nextcladeOut = runNextcladeRun(tnSeqFile.forCgi, nextcladeDataset); +reportTiming(pStartTime, "run nextclade run"); +struct snvInfo **snvsByPos = parseNextcladeRun(nextcladeOut, &filteredSeqs, ref, maskSites, + retFailedSeqs); +// Write VCF +if (filteredSeqs) + { + AllocVar(tn); + trashDirFile(tn, "ct", "pp", ".vcf"); + int sampleCount = slCount(filteredSeqs); + char *sampleNames[sampleCount]; + struct seqInfo *si; + int i; + for (i = 0, si = filteredSeqs; i < sampleCount; i++, si = si->next) + { + sampleNames[i] = si->seq->name; + slNameAddHead(&sampleIds, si->seq->name); + } + slReverse(&sampleIds); + writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, tn->forCgi); + } + +// Clean up +freeMem(snvsByPos); +unlink(tnSeqFile.forCgi); +unlink(nextcladeOut); +*retSampleIds = sampleIds; +*retSeqInfo = filteredSeqs; +return tn; +} + struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome, struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; -struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); int minSeqSize = 0, maxSeqSize = 0; // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. char *minSeqSizeSetting = phyloPlaceRefSetting(org, db, "minSeqSize"); if (isEmpty(minSeqSizeSetting)) minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000; else minSeqSize = atoi(minSeqSizeSetting); char *maxSeqSizeSetting = phyloPlaceRefSetting(org, db, "maxSeqSize"); if (isEmpty(maxSeqSizeSetting)) maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000; else maxSeqSize = atoi(maxSeqSizeSetting); struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { - struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); - struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); - removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); - if (filteredAlignments) + char *nextcladeDataset = phyloPlaceRefSettingPath(org, db, "nextcladeDataset"); + if (nextcladeDataset) { - AllocVar(tn); - trashDirFile(tn, "ct", "pp", ".vcf"); - pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites); - struct psl *psl; - for (psl = filteredAlignments; psl != NULL; psl = psl->next) - slNameAddHead(&sampleIds, psl->qName); - analyzeGaps(filteredSeqs, refGenome); - slReverse(&sampleIds); - reportTiming(pStartTime, "write VCF for uploaded FASTA"); + *retFailedPsls = NULL; + tn = alignWithNextclade(filteredSeqs, nextcladeDataset, refGenome, maskSites, retSampleIds, + retSeqInfo, retFailedSeqs, pStartTime); } + else + tn = alignWithBlat(filteredSeqs, refGenome, maskSites, retSampleIds, retSeqInfo, + retFailedSeqs, retFailedPsls, pStartTime); } -*retSampleIds = sampleIds; -*retSeqInfo = filteredSeqs; return tn; }