0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3 angie Fri Oct 28 15:08:06 2022 -0700 Performance improvements for trees with millions of sequences: * Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher * Use usher-sampled instead of usher if server is not configured but usher-sampled is available * Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running * Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results) * Avoid parsing the protobuf and traversing the big tree unless absolutely necessary ** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c ** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree) ** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new. ** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 6e06e530..f914657 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -1,635 +1,633 @@ /* Align user sequence to the reference genome and write VCF */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "fa.h" #include "genoFind.h" #include "iupac.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "psl.h" #include "trashDir.h" double maxNs = 0.5; // Using blat defaults for these: int gfRepMatch = 1024*4; char *gfOoc = NULL; int gfStepSize = gfTileSize; int gfOutMinIdentityPpt = 900; // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way // higher, expecting very similar sequences. -- But watch out for chunking effects in // gfLongDnaInMem. ... would be nice if gfLongDnaInMem scaled minMatch for the last little // chunk. int gfMinScore = 50; // Normally this would be 12 for a genome with size 29903 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250))) // but we expect a really good alignment so we can crank it up. (Again modulo chunking) int gfIMinMatch = 30; static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames, int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs) /* Return a list of sequences that pass basic QC checks (appropriate size etc). * If any sequences have names that are already in the tree, add a prefix so usher doesn't * reject them. * Set retFailedSeqs to the list of sequences that failed checks. */ { struct seqInfo *filteredSeqs = NULL; struct hash *uniqNames = hashNew(0); *retFailedSeqs = NULL; struct dnaSeq *seq, *nextSeq; for (seq = seqs; seq != NULL; seq = nextSeq) { boolean passes = TRUE; nextSeq = seq->next; if (seq->size < minSeqSize) { struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d, must have " "at least %d); skipping.", seq->name, seq->size, minSeqSize); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); passes = FALSE; } else if (seq->size > maxSeqSize) { struct dyString *dy = dyStringCreate("Sequence %s has too many bases (%d, must have " "at most %d); skipping.", seq->name, seq->size, maxSeqSize); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); passes = FALSE; } if (!passes) continue; // blat requires lower case sequence. Lowercasing makes it easier to find n's and N's too: toLowerN(seq->dna, seq->size); // Many sequences begin and end with blocks of N bases; treat those differently than Ns in middle int nCountTotal = countChars(seq->dna, 'n'); int nCountStart = 0; while (seq->dna[nCountStart] == 'n') nCountStart++; int nCountEnd = 0; if (nCountStart < seq->size) { while (seq->dna[seq->size - 1 - nCountEnd] == 'n') nCountEnd++; } int nCountMiddle = nCountTotal - (nCountStart + nCountEnd); int effectiveLength = seq->size - (nCountStart + nCountEnd); if (effectiveLength < minSeqSize) { struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d excluding %d Ns " "at beginning and %d Ns at end), " "must have at least %d); skipping.", seq->name, effectiveLength, nCountStart, nCountEnd, minSeqSize); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); passes = FALSE; } if (!passes) continue; if (nCountMiddle > maxNs * effectiveLength) { struct dyString *dy = dyStringCreate("Sequence %s has too many N bases (%d out of %d " "> %0.2f); skipping.", seq->name, nCountMiddle, effectiveLength, maxNs); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); passes = FALSE; } int ambigCount = 0; int i; for (i = 0; i < seq->size; i++) if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i])) ambigCount++; if (passes) { if (hashLookup(uniqNames, seq->name)) { struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; " "ignoring subsequent usage " "(%d bases, %d N's, %d ambiguous).", seq->name, seq->size, nCountTotal, ambigCount); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); } else { - if (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name)) + if (treeNames && + (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name))) { // usher will reject any sequence whose name is already in the tree, even a // numeric internal node name. Add a prefix so usher won't reject sequence. char newName[strlen(seq->name)+32]; safef(newName, sizeof newName, "uploaded_%s", seq->name); freeMem(seq->name); seq->name = cloneString(newName); } struct seqInfo *si; AllocVar(si); si->seq = seq; si->nCountStart = nCountStart; si->nCountMiddle = nCountMiddle; si->nCountEnd = nCountEnd; si->ambigCount = ambigCount; hashAdd(uniqNames, seq->name, NULL); slAddHead(&filteredSeqs, si); } } } slReverse(&filteredSeqs); slReverse(retFailedSeqs); hashFree(&uniqNames); return filteredSeqs; } static struct psl *alignSequences(struct dnaSeq *refGenome, struct seqInfo *seqs, int *pStartTime) /* Use BLAT server to align seqs to reference; keep only the best alignment for each seq. * (In normal usage, there should be one very good alignment per seq.) */ { struct psl *alignments = NULL; struct genoFind *gf = gfIndexSeq(refGenome, gfIMinMatch, gfMaxGap, gfTileSize, gfRepMatch, gfOoc, FALSE, FALSE, FALSE, gfStepSize, FALSE); reportTiming(pStartTime, "gfIndexSeq"); struct tempName pslTn; trashDirFile(&pslTn, "ct", "pp", ".psl"); FILE *f = mustOpen(pslTn.forCgi, "w"); struct gfOutput *gvo = gfOutputPsl(gfOutMinIdentityPpt, FALSE, FALSE, f, FALSE, TRUE); struct seqInfo *si; for (si = seqs; si != NULL; si = si->next) { // Positive strand only; we're expecting a mostly complete match to the reference gfLongDnaInMem(si->seq, gf, FALSE, gfMinScore, NULL, gvo, FALSE, FALSE); } reportTiming(pStartTime, "gfLongDnaInMem all"); gfOutputFree(&gvo); carefulClose(&f); alignments = pslLoadAll(pslTn.forCgi); reportTiming(pStartTime, "load PSL file"); //#*** TODO: keep only the best alignment for each seq. return alignments; } struct psl *checkAlignments(struct psl *psls, struct seqInfo *userSeqs, struct slPair **retFailedPsls) /* No thresholds applied at this point - just makes sure there aren't multiple PSLs per seq. */ { struct psl *filteredPsls = NULL; *retFailedPsls = NULL; struct hash *userSeqsByName = hashNew(0); struct seqInfo *si; for (si = userSeqs; si != NULL; si = si->next) hashAdd(userSeqsByName, si->seq->name, si); struct hash *alignedSeqs = hashNew(0); struct psl *psl, *nextPsl; for (psl = psls; psl != NULL; psl = nextPsl) { nextPsl = psl->next; boolean passes = TRUE; struct psl *otherPsl = hashFindVal(alignedSeqs, psl->qName); if (otherPsl) { //#*** Is this ever going to happen? Maybe if there's a large dup??? //#*** Is there any condition under which we would want to try to merge? (Would expect blat //#*** to have done that) struct dyString *dy = dyStringCreate("Warning: multiple alignments found for sequence %s " "(%d-%d and %d-%d). Skipping alignment of %d-%d", psl->qName, otherPsl->qStart, otherPsl->qEnd, psl->qStart, psl->qEnd, psl->qStart, psl->qEnd); slPairAdd(retFailedPsls, dyStringCannibalize(&dy), psl); passes = FALSE; } hashAdd(alignedSeqs, psl->qName, psl); if (passes) { si = hashFindVal(userSeqsByName, psl->qName); if (si) si->psl = psl; else warn("Aligned sequence name '%s' does not match any input sequence name", psl->qName); slAddHead(&filteredPsls, psl); } } hashFree(&alignedSeqs); hashFree(&userSeqsByName); slReverse(&filteredPsls); slReverse(retFailedPsls); return filteredPsls; } static void removeUnalignedSeqs(struct seqInfo **pSeqs, struct psl *alignments, struct slPair **pFailedSeqs) /* If any seqs were not aligned, then remove them from *pSeqs and add them to *pFailedSeqs. */ { struct seqInfo *alignedSeqs = NULL; struct slPair *newFailedSeqs = NULL; struct seqInfo *seq, *nextSeq; for (seq = *pSeqs; seq != NULL; seq = nextSeq) { nextSeq = seq->next; boolean found = FALSE; struct psl *psl; for (psl = alignments; psl != NULL; psl = psl->next) { if (sameString(psl->qName, seq->seq->name)) { found = TRUE; break; } } if (found) slAddHead(&alignedSeqs, seq); else { struct dyString *dy = dyStringCreate("Sequence %s could not be aligned to the reference; " "skipping", seq->seq->name); slPairAdd(&newFailedSeqs, dyStringCannibalize(&dy), seq); } } slReverse(&alignedSeqs); slReverse(&newFailedSeqs); *pSeqs = alignedSeqs; *pFailedSeqs = slCat(*pFailedSeqs, newFailedSeqs); } struct snvInfo /* Summary of SNV with sample genotypes (position and samples are externally defined) */ { int altCount; // number of alternate alleles int altAlloced; // number of allocated array elements for alt alleles char *altBases; // alternate allele values (not '\0'-terminated string) int *altCounts; // number of occurrences of each alt allele int *genotypes; // -1 for no-call, 0 for ref, otherwise 1+index in altBases int unkCount; // number of no-calls (genotype is -1) char refBase; // reference allele }; static struct snvInfo *snvInfoNew(char refBase, char altBase, int gtIx, int gtCount) /* Alloc & return a new snvInfo for ref & alt. If alt == ref, just record the genotype call. */ { struct snvInfo *snv = NULL; AllocVar(snv); snv->refBase = refBase; snv->altAlloced = 4; AllocArray(snv->altBases, snv->altAlloced); AllocArray(snv->altCounts, snv->altAlloced); AllocArray(snv->genotypes, gtCount); // Initialize snv->genotypes to all no-call int i; for (i = 0; i < gtCount; i++) snv->genotypes[i] = -1; if (altBase == refBase) snv->genotypes[gtIx] = 0; else if (altBase == 'n' || altBase == '-') { snv->genotypes[gtIx] = -1; snv->unkCount = 1; } else { snv->altCount = 1; snv->altBases[0] = altBase; snv->altCounts[0] = 1; snv->genotypes[gtIx] = 1; } return snv; } static void snvInfoAddCall(struct snvInfo *snv, char refBase, char altBase, int gtIx) /* Add a new genotype call to snv. */ { if (refBase != snv->refBase) errAbort("snvInfoAddCall: snv->refBase is %c but incoming refBase is %c", snv->refBase, refBase); // If we have already recorded a call for this sample at this position, then there's an overlap // in alignments (perhaps a deletion in q); we're just looking for SNVs so ignore this base. if (snv->genotypes[gtIx] >= 0) return; if (altBase == refBase) snv->genotypes[gtIx] = 0; else if (altBase == 'n' || altBase == '-') { snv->genotypes[gtIx] = -1; snv->unkCount++; } else { int altIx; for (altIx = 0; altIx < snv->altCount; altIx++) if (altBase == snv->altBases[altIx]) break; if (altIx < snv->altCount) snv->altCounts[altIx]++; else { // We haven't seen this alt before; record the new alt, expanding arrays if necessary. if (snv->altCount == snv->altAlloced) { int newAlloced = snv->altAlloced * 2; ExpandArray(snv->altBases, snv->altAlloced, newAlloced); ExpandArray(snv->altCounts, snv->altAlloced, newAlloced); snv->altAlloced = newAlloced; } snv->altCount++; snv->altBases[altIx] = altBase; snv->altCounts[altIx] = 1; } snv->genotypes[gtIx] = 1 + altIx; } } static struct seqInfo *mustFindSeqAndIx(char *qName, struct seqInfo *querySeqs, int *retGtIx) /* Find qName and its index in querySeqs or errAbort. */ { struct seqInfo *qSeq; int gtIx; for (gtIx = 0, qSeq = querySeqs; qSeq != NULL; gtIx++, qSeq = qSeq->next) if (sameString(qName, qSeq->seq->name)) break; if (!qSeq) errAbort("PSL query '%s' not found in input sequences", qName); *retGtIx = gtIx; return qSeq; } static int addCall(struct snvInfo **snvsByPos, int tStart, char refBase, char altBase, int gtIx, int gtCount) /* If refBase is not N, add a call (or no-call if altBase is N or '-') to snvsByPos[tStart]. * Return the VCF genotype code (-1 for missing/N, 0 for ref, 1 for first alt etc). */ { if (refBase != 'n') { if (snvsByPos[tStart] == NULL) snvsByPos[tStart] = snvInfoNew(refBase, altBase, gtIx, gtCount); else snvInfoAddCall(snvsByPos[tStart], refBase, altBase, gtIx); return snvsByPos[tStart]->genotypes[gtIx]; } return -1; } static void extractSnvs(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs, - struct snvInfo **snvsByPos, boolean *informativeBases, - struct slName **maskSites) + struct snvInfo **snvsByPos, struct slName **maskSites) /* For each PSL, find single-nucleotide differences between ref and query (one of querySeqs), * building up target position-indexed array of snvInfo w/genotypes in same order as querySeqs. - * Add explicit no-calls for unaligned positions in informativeBases. */ + * Add explicit no-calls for unaligned positions that are not masked. */ { int gtCount = slCount(querySeqs); struct psl *psl; for (psl = psls; psl != NULL; psl = psl->next) { if (!sameString(psl->strand, "+")) errAbort("extractSnvs: expected strand to be '+' but it's '%s'", psl->strand); int gtIx; struct seqInfo *qSeq = mustFindSeqAndIx(psl->qName, querySeqs, >Ix); // Add no-calls for unaligned bases before first block int tStart; for (tStart = 0; tStart < psl->tStart; tStart++) { - if (informativeBases[tStart]) + if (!maskSites[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } int blkIx; for (blkIx = 0; blkIx < psl->blockCount; blkIx++) { // Add genotypes for aligned bases tStart = psl->tStarts[blkIx]; int tEnd = tStart + psl->blockSizes[blkIx]; int qStart = psl->qStarts[blkIx]; for (; tStart < tEnd; tStart++, qStart++) { char refBase = ref->dna[tStart]; char altBase = qSeq->seq->dna[qStart]; if (!altBase) errAbort("nil altBase at %s:%d", qSeq->seq->name, qStart); int gt = addCall(snvsByPos, tStart, refBase, altBase, gtIx, gtCount); if (gt > 0) { struct singleNucChange *snc = sncNew(tStart, refBase, '\0', altBase); struct slName *maskedReasons = maskSites[tStart]; if (maskedReasons) { slAddHead(&qSeq->maskedSncList, snc); slAddHead(&qSeq->maskedReasonsList, slRefNew(maskedReasons)); } else slAddHead(&qSeq->sncList, snc); } } if (blkIx < psl->blockCount - 1) { // Add no-calls for unaligned bases between this block and the next for (; tStart < psl->tStarts[blkIx+1]; tStart++) { - if (informativeBases[tStart]) + if (!maskSites[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } } } // Add no-calls for unaligned bases after last block for (tStart = psl->tEnd; tStart < ref->size; tStart++) { - if (informativeBases[tStart]) + if (!maskSites[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } slReverse(&qSeq->sncList); } } static void writeVcfSnv(struct snvInfo **snvsByPos, int gtCount, char *chrom, int chromStart, FILE *f) /* If snvsByPos[chromStart] is not NULL, write out a VCF data row with genotypes. */ { struct snvInfo *snv = snvsByPos[chromStart]; if (snv && (snv->altCount > 0 || snv->unkCount > 0)) { if (snv->altCount == 0) snv->altBases[0] = '*'; int pos = chromStart+1; fprintf(f, "%s\t%d\t%c%d%c", chrom, pos, toupper(snv->refBase), pos, toupper(snv->altBases[0])); int altIx; for (altIx = 1; altIx < snv->altCount; altIx++) fprintf(f, ",%c%d%c", toupper(snv->refBase), pos, toupper(snv->altBases[altIx])); fprintf(f, "\t%c\t%c", toupper(snv->refBase), toupper(snv->altBases[0])); for (altIx = 1; altIx < snv->altCount; altIx++) fprintf(f, ",%c", toupper(snv->altBases[altIx])); fputs("\t.\t.\t", f); fprintf(f, "AC=%d", snv->altCounts[0]); for (altIx = 1; altIx < snv->altCount; altIx++) fprintf(f, ",%d", snv->altCounts[altIx]); int callCount = gtCount; int gtIx; for (gtIx = 0; gtIx < gtCount; gtIx++) if (snv->genotypes[gtIx] < 0) callCount--; fprintf(f, ";AN=%d\tGT", callCount); for (gtIx = 0; gtIx < gtCount; gtIx++) if (snv->genotypes[gtIx] < 0) fputs("\t.", f); else fprintf(f, "\t%d", snv->genotypes[gtIx]); fputc('\n', f); } } static void writeSnvsToVcfFile(struct snvInfo **snvsByPos, struct dnaSeq *ref, char **sampleNames, int sampleCount, struct slName **maskSites, char *vcfFileName) /* Given an array of SNVs (one per base of ref, probably mostly NULL) and sequence of samples, * write a VCF header and rows with genotypes. */ { FILE *f = mustOpen(vcfFileName, "w"); fprintf(f, "##fileformat=VCFv4.2\n"); fprintf(f, "##reference=%s\n", ref->name); fprintf(f, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); int i; for (i = 0; i < sampleCount; i++) fprintf(f, "\t%s", sampleNames[i]); fputc('\n', f); int chromStart; for (chromStart = 0; chromStart < ref->size; chromStart++) if (!maskSites[chromStart]) writeVcfSnv(snvsByPos, sampleCount, ref->name, chromStart, f); carefulClose(&f); } static void pslSnvsToVcfFile(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs, - char *vcfFileName, boolean *informativeBases, struct slName **maskSites) + char *vcfFileName, struct slName **maskSites) /* Find single-nucleotide differences between each query sequence and reference, and * write out a VCF file with genotype columns for the queries. */ { struct snvInfo *snvsByPos[ref->size]; memset(snvsByPos, 0, sizeof snvsByPos); -extractSnvs(psls, ref, querySeqs, snvsByPos, informativeBases, maskSites); +extractSnvs(psls, ref, querySeqs, snvsByPos, maskSites); int sampleCount = slCount(querySeqs); char *sampleNames[sampleCount]; struct seqInfo *qSeq; int i; for (i = 0, qSeq = querySeqs; i < sampleCount; i++, qSeq = qSeq->next) sampleNames[i] = qSeq->seq->name; writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName); } static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome) /* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */ { struct seqInfo *si; for (si = filteredSeqs; si != NULL; si = si->next) { struct psl *psl = si->psl; if (psl && (psl->qBaseInsert || psl->tBaseInsert)) { struct dyString *dyIns = dyStringNew(0); struct dyString *dyDel = dyStringNew(0); int insBases = 0, delBases = 0; int ix; for (ix = 0; ix < psl->blockCount - 1; ix++) { int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; int qGapEnd = psl->qStarts[ix+1]; int qGapLen = qGapEnd - qGapStart; int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; int tGapEnd = psl->tStarts[ix+1]; int tGapLen = tGapEnd - tGapStart; if (qGapLen > tGapLen) { int insLen = qGapLen - tGapLen; insBases += insLen; if (isNotEmpty(dyIns->string)) dyStringAppend(dyIns, ", "); if (insLen <= 12) { char insSeq[insLen+1]; safencpy(insSeq, sizeof insSeq, si->seq->dna + qGapEnd - insLen, insLen); touppers(insSeq); dyStringPrintf(dyIns, "%d-%d:%s", tGapEnd, tGapEnd+1, insSeq); } else dyStringPrintf(dyIns, "%d-%d:%d bases", tGapEnd, tGapEnd+1, insLen); } else if (tGapLen > qGapLen) { int delLen = tGapLen - qGapLen;; delBases += delLen; if (isNotEmpty(dyDel->string)) dyStringAppend(dyDel, ", "); if (delLen <= 12) { char delSeq[delLen+1]; safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, delLen); touppers(delSeq); dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq); } else dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); } } si->insBases = insBases; si->delBases = delBases; si->insRanges = dyStringCannibalize(&dyIns); si->delRanges = dyStringCannibalize(&dyDel); } } } struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, - boolean *informativeBases, struct slName **maskSites, - struct hash *treeNames, + struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); int minSeqSize = 0, maxSeqSize = 0; // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. char *minSeqSizeSetting = phyloPlaceDbSetting(db, "minSeqSize"); if (isEmpty(minSeqSizeSetting)) minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000; else minSeqSize = atoi(minSeqSizeSetting); char *maxSeqSizeSetting = phyloPlaceDbSetting(db, "maxSeqSize"); if (isEmpty(maxSeqSizeSetting)) maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000; else maxSeqSize = atoi(maxSeqSizeSetting); struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); - pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases, - maskSites); + pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName); analyzeGaps(filteredSeqs, refGenome); slReverse(&sampleIds); reportTiming(pStartTime, "write VCF for uploaded FASTA"); } } *retSampleIds = sampleIds; *retSeqInfo = filteredSeqs; return tn; }