bdc1ddd9389c69f90464e15fff6661b52befdd1c angie Sun May 23 13:49:32 2021 -0700 If a sequence can't be aligned to the reference then move it from filteredSeqs to failedSeqs so it doesn't end up in the VCF with no actual calls. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index c929d7a..e9082a5 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -202,30 +202,65 @@ si = hashFindVal(userSeqsByName, psl->qName); if (si) si->psl = psl; else warn("Aligned sequence name '%s' does not match any input sequence name", psl->qName); slAddHead(&filteredPsls, psl); } } hashFree(&alignedSeqs); hashFree(&userSeqsByName); slReverse(&filteredPsls); slReverse(retFailedPsls); return filteredPsls; } +static void removeUnalignedSeqs(struct seqInfo **pSeqs, struct psl *alignments, + struct slPair **pFailedSeqs) +/* If any seqs were not aligned, then remove them from *pSeqs and add them to *pFailedSeqs. */ +{ +struct seqInfo *alignedSeqs = NULL; +struct slPair *newFailedSeqs = NULL; +struct seqInfo *seq, *nextSeq; +for (seq = *pSeqs; seq != NULL; seq = nextSeq) + { + nextSeq = seq->next; + boolean found = FALSE; + struct psl *psl; + for (psl = alignments; psl != NULL; psl = psl->next) + { + if (sameString(psl->qName, seq->seq->name)) + { + found = TRUE; + break; + } + } + if (found) + slAddHead(&alignedSeqs, seq); + else + { + struct dyString *dy = dyStringCreate("Sequence %s could not be aligned to the reference; " + "skipping", seq->seq->name); + slPairAdd(&newFailedSeqs, dyStringCannibalize(&dy), seq); + } + } +slReverse(&alignedSeqs); +slReverse(&newFailedSeqs); +*pSeqs = alignedSeqs; +*pFailedSeqs = slCat(*pFailedSeqs, newFailedSeqs); +} + struct snvInfo /* Summary of SNV with sample genotypes (position and samples are externally defined) */ { int altCount; // number of alternate alleles int altAlloced; // number of allocated array elements for alt alleles char *altBases; // alternate allele values (not '\0'-terminated string) int *altCounts; // number of occurrences of each alt allele int *genotypes; // -1 for no-call, 0 for ref, otherwise 1+index in altBases int unkCount; // number of no-calls (genotype is -1) char refBase; // reference allele }; static struct snvInfo *snvInfoNew(char refBase, char altBase, int gtIx, int gtCount) /* Alloc & return a new snvInfo for ref & alt. If alt == ref, just record the genotype call. */ { @@ -549,30 +584,31 @@ struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); + removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName); analyzeGaps(filteredSeqs, refGenome); slReverse(&sampleIds); reportTiming(pStartTime, "write VCF for uploaded FASTA"); } } *retSampleIds = sampleIds;