5956c40955854fd83b75627124f6420518a78960 angie Wed Feb 24 20:51:42 2021 -0800 Add more columns from summary table to TSV summary file. Move PSL indel-tallying code upstream & store results for summaries. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 501fbf0..ca46b1e 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -469,51 +469,114 @@ /* Find single-nucleotide differences between each query sequence and reference, and * write out a VCF file with genotype columns for the queries. */ { struct snvInfo *snvsByPos[ref->size]; memset(snvsByPos, 0, sizeof snvsByPos); extractSnvs(psls, ref, querySeqs, snvsByPos, informativeBases, maskSites); int sampleCount = slCount(querySeqs); char *sampleNames[sampleCount]; struct seqInfo *qSeq; int i; for (i = 0, qSeq = querySeqs; i < sampleCount; i++, qSeq = qSeq->next) sampleNames[i] = qSeq->seq->name; writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName); } +static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome) +/* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */ +{ +struct seqInfo *si; +for (si = filteredSeqs; si != NULL; si = si->next) + { + struct psl *psl = si->psl; + if (psl && (psl->qBaseInsert || psl->tBaseInsert)) + { + struct dyString *dyIns = dyStringNew(0); + struct dyString *dyDel = dyStringNew(0); + int insBases = 0, delBases = 0; + int ix; + for (ix = 0; ix < psl->blockCount - 1; ix++) + { + int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; + int qGapEnd = psl->qStarts[ix+1]; + int qGapLen = qGapEnd - qGapStart; + int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; + int tGapEnd = psl->tStarts[ix+1]; + int tGapLen = tGapEnd - tGapStart; + if (qGapLen > tGapLen) + { + int insLen = qGapLen - tGapLen; + insBases += insLen; + if (isNotEmpty(dyIns->string)) + dyStringAppend(dyIns, ", "); + if (insLen <= 12) + { + char insSeq[insLen+1]; + safencpy(insSeq, sizeof insSeq, si->seq->dna + qGapEnd - insLen, insLen); + touppers(insSeq); + dyStringPrintf(dyIns, "%d-%d:%s", tGapEnd, tGapEnd+1, insSeq); + } + else + dyStringPrintf(dyIns, "%d-%d:%d bases", tGapEnd, tGapEnd+1, insLen); + } + else if (tGapLen > qGapLen) + { + int delLen = tGapLen - qGapLen;; + delBases += delLen; + if (isNotEmpty(dyDel->string)) + dyStringAppend(dyDel, ", "); + if (delLen <= 12) + { + char delSeq[delLen+1]; + safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, delLen); + touppers(delSeq); + dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq); + } + else + dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); + } + } + si->insBases = insBases; + si->delBases = delBases; + si->insRanges = dyStringCannibalize(&dyIns); + si->delRanges = dyStringCannibalize(&dyDel); + } + } +} + struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, boolean *informativeBases, struct slName **maskSites, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName); + analyzeGaps(filteredSeqs, refGenome); slReverse(&sampleIds); reportTiming(pStartTime, "write VCF for uploaded FASTA"); } } *retSampleIds = sampleIds; *retSeqInfo = filteredSeqs; return tn; }