src/hg/hgPhyloPlace/vcfFromFasta.c 06d7be056190c14b85e71bc12523f18ea6815b5e

06d7be056190c14b85e71bc12523f18ea6815b5e
markd
  Mon Dec 7 00:50:29 2020 -0800
BLAT mmap index support merge with master

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
new file mode 100644
index 0000000..501fbf0
--- /dev/null
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -0,0 +1,519 @@
+/* Align user sequence to the reference genome and write VCF */
+
+/* Copyright (C) 2020 The Regents of the University of California */
+
+#include "common.h"
+#include "fa.h"
+#include "genoFind.h"
+#include "iupac.h"
+#include "parsimonyProto.h"
+#include "phyloPlace.h"
+#include "psl.h"
+#include "trashDir.h"
+
+// Globals
+extern int chromSize;
+
+// wuhCor1-specific:
+int minSeqSize = 10000;
+int maxSeqSize = 35000;
+double maxNs = 0.5;
+
+// Using blat defaults for these:
+int gfRepMatch = 1024*4;
+char *gfOoc = NULL;
+int gfStepSize = gfTileSize;
+int gfOutMinIdentityPpt = 900;
+
+// The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way
+// higher, expecting very similar sequences. -- But watch out for chunking effects in
+// gfLongDnaInMem.  ... would be nice if gfLongDnaInMem scaled minMatch for the last little
+// chunk.
+int gfMinScore = 50;
+// Normally this would be 12 for a genome with size 29903
+// (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250)))
+// but we expect a really good alignment so we can crank it up.  (Again modulo chunking)
+int gfIMinMatch = 30;
+
+
+static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct slPair **retFailedSeqs)
+/* Return a list of sequences that pass basic QC checks (appropriate size etc).
+ * Set retFailedSeqs to the list of sequences that failed checks. */
+{
+struct seqInfo *filteredSeqs = NULL;
+struct hash *uniqNames = hashNew(0);
+*retFailedSeqs = NULL;
+struct dnaSeq *seq, *nextSeq;
+for (seq = seqs;  seq != NULL;  seq = nextSeq)
+    {
+    //#*** TODO: if the user uploads a sample with the same ID as one already in the
+    //#*** saved assignment file, then usher will ignore it!
+    //#*** Better check for that and warn the user.
+    boolean passes = TRUE;
+    nextSeq = seq->next;
+    if (seq->size < minSeqSize)
+        {
+        struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d, must have "
+                                             "at least %d); skipping.",
+                                             seq->name, seq->size, minSeqSize);
+        slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
+        passes = FALSE;
+        }
+    else if (seq->size > maxSeqSize)
+        {
+        struct dyString *dy = dyStringCreate("Sequence %s has too many bases (%d, must have "
+                                             "at most %d); skipping.",
+                                             seq->name, seq->size, maxSeqSize);
+        slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
+        passes = FALSE;
+        }
+    if (!passes)
+        continue;
+    // blat requires lower case sequence.  Lowercasing makes it easier to find n's and N's too:
+    toLowerN(seq->dna, seq->size);
+    // Many sequences begin and end with blocks of N bases; treat those differently than Ns in middle
+    int nCountTotal = countChars(seq->dna, 'n');
+    int nCountStart = 0;
+    while (seq->dna[nCountStart] == 'n')
+        nCountStart++;
+    int nCountEnd = 0;
+    if (nCountStart < seq->size)
+        {
+        while (seq->dna[seq->size - 1 - nCountEnd] == 'n')
+            nCountEnd++;
+        }
+    int nCountMiddle = nCountTotal - (nCountStart + nCountEnd);
+    int effectiveLength = seq->size - (nCountStart + nCountEnd);
+    if (effectiveLength < minSeqSize)
+        {
+        struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d excluding %d Ns "
+                                             "at beginning and %d Ns at end), "
+                                             "must have at least %d); skipping.",
+                                             seq->name, effectiveLength, nCountStart, nCountEnd,
+                                             minSeqSize);
+        slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
+        passes = FALSE;
+        }
+    if (!passes)
+        continue;
+    if (nCountMiddle > maxNs * effectiveLength)
+        {
+        struct dyString *dy = dyStringCreate("Sequence %s has too many N bases (%d out of %d "
+                                             "> %0.2f); skipping.",
+                                             seq->name, nCountMiddle, effectiveLength, maxNs);
+        slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
+        passes = FALSE;
+        }
+    int ambigCount = 0;
+    int i;
+    for (i = 0;  i < seq->size;  i++)
+        if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i]))
+            ambigCount++;
+    if (passes)
+        {
+        if (hashLookup(uniqNames, seq->name))
+            {
+            struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; "
+                                                 "ignoring subsequent usage "
+                                                 "(%d bases, %d N's, %d ambiguous).",
+                                                 seq->name, seq->size, nCountTotal, ambigCount);
+            slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
+            }
+        else
+            {
+            struct seqInfo *si;
+            AllocVar(si);
+            si->seq = seq;
+            si->nCountStart = nCountStart;
+            si->nCountMiddle = nCountMiddle;
+            si->nCountEnd = nCountEnd;
+            si->ambigCount = ambigCount;
+            hashAdd(uniqNames, seq->name, NULL);
+            slAddHead(&filteredSeqs, si);
+            }
+        }
+    }
+slReverse(&filteredSeqs);
+slReverse(retFailedSeqs);
+hashFree(&uniqNames);
+return filteredSeqs;
+}
+
+static struct psl *alignSequences(char *db, struct dnaSeq *refGenome, struct seqInfo *seqs,
+                                  int *pStartTime)
+/* Use BLAT server to align seqs to reference; keep only the best alignment for each seq.
+ * (In normal usage, there should be one very good alignment per seq.) */
+{
+struct psl *alignments = NULL;
+struct genoFind *gf = gfIndexSeq(refGenome, gfIMinMatch, gfMaxGap, gfTileSize, gfRepMatch, gfOoc,
+                                 FALSE, FALSE, FALSE, gfStepSize, FALSE);
+reportTiming(pStartTime, "gfIndexSeq");
+struct tempName pslTn;
+trashDirFile(&pslTn, "ct", "pp", ".psl");
+FILE *f = mustOpen(pslTn.forCgi, "w");
+struct gfOutput *gvo = gfOutputPsl(gfOutMinIdentityPpt, FALSE, FALSE, f, FALSE, TRUE);
+struct seqInfo *si;
+for (si = seqs;  si != NULL;  si = si->next)
+    {
+    // Positive strand only; we're expecting a mostly complete match to the reference
+    gfLongDnaInMem(si->seq, gf, FALSE, gfMinScore, NULL, gvo, FALSE, FALSE);
+    reportTiming(pStartTime, "gfLongDnaInMem");
+    }
+gfOutputFree(&gvo);
+carefulClose(&f);
+alignments = pslLoadAll(pslTn.forCgi);
+reportTiming(pStartTime, "load PSL file");
+//#*** TODO: keep only the best alignment for each seq.
+return alignments;
+}
+
+struct psl *checkAlignments(struct psl *psls, struct seqInfo *userSeqs,
+                            struct slPair **retFailedPsls)
+/* No thresholds applied at this point - just makes sure there aren't multiple PSLs per seq. */
+{
+struct psl *filteredPsls = NULL;
+*retFailedPsls = NULL;
+struct hash *userSeqsByName = hashNew(0);
+struct seqInfo *si;
+for (si = userSeqs;  si != NULL;  si = si->next)
+    hashAdd(userSeqsByName, si->seq->name, si);
+struct hash *alignedSeqs = hashNew(0);
+struct psl *psl, *nextPsl;
+for (psl = psls;  psl != NULL;  psl = nextPsl)
+    {
+    nextPsl = psl->next;
+    boolean passes = TRUE;
+    struct psl *otherPsl = hashFindVal(alignedSeqs, psl->qName);
+    if (otherPsl)
+        {
+        //#*** Is this ever going to happen?  Maybe if there's a large dup???
+        //#*** Is there any condition under which we would want to try to merge?  (Would expect blat
+        //#*** to have done that)
+        struct dyString *dy = dyStringCreate("Warning: multiple alignments found for sequence %s "
+                                             "(%d-%d and %d-%d).  Skipping alignment of %d-%d",
+                                              psl->qName, otherPsl->qStart, otherPsl->qEnd,
+                                              psl->qStart, psl->qEnd, psl->qStart, psl->qEnd);
+        slPairAdd(retFailedPsls, dyStringCannibalize(&dy), psl);
+        passes = FALSE;
+        }
+    hashAdd(alignedSeqs, psl->qName, psl);
+    if (passes)
+        {
+        si = hashFindVal(userSeqsByName, psl->qName);
+        if (si)
+            si->psl = psl;
+        else
+            warn("Aligned sequence name '%s' does not match any input sequence name", psl->qName);
+        slAddHead(&filteredPsls, psl);
+        }
+    }
+hashFree(&alignedSeqs);
+hashFree(&userSeqsByName);
+slReverse(&filteredPsls);
+slReverse(retFailedPsls);
+return filteredPsls;
+}
+
+struct snvInfo
+/* Summary of SNV with sample genotypes (position and samples are externally defined) */
+    {
+    int altCount;     // number of alternate alleles
+    int altAlloced;   // number of allocated array elements for alt alleles
+    char *altBases;   // alternate allele values (not '\0'-terminated string)
+    int *altCounts;   // number of occurrences of each alt allele
+    int *genotypes;   // -1 for no-call, 0 for ref, otherwise 1+index in altBases
+    int unkCount;     // number of no-calls (genotype is -1)
+    char refBase;     // reference allele
+    };
+
+static struct snvInfo *snvInfoNew(char refBase, char altBase, int gtIx, int gtCount)
+/* Alloc & return a new snvInfo for ref & alt.  If alt == ref, just record the genotype call. */
+{
+struct snvInfo *snv = NULL;
+AllocVar(snv);
+snv->refBase = refBase;
+snv->altAlloced = 4;
+AllocArray(snv->altBases, snv->altAlloced);
+AllocArray(snv->altCounts, snv->altAlloced);
+AllocArray(snv->genotypes, gtCount);
+// Initialize snv->genotypes to all no-call
+int i;
+for (i = 0;  i < gtCount;  i++)
+    snv->genotypes[i] = -1;
+if (altBase == refBase)
+    snv->genotypes[gtIx] = 0;
+else if (altBase == 'n' || altBase == '-')
+    {
+    snv->genotypes[gtIx] = -1;
+    snv->unkCount = 1;
+    }
+else
+    {
+    snv->altCount = 1;
+    snv->altBases[0] = altBase;
+    snv->altCounts[0] = 1;
+    snv->genotypes[gtIx] = 1;
+    }
+return snv;
+}
+
+static void snvInfoAddCall(struct snvInfo *snv, char refBase, char altBase, int gtIx)
+/* Add a new genotype call to snv. */
+{
+if (refBase != snv->refBase)
+    errAbort("snvInfoAddCall: snv->refBase is %c but incoming refBase is %c", snv->refBase, refBase);
+// If we have already recorded a call for this sample at this position, then there's an overlap
+// in alignments (perhaps a deletion in q); we're just looking for SNVs so ignore this base.
+if (snv->genotypes[gtIx] >= 0)
+    return;
+if (altBase == refBase)
+    snv->genotypes[gtIx] = 0;
+else if (altBase == 'n' || altBase == '-')
+    {
+    snv->genotypes[gtIx] = -1;
+    snv->unkCount++;
+    }
+else
+    {
+    int altIx;
+    for (altIx = 0;  altIx < snv->altCount;  altIx++)
+        if (altBase == snv->altBases[altIx])
+            break;
+    if (altIx < snv->altCount)
+        snv->altCounts[altIx]++;
+    else
+        {
+        // We haven't seen this alt before; record the new alt, expanding arrays if necessary.
+        if (snv->altCount == snv->altAlloced)
+            {
+            int newAlloced = snv->altAlloced * 2;
+            ExpandArray(snv->altBases, snv->altAlloced, newAlloced);
+            ExpandArray(snv->altCounts, snv->altAlloced, newAlloced);
+            snv->altAlloced = newAlloced;
+            }
+        snv->altCount++;
+        snv->altBases[altIx] = altBase;
+        snv->altCounts[altIx] = 1;
+        }
+    snv->genotypes[gtIx] = 1 + altIx;
+    }
+}
+
+static struct seqInfo *mustFindSeqAndIx(char *qName, struct seqInfo *querySeqs, int *retGtIx)
+/* Find qName and its index in querySeqs or errAbort. */
+{
+struct seqInfo *qSeq;
+int gtIx;
+for (gtIx = 0, qSeq = querySeqs;  qSeq != NULL;  gtIx++, qSeq = qSeq->next)
+    if (sameString(qName, qSeq->seq->name))
+        break;
+if (!qSeq)
+    errAbort("PSL query '%s' not found in input sequences", qName);
+*retGtIx = gtIx;
+return qSeq;
+}
+
+static int addCall(struct snvInfo **snvsByPos, int tStart, char refBase, char altBase, int gtIx,
+                    int gtCount)
+/* If refBase is not N, add a call (or no-call if altBase is N or '-') to snvsByPos[tStart].
+ * Return the VCF genotype code (-1 for missing/N, 0 for ref, 1 for first alt etc). */
+{
+if (refBase != 'n')
+    {
+    if (snvsByPos[tStart] == NULL)
+        snvsByPos[tStart] = snvInfoNew(refBase, altBase, gtIx, gtCount);
+    else
+        snvInfoAddCall(snvsByPos[tStart], refBase, altBase, gtIx);
+    return snvsByPos[tStart]->genotypes[gtIx];
+    }
+return -1;
+}
+
+static void extractSnvs(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs,
+                        struct snvInfo **snvsByPos, boolean *informativeBases,
+                        struct slName **maskSites)
+/* For each PSL, find single-nucleotide differences between ref and query (one of querySeqs),
+ * building up target position-indexed array of snvInfo w/genotypes in same order as querySeqs.
+ * Add explicit no-calls for unaligned positions in informativeBases. */
+{
+int gtCount = slCount(querySeqs);
+struct psl *psl;
+for (psl = psls;  psl != NULL;  psl = psl->next)
+    {
+    if (!sameString(psl->strand, "+"))
+        errAbort("extractSnvs: expected strand to be '+' but it's '%s'", psl->strand);
+    int gtIx;
+    struct seqInfo *qSeq = mustFindSeqAndIx(psl->qName, querySeqs, &gtIx);
+    // Add no-calls for unaligned bases before first block
+    int tStart;
+    for (tStart = 0;  tStart < psl->tStart;  tStart++)
+        {
+        if (informativeBases[tStart])
+            {
+            char refBase = ref->dna[tStart];
+            addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
+            }
+        }
+    int blkIx;
+    for (blkIx = 0;  blkIx < psl->blockCount;  blkIx++)
+        {
+        // Add genotypes for aligned bases
+        tStart = psl->tStarts[blkIx];
+        int tEnd = tStart + psl->blockSizes[blkIx];
+        int qStart = psl->qStarts[blkIx];
+        for (;  tStart < tEnd;  tStart++, qStart++)
+            {
+            char refBase = ref->dna[tStart];
+            char altBase = qSeq->seq->dna[qStart];
+            if (!altBase)
+                errAbort("nil altBase at %s:%d", qSeq->seq->name, qStart);
+            int gt = addCall(snvsByPos, tStart, refBase, altBase, gtIx, gtCount);
+            if (gt > 0)
+                {
+                struct singleNucChange *snc = sncNew(tStart, refBase, '\0', altBase);
+                struct slName *maskedReasons = maskSites[tStart];
+                if (maskedReasons)
+                    {
+                    slAddHead(&qSeq->maskedSncList, snc);
+                    slAddHead(&qSeq->maskedReasonsList, slRefNew(maskedReasons));
+                    }
+                else
+                    slAddHead(&qSeq->sncList, snc);
+                }
+            }
+        if (blkIx < psl->blockCount - 1)
+            {
+            // Add no-calls for unaligned bases between this block and the next
+            for (;  tStart < psl->tStarts[blkIx+1];  tStart++)
+                {
+                if (informativeBases[tStart])
+                    {
+                    char refBase = ref->dna[tStart];
+                    addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
+                    }
+                }
+            }
+        }
+    // Add no-calls for unaligned bases after last block
+    for (tStart = psl->tEnd;  tStart < chromSize;  tStart++)
+        {
+        if (informativeBases[tStart])
+            {
+            char refBase = ref->dna[tStart];
+            addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
+            }
+        }
+    slReverse(&qSeq->sncList);
+    }
+}
+
+static void writeVcfSnv(struct snvInfo **snvsByPos, int gtCount, char *chrom, int chromStart,
+                        FILE *f)
+/* If snvsByPos[chromStart] is not NULL, write out a VCF data row with genotypes. */
+{
+struct snvInfo *snv = snvsByPos[chromStart];
+if (snv && (snv->altCount > 0 || snv->unkCount > 0))
+    {
+    if (snv->altCount == 0)
+        snv->altBases[0] = '*';
+    int pos = chromStart+1;
+    fprintf(f, "%s\t%d\t%c%d%c",
+            chrom, pos, toupper(snv->refBase), pos, toupper(snv->altBases[0]));
+    int altIx;
+    for (altIx = 1;  altIx < snv->altCount;  altIx++)
+        fprintf(f, ",%c%d%c", toupper(snv->refBase), pos, toupper(snv->altBases[altIx]));
+    fprintf(f, "\t%c\t%c", toupper(snv->refBase), toupper(snv->altBases[0]));
+    for (altIx = 1;  altIx < snv->altCount;  altIx++)
+        fprintf(f, ",%c", toupper(snv->altBases[altIx]));
+    fputs("\t.\t.\t", f);
+    fprintf(f, "AC=%d", snv->altCounts[0]);
+    for (altIx = 1;  altIx < snv->altCount;  altIx++)
+        fprintf(f, ",%d", snv->altCounts[altIx]);
+    int callCount = gtCount;
+    int gtIx;
+    for (gtIx = 0;  gtIx < gtCount;  gtIx++)
+        if (snv->genotypes[gtIx] < 0)
+            callCount--;
+    fprintf(f, ";AN=%d\tGT", callCount);
+    for (gtIx = 0;  gtIx < gtCount;  gtIx++)
+        if (snv->genotypes[gtIx] < 0)
+            fputs("\t.", f);
+        else
+            fprintf(f, "\t%d", snv->genotypes[gtIx]);
+    fputc('\n', f);
+    }
+}
+
+static void writeSnvsToVcfFile(struct snvInfo **snvsByPos, struct dnaSeq *ref, char **sampleNames,
+                               int sampleCount, struct slName **maskSites, char *vcfFileName)
+/* Given an array of SNVs (one per base of ref, probably mostly NULL) and sequence of samples,
+ * write a VCF header and rows with genotypes. */
+{
+FILE *f = mustOpen(vcfFileName, "w");
+fprintf(f, "##fileformat=VCFv4.2\n");
+fprintf(f, "##reference=%s\n", ref->name);
+fprintf(f, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+int i;
+for (i = 0;  i < sampleCount;  i++)
+    fprintf(f, "\t%s", sampleNames[i]);
+fputc('\n', f);
+int chromStart;
+for (chromStart = 0;  chromStart < ref->size;  chromStart++)
+    if (!maskSites[chromStart])
+        writeVcfSnv(snvsByPos, sampleCount, ref->name, chromStart, f);
+carefulClose(&f);
+}
+
+static void pslSnvsToVcfFile(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs,
+                             char *vcfFileName, boolean *informativeBases, struct slName **maskSites)
+/* Find single-nucleotide differences between each query sequence and reference, and
+ * write out a VCF file with genotype columns for the queries. */
+{
+struct snvInfo *snvsByPos[ref->size];
+memset(snvsByPos, 0, sizeof snvsByPos);
+extractSnvs(psls, ref, querySeqs, snvsByPos, informativeBases, maskSites);
+int sampleCount = slCount(querySeqs);
+char *sampleNames[sampleCount];
+struct seqInfo *qSeq;
+int i;
+for (i = 0, qSeq = querySeqs;  i < sampleCount;  i++, qSeq = qSeq->next)
+    sampleNames[i] = qSeq->seq->name;
+writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName);
+}
+
+struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
+                              boolean *informativeBases, struct slName **maskSites,
+                              struct slName **retSampleIds, struct seqInfo **retSeqInfo,
+                              struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
+                              int *pStartTime)
+/* Read in FASTA from lf and make sure each item has a reasonable size and not too high
+ * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
+ * with sample genotype columns. */
+{
+struct tempName *tn = NULL;
+struct slName *sampleIds = NULL;
+struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
+struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs);
+reportTiming(pStartTime, "read and check uploaded FASTA");
+if (filteredSeqs)
+    {
+    struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime);
+    struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
+    if (filteredAlignments)
+        {
+        AllocVar(tn);
+        trashDirFile(tn, "ct", "pp", ".vcf");
+        pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases,
+                         maskSites);
+        struct psl *psl;
+        for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
+            slNameAddHead(&sampleIds, psl->qName);
+        slReverse(&sampleIds);
+        reportTiming(pStartTime, "write VCF for uploaded FASTA");
+        }
+    }
+*retSampleIds = sampleIds;
+*retSeqInfo = filteredSeqs;
+return tn;
+}
+