src/hg/hgPhyloPlace/vcfFromFasta.c 0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3

0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3
angie
  Fri Oct 28 15:08:06 2022 -0700
Performance improvements for trees with millions of sequences:
* Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher
* Use usher-sampled instead of usher if server is not configured but usher-sampled is available
* Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running
* Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results)
* Avoid parsing the protobuf and traversing the big tree unless absolutely necessary
** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c
** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree)
** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new.
** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index 6e06e530..f914657 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -103,31 +103,32 @@
     for (i = 0;  i < seq->size;  i++)
         if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i]))
             ambigCount++;
     if (passes)
         {
         if (hashLookup(uniqNames, seq->name))
             {
             struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; "
                                                  "ignoring subsequent usage "
                                                  "(%d bases, %d N's, %d ambiguous).",
                                                  seq->name, seq->size, nCountTotal, ambigCount);
             slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
             }
         else
             {
-            if (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name))
+            if (treeNames &&
+                (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name)))
                 {
                 // usher will reject any sequence whose name is already in the tree, even a
                 // numeric internal node name.  Add a prefix so usher won't reject sequence.
                 char newName[strlen(seq->name)+32];
                 safef(newName, sizeof newName, "uploaded_%s", seq->name);
                 freeMem(seq->name);
                 seq->name = cloneString(newName);
                 }
             struct seqInfo *si;
             AllocVar(si);
             si->seq = seq;
             si->nCountStart = nCountStart;
             si->nCountMiddle = nCountMiddle;
             si->nCountEnd = nCountEnd;
             si->ambigCount = ambigCount;
@@ -356,49 +357,48 @@
 /* If refBase is not N, add a call (or no-call if altBase is N or '-') to snvsByPos[tStart].
  * Return the VCF genotype code (-1 for missing/N, 0 for ref, 1 for first alt etc). */
 {
 if (refBase != 'n')
     {
     if (snvsByPos[tStart] == NULL)
         snvsByPos[tStart] = snvInfoNew(refBase, altBase, gtIx, gtCount);
     else
         snvInfoAddCall(snvsByPos[tStart], refBase, altBase, gtIx);
     return snvsByPos[tStart]->genotypes[gtIx];
     }
 return -1;
 }
 
 static void extractSnvs(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs,
-                        struct snvInfo **snvsByPos, boolean *informativeBases,
-                        struct slName **maskSites)
+                        struct snvInfo **snvsByPos, struct slName **maskSites)
 /* For each PSL, find single-nucleotide differences between ref and query (one of querySeqs),
  * building up target position-indexed array of snvInfo w/genotypes in same order as querySeqs.
- * Add explicit no-calls for unaligned positions in informativeBases. */
+ * Add explicit no-calls for unaligned positions that are not masked. */
 {
 int gtCount = slCount(querySeqs);
 struct psl *psl;
 for (psl = psls;  psl != NULL;  psl = psl->next)
     {
     if (!sameString(psl->strand, "+"))
         errAbort("extractSnvs: expected strand to be '+' but it's '%s'", psl->strand);
     int gtIx;
     struct seqInfo *qSeq = mustFindSeqAndIx(psl->qName, querySeqs, &gtIx);
     // Add no-calls for unaligned bases before first block
     int tStart;
     for (tStart = 0;  tStart < psl->tStart;  tStart++)
         {
-        if (informativeBases[tStart])
+        if (!maskSites[tStart])
             {
             char refBase = ref->dna[tStart];
             addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
             }
         }
     int blkIx;
     for (blkIx = 0;  blkIx < psl->blockCount;  blkIx++)
         {
         // Add genotypes for aligned bases
         tStart = psl->tStarts[blkIx];
         int tEnd = tStart + psl->blockSizes[blkIx];
         int qStart = psl->qStarts[blkIx];
         for (;  tStart < tEnd;  tStart++, qStart++)
             {
             char refBase = ref->dna[tStart];
@@ -412,42 +412,42 @@
                 struct slName *maskedReasons = maskSites[tStart];
                 if (maskedReasons)
                     {
                     slAddHead(&qSeq->maskedSncList, snc);
                     slAddHead(&qSeq->maskedReasonsList, slRefNew(maskedReasons));
                     }
                 else
                     slAddHead(&qSeq->sncList, snc);
                 }
             }
         if (blkIx < psl->blockCount - 1)
             {
             // Add no-calls for unaligned bases between this block and the next
             for (;  tStart < psl->tStarts[blkIx+1];  tStart++)
                 {
-                if (informativeBases[tStart])
+                if (!maskSites[tStart])
                     {
                     char refBase = ref->dna[tStart];
                     addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
                     }
                 }
             }
         }
     // Add no-calls for unaligned bases after last block
     for (tStart = psl->tEnd;  tStart < ref->size;  tStart++)
         {
-        if (informativeBases[tStart])
+        if (!maskSites[tStart])
             {
             char refBase = ref->dna[tStart];
             addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
             }
         }
     slReverse(&qSeq->sncList);
     }
 }
 
 static void writeVcfSnv(struct snvInfo **snvsByPos, int gtCount, char *chrom, int chromStart,
                         FILE *f)
 /* If snvsByPos[chromStart] is not NULL, write out a VCF data row with genotypes. */
 {
 struct snvInfo *snv = snvsByPos[chromStart];
 if (snv && (snv->altCount > 0 || snv->unkCount > 0))
@@ -491,37 +491,37 @@
 fprintf(f, "##fileformat=VCFv4.2\n");
 fprintf(f, "##reference=%s\n", ref->name);
 fprintf(f, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
 int i;
 for (i = 0;  i < sampleCount;  i++)
     fprintf(f, "\t%s", sampleNames[i]);
 fputc('\n', f);
 int chromStart;
 for (chromStart = 0;  chromStart < ref->size;  chromStart++)
     if (!maskSites[chromStart])
         writeVcfSnv(snvsByPos, sampleCount, ref->name, chromStart, f);
 carefulClose(&f);
 }
 
 static void pslSnvsToVcfFile(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs,
-                             char *vcfFileName, boolean *informativeBases, struct slName **maskSites)
+                             char *vcfFileName, struct slName **maskSites)
 /* Find single-nucleotide differences between each query sequence and reference, and
  * write out a VCF file with genotype columns for the queries. */
 {
 struct snvInfo *snvsByPos[ref->size];
 memset(snvsByPos, 0, sizeof snvsByPos);
-extractSnvs(psls, ref, querySeqs, snvsByPos, informativeBases, maskSites);
+extractSnvs(psls, ref, querySeqs, snvsByPos, maskSites);
 int sampleCount = slCount(querySeqs);
 char *sampleNames[sampleCount];
 struct seqInfo *qSeq;
 int i;
 for (i = 0, qSeq = querySeqs;  i < sampleCount;  i++, qSeq = qSeq->next)
     sampleNames[i] = qSeq->seq->name;
 writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName);
 }
 
 static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome)
 /* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */
 {
 struct seqInfo *si;
 for (si = filteredSeqs;  si != NULL;  si = si->next)
     {
@@ -570,32 +570,31 @@
                     dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq);
                     }
                 else
                     dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen);
                 }
             }
         si->insBases = insBases;
         si->delBases = delBases;
         si->insRanges = dyStringCannibalize(&dyIns);
         si->delRanges = dyStringCannibalize(&dyDel);
         }
     }
 }
 
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
-                              boolean *informativeBases, struct slName **maskSites,
-                              struct hash *treeNames,
+                              struct slName **maskSites, struct hash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime)
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;
 struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
 int minSeqSize = 0, maxSeqSize = 0;
 // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra.
 char *minSeqSizeSetting = phyloPlaceDbSetting(db, "minSeqSize");
 if (isEmpty(minSeqSizeSetting))
     minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000;
@@ -606,30 +605,29 @@
     maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000;
 else
     maxSeqSize = atoi(maxSeqSizeSetting);
 struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize,
                                               retFailedSeqs);
 reportTiming(pStartTime, "read and check uploaded FASTA");
 if (filteredSeqs)
     {
     struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime);
     struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
     removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs);
     if (filteredAlignments)
         {
         AllocVar(tn);
         trashDirFile(tn, "ct", "pp", ".vcf");
-        pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases,
-                         maskSites);
+        pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites);
         struct psl *psl;
         for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
             slNameAddHead(&sampleIds, psl->qName);
         analyzeGaps(filteredSeqs, refGenome);
         slReverse(&sampleIds);
         reportTiming(pStartTime, "write VCF for uploaded FASTA");
         }
     }
 *retSampleIds = sampleIds;
 *retSeqInfo = filteredSeqs;
 return tn;
 }