src/hg/hgPhyloPlace/vcfFromFasta.c 2d05d30ed4df1612d72ba84c812d004de935b122

2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup.
Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index c8308dd..1b978f7 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -1,23 +1,24 @@
 /* Align user sequence to the reference genome and write VCF */
 
 /* Copyright (C) 2020-2024 The Regents of the University of California */
 
 #include "common.h"
 #include "fa.h"
 #include "genoFind.h"
 #include "iupac.h"
+#include "mmHash.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "pipeline.h"
 #include "psl.h"
 #include "trashDir.h"
 
 double maxNs = 0.5;
 
 // Using blat defaults for these:
 int gfRepMatch = 1024*4;
 char *gfOoc = NULL;
 int gfStepSize = gfTileSize;
 int gfOutMinIdentityPpt = 900;
 
 // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way
@@ -41,31 +42,31 @@
      subChar(seqName, '(', '_');
      subChar(seqName, ')', '_');
      subChar(seqName, ':', '_');
      subChar(seqName, ';', '_');
      subChar(seqName, ',', '_');
      // Strip any underscores from beginning/end; strip double-underscores
      while (seqName[0] == '_')
          memmove(seqName, seqName+1, strlen(seqName)+1);
      char *p;
      while (*(p = seqName + strlen(seqName) - 1) == '_')
          *p = '\0';
      strSwapStrs(seqName, strlen(seqName), "__", "_");
     }
 }
 
-static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames,
+static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hashOrMmHash *treeNames,
                                       int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs)
 /* Return a list of sequences that pass basic QC checks (appropriate size etc).
  * If any sequences have names that are already in the tree, add a prefix so usher doesn't
  * reject them.  If any sequence name contains characters that have special meaning in the Newick
  * format, replace them with something harmless.
  * Set retFailedSeqs to the list of sequences that failed checks. */
 {
 struct seqInfo *filteredSeqs = NULL;
 struct hash *uniqNames = hashNew(0);
 *retFailedSeqs = NULL;
 struct dnaSeq *seq, *nextSeq;
 for (seq = seqs;  seq != NULL;  seq = nextSeq)
     {
     boolean passes = TRUE;
     nextSeq = seq->next;
@@ -128,40 +129,49 @@
         if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i]))
             ambigCount++;
     if (passes)
         {
         replaceNewickChars(seq->name);
         if (hashLookup(uniqNames, seq->name))
             {
             struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; "
                                                  "ignoring subsequent usage "
                                                  "(%d bases, %d N's, %d ambiguous).",
                                                  seq->name, seq->size, nCountTotal, ambigCount);
             slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
             }
         else
             {
-            if (treeNames &&
-                (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name)))
+            // If non-NULL treeNames was passed in then we're using original usher not usher-sampled
+            // and need to modify uploaded names that are already in the tree, if any.
+            if (treeNames)
+                {
+                boolean foundInTreeNames = FALSE;
+                if (treeNames->hash)
+                    foundInTreeNames = (hashLookup(treeNames->hash, seq->name) != NULL);
+                else
+                    foundInTreeNames = (mmHashFindVal(treeNames->mmh, seq->name) != NULL);
+                if (foundInTreeNames || isInternalNodeName(seq->name, 0))
                     {
                     // usher will reject any sequence whose name is already in the tree, even a
                     // numeric internal node name.  Add a prefix so usher won't reject sequence.
                     char newName[strlen(seq->name)+32];
                     safef(newName, sizeof newName, "uploaded_%s", seq->name);
                     freeMem(seq->name);
                     seq->name = cloneString(newName);
                     }
+                }
             struct seqInfo *si;
             AllocVar(si);
             si->seq = seq;
             si->nCountStart = nCountStart;
             si->nCountMiddle = nCountMiddle;
             si->nCountEnd = nCountEnd;
             si->ambigCount = ambigCount;
             hashAdd(uniqNames, seq->name, NULL);
             slAddHead(&filteredSeqs, si);
             }
         }
     }
 slReverse(&filteredSeqs);
 slReverse(retFailedSeqs);
 hashFree(&uniqNames);
@@ -960,31 +970,31 @@
         }
     slReverse(&sampleIds);
     writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, tn->forCgi);
     }
 
 // Clean up
 freeMem(snvsByPos);
 unlink(tnSeqFile.forCgi);
 unlink(nextcladeOut);
 *retSampleIds = sampleIds;
 *retSeqInfo = filteredSeqs;
 return tn;
 }
 
 struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome,
-                              struct slName **maskSites, struct hash *treeNames,
+                              struct slName **maskSites, struct hashOrMmHash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime)
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
 int minSeqSize = 0, maxSeqSize = 0;
 // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra.
 char *minSeqSizeSetting = phyloPlaceRefSetting(org, db, "minSeqSize");
 if (isEmpty(minSeqSizeSetting))
     minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000;
 else