src/hg/hgPhyloPlace/vcfFromFasta.c afafa0301ea4b14fbe1fbd5aa379c5351ecd640d

afafa0301ea4b14fbe1fbd5aa379c5351ecd640d
angie
  Tue Aug 2 19:41:44 2022 -0700
Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub).
* Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name.
* Add a menu input to the main page for switching between supported genomes if there are more than one.
* Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings.
* Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions.
* Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV).
* also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output.

TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index 2b2e578..6e06e530 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -1,55 +1,49 @@
 /* Align user sequence to the reference genome and write VCF */
 
 /* Copyright (C) 2020 The Regents of the University of California */
 
 #include "common.h"
 #include "fa.h"
 #include "genoFind.h"
 #include "iupac.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "psl.h"
 #include "trashDir.h"
 
-// Globals
-extern int chromSize;
-
-// wuhCor1-specific:
-int minSeqSize = 10000;
-int maxSeqSize = 35000;
 double maxNs = 0.5;
 
 // Using blat defaults for these:
 int gfRepMatch = 1024*4;
 char *gfOoc = NULL;
 int gfStepSize = gfTileSize;
 int gfOutMinIdentityPpt = 900;
 
 // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way
 // higher, expecting very similar sequences. -- But watch out for chunking effects in
 // gfLongDnaInMem.  ... would be nice if gfLongDnaInMem scaled minMatch for the last little
 // chunk.
 int gfMinScore = 50;
 // Normally this would be 12 for a genome with size 29903
 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250)))
 // but we expect a really good alignment so we can crank it up.  (Again modulo chunking)
 int gfIMinMatch = 30;
 
 
 static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames,
-                                      struct slPair **retFailedSeqs)
+                                      int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs)
 /* Return a list of sequences that pass basic QC checks (appropriate size etc).
  * If any sequences have names that are already in the tree, add a prefix so usher doesn't
  * reject them.
  * Set retFailedSeqs to the list of sequences that failed checks. */
 {
 struct seqInfo *filteredSeqs = NULL;
 struct hash *uniqNames = hashNew(0);
 *retFailedSeqs = NULL;
 struct dnaSeq *seq, *nextSeq;
 for (seq = seqs;  seq != NULL;  seq = nextSeq)
     {
     boolean passes = TRUE;
     nextSeq = seq->next;
     if (seq->size < minSeqSize)
         {
@@ -136,31 +130,31 @@
             si->nCountStart = nCountStart;
             si->nCountMiddle = nCountMiddle;
             si->nCountEnd = nCountEnd;
             si->ambigCount = ambigCount;
             hashAdd(uniqNames, seq->name, NULL);
             slAddHead(&filteredSeqs, si);
             }
         }
     }
 slReverse(&filteredSeqs);
 slReverse(retFailedSeqs);
 hashFree(&uniqNames);
 return filteredSeqs;
 }
 
-static struct psl *alignSequences(char *db, struct dnaSeq *refGenome, struct seqInfo *seqs,
+static struct psl *alignSequences(struct dnaSeq *refGenome, struct seqInfo *seqs,
                                   int *pStartTime)
 /* Use BLAT server to align seqs to reference; keep only the best alignment for each seq.
  * (In normal usage, there should be one very good alignment per seq.) */
 {
 struct psl *alignments = NULL;
 struct genoFind *gf = gfIndexSeq(refGenome, gfIMinMatch, gfMaxGap, gfTileSize, gfRepMatch, gfOoc,
                                  FALSE, FALSE, FALSE, gfStepSize, FALSE);
 reportTiming(pStartTime, "gfIndexSeq");
 struct tempName pslTn;
 trashDirFile(&pslTn, "ct", "pp", ".psl");
 FILE *f = mustOpen(pslTn.forCgi, "w");
 struct gfOutput *gvo = gfOutputPsl(gfOutMinIdentityPpt, FALSE, FALSE, f, FALSE, TRUE);
 struct seqInfo *si;
 for (si = seqs;  si != NULL;  si = si->next)
     {
@@ -427,31 +421,31 @@
             }
         if (blkIx < psl->blockCount - 1)
             {
             // Add no-calls for unaligned bases between this block and the next
             for (;  tStart < psl->tStarts[blkIx+1];  tStart++)
                 {
                 if (informativeBases[tStart])
                     {
                     char refBase = ref->dna[tStart];
                     addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
                     }
                 }
             }
         }
     // Add no-calls for unaligned bases after last block
-    for (tStart = psl->tEnd;  tStart < chromSize;  tStart++)
+    for (tStart = psl->tEnd;  tStart < ref->size;  tStart++)
         {
         if (informativeBases[tStart])
             {
             char refBase = ref->dna[tStart];
             addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount);
             }
         }
     slReverse(&qSeq->sncList);
     }
 }
 
 static void writeVcfSnv(struct snvInfo **snvsByPos, int gtCount, char *chrom, int chromStart,
                         FILE *f)
 /* If snvsByPos[chromStart] is not NULL, write out a VCF data row with genotypes. */
 {
@@ -588,35 +582,48 @@
 }
 
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
                               boolean *informativeBases, struct slName **maskSites,
                               struct hash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime)
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;
 struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
-struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, retFailedSeqs);
+int minSeqSize = 0, maxSeqSize = 0;
+// Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra.
+char *minSeqSizeSetting = phyloPlaceDbSetting(db, "minSeqSize");
+if (isEmpty(minSeqSizeSetting))
+    minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000;
+else
+    minSeqSize = atoi(minSeqSizeSetting);
+char *maxSeqSizeSetting = phyloPlaceDbSetting(db, "maxSeqSize");
+if (isEmpty(maxSeqSizeSetting))
+    maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000;
+else
+    maxSeqSize = atoi(maxSeqSizeSetting);
+struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize,
+                                              retFailedSeqs);
 reportTiming(pStartTime, "read and check uploaded FASTA");
 if (filteredSeqs)
     {
-    struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime);
+    struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime);
     struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
     removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs);
     if (filteredAlignments)
         {
         AllocVar(tn);
         trashDirFile(tn, "ct", "pp", ".vcf");
         pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases,
                          maskSites);
         struct psl *psl;
         for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
             slNameAddHead(&sampleIds, psl->qName);
         analyzeGaps(filteredSeqs, refGenome);
         slReverse(&sampleIds);
         reportTiming(pStartTime, "write VCF for uploaded FASTA");
         }