src/hg/hgPhyloPlace/vcfFromFasta.c aa32b124d6e43ec6717547bed2b18c489a1d04c8

aa32b124d6e43ec6717547bed2b18c489a1d04c8
angie
  Fri Jun 2 11:35:49 2023 -0700
If user uploads fasta with names that include characters with special meaning in Newick then replace those with _.  Influenza sequence names often include serotype in parens e.g. (H3N2), (H5N6).

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index f914657..5e1b00b 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -18,35 +18,58 @@
 char *gfOoc = NULL;
 int gfStepSize = gfTileSize;
 int gfOutMinIdentityPpt = 900;
 
 // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way
 // higher, expecting very similar sequences. -- But watch out for chunking effects in
 // gfLongDnaInMem.  ... would be nice if gfLongDnaInMem scaled minMatch for the last little
 // chunk.
 int gfMinScore = 50;
 // Normally this would be 12 for a genome with size 29903
 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250)))
 // but we expect a really good alignment so we can crank it up.  (Again modulo chunking)
 int gfIMinMatch = 30;
 
 
+static void replaceNewickChars(char *seqName)
+/* If seqName includes any characters with special meaning in the Newick format, then substitute
+ * _ for each problematic character in seqName. */
+{
+if (strchr(seqName, '(') || strchr(seqName, ')') || strchr(seqName, ':') || strchr(seqName, ';') ||
+    strchr(seqName, ','))
+    {
+     subChar(seqName, '(', '_');
+     subChar(seqName, ')', '_');
+     subChar(seqName, ':', '_');
+     subChar(seqName, ';', '_');
+     subChar(seqName, ',', '_');
+     // Strip any underscores from beginning/end; strip double-underscores
+     while (seqName[0] == '_')
+         memmove(seqName, seqName+1, strlen(seqName)+1);
+     char *p;
+     while (*(p = seqName + strlen(seqName) - 1) == '_')
+         *p = '\0';
+     strSwapStrs(seqName, strlen(seqName), "__", "_");
+    }
+}
+
 static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames,
                                       int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs)
 /* Return a list of sequences that pass basic QC checks (appropriate size etc).
  * If any sequences have names that are already in the tree, add a prefix so usher doesn't
- * reject them.
+ * reject them.  If any sequence name contains characters that have special meaning in the Newick
+ * format, replace them with something harmless.
  * Set retFailedSeqs to the list of sequences that failed checks. */
 {
 struct seqInfo *filteredSeqs = NULL;
 struct hash *uniqNames = hashNew(0);
 *retFailedSeqs = NULL;
 struct dnaSeq *seq, *nextSeq;
 for (seq = seqs;  seq != NULL;  seq = nextSeq)
     {
     boolean passes = TRUE;
     nextSeq = seq->next;
     if (seq->size < minSeqSize)
         {
         struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d, must have "
                                              "at least %d); skipping.",
                                              seq->name, seq->size, minSeqSize);
@@ -93,30 +116,31 @@
     if (nCountMiddle > maxNs * effectiveLength)
         {
         struct dyString *dy = dyStringCreate("Sequence %s has too many N bases (%d out of %d "
                                              "> %0.2f); skipping.",
                                              seq->name, nCountMiddle, effectiveLength, maxNs);
         slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
         passes = FALSE;
         }
     int ambigCount = 0;
     int i;
     for (i = 0;  i < seq->size;  i++)
         if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i]))
             ambigCount++;
     if (passes)
         {
+        replaceNewickChars(seq->name);
         if (hashLookup(uniqNames, seq->name))
             {
             struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; "
                                                  "ignoring subsequent usage "
                                                  "(%d bases, %d N's, %d ambiguous).",
                                                  seq->name, seq->size, nCountTotal, ambigCount);
             slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
             }
         else
             {
             if (treeNames &&
                 (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name)))
                 {
                 // usher will reject any sequence whose name is already in the tree, even a
                 // numeric internal node name.  Add a prefix so usher won't reject sequence.