aa32b124d6e43ec6717547bed2b18c489a1d04c8 angie Fri Jun 2 11:35:49 2023 -0700 If user uploads fasta with names that include characters with special meaning in Newick then replace those with _. Influenza sequence names often include serotype in parens e.g. (H3N2), (H5N6). diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index f914657..5e1b00b 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -18,35 +18,58 @@ char *gfOoc = NULL; int gfStepSize = gfTileSize; int gfOutMinIdentityPpt = 900; // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way // higher, expecting very similar sequences. -- But watch out for chunking effects in // gfLongDnaInMem. ... would be nice if gfLongDnaInMem scaled minMatch for the last little // chunk. int gfMinScore = 50; // Normally this would be 12 for a genome with size 29903 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250))) // but we expect a really good alignment so we can crank it up. (Again modulo chunking) int gfIMinMatch = 30; +static void replaceNewickChars(char *seqName) +/* If seqName includes any characters with special meaning in the Newick format, then substitute + * _ for each problematic character in seqName. */ +{ +if (strchr(seqName, '(') || strchr(seqName, ')') || strchr(seqName, ':') || strchr(seqName, ';') || + strchr(seqName, ',')) + { + subChar(seqName, '(', '_'); + subChar(seqName, ')', '_'); + subChar(seqName, ':', '_'); + subChar(seqName, ';', '_'); + subChar(seqName, ',', '_'); + // Strip any underscores from beginning/end; strip double-underscores + while (seqName[0] == '_') + memmove(seqName, seqName+1, strlen(seqName)+1); + char *p; + while (*(p = seqName + strlen(seqName) - 1) == '_') + *p = '\0'; + strSwapStrs(seqName, strlen(seqName), "__", "_"); + } +} + static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames, int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs) /* Return a list of sequences that pass basic QC checks (appropriate size etc). * If any sequences have names that are already in the tree, add a prefix so usher doesn't - * reject them. + * reject them. If any sequence name contains characters that have special meaning in the Newick + * format, replace them with something harmless. * Set retFailedSeqs to the list of sequences that failed checks. */ { struct seqInfo *filteredSeqs = NULL; struct hash *uniqNames = hashNew(0); *retFailedSeqs = NULL; struct dnaSeq *seq, *nextSeq; for (seq = seqs; seq != NULL; seq = nextSeq) { boolean passes = TRUE; nextSeq = seq->next; if (seq->size < minSeqSize) { struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d, must have " "at least %d); skipping.", seq->name, seq->size, minSeqSize); @@ -93,30 +116,31 @@ if (nCountMiddle > maxNs * effectiveLength) { struct dyString *dy = dyStringCreate("Sequence %s has too many N bases (%d out of %d " "> %0.2f); skipping.", seq->name, nCountMiddle, effectiveLength, maxNs); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); passes = FALSE; } int ambigCount = 0; int i; for (i = 0; i < seq->size; i++) if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i])) ambigCount++; if (passes) { + replaceNewickChars(seq->name); if (hashLookup(uniqNames, seq->name)) { struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; " "ignoring subsequent usage " "(%d bases, %d N's, %d ambiguous).", seq->name, seq->size, nCountTotal, ambigCount); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); } else { if (treeNames && (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name))) { // usher will reject any sequence whose name is already in the tree, even a // numeric internal node name. Add a prefix so usher won't reject sequence.