src/hg/hgPhyloPlace/vcfFromFasta.c 0e66591e44d669ef83fa016069b1ceb283c01800

0e66591e44d669ef83fa016069b1ceb283c01800
angie
  Thu Jul 15 12:40:43 2021 -0700
Support more flexible ID search: accessions without dot-versions and isolate names.  When loading FASTA, detect sequence names that are numeric or already in the tree and add a prefix to prevent usher from rejecting the sequences.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index e9082a5..3275373 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -24,43 +24,43 @@
 char *gfOoc = NULL;
 int gfStepSize = gfTileSize;
 int gfOutMinIdentityPpt = 900;
 
 // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way
 // higher, expecting very similar sequences. -- But watch out for chunking effects in
 // gfLongDnaInMem.  ... would be nice if gfLongDnaInMem scaled minMatch for the last little
 // chunk.
 int gfMinScore = 50;
 // Normally this would be 12 for a genome with size 29903
 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250)))
 // but we expect a really good alignment so we can crank it up.  (Again modulo chunking)
 int gfIMinMatch = 30;
 
 
-static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct slPair **retFailedSeqs)
+static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames,
+                                      struct slPair **retFailedSeqs)
 /* Return a list of sequences that pass basic QC checks (appropriate size etc).
+ * If any sequences have names that are already in the tree, add a prefix so usher doesn't
+ * reject them.
  * Set retFailedSeqs to the list of sequences that failed checks. */
 {
 struct seqInfo *filteredSeqs = NULL;
 struct hash *uniqNames = hashNew(0);
 *retFailedSeqs = NULL;
 struct dnaSeq *seq, *nextSeq;
 for (seq = seqs;  seq != NULL;  seq = nextSeq)
     {
-    //#*** TODO: if the user uploads a sample with the same ID as one already in the
-    //#*** saved assignment file, then usher will ignore it!
-    //#*** Better check for that and warn the user.
     boolean passes = TRUE;
     nextSeq = seq->next;
     if (seq->size < minSeqSize)
         {
         struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d, must have "
                                              "at least %d); skipping.",
                                              seq->name, seq->size, minSeqSize);
         slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
         passes = FALSE;
         }
     else if (seq->size > maxSeqSize)
         {
         struct dyString *dy = dyStringCreate("Sequence %s has too many bases (%d, must have "
                                              "at most %d); skipping.",
                                              seq->name, seq->size, maxSeqSize);
@@ -109,30 +109,40 @@
     for (i = 0;  i < seq->size;  i++)
         if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i]))
             ambigCount++;
     if (passes)
         {
         if (hashLookup(uniqNames, seq->name))
             {
             struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; "
                                                  "ignoring subsequent usage "
                                                  "(%d bases, %d N's, %d ambiguous).",
                                                  seq->name, seq->size, nCountTotal, ambigCount);
             slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq);
             }
         else
             {
+            if (isAllDigits(seq->name) || hashLookup(treeNames, seq->name))
+                {
+                // Internal nodes of tree have numeric IDs, so usher may reject numeric name
+                // as a conflict.  usher will definitely reject a sequence name already in the tree.
+                // Add a prefix so usher won't reject the sequence.
+                char newName[strlen(seq->name)+32];
+                safef(newName, sizeof newName, "uploaded_%s", seq->name);
+                freeMem(seq->name);
+                seq->name = cloneString(newName);
+                }
             struct seqInfo *si;
             AllocVar(si);
             si->seq = seq;
             si->nCountStart = nCountStart;
             si->nCountMiddle = nCountMiddle;
             si->nCountEnd = nCountEnd;
             si->ambigCount = ambigCount;
             hashAdd(uniqNames, seq->name, NULL);
             slAddHead(&filteredSeqs, si);
             }
         }
     }
 slReverse(&filteredSeqs);
 slReverse(retFailedSeqs);
 hashFree(&uniqNames);
@@ -568,41 +578,42 @@
                     }
                 else
                     dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen);
                 }
             }
         si->insBases = insBases;
         si->delBases = delBases;
         si->insRanges = dyStringCannibalize(&dyIns);
         si->delRanges = dyStringCannibalize(&dyDel);
         }
     }
 }
 
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
                               boolean *informativeBases, struct slName **maskSites,
+                              struct hash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime)
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;
 struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
-struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs);
+struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, retFailedSeqs);
 reportTiming(pStartTime, "read and check uploaded FASTA");
 if (filteredSeqs)
     {
     struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime);
     struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
     removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs);
     if (filteredAlignments)
         {
         AllocVar(tn);
         trashDirFile(tn, "ct", "pp", ".vcf");
         pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases,
                          maskSites);
         struct psl *psl;
         for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
             slNameAddHead(&sampleIds, psl->qName);