0e66591e44d669ef83fa016069b1ceb283c01800 angie Thu Jul 15 12:40:43 2021 -0700 Support more flexible ID search: accessions without dot-versions and isolate names. When loading FASTA, detect sequence names that are numeric or already in the tree and add a prefix to prevent usher from rejecting the sequences. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index e9082a5..3275373 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -24,43 +24,43 @@ char *gfOoc = NULL; int gfStepSize = gfTileSize; int gfOutMinIdentityPpt = 900; // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way // higher, expecting very similar sequences. -- But watch out for chunking effects in // gfLongDnaInMem. ... would be nice if gfLongDnaInMem scaled minMatch for the last little // chunk. int gfMinScore = 50; // Normally this would be 12 for a genome with size 29903 // (see hgBlat.c findMinMatch... k = ceil(log4(genomeSize*250))) // but we expect a really good alignment so we can crank it up. (Again modulo chunking) int gfIMinMatch = 30; -static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct slPair **retFailedSeqs) +static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames, + struct slPair **retFailedSeqs) /* Return a list of sequences that pass basic QC checks (appropriate size etc). + * If any sequences have names that are already in the tree, add a prefix so usher doesn't + * reject them. * Set retFailedSeqs to the list of sequences that failed checks. */ { struct seqInfo *filteredSeqs = NULL; struct hash *uniqNames = hashNew(0); *retFailedSeqs = NULL; struct dnaSeq *seq, *nextSeq; for (seq = seqs; seq != NULL; seq = nextSeq) { - //#*** TODO: if the user uploads a sample with the same ID as one already in the - //#*** saved assignment file, then usher will ignore it! - //#*** Better check for that and warn the user. boolean passes = TRUE; nextSeq = seq->next; if (seq->size < minSeqSize) { struct dyString *dy = dyStringCreate("Sequence %s has too few bases (%d, must have " "at least %d); skipping.", seq->name, seq->size, minSeqSize); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); passes = FALSE; } else if (seq->size > maxSeqSize) { struct dyString *dy = dyStringCreate("Sequence %s has too many bases (%d, must have " "at most %d); skipping.", seq->name, seq->size, maxSeqSize); @@ -109,30 +109,40 @@ for (i = 0; i < seq->size; i++) if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i])) ambigCount++; if (passes) { if (hashLookup(uniqNames, seq->name)) { struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; " "ignoring subsequent usage " "(%d bases, %d N's, %d ambiguous).", seq->name, seq->size, nCountTotal, ambigCount); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); } else { + if (isAllDigits(seq->name) || hashLookup(treeNames, seq->name)) + { + // Internal nodes of tree have numeric IDs, so usher may reject numeric name + // as a conflict. usher will definitely reject a sequence name already in the tree. + // Add a prefix so usher won't reject the sequence. + char newName[strlen(seq->name)+32]; + safef(newName, sizeof newName, "uploaded_%s", seq->name); + freeMem(seq->name); + seq->name = cloneString(newName); + } struct seqInfo *si; AllocVar(si); si->seq = seq; si->nCountStart = nCountStart; si->nCountMiddle = nCountMiddle; si->nCountEnd = nCountEnd; si->ambigCount = ambigCount; hashAdd(uniqNames, seq->name, NULL); slAddHead(&filteredSeqs, si); } } } slReverse(&filteredSeqs); slReverse(retFailedSeqs); hashFree(&uniqNames); @@ -568,41 +578,42 @@ } else dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); } } si->insBases = insBases; si->delBases = delBases; si->insRanges = dyStringCannibalize(&dyIns); si->delRanges = dyStringCannibalize(&dyDel); } } } struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, boolean *informativeBases, struct slName **maskSites, + struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); -struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs); +struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName);