0e66591e44d669ef83fa016069b1ceb283c01800 angie Thu Jul 15 12:40:43 2021 -0700 Support more flexible ID search: accessions without dot-versions and isolate names. When loading FASTA, detect sequence names that are numeric or already in the tree and add a prefix to prevent usher from rejecting the sequences. diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index 64391d4..db22d6f 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -136,30 +136,31 @@ char *origLab; // Originating lab char *subLab; // Submitting lab char *region; // Continent on which sample was collected }; struct geneInfo /* Information sufficient to determine whether a genome change causes a coding change. */ { struct geneInfo *next; struct psl *psl; // Alignment of transcript to genome struct dnaSeq *txSeq; // Transcript sequence }; struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, boolean *informativeBases, struct slName **maskSites, + struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime); /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName *userSampleIds, struct hash *condensedNodes, int *pStartTime); /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr output. */ struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath,