705420e703b067fbcad43ab67ed3e131552e7ac8 angie Wed Apr 24 14:23:03 2024 -0700 Pathogen drop-down choices can now be groups of references/trees, for example 'Dengue (types 1 - 4)' instead of a separate choice for each type. Instead of config.ra, each group has an organism.ra and subdirectories named after reference accessions that contain reference.ra files. nextclade sort is used to match the user's uploaded sequences against available references for the selected pathogen. SARS-CoV-2, M. tuberculosis and hMPXV still have only one reference and still use config.ra, but RSV, Dengue and Influenza will become groups. Presentation is still kinda rough, just a loop on the original results output. The server commands part needs testing and will not work yet for groups (currently used only for SARS-CoV-2). diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 1cabca6..37d1b12 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -1,18 +1,18 @@ /* Align user sequence to the reference genome and write VCF */ -/* Copyright (C) 2020 The Regents of the University of California */ +/* Copyright (C) 2020-2024 The Regents of the University of California */ #include "common.h" #include "fa.h" #include "genoFind.h" #include "iupac.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "psl.h" #include "trashDir.h" double maxNs = 0.5; // Using blat defaults for these: int gfRepMatch = 1024*4; char *gfOoc = NULL; @@ -593,50 +593,50 @@ touppers(delSeq); dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq); } else dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); } } si->insBases = insBases; si->delBases = delBases; si->insRanges = dyStringCannibalize(&dyIns); si->delRanges = dyStringCannibalize(&dyDel); } } } -struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, +struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome, struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); int minSeqSize = 0, maxSeqSize = 0; // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. -char *minSeqSizeSetting = phyloPlaceDbSetting(db, "minSeqSize"); +char *minSeqSizeSetting = phyloPlaceRefSetting(org, db, "minSeqSize"); if (isEmpty(minSeqSizeSetting)) minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000; else minSeqSize = atoi(minSeqSizeSetting); -char *maxSeqSizeSetting = phyloPlaceDbSetting(db, "maxSeqSize"); +char *maxSeqSizeSetting = phyloPlaceRefSetting(org, db, "maxSeqSize"); if (isEmpty(maxSeqSizeSetting)) maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000; else maxSeqSize = atoi(maxSeqSizeSetting); struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn);