06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h new file mode 100644 index 0000000..9408a7f --- /dev/null +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -0,0 +1,161 @@ +/* Place SARS-CoV-2 sequences in phylogenetic tree using add_missing_samples program. */ + +#ifndef _PHYLO_PLACE_H_ +#define _PHYLO_PLACE_H_ + +#include "common.h" +#include "dnaseq.h" +#include "hash.h" +#include "linefile.h" +#include "parsimonyProto.h" +#include "phyloTree.h" +#include "trashDir.h" + +#define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData" + +#define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html" + +struct seqInfo +/* User sequences, alignments and statistics */ +{ + struct seqInfo *next; + struct dnaSeq *seq; + struct psl *psl; + struct singleNucChange *sncList; + struct singleNucChange *maskedSncList; + struct slRef *maskedReasonsList; + uint nCountStart; + uint nCountMiddle; + uint nCountEnd; + uint ambigCount; +}; + +struct variantPathNode +/* A list of these gives a path from phylo tree root to some node (usually user seq leaf). */ + { + struct variantPathNode *next; + char *nodeName; // Either a numeric internal node ID or the user seq name + struct singleNucChange *sncList; // One or more single nucleotide changes associated with node + }; + +struct bestNodeInfo + { + struct bestNodeInfo *next; + char *name; // Node name + struct variantPathNode *variantPath; // Mutations assigned to nodes along path from root->node + boolean isSibling; // Placement would be as sibling of node (not child) + }; + +struct baseVal +/* List of imputed base positions and values */ + { + struct baseVal *next; + int chromStart; // 0-based position + char *val; // nucleotide base(s) + }; + +struct placementInfo +/* Info about a sample's mutations and placement in the phylo tree */ + { + char *sampleId; // Sample name from FASTA or VCF header + struct slName *sampleMuts; // Differences with the reference genome + struct variantPathNode *variantPath; // Mutations assigned to nodes along path from root + struct bestNodeInfo *bestNodes; // Other nodes identified as equally parsimonious + struct baseVal *imputedBases; // Ambiguous bases imputed to ref/alt [ACGT] + int parsimonyScore; // Parsimony cost of placing sample + int bestNodeCount; // Number of equally parsimonious placements + }; + +struct subtreeInfo +/* Parsed subtree from usher and derivative products. */ + { + struct subtreeInfo *next; + struct tempName *subtreeTn; // Newick file from usher (may have condensed nodes) + struct phyloTree *subtree; // Parsed subtree (#*** with annotated muts? shoudl we?_ + struct hash *subtreeIdToIx; // Map of subtree nodes to VCF output sample order + struct slName *subtreeUserSampleIds; // List of user-uploaded samples in this subtree + struct slName *subtreeNameList; // List of leaf names with nicer names for cond. nodes + }; + +struct usherResults +/* Tree+samples download file, sample placements, and subtrees parsed from usher output. */ + { + struct tempName *bigTreePlusTn; // Newick file: original tree plus user's samples + struct hash *samplePlacements; // Info about each sample's placement in the tree + struct subtreeInfo *subtreeInfoList; // For each subtree: tree, file, node info etc. + }; + +struct sampleMetadata +/* Information about a virus sample. */ + { + char *strain; // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD + char *epiId; // GISAID EPI_ISL_[0-9]+ ID + char *gbAcc; // GenBank accession + char *date; // Sample collection date + char *author; // Author(s) to credit + char *nClade; // Nextstrain year-letter clade + char *gClade; // GISAID amino acid change clade + char *lineage; // Pangolin lineage + char *country; // Country in which sample was collected + char *division; // Administrative division in which sample was collected (country or state) + char *location; // Location in which sample was collected (city) + char *countryExp; // Country in which host was exposed to virus + char *divExp; // Administrative division in which host was exposed to virus + char *origLab; // Originating lab + char *subLab; // Submitting lab + char *region; // Continent on which sample was collected + }; + +struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, + boolean *informativeBases, struct slName **maskSites, + struct slName **retSampleIds, struct seqInfo **retSeqInfo, + struct slPair **retFailedSeqs, struct slPair **retFailedPsls, + int *pStartTime); +/* Read in FASTA from lf and make sure each item has a reasonable size and not too high + * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF + * with sample genotype columns. */ + +struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, + int subtreeSize, struct slName *userSampleIds, + struct hash *condensedNodes, int *pStartTime); +/* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and + * subtrees to trash files, return list of slRef to struct tempName for the trash files + * and parse other results out of stderr output. */ + +void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref, + char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile); +/* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format + * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ + +struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, + struct slName *sampleIds, struct phyloTree *bigTree, + int fontHeight, int *pStartTime); +/* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ + + +struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId); +/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ + +struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds); +/* Prune all descendants of node that have no leaf descendants in sampleIds. */ + +char *phyloPlaceDbSetting(char *db, char *settingName); +/* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ + +char *phyloPlaceDbSettingPath(char *db, char *settingName); +/* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, + * or NULL if not found. (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */ + +void reportTiming(int *pStartTime, char *message); +/* Print out a report to stderr of how much time something took. */ + +boolean hgPhyloPlaceEnabled(); +/* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */ + +char *phyloPlaceSamples(struct lineFile *lf, char *db, boolean doMeasureTiming, int subtreeSize, + int fontHeight); +/* Given a lineFile that contains either FASTA or VCF, prepare VCF for add_missing_samples; + * if that goes well then run add_missing_samples, report results, make custom track files + * and return the top-level custom track file; otherwise return NULL. */ + +#endif //_PHYLO_PLACE_H_