66a1a82f72f9f3295bb4007b1b5681013976a694 angie Wed Mar 20 15:20:27 2024 -0700 Overhaul struct sampleMetadata: instead of one struct member per anticipated column, use an array of column values and a shared array of column names. As we support more species, the available metadata gets more divergent so this needs to be more general. We still need to make aggregation of attributes on branches more generic / config-driven. diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index ede0d8f2..f0edb7e 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -116,51 +116,34 @@ struct slName *subtreeNameList; // List of leaf names with nicer names for cond. nodes }; struct usherResults /* Tree+samples download file, sample placements, and subtrees parsed from usher output. */ { struct tempName *bigTreePlusTn; // Newick file: original tree plus user's samples struct hash *samplePlacements; // Info about each sample's placement in the tree struct subtreeInfo *singleSubtreeInfo; // Comprehensive subtree with all uploaded samples struct subtreeInfo *subtreeInfoList; // For each subtree: tree, file, node info etc. }; struct sampleMetadata /* Information about a virus sample. */ { - char *strain; // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD - char *epiId; // GISAID EPI_ISL_[0-9]+ ID - char *gbAcc; // GenBank accession - char *date; // Sample collection date - char *author; // Author(s) to credit - char *nClade; // Nextstrain year-letter clade assigned by nextclade - char *gClade; // GISAID amino acid change clade - char *lineage; // Pango lineage assigned by pangolin - char *country; // Country in which sample was collected - char *division; // Administrative division in which sample was collected (country or state) - char *location; // Location in which sample was collected (city) - char *countryExp; // Country in which host was exposed to virus - char *divExp; // Administrative division in which host was exposed to virus - char *origLab; // Originating lab - char *subLab; // Submitting lab - char *region; // Continent on which sample was collected - char *nCladeUsher; // Nextstrain clade according to annotated tree - char *lineageUsher; // Pango lineage according to annotated tree - char *authors; // Sequence submitters/authors - char *pubs; // PubMed ID numbers of publications associated with sequences - char *nLineage; // Nextstrain letter-dot-numbers lineage assigned by nextclade + size_t columnCount; // Number of metadata columns + char **columnNames; // Metadata column names (e.g. date, genbank_accession, pangolin_lineage) + // -- shared by all metadata rows, not allocated for each struct + char **columnValues; // Metadata column values -- allocated for each struct }; struct geneInfo /* Information sufficient to determine whether a genome change causes a coding change. */ { struct geneInfo *next; struct psl *psl; // Alignment of transcript to genome struct dnaSeq *txSeq; // Transcript sequence struct genbankCds *cds; // CDS (for those few pathogens that have transcript UTRs) int cdsStart; // genePred cdsStart (genome coord, really cds end if - strand) int cdsEnd; // genePred cdsEnd (genome coord, really cds start if - strand) }; struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, struct slName **maskSites, struct hash *treeNames,