99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/, making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: (1) updateIdMapping.sh: getCogUk.sh, getNcbi.sh, searchAllSarsCov2BioSample.sh, bioSampleIdToText.sh, bioSampleTextToTab.pl, gbMetadataAddBioSample.pl, fixNcbiFastaNames.pl; (2) updateSarsCov2Phylo.sh: getRelease.sh, processRelease.sh, cladeLineageColors.pl, mapPublic.sh, extractUnmappedPublic.sh, addUnmappedPublic.sh; used by many of the above: util.sh. publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK.
#!/bin/bash

# src/hg/utils/otto/sarscov2phylo/util.sh
# Define some handy functions for other bash scripts in this directory.
# Each function is exported with `export -f` so that child bash processes
# started by the sourcing script can call it as well.

# xcat [FILE]
# Write FILE to stdout, choosing the reader from the file extension:
#   *.xz -> xzcat, *.gz -> zcat, anything else -> cat.
# With no (or an empty) argument, stdin is copied to stdout.
xcat () {
    # local: a direct call must not clobber a caller's own $fasta.
    local fasta=${1-}
    if [[ -z "$fasta" ]]; then
        cat
        return
    fi
    # Expansions are quoted so paths with spaces or glob characters survive.
    case "${fasta##*.}" in
        xz) xzcat "$fasta" ;;
        gz) zcat "$fasta" ;;
        *)  cat "$fasta" ;;
    esac
}
export -f xcat

# fastaNames [FILE]
# Print every FASTA header line of FILE (read through xcat) with the
# leading '>' removed, one per line.
fastaNames () {
    xcat "${1-}" \
    | grep '^>' \
    | sed -re 's/^>//;'
}
export -f fastaNames

# fastaSeqCount [FILE]
# Print the number of FASTA header lines (lines starting with '>') in FILE.
# Kept as grep | wc -l (not grep -c) so the exit status seen by callers,
# with or without pipefail, stays what it has always been.
fastaSeqCount () {
    xcat "${1-}" \
    | grep '^>' \
    | wc -l
}
export -f fastaSeqCount

# cleanGenbank
# Filter stdin -> stdout. Strips boilerplate species/isolate/host prefixes and
# "complete genome"-style suffixes from each line, shortens a few non-human
# host prefixes (env/, cat/, lion/, tiger/), and finally turns the first
# " |" separator into a tab.
# NOTE(review): input is presumably "accession |GenBank title" lines -- confirm
# against the callers. Requires a sed that supports -r and \t (GNU sed).
cleanGenbank () {
    # Single-quoted so the shell leaves '$' and '\' alone; every rule applies
    # once per line (no g flag), in the order listed.
    sed -re 's@Severe acute respiratory syndrome coronavirus 2 isolate SARS[ -]Co[Vv]-2/(human|homo ?sapiens)/@@;
             s@Severe acute respiratory syndrome coronavirus 2 SARS-CoV-2/@@;
             s@Mutant Severe acute respiratory syndrome coronavirus 2 clone SARS-CoV-2[_-]@@;
             s@Severe acute respiratory syndrome coronavirus 2( isolate)?( 2019_nCoV)?@@;
             s@[A-Za-z0-9]+ [a-z]*protein.*@@;
             s@(( genomic)? RNA)?, ((nearly )?complete|partial) genome$@@;
             s@genome assembly(, complete genome)?: monopartite$@@;
             s@ (1 |nasopharyngeal )?genome assembly, chromosome: .*$@@;
             s@, complete sequence@@;
             s@hCo[vV]-19/@@;
             s@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@;
             s@SARS-CoV-2/(environment|ENV)/@env/@;
             s@SARS-CoV-2/Felis catus/@cat/@;
             s@SARS-CoV-2/Panthera leo/@lion/@;
             s@SARS-CoV-2/Panthera tigris/@tiger/@;
             s@SARS-CoV-2/@@;
             s@BetaCoV/@@;
             s@Homo sapines/@@;
             s@ \| @ \|@; s@ $@@; s@  @ @;
             s@ \|@\t@;'
    # The "s@  @ @" rule squeezes a double space left behind by the removals
    # above; written with a single space on both sides it would do nothing.
    # Got rid of this: s/ ([^|])/_\1/g;
}
export -f cleanGenbank

# cleanCncb
# Filter stdin -> stdout. Removes one leading BetaCoV/, hCoV-19/, SARS-CoV-2/
# and human/ prefix (tried in that order) from each line, then replaces the
# first '|' together with its surrounding spaces by a tab.
# NOTE(review): presumably fed "name | id" lines from CNCB metadata -- confirm.
cleanCncb () {
    sed -re 's@^BetaCoV/@@;
             s@^hCoV-19/@@;
             s@^SARS-CoV-2/@@;
             s@^human/@@;
             s@ *\| *@\t@;'
}
export -f cleanCncb