99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: updateIdMapping.sh getCogUk.sh getNcbi.sh searchAllSarsCov2BioSample.sh bioSampleIdToText.sh bioSampleTextToTab.pl gbMetadataAddBioSample.pl fixNcbiFastaNames.pl updateSarsCov2Phylo.sh getRelease.sh processRelease.sh cladeLineageColors.pl mapPublic.sh extractUnmappedPublic.sh addUnmappedPublic.sh many of the above: util.sh publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK. 
#!/bin/bash
set -beEu -x -o pipefail

# Do not modify this script, modify the source tree copy:
# kent/src/hg/utils/otto/sarscov2phylo/publicCredits.sh
#
# Build acknowledgements.tsv.gz, a table of credits
# (accession, originating_lab, submitting_lab, authors) for the public
# sequence IDs listed in ./treeToPublic, drawing on CNCB, NCBI and COG-UK
# metadata files.
#
# Inputs (current directory unless an absolute path is given below):
#   treeToPublic, $cncbMetadata, $ncbiMetadata, $cogUkMetadata
# Outputs (current directory, overwritten on each run):
#   publicIdsInTree, cncb.credits, ncbi.credits, cogUk.credits.partialIds,
#   cogUk.partialToFull, cogUk.credits, acknowledgements.tsv.gz
#
# Requires tawk and csvToTab on PATH.
# NOTE(review): tawk is presumably awk with tab as input/output separator and
# csvToTab a CSV-to-TSV converter (kent utilities) -- confirm.

usage() {
    # Diagnostics belong on stderr so they never end up in redirected output.
    echo "usage: $0" >&2
}

# grep exits 1 when it selects no lines; with errexit + pipefail that would
# abort the whole script even though "no rows from this source" is a valid
# (empty) result.  Accept status 1, but still fail on real errors (status >= 2).
grepOrEmpty() {
    grep "$@" || [ $? -eq 1 ]
}

if [ $# -ne 0 ]; then
    usage
    exit 1
fi

cncbMetadata=/hive/data/outside/otto/sarscov2phylo/cncb.latest/cncb.metadata.tsv
ncbiMetadata=ncbi.authors.20-11-13.csv
#*** TODO AUTOMATE ME
# * https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=SARS-CoV-2,%20taxid:2697049
#   * Download button
#   * Current table view result --> CSV format, Next button
#   * Download all records, Next button
#   * Select Accession and Authors [no labs options unfortunately]
#   * Download button, save as ncbi.authors.date.csv

#*** TODO AUTOMATE ME
# * https://www.ebi.ac.uk/ena/browser/view/PRJEB37886
#   * select columns center_name, sample_accession, sample_alias
#   * Download report: TSV
#   * file saved to filereport_read_run_PRJEB37886_tsv.<date>.txt
#     (extra first column, run_accession)
cogUkMetadata=filereport_read_run_PRJEB37886_tsv.2020-11-13.txt


# Author credits file... strip GenBank version numbers because NCBI metadata
# doesn't have those.  The version may have more than one digit (e.g. ".10"),
# so remove all of its digits, not just the first.
cut -f 2 treeToPublic \
| cut -d \| -f 2 \
| sed -re 's/^([A-Z][A-Z][0-9]{6})\.[0-9]+/\1/;' \
| sort > publicIdsInTree

# CNCB: skip the header line, keep columns 2, 12 and 14, drop rows whose ID is
# a GISAID (EPI_ISL_) or GenBank-style accession, remove double quotes, and
# append "n/a" as the fourth (authors) column.
tail -n +2 "$cncbMetadata" \
| cut -f 2,12,14 \
| grepOrEmpty -v '^EPI_ISL_' \
| grepOrEmpty -E -v '^[A-Z][A-Z][0-9]{6}' \
| sed -e 's/"//g; s/$/\tn\/a/;' \
    > cncb.credits

# NCBI: the download has only Accession and Authors, so both lab columns
# are filled with "n/a".
tail -n +2 "$ncbiMetadata" \
| csvToTab \
| tawk '{print $1, "n/a", "n/a", $2;}' \
    > ncbi.credits

# COG-UK: column 4 is used as the ID (minus any leading "COG-UK/") and column 3
# as both originating and submitting lab.
# NOTE(review): presumably $3 = center_name and $4 = sample_alias -- confirm
# against the ENA report's column order.
tail -n +2 "$cogUkMetadata" \
| tawk '{print $4, $3, $3, "COVID-19 Genomics UK Consortium";}' \
| sed -e 's@^COG-UK/@@;' \
| sort -u \
    > cogUk.credits.partialIds

# Tree IDs containing "/" are keyed by their second "/"-separated component so
# they can be joined with the partial IDs above.
grepOrEmpty / publicIdsInTree \
| awk -F/ '{print $2 "\t" $0;}' \
| sort \
    > cogUk.partialToFull

# Keep every tree ID (-a 2) even without metadata; missing fields become "n/a"
# and a missing authors field is replaced by the consortium name.
join -a 2 -e "n/a" -t$'\t' -o 2.2,1.2,1.3,1.4 cogUk.credits.partialIds cogUk.partialToFull \
| tawk '{ if ($4 == "n/a") { $4 = "COVID-19 Genomics UK Consortium"; } print; }' \
    > cogUk.credits

printf 'accession\toriginating_lab\tsubmitting_lab\tauthors\n' > acknowledgements.tsv

#*** THIS IS just for the subset... we need a full acknowledgements file too.
# A source with no IDs in the tree simply contributes no rows.
grepOrEmpty -Fwf publicIdsInTree cncb.credits >> acknowledgements.tsv
grepOrEmpty -Fwf publicIdsInTree ncbi.credits >> acknowledgements.tsv
grepOrEmpty -Fwf publicIdsInTree cogUk.credits >> acknowledgements.tsv

# -f: replace acknowledgements.tsv.gz left over from a previous run instead of
# failing (or prompting), consistent with the other outputs being overwritten.
gzip -f acknowledgements.tsv