abdea7a81b3d511adaee6300277721a804f5581f angie Tue Mar 30 11:57:47 2021 -0700 Ongoing fixes for odd characters in sequence names and other pipeline tweaks. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index 96e63f0..b7186aa 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -8,31 +8,31 @@ # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases. scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) prevDate=$(date -d yesterday +%F) ottoDir=/hive/data/outside/otto/sarscov2phylo mkdir -p $ottoDir/ncbi.$today cd $ottoDir/ncbi.$today # This query gives a large zip file that includes extra stuff, and the download really slows # down after a point, but it does provide a consistent set of FASTA and metadata: -time curl -s -X GET \ +time curl -sS -X GET \ "https://api.ncbi.nlm.nih.gov/datasets/v1alpha/virus/taxon/2697049/genome/download?refseq_only=false&annotated_only=false&released_since=2020-01-01&complete_only=false&include_annotation_type=DEFAULT&filename=ncbi_dataset.$today.zip" \ -H "Accept: application/zip" \ > ncbi_dataset.zip rm -rf ncbi_dataset unzip ncbi_dataset.zip # Creates ./ncbi_dataset/ # This makes something just like ncbi.datasets.tsv from the /table/ API query: jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \ ncbi_dataset/data/data_report.jsonl \ | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \ | sort \ > ncbi_dataset.tsv