22e94ffbe044de3b15c736105afa2260196e752b angie Fri Nov 12 13:59:18 2021 -0800

    biosample.jsonl from NCBI Datasets is stable & complete -- finally I can get rid of the redundant EUtils fetching of BioSample.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 357d594..7119c51 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -1,21 +1,20 @@
 #!/bin/bash
 source ~/.bashrc
 set -beEu -x -o pipefail
 
 # Download SARS-CoV-2 GenBank FASTA and metadata using NCBI Datasets API.
-# Use E-Utils to get SARS-CoV-2 metadata from BioSample.
 # Use BioSample metadata to fill in gaps in GenBank metadata and report conflicting dates.
 # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases.
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 mkdir -p $ottoDir/ncbi.$today
 cd $ottoDir/ncbi.$today
 
 attempt=0
 maxAttempts=5
@@ -42,83 +41,32 @@
 if [[ ! -f ncbi_dataset.zip ]]; then
     echo "datasets command failed $maxAttempts times; quitting."
     exit 1
 fi
 rm -rf ncbi_dataset
 unzip -o ncbi_dataset.zip
 # Creates ./ncbi_dataset/
 
 # This makes something just like ncbi.datasets.tsv from the /table/ API query:
 jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \
     ncbi_dataset/data/data_report.jsonl \
 | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \
 | sort \
     > ncbi_dataset.tsv
 
-# TODO: get rid of all this eutils cruft when biosample.jsonl is stable:
-
-# Use EUtils (esearch) to get all SARS-CoV-2 BioSample GI# IDs:
-$scriptDir/searchAllSarsCov2BioSample.sh
-sort all.biosample.gids.txt > all.biosample.gids.sorted.txt
-
-# Copy yesterday's all.bioSample.txt so we don't have to refetch all the old stuff.
-if [ -e ../ncbi.latest/all.bioSample.txt.xz ]; then
-    xzcat ../ncbi.latest/all.bioSample.txt.xz > all.bioSample.txt
-    grep ^Accession all.bioSample.txt | sed -re 's/^.*ID: //' | sort -u > ids.loaded
-    comm -23 all.biosample.gids.sorted.txt ids.loaded > ids.notLoaded
-elif [ -e ../ncbi.latest/all.bioSample.txt ]; then
-    cp ../ncbi.latest/all.bioSample.txt all.bioSample.txt
-    grep ^Accession all.bioSample.txt | sed -re 's/^.*ID: //' | sort -u > ids.loaded
-    comm -23 all.biosample.gids.sorted.txt ids.loaded > ids.notLoaded
-else
-    cp -p all.biosample.gids.txt ids.notLoaded
-fi
-wc -l ids.notLoaded
-
-# Use EUtils (efetch) to get BioSample records for the GI# IDs that we don't have yet:
-time $scriptDir/bioSampleIdToText.sh < ids.notLoaded >> all.bioSample.txt
-
-grep ^Accession all.bioSample.txt | sed -re 's/^.*ID: //' | sort > ids.loaded
-comm -23 all.biosample.gids.sorted.txt ids.loaded > ids.notLoaded
-if [ -s ids.notLoaded ]; then
-    echo Retrying queries for `wc -l < ids.notLoaded` IDs
-    $scriptDir/bioSampleIdToText.sh < ids.notLoaded >> all.bioSample.txt
-    grep ^Accession all.bioSample.txt | sed -re 's/^.*ID: //' | sort > ids.loaded
-    comm -23 all.biosample.gids.sorted.txt ids.loaded > ids.notLoaded
-    if [ -s ids.notLoaded ]; then
-        echo "Still have only $accCount accession lines after retrying; quitting."
-        exit 1
-    fi
-fi
-
-# Extract properties of interest into tab-sep text:
-$scriptDir/bioSampleTextToTab.pl < all.bioSample.txt > all.bioSample.tab
-
-# Extract BioSample tab-sep lines just for BioSample accessions included in the ncbi_dataset data:
-tawk '$2 != "" {print $2;}' ncbi_dataset.tsv \
-| grep -Fwf - all.bioSample.tab \
-    > gb.bioSample.eutils.tab
-
 time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl > gb.bioSample.tab
 
-# TODO: get rid of this when bioSample json is stable
-comm -23 <(cut -f 2 gb.bioSample.eutils.tab | sort) <(cut -f 2 gb.bioSample.tab | sort) \
-    > bioSample.missingFromJson.txt
-wc -l bioSample.missingFromJson.txt
-grep -Fwf bioSample.missingFromJson.txt gb.bioSample.eutils.tab \
-    >> gb.bioSample.tab
-
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
 | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
 | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
 | xz -T 8 \
     > genbank.fa.xz
 
@@ -191,16 +139,15 @@
 fi
 wc -l lineage_report.csv
 
 # It turns out that sometimes new sequences sneak into ncbi_dataset.tsv before they're included in
 # genomic.fna. Filter those out so we don't have missing pangolin and nextclade for just-added
 # COG-UK sequences. (https://github.com/theosanderson/taxodium/issues/10#issuecomment-876800176)
 grep -Fwf gb.names ncbi_dataset.plusBioSample.tsv > tmp
 wc -l tmp ncbi_dataset.plusBioSample.tsv
 mv tmp ncbi_dataset.plusBioSample.tsv
 
 rm -f $ottoDir/ncbi.latest
 ln -s ncbi.$today $ottoDir/ncbi.latest
 
 # Clean up
 rm -r ncbi_dataset
-nice xz all.bioSample.* &
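
With the EUtils cross-check removed, the pipeline now trusts biosample.jsonl to cover every BioSample referenced by the GenBank records. A lightweight coverage check could preserve that safety net without any EUtils fetching. The sketch below is not part of the commit; it assumes, as the jq field order and the removed comm check suggest, that column 2 of ncbi_dataset.tsv and column 2 of gb.bioSample.tab both hold the BioSample accession.

# Hypothetical follow-up check, not in this commit: list BioSample accessions that
# GenBank records reference but that are missing from the biosample.jsonl-derived table.
# Assumes column 2 = BioSample accession in both ncbi_dataset.tsv and gb.bioSample.tab.
comm -23 <(tawk '$2 != "" {print $2;}' ncbi_dataset.tsv | sort -u) \
         <(cut -f 2 gb.bioSample.tab | sort -u) \
    > bioSample.missingFromJson.txt
wc -l bioSample.missingFromJson.txt

If bioSample.missingFromJson.txt were ever non-empty, that would flag a regression in biosample.jsonl completeness before gbMetadataAddBioSample.pl silently loses those dates.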