2df325b2cfee5a71d16157bf3aaa62c3e6734ec3 angie Sat Apr 22 18:23:06 2023 -0700 Guard against truncated download file which can happen despite successful return from datasets command. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index a8519b5..b3abc2a 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -47,30 +47,38 @@ if [[ ! -f ncbi_dataset.zip ]]; then echo "datasets command failed $maxAttempts times; quitting." exit 1 fi rm -rf ncbi_dataset unzip -o ncbi_dataset.zip # Creates ./ncbi_dataset/ # This makes something just like ncbi.datasets.tsv from the /table/ API query: time jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \ ncbi_dataset/data/data_report.jsonl \ | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \ | sort -u \ > ncbi_dataset.tsv +# Make sure we didn't get a drastically truncated download despite apparently successful commands +minSeqCount=6800000 +metadataLC=$(wc -l < ncbi_dataset.tsv) +if (($metadataLC < $minSeqCount)); then + echo "TOO FEW SEQUENCES: $metadataLC lines of ncbi_dataset.tsv, expected at least $minSeqCount" + exit 1 +fi + time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl | uniq > gb.bioSample.tab # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting # sample collection dates: $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \ > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log # Manually patch some GB-to-BioSample associations that somehow got mangled at ENA, until # they fix them and the fix percolates through to NCBI... grep -vFwf <(cut -f 1 $ottoDir/ncbi.2022-06-25/gbToBioSample.changes.tsv) \ ncbi_dataset.plusBioSample.tsv > tmp sort tmp $ottoDir/ncbi.2022-06-25/gbToBioSample.patch.tsv > ncbi_dataset.plusBioSample.tsv rm tmp # Make a file for joining collection date with ID: