src/hg/utils/otto/sarscov2phylo/getNcbi.sh 2df325b2cfee5a71d16157bf3aaa62c3e6734ec3

2df325b2cfee5a71d16157bf3aaa62c3e6734ec3
angie
  Sat Apr 22 18:23:06 2023 -0700
Guard against truncated download file which can happen despite successful return from datasets command.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index a8519b5..b3abc2a 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -47,30 +47,38 @@
 if [[ ! -f ncbi_dataset.zip ]]; then
     echo "datasets command failed $maxAttempts times; quitting."
     exit 1
 fi
 rm -rf ncbi_dataset
 unzip -o ncbi_dataset.zip
 # Creates ./ncbi_dataset/
 
 # This makes something just like ncbi.datasets.tsv from the /table/ API query:
 time jq -c -r '[.accession, .biosample, .isolate.collectionDate, .location.geographicLocation, .host.sciName, .isolate.name, .completeness, (.length|tostring)] | join("\t")' \
     ncbi_dataset/data/data_report.jsonl \
 | sed -e 's/COMPLETE/complete/; s/PARTIAL/partial/;' \
 | sort -u \
     > ncbi_dataset.tsv
 
+# Abort if the download was drastically truncated even though the commands above appeared to succeed.
+minSeqCount=6800000  # minimum acceptable number of lines in ncbi_dataset.tsv
+metadataLC=$(wc -l < ncbi_dataset.tsv)
+if (( metadataLC < minSeqCount )); then  # bare names: an empty count evaluates to 0 and still trips the guard
+    echo "TOO FEW SEQUENCES: $metadataLC lines of ncbi_dataset.tsv, expected at least $minSeqCount"
+    exit 1
+fi
+
 time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl | uniq > gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
 # Manually patch some GB-to-BioSample associations that somehow got mangled at ENA, until
 # they fix them and the fix percolates through to NCBI...
 grep -vFwf <(cut -f 1 $ottoDir/ncbi.2022-06-25/gbToBioSample.changes.tsv) \
     ncbi_dataset.plusBioSample.tsv > tmp
 sort tmp $ottoDir/ncbi.2022-06-25/gbToBioSample.patch.tsv > ncbi_dataset.plusBioSample.tsv
 rm tmp
 
 # Make a file for joining collection date with ID: