src/hg/utils/otto/sarscov2phylo/getCncb.sh d6e143eb6277e6a8d5ffadccbb9538325f5d86af

d6e143eb6277e6a8d5ffadccbb9538325f5d86af
angie
  Wed Aug 14 13:12:26 2024 -0700
Add check for format of data returned by server (sometimes error messages are returned as text without HTTP error codes)

diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh
index 44976b8..6af25fc 100755
--- src/hg/utils/otto/sarscov2phylo/getCncb.sh
+++ src/hg/utils/otto/sarscov2phylo/getCncb.sh
@@ -33,30 +33,36 @@
     done
     if [[ $attempt -gt $maxAttempts ]]; then
         $>2 echo "curl failed $maxAttempts times; quitting."
         exit 1
     fi
 }
 
 # Discard Pango lineage column so we can keep the same column order as before & won't have to
 # update scripts.  Also watch out for non-ASCII characters (e.g. NMDC60013002-06's "2019?"
 # was changed to "2019\302\240" -- change it back).
 curlRetry $metadataUrl \
 | cut -f 1-4,6- \
 | perl -wpe 's/[^[:print:]^\t^\n]+/?/g;' \
     > cncb.metadata.tsv
 
+colCount=$(head -1 cncb.metadata.tsv | tawk '{print NF;}')  # field count of header row; tawk is defined outside this hunk (presumably tab-delimited awk -- confirm)
+if [[ "${colCount:-0}" != 16 ]]; then  # catches e.g. error text returned by the server without an HTTP error code; empty file counts as 0
+    echo "Metadata format error: expected 16 columns, got ${colCount:-0}" >&2
+    exit 1
+fi
+
 # Make a cncbToDate file for ID mapping.
 tail -n+2 cncb.metadata.tsv \
 | cut -f 1,10 \
 | cleanCncb \
 | sed -re 's/ //;' \
     > cncbToDate
 
 # Make a renaming file to translate from accessions to the 'name | acc' headers expected by
 # Chris's ID-matching pipeline.  Exclude sequences that are also in GenBank.  If for some reason
 # the EPI_ ID is listed as primary and CNCB as secondary, swap them.
 tail -n+2 cncb.metadata.tsv \
 | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
 | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $1, $2; }' \
 | cleanCncb \
 | sed -re 's/ //;' \