d6e143eb6277e6a8d5ffadccbb9538325f5d86af
angie
Wed Aug 14 13:12:26 2024 -0700
Add check for format of data returned by server (sometimes error messages are returned as text without HTTP error codes)

diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh
index 44976b8..6af25fc 100755
--- src/hg/utils/otto/sarscov2phylo/getCncb.sh
+++ src/hg/utils/otto/sarscov2phylo/getCncb.sh
@@ -33,30 +33,36 @@
     done
     if [[ $attempt -gt $maxAttempts ]]; then
         $>2 echo "curl failed $maxAttempts times; quitting."
         exit 1
     fi
 }
 
 # Discard Pango lineage column so we can keep the same column order as before & won't have to
 # update scripts.  Also watch out for non-ASCII characters (e.g. NMDC60013002-06's "2019?"
 # was changed to "2019\302\240" -- change it back).
 curlRetry $metadataUrl \
 | cut -f 1-4,6- \
 | perl -wpe 's/[^[:print:]^\t^\n]+/?/g;' \
 > cncb.metadata.tsv
 
+colCount=$(head -1 cncb.metadata.tsv | tawk '{print NF;}')
+if [[ $colCount != 16 ]]; then
+    echo "Metadata format error: expected 16 columns, got $colCount"
+    exit 1
+fi
+
 # Make a cncbToDate file for ID mapping.
 tail -n+2 cncb.metadata.tsv \
 | cut -f 1,10 \
 | cleanCncb \
 | sed -re 's/ //;' \
 > cncbToDate
 
 # Make a renaming file to translate from accessions to the 'name | acc' headers expected by
 # Chris's ID-matching pipeline.  Exclude sequences that are also in GenBank.  If for some reason
 # the EPI_ ID is listed as primary and CNCB as secondary, swap them.
 tail -n+2 cncb.metadata.tsv \
 | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
 | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $1, $2; }' \
 | cleanCncb \
 | sed -re 's/ //;' \