d48d0f53ee67f93d7193241cd226c8aa039cde42 angie Fri Dec 20 15:37:20 2024 -0800 Uniquify to guard against occasional duplicates. diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh index 6af25fc..cd3cd7f 100755 --- src/hg/utils/otto/sarscov2phylo/getCncb.sh +++ src/hg/utils/otto/sarscov2phylo/getCncb.sh @@ -43,54 +43,57 @@ curlRetry $metadataUrl \ | cut -f 1-4,6- \ | perl -wpe 's/[^[:print:]^\t^\n]+/?/g;' \ > cncb.metadata.tsv colCount=$(head -1 cncb.metadata.tsv | tawk '{print NF;}') if [[ $colCount != 16 ]]; then echo "Metadata format error: expected 16 columns, got $colCount" exit 1 fi # Make a cncbToDate file for ID mapping. tail -n+2 cncb.metadata.tsv \ | cut -f 1,10 \ | cleanCncb \ +| sort -u \ | sed -re 's/ //;' \ > cncbToDate # Make a renaming file to translate from accessions to the 'name | acc' headers expected by # Chris's ID-matching pipeline. Exclude sequences that are also in GenBank. If for some reason # the EPI_ ID is listed as primary and CNCB as secondary, swap them. tail -n+2 cncb.metadata.tsv \ | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \ | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $1, $2; }' \ | cleanCncb \ +| sort -u \ | sed -re 's/ //;' \ | tawk '{print $2, $1 " | " $2;}' \ | sort \ > accToNameBarAcc.tsv # See what sequences are in metadata but that we haven't already downloaded. -fastaNames ../cncb.latest/cncb.nonGenBank.acc.fasta.xz | sort > prevAccs -comm -23 <(cut -f 1 accToNameBarAcc.tsv) prevAccs | sort > missingIDs +fastaNames ../cncb.latest/cncb.nonGenBank.acc.fasta.xz | sort -u > prevAccs +comm -23 <(cut -f 1 accToNameBarAcc.tsv) prevAccs | sort -u > missingIDs # Use the GenBase API to download missing GenBase sequences, unfortunately only one sequence # at a time. tail -n+2 cncb.metadata.tsv \ | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \ | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $2; }' \ +| sort -u \ > nonGenBankIds set +o pipefail grep ^C_ missingIDs \ | grep -Fwf - nonGenBankIds \ | cat \ > missingGenBaseIds set -o pipefail for acc in $(cat missingGenBaseIds); do curlRetry "$genBaseFileApiBase$acc" \ | sed -re '/^>/ s/ .*//;' sleep 5 done > new.accs.fa cat <(xzcat ../cncb.latest/cncb.nonGenBank.acc.fasta.xz) new.accs.fa \ | faUniqify stdin stdout \