9011b709f1fa87332b94a5920062582f5a725f68
angie
Thu Jun 22 10:17:30 2023 -0700
Watch out for empty name and date columns for COG-UKs in GenBank

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index 1d1fa2c..bd850a0 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -100,31 +100,31 @@
 if [ -s prevNameToRemove ]; then
     rm -f prevDedup.pb
     $matUtils extract -i $baseProtobuf \
         -p -s <(sort -u prevNameToRemove) \
         -u prevDedupNames \
         -o prevDedup.pb
 else
     ln -sf $baseProtobuf prevDedup.pb
     $matUtils extract -i prevDedup.pb -u prevDedupNames
 fi
 
 function gbAccCogRenaming {
     # pipeline: one INSDC accession per line of stdin, acc to full name if COG-UK on stdout
     grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | grep COG-UK/ \
-    | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
+    | tawk '{ if ($4 != "") { print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3; } else { if ($3 != "") { print $1, $6 "/" $3 "|" $1 "|" $3; } else { print $1, $6 "|?"; } } }' \
     | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;'
 }
 
 function gbAccNonCogRenaming {
     # pipeline: one INSDC accession per line of stdin, acc to full name if non-COG-UK on stdout
     grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | grep -v COG-UK/ \
     | cleanGenbank \
     | tawk '{ if ($3 == "") { $3 = "?"; }
               if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
               else { print $1 "\t" $1 "|" $3; } }' \
     | sed -re 's/ /_/g'
 }
 
 # To update names that have changed and simplify detection of new sequences to add, relate to acc.