8ec3c55d2932bb09df06ae9bd9be0b0dd37c2ea9 angie Sat Apr 22 18:21:48 2023 -0700 Overhaul detection of duplicates in previous day's protobuf vs. today's ID-mapping, and get latest sequence names from metadata, by first converting previous day's names to accessions only. Add in detection of new CNCB sequences. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index 3327343..1d1fa2c 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -1,322 +1,319 @@ #!/bin/bash set -beEu -x -o pipefail # Do not modify this script, modify the source tree copy: # kent/src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh usage() { echo "usage: $0 prevDate today problematicSitesVcf [baseProtobuf]" echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated." } -if [ $# != 3 && $# != 4 ]; then +if (( $# != 3 && $# != 4 )); then usage exit 1 fi prevDate=$1 today=$2 problematicSitesVcf=$3 if [ $# == 4 ]; then baseProtobuf=$4 else baseProtobuf= fi ottoDir=/hive/data/outside/otto/sarscov2phylo ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit epiToPublic=$gisaidDir/epiToPublicAndDate.latest + scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh mkdir -p $ottoDir/$today cd $ottoDir/$today # If there's a version that I didn't want to push out to the main site, but wanted to be used # as the basis for the next day's build (for example with some extra pruning), use that: if [ -e $ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.useMe.pb ]; then prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.useMe.pb else prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb fi usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils renaming=oldAndNewNames if [ "$baseProtobuf" == "" ]; then baseProtobuf=$prevProtobufMasked fi # Make lists of sequences already in the tree. $matUtils extract -i $baseProtobuf -u prevNames -# Before updating the tree with new sequences, update the names used in the tree: -# * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs -# * COG-UK sequences that are in GenBank. Per Sam Nicholls the isolate alone is enough to identify -# them -- no need to match country & year which are sometimes incorrect at first. -grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ -| tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ -| sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ -| sort \ - > cogUkInGenBankIsolateToNewName -fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names -grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \ - > cogUkInGenBank -set +o pipefail -# From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt -# to check. -grep COG-UK/ prevNames \ -| awk -F\| '{print $1 "\t" $0;}' \ -| sed -re 's@COG-UK/@@;' \ -| sort \ - > cogUkInGenBankIsolateToPrevName -set -o pipefail -join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \ -| cut -f 2,3 \ - > prevTree.renaming -# Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed. -# Unfortunately for now we are not getting those from $epiToPublic. 
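
# A minimal, runnable sketch of the naming schemes this commit has to reconcile and of the
# accession-extraction awk that the replacement code below keys on. The sample names are
# invented; per this script, tree names look like "isolate|acc|year" (or "acc|year") for
# GenBank, "name|EPI_ISL_xxx|date" for GISAID, and "country/isolate/year|date" for COG-UK
# (country one of England, Northern_Ireland, Scotland, Wales).
awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' <<'EOF'
Denmark/EXAMPLE-42/2021|OU000001.1|2021
MW000001.1|2020
ExampleLand/XYZ-001/2021|EPI_ISL_9999999|2021-06-30
England/EXAMPLE-ABC/2021|2021-07-04
EOF
# Expected keys (first column): OU000001.1, MW000001.1, EPI_ISL_9999999, England/EXAMPLE-ABC/2021.
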
-grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \ -| awk -F\| '$3 == ""' \ -| awk -F/ '{print $2 "\t" $0;}' \ -| sort \ - > cogUkInGenBankIsolateToPrevName -join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \ -| cut -f 2,3 \ - >> prevTree.renaming -# Look for names with EPI_IDs that have just today been mapped to public sequences. -grep EPI_ISL_ prevNames \ -| awk -F\| '{print $2 "\t" $0;}' \ -| sort \ - > epiToPrevName -set +o pipefail -grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \ -| grep -v COG-UK/ \ -| tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \ +# First, in order to catch & remove duplicates and update names that have changed, extract the +# accession from every name in the tree. +awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' prevNames \ +| subColumn 1 -miss=/dev/null stdin <(cut -f 1,2 $epiToPublic) stdout \ | sort \ - > epiToNewName -set -o pipefail -# Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the -# tree both with and without EPI ID... so renaming makes a name conflict. -# If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't -# cause conflict. -comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree -join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \ -| cut -f 2,3 \ - >> prevTree.renaming -cp /dev/null dupsToRemove -if [ -s epiToNewNameAlreadyInTree ]; then - set +o pipefail - grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \ - | grep EPI_ISL \ - | cat \ - >> dupsToRemove - set -o pipefail + > prevIdToName +# Arbitrarily pick one of each duplicated ID to keep in the tree +sort -k1,1 -u prevIdToName > prevIdToNameDeDup +# Remove whichever duplicated items were not arbitrarily chosen to remain +comm -23 prevIdToName prevIdToNameDeDup | cut -f 2 > prevNameToRemove +# Also remove items for which we have no metadata (i.e. they were removed from repository, +# or .1 was replaced by .2 etc.) +cut -f 1 prevIdToNameDeDup | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]+' > gb.acc +comm -13 <(cut -f 1 $ncbiDir/ncbi_dataset.plusBioSample.tsv) gb.acc > gb.removed.acc +if [ -s gb.removed.acc ]; then + grep -Fwf gb.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove fi -# Don't rename if the final name is already in the tree; remove the dup with the old name. -set +o pipefail -cut -f 2 prevTree.renaming \ -| grep -Fwf - prevNames | cat \ - > alreadyThere -grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \ - >> dupsToRemove -# And finally, sometimes there are duplicates due to country or date being corrected in COG-UK -# metadata. 
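
# The new duplicate handling (prevIdToName / prevIdToNameDeDup above) boils down to a
# sort/comm idiom; here is a self-contained sketch with invented accessions and names.
# sort -k1,1 -u keeps one arbitrary line per accession (matching the "arbitrarily pick one"
# comment), and comm -23 then reports the lines that were not kept, i.e. the tree names to
# prune. The same comm idiom with -13 is what finds accessions still in the tree but gone
# from today's metadata (gb.removed.acc, epi.removed.acc, ...).
printf '%s\t%s\n' \
    EPI_ISL_0000001 'CountryA/sample-1/2021|EPI_ISL_0000001|2021-01-01' \
    EPI_ISL_0000001 'CountryA/sample-1a/2021|EPI_ISL_0000001|2021-01-01' \
    OU000001.1 'CountryB/sample-2/2021|OU000001.1|2021' \
| sort > example.idToName
sort -k1,1 -u example.idToName > example.idToNameDeDup
comm -23 example.idToName example.idToNameDeDup | cut -f 2
# -> whichever of the two EPI_ISL_0000001 names was not kept; that name would be pruned.
rm -f example.idToName example.idToNameDeDup
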
-grep -vFwf dupsToRemove prevTree.renaming \ -| cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \ -| cut -d/ -f 2 \ - > cogUkIsolateStillDup -grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \ -| awk -F, '{print $1 "|" $5;}' \ - > cogDupsLatestMetadata -grep -Fwf cogUkIsolateStillDup prevNames \ -| grep -vFwf dupsToRemove \ -| grep -vFwf cogDupsLatestMetadata \ -| cat \ - >> dupsToRemove -set -o pipefail -startingProtobuf=$baseProtobuf -if [ -s dupsToRemove ]; then - startingProtobuf=prevDupTrimmed.pb - $matUtils extract -i $baseProtobuf -p -s dupsToRemove -o $startingProtobuf +cut -f 1 prevIdToNameDeDup | grep -E '^EPI_ISL_' > epi.acc +comm -13 <(zcat $gisaidDir/metadata_batch_$today.tsv.gz | cut -f 3 | sort) epi.acc > epi.removed.acc +if [ -s epi.removed.acc ]; then + grep -Fwf epi.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove +fi +cut -f 1 prevIdToNameDeDup | grep -E '^(England|Northern|Scotland|Wales)' > cog.acc +comm -13 <(cut -d, -f 1 $cogUkDir/cog_metadata.csv | sort) cog.acc > cog.removed.acc +if [ -s cog.removed.acc ]; then + grep -Fwf cog.removed.acc prevIdToName \ + | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_)' \ + | cut -f 2 >> prevNameToRemove +fi +cut -f 1 prevIdToNameDeDup \ +| grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_|England|Northern|Scotland|Wales)' \ + > cncb.acc +comm -13 <(cut -f 2 $cncbDir/cncb.metadata.tsv | sort) cncb.acc > cncb.removed.acc +if [ -s cncb.removed.acc ]; then + grep -Fwf cncb.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove fi -if [ -s prevTree.renaming ]; then - $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out - $matUtils extract -i prevRenamed.pb -u prevNames +# Remove duplicates and withdrawn sequences +if [ -s prevNameToRemove ]; then + rm -f prevDedup.pb + $matUtils extract -i $baseProtobuf \ + -p -s <(sort -u prevNameToRemove) \ + -u prevDedupNames \ + -o prevDedup.pb else - ln -sf $startingProtobuf prevRenamed.pb + ln -sf $baseProtobuf prevDedup.pb + $matUtils extract -i prevDedup.pb -u prevDedupNames fi +function gbAccCogRenaming { + # pipeline: one INSDC accession per line of stdin, acc to full name if COG-UK on stdout + grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ + | grep COG-UK/ \ + | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ + | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' +} + +function gbAccNonCogRenaming { + # pipeline: one INSDC accession per line of stdin, acc to full name if non-COG-UK on stdout + grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ + | grep -v COG-UK/ \ + | cleanGenbank \ + | tawk '{ if ($3 == "") { $3 = "?"; } + if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; } + else { print $1 "\t" $1 "|" $3; } }' \ + | sed -re 's/ /_/g' +} + +# To update names that have changed and simplify detection of new sequences to add, relate to acc. +# Strip country and year from COG-UK names to get COG acc. +awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' prevDedupNames \ +| subColumn 1 -miss=/dev/null stdin <(cut -f 1,2 $epiToPublic) stdout \ +| sed -re 's@^(England|Northern_?Ireland|Scotland|Wales)/([A-Z]+[_-]?[A-Za-z0-9]+)/[0-9]+@COG:\2@;' \ +| sort \ + > accToPrevDedupName + +# Break down accs by source -- we will need those both for renaming acc to full name and for +# figuring out which seqs the tree already has vs. which are new. 
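
# Sketch of the COG-UK key normalization used for accToPrevDedupName above (the isolate is
# invented): stripping the country and year reduces a COG-UK-style tree name to a stable
# "COG:<isolate>" key, so the same sample still matches even if its country or date was later
# corrected in cog_metadata.csv.
printf '%s\t%s\n' 'England/EXAMP-1A2B3C/2021' 'England/EXAMP-1A2B3C/2021|2021-07-04' \
| sed -re 's@^(England|Northern_?Ireland|Scotland|Wales)/([A-Z]+[_-]?[A-Za-z0-9]+)/[0-9]+@COG:\2@;'
# -> key becomes COG:EXAMP-1A2B3C; the full tree name in the second column is untouched.
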
+cut -f 1 accToPrevDedupName | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]+' > prevGbAcc +cut -f 1 accToPrevDedupName | grep -E '^COG:' | sed -re 's/^COG://;' > prevCogUk +cut -f 1 accToPrevDedupName | grep -E '^EPI_ISL_' > prevGisaid +cut -f 1 accToPrevDedupName | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|COG:|EPI_ISL_)' > prevCncb + +# GenBank renaming has both COG-UK and non-COG-UK versions: +gbAccCogRenaming < prevGbAcc > accToNewName +gbAccNonCogRenaming < prevGbAcc >> accToNewName +# Restore the COG:isolate format for non-GenBank COG-UK sequences: +grep -Fwf prevCogUk $cogUkDir/cog_metadata.csv \ +| awk -F, '{print $1 "\t" $1 "|" $5;}' \ +| sed -re 's@^(England|Northern_?Ireland|Scotland|Wales)/([A-Z]+[_-]?[A-Za-z0-9]+)/[0-9]+@COG:\2@;' \ + >> accToNewName +# GISAID: +zcat $gisaidDir/metadata_batch_$today.tsv.gz \ +| grep -Fwf prevGisaid \ +| tawk '$3 != "" {print $3 "\t" $1 "|" $3 "|" $5;}' \ + >> accToNewName +# CNCB: +grep -Fwf prevCncb $cncbDir/cncb.metadata.tsv \ +| cleanCncb \ +| sed -re 's/ /_/g;' \ +| tawk '{print $2 "\t" $1 "|" $2 "|" $10;}' \ + >> accToNewName +join -t$'\t' accToPrevDedupName <(sort accToNewName) \ +| tawk '$2 != $3 {print $2, $3;}' \ + > prevDedupNameToNewName + +$matUtils mask -i prevDedup.pb -r prevDedupNameToNewName -o prevRenamed.pb \ + >& renaming.out +rm renaming.out + # OK, now that the tree names are updated, figure out which seqs are already in there and # which need to be added. -awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ -| grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc -grep -E '^(England|Northern|Scotland|Wales)' prevNames \ -| cut -d\| -f1 > prevCogUk -awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ -| grep -E '^EPI_ISL_' > prevGisaid +# Add GenBank COG-UK sequences to prevCogUk so we don't add dups. +cut -f 6 $ncbiDir/ncbi_dataset.plusBioSample.tsv | grep COG-UK | sed -re 's@^COG-UK/@@;' >> prevCogUk # Add public sequences that have been mapped to GISAID sequences to prevGisaid. -grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid -grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid -wc -l prev* +cut -f 1 $epiToPublic >> prevGisaid +wc -l prev{GbAcc,CogUk,Gisaid,Cncb} # Exclude some sequences based on nextclade counts of reversions and other-clade mutations. zcat $gisaidDir/chunks/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > gisaid.dropoutContam zcat $ncbiDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > gb.dropoutContam zcat $cogUkDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > cog.dropoutContam +zcat $cncbDir/nextclade.full.tsv.gz \ +| $scriptDir/findDropoutContam.pl > cncb.dropoutContam cut -f 1 *.dropoutContam \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > dropoutContam.ids # Also exclude sequences with unbelievably low numbers of mutations given sampling dates. 
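
# The prevDedupNameToNewName join above reduces to this pattern (accessions and names are
# invented; plain awk with tab separators stands in for the script's tawk so the sketch runs
# anywhere): join the acc-keyed old-name and new-name tables and keep only rows where the
# name from today's metadata differs from the name already in the tree, so unchanged names
# generate no rename entry for matUtils mask -r.
join -t$'\t' \
    <(printf '%s\t%s\n' \
        EPI_ISL_0000001 'oldName/2021|EPI_ISL_0000001|2021-01-01' \
        EPI_ISL_0000002 'sameName/2021|EPI_ISL_0000002|2021-02-02') \
    <(printf '%s\t%s\n' \
        EPI_ISL_0000001 'newName/2021|EPI_ISL_0000001|2021-01-01' \
        EPI_ISL_0000002 'sameName/2021|EPI_ISL_0000002|2021-02-02') \
| awk -F'\t' -v OFS='\t' '$2 != $3 {print $2, $3;}'
# -> a single rename pair for EPI_ISL_0000001; EPI_ISL_0000002 keeps its name and is dropped.
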
zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \ | $scriptDir/findRefBackfill.pl > gisaid.refBackfill zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \ | $scriptDir/findRefBackfill.pl > gb.refBackfill zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t $'\t' <(cut -d, -f 1,5 $cogUkDir/cog_metadata.csv | tr , $'\t' | sort) - \ | $scriptDir/findRefBackfill.pl > cog.refBackfill +zcat $cncbDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ +| join -t$'\t' <(cut -f 2,10 $cncbDir/cncb.metadata.tsv | sort) - \ +| $scriptDir/findRefBackfill.pl > cncb.refBackfill cut -f 1 *.refBackfill > refBackfill.ids sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \ | grep -vFwf <(tail -n+2 $scriptDir/includeRecombinants.tsv | cut -f 1) \ > exclude.ids # Get new GenBank sequences with at least $minReal non-N bases. -# Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic. -set +o pipefail -egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \ -| grep -Fwf prevGisaid - \ -| grep -vFwf prevGbAcc \ -| cat \ - >> prevGbAcc -set -o pipefail xzcat $ncbiDir/genbank.fa.xz \ | faSomeRecords -exclude stdin <(cat prevGbAcc exclude.ids) newGenBank.fa faSize -veryDetailed newGenBank.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gbTooSmall # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too. -set +o pipefail -fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall -set -o pipefail +echo NC_045512.2 >> gbTooSmall faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa faSize newGenBank.filtered.fa # Get new COG-UK sequences with at least $minReal non-N bases. # Also exclude cog_all.fasta sequences not found in cog_metadata.csv. -comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \ - <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \ - > cogFaNotMeta -# Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above). +fastaNames $cogUkDir/cog_all.fasta.xz \ +| grep -vFwf prevCogUk \ +| grep -vFwf exclude.ids \ +| grep -Fwf <(cut -d, -f 1 $cogUkDir/cog_metadata.csv) \ + > newCogUk.accs xzcat $cogUkDir/cog_all.fasta.xz \ -| faSomeRecords -exclude stdin \ - <(cat prevCogUk cogFaNotMeta cogUkInGenBank exclude.ids) newCogUk.fa +| faSomeRecords stdin newCogUk.accs newCogUk.fa faSize -veryDetailed newCogUk.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > cogUkTooSmall faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa faSize newCogUk.filtered.fa # Get new GISAID sequences with at least $minReal non-N bases. 
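
# Every new-sequence block here (GenBank, COG-UK, GISAID, CNCB) applies the same size filter:
# count real bases and drop records with fewer than $minReal of them (20000 in this script),
# where the script reads the non-N count from column 4 of faSize -veryDetailed. A
# dependency-free sketch of that filter on toy sequences, counting plain ACGT as real:
printf '>%s\n%s\n' seqA ACGTNNACGT seqB NNNNANNNNN \
| awk -v min=5 '
    /^>/ { if (name != "" && real < min) print name; name = substr($0, 2); real = 0; next; }
    { real += gsub(/[ACGTacgt]/, ""); }
    END { if (name != "" && real < min) print name; }'
# -> seqB (only one real base), i.e. the name that would land in the *TooSmall list.
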
xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \ | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \ | faSomeRecords -exclude stdin <(cat prevGisaid exclude.ids) newGisaid.fa faSize -veryDetailed newGisaid.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gisaidTooSmall faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa faSize newGisaid.filtered.fa -# Exclude public-mapped sequences from newGisaid: -set +o pipefail -fastaNames newGisaid.filtered.fa \ -| grep -Fwf - $epiToPublic \ -| cut -f 1 \ - > newGisaid.public.names -set -o pipefail -if [ -s newGisaid.public.names ]; then - faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp - mv tmp newGisaid.filtered.fa - faSize newGisaid.filtered.fa -fi +# Get new CNCB sequences with at least $minReal non-N bases. +xzcat $cncbDir/cncb.nonGenBank.acc.fasta.xz \ +| faSomeRecords -exclude stdin <(cat prevCncb exclude.ids) newCncb.fa +faSize -veryDetailed newCncb.fa \ +| tawk '$4 < '$minReal' {print $1;}' \ + > cncbTooSmall +faSomeRecords -exclude newCncb.fa cncbTooSmall newCncb.filtered.fa +faSize newCncb.filtered.fa + cat new*.filtered.fa > new.fa faSize new.fa # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results # as a multifasta alignment (from which we can extract VCF with SNVs): #conda install -c bioconda mafft alignedFa=new.aligned.fa rm -f $alignedFa export TMPDIR=/dev/shm time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \ -i new.fa \ -o $alignedFa \ - -t 50 + -t 30 faSize $alignedFa -# Now make a renaming that keeps all the prevNames and adds full names for the new seqs. -tawk '{print $1, $1;}' prevNames > $renaming +# Now make a renaming that converts accessions back to full name|acc|year names. +cp /dev/null $renaming if [ -s newCogUk.filtered.fa ]; then - # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained. 
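
# The renaming entries appended below are all two tab-separated columns: the accession used
# in new.fa (and therefore in the VCF), and the final label that vcfRenameAndPrune will put
# on the sample at the end of this script. For the COG-UK branch right below, that reduces to
# this (the CSV row is invented; the script's awk assumes the sequence name in column 1 and
# the sample date in column 5 of cog_metadata.csv):
printf '%s\n' 'England/EXAMP-1A2B3C/2021,fieldB,fieldC,fieldD,2021-07-04' \
| awk -F, '{print $1 "\t" $1 "|" $5;}'
# -> England/EXAMP-1A2B3C/2021 <tab> England/EXAMP-1A2B3C/2021|2021-07-04
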
set +o pipefail fastaNames newCogUk.filtered.fa \ | grep -Fwf - $cogUkDir/cog_metadata.csv \ | awk -F, '{print $1 "\t" $1 "|" $5;}' \ >> $renaming set -o pipefail fi if [ -s newGenBank.filtered.fa ]; then # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year set +o pipefail fastaNames newGenBank.filtered.fa \ - | grep COG-UK/ \ | sed -re 's/[ |].*//' \ - | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ - | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ - | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ + | gbAccCogRenaming \ >> $renaming fastaNames newGenBank.filtered.fa \ - | grep -v COG-UK/ \ | sed -re 's/[ |].*//' \ - | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ - | tawk '{ if ($3 == "") { $3 = "?"; } - if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; } - else { print $1 "\t" $1 "|" $3; } }' \ - | cleanGenbank \ - | sed -re 's/ /_/g' \ + | gbAccNonCogRenaming \ >> $renaming set -o pipefail fi if [ -s newGisaid.filtered.fa ]; then zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ | tawk '$3 != "" {print $3 "\t" $1 "|" $3 "|" $5;}' \ >> $renaming fi +if [ -s newCncb.filtered.fa ]; then + cleanCncb < $cncbDir/cncb.metadata.tsv \ + | sed -re 's/ /_/g;' \ + | grep -Fwf <(fastaNames newCncb.filtered.fa) \ + | tawk '{print $2 "\t" $1 "|" $2 "|" $10;}' \ + >> $renaming +fi wc -l $renaming # Make masked VCF tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ | faToVcf -maxDiff=200 \ -excludeFile=exclude.ids \ -verbose=2 stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | vcfFilter -excludeVcf=mask.vcf stdin \ | gzip -c \ > new.masked.vcf.gz
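
# Sketch of the mask.vcf derivation above (the data rows are invented): VCF header lines pass
# through, rows whose FILTER column (field 7) is "mask" are kept with the chromosome renamed
# to the browser's NC_045512v2 sequence name, and all other rows (e.g. FILTER "caution") are
# dropped, so that vcfFilter -excludeVcf only removes the truly masked sites from the final VCF.
{
    printf '##fileformat=VCFv4.3\n'
    printf 'NC_045512.2\t635\t.\tC\tT\t.\tmask\t.\n'
    printf 'NC_045512.2\t1001\t.\tG\tA\t.\tcaution\t.\n'
} | awk -F'\t' -v OFS='\t' \
    '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }'
# -> the header line plus the position-635 row, now on NC_045512v2; the "caution" row is gone.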