a087918aa0bbba12853825bd8a588ab26b0b70bd angie Sun Feb 4 09:24:02 2024 -0800 Update to nextclade v3 output column indices. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index 39cd74e..812d54e 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -193,40 +193,40 @@ wc -l prev{GbAcc,CogUk,Gisaid,Cncb} # Exclude some sequences based on nextclade counts of reversions and other-clade mutations. zcat $gisaidDir/chunks/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > gisaid.dropoutContam zcat $ncbiDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > gb.dropoutContam zcat $cogUkDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > cog.dropoutContam zcat $cncbDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > cncb.dropoutContam cut -f 1 *.dropoutContam \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > dropoutContam.ids # Also exclude sequences with unbelievably low numbers of mutations given sampling dates. -zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \ +zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 2,11 \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \ | $scriptDir/findRefBackfill.pl > gisaid.refBackfill -zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ +zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \ | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \ | $scriptDir/findRefBackfill.pl > gb.refBackfill -zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ +zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \ | join -t $'\t' <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1,5 | tr , $'\t' | sort) - \ | $scriptDir/findRefBackfill.pl > cog.refBackfill -zcat $cncbDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ +zcat $cncbDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \ | join -t$'\t' <(cut -f 2,10 $cncbDir/cncb.metadata.tsv | sort) - \ | $scriptDir/findRefBackfill.pl > cncb.refBackfill cut -f 1 *.refBackfill > refBackfill.ids curl -sS $lineageProposalsRecombinants | tail -n+2 | cut -f 1 \ | sed -re 's@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@; s/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/;' \ > tmp grep -Fwf tmp $epiToPublic | cut -f 2 | grep -E '^[A-Z]{2}[0-9]{6}' > tmp2 sort -u tmp tmp2 > lpRecombinantIds rm tmp tmp2 sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \ | grep -vFwf <(tail -n+2 $scriptDir/includeRecombinants.tsv | cut -f 1) \ | grep -vFwf lpRecombinantIds \ > exclude.ids