b777f63dced2a62cdf363cc456ded46d569ceb46 angie Fri Dec 2 11:11:11 2022 -0800 Update nextclade output column indices diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index 3de9790..0f948c6 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -168,37 +168,37 @@ grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid wc -l prev* # Exclude some sequences based on nextclade counts of reversions and other-clade mutations. zcat $gisaidDir/chunks/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > gisaid.dropoutContam zcat $ncbiDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > gb.dropoutContam zcat $cogUkDir/nextclade.full.tsv.gz \ | $scriptDir/findDropoutContam.pl > cog.dropoutContam cut -f 1 *.dropoutContam \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > dropoutContam.ids # Also exclude sequences with unbelievably low numbers of mutations given sampling dates. 
-zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,6 \ +zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \ | $scriptDir/findRefBackfill.pl > gisaid.refBackfill -zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,6 | sort \ +zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \ | $scriptDir/findRefBackfill.pl > gb.refBackfill -zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,6 | sort \ +zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t $'\t' <(cut -d, -f 1,5 $cogUkDir/cog_metadata.csv | tr , $'\t' | sort) - \ | $scriptDir/findRefBackfill.pl > cog.refBackfill cut -f 1 *.refBackfill > refBackfill.ids sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \ | grep -vFwf <(tail -n+4 $scriptDir/includeRecombinants.tsv | cut -f 1) \ > exclude.ids # Get new GenBank sequences with at least $minReal non-N bases. # Exclude seqs in the tree with EPI IDs that have been mapped in the very latest $epiToPublic. set +o pipefail egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \ | grep -Fwf prevGisaid - \ | grep -vFwf prevGbAcc \ | cat \ >> prevGbAcc