b777f63dced2a62cdf363cc456ded46d569ceb46
angie
  Fri Dec 2 11:11:11 2022 -0800
Update nextclade output column indices

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index 3de9790..0f948c6 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -168,37 +168,37 @@
 grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
 grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
 wc -l prev*
 
 # Exclude some sequences based on nextclade counts of reversions and other-clade mutations.
 zcat $gisaidDir/chunks/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > gisaid.dropoutContam
 zcat $ncbiDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > gb.dropoutContam
 zcat $cogUkDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > cog.dropoutContam
 cut -f 1 *.dropoutContam \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     > dropoutContam.ids
 # Also exclude sequences with unbelievably low numbers of mutations given sampling dates.
-zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,6 \
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \
 | $scriptDir/findRefBackfill.pl > gisaid.refBackfill
-zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,6 | sort \
+zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
 | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \
 | $scriptDir/findRefBackfill.pl > gb.refBackfill
-zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,6 | sort \
+zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
 | join -t $'\t' <(cut -d, -f 1,5 $cogUkDir/cog_metadata.csv | tr , $'\t' | sort) - \
 | $scriptDir/findRefBackfill.pl > cog.refBackfill
 cut -f 1 *.refBackfill > refBackfill.ids
 sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \
 | grep -vFwf <(tail -n+4 $scriptDir/includeRecombinants.tsv | cut -f 1) \
     > exclude.ids
 
 # Get new GenBank sequences with at least $minReal non-N bases.
 # Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic.
 set +o pipefail
 egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
 | grep -Fwf prevGisaid - \
 | grep -vFwf prevGbAcc \
 | cat \
     >> prevGbAcc