a087918aa0bbba12853825bd8a588ab26b0b70bd
angie
  Sun Feb 4 09:24:02 2024 -0800
Update to nextclade v3 output column indices.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index 39cd74e..812d54e 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -193,40 +193,40 @@
 wc -l prev{GbAcc,CogUk,Gisaid,Cncb}
 
 # Exclude some sequences based on nextclade counts of reversions and other-clade mutations.
 zcat $gisaidDir/chunks/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > gisaid.dropoutContam
 zcat $ncbiDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > gb.dropoutContam
 zcat $cogUkDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > cog.dropoutContam
 zcat $cncbDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > cncb.dropoutContam
 cut -f 1 *.dropoutContam \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     > dropoutContam.ids
 # Also exclude sequences with unbelievably low numbers of mutations given sampling dates.
-zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 2,11 \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \
 | $scriptDir/findRefBackfill.pl > gisaid.refBackfill
-zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
+zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \
 | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \
 | $scriptDir/findRefBackfill.pl > gb.refBackfill
-zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
+zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \
 | join -t $'\t' <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1,5 | tr , $'\t' | sort) - \
 | $scriptDir/findRefBackfill.pl > cog.refBackfill
-zcat $cncbDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
+zcat $cncbDir/nextclade.full.tsv.gz | cut -f 2,11 | sort \
 | join -t$'\t' <(cut -f 2,10 $cncbDir/cncb.metadata.tsv | sort) - \
 | $scriptDir/findRefBackfill.pl > cncb.refBackfill
 cut -f 1 *.refBackfill > refBackfill.ids
 curl -sS $lineageProposalsRecombinants  | tail -n+2 | cut -f 1 \
 | sed -re 's@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@;
            s/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/;' \
     > tmp
 grep -Fwf tmp $epiToPublic | cut -f 2 | grep -E '^[A-Z]{2}[0-9]{6}' > tmp2
 sort -u tmp tmp2 > lpRecombinantIds
 rm tmp tmp2
 sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \
 | grep -vFwf <(tail -n+2 $scriptDir/includeRecombinants.tsv | cut -f 1) \
 | grep -vFwf lpRecombinantIds \
     > exclude.ids