47541d92430fc9e07cd0bab4ba386faaa1f3d76c angie Sat Dec 23 18:27:34 2023 -0800 Remove duplicates between COG-UK and INSDC. Fix name format that omitted INSDC accession. Get INSDC accessions for COG-UK sequences in lineageProposalsRecombinants file. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh index d4c8ed0..39cd74e 100755 --- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -86,47 +86,59 @@ cut -f 1 prevIdToNameDeDup | grep -E '^(England|Northern|Scotland|Wales)' > cog.acc comm -13 <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1 | sort) cog.acc > cog.removed.acc if [ -s cog.removed.acc ]; then grep -Fwf cog.removed.acc prevIdToName \ | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_)' \ | cut -f 2 >> prevNameToRemove fi cut -f 1 prevIdToNameDeDup \ | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_|England|Northern|Scotland|Wales)' \ > cncb.acc comm -13 <(cut -f 2 $cncbDir/cncb.metadata.tsv | sort) cncb.acc > cncb.removed.acc if [ -s cncb.removed.acc ]; then grep -Fwf cncb.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove fi +# If a sequence is in both INSDC and COG-UK, keep the INSDC and remove the COG-UK as dup. +set +o pipefail +cut -f 6 $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| grep ^COG-UK/ \ +| sed -re 's@COG-UK/@@' \ +| grep -Fwf - prevNames \ +| grep -vE '\|[A-Z]{2}[0-9]{6}\.[0-9]+\|' \ +| grep -vE '\|EPI_ISL_[0-9]+\|' \ +| cat \ + >> prevNameToRemove +set -o pipefail + # Remove duplicates and withdrawn sequences if [ -s prevNameToRemove ]; then rm -f prevDedup.pb $matUtils extract -i $baseProtobuf \ -p -s <(sort -u prevNameToRemove) \ -u prevDedupNames \ -o prevDedup.pb else ln -sf $baseProtobuf prevDedup.pb $matUtils extract -i prevDedup.pb -u prevDedupNames fi function gbAccCogRenaming { # pipeline: one INSDC accession per line of stdin, acc to full name if COG-UK on stdout grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | grep COG-UK/ \ - | tawk '{ if ($4 != "") { print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3; } else { if ($3 != "") { print $1, $6 "/" $3 "|" $1 "|" $3; } else { print $1, $6 "|?"; } } }' \ + | tawk '{ if ($4 != "") { print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3; } else { if ($3 != "") { print $1, $6 "/" $3 "|" $1 "|" $3; } else { print $1, $6 "|" $1 "|?"; } } }' \ | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' } function gbAccNonCogRenaming { # pipeline: one INSDC accession per line of stdin, acc to full name if non-COG-UK on stdout grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | grep -v COG-UK/ \ | cleanGenbank \ | tawk '{ if ($3 == "") { $3 = "?"; } if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; } else { print $1 "\t" $1 "|" $3; } }' \ | sed -re 's/ /_/g' } # To update names that have changed and simplify detection of new sequences to add, relate to acc. @@ -194,35 +206,40 @@ > dropoutContam.ids # Also exclude sequences with unbelievably low numbers of mutations given sampling dates. zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \ | $scriptDir/findRefBackfill.pl > gisaid.refBackfill zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \ | $scriptDir/findRefBackfill.pl > gb.refBackfill zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t $'\t' <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1,5 | tr , $'\t' | sort) - \ | $scriptDir/findRefBackfill.pl > cog.refBackfill zcat $cncbDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \ | join -t$'\t' <(cut -f 2,10 $cncbDir/cncb.metadata.tsv | sort) - \ | $scriptDir/findRefBackfill.pl > cncb.refBackfill cut -f 1 *.refBackfill > refBackfill.ids +curl -sS $lineageProposalsRecombinants | tail -n+2 | cut -f 1 \ +| sed -re 's@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@; + s/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/;' \ + > tmp +grep -Fwf tmp $epiToPublic | cut -f 2 | grep -E '^[A-Z]{2}[0-9]{6}' > tmp2 +sort -u tmp tmp2 > lpRecombinantIds +rm tmp tmp2 sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \ | grep -vFwf <(tail -n+2 $scriptDir/includeRecombinants.tsv | cut -f 1) \ -| grep -vFwf <(curl -sS $lineageProposalsRecombinants | tail -n+2 | cut -f 1 \ - | sed -re 's/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/; - s@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@;') \ +| grep -vFwf lpRecombinantIds \ > exclude.ids # Get new GenBank sequences with at least $minReal non-N bases. xzcat $ncbiDir/genbank.fa.xz \ | faSomeRecords -exclude stdin <(cat prevGbAcc exclude.ids) newGenBank.fa faSize -veryDetailed newGenBank.fa \ | tawk '$4 < '$minReal' {print $1;}' \ > gbTooSmall # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too. echo NC_045512.2 >> gbTooSmall faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa faSize newGenBank.filtered.fa # Get new COG-UK sequences with at least $minReal non-N bases. # Also exclude cog_all.fasta sequences not found in cog_metadata.csv.