src/hg/utils/otto/sarscov2phylo/getNcbi.sh a4d0ac69ad1b6dd12a46266be6d95249c405d208

a4d0ac69ad1b6dd12a46266be6d95249c405d208
angie
  Sat Dec 23 19:18:43 2023 -0800
Remove manual patch for BioSample since records have been fixed by ENA.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 57fba4d..6339a09 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -62,37 +62,30 @@
 minSeqCount=6800000
 metadataLC=$(wc -l < ncbi_dataset.tsv)
 if (($metadataLC < $minSeqCount)); then
     echo "TOO FEW SEQUENCES: $metadataLC lines of ncbi_dataset.tsv, expected at least $minSeqCount"
     exit 1
 fi
 
 time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample_report.jsonl \
     | uniq > gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
-# Manually patch some GB-to-BioSample associations that somehow got mangled at ENA, until
-# they fix them and the fix percolates through to NCBI...
-grep -vFwf <(cut -f 1 $ottoDir/ncbi.2022-06-25/gbToBioSample.changes.tsv) \
-    ncbi_dataset.plusBioSample.tsv > tmp
-sort tmp $ottoDir/ncbi.2022-06-25/gbToBioSample.patch.tsv > ncbi_dataset.plusBioSample.tsv
-rm tmp
-
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
 | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
 | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
     > genbank.maybeDups.fa
 time fastaNames genbank.maybeDups.fa | awk '{print $1 "\t" $0;}' > gb.rename
 time faUniqify genbank.maybeDups.fa stdout \
 | faRenameRecords stdin gb.rename stdout \
 | xz -T 20 \
     > genbank.fa.xz
 
 # Run pangolin and nextclade on sequences that are new since yesterday