a4d0ac69ad1b6dd12a46266be6d95249c405d208 angie Sat Dec 23 19:18:43 2023 -0800
Remove manual patch for BioSample since records have been fixed by ENA.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 57fba4d..6339a09 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -62,37 +62,30 @@
 minSeqCount=6800000
 metadataLC=$(wc -l < ncbi_dataset.tsv)
 if (($metadataLC < $minSeqCount)); then
     echo "TOO FEW SEQUENCES: $metadataLC lines of ncbi_dataset.tsv, expected at least $minSeqCount"
     exit 1
 fi
 
 time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample_report.jsonl \
     | uniq > gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
-# Manually patch some GB-to-BioSample associations that somehow got mangled at ENA, until
-# they fix them and the fix percolates through to NCBI...
-grep -vFwf <(cut -f 1 $ottoDir/ncbi.2022-06-25/gbToBioSample.changes.tsv) \
-    ncbi_dataset.plusBioSample.tsv > tmp
-sort tmp $ottoDir/ncbi.2022-06-25/gbToBioSample.patch.tsv > ncbi_dataset.plusBioSample.tsv
-rm tmp
-
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
     | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
     | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
     > genbank.maybeDups.fa
 time fastaNames genbank.maybeDups.fa | awk '{print $1 "\t" $0;}' > gb.rename
 time faUniqify genbank.maybeDups.fa stdout \
     | faRenameRecords stdin gb.rename stdout \
     | xz -T 20 \
     > genbank.fa.xz
 
 # Run pangolin and nextclade on sequences that are new since yesterday