4d34cfc382f4e45a7119129b2baa6a8645b80542
angie
Fri Oct 1 14:21:27 2021 -0700
Max pointed out that 8 threads would be much more reasonable for xz than 50.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 4a15f79..357d594 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -107,31 +107,31 @@
 grep -Fwf bioSample.missingFromJson.txt gb.bioSample.eutils.tab \
     >> gb.bioSample.tab

 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log

 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
 | sort > gbToDate

 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
 | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
-| xz -T 50 \
+| xz -T 8 \
     > genbank.fa.xz

 # Run pangolin and nextclade on sequences that are new since yesterday
 export TMPDIR=/dev/shm
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names
 splitDir=splitForNextclade
 rm -rf $splitDir
 mkdir $splitDir
 if [ -e ../ncbi.latest/nextclade.tsv ]; then
     cp ../ncbi.latest/nextclade.tsv .
     cut -f 1 nextclade.tsv | sort -u > nextclade.prev.names
     comm -23 gb.names nextclade.prev.names > nextclade.names
     faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa
     faSplit about nextclade.fa 30000000 $splitDir/chunk
 else