src/hg/utils/otto/sarscov2phylo/getNcbi.sh 4d34cfc382f4e45a7119129b2baa6a8645b80542

4d34cfc382f4e45a7119129b2baa6a8645b80542
angie
  Fri Oct 1 14:21:27 2021 -0700
Max pointed out that 8 threads would be much more reasonable for xz than 50.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 4a15f79..357d594 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -107,31 +107,31 @@
 grep -Fwf bioSample.missingFromJson.txt gb.bioSample.eutils.tab \
     >> gb.bioSample.tab
 
 # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting
 # sample collection dates:
 $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \
     > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log
 
 # Make a file for joining collection date with ID:
 tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \
 | sort > gbToDate
 
 # Replace FASTA headers with reconstructed names from enhanced metadata.
 time cleanGenbank < ncbi_dataset/data/genomic.fna \
 | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \
-| xz -T 50 \
+| xz -T 8 \
     > genbank.fa.xz
 
 # Run pangolin and nextclade on sequences that are new since yesterday
 export TMPDIR=/dev/shm
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names
 splitDir=splitForNextclade
 rm -rf $splitDir
 mkdir $splitDir
 if [ -e ../ncbi.latest/nextclade.tsv ]; then
     cp ../ncbi.latest/nextclade.tsv .
     cut -f 1 nextclade.tsv | sort -u > nextclade.prev.names
     comm -23 gb.names nextclade.prev.names > nextclade.names
     faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa
     faSplit about nextclade.fa 30000000 $splitDir/chunk
 else