8a0ebbbae627da20c8c7c65aa57d297e5a19529a angie Tue Jun 3 11:08:52 2025 -0700 Tweak download dir name. Update hgdownload2 --> hgdownload3 and tolerate hgdownload rsync failure. diff --git src/hg/utils/otto/mtb/getNcbiMtb.sh src/hg/utils/otto/mtb/getNcbiMtb.sh index 7a90e0d813c..567b1a12f4c 100755 --- src/hg/utils/otto/mtb/getNcbiMtb.sh +++ src/hg/utils/otto/mtb/getNcbiMtb.sh @@ -32,31 +32,31 @@ rm -f ncbi_dataset.zip sleep $retryDelay # Double the delay to give NCBI progressively more time retryDelay=$(($retryDelay * 2)) fi done if [[ ! -s ncbi_dataset.zip ]]; then echo "datasets query failed $maxAttempts times; quitting." exit 1 fi unzip ncbi_dataset.zip # Extract metadata for single-sequence GCA assemblies only from assembly_data_report.jsonl jq -c -r 'if .assemblyStats.numberOfComponentSequences == 1 then ([.accession, .assemblyInfo.biosample.accession, (.assemblyInfo.biosample.attributes[] | select(.name == "collection_date") | .value) // "", (.assemblyInfo.biosample.attributes[] | select(.name == "geo_loc_name") | .value) // "", (.assemblyInfo.biosample.attributes[] | select(.name == "host") | .value) // "", .assemblyInfo.assemblyName, .assemblyStats.totalSequenceLength, .assemblyInfo.bioprojectAccession // "", (if .assemblyInfo.biosample.sampleIds != null then .assemblyInfo.biosample.sampleIds[] | select(.db == "SRA") | .value else "" end) // "", (.assemblyInfo.biosample.attributes[] | select(.name == "strain") | .value) // "", .organism.organismName, .organism.infraspecificNames.strain // "", .organism.infraspecificNames.isolate // ""] | join("\t")) else "" end' \ ncbi_dataset/data/assembly_data_report.jsonl \ -| g ^GCA \ +| grep ^GCA \ | sort \ > assembly_metadata.tsv # Unfortunately the correspondence between assembly accession and fasta genbank accession is # encoded in the fasta directory/filenames, so we have to resolve that one GCA directory at a time. for d in $(cut -f 1 assembly_metadata.tsv); do echo -n -e "$d\t" head -1 ncbi_dataset/data/$d/$d*.fna | sed -re 's/^>//; s/ .*//;' done | sort > assemblyToNuc.tsv # Build fasta using only the GCA assemblies. for d in $(cut -f 1 assembly_metadata.tsv); do sed -re '/^>/ s/ .*//' ncbi_dataset/data/$d/$d*.fna done \ | faFilter -minSize=$minSize stdin stdout \ | xz -T 20 > genbank.fa.xz