#!/bin/bash
# splitGbff.sh -- one-time thing: split RefSeq Assembly _genomic.gbff.gz files
# into one file per NC_ accession.
#
# Provenance: src/hg/utils/otto/fluA/splitGbff.sh, added in commit
#   30d1e434dcd5b313f393eac770c8da2d4323d54f (angie, Fri Oct 13 11:45:13 2023 -0700):
#   "Adding download & usher tree build scripts for several non-SARS-CoV-2
#   viruses. mpxv has been going for over a year, rsv & dengue for some months
#   now. fluA is an incomplete work in progress, complicated by having 8
#   segments, greater distances, and different subtyping schemes for different
#   segments and even for different types within the segments."
#
# For every assembly accession listed in the loop below, the script:
#   1. locates the assembly's *_genomic.gbff.gz and *_assembly_report.txt under
#      $assemblyDir,
#   2. reads the segment accessions (column 7 of report rows whose column 8 is
#      "Primary Assembly") from the assembly report,
#   3. pipes the decompressed GBFF through splitGbff.pl inside
#      $fluADir/<asmAcc>/, and
#   4. renames each resulting <accession>.gbff to <accession.version>.gbff,
#      taking the versioned accession from the file's own VERSION line.
#
# Requires: tawk and splitGbff.pl (the latter next to this script).
set -beEu -o pipefail
set -x

# Resolve to an absolute path: the loop below changes directory with pushd
# before invoking splitGbff.pl, so a relative dirname (e.g. when the script is
# run as ./splitGbff.sh) would no longer point at the script directory.
fluAScriptDir=$(cd -- "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)

fluADir=/hive/data/outside/otto/fluA

assemblyDir=/hive/data/outside/ncbi/genomes

# Print an error in the script's usual banner format to stderr and abort.
die() {
    echo "*** ERROR: $* ***" >&2
    exit 1
}

for asmAcc in GCF_000865085.1 GCF_001343785.1 GCF_000865725.1 GCF_000928555.1 GCF_000864105.1 \
    GCF_000866645.1 GCF_000851145.1; do
    # GCF_000865085.1 -> GCF/000/865/085/GCF_000865085.1 (prefix of the
    # assembly directory; the directory name continues past the accession,
    # hence the trailing * in the globs below).
    asmDir=$(echo "$asmAcc" \
        | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@')

    # Expand the globs right here into arrays so the variable parts can be
    # quoted and a missing assembly is reported with a clear message.
    # Without nullglob an unmatched pattern stays literal, so test existence.
    assemblyGbff=("$assemblyDir/$asmDir"*/"$asmAcc"*_genomic.gbff.gz)
    assemblyReport=("$assemblyDir/$asmDir"*/"$asmAcc"*_assembly_report.txt)
    [[ -e "${assemblyGbff[0]}" ]] \
        || die "no _genomic.gbff.gz found for $asmAcc (tried ${assemblyGbff[0]})"
    [[ -e "${assemblyReport[0]}" ]] \
        || die "no _assembly_report.txt found for $asmAcc (tried ${assemblyReport[0]})"

    # NOTE(review): presumably columns 7 and 8 are RefSeq-Accn and
    # Assembly-Unit of NCBI's assembly report -- confirm against the format.
    segRefs=$(tawk '$8 == "Primary Assembly" {print $7;}' "${assemblyReport[@]}")

    mkdir -p "$fluADir/$asmAcc"
    pushd "$fluADir/$asmAcc"
    gunzip -c "${assemblyGbff[@]}" | "$fluAScriptDir/splitGbff.pl"
    # $segRefs is intentionally unquoted: one accession per whitespace-separated word.
    for segRef in $segRefs; do
        # Drop the ".version" suffix, however many digits it has.
        # NOTE(review): assumes splitGbff.pl names its output files by the
        # unversioned accession, as the existence check below implies.
        segRefNoDot=$(echo "$segRef" | sed -re 's/\.[0-9]+$//;')
        if [ -s "$segRefNoDot.gbff" ]; then
            # Use only the first VERSION line so a multi-record file cannot
            # turn the mv below into a multi-argument command.
            segRefInFile=$(awk '/^VERSION/ {print $2; exit;}' "$segRefNoDot.gbff")
            [[ -n "$segRefInFile" ]] \
                || die "no VERSION accession found in $fluADir/$asmAcc/$segRefNoDot.gbff"
            mv -- "$segRefNoDot.gbff" "$segRefInFile.gbff"
        else
            # Best-effort: report the missing segment and carry on with the rest.
            echo "*** ERROR: expected to find $segRefNoDot.gbff from ${assemblyGbff[*]} but it's not there ***" >&2
        fi
    done
    popd
done