7d169d1245946d2eb9cf2d6f65adf2fc56ef0b35 chmalee Wed Aug 5 14:06:59 2020 -0700 Stage genome in a bottle structural variants on dev, refs #24349 diff --git src/hg/utils/otto/dbVar/checkNstd175.sh src/hg/utils/otto/dbVar/checkNstd175.sh new file mode 100755 index 0000000..67846c9 --- /dev/null +++ src/hg/utils/otto/dbVar/checkNstd175.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Do not modify this script, modify the source tree copy: +# src/hg/utils/dbVar/checkNstd175.sh + +set -beEu -o pipefail +WORKDIR=$1 +today=`date +%F` + +cleanUpOnError () { + echo "dbVar nstd175 build failed" +} + +trap cleanUpOnError ERR +trap "cleanUpOnError; exit 1" SIGINT SIGTERM +umask 002 + +mkdir -p ${WORKDIR}/${today}/giab +cd ${WORKDIR}/${today}/giab +rm -f ftp.giab.rsp +echo "user anonymous otto@soe.ucsc.edu +cd /pub/dbVar/data/Homo_sapiens/by_study/gvf +ls nstd175* +bye" > ftp.giab.rsp + +# reorganize results files +rm -f ls.check +rm -f old.release.list +if [ -e prev.release.list ] +then + mv prev.release.list old.release.list +else + touch release.list +fi +cp -p release.list prev.release.list +rm -f release.list + +# connect and list a directory, result to file: ls.check +ftp -n -v -i ftp.ncbi.nlm.nih.gov 2>&1 < ftp.giab.rsp &> ls.check + +# fetch the release directory names from the ls.check result file +grep "gvf.gz" ls.check | sort > release.list || echo "Error - no gvf files found" +touch release.list +chmod o+w release.list + +# verify we are getting a proper list +WC=`cat release.list | wc -l` +if [ "${WC}" -lt 1 ]; then + echo "potential error in dbVar GIAB sv data, no gvf files found. Check ls.check in ${WORKDIR}/${today}/giab" + cleanUpOnError + exit 255 +fi + +# see if anything is changing, if so, email notify, download, and build +diff prev.release.list release.list > release.diff || true +WC=`cat release.diff | wc -l` +if [ "${WC}" -gt 1 ]; then + echo -e "New dbVar GIAB Structural Variant update at:\n" \ + "ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/gvf/nstd175.*\n" + + for grc in GRCh38 GRCh37 + do + db="" + if [ ${grc} == "GRCh37" ]; then + db="hg19" + else + db="hg38" + fi + mkdir -p ${db} + pushd ${db} > /dev/null + echo "processing nstd175 for ${db}" + hgsql -Ne 'select 0, ca.alias, size, ca.chrom, size from chromInfo ci join chromAlias ca on ci.chrom = ca.chrom where source = "refseq"' ${db} > ${db}.lift + wget -N -q "ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/gvf/nstd175.${grc}.variant*.gvf.gz" + zcat nstd175.${grc}.* | ../../../processNstd175.py stdin ${db}.lift | sort -k1,1 -k2,2n | bedClip -truncate stdin /hive/data/genomes/${db}/chrom.sizes stdout > giabSv.bed + bedToBigBed -type=bed9+11 -as=../../../giabSv.as -tab giabSv.bed /hive/data/genomes/${db}/chrom.sizes giabSv.bb + cp giabSv.bb ${WORKDIR}/release/${db}/giabSv.bb + popd > /dev/null + done +fi +echo "dbVar nstd175 update done"