6f6777bc15f21a8bbb6b2f402eb328a1ecb21652 jbirgmei Thu Jul 3 01:58:56 2025 -0700 Refactored a bit

diff --git src/hg/utils/otto/pubtatorDbSnp/BUILD src/hg/utils/otto/pubtatorDbSnp/doUpdate.sh
similarity index 90%
rename from src/hg/utils/otto/pubtatorDbSnp/BUILD
rename to src/hg/utils/otto/pubtatorDbSnp/doUpdate.sh
index 3b862055ea1..6e5e7d0893c 100755
--- src/hg/utils/otto/pubtatorDbSnp/BUILD
+++ src/hg/utils/otto/pubtatorDbSnp/doUpdate.sh
@@ -7,33 +7,30 @@
 absolute_log=`readlink -f $LOG_FILE`
 
 # Redirect both stdout and stderr to the log file
 exec >${LOG_FILE} 2>&1
 
 # Function to handle errors
 function on_error {
     # Send the output via email if the script fails
     echo "Error occurred, see ${absolute_log} for details" | mail -s "pubtatorDbSnp BUILD Failure - $(date)" jbirgmei@ucsc.edu
     echo "An error occurred. Email sent to jbirgmei@ucsc.edu"
     [[ -n "$jobdir" ]] && rm -rf "$jobdir" || true
     [[ -n "$out_dir" ]] && rm -rf "$out_dir" || true
     [[ -n "$splits" ]] && rm -rf "$splits" || true
-    # [[ -n "$pmid_splits" ]] && rm -rf "$pmid_splits" || true
-    # [[ -n "$pmid_splits_with_summary" ]] && rm -rf "$pmid_splits_with_summary" || true
-    # [[ -n "$new_dbs_dir" ]] && rm -rf "$new_dbs_dir" || true
     [[ -n "$VENV_DIR" ]] && rm -rf "$VENV_DIR" || true
     exit 1
 }
 
 # Trap ERR to handle script errors
 trap 'on_error' ERR
 
 dbs=("hg19" "hg38")
 
 # Check for --force flag
 force=false
 for arg in "$@"; do
     if [[ "$arg" == "--force" ]]; then
@@ -52,52 +49,50 @@
     mail -s "pubtatorDbSnp BUILD - not modified - $(date)" jbirgmei@ucsc.edu < ${LOG_FILE}
     exit 0
 fi
 
 VENV_DIR=$(mktemp -d venv_XXXXXX)
 /cluster/home/jbirgmei/opt/miniconda3/bin/python3 -m venv $VENV_DIR
 source $VENV_DIR/bin/activate
 $VENV_DIR/bin/python3 -m pip install nltk==3.8.1 sqlitedict==2.1.0 sumy==0.11.0 matplotlib==3.8.3
 zcat mutation2pubtator3.gz | awk '$3 ~ /^rs[0-9]+$/' | cut -f 1,3 | uniq | sort -u | $VENV_DIR/bin/python3 ./datamash_collapse.py > rs_to_pmid.tsv
 deactivate
 rm -rf $VENV_DIR
 
 # remove temp dirs and files:
-# rm -rf ${pmid_splits} ${pmid_splits_with_summary} ${new_dbs_dir} ${jobdir}
 cut -f 1 rs_to_pmid.tsv > rsIds.tsv
 
 splits=`mktemp -d ./rsids_splits_XXXXXXXX`
 split -l 2000 rsIds.tsv ${splits}/split_
 
 for db in $dbs
 do
   jobdir=`mktemp -d ./jobs_XXXXXXXX`
   out_dir=`mktemp -d ./out_XXXXXXXX`
   for i in ${splits}/split_*
   do
     basename=`basename $i`
     full_path=`readlink -f $i`
     out=`readlink -f ${out_dir}/${basename}.bed`
     echo "bigBedNamedItems /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-26/${db}.dbSnp155.bb -nameFile ${full_path} ${out}" >> ${jobdir}/joblist
   done
   full_jobdir=`readlink -f ${jobdir}`
-  # ssh ku "cd ${full_jobdir} && /parasol/bin/para make joblist"
   cat ${full_jobdir}/joblist | parallel -j 50
   cat ${out_dir}/* > rsids.${db}.bed
   # rm -rf $jobdir $out_dir
   export LC_ALL=C
   join -1 4 -2 1 -t $'\t' \
     <(sort -t $'\t' -k4,4 rsids.${db}.bed) \
     <(sort -t $'\t' -k1,1 rs_to_pmid.tsv) | awk -F $'\t' '{OFS="\t"; print $2, $3, $4, $1, 0, ".", $3, $4, $NF, $(NF-2), $(NF-1)}' | sort -t $'\t' -k1,1 -k2,2n > $today/track.${db}.bed
   /cluster/bin/x86_64/bedToBigBed -as=pubtatorDbSnp.as -type=bed9+2 -tab $today/track.${db}.bed /cluster/data/${db}/chrom.sizes $today/pubtatorDbSnp.${db}.bb
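Note: datamash_collapse.py is invoked by the pipeline above but is not included in this diff, and its exact output columns are not visible here. As a rough sketch only (not the repo's script), a collapse step of this shape would turn the sorted "PMID<TAB>rsID" pairs produced by the zcat/awk/cut/sort stage into one "rsID<TAB>comma-separated PMIDs" row per rsID, roughly what `datamash -g 2 collapse 1` does:

#!/usr/bin/env python3
# Hypothetical sketch, NOT the repo's datamash_collapse.py: group the rsID
# column of "PMID<TAB>rsID" stdin lines and emit one tab-separated row per
# rsID with a comma-joined list of its PMIDs.
import sys
from collections import defaultdict

pmids_by_rsid = defaultdict(list)
for line in sys.stdin:
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 2:
        continue  # skip malformed lines
    pmid, rsid = fields[0], fields[1]
    pmids_by_rsid[rsid].append(pmid)

for rsid in sorted(pmids_by_rsid):
    print(rsid, ",".join(pmids_by_rsid[rsid]), sep="\t")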