986c4ede954e44904eb314772fb2cf83a48d307c max Wed May 6 06:24:47 2026 -0700 varFreqs: lift GenomeAsia (gasp + gaspIndel) GRCh37 -> hg38 Both subtracks were served at /gbdb/hg38/ but the upstream callset is GRCh37 (caught in QA, see #36642 note 2026-05-04). Lifted with CrossMap using hg19ToHg38.over.chain.gz; recipe matches tishkoff180 / mxbFreq. gasp (SNVs): 66,236,516 -> 66,222,771 (99.98%; 6,240 unmapped + 7,505 alt/random) gaspIndel: 4,415,156 -> 4,410,871 (99.90%; 3,332 unmapped + 953 alt/random) New driver script: scripts/varFreqs/gaspLift.sh. gaspIndel bigDataUrl renamed from All.indels.annot.cont_withmaf.vcf.gz to ga100k.indels.vcf.gz (old name was a verbatim copy of the upstream download name). varFreqsAll combined bigBed regenerated to fold in the corrected coordinates (36.5 GB, 1,166,451,644 items, 125 fields). refs #36642 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/varFreqs/gaspLift.sh src/hg/makeDb/scripts/varFreqs/gaspLift.sh new file mode 100755 index 00000000000..fc0430d70e1 --- /dev/null +++ src/hg/makeDb/scripts/varFreqs/gaspLift.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# gaspLift.sh - lift the GenomeAsia Pilot 100k VCFs from GRCh37 to hg38 +# usage: gaspLift.sh <inVcf.gz> <outBase> +# outputs: <outBase>.vcf.gz (sorted, bgzipped) and .tbi +# +# Recipe (same as tishkoff180): +# bare-name -> chr-prefix rename -> CrossMap.py vcf with hg19ToHg38 +# -> drop alt/random/Un/fix landings -> fix contig= header lines +# -> bcftools sort -> bgzip + tabix. +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "usage: $0 <inVcf.gz> <outBase>" >&2 + exit 1 +fi + +IN_VCF="$1" +OUT_BASE="$2" +WORK_DIR="$(dirname "$OUT_BASE")" +TMP_BASE="$WORK_DIR/$(basename "$OUT_BASE").tmp" + +CHAIN=/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz +HG38_FA=/hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa +RENAME=/hive/data/genomes/hg38/bed/varFreqs/ga100k/lift/chrRename.txt + +mkdir -p "$WORK_DIR" + +echo "[$(date '+%H:%M:%S')] Step 1: rename bare chromosomes -> chr-prefixed" +# The indel VCF has a malformed #CHROM line: it declares FORMAT but has no +# sample columns and no FORMAT field on data lines. Strip the trailing +# FORMAT before bcftools sees it. NF=8 is a no-op for clean sites-only files. +zcat "$IN_VCF" \ + | awk 'BEGIN{OFS="\t"} /^#CHROM/{NF=8} {print}' \ + | bcftools annotate --rename-chrs "$RENAME" --threads 4 -Oz \ + -o "$TMP_BASE.chr.vcf.gz" - + +echo "[$(date '+%H:%M:%S')] Step 2: CrossMap.py vcf hg19 -> hg38" +CrossMap.py vcf "$CHAIN" \ + "$TMP_BASE.chr.vcf.gz" \ + "$HG38_FA" \ + "$TMP_BASE.hg38.vcf" \ + 2> "$TMP_BASE.crossmap.log" +tail -20 "$TMP_BASE.crossmap.log" + +echo "[$(date '+%H:%M:%S')] Step 3: filter to canonical chr1-22, X, Y, M and fix contig= header" +awk 'BEGIN{OFS="\t"} + /^##contig=/ {sub(/<ID=/,"<ID=chr"); print; next} + /^#/ {print; next} + $1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/ {print}' \ + "$TMP_BASE.hg38.vcf" > "$TMP_BASE.canon.vcf" + +echo "[$(date '+%H:%M:%S')] Step 4: bcftools sort" +bcftools sort "$TMP_BASE.canon.vcf" -Oz -m 8G -T /data/tmp/ \ + -o "$OUT_BASE.vcf.gz" + +echo "[$(date '+%H:%M:%S')] Step 5: tabix" +tabix -p vcf "$OUT_BASE.vcf.gz" + +echo "[$(date '+%H:%M:%S')] Counts:" +INPUT_N=$(zcat "$IN_VCF" | grep -cv '^#' || true) +LIFTED_N=$(grep -cv '^#' "$TMP_BASE.hg38.vcf" || true) +UNMAP_N=$([ -f "$TMP_BASE.hg38.vcf.unmap" ] && grep -cv '^#' "$TMP_BASE.hg38.vcf.unmap" || echo 0) +FINAL_N=$(zcat "$OUT_BASE.vcf.gz" | grep -cv '^#' || true) +DROPPED_NONCANON=$((LIFTED_N - FINAL_N)) +echo " input : $INPUT_N" +echo " lifted : $LIFTED_N" +echo " unmapped : $UNMAP_N" +echo " drop non-canon: $DROPPED_NONCANON" +echo " final : $FINAL_N" + +# clean up temp files (keep .crossmap.log and .unmap for QA) +rm -f "$TMP_BASE.chr.vcf.gz" "$TMP_BASE.hg38.vcf" "$TMP_BASE.canon.vcf" + +echo "[$(date '+%H:%M:%S')] Done. Output: $OUT_BASE.vcf.gz"