50d9453206becf6203aa41b831cca95ba6b5b58d max Thu Sep 22 10:31:16 2022 -0700 adding decode and 1k Genomes genetic maps, refs #30044 diff --git src/hg/makeDb/doc/hg38/recombRate.txt src/hg/makeDb/doc/hg38/recombRate.txt new file mode 100644 index 0000000..c72538b --- /dev/null +++ src/hg/makeDb/doc/hg38/recombRate.txt @@ -0,0 +1,91 @@ +# deCODE and 1000 Genomes recombination rates super track +# Max, Thu Sep 22 10:23:39 PDT 2022 +cd /hive/data/genomes/hg38/bed/recombRate +mkdir orig +zcat orig/aau1043_datas1.gz | grep -v '^#' | grep -v cMperMb | less | cut -f1-4 > recombPat.bedgraph +zcat orig/aau1043_datas2.gz | grep -v '^#' | grep -v cMperMb | less | cut -f1-4 > recombMat.bedgraph +zcat orig/aau1043_datas3.gz | grep -v '^#' | grep -v cMperMb | less | cut -f1-4 > recombAvg.bedgraph +bedGraphToBigWig recombPat.bedgraph ../../chrom.sizes recombPat.bw +bedGraphToBigWig recombAvg.bedgraph ../../chrom.sizes recombAvg.bw +bedGraphToBigWig recombMat.bedgraph ../../chrom.sizes recombMat.bw + +# crossover events +zless orig/aau1043_datas4.gz | grep -v '^#' | grep -v medsnp > events.bed +bedSort events.bed events.bed +bedToBigBed events.bed -as=${HOME}/kent/src/hg/makeDb/autoSql/recombEvents.as -tab ../../chrom.sizes events.bb -type=bed4+ + +# de novo mutations +less orig/aau1043_datas5.tsv | grep -v \# | grep -v Chr | tawk '{print($1,$2,$2+1, $3">"$4, $5, $6, $7, $8)}' > denovo.bed +bedSort denovo.bed denovo.bed +bedToBigBed denovo.bed ../../chrom.sizes recombDenovo.bb -type=bed4+ -as=${HOME}/kent/src/hg/makeDb/autoSql/recombDenovo.as -tab + +# 1000 Genomes subtrack +# the 1000 genomes map, received makeDocs from poruloh@broadinstitute.org +# the following makedocs after ---- are NOT from UCSC, they are from Po-Ru: +----- +I believe I downloaded the hg19 map from either the HapMap website or the IMPUTE2 website (maybe the same?): + +https://ftp.ncbi.nlm.nih.gov/hapmap/recombination/latest/rates/ +https://mathgen.stats.ox.ac.uk/impute/1000GP_Phase3.html + + +### liftOver + post-processing script: + +HG19_BED=genetic_map_hg19_withX.bed + +# prepare UCSC .bed file with hg19 map data +zcat genetic_map_hg19_withX.txt.gz | awk -v OFS=$'\t' 'NR>1 {if ($1==23) $1="X"; print "chr"$1,$2-1,$2,"chr"$1","$2","$4}' > $HG19_BED + +# lift to hg38 +HG38_BED_RAW=genetic_map_hg38_withX.bed +HG38_BED_UNMAPPED=genetic_map_hg38_withX.bed.unmapped +/home/pl88/liftOver/liftOver $HG19_BED /home/pl88/liftOver/hg19ToHg38.over.chain $HG38_BED_RAW $HG38_BED_UNMAPPED + +# filter sites that moved to different chromosomes; sort in hg38 order +HG38_BED_CLEAN=genetic_map_hg38_withX.clean.bed +sed 's/,/\t/g' $HG38_BED_RAW | awk '$1==$4' | sed 's/X/23/g' | sed 's/chr//g' | cut -f1,3- | sort -k1,1n -k2,2n > $HG38_BED_CLEAN + +# for each pair of consecutive hg38 map positions: +# - if base pair span in hg38 is within 20% of base pair span in hg19, keep relative cM coordinates +# - else, set recombination rate to default 1cM/Mb (with 1cM cap on cM distance between positions) +awk ' +BEGIN {prevChr=0; prevPos=0; prevPosHg19=0; prevcM19=0; cM38 = 0; +print "chr position COMBINED_rate(cM/Mb) Genetic_Map(cM)"} +{ +chr=$1; pos=$2; posHg19=$4; cM19=$5; +if (chr==prevChr) { + deltaPosRatio = (pos-prevPos) / (posHg19-prevPosHg19); + if (0.8 < deltaPosRatio && deltaPosRatio < 1.2) { + rate = (cM19-prevcM19) / (pos-prevPos) * 1e6; cM38 += cM19-prevcM19; + } + else if (-1.2 < deltaPosRatio && deltaPosRatio < -0.8) { + rate = (prevcM19-cM19) / (pos-prevPos) * 1e6; cM38 += prevcM19-cM19; + } + else { + cMdiff = (pos-prevPos)*1e-6; + if (cMdiff > 1) { + print "WARNING_LARGE_CMDIFF",prevChr,prevPos,prevPosHg19,posHg19-prevPosHg19,chr,pos,posHg19,pos-prevPos,(pos-prevPos)/(posHg19-prevPosHg19) > "/dev/stderr"; + cMdiff = 1; + } + rate = cMdiff / (pos-prevPos); cM38 += cMdiff; + print "BAD",prevChr,prevPos,prevPosHg19,posHg19-prevPosHg19,chr,pos,posHg19,pos-prevPos,(pos-prevPos)/(posHg19-prevPosHg19) > "/dev/stderr"; + } +} +else { + rate=0; cM38 = 0; +} +printf("%s %d %.10f %.15f\n",chr,pos,rate,cM38); +prevChr=chr; prevPos=pos; prevPosHg19=posHg19; prevcM19=cM19; +}' $HG38_BED_CLEAN 2> genetic_map_hg38_withX.BAD.txt | gzip > genetic_map_hg38_withX.txt.gz + +rm $HG19_BED $HG38_BED_RAW $HG38_BED_CLEAN $HG38_BED_UNMAPPED +---- + +Now the UCSC makedocs, which are very simple: + +cd orig +wget https://storage.googleapis.com/broad-alkesgroup-public/Eagle/downloads/tables/genetic_map_hg38_withX.txt.gz +cd .. +less orig/genetic_map_hg38_withX.txt.gz | grep -v '^\chr' | sed -e 's/^/chr/' | sed -e 's/chr23/chrX/' | awk 'BEGIN {OFS="\t"; prevChr=0; prevPos=0; } {if ($1==prevChr) { print ($1, prevPos, $2, $3); } else { prevPos=0; print ($1, "0", $2, $3) } prevPos = $2; prevChr=$1;}' | less > recomb1000GAvg.bedgraph +bedSort recomb1000GAvg.bedgraph recomb1000GAvg.bedgraph +bedGraphToBigWig recomb1000GAvg.bedgraph ../../chrom.sizes recomb1000GAvg.bw