8a687b59577f35717a86b46b3a9f4e4ffc843ff5 hiram Mon Sep 4 14:13:21 2023 -0700 SNP view construction script for 104-way no redmine diff --git src/hg/makeDb/doc/mito/snpView.txt src/hg/makeDb/doc/mito/snpView.txt new file mode 100644 index 0000000..523489f --- /dev/null +++ src/hg/makeDb/doc/mito/snpView.txt @@ -0,0 +1,158 @@ +### making the snpView for the mito browser + +This script demonstrates making the snpView for one of +the species in the mito browser. In this case, the hg38a +species (==hg38 chrM) + +This script will be used later with input arg parameters in order to +make up the snpView for each target reference. + +####################################################################### + +#!/bin/bash + +export allDists="/cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/all_dists" + +export target="hg38a" +export Nway="104way" +export dbTable="${target}_${Nway}" +export mafFile="/hive/data/genomes/mito/trackData/mafFiles/cactus/${target}.104way.maf" + +mkdir -p /hive/data/genomes/mito/bed/snpView/${target} +cd /hive/data/genomes/mito/bed/snpView/${target} + +mkdir -p /gbdb/mito/maf +if [ "${mafFile}" -nt species.list ]; then +rm -f /gbdb/mito/maf/${target}.maf + +# translate the names to avoid the underbars and . versions in +# the 'species' names + +sed -e 's/GCA_\([0-9]\+\)\./GCAu\1v/g; s/GCF_\([0-9]\+\)\./GCFu\1v/g;' \ + "${mafFile}" > noUnderbar.maf + +rm -f mito.${Nway}.nh +ln -s ../../../findSequences/finalPlus7.nh ./mito.${Nway}.nh + +# 'target' species is always 'mito' +printf "mito\n" > species.list + +# order species via their distance in the phylo-tree +$allDists mito.${Nway}.nh | grep -w "${target}" | sed -e "s/${target}.//;" \ + | sort -k2,2n | cut -f1 \ + | sed -e 's/GCA_\([0-9]\+\)\./GCAu\1v/g; s/GCF_\([0-9]\+\)\./GCFu\1v/g;' \ + >> species.list + +ln -s `pwd`/noUnderbar.maf /gbdb/mito/maf/${target}.maf + +# temporary load this maf file into 'mito' database for the convenience +# of the mafGene command +hgLoadMaf -pathPrefix=/gbdb/mito/maf \ + -loadFile=../../mafFiles/cactus/${target}.${Nway}.maf mito ${dbTable} + +fi + +# the genes for this target +### XXX should the genes for all the other species also be here ? +if [ "../../genes/hg38/hg38_chrM.gp.gz" -nt "${target}.gp" ]; then + zcat ../../genes/hg38/hg38_chrM.gp.gz > "${target}.gp" + touch -r ../../genes/hg38/hg38_chrM.gp.gz "${target}.gp" +fi + +if [ "${target}.gp" -nt nonsyn.faa ]; then + mafGene -exons mito ${dbTable} -useFile "${target}.gp" species.list nonsyn.faa + touch -r "${target}.gp" nonsyn.faa +fi + +# the 1583 is the soTerm for missense_variant +if [ nonsyn.faa -nt nonsyn.bed ]; then + paSNP species.list nonsyn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \ + | awk '{printf "%s\t%d\t%d\t%s\t1583\t+\t%d\t%d\t255,0,0\t1\t%d\t0\n", $1, $2-1, $3, $4, $2-1, $3, $3-($2 - 1)}' > nonsyn.bed + touch -r nonsyn.faa nonsyn.bed +fi + +if [ "${target}.gp" -nt syn.faa ]; then + mafGene -uniqAA -exons mito ${dbTable} -useFile "${target}.gp" species.list syn.faa + touch -r "${target}.gp" syn.faa +fi + +# the 1819 is the soTerm for synonymous_variant +if [ syn.faa -nt syn.bed ]; then + paSNP species.list syn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \ + | awk '{printf "%s\t%d\t%d\t%s\t1819\t+\t%d\t%d\t0,255,0\t1\t%d\t0\n", $1, $2-1, $3, $4, $2-1, $3, $3-($2 - 1)}' > syn.bed + touch -r syn.faa syn.bed +fi + +if [ "${target}.gp" -nt single.bed ]; then + mafToSnpBed mito noUnderbar.maf "${target}.gp" stdout \ + | sed -e 's/mito.//;' > single.bed + touch -r "${target}.gp" single.bed +fi + +# select out based on 'soTerm' identifier in the last column +# soTerms defined in src/hg/inc/soTerm.h +# 1 2 3 4 5 +# hg38a_chrM 72 73 GCAu018873775v2 1628 + +if [ single.bed -nt codingVariant.bed ]; then + awk '$NF == 1580' single.bed \ + | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \ + | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \ + | sort -k1,1 -k2,2n > codingVariant.bed + touch -r single.bed codingVariant.bed +fi + +if [ single.bed -nt utrVariant.bed ]; then + awk '$NF == 1623 || $NF == 1624' single.bed \ + | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \ + | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \ + | sort -k1,1 -k2,2n > utrVariant.bed + touch -r single.bed utrVariant.bed +fi + +if [ single.bed -nt missing.bed ]; then + awk '$NF == 0' single.bed \ + | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \ + | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \ + | sort -k1,1 -k2,2n > missing.bed + touch -r single.bed missing.bed +fi + +if [ single.bed -nt intergenic.bed ]; then + awk '$NF == 1628' single.bed \ + | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \ + | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \ + | sort -k1,1 -k2,2n > intergenic.bed + touch -r single.bed intergenic.bed +fi + +if [ single.bed -nt intron.bed ]; then + awk '$NF == 1627' single.bed \ + | awk '{printf "%s\t%d\t%d\t%s\t%d\t+\t%d\t%d\t255,255,0\t1\t%d\t0\n", $1, $2, $3, $4, $5, $2, $3, $3-$2}' \ + | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \ + | sort -k1,1 -k2,2n > intron.bed + touch -r single.bed intron.bed +fi + +if [ species.list -nt mito.species.list ]; then + grep -v -w mito species.list \ + | sed -e 's/GCAu\([0-9]\+\)v/GCA_\1./g; s/GCFu\([0-9]\+\)v/GCF_\1./g;' \ + > mito.species.list + touch -r species.list mito.species.list +fi + +if [ syn.bed -nt output.bed -o intergenic.bed -nt output.bed ]; then +for S in `cat mito.species.list` +do + grep -wh "${S}" nonsyn.bed syn.bed codingVariant.bed utrVariant.bed intron.bed intergenic.bed missing.bed \ + | bedSmash stdin ../../../chrom.sizes stdout +done | sort -k1,1 -k2,2n > output.bed + touch -r intergenic.bed output.bed +fi + +if [ output.bed -nt load.bed ]; then + awk '{print $1,$2,$3,$4,$5}' output.bed | sort -k1,1 -k2,2n > load.bed + bedToBigBed -type=bed4+1 -as=snpView.as load.bed \ + ../../../chrom.sizes mafSnpHg38a.bb +fi +##########################################################################