#!/bin/sh
# buildGeneReviews.sh -- process the raw GENEREVIEWS data files into the
# geneReviews tables and bigBed for hg38, hg19 and hg18.
#
# Reconstructed from the post-commit state of
#   src/hg/utils/otto/geneReviews/buildGeneReviews.sh
# as of commit e5d963606c378c0f08cb946c3a9393847dd6ce22 (kate, 2020-10-07):
#   "Add bigBed to GeneReviews to support mouseOver with diseases. refs #19841"
#
# Inputs read from the current directory:
#   NBKid_shortname_genesymbol.txt, GRtitle_shortname_NBKid.txt
# Helpers expected one directory up:
#   ../geneRevsListDiseases.pl, ../geneRevsAddDiseases.pl, ../geneReviews.as
#
# Runs under "set -e": any failing step aborts the whole build.

set -e

# Gene-to-disease file; written by the main section, read by
# createGeneReviewTables when it builds the extended BED.
geneDiseaseFile="geneReviewsGeneDiseases.tab"

#######################################
# Create the geneReviews "New" tables and the bigBed for one assembly.
# Globals:
#   geneDiseaseFile (read)
#   db, gbdb, oldLc, newLc (written; intentionally not "local" so the
#   script stays within POSIX sh)
# Arguments:
#   $1 - database / assembly name (hg38, hg19 or hg18)
# Reads:
#   grRefGene.lst, geneReviewsGrshortNBKid.tab,
#   geneReviewsGrshortTitleNBKid.tab
# Writes:
#   geneReviews.$1.tab, geneReviewsExt.$1.tab, geneReviewsExt.$1.bed,
#   geneReviews.$1.bb, geneReviewsDetail.tab, the symlink
#   /gbdb/$1/geneReviews/geneReviews.bb, and the tables
#   geneReviewsGrshortNBKidNew, geneReviewsGrshortTitleNBKidNew,
#   geneReviewsNew and geneReviewsDetailNew in database $1.
# Exits:
#   non-zero (via set -e) if any step fails or row-count validation trips.
#######################################
createGeneReviewTables() {
    db=$1

    # Load the internal working table geneReviewsGrshortNBKid
    hgsql "$db" -e 'drop table if exists geneReviewsGrshortNBKidNew'
    hgsql "$db" -e 'create table geneReviewsGrshortNBKidNew select * from geneReviewsGrshortNBKid limit 0'
    hgsql "$db" -e \
        'load data local infile "geneReviewsGrshortNBKid.tab" into table geneReviewsGrshortNBKidNew'

    # Load the internal working table geneReviewsGrshortTitleNBKid
    hgsql "$db" -e 'drop table if exists geneReviewsGrshortTitleNBKidNew'
    hgsql "$db" -e 'create table geneReviewsGrshortTitleNBKidNew select * from geneReviewsGrshortTitleNBKid limit 0'
    hgsql "$db" -e \
        'load data local infile "geneReviewsGrshortTitleNBKid.tab" into table geneReviewsGrshortTitleNBKidNew'

    # For each gene symbol in grRefGene.lst, create non-overlapping bed rows.
    # hg38 takes coordinates from gencodeAnnotV35; the other assemblies use
    # knownGene joined to kgXref.
    rm -f "geneReviews.$db.tab"
    while read -r G
    do
        if [ "$db" = "hg38" ]
        then
            hgsql "$db" -N -e \
                "SELECT e.chrom,e.txStart,e.txEnd,e.name2 \
                 FROM gencodeAnnotV35 e where e.name2 ='${G}' \
                 ORDER BY e.chrom,e.txStart;" > temp.in
        else
            hgsql "$db" -N -e \
                "SELECT e.chrom,e.txStart,e.txEnd,j.geneSymbol \
                 FROM knownGene e, kgXref j WHERE e.name = j.kgID AND \
                 j.geneSymbol ='${G}' ORDER BY e.chrom,e.txStart;" > temp.in
        fi
        bedRemoveOverlap temp.in temp.out
        cat temp.out >> "geneReviews.$db.tab"
    done < grRefGene.lst
    # -f: an empty gene list leaves no temp files, which must not abort the
    # run under set -e.
    rm -f temp.*

    # Load the collapsed bed4 file to the database.
    # NOTE: keeping table for backwards compatibility and search (for now)
    hgsql "$db" -e 'drop table if exists geneReviewsNew'
    hgLoadBed "$db" geneReviewsNew "geneReviews.$db.tab"

    # Create big bed file with diseases for mouseover
    perl ../geneRevsAddDiseases.pl "$geneDiseaseFile" "geneReviews.$db.tab" \
        > "geneReviewsExt.$db.tab"
    bedSort "geneReviewsExt.$db.tab" stdout | uniq > "geneReviewsExt.$db.bed"
    gbdb="/gbdb/$db/geneReviews"
    mkdir -p "$gbdb"

    # Validate: compare the new row count against the installed bigBed.
    # On a first install there is no previous file, so the old count is 0
    # and the check is skipped.
    if [ -e "$gbdb/geneReviews.bb" ]; then
        oldLc=$(bigBedToBed "$gbdb/geneReviews.bb" stdout | wc -l)
    else
        oldLc=0
    fi
    newLc=$(wc -l < "geneReviewsExt.$db.bed")
    echo "rowcount: old $oldLc new: $newLc"
    if [ "$oldLc" -ne 0 ]; then
        # The db name is handed to awk with -v: inside the single-quoted
        # awk program a shell "$db" would be printed literally.
        # awk's "exit 1" makes the pipeline fail, and set -e then aborts.
        # NOTE(review): only growth of more than 10% fails; a shrinking
        # file passes. Confirm that this one-sided check is intended.
        echo "$oldLc $newLc" | awk -v db="$db" '{
            if (($2 - $1) / $1 > 0.1) {
                printf "validate %s GENE REVIEWS failed: old count: %d, new count: %d\n", db, $1, $2
                exit 1
            }
        }'
    fi

    # Install: build the bigBed and point the /gbdb symlink at it.
    bedToBigBed -tab -type=bed9+2 -as=../geneReviews.as "geneReviewsExt.$db.bed" \
        "/hive/data/genomes/$db/chrom.sizes" "geneReviews.$db.bb"
    rm -f "$gbdb/geneReviews.bb"
    ln -s "$(pwd)/geneReviews.$db.bb" "$gbdb/geneReviews.bb"

    # Create and load geneReviewsDetail table
    hgsql "$db" -N -e \
        "SELECT s.geneSymbol, s.grShort, t.NBKid, t.grTitle \
         FROM geneReviewsGrshortNBKidNew s, geneReviewsGrshortTitleNBKidNew t \
         WHERE s.grShort = t.grShort ORDER BY s.geneSymbol;" > geneReviewsDetail.tab
    hgsql "$db" -e 'drop table if exists geneReviewsDetailNew'
    hgsql "$db" -e 'create table geneReviewsDetailNew select * from geneReviewsDetail limit 0'
    hgsql "$db" -e \
        'load data local infile "geneReviewsDetail.tab" into table geneReviewsDetailNew'
}

####### main ##########

# Create gene to disease file for extended BED mouseover
perl ../geneRevsListDiseases.pl NBKid_shortname_genesymbol.txt GRtitle_shortname_NBKid.txt \
    > "$geneDiseaseFile"

# Create tab files for internal geneReviewsGrshortNBKid and
# geneReviewsGrshortTitleNBKid tables.
# FS/OFS are set on the awk command line: assigning FS inside a per-record
# action only takes effect from the next record, which would leave the
# first row split on whitespace.
grep -v "^#" NBKid_shortname_genesymbol.txt \
    | awk -F'\t' -v OFS='\t' '{if ($3!="Not applicable") print $3,$2,$1}' \
    | sort -k1 > geneReviewsGrshortNBKid.tab

grep -v "^#" GRtitle_shortname_NBKid.txt | sort -k1 > geneReviewsGrshortTitleNBKid.tab

# Generate a list of refSeq genes that have geneReview associated with it.
awk -F'\t' '{printf "%s\n", $1}' geneReviewsGrshortNBKid.tab \
    | sort | uniq > grRefGene.lst

createGeneReviewTables "hg38"
createGeneReviewTables "hg19"
createGeneReviewTables "hg18"

exit 0