da323191c57c36932ad1751af87b59b71b836501 markd Fri Jul 25 23:20:00 2025 -0700 fixed issues created by the pain of parsing gene2accession #36073 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 9179bc80e01..f1bfe9b38f0 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33930,31 +33930,30 @@ -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \ GCF_000001405.25_GRCh37.p13 hg19) > do.log 2>&1 & # real 7m42.506s cat fb.ncbiRefSeq.hg19.txt # 95470885 bases of 2991710746 (3.191%) in intersection ############################################################################# # Covid-19 rare mutations, Max, Fri Oct 30 08:40:34 PDT 2020 # received table from qzhang02@rockefeller.edu, wrote to UCSC.txt cd /hive/data/genomes/hg19/bed/covidMuts/ dos2unix UCSC.txt cat UCSC.txt | tawk '{$1="chr"$1; chrom=$1; start=$2; rsId=$3; ref=$4; alt=$5; zygo=$6; gene=$7; genotype=$8; inh=$9; end=$2+length(ref); print chrom, start, end, ref">"alt, "0", ".", start, end, "0,0,0", "1", length(ref), "0", ref, alt, rsId, zygo, gene, genotype, inh;}' | grep -v chrchr > covidMuts.bed bedSort covidMuts.bed covidMuts.bed bedToBigBed -tab covidMuts.bed ../../chrom.sizes covidMuts.bb -as=../../hg19/bed/covidMuts/covidMuts.as -type=bed12+ -<<<<<<< Updated upstream ############################################################################# # gnomAD v2.1.1 update, ChrisL 12-2-2020 ############################################################################# # See /hive/data/inside/gnomAD/v2.1.1/run.sh for more information, listed # here are the important steps: WORKDIR=/hive/data/inside/gnomAD/v2.1.1/ cd $WORKDIR db="hg19" cd $db time parallel -j15 --joblog exomes.run.log --plus "vcfToBed -fields=${fields} {} exomes/{/..}.bed" ::: /hive/data/outside/gnomAD.2/v2.1.1/exomes/*.bgz # real 16m42.939s # user 172m26.966s # sys 1m41.186s @@ -35043,30 +35042,33 @@ # let's do this thing! 
bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg19/chrom.sizes hgnc.hg19.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as # put the files into /gbdb mkdir -p /gbdb/hg19/hgnc rm -rf /gbdb/hg19/hgnc/hgnc.bb ln -s `pwd`/hgnc.hg19.bb /gbdb/hg19/hgnc/hgnc.bb rm -rf /gbdb/hg19/hgnc/search.ix /gbdb/hg19/hgnc/search.ixx ln -s `pwd`/search.hg19.ix /gbdb/hg19/hgnc/search.ix ln -s `pwd`/search.hg19.ixx /gbdb/hg19/hgnc/search.ixx ####### # HGNC Done (braney 2022-09-07) # + +# *** PROCESS HAS CHANGED, DON'T DO THIS *** + mkdir -p /cluster/data/hg19/bed/hgnc cd /cluster/data/hg19/bed/hgnc # get the HGNC data wget "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" # get the Entrez data wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz" zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5}' > positions.txt zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | sed 's/.*GeneID://' | sed 's/,.*//' | sed 's/;.*//' > geneIds.txt paste positions.txt geneIds.txt | tawk '{print $4, $1,$2,$3, $3 - $2}' | sort -k 1,1 -k 5,5nr | tawk '{if (seen[$1] != 1) print $1, $2, $3,$4; seen[$1] = 1}' > entrezToLocNcbi.txt # substitute UCSC names tawk '{print $2,$1}' /cluster/data/hg19/bed/chromAlias/ucsc.refseq.p13.plusMT.tab > ncbiToUcsc.map subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt @@ -35774,30 +35776,33 @@ print $9, $10, $10, $12, "0", ".", $10, $10, "0", $1, $2, $3, $4, $5, $6, $7, $8, $11, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, "0" }' > output_non_ssc.bed awk '{if ($2 != "chromStart") $2 = $2 - 1; print}' OFS="\t" output_non_ssc.bed > output_non_ssc_adjusted.bed tail -n +2 output_non_ssc_adjusted.bed > output_non_ssc_no_header.bed awk '{if($1 !~ /^chr/) $1 = "chr"$1; print}' OFS="\t" output_non_ssc_no_header.bed > output_non_ssc_standard.bed awk -F'\t' 'BEGIN {OFS="\t"} 
{if (NF < 36) {for (i=NF+1; i<=36; i++) $i="0"}; print}' output_non_ssc_standard.bed > filtered_output_fixed_non_ssc.bed awk -F'\t' 'BEGIN {OFS="\t"} {$26=($26 == "-1") ? "0" : $26; print}' filtered_output_fixed_non_ssc.bed > output_non_ssc_final_fixed.bed awk -F'\t' '{if (length($10) > 255) print "Line " NR " has length " length($10)}' output_non_ssc_final_fixed.bed awk -F'\t' -v OFS='\t' '{if (length($18) > 255) $18 = substr($18, 1, 255); print}' output_non_ssc_final_fixed.bed > output_non_ssc_final_fixed_truncated.bed bedToBigBed -as=bedExample2.as -type=bed9+27 -tab output_non_ssc_final_fixed_truncated.bed hg19.chrom.sizes output_non_ssc.bb ## ####### # HGNC Done (galt 2025-04-17) # + +# *** PROCESS HAS CHANGED, DON'T DO THIS *** + #mkdir -p /cluster/data/hg19/bed/hgnc cd /cluster/data/hg19/bed/hgnc mkdir old2 mv * old2/ # get the HGNC data wget "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt" # get the Entrez data wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz" zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5, $7}' > positions.txt @@ -35881,21 +35886,55 @@ wget https://mutscore-ucsc.iob.ch/MutScore.hg38.C.bw wget https://mutscore-ucsc.iob.ch/MutScore.hg38.T.bw wget https://mutscore-ucsc.iob.ch/MutScore.hg38.G.bw mv MutScore.hg38.A.bw mutscoreA.bw mv MutScore.hg38.C.bw mutscoreC.bw mv MutScore.hg38.G.bw mutscoreG.bw mv MutScore.hg38.T.bw mutscoreT.bw ############################################################################# # MCAP scores, Max, Tue Jun 17 08:14:44 AM PDT 2025 cd /hive/data/genomes/hg19/bed/mcap/ wget http://bejerano.stanford.edu/mcap/downloads/dat/mcap_v1_4.txt.gz python ~/kent/src/hg/makeDb/scripts/mcap/mcapToBw.py rm -f bw/*.bedGraph - ######################################################################### - # Bionano, max, July 4 2025 # received files from apang@bionano.com # converted to bigBed 
with standard bedToBigBed + +############################################################################# +# HGNC 2025-07-24 markd + +Replace previous build method with one based on GENCODE, RefSeq curated, +and RefSeq other. This addresses various problems dealing with the complexity +of NCBI gene2accession to find the locations. See #36073. + +cd /cluster/data/hg19/bed/hgnc/ +mkdir build.2025-07-24 +cd build.2025-07-24 + +# hgnc was just downloaded for hg38 from +# https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt +# so just link to make sure the versions are the same + +ln /cluster/data/hg38/bed/hgnc/build.2025-07-24/hgnc_complete_set.txt . + +~/kent/src/hg/makeDb/outside/hgnc/hgncToBigBed hg19 V48lift37 --unmappedTsv=hgnc.unmapped.tsv hgnc_complete_set.txt hgnc.bed hgnc.bb search locus_type.filter.txt + +# Note it is expected that there will be unmapped: +# - a few protein-coding genes not in the assembly +# - pseudogenes and non-coding genes not in GENCODE or RefSeq for unknown reasons +# - locus_group `other' types that are not mapped. + +# put the files into /gbdb +mkdir -p /gbdb/hg19/hgnc +ln -sf $(realpath hgnc.bb search.ix search.ixx) /gbdb/hg19/hgnc/ + +# double check +ls -l /gbdb/hg19/hgnc/ +file /gbdb/hg19/hgnc/* + +# if not already done for hg38: +# edit trackDb/human/trackDb.ra to set filterValues.locus_type +# from the locus_type.filter.txt file