src/hg/makeDb/doc/hg19.txt da323191c57c36932ad1751af87b59b71b836501

da323191c57c36932ad1751af87b59b71b836501
markd
  Fri Jul 25 23:20:00 2025 -0700
fixed issues created by the pain of parsing gene2accession #36073

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index 9179bc80e01..f1bfe9b38f0 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -33930,31 +33930,30 @@
       -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
       GCF_000001405.25_GRCh37.p13 hg19) > do.log 2>&1 &
   # real    7m42.506s
 
   cat fb.ncbiRefSeq.hg19.txt
   # 95470885 bases of 2991710746 (3.191%) in intersection
 
 #############################################################################
 # Covid-19 rare mutations, Max, Fri Oct 30 08:40:34 PDT 2020
 # received table from qzhang02@rockefeller.edu, wrote to UCSC.txt
 cd /hive/data/genomes/hg19/bed/covidMuts/
 dos2unix UCSC.txt
 cat UCSC.txt | tawk '{$1="chr"$1; chrom=$1; start=$2; rsId=$3; ref=$4; alt=$5; zygo=$6; gene=$7; genotype=$8; inh=$9; end=$2+length(ref); print chrom, start, end, ref">"alt, "0", ".", start, end, "0,0,0", "1", length(ref), "0", ref, alt, rsId, zygo, gene, genotype, inh;}' | grep -v chrchr > covidMuts.bed
 bedSort covidMuts.bed covidMuts.bed
 bedToBigBed -tab covidMuts.bed ../../chrom.sizes covidMuts.bb -as=../../hg19/bed/covidMuts/covidMuts.as -type=bed12+
-<<<<<<< Updated upstream
 
 #############################################################################
 # gnomAD v2.1.1 update, ChrisL 12-2-2020
 #############################################################################
 # See /hive/data/inside/gnomAD/v2.1.1/run.sh for more information, listed
 # here are the important steps:
 WORKDIR=/hive/data/inside/gnomAD/v2.1.1/
 cd $WORKDIR
 db="hg19"
 cd $db
 
 time parallel -j15 --joblog exomes.run.log --plus "vcfToBed -fields=${fields} {} exomes/{/..}.bed" ::: /hive/data/outside/gnomAD.2/v2.1.1/exomes/*.bgz
 # real    16m42.939s
 # user    172m26.966s
 # sys 1m41.186s
@@ -35043,30 +35042,33 @@
 # let's do this thing!
 bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg19/chrom.sizes hgnc.hg19.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as
 
 # put the files into /gbdb
 mkdir -p /gbdb/hg19/hgnc
 rm -rf /gbdb/hg19/hgnc/hgnc.bb
 ln -s `pwd`/hgnc.hg19.bb /gbdb/hg19/hgnc/hgnc.bb 
 
 rm -rf /gbdb/hg19/hgnc/search.ix /gbdb/hg19/hgnc/search.ixx
 ln -s `pwd`/search.hg19.ix /gbdb/hg19/hgnc/search.ix
 ln -s `pwd`/search.hg19.ixx /gbdb/hg19/hgnc/search.ixx
 
 #######
 #  HGNC   Done (braney 2022-09-07)
 #
+
+# *** PROCESS HAS CHANGED, DON'T DO THIS ***
+
 mkdir -p /cluster/data/hg19/bed/hgnc
 cd /cluster/data/hg19/bed/hgnc
 
 # get the HGNC data
 wget "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt"
 
 # get the Entrez data
 wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz"
 zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5}' >  positions.txt
 zcat GRCh37_latest_genomic.gff.gz |  grep "GeneID:" | sed 's/.*GeneID://' | sed 's/,.*//' | sed 's/;.*//' > geneIds.txt
 paste positions.txt geneIds.txt | tawk '{print $4, $1,$2,$3, $3 - $2}' | sort -k 1,1 -k 5,5nr | tawk '{if (seen[$1] != 1) print $1, $2, $3,$4; seen[$1] = 1}' > entrezToLocNcbi.txt
 
 # substitute UCSC names
 tawk '{print $2,$1}' /cluster/data/hg19/bed/chromAlias/ucsc.refseq.p13.plusMT.tab > ncbiToUcsc.map
 subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt
@@ -35774,30 +35776,33 @@
     print $9, $10, $10, $12, "0", ".", $10, $10, "0", $1, $2, $3, $4, $5, $6, $7, $8, $11, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, "0"
 }'  > output_non_ssc.bed
 awk '{if ($2 != "chromStart") $2 = $2 - 1; print}' OFS="\t" output_non_ssc.bed > output_non_ssc_adjusted.bed
 tail -n +2 output_non_ssc_adjusted.bed > output_non_ssc_no_header.bed
 awk '{if($1 !~ /^chr/) $1 = "chr"$1; print}' OFS="\t" output_non_ssc_no_header.bed > output_non_ssc_standard.bed
 awk -F'\t' 'BEGIN {OFS="\t"} {if (NF < 36) {for (i=NF+1; i<=36; i++) $i="0"}; print}' output_non_ssc_standard.bed > filtered_output_fixed_non_ssc.bed
 awk -F'\t' 'BEGIN {OFS="\t"} {$26=($26 == "-1") ? "0" : $26; print}' filtered_output_fixed_non_ssc.bed > output_non_ssc_final_fixed.bed
 awk -F'\t' '{if (length($10) > 255) print "Line " NR " has length " length($10)}' output_non_ssc_final_fixed.bed
 awk -F'\t' -v OFS='\t' '{if (length($18) > 255) $18 = substr($18, 1, 255); print}' output_non_ssc_final_fixed.bed > output_non_ssc_final_fixed_truncated.bed
 bedToBigBed -as=bedExample2.as -type=bed9+27 -tab output_non_ssc_final_fixed_truncated.bed hg19.chrom.sizes output_non_ssc.bb
 
 ##
 #######
 #  HGNC   Done (galt 2025-04-17)
 #
+
+# *** PROCESS HAS CHANGED, DON'T DO THIS ***
+
 #mkdir -p /cluster/data/hg19/bed/hgnc
 cd /cluster/data/hg19/bed/hgnc
 mkdir old2
 mv * old2/
 
 
 
 # get the HGNC data
 wget "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt"
 
 
 # get the Entrez data
 wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz"
 zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5, $7}' >  positions.txt
 
@@ -35881,21 +35886,55 @@
 wget https://mutscore-ucsc.iob.ch/MutScore.hg38.C.bw
 wget https://mutscore-ucsc.iob.ch/MutScore.hg38.T.bw
 wget https://mutscore-ucsc.iob.ch/MutScore.hg38.G.bw
 mv MutScore.hg38.A.bw mutscoreA.bw
 mv MutScore.hg38.C.bw mutscoreC.bw
 mv MutScore.hg38.G.bw mutscoreG.bw
 mv MutScore.hg38.T.bw mutscoreT.bw
 
 #############################################################################
 # MCAP scores, Max, Tue Jun 17 08:14:44 AM PDT 2025
 cd /hive/data/genomes/hg19/bed/mcap/
 wget http://bejerano.stanford.edu/mcap/downloads/dat/mcap_v1_4.txt.gz
 python ~/kent/src/hg/makeDb/scripts/mcap/mcapToBw.py
 rm -f bw/*.bedGraph
 
-
 #########################################################################
-
 # Bionano, max, July 4 2025
 # received files from apang@bionano.com
 # converted to bigBed with standard bedToBigBed
+
+#############################################################################
+# HGNC 2025-07-24 markd
+
+Replace the previous build method with one based on GENCODE, RefSeq curated,
+and RefSeq other.  This addresses various problems dealing with the complexity
+of NCBI gene2accession when finding the locations. See #36073.
+
+cd /cluster/data/hg19/bed/hgnc/
+mkdir build.2025-07-24
+cd build.2025-07-24
+
+# hgnc was just downloaded for hg38 from
+# https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt
+# so just link to make sure the versions are the same
+
+ln /cluster/data/hg38/bed/hgnc/build.2025-07-24/hgnc_complete_set.txt .
+
+~/kent/src/hg/makeDb/outside/hgnc/hgncToBigBed hg19 V48lift37 --unmappedTsv=hgnc.unmapped.tsv hgnc_complete_set.txt hgnc.bed hgnc.bb search locus_type.filter.txt
+
+# Note it is expected that there will be unmapped:
+#  - a few protein-coding genes not in the assembly
+#  - pseudogenes and non-coding genes not in GENCODE or RefSeq for unknown reasons
+#  - locus_group `other' types that are not mapped.
+
+# put the files into /gbdb
+mkdir -p /gbdb/hg19/hgnc
+ln -sf $(realpath hgnc.bb search.ix search.ixx) /gbdb/hg19/hgnc/
+
+# double check
+ls -l /gbdb/hg19/hgnc/
+file  /gbdb/hg19/hgnc/*
+
+# if not already done for hg38:
+# edit trackDb/human/trackDb.ra to set filterValues.locus_type
+# from the locus_type.filter.txt file