da323191c57c36932ad1751af87b59b71b836501 markd Fri Jul 25 23:20:00 2025 -0700 fixed issues created by the pain of parsing gene2accession #36073 diff --git src/hg/makeDb/doc/hg38/hgnc.txt src/hg/makeDb/doc/hg38/hgnc.txt index 04638516fb9..19f55c703fc 100644 --- src/hg/makeDb/doc/hg38/hgnc.txt +++ src/hg/makeDb/doc/hg38/hgnc.txt @@ -142,18 +142,47 @@ # modified mgd_id to be lstring instead of string since it was 493 chars that overflowed string which takes 255 char max. # changed strand description to "+ or - for strand" # let's do this thing! bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg38/chrom.sizes hgnc.hg38.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as # put the files into /gbdb mkdir -p /gbdb/hg38/hgnc rm -rf /gbdb/hg38/hgnc/hgnc.bb ln -s `pwd`/hgnc.hg38.bb /gbdb/hg38/hgnc/hgnc.bb rm -rf /gbdb/hg38/hgnc/search.ix /gbdb/hg38/hgnc/search.ixx ln -s `pwd`/search.hg38.ix /gbdb/hg38/hgnc/search.ix ln -s `pwd`/search.hg38.ixx /gbdb/hg38/hgnc/search.ixx -################################## +################################################################################# +2025-07-23: markd #36073 + +Replace previous build method with one based on GENCODE, RefSeq curated, +and RefSeq other. This addresses various problems dealing with the complexity +of NCBI gene2accession to find the locations. See #36073. + +cd /cluster/data/hg38/bed/hgnc/ +mkdir build.2025-07-24 +cd build.2025-07-24 + +wget -nv https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt + +~/kent/src/hg/makeDb/outside/hgnc/hgncToBigBed --unmappedTsv=hgnc.unmapped.tsv hg38 V48 hgnc_complete_set.txt hgnc.bed hgnc.bb search locus_type.filter.txt + +# Note it is expected that there will be unmapped: +# - a few protein-coding genes not in the assembly +# - pseudogenes and non-coding genes not in GENCODE or RefSeq for unknown reasons +# - locus_group `other' types that are not mapped. 
+ +# put the files into /gbdb +mkdir -p /gbdb/hg38/hgnc +ln -sf $(realpath hgnc.bb search.ix search.ixx) /gbdb/hg38/hgnc/ + +# double check +ls -l /gbdb/hg38/hgnc/ +file /gbdb/hg38/hgnc/* +# edit trackDb/human/trackDb.ra to set filterValues.locus_type +# from the locus_type.filter.txt file +#################################################################################