da323191c57c36932ad1751af87b59b71b836501 markd Fri Jul 25 23:20:00 2025 -0700 fixed issues created by the pain of parsing gene2accession #36073 diff --git src/hg/makeDb/doc/hg38/hgnc.txt src/hg/makeDb/doc/hg38/hgnc.txt index 04638516fb9..19f55c703fc 100644 --- src/hg/makeDb/doc/hg38/hgnc.txt +++ src/hg/makeDb/doc/hg38/hgnc.txt @@ -1,159 +1,188 @@ ##### BRANEY ########### # 2022 mkdir -p /cluster/data/hg38/bed/hgnc cd /cluster/data/hg38/bed/hgnc # get the HGNC data wget "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" # get the Entrez data wget "https://ftp.ncbi.nih.gov/gene/DATA/gene2accession.gz" gunzip gene2accession.gz # separate human accessions in the Entrez data grep "^9606" gene2accession > human.gene2accession # collect all the mappings to hg38 (these have NCBI chromosome names) grep GRCh38 human.gene2accession | tawk '{print $2, $8,$10,$11}' | sort | uniq > entrezToLocNcbi.txt # substitute UCSC names tawk '{print $1,$2}' /cluster/data/hg38/hg38.chromAlias.tsv > ncbiToUcsc.map subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt # generate trix file with symbol, alias, and previous values tail -n +2 hgnc_complete_set.txt | tawk '{print $1, $2 " " $9 " " $11;}' | tr -d '"' | tr '|' ' ' > trixInput.txt ixIxx trixInput.txt search.hg38.ix search.hg38.ixx # look at field names and create proto AS file (just done the first time) #tawk '{for(ii=1; ii <= NF; ii++) print ii, $ii; exit}' hgnc_complete_set.txt #tawk '{for(ii=1; ii <= NF; ii++) printf "\tstring %s; \"%s\"\n", $ii,$ii; exit}' *comple* > asSkeleton.as # create input file without header and sorted by first field ( HGNC:### ) tail -n +2 hgnc_complete_set.txt | sort -k 1b,1 > input.txt # put black for every type for the moment. 
This should be based on GENCODE colors tawk '{print $5}' input.txt | sort -u | awk '{printf "s/%s/0,0,0/\n", $0}' > color.sed.txt #manually fixed colors for coding, pseudogene, and non-coding tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt # build hgnc file with assigned colors as the 58th field join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt # put entrez ID as first field for joins tawk '{if ($19 != "") print $19,$0}' inputColor.txt | sort -k 1b,1 > entrez.hgnc.txt # add position information to HGNC information join -j 1 -t $'\t' entrezToLoc.txt entrez.hgnc.txt | cut -f 2- > positioned.hg38.txt # build first nine fields of bigbed tawk '{print $1, $2, $3, $4, 0, "+", 0,0, $58}' positioned.hg38.txt > tmp1 # add the rest of the HGNC fields as extra fields except the id and color which are in the first nine tawk '{for(ii=5; ii <= 56; ii++) printf("%s\t", $ii); print $ii}' positioned.hg38.txt > tmp2 paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed # let's do this thing! 
bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg38/chrom.sizes hgnc.hg38.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as # put the files into /gbdb mkdir -p /gbdb/hg38/hgnc rm -rf /gbdb/hg38/hgnc/hgnc.bb ln -s `pwd`/hgnc.hg38.bb /gbdb/hg38/hgnc/hgnc.bb rm -rf /gbdb/hg38/hgnc/search.ix /gbdb/hg38/hgnc/search.ixx ln -s `pwd`/search.hg38.ix /gbdb/hg38/hgnc/search.ix ln -s `pwd`/search.hg38.ixx /gbdb/hg38/hgnc/search.ixx ################################## ##### GALT ########### # 2025-04-17 #mkdir -p /cluster/data/hg38/bed/hgnc cd /cluster/data/hg38/bed/hgnc mkdir old3 mv * old3/ # get the HGNC data wget https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt # get the Entrez data wget "https://ftp.ncbi.nih.gov/gene/DATA/gene2accession.gz" gunzip gene2accession.gz # separate human accessions in the Entrez data grep "^9606" gene2accession > human.gene2accession # collect all the mappings to hg38 (these have NCBI chromosome names) # TODO GALT add the strand here. 
grep GRCh38 human.gene2accession | tawk '{print $2, $8,$10,$11,$12}' | sort | uniq > entrezToLocNcbi.txt # substitute UCSC names tawk '{print $1,$2}' /cluster/data/hg38/hg38.chromAlias.tsv > ncbiToUcsc.map subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt #missed 3468 # generate trix file with symbol, alias, and previous values tail -n +2 hgnc_complete_set.txt | tawk '{print $1, $2 " " $9 " " $11;}' | tr -d '"' | tr '|' ' ' > trixInput.txt ixIxx -maxWordLength=32 trixInput.txt search.hg38.ix search.hg38.ixx # look at field names and create proto AS file (just done the first time) #tawk '{for(ii=1; ii <= NF; ii++) print ii, $ii; exit}' hgnc_complete_set.txt #tawk '{for(ii=1; ii <= NF; ii++) printf "\tstring %s; \"%s\"\n", $ii,$ii; exit}' *comple* > asSkeleton.as # I looked at this and it is correct, the hgnc_id is in the name and 53 more fields is 54 found in the hgncBig62.as # create input file without header and sorted by first field ( HGNC:### ) tail -n +2 hgnc_complete_set.txt | sort -k 1b,1 > input.txt #Braney manually created colors for coding, pseudogene, and non-coding. I am just copying. cp old3/color.sed.txt . 
tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt # build hgnc file with assigned colors as the 58th field # BASH run this command in bash to escape the tab with $ join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt # put entrez ID as first field for joins tawk '{if ($19 != "") print $19,$0}' inputColor.txt | sort -k 1b,1 > entrez.hgnc.txt # add position information to HGNC information # BASH run this command in bash to escape the tab with $ join -j 1 -t $'\t' entrezToLoc.txt entrez.hgnc.txt | cut -f 2- > positioned.hg38.txt # build first nine fields of bigbed tawk '{print $1, $2, $3, $5, 0, $4, 0,0, $59}' positioned.hg38.txt > tmp1 # add the rest of the HGNC fields as extra fields except the id and color which are in the first nine tawk '{for(ii=6; ii <= 57; ii++) printf("%s\t", $ii); print $ii}' positioned.hg38.txt > tmp2 paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed vi $HOME/kent/src/hg/lib/hgncBig62.as # modified mgd_id to be lstring instead of string since it was 493 chars that overflowed string which takes 255 char max. # changed strand description to "+ or - for strand" # let's do this thing! bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg38/chrom.sizes hgnc.hg38.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as # put the files into /gbdb mkdir -p /gbdb/hg38/hgnc rm -rf /gbdb/hg38/hgnc/hgnc.bb ln -s `pwd`/hgnc.hg38.bb /gbdb/hg38/hgnc/hgnc.bb rm -rf /gbdb/hg38/hgnc/search.ix /gbdb/hg38/hgnc/search.ixx ln -s `pwd`/search.hg38.ix /gbdb/hg38/hgnc/search.ix ln -s `pwd`/search.hg38.ixx /gbdb/hg38/hgnc/search.ixx -################################## +################################################################################# +2025-07-23: markd #36073 + +Replace previous build method with one based on GENCODE, RefSeq curated, +and RefSeq other. This addresses various problems dealing with the complexity +of NCBI gene2accession to find the locations. See #36073.
+ +cd /cluster/data/hg38/bed/hgnc/ +mkdir build.2025-07-24 +cd build.2025-07-24 + +wget -nv https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt + +~/kent/src/hg/makeDb/outside/hgnc/hgncToBigBed --unmappedTsv=hgnc.unmapped.tsv hg38 V48 hgnc_complete_set.txt hgnc.bed hgnc.bb search locus_type.filter.txt + +# Note: some HGNC entries are expected to remain unmapped: +# - a few protein-coding genes not in the assembly +# - pseudogenes and non-coding genes not in GENCODE or RefSeq for unknown reasons +# - locus_group `other' types that are not mapped. + +# put the files into /gbdb +mkdir -p /gbdb/hg38/hgnc +ln -sf $(realpath hgnc.bb search.ix search.ixx) /gbdb/hg38/hgnc/ + +# double check +ls -l /gbdb/hg38/hgnc/ +file /gbdb/hg38/hgnc/* +# edit trackDb/human/trackDb.ra to set filterValues.locus_type +# from the locus_type.filter.txt file +#################################################################################