00a42f1666c47609c7655b9de0d0959dd5706745
galt
  Fri Apr 18 00:21:59 2025 -0700
Added strand to HGNC and fetched the latest data for hg19. Fixes #34754.

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index 7fc6c75f99a..10897bf16b2 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -35758,15 +35758,92 @@
 bedToBigBed -as=bedExample2.as -type=bed9+27 -tab filtered_output_uint_fixed.bed hg19.chrom.sizes output.bb
 zcat denovo-db.non-ssc-samples.variants.tsv.gz | \
 awk 'BEGIN {OFS="\t"} NR > 1 {
     print $9, $10, $10, $12, "0", ".", $10, $10, "0", $1, $2, $3, $4, $5, $6, $7, $8, $11, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, "0"
 }'  > output_non_ssc.bed
 awk '{if ($2 != "chromStart") $2 = $2 - 1; print}' OFS="\t" output_non_ssc.bed > output_non_ssc_adjusted.bed
 tail -n +2 output_non_ssc_adjusted.bed > output_non_ssc_no_header.bed
 awk '{if($1 !~ /^chr/) $1 = "chr"$1; print}' OFS="\t" output_non_ssc_no_header.bed > output_non_ssc_standard.bed
 awk -F'\t' 'BEGIN {OFS="\t"} {if (NF < 36) {for (i=NF+1; i<=36; i++) $i="0"}; print}' output_non_ssc_standard.bed > filtered_output_fixed_non_ssc.bed
 awk -F'\t' 'BEGIN {OFS="\t"} {$26=($26 == "-1") ? "0" : $26; print}' filtered_output_fixed_non_ssc.bed > output_non_ssc_final_fixed.bed
 awk -F'\t' '{if (length($10) > 255) print "Line " NR " has length " length($10)}' output_non_ssc_final_fixed.bed
 awk -F'\t' -v OFS='\t' '{if (length($18) > 255) $18 = substr($18, 1, 255); print}' output_non_ssc_final_fixed.bed > output_non_ssc_final_fixed_truncated.bed
 bedToBigBed -as=bedExample2.as -type=bed9+27 -tab output_non_ssc_final_fixed_truncated.bed hg19.chrom.sizes output_non_ssc.bb
 
 ##
+#######
+#  HGNC   Done (galt 2025-04-17)
+#
+#mkdir -p /cluster/data/hg19/bed/hgnc
+cd /cluster/data/hg19/bed/hgnc
+mkdir old2
+mv * old2/
+
+
+
+# get the HGNC data
+wget "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt"
+
+
+# get the Entrez data
+wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz"
+zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5, $7}' >  positions.txt
+
+zcat GRCh37_latest_genomic.gff.gz |  grep "GeneID:" | sed 's/.*GeneID://' | sed 's/,.*//' | sed 's/;.*//' > geneIds.txt
+
+#ORIG paste positions.txt geneIds.txt | tawk '{print $4, $1,$2,$3, $3 - $2}' | sort -k 1,1 -k 5,5nr | tawk '{if (seen[$1] != 1) print $1, $2, $3,$4; seen[$1] = 1}' > entrezToLocNcbi.txt
+      paste positions.txt geneIds.txt | tawk '{print $5, $1,$2,$3,$4, $3 - $2}' | sort -k 1,1 -k 6,6nr | tawk '{if (seen[$1] != 1) print $1, $2, $3, $4, $5; seen[$1] = 1}' > entrezToLocNcbi.txt
+
+# substitute UCSC names
+tawk '{print $2,$1}' /cluster/data/hg19/bed/chromAlias/ucsc.refseq.p13.plusMT.tab > ncbiToUcsc.map
+subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt
+
+# generate trix file with symbol, alias, and previous values
+tail -n +2 hgnc_complete_set.txt | tawk '{print $1, $2 " " $9 " " $11;}' | tr -d '"' | tr '|' ' ' > trixInput.txt
+ixIxx -maxWordLength=32 trixInput.txt search.hg19.ix search.hg19.ixx
+
+# look at field names and create proto AS file (just done the first time)
+#tawk '{for(ii=1;  ii <= NF; ii++) print ii, $ii; exit}' hgnc_complete_set.txt
+#tawk '{for(ii=1;  ii <= NF; ii++) printf "\tstring %s; \"%s\"\n", $ii,$ii; exit}' *comple* > asSkeleton.as
+
+# create input file without header and sorted by first field ( HGNC:### )
+tail -n +2 hgnc_complete_set.txt | sort -k 1b,1 > input.txt
+
+# put black for every type for the moment.  This should be based on GENCODE colors
+tawk '{print $5}' input.txt | sort -u | awk '{printf "s/%s/0,0,0/\n", $0}' > color.sed.txt
+
+# use the manually fixed colors for coding, pseudogene, and non-coding from the hg38 build (this overwrites the all-black color.sed.txt generated above)
+
+cp /cluster/data/hg38/bed/hgnc/color.sed.txt .
+tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt
+
+# build hgnc file with the assigned color appended as the last field
+# NOTE: BASH needs the $'\t' escape to pass a literal tab as the join field separator
+join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt
+
+
+# put entrez ID as first field for joins
+tawk '{if ($19 != "")  print $19,$0}' inputColor.txt | sort -k 1b,1 > entrez.hgnc.txt
+
+# add position information to HGNC information
+# NOTE: BASH needs the $'\t' escape to pass a literal tab as the join field separator
+join -j 1 -t $'\t' entrezToLoc.txt  entrez.hgnc.txt | cut -f 2- > positioned.hg19.txt
+
+# build first nine fields of bigbed
+tawk '{print $1, $2, $3, $5, 0, $4, 0,0, $59}' positioned.hg19.txt > tmp1
+
+# add the rest of the HGNC fields as extra fields except the id and color which are in the first nine
+tawk '{for(ii=6; ii <= 57; ii++) printf("%s\t", $ii); print $ii}' positioned.hg19.txt > tmp2
+paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed
+
+# let's do this thing!
+bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg19/chrom.sizes hgnc.hg19.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as
+
+# put the files into /gbdb
+mkdir -p /gbdb/hg19/hgnc
+rm -rf /gbdb/hg19/hgnc/hgnc.bb
+ln -s `pwd`/hgnc.hg19.bb /gbdb/hg19/hgnc/hgnc.bb 
+
+rm -rf /gbdb/hg19/hgnc/search.ix /gbdb/hg19/hgnc/search.ixx
+ln -s `pwd`/search.hg19.ix /gbdb/hg19/hgnc/search.ix
+ln -s `pwd`/search.hg19.ixx /gbdb/hg19/hgnc/search.ixx
+#############################################################################