src/hg/makeDb/doc/hg19.txt 39a32e76ca8302695f6724eb873d5baf74c92f49

39a32e76ca8302695f6724eb873d5baf74c92f49
galt
  Wed May 7 17:27:20 2025 -0700
updated hgnc for color fix.

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index 9d3dedd8d5e..632146fa65d 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -35778,61 +35778,56 @@
 mkdir old2
 mv * old2/
 
 
 
 # get the HGNC data
 wget "https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt"
 
 
 # get the Entrez data
 wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz"
 zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5, $7}' >  positions.txt
 
 zcat GRCh37_latest_genomic.gff.gz |  grep "GeneID:" | sed 's/.*GeneID://' | sed 's/,.*//' | sed 's/;.*//' > geneIds.txt
 
-#ORIG paste positions.txt geneIds.txt | tawk '{print $4, $1,$2,$3, $3 - $2}' | sort -k 1,1 -k 5,5nr | tawk '{if (seen[$1] != 1) print $1, $2, $3,$4; seen[$1] = 1}' > entrezToLocNcbi.txt
 paste positions.txt geneIds.txt | tawk '{print $5, $1,$2,$3,$4, $3 - $2}' | sort -k 1,1 -k 6,6nr | tawk '{if (seen[$1] != 1) print $1, $2, $3, $4, $5; seen[$1] = 1}' > entrezToLocNcbi.txt
 
 # substitute UCSC names
 tawk '{print $2,$1}' /cluster/data/hg19/bed/chromAlias/ucsc.refseq.p13.plusMT.tab > ncbiToUcsc.map
 subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt
 
 # generate trix file with symbol, alias, and previous values
 tail -n +2 hgnc_complete_set.txt | tawk '{print $1, $2 " " $9 " " $11;}' | tr -d '"' | tr '|' ' ' > trixInput.txt
 ixIxx -maxWordLength=32 trixInput.txt search.hg19.ix search.hg19.ixx
 
 # look at field names and create proto AS file (just done the first time)
 #tawk '{for(ii=1;  ii <= NF; ii++) print ii, $ii; exit}' hgnc_complete_set.txt
 #tawk '{for(ii=1;  ii <= NF; ii++) printf "\tstring %s; \"%s\"\n", $ii,$ii; exit}' *comple* > asSkeleton.as
 
 # create input file without header and sorted by first field ( HGNC:### )
 tail -n +2 hgnc_complete_set.txt | sort -k 1b,1 > input.txt
 
-# put black for every type for the moment.  This should be based on GENCODE colors
-tawk '{print $5}' input.txt | sort -u | awk '{printf "s/%s/0,0,0/\n", $0}' > color.sed.txt
-
-#manually fixed colors for coding, pseudogene, and non-coding
-
+# copy the manually fixed colors for coding, pseudogene, and non-coding from Brian's earlier hg38 run.
 cp /cluster/data/hg38/bed/hgnc/color.sed.txt .
+
 tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt
 
 # build hgnc file with the assigned color appended as the last field
 # BASH needs tab escape: $'\t' passes a literal tab to join as the field delimiter
 join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt
 
-
 # put entrez ID as first field for joins
 tawk '{if ($19 != "")  print $19,$0}' inputColor.txt | sort -k 1b,1 > entrez.hgnc.txt
 
 # add position information to HGNC information
 # BASH needs tab escape: $'\t' passes a literal tab to join as the field delimiter
 join -j 1 -t $'\t' entrezToLoc.txt  entrez.hgnc.txt | cut -f 2- > positioned.hg19.txt
 
 # build first nine fields of bigbed
 tawk '{print $1, $2, $3, $5, 0, $4, 0,0, $59}' positioned.hg19.txt > tmp1
 
 # add the rest of the HGNC fields as extra fields except the id and color which are in the first nine
 tawk '{for(ii=6; ii <= 57; ii++) printf("%s\t", $ii); print $ii}' positioned.hg19.txt > tmp2
 paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed
 
 # let's do this thing!