3ec510efd881b049639d056fb2a58c67d9951ccd braney Wed Sep 7 16:03:29 2022 -0700 add hg19 HGNC track diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 07001e3..860bcc8 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -35017,15 +35017,84 @@ tawk '{for(ii=5; ii <= 56; ii++) printf("%s\t", $ii); print $ii}' positioned.hg19.txt > tmp2 paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed # let's do this thing! bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg19/chrom.sizes hgnc.hg19.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as # put the files into /gbdb mkdir -p /gbdb/hg19/hgnc rm -rf /gbdb/hg19/hgnc/hgnc.bb ln -s `pwd`/hgnc.hg19.bb /gbdb/hg19/hgnc/hgnc.bb rm -rf /gbdb/hg19/hgnc/search.ix /gbdb/hg19/hgnc/search.ixx ln -s `pwd`/search.hg19.ix /gbdb/hg19/hgnc/search.ix ln -s `pwd`/search.hg19.ixx /gbdb/hg19/hgnc/search.ixx +####### +# HGNC Done (braney 2022-09-07) +# +mkdir -p /cluster/data/hg19/bed/hgnc +cd /cluster/data/hg19/bed/hgnc + +# get the HGNC data +wget "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" + +# get the Entrez data +wget "https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz" +zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | tawk '{print $1, $4, $5}' > positions.txt +zcat GRCh37_latest_genomic.gff.gz | grep "GeneID:" | sed 's/.*GeneID://' | sed 's/,.*//' | sed 's/;.*//' > geneIds.txt +paste positions.txt geneIds.txt | tawk '{print $4, $1,$2,$3, $3 - $2}' | sort -k 1,1 -k 5,5nr | tawk '{if (seen[$1] != 1) print $1, $2, $3,$4; seen[$1] = 1}' > entrezToLocNcbi.txt + +# substitute UCSC names +tawk '{print $2,$1}' /cluster/data/hg19/bed/chromAlias/ucsc.refseq.p13.plusMT.tab > ncbiToUcsc.map +subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt + +# separate human accessions in the Entrez data +grep "^9606" gene2accession > human.gene2accession + +# generate trix file 
with symbol, alias, and previous values +tail -n +2 hgnc_complete_set.txt | tawk '{print $1, $2 " " $9 " " $11;}' | tr -d '"' | tr '|' ' ' > trixInput.txt +ixIxx trixInput.txt search.hg19.ix search.hg19.ixx + +# look at field names and create proto AS file (just done the first time) +#tawk '{for(ii=1; ii <= NF; ii++) print ii, $ii; exit}' hgnc_complete_set.txt +#tawk '{for(ii=1; ii <= NF; ii++) printf "\tstring %s; \"%s\"\n", $ii,$ii; exit}' *comple* > asSkeleton.as + +# create input file without header and sorted by first field ( HGNC:### ) +tail -n +2 hgnc_complete_set.txt | sort -k 1b,1 > input.txt + +# put black for every type for the moment. This should be based on GENCODE colors +#tawk '{print $5}' input.txt | sort -u | awk '{printf "s/%s/0,0,0/\n", $0}' > color.sed.txt + +#manually fixed colors for coding, pseudogene, and non-coding + +cp /cluster/data/hg38/bed/hgnc/color.sed.txt . +tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt + +# build hgnc file with assigned colors appended as the last field (it becomes field 58 of positioned.hg19.txt below) +join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt + +# put entrez ID as first field for joins +tawk '{if ($19 != "") print $19,$0}' inputColor.txt | sort -k 1b,1 > entrez.hgnc.txt + +# add position information to HGNC information +join -j 1 -t $'\t' entrezToLoc.txt entrez.hgnc.txt | cut -f 2- > positioned.hg19.txt + +# build first nine fields of bigbed +tawk '{print $1, $2, $3, $4, 0, "+", 0,0, $58}' positioned.hg19.txt > tmp1 + +# add the rest of the HGNC fields as extra fields except the id and color which are in the first nine +tawk '{for(ii=5; ii <= 56; ii++) printf("%s\t", $ii); print $ii}' positioned.hg19.txt > tmp2 +paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed + +# let's do this thing! 
+bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg19/chrom.sizes hgnc.hg19.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as + +# put the files into /gbdb +mkdir -p /gbdb/hg19/hgnc +rm -rf /gbdb/hg19/hgnc/hgnc.bb +ln -s `pwd`/hgnc.hg19.bb /gbdb/hg19/hgnc/hgnc.bb + +rm -rf /gbdb/hg19/hgnc/search.ix /gbdb/hg19/hgnc/search.ixx +ln -s `pwd`/search.hg19.ix /gbdb/hg19/hgnc/search.ix +ln -s `pwd`/search.hg19.ixx /gbdb/hg19/hgnc/search.ixx +