48578d7724625cda2499bd11f8b6ce19463a1279 braney Fri Sep 2 15:24:25 2022 -0700 HGNC track is ready for QA. diff --git src/hg/makeDb/doc/hg38/hgnc.txt src/hg/makeDb/doc/hg38/hgnc.txt index 1c8f4aa..6606642 100644 --- src/hg/makeDb/doc/hg38/hgnc.txt +++ src/hg/makeDb/doc/hg38/hgnc.txt @@ -1,51 +1,60 @@ mkdir -p /cluster/data/hg38/bed/hgnc cd /cluster/data/hg38/bed/hgnc -# get the data +# get the HGNC data wget "http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" -# generate trix file with symbol, alias, and previos values +# get the Entrez data +wget "https://ftp.ncbi.nih.gov/gene/DATA/gene2accession.gz" +gunzip gene2accession.gz + +# separate human accessions in the Entrez data +grep "^9606" gene2accession > human.gene2accession + +# collect all the mappings to hg38 (these have NCBI chromosome names) +grep GRCh38 human.gene2accession | tawk '{print $2, $8,$10,$11}' | sort | uniq > entrezToLocNcbi.txt + +# substitute UCSC names +tawk '{print $1,$2}' /cluster/data/hg38/hg38.chromAlias.tsv > ncbiToUcsc.map +subColumn -skipMiss -miss=idMiss.txt 2 entrezToLocNcbi.txt ncbiToUcsc.map entrezToLoc.txt + +# generate trix file with symbol, alias, and previous values tail -n +2 hgnc_complete_set.txt | tawk '{print $1, $2 " " $9 " " $11;}' | tr -d '"' | tr '|' ' ' > trixInput.txt ixIxx trixInput.txt search.hg38.ix search.hg38.ixx -# look at field names and create proto AS file +# look at field names and create proto AS file (just done the first time) #tawk '{for(ii=1; ii <= NF; ii++) print ii, $ii; exit}' hgnc_complete_set.txt #tawk '{for(ii=1; ii <= NF; ii++) printf "\tstring %s; \"%s\"\n", $ii,$ii; exit}' *comple* > asSkeleton.as # create input file without header and sorted by first field ( HGNC:### ) tail -n +2 hgnc_complete_set.txt | sort -k 1b,1 > input.txt # put black for every type for the moment. This should be based on GENCODE colors tawk '{print $5}' input.txt | sort -u | awk '{printf "s/%s/0,0,0/\n", $0}' > color.sed.txt -tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt -join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt - -# put refseq ID as first field for joins -tawk '{if ($24 != "") print $24,$0}' inputColor.txt | sort -k 1b,1 > refSeq.hgnc.txt -# get NG_* positions from LRG track -bigBedToBed /gbdb/hg38/bbi/lrg.bb stdout | tawk '{split($18,a,"."); print a[1],$1,$2,$3}' | sort -k 1b,1 > refSeqGene.hg38.txt +#manually fixed colors for coding, pseudogene, and non-coding +tawk '{print $1,$5}' input.txt | sed -f color.sed.txt | sort -k 1b,1 > id.color.txt -# get NM, NR, XR, XM, YP, etc from ncbiRefSeq track -hgsql hg38 -Ne "select * from ncbiRefSeq" | tawk '{split($2,a,"."); print a[1],$3,$5,$6}' | sort -k 1b,1 > refSeq.hg38.txt +# build hgnc file with assigned colors as the 58th field +join -j 1 -t $'\t' input.txt id.color.txt > inputColor.txt -# merge positions -sort -k 1b,1 refSeqGene.hg38.txt refSeq.hg38.txt > refSeq.all.hg38.txt +# put entrez ID as first field for joins +tawk '{if ($19 != "") print $19,$0}' inputColor.txt | sort -k 1b,1 > entrez.hgnc.txt # add position information to HGNC information -join -j 1 -t $'\t' refSeq.all.hg38.txt refSeq.hgnc.txt | cut -f 2- > positioned.hg38.txt +join -j 1 -t $'\t' entrezToLoc.txt entrez.hgnc.txt | cut -f 2- > positioned.hg38.txt # build first nine fields of bigbed tawk '{print $1, $2, $3, $4, 0, "+", 0,0, $58}' positioned.hg38.txt > tmp1 # add the rest of the HGNC fields as extra fields except the id and color which are in the first nine tawk '{for(ii=5; ii <= 56; ii++) printf("%s\t", $ii); print $ii}' positioned.hg38.txt > tmp2 paste tmp1 tmp2 | sort -k1,1 -k2,2n > input.bed # let's do this thing! bedToBigBed -extraIndex=name -tab -type=bed9+53 input.bed /cluster/data/hg38/chrom.sizes hgnc.hg38.bb -as=$HOME/kent/src/hg/lib/hgncBig62.as # put the files into /gbdb mkdir -p /gbdb/hg38/hgnc rm -rf /gbdb/hg38/hgnc/hgnc.bb ln -s `pwd`/hgnc.hg38.bb /gbdb/hg38/hgnc/hgnc.bb