a919a3dbe5fd770f03b4c879612b123e04743a9c
gperez2
  Thu Dec 12 23:47:12 2024 -0800
Adding the Gene Orthologs track to the NCBI RefSeq composite track, refs #30262

diff --git src/hg/makeDb/doc/ncbiRefSeq/ncbiOrtho.makedoc src/hg/makeDb/doc/ncbiRefSeq/ncbiOrtho.makedoc
new file mode 100644
index 0000000..e3b6660
--- /dev/null
+++ src/hg/makeDb/doc/ncbiRefSeq/ncbiOrtho.makedoc
@@ -0,0 +1,89 @@
+#! /bin/bash
+
+# these NCBI files get replaced daily
+#wget https://ftp.ncbi.nih.gov/gene/DATA/gene2accession.gz
+#gunzip gene2accession.gz &
+#wget https://ftp.ncbi.nih.gov/gene/DATA/gene_orthologs.gz
+#gunzip gene_orthologs.gz
+#wait
+
+## Note: the annotated cat genes in the ncbi set appear to be on a different genome
+## than the one used at UCSC, e.g. chrB4 is NC_018729.3 but NC_058374.1 is used in gene2accession
+#
+#printf "7955\tzebrafish\tdanRer11\n" > species.txt
+#printf "9606\thuman\thg38\n" >> species.txt
+#printf "9913\tcow\tbosTau9\n" >> species.txt
+#printf "10090\tmouse\tmm39\n" >> species.txt
+##printf "9031\tchicken\tgalGal6\n" >> species.txt
+#printf "9615\tdog\tcanFam6\n" >> species.txt
+##printf "10116\trat\trn6\n" >> species.txt # rat is different version: ucsc chr1 is NC_005100.4 instead of NC_086019
+## this is because rn8 is stuck on hgwdev
+## the same is true-ish for chicken: UCSC lists NC_006088.5 for chr1 but NCBI uses NC_052573.1, which is weirdly titled 
+## 'alternate assembly' 
+#
+## Other orthologs listed in first column of gene_ortholog file (fly doesn't have matches with human)
+##7227 fly 7460 honeybee 36329 plasmodium
+#
+#rm *.chrom.bed
+#for species in $(cut -f3 species.txt); do
+#	bigBedToBed /gbdb/$species/$species.chromAlias.bb $species.chrom.bed
+## 	head -n2 $species.chrom.bed # mm39 and hg38 have 8 fields instead of 7
+#	tawk '{print $NF}' $species.chrom.bed >> field1
+#	tawk '{print $1}' $species.chrom.bed >> field2
+##      rm $species.chrom.bed
+#done
+## remove a few chroms without NCBI IDs
+#paste field1 field2 | grep -vP '\t$' > want.chroms
+#rm field1 field2
+#
+## get all gene info for genes on NCBI chromosomes for our species
+#selectById 1 want.chroms 8 gene2accession > accession.info # 5 minutes
+# sanity check, do we get all 6 species:
+#cut -f1 accession.info | uniq -c
+#  82307 7955
+# 352450 9606
+#  99190 9615
+#  84338 9913
+# 187233 10090
+
+
+## replace with UCSC IDs
+#subColumn -skipMiss -miss=idMiss.txt 8 accession.info want.chroms accToLoc.txt
+##missed 0
+#head -n1 gene2accession > gene2ucscAccession.txt
+#cat accToLoc.txt >> gene2ucscAccession.txt
+#
+## and create tracks
+#./ortho.py --ortho gene_orthologs --species species.txt --coords gene2ucscAccession.txt 
+##There are 297 genes without genome info
+## this creates <ucsc species ID>.bed files
+#
+#cat << '_EOF_' > ncbiOrtho.as
+#table orthologs
+#"Bed 9+2 file for NCBI orthologs"
+#    (
+#    string chrom;      "Reference sequence chromosome or scaffold"
+#    uint   chromStart; "Start position in chromosome"
+#    uint   chromEnd;   "End position in chromosome"
+#    string name;       "Short Name of item"
+#    uint   score;      "Score from 0-1000"
+#    char[1] strand;    "+ or -"
+#    uint thickStart;   "Start of where display should be thick (start codon)"
+#    uint thickEnd;     "End of where display should be thick (stop codon)"
+#    uint reserved;     "Used as itemRgb as of 2004-11-22"
+#    string hugo;       "gene symbol"
+#    lstring url;       "urls to orthologs"
+#    )
+#_EOF_
+#
+outdir=/cluster/home/jeltje/public_html/brwsr/ncbiOrtho_hub/
+for species in $(cut -f3 species.txt); do
+#    bedSort $species.bed $species.bed
+    # trix index (make search case insensitive) for hugo IDs
+
+    tawk '{print $4,$10,$4}' $species.bed > name.table
+    ixIxx name.table $outdir/$species/ncbiOrtho.ix $outdir/$species/ncbiOrtho.ixx
+    #bedToBigBed -type=bed9+3 -tab -as=ncbiOrtho.as -extraIndex=hugo,name $species.bed -sizesIsChromAliasBb /gbdb/$species/$species.chromAlias.bb $outdir/$species/ncbiOrtho.bb
+done
+#rm gene2accession species.txt want.chroms accession.info idMiss.txt accToLoc.txt gene2ucscAccession.txt *.bed
+