src/hg/makeDb/doc/hg38/pseudogenes.txt e42db081e12ceb2019b78b9b29e7838b908c74f9

e42db081e12ceb2019b78b9b29e7838b908c74f9
jeltje.van.baren
  Thu Mar 13 12:31:40 2025 -0700
adding pseudogene makedoc

diff --git src/hg/makeDb/doc/hg38/pseudogenes.txt src/hg/makeDb/doc/hg38/pseudogenes.txt
new file mode 100755
index 00000000000..1ec5da3c10b
--- /dev/null
+++ src/hg/makeDb/doc/hg38/pseudogenes.txt
@@ -0,0 +1,118 @@
+#! /bin/bash
+
+# file (received by MarkD through email 12 May 2024):
+# /hive/data/genomes/hg38/bed/pseudoPipe/Homo_sapiens.GRCh38.103.pseudogene.pseudopipe.wparent.gtf
+# This is incorrectly formatted gtf because it has overlapping exons. The python code converts
+# the gtf to bed using MarkD's pycbio code, which can be pip installed:
+#python3 -m venv mytestvenv
+#source mytestvenv/bin/activate
+#pip install /hive/groups/browser/pycbio
+# the program also merges overlapping exons when they're in the same transcript because
+# the gtf we received contains those.
+
+# Adding HUGO ids using gencode tables.
+# gene IDs come from v37 and v38 (both are needed to find all)
+rm -rf all.ids
+for i in 37 38; do
+       echo "select geneId, geneName, transcriptId, proteinId from hg38.wgEncodeGencodeAttrsV$i" | \
+          hgsql -N >> all.ids
+done
+
+annotationFile='/hive/data/genomes/hg38/bed/pseudoPipe/Homo_sapiens.GRCh38.103.pseudogene.pseudopipe.wparent.gtf'
+
+# file has duplicate lines. Remove without sorting
+awk '!seen[$0]++' $annotationFile > uniq.gtf
+./pseudoPipeToBed.py --genes all.ids --gtf uniq.gtf --bed pgene.bed
+
+# change the chromosome IDs (they need to  match the parents)
+bigBedToBed /gbdb/hg38/hg38.chromAlias.bb chromAlias.bed
+tawk '{print $6, $1}' chromAlias.bed > sub.list
+subColumn 1 pgene.bed sub.list pgene2.bed
+
+bedSort pgene2.bed pgene.bed
+
+
+
+########## PARENT TRACK ######################
+# Create a parent track and add parent info to the children
+
+### V37
+# we only need the chrom coords, not the blocks
+zcat /hive/data/genomes/hg38/bed/gencodeV37/build/gencodeV37.bed.gz | cut -f1-6 > v37.tsv
+
+# these have ENST IDs but not gene IDs
+# replace with gene IDs (note that this creates duplicate IDs in the file)
+paste <(cut -f3 all.ids) <(cut -f1 all.ids | cut -f1 -d'.') > sub.me
+subColumn -miss=idMiss.txt 4 v37.tsv sub.me v37.sub
+#grep -c ENST v37.sub # 0
+
+### V38
+bigBedToBed /hive/data/genomes/hg38/bed/gencodeV38/build/hg38.gencodeV38.bb V38.bed
+subColumn -miss=idMiss.txt 4 <(cut -f1-6 V38.bed) sub.me v38.sub
+#grep -c ENST v38.sub # 0
+
+# we really only want v37 genes if they are not in v38
+selectById -not 4 v38.sub 4 v37.sub > addMe
+# sort by gene ID, then chr
+cat v38.sub addMe  | sort -k4 -k1 > annot.bed
+
+./pseudoPipeParents.py --pgenebed pgene.bed --annotbed annot.bed --pgeneout pseudopipePgenes.bed --parentout parents.bed
+
+### convert to bigbed ###
+cat << '_EOF_' > pseudoPipePgenes.as
+table yale_pseudogenes
+"Bed 12+6 file with pseudogenes and their (gene, transcript, and protein) parents."
+    (
+    string chrom;      "Chromosome (or contig, scaffold, etc.)"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "Name of item"
+    uint   score;      "Score from 0-1000"
+    char[1] strand;    "+ or -"
+    uint thickStart;   "Start of where display should be thick (start codon)"
+    uint thickEnd;     "End of where display should be thick (stop codon)"
+    uint reserved;     "Used as itemRgb as of 2004-11-22"
+    int blockCount;    "Number of blocks"
+    int[blockCount] blockSizes;  "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+    string hugo;                "Pseudogene of"
+    string pgeneType;           "pseudogene, processed_pseudogene, unprocessed_pseudogene"
+    string ppgene;              "Pseudopipe pseudogene ID"
+    string gene;                "Ensembl pseudogene ID"
+    string geneParent;          "Ensembl gene parent ID (url)"
+    string proteinParent;       "Ensembl protein parent ID"
+    )
+_EOF_
+bedToBigBed -type=bed12+8 -tab -as=pseudoPipePgenes.as -extraIndex=hugo,name,ppgene pseudopipePgenes.bed /hive/data/genomes/hg38/chrom.sizes pseudoPipePgenes.bb
+# trix index (make search case insensitive) for hugo and ENSG parent IDs
+cut -f4,14,18 pseudopipePgenes.bed > name.table
+ixIxx name.table pseudoPipePgenes.ix pseudoPipePgenes.ixx
+
+cat << '_EOF_' > pseudoPipeParents.as
+table yale_parents
+"Bed 9+2 file with parents and and their pseudogenes."
+    (
+    string chrom;      "Chromosome (or contig, scaffold, etc.)"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "Name of item"
+    uint   score;      "Score from 0-1000"
+    char[1] strand;    "+ or -"
+    uint thickStart;   "Start of where display should be thick (start codon)"
+    uint thickEnd;     "End of where display should be thick (stop codon)"
+    uint reserved;     "Used as itemRgb as of 2004-11-22"
+    string hugo;       "Gene symbol"
+    string url;        "Link to pseudogene"
+    )
+_EOF_
+
+# sort so that ENSG comes before PGO Ids, putting the parent at the top
+sort -k1,1 -k2,2n -k4 parents.bed > pseudopipeParents.bed 
+bedToBigBed -type=bed9+2 -tab -as=pseudoPipeParents.as pseudopipeParents.bed /hive/data/genomes/hg38/chrom.sizes pseudoPipeParents.bb
+cut -f4,10 pseudopipeParents.bed > name.table
+ixIxx name.table pseudoPipeParents.ix pseudoPipeParents.ixx
+
+rm addMe v37.tsv v38.sub V38.bed v37.sub sub.me sub.list idMiss.txt
+rm *.bed
+rm name.table uniq.gtf all.ids
+