src/hg/makeDb/doc/hg38/ncbiRefSeq.txt bbca1ee6f90a46e7a139434849dbf32bd206b522

bbca1ee6f90a46e7a139434849dbf32bd206b522
chmalee
  Thu Sep 21 12:27:34 2023 -0700
First cut of adding a historical refseq transcripts track, and getting hgvs searching to work with them, refs #26016

diff --git src/hg/makeDb/doc/hg38/ncbiRefSeq.txt src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
index 828e094..c8686f3 100644
--- src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
+++ src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
@@ -344,15 +344,76 @@
 # update 2021-05-13 (DONE - Hiram - 2021-05-13)
 
   mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13
   cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13
 
   time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
       GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 &
   # real    11m46.506s
 
   cat fb.ncbiRefSeq.hg38.txt
   # 137385668 bases of 3110768607 (4.416%) in intersection
 
 #############################################################################
+
+#############################################################################
+# Add psuedo-track of old transcripts (DONE - ChrisL -2023-09-21)
+#############################################################################
+# see also /hive/data/outside/refSeqHistorical/build.sh
+# get the gff3
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_genomic.gff.gz
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam.bai
+
+# turn into genePred
+zcat GCF_000001405.40-RS_2023_03_genomic.gff.gz \
+    | gff3ToGenePred -maxParseErrors=-1 -maxConvertErrors=-1 -warnAndContinue \
+        -attrsOut=attrs.out -unprocessedRootsOut=unprocessedRoots.out \
+        -refseqHacks stdin out.gp 2>err
+
+# rename chromosomes to ucsc names, throwing away fix/alts/etc, and removing the ids
+# from the current version of the refSeq track
+# I don't know the story with NM_001304990.1, but it has an entry on chrX and chrY
+# with a different cds record, which have to be unique, so I'm throwing it away for
+# now
+hgsql -Ne "select distinct(name) from ncbiRefSeqCurated" hg38 | sort > ncbiRefSeq.currentIds
+chromToUcsc -k 2 -a /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.txt  \
+        -i out.gp \
+    | tawk '$2 !~ /alt/ && $2 !~ /fix/ && $2 !~ /random/ && $1 ~ /\./ && $1 !~ /NP/' \
+    | tawk '{if ($1 == "NM_001304990.1" && $2 == "chrY") {next} else {print}}' \
+    | sort > out.ucscChrom.gp
+cut -f1 out.ucscChrom.gp | sort -u | comm -23 - ncbiRefSeq.currentIds > old.ids
+grep -Fwf old.ids out.ucscChrom.gp > refSeqHistorical.gp
+
+# make the bed file for the track
+genePredToBed refSeqHistorical.gp stdout | sort -k1,1 -k2,2n > refSeqHistorical.bed
+bedToBigBed -extraIndex=name -sizesIsChromAliasBb -type=bed12 -tab \
+    refSeqHistorical.bed \
+    /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.bb \
+    refSeqHistorical.bb
+
+# construct the necessary tables for the hgvs search:
+# make a psl+cds for a special hgvs search on these old sequences
+genePredToFakePsl -chromSize=/hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chrom.sizes \
+    hg38 refSeqHistorical.gp  refSeqHistorical.psl refSeqHistorical.cds
+
+# ensure only coding transcripts have a cds
+awk -F$'\\t' '$6 != $7 {print $1;}' refSeqHistorical.gp | sort -u > coding.cds.names
+join -t$'\t' coding.cds.names <(sort -u refSeqHistorical.cds) > refSeqHistorical.cds.coding
+
+# extract the sequences
+samtools fasta GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam > refSeqHistorical.fa
+
+# load psl table
+hgLoadPsl hg38 -table=ncbiRefSeqPslOld refSeqHistorical.psl
+
+# load cds table
+hgLoadSqlTab hg38 ncbiRefSeqCdsOld ~/kent/src/hg/lib/cdsSpec.sql refSeqHistorical.cds.coding
+
+# load seq and ext tables
+ln -sf `pwd`/refSeqHistorical.fa /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
+hgLoadSeq -drop -seqTbl=seqNcbiRefSeqOld -extFileTbl=extNcbiRefSeqOld hg38 /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
+
+# link the files into /gbdb
+ln -s `pwd`/refSeqHistorical.bb /gbdb/hg38/ncbiRefSeq/refSeqHistorical.bb