src/hg/makeDb/doc/hg38/ncbiRefSeq.txt 62ccb95d5c71a3bf7658327e6db2bc35c19d1d71

62ccb95d5c71a3bf7658327e6db2bc35c19d1d71
chmalee
  Wed Oct 11 09:57:36 2023 -0700
Adding back hgvs search support for historical refseq transcripts and
historical refSeq transcripts track. Should be more resilient code this
time.

diff --git src/hg/makeDb/doc/hg38/ncbiRefSeq.txt src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
index 828e094..ed6807b 100644
--- src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
+++ src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
@@ -344,15 +344,129 @@
 # update 2021-05-13 (DONE - Hiram - 2021-05-13)
 
   mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13
   cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13
 
   time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \
       GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 &
   # real    11m46.506s
 
   cat fb.ncbiRefSeq.hg38.txt
   # 137385668 bases of 3110768607 (4.416%) in intersection
 
 #############################################################################
+
+#############################################################################
+# Add psuedo-track of old transcripts (DONE - ChrisL - 2023-09-21)
+# Updated ChrisL 2023-10-06
+#############################################################################
+# see also /hive/data/outside/refSeqHistorical/newBuild.sh
+# get the gff3
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_genomic.gff.gz
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam.bai
+wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz
+
+# extract the sequences
+cp /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.txt chromAlias.txt
+samtools fasta GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam > refSeqHistorical.fa
+faToTwoBit -ignoreDups refSeqHistorical.fa refSeqHistorical.2bit
+bamToPsl -nohead -chromAlias=chromAlias.txt \
+        -allowDups GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam out.psl 
+~/kent/src/hg/utils/automation/gbffToCds.pl GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \
+    | sort > refSeqHistorical.cds
+# Check our cds is correct:
+hgsql -Ne "select * from ncbiRefSeqCds where id like 'NM%' order by rand() limit 6" hg38
+# +----------------+-----------+
+# | NM_001110798.2 | 263..1552 |
+# | NM_001190818.2 | 181..2766 |
+# | NM_002009.4    | 466..1050 |
+# | NM_006896.4    | 131..823  |
+# | NM_025190.4    | 193..4254 |
+# | NM_177402.5    | 215..1474 |
+# +----------------+-----------+
+grep "NM_025190.4\|NM_001110798.2\|NM_006896.4\|NM_177402.5\|NM_002009.4\|NM_001190818.2" refSeqHistorical.cds
+# NM_001110798.2    263..1552
+# NM_001190818.2  181..2766
+# NM_002009.4 466..1050
+# NM_006896.4 131..823
+# NM_025190.4 193..4254
+# NM_177402.5 215..1474
+
+# Now we have psl and cds for searching, make a genePred track for display
+zcat GCF_000001405.40-RS_2023_03_genomic.gff.gz \
+    | gff3ToGenePred -maxParseErrors=-1 -maxConvertErrors=-1 -warnAndContinue \
+        -attrsOut=attrs.out -unprocessedRootsOut=unprocessedRoots.out \
+        -refseqHacks stdin out.gp 2>err
+
+# rename chromosomes to ucsc names, and remove the ids
+# from the current version of the refSeq track
+hgsql -Ne "select distinct(name) from ncbiRefSeq" hg38 | sort > ncbiRefSeq.currentIds
+chromToUcsc -k 2 -a chromAlias.txt -i out.gp \
+    | sort > out.ucscChrom.gp
+chromToUcsc -k 14 -a chromAlias.txt -i out.psl \
+    | sort > out.ucscChrom.psl
+cut -f10 out.ucscChrom.psl | sort -u | comm -23 - ncbiRefSeq.currentIds > old.ids
+grep -Fwf old.ids out.ucscChrom.gp > refSeqHistorical.gp
+grep -Fwf old.ids out.ucscChrom.psl > refSeqHistorical.psl
+# make sure we have the right sequences:
+twoBitToFa -seqList=old.ids refSeqHistorical.2bit refSeqHistorical.deDuped.fa
+
+# make the bed file for the track
+hgLoadGenePred -genePredExt hg38 ncbiRefSeqHistorical refSeqHistorical.gp
+#genePredToBigGenePred refSeqHistorical.gp stdout | sort -k1,1 -k2,2n > refSeqHistorical.bigGp
+#bedToBigBed -type=bed12+8 -tab -sizesIsChromAliasBb \
+#    -as=$HOME/kent/src/hg/lib/bigGenePred.as -extraIndex=name \
+#    refSeqHistorical.bigGp \
+#    /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.bb \
+#    refSeqHistorical.bb
+
+# ensure only coding transcripts have a cds
+awk -F$'\\t' '$6 != $7 {print $1;}' refSeqHistorical.gp | sort -u > coding.cds.names
+join -t$'\t' coding.cds.names <(sort -u refSeqHistorical.cds) > refSeqHistorical.cds.coding
+
+# load psl table
+hgLoadPsl hg38 -table=ncbiRefSeqPslHistorical refSeqHistorical.psl 
+
+# load cds table
+hgLoadSqlTab hg38 ncbiRefSeqCdsHistorical ~/kent/src/hg/lib/cdsSpec.sql refSeqHistorical.cds.coding
+
+# load seq and ext tables
+ln -sf `pwd`/refSeqHistorical.deDuped.fa /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
+hgLoadSeq -drop -seqTbl=seqNcbiRefSeqHistorical -extFileTbl=extNcbiRefSeqHistorical hg38 /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
+
+# link the files into /gbdb
+ln -s `pwd`/refSeqHistorical.bb /gbdb/hg38/ncbiRefSeq/refSeqHistorical.bb
+
+# Update 10-10-2023, need to build the ncbiRefSeqOld and ncbiRefSeqLinkOld
+# tables for associated hgvs searching
+# extract labels from semi-structured text in gbff COMMENT/description sections:
+zcat GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \
+  | (grep ' :: ' || true) \
+  | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \
+  | sort -u > pragmaLabels.txt
+# genbank processor extracts infomation about the RNAs
+/hive/data/outside/genbank/bin/x86_64/gbProcess \
+    /dev/null raFile.txt GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \
+    2>gbProcess.err
+# Processing GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz into /dev/null and raFile.txt
+
+# gbProcess has a bunch of bogus info, trim it to what we need:
+cat /hive/data/outside/refSeqHistorical/pragmaLabels.txt \
+    | tr -s '[:blank:]' \
+    | cut -d':' -f1 \
+    | sort -u \
+    | cut -d' ' -f2- > pragmaLabels.cleaned.txt
+~/kent/src/hg/utils/automation/gff3ToRefLink.pl \
+    raFile.txt \
+    GCF_000001405.40-RS_2023_03_genomic.gff.gz \
+    pragmaLabels.cleaned.txt \
+    continueOnParentErrors \
+    2> refLink.stderr \
+    | sort > refLink.tab
+# join the refLink metadata with curated+predicted names
+cut -f1 refSeqHistorical.gp | sort -u > name.list
+join -t$'\t' name.list refLink.tab > ncbiRefSeqLinkOld.tab
+# loading the cross reference data
+hgLoadSqlTab hg38 ncbiRefSeqLinkHistorical ~/kent/src/hg/lib/ncbiRefSeqLink.sql ncbiRefSeqLinkOld.tab