src/hg/makeDb/doc/hg38/ncbiRefSeq.txt 964998333659e919296441a2e8567a6ef4fff8d6

964998333659e919296441a2e8567a6ef4fff8d6
lrnassar
  Tue Jun 2 13:42:29 2026 -0700
Update hg38 RefSeq Historical track to NCBI RS_2024_08 release. refs #35766

diff --git src/hg/makeDb/doc/hg38/ncbiRefSeq.txt src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
index b911944d703..de2a6506f92 100644
--- src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
+++ src/hg/makeDb/doc/hg38/ncbiRefSeq.txt
@@ -464,15 +464,88 @@
     | cut -d':' -f1 \
     | sort -u \
     | cut -d' ' -f2- > pragmaLabels.cleaned.txt
 ~/kent/src/hg/utils/automation/gff3ToRefLink.pl \
     raFile.txt \
     GCF_000001405.40-RS_2023_03_genomic.gff.gz \
     pragmaLabels.cleaned.txt \
     continueOnParentErrors \
     2> refLink.stderr \
     | sort > refLink.tab
 # join the refLink metadata with curated+predicted names
 cut -f1 refSeqHistorical.gp | sort -u > name.list
 join -t$'\t' name.list refLink.tab > ncbiRefSeqLinkOld.tab
 # loading the cross reference data
 hgLoadSqlTab hg38 ncbiRefSeqLinkHistorical ~/kent/src/hg/lib/ncbiRefSeqLink.sql ncbiRefSeqLinkOld.tab
+
+#############################################################################
+# Update RefSeq Historical to RS_2024_08 (DONE - Lou - 2026-06-02)
+#############################################################################
+# NCBI published a newer historical release, GCF_000001405.40-RS_2024_08
+# (2024-10-22), superseding the RS_2023_03 build above. One-off refresh of the
+# same six tables (names unchanged, no trackDb change). Full script kept at
+# /hive/data/outside/refSeqHistorical/RS_2024_08/build.RS_2024_08.sh
+mkdir /hive/data/outside/refSeqHistorical/RS_2024_08
+cd /hive/data/outside/refSeqHistorical/RS_2024_08
+REL=GCF_000001405.40-RS_2024_08
+BASE=https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38
+
+# get the four source files
+wget "$BASE/current/${REL}_genomic.gff.gz"
+wget "$BASE/current/${REL}_knownrefseq_alns.bam"
+wget "$BASE/current/${REL}_knownrefseq_alns.bam.bai"
+wget "$BASE/${REL}_historical/${REL}_knownrefseq_rna.gbff.gz"
+
+cp /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.txt chromAlias.txt
+
+# sequences from the BAM.
+# No samtools 1.x was available on the host (PATH samtools is 0.1.18), so the
+# `samtools fasta` step was reproduced with `samtools view` piped to an awk that
+# reverse-complements reverse-strand (0x10) reads (bamToFa.awk in the build dir),
+# leaving each transcript in native 5'->3' orientation. Verified byte-identical
+# to the RS_2023_03 2bit for 60 shared accessions.
+samtools view ${REL}_knownrefseq_alns.bam | gawk -f bamToFa.awk > refSeqHistorical.fa
+faToTwoBit -ignoreDups refSeqHistorical.fa refSeqHistorical.2bit
+bamToPsl -nohead -chromAlias=chromAlias.txt ${REL}_knownrefseq_alns.bam out.psl
+
+~/kent/src/hg/utils/automation/gbffToCds.pl ${REL}_knownrefseq_rna.gbff.gz | sort > refSeqHistorical.cds
+
+zcat ${REL}_genomic.gff.gz \
+    | gff3ToGenePred -maxParseErrors=-1 -maxConvertErrors=-1 -warnAndContinue \
+        -attrsOut=attrs.out -unprocessedRootsOut=unprocessedRoots.out \
+        -refseqHacks stdin out.gp 2>err
+
+hgsql -Ne "select distinct(name) from ncbiRefSeq" hg38 | sort > ncbiRefSeq.currentIds
+chromToUcsc -k 2  -a chromAlias.txt -i out.gp  | sort > out.ucscChrom.gp
+chromToUcsc -k 14 -a chromAlias.txt -i out.psl | sort > out.ucscChrom.psl
+cut -f10 out.ucscChrom.psl | sort -u | comm -23 - ncbiRefSeq.currentIds > old.ids
+grep -Fwf old.ids out.ucscChrom.gp  > refSeqHistorical.gp
+grep -Fwf old.ids out.ucscChrom.psl > refSeqHistorical.psl
+twoBitToFa -seqList=old.ids refSeqHistorical.2bit refSeqHistorical.deDuped.fa
+
+# ensure only coding transcripts have a cds.
+# Use LC_ALL=C so sort and join agree on collation -- the locale-default join
+# used in the RS_2023_03 build above silently dropped some coding rows.
+awk -F'\t' '$6 != $7 {print $1;}' refSeqHistorical.gp | LC_ALL=C sort -u > coding.cds.names
+LC_ALL=C sort -u refSeqHistorical.cds > refSeqHistorical.cds.sortedC
+LC_ALL=C join -t$'\t' coding.cds.names refSeqHistorical.cds.sortedC > refSeqHistorical.cds.coding
+
+# load display/search tables
+hgLoadGenePred -genePredExt hg38 ncbiRefSeqHistorical refSeqHistorical.gp
+hgLoadPsl hg38 -table=ncbiRefSeqPslHistorical refSeqHistorical.psl
+hgLoadSqlTab hg38 ncbiRefSeqCdsHistorical ~/kent/src/hg/lib/cdsSpec.sql refSeqHistorical.cds.coding
+
+# seq/ext tables + gbdb symlink
+ln -sf `pwd`/refSeqHistorical.deDuped.fa /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
+hgLoadSeq -drop -seqTbl=seqNcbiRefSeqHistorical -extFileTbl=extNcbiRefSeqHistorical hg38 /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa
+
+# link metadata table for the hgvs xref search
+zcat ${REL}_knownrefseq_rna.gbff.gz | (grep ' :: ' || true) \
+  | perl -wpe 's/\s+::.*//; s/^\s+//;' | sort -u > pragmaLabels.txt
+/hive/data/outside/genbank/bin/x86_64/gbProcess /dev/null raFile.txt ${REL}_knownrefseq_rna.gbff.gz 2>gbProcess.err
+cat pragmaLabels.txt | tr -s '[:blank:]' | cut -d':' -f1 | sort -u | cut -d' ' -f2- > pragmaLabels.cleaned.txt
+~/kent/src/hg/utils/automation/gff3ToRefLink.pl \
+    raFile.txt ${REL}_genomic.gff.gz pragmaLabels.cleaned.txt continueOnParentErrors \
+    2> refLink.stderr | LC_ALL=C sort > refLink.tab
+cut -f1 refSeqHistorical.gp | LC_ALL=C sort -u > name.list
+LC_ALL=C join -t$'\t' name.list refLink.tab > ncbiRefSeqLinkOld.tab
+hgLoadSqlTab hg38 ncbiRefSeqLinkHistorical ~/kent/src/hg/lib/ncbiRefSeqLink.sql ncbiRefSeqLinkOld.tab