c9ddf4ca037d9b61b462b77da54bc2f5b14726be chmalee Wed Oct 11 12:05:59 2023 -0700 Revert historical refSeq track changes This reverts the following commits 5f7b28612df1ff4e29c31020b33f8fee9c097b11 346fa22af4717e4d8bdaa6a22a873ac4324c357b d2346420237f134dff79722739380d022c5ec48d fbcacfcd4a8baadb31fca8b07c8f831580c31c82 db4660fe604ccb1b21aa3b5dfe3421cf8ac662eb bbca1ee6f90a46e7a139434849dbf32bd206b522 diff --git src/hg/makeDb/doc/hg38/ncbiRefSeq.txt src/hg/makeDb/doc/hg38/ncbiRefSeq.txt index 96a722d..828e094 100644 --- src/hg/makeDb/doc/hg38/ncbiRefSeq.txt +++ src/hg/makeDb/doc/hg38/ncbiRefSeq.txt @@ -1,439 +1,358 @@ # for emacs: -*- mode: sh; -*- # This file describes the construction of NCBI RefSeq track -- latest first ######################################################################### # ncbiRefSeq.p11 Genes (DONE - 2018-01-09 - Angie) # Previously done 2017-09-28; redone 11-16 to include mito "rna" from chrM genomic seq; # redone 2018-01-09 to pick up changes in NCBI download files from 2017-12-22 mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.37_GRCh38.p11 hg38) > do.log 2>&1 & tail -f do.log # *** All done ! Elapsed time: 23m51s # real 23m51.091s cat fb.ncbiRefSeq.hg38.txt # 129593530 bases of 3049335806 (4.250%) in intersection ######################################################################### ######################################################################### # ncbiRefSeq.p8 Genes (DONE - 2016-08-18 - Hiram) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p8 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p8 # running step wise as this script is still under development time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.34_GRCh38.p8 hg38) > download.log 2>&1 # real 12m35.601s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=process -bigClusterHub=ku -dbHost=hgwdev \ -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.34_GRCh38.p8 hg38) > process.log 2>&1 # real 14m18.953s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=load -bigClusterHub=ku -dbHost=hgwdev \ -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.34_GRCh38.p8 hg38) > load.log 2>&1 # real 0m59.189s cat fb.ncbiRefSeq.hg38.txt # 141131948 bases of 3049335806 (4.628%) in intersection ############################################################################# ######################################################################### # ncbiRefSeq.p7 Genes (DONE - 2016-05-20 - Hiram) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p7 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p7 # running step wise as this script is still under development time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.33_GRCh38.p7 hg38) > download.log 2>&1 # real 12m35.601s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=process -bigClusterHub=ku -dbHost=hgwdev \ -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.33_GRCh38.p7 hg38) > process.log 2>&1 # real 14m18.953s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=load -bigClusterHub=ku -dbHost=hgwdev \ -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.33_GRCh38.p7 hg38) > load.log 2>&1 # real 0m59.189s cat fb.ncbiRefSeq.hg38.txt # 141131948 bases of 3049335806 (4.628%) in intersection ############################################################################# mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq cd /hive/data/genomes/hg38/bed/ncbiRefSeq export asmName="GCF_000001405.31_GRCh38.p5" # most of these files could be linked in from the refseq download, for example: ln -s /hive/data/genomes/hg38/refseq/${asmName}_rna.gbff.gz ./ # or, fetched freshly from NCBI refseq assemblies: for F in ${asmName}_genomic.fna.gz ${asmName}_genomic.gff.gz \ ${asmName}_rna.fna.gz ${asmName}_assembly_report.txt do time rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/${asmName}/${F} ./ done # measure these items: faCount ${asmName}_rna.fna.gz > rna.faCount.txt faCount ${asmName}_genomic.fna.gz > genomic.faCount.txt egrep -v "^#seq|^total" rna.faCount.txt | cut -f1,2 \ | sort -k2nr > rna.chrom.sizes egrep -v "^#seq|^total" genomic.faCount.txt | cut -f1,2 \ | sort -k2nr > genomic.chrom.sizes # this constructes the unlifted genePred file with NCBI chrom names gff3ToGenePred -useName -attrsOut=${asmName}.attrs.useName.txt \ ${asmName}_genomic.gff.gz \ ${asmName}.useName.gp # extract additional names from column nine to use as an index for # the variety of names that identify each item ~/kent/src/hg/makeDb/doc/hg38/gffToIx.pl ${asmName}_genomic.gff.gz \ > ${asmName}_genomic.ix.txt # for psl alignments, only need to work on the items with Gap specifications # The NG_ named items do not function correctly zgrep -v "NG_" ${asmName}_genomic.gff.gz | grep "Gap=" >> only.gap.gff # construct the PSL alignments from the Gap records sed -e 's/cDNA_match/match/;' only.gap.gff \ | gff3ToPsl genomic.chrom.sizes rna.chrom.sizes stdin ${asmName}.psl # this doesn't work, too many UCSC names missing from the NCBI file: grep -v "^#" ${asmName}_assembly_report.txt | sed -e 's/^M//g;' \ | cut -f7,9,10 \ |awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $3, $2}' \ > insdcToHg38.lift # the automated hubs process does construct a good NCBI to UCSC lift file: export liftFile="/hive/data/inside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/${asmName}/${asmName}.ncbiToUcsc.lift" # liftUp the genePred from NCBI chrom names to UCSC chrom names liftUp -extGenePred -type=.gp stdout \ ${liftFile} warn ${asmName}.useName.gp | gzip -c \ > ${asmName}.ucsc.gp.gz touch -r ${asmName}.useName.gp ${asmName}.ucsc.gp.gz # some items will fail because they are on patches or alternative sequence # from Patch 5 that does not exist in hg38 genePredCheck -db=hg38 ${asmName}.ucsc.gp.gz # checked: 150709 failed: 1455 # filter out the patches and alternative sequences that are not in hg38 genePredFilter -db=hg38 ${asmName}.ucsc.gp.gz stdout \ | gzip -c > ${asmName}.hg38.gp.gz # this should be perfectly good genePredCheck -db=hg38 ${asmName}.hg38.gp.gz # checked: 149254 failed: 0 # liftUp the psl file from NCBI chrom names to UCSC chrom names pslSwap ${asmName}.psl stdout \ | liftUp -type=.psl stdout ${liftFile} drop stdin \ | gzip -c > ${asmName}.ucsc.psl.gz touch -r ${asmName}.psl ${asmName}.ucsc.psl.gz # same problem here as above, not everything is in hg38, # filter patch 5 items out: pslCheck -pass=stdout -db=hg38 ${asmName}.ucsc.psl.gz 2> /dev/null \ | gzip -c > ${asmName}.hg38.psl.gz pslCheck -db=hg38 ${asmName}.hg38.psl.gz # checked: 1062 failed: 0 errors: 0 # separate categories of genes: zegrep "^N(M|R)|^YP" ${asmName}.hg38.gp.gz > curated.gp zegrep "^X(M|R)" ${asmName}.hg38.gp.gz > predicted.gp zegrep -v "^N(M|R)|^YP|X(M|R)" ${asmName}.hg38.gp.gz > other.gp # verify none lost wc -l curated.gp predicted.gp other.gp 54869 curated.gp 88703 predicted.gp 5682 other.gp 149254 total zcat ${asmName}.hg38.gp.gz | wc -l 149254 # extract CDS records from the rna.gbff.gz for bigPsl construction: ~/kent/src/hg/makeDb/doc/hg38/gbffToCds.pl ${asmName}_rna.gbff.gz > rna.cds # ensure simple names are in the rna.fa file zcat ${asmName}_rna.fna.gz | sed -e 's/ .*//;' > rna.fa pslToBigPsl -fa=rna.fa -cds=rna.cds ${asmName}.hg38.psl.gz test.bigPsl sort -k1,1 -k2,2n test.bigPsl > ${asmName}.hg38.bigPsl bedToBigBed -type=bed12+12 -tab -as=$HOME/kent/src/hg/lib/bigPsl.as \ ${asmName}.hg38.bigPsl ../../chrom.sizes ${asmName}.hg38.bigPsl.bb # loading genePred tracks: hgLoadGenePred -genePredExt hg38 ncbiRefSeqCurated curated.gp genePredCheck -db=hg38 ncbiRefSeqCurated # checked: 54869 failed: 0 hgLoadGenePred -genePredExt hg38 ncbiRefSeqPredicted predicted.gp genePredCheck -db=hg38 ncbiRefSeqPredicted # checked: 88703 failed: 0 hgLoadGenePred -genePredExt hg38 ncbiRefSeqOther other.gp genePredCheck -db=hg38 ncbiRefSeqOther # checked: 5682 failed: 0 ########## early experiment, not used later # # and the bigPsl file: # mkdir -p /gbdb/hg38/bbi/ncbiRefSeq # ln -s `pwd`/${asmName}.hg38.bigPsl.bb /gbdb/hg38/bbi/ncbiRefSeqBigPsl.bb # hgBbiDbLink hg38 ncbiRefSeqBigPsl /gbdb/hg38/bbi/ncbiRefSeqBigPsl.bb ########## early experiment, not used later ############################################################################# # addition of HGMD-restricted subset, Max, Jan 29 2019 # updated Dec 2019 # updated Dec 2020 # updated Aug 2023 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27/ # change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019? # adding "." so NM_123 doesn't match NM_123123 cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. -f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt zcat process/hg38.curated.gp.gz | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp hgLoadGenePred -genePredExt hg38 ncbiRefSeqHgmd hgmd.curated.gp # wc -l says: 10772 hgmd.curated.gp for the 2021 version ############################################################################# # ncbiRefSeq.p13 update (DONE - 2019-12-06 - Hiram) # current version information cat /gbdb/hg38/ncbiRefSeq/ncbiRefSeqVersion.txt # NCBI Homo sapiens Annotation Release 109 (2018-03-29) # Version information from the file: # /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/ # all_assembly_versions/GCF_000001405.39_GRCh38.p13/ # GCF_000001405.39_GRCh38.p13_genomic.gff.gz #!annotation-date 09/05/2019 #!annotation-source NCBI Homo sapiens Updated Annotation Release 109.20190905 mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06 ### BEFORE loading this updated table featureBits -countGaps hg38 ncbiRefSeq # 134109466 bases of 3257347282 (4.117%) in intersection featureBits -enrichment hg38 refGene ncbiRefSeq # refGene 3.098%, ncbiRefSeq 4.332%, both 2.920%, cover 94.23%, enrich 21.75x featureBits -enrichment hg38 ncbiRefSeq refGene # ncbiRefSeq 4.332%, refGene 3.098%, both 2.920%, cover 67.40%, enrich 21.75x featureBits -enrichment hg38 ncbiRefSeqCurated refGene # ncbiRefSeqCurated 2.880%, refGene 3.098%, both 2.846%, cover 98.84%, enrich 31.90x featureBits -enrichment hg38 refGene ncbiRefSeqCurated # refGene 3.098%, ncbiRefSeqCurated 2.880%, both 2.846%, cover 91.86%, enrich 31.90x # running step wise just to be careful time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.39_GRCh38.p13 hg38) > download.log 2>&1 # real 3m23.090s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=process -bigClusterHub=ku -dbHost=hgwdev \ -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.39_GRCh38.p13 hg38) > process.log 2>&1 # real 6m10.922s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=load -bigClusterHub=ku -dbHost=hgwdev \ -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.39_GRCh38.p13 hg38) > load.log 2>&1 # real 0m41.366s ### AFTER loading this updated table # compare this result: cat fb.ncbiRefSeq.hg38.txt # 136778258 bases of 3095998939 (4.418%) in intersection # with previous version existing table (from fb above): # 134109466 bases of 3257347282 (4.117%) in intersection featureBits -enrichment hg38 refGene ncbiRefSeq # refGene 3.098%, ncbiRefSeq 4.418%, both 3.073%, cover 99.19%, enrich 22.45x # previous: # refGene 3.098%, ncbiRefSeq 4.332%, both 2.920%, cover 94.23%, enrich 21.75x featureBits -enrichment hg38 ncbiRefSeq refGene # ncbiRefSeq 4.418%, refGene 3.098%, both 3.073%, cover 69.56%, enrich 22.45x # previous: # ncbiRefSeq 4.332%, refGene 3.098%, both 2.920%, cover 67.40%, enrich 21.75x featureBits -enrichment hg38 ncbiRefSeqCurated refGene # ncbiRefSeqCurated 3.073%, refGene 3.098%, both 3.067%, cover 99.81%, enrich 32.22x # previous: # ncbiRefSeqCurated 2.880%, refGene 3.098%, both 2.846%, cover 98.84%, enrich 31.90x featureBits -enrichment hg38 refGene ncbiRefSeqCurated # refGene 3.098%, ncbiRefSeqCurated 3.073%, both 3.067%, cover 98.99%, enrich 32.22x # previous: # refGene 3.098%, ncbiRefSeqCurated 2.880%, both 2.846%, cover 91.86%, enrich 31.90x ######################################################################### # addition of RefSeq Select-restricted subset, Max, Feb 10 2019 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2019-12-06/ zcat download/*_genomic.gff.gz | egrep 'tag=(RefSeq|MANE) Select' | cut -f9- | tr ';' '\n' | grep Name= | grep -v NP_ | cut -d= -f2 | sort -u > refseqSelectTranscripts.txt cat process/hg38.curated.gp | fgrep -f refseqSelectTranscripts.txt - > refseqSelect.curated.gp hgLoadGenePred -genePredExt hg38 ncbiRefSeqSelect refseqSelect.curated.gp wc -l refseqSelect.curated.gp 21071 refseqSelect.curated.gp ############################################################################# ############################################################################# # update 2020-10-27 (DONE - Hiram - 2020-10-27) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-10-27 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \ GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 & # real 11m46.506s cat fb.ncbiRefSeq.hg38.txt # 137385668 bases of 3110768607 (4.416%) in intersection ############################################################################# # update 2021-05-13 (DONE - Hiram - 2021-05-13) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \ GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 & # real 11m46.506s cat fb.ncbiRefSeq.hg38.txt # 137385668 bases of 3110768607 (4.416%) in intersection ############################################################################# - -############################################################################# -# Add psuedo-track of old transcripts (DONE - ChrisL - 2023-09-21) -# Updated ChrisL 2023-10-06 -############################################################################# -# see also /hive/data/outside/refSeqHistorical/newBuild.sh -# get the gff3 -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_genomic.gff.gz -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam.bai -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz - -# extract the sequences -cp /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.txt chromAlias.txt -samtools fasta GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam > refSeqHistorical.fa -faToTwoBit -ignoreDups refSeqHistorical.fa refSeqHistorical.2bit -bamToPsl -nohead -chromAlias=chromAlias.txt \ - -allowDups GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam out.psl -~/kent/src/hg/utils/automation/gbffToCds.pl GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \ - | sort > refSeqHistorical.cds -# Check our cds is correct: -hgsql -Ne "select * from ncbiRefSeqCds where id like 'NM%' order by rand() limit 6" hg38 -# +----------------+-----------+ -# | NM_001110798.2 | 263..1552 | -# | NM_001190818.2 | 181..2766 | -# | NM_002009.4 | 466..1050 | -# | NM_006896.4 | 131..823 | -# | NM_025190.4 | 193..4254 | -# | NM_177402.5 | 215..1474 | -# +----------------+-----------+ -grep "NM_025190.4\|NM_001110798.2\|NM_006896.4\|NM_177402.5\|NM_002009.4\|NM_001190818.2" refSeqHistorical.cds -# NM_001110798.2 263..1552 -# NM_001190818.2 181..2766 -# NM_002009.4 466..1050 -# NM_006896.4 131..823 -# NM_025190.4 193..4254 -# NM_177402.5 215..1474 - -# Now we have psl and cds for searching, make a genePred track for display -zcat GCF_000001405.40-RS_2023_03_genomic.gff.gz \ - | gff3ToGenePred -maxParseErrors=-1 -maxConvertErrors=-1 -warnAndContinue \ - -attrsOut=attrs.out -unprocessedRootsOut=unprocessedRoots.out \ - -refseqHacks stdin out.gp 2>err - -# rename chromosomes to ucsc names, and remove the ids -# from the current version of the refSeq track -hgsql -Ne "select distinct(name) from ncbiRefSeq" hg38 | sort > ncbiRefSeq.currentIds -chromToUcsc -k 2 -a chromAlias.txt -i out.gp \ - | sort > out.ucscChrom.gp -chromToUcsc -k 14 -a chromAlias.txt -i out.psl \ - | sort > out.ucscChrom.psl -cut -f10 out.ucscChrom.psl | sort -u | comm -23 - ncbiRefSeq.currentIds > old.ids -grep -Fwf old.ids out.ucscChrom.gp > refSeqHistorical.gp -grep -Fwf old.ids out.ucscChrom.psl > refSeqHistorical.psl -# make sure we have the right sequences: -twoBitToFa -seqList=old.ids refSeqHistorical.2bit refSeqHistorical.deDuped.fa - -# make the bed file for the track -genePredToBigGenePred refSeqHistorical.gp stdout | sort -k1,1 -k2,2n > refSeqHistorical.bigGp -bedToBigBed -type=bed12+8 -tab -sizesIsChromAliasBb \ - -as=$HOME/kent/src/hg/lib/bigGenePred.as -extraIndex=name \ - refSeqHistorical.bigGp \ - /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.bb \ - refSeqHistorical.bb - -# ensure only coding transcripts have a cds -awk -F$'\\t' '$6 != $7 {print $1;}' refSeqHistorical.gp | sort -u > coding.cds.names -join -t$'\t' coding.cds.names <(sort -u refSeqHistorical.cds) > refSeqHistorical.cds.coding - -# load psl table -hgLoadPsl hg38 -table=ncbiRefSeqPslOld refSeqHistorical.psl - -# load cds table -hgLoadSqlTab hg38 ncbiRefSeqCdsOld ~/kent/src/hg/lib/cdsSpec.sql refSeqHistorical.cds.coding - -# load seq and ext tables -ln -sf `pwd`/refSeqHistorical.deDuped.fa /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa -hgLoadSeq -drop -seqTbl=seqNcbiRefSeqOld -extFileTbl=extNcbiRefSeqOld hg38 /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa - -# link the files into /gbdb -ln -s `pwd`/refSeqHistorical.bb /gbdb/hg38/ncbiRefSeq/refSeqHistorical.bb