c9ddf4ca037d9b61b462b77da54bc2f5b14726be chmalee Wed Oct 11 12:05:59 2023 -0700 Revert historical refSeq track changes This reverts the following commits 5f7b28612df1ff4e29c31020b33f8fee9c097b11 346fa22af4717e4d8bdaa6a22a873ac4324c357b d2346420237f134dff79722739380d022c5ec48d fbcacfcd4a8baadb31fca8b07c8f831580c31c82 db4660fe604ccb1b21aa3b5dfe3421cf8ac662eb bbca1ee6f90a46e7a139434849dbf32bd206b522 diff --git src/hg/makeDb/doc/hg38/ncbiRefSeq.txt src/hg/makeDb/doc/hg38/ncbiRefSeq.txt index 96a722d..828e094 100644 --- src/hg/makeDb/doc/hg38/ncbiRefSeq.txt +++ src/hg/makeDb/doc/hg38/ncbiRefSeq.txt @@ -344,96 +344,15 @@ # update 2021-05-13 (DONE - Hiram - 2021-05-13) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2021-05-13 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -fileServer=hgwdev -smallClusterHub=hgwdev -workhorse=hgwdev \ GCF_000001405.39_GRCh38.p13 hg38) > do.log 2>&1 & # real 11m46.506s cat fb.ncbiRefSeq.hg38.txt # 137385668 bases of 3110768607 (4.416%) in intersection ############################################################################# - -############################################################################# -# Add psuedo-track of old transcripts (DONE - ChrisL - 2023-09-21) -# Updated ChrisL 2023-10-06 -############################################################################# -# see also /hive/data/outside/refSeqHistorical/newBuild.sh -# get the gff3 -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_genomic.gff.gz -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/current/GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam.bai -wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/historical/GRCh38/GCF_000001405.40-RS_2023_03_historical/GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz - -# extract the sequences -cp /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.txt chromAlias.txt -samtools fasta GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam > refSeqHistorical.fa -faToTwoBit -ignoreDups refSeqHistorical.fa refSeqHistorical.2bit -bamToPsl -nohead -chromAlias=chromAlias.txt \ - -allowDups GCF_000001405.40-RS_2023_03_knownrefseq_alns.bam out.psl -~/kent/src/hg/utils/automation/gbffToCds.pl GCF_000001405.40-RS_2023_03_knownrefseq_rna.gbff.gz \ - | sort > refSeqHistorical.cds -# Check our cds is correct: -hgsql -Ne "select * from ncbiRefSeqCds where id like 'NM%' order by rand() limit 6" hg38 -# +----------------+-----------+ -# | NM_001110798.2 | 263..1552 | -# | NM_001190818.2 | 181..2766 | -# | NM_002009.4 | 466..1050 | -# | NM_006896.4 | 131..823 | -# | NM_025190.4 | 193..4254 | -# | NM_177402.5 | 215..1474 | -# +----------------+-----------+ -grep "NM_025190.4\|NM_001110798.2\|NM_006896.4\|NM_177402.5\|NM_002009.4\|NM_001190818.2" refSeqHistorical.cds -# NM_001110798.2 263..1552 -# NM_001190818.2 181..2766 -# NM_002009.4 466..1050 -# NM_006896.4 131..823 -# NM_025190.4 193..4254 -# NM_177402.5 215..1474 - -# Now we have psl and cds for searching, make a genePred track for display -zcat GCF_000001405.40-RS_2023_03_genomic.gff.gz \ - | gff3ToGenePred -maxParseErrors=-1 -maxConvertErrors=-1 -warnAndContinue \ - -attrsOut=attrs.out -unprocessedRootsOut=unprocessedRoots.out \ - -refseqHacks stdin out.gp 2>err - -# rename chromosomes to ucsc names, and remove the ids -# from the current version of the refSeq track -hgsql -Ne "select distinct(name) from ncbiRefSeq" hg38 | sort > ncbiRefSeq.currentIds -chromToUcsc -k 2 -a chromAlias.txt -i out.gp \ - | sort > out.ucscChrom.gp -chromToUcsc -k 14 -a chromAlias.txt -i out.psl \ - | sort > out.ucscChrom.psl -cut -f10 out.ucscChrom.psl | sort -u | comm -23 - ncbiRefSeq.currentIds > old.ids -grep -Fwf old.ids out.ucscChrom.gp > refSeqHistorical.gp -grep -Fwf old.ids out.ucscChrom.psl > refSeqHistorical.psl -# make sure we have the right sequences: -twoBitToFa -seqList=old.ids refSeqHistorical.2bit refSeqHistorical.deDuped.fa - -# make the bed file for the track -genePredToBigGenePred refSeqHistorical.gp stdout | sort -k1,1 -k2,2n > refSeqHistorical.bigGp -bedToBigBed -type=bed12+8 -tab -sizesIsChromAliasBb \ - -as=$HOME/kent/src/hg/lib/bigGenePred.as -extraIndex=name \ - refSeqHistorical.bigGp \ - /hive/data/genomes/hg38/goldenPath/bigZips/p14/hg38.p14.chromAlias.bb \ - refSeqHistorical.bb - -# ensure only coding transcripts have a cds -awk -F$'\\t' '$6 != $7 {print $1;}' refSeqHistorical.gp | sort -u > coding.cds.names -join -t$'\t' coding.cds.names <(sort -u refSeqHistorical.cds) > refSeqHistorical.cds.coding - -# load psl table -hgLoadPsl hg38 -table=ncbiRefSeqPslOld refSeqHistorical.psl - -# load cds table -hgLoadSqlTab hg38 ncbiRefSeqCdsOld ~/kent/src/hg/lib/cdsSpec.sql refSeqHistorical.cds.coding - -# load seq and ext tables -ln -sf `pwd`/refSeqHistorical.deDuped.fa /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa -hgLoadSeq -drop -seqTbl=seqNcbiRefSeqOld -extFileTbl=extNcbiRefSeqOld hg38 /gbdb/hg38/ncbiRefSeq/refSeqHistorical.fa - -# link the files into /gbdb -ln -s `pwd`/refSeqHistorical.bb /gbdb/hg38/ncbiRefSeq/refSeqHistorical.bb