282d648d8896a7bb45af20bbc306873f6a5ffb87 angie Wed Jan 17 11:03:32 2024 -0800 Added chrUn_KI270752v1 to hg38.{p11,p12,p13}.chromAlias.txt files after user pointed out in MLQ #32874 that it was missing. It is present in hg38.p14.chromAlias.txt and all *chromAlias.bb files. diff --git src/hg/makeDb/doc/hg38/patchUpdate.11.txt src/hg/makeDb/doc/hg38/patchUpdate.11.txt index 28285ff..028c090 100644 --- src/hg/makeDb/doc/hg38/patchUpdate.11.txt +++ src/hg/makeDb/doc/hg38/patchUpdate.11.txt @@ -1,700 +1,721 @@ # for emacs: -*- mode: sh; -*- # This file describes how hg38 was extended with patch sequences and annotations from grcH38P11 # Hold off on actually installing these until the genbank process has produced tables on hg38 ############################################################################## # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2018-08-06 - Angie) cd /hive/data/genomes/hg38 # main 2bit time faToTwoBit <(twoBitToFa hg38.2bit stdout) \ <(twoBitToFa /hive/data/genomes/grcH38P11/grcH38P11.2bit stdout) \ hg38.p11.2bit #real 1m30.733s # unmasked 2bit twoBitMask -type=.bed hg38.p11.2bit /dev/null hg38.p11.unmasked.2bit # chrom.sizes sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcH38P11/chrom.sizes > chrom.sizes.p11 # chromInfo cd /hive/data/genomes/hg38/bed/chromInfo awk '{print $1 "\t" $2 "\t/gbdb/hg38/hg38.2bit";}' ../../chrom.sizes.p11 \ > chromInfo.p11.tab wc -l chromInfo*.tab # 578 chromInfo.p11.tab # 455 chromInfo.tab # Install cd /hive/data/genomes/hg38 # For the first update only, move initial release files to .initial. Don't do this next update! 
mv hg38.2bit hg38.initial.2bit ln -s hg38.p11.2bit hg38.2bit mv hg38.unmasked.2bit hg38.initial.unmasked.2bit ln -s hg38.p11.unmasked.2bit hg38.unmasked.2bit mv chrom.sizes chrom.sizes.initial ln -s chrom.sizes.p11 chrom.sizes cd /hive/data/genomes/hg38/bed/chromInfo hgLoadSqlTab hg38 chromInfo chromInfo.sql chromInfo.p11.tab ############################################################################## # Extend main database tables for fileless tracks (DONE - 2018-08-06 - Angie) # Just add the patch table rows to the main database tables for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do echo $table hgsql hg38 -e "insert into hg38.$table select * from grcH38P11.$table" done ############################################################################## # Extend main database gc5BaseBw.bw (DONE - 2018-08-06 - Angie) cd /hive/data/genomes/hg38/bed/gc5Base/ # Concatenate original assembly results with grcH38P11 results time (zcat hg38.gc5Base.wigVarStep.gz \ /hive/data/genomes/grcH38P11/bed/gc5Base/grcH38P11.gc5Base.wigVarStep.gz \ | gzip -c \ > hg38.p11.gc5Base.wigVarStep.gz) #real 8m10.061s # Make a new gc5BaseBw.bw time wigToBigWig hg38.p11.gc5Base.wigVarStep.gz ../../chrom.sizes.p11 \ hg38.p11.gc5Base.bw #real 16m28.272s # Install cd /hive/data/genomes/hg38/bed/gc5Base/ # For the first update only, move initial release files to .initial. Don't do this next update! mv hg38.gc5Base.wigVarStep.gz hg38.initial.gc5Base.wigVarStep.gz mv hg38.gc5Base.bw hg38.initial.gc5Base.bw # End of first-update-only stuff ln -s hg38.p11.gc5Base.wigVarStep.gz hg38.gc5Base.wigVarStep.gz ln -s hg38.p11.gc5Base.bw hg38.gc5Base.bw # Because of this symlink, browser track gc5BaseBw has been automatically updated: # /gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw -> /cluster/data/hg38/bed/gc5Base/hg38.gc5Base.bw # For the first update only. Don't do this next update! # The .wib and .wig.gz are obsolete, remove them from the downloads. 
rm /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.gc5Base.wib rm /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.gc5Base.wig.gz # link the bigwig to downloads ln -sf /hive/data/genomes/hg38/bed/gc5Base/hg38.gc5Base.bw /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.gc5Base.bw # End of first-update-only stuff ############################################################################## # Extend main database download files (DONE - 2018-08-06 - Angie) cd /hive/data/genomes/hg38/goldenPath/bigZips mkdir p11 # hg38.2bit was already extended above. ln -sf /hive/data/genomes/hg38/hg38.p11.2bit p11/ # AGP: zcat hg38.agp.gz \ /hive/data/genomes/grcH38P11/goldenPath/bigZips/grcH38P11.agp.gz \ | grep -v ^# \ | gzip -c > p11/hg38.p11.agp.gz # FASTA twoBitToFa ../../hg38.p11.2bit stdout \ | gzip -c > p11/hg38.p11.fa.gz twoBitToFa hg38.2bit stdout \ | maskOutFa stdin hard stdout \ | gzip -c > p11/hg38.p11.fa.masked.gz # RepeatMasker (don't include header of patch file): cat <(zcat hg38.fa.out.gz) \ <(zcat /hive/data/genomes/grcH38P11/goldenPath/bigZips/grcH38P11.fa.out.gz | tail -n +4) \ | gzip -c > p11/hg38.p11.fa.out.gz # SimpleRepeats/TRF: zcat hg38.trf.bed.gz \ /hive/data/genomes/grcH38P11/goldenPath/bigZips/grcH38P11.trf.bed.gz \ | gzip -c > p11/hg38.p11.trf.bed.gz # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase: zcat p11/hg38.trf.bed.gz | cut -f 1 | uniq | wc -l #363 zcat p11/hg38.p11.trf.bed.gz | cut -f 1 | uniq | wc -l #485 # hg38 files that are not built by makeDownloads.pl because hg38 is treated as 'scaffold-based': # Per-chrom soft-masked FASTA: rm -rf chroms tar xvzf hg38.chromFa.tar.gz faSplit byname /hive/data/genomes/grcH38P11/goldenPath/bigZips/grcH38P11.fa.gz chroms/ ls -1 chroms | wc -l #578 tar cvzf p11/hg38.p11.chromFa.tar.gz ./chroms rm -rf chroms # Per-chrom hard-masked FASTA: rm -rf maskedChroms tar xvzf hg38.chromFaMasked.tar.gz faSplit byname 
/hive/data/genomes/grcH38P11/goldenPath/bigZips/grcH38P11.fa.masked.gz \ maskedChroms/ ls -1 maskedChroms | wc -l #578 tar cvzf p11/hg38.p11.chromFaMasked.tar.gz ./maskedChroms rm -rf maskedChroms # RepeatMasker .align files: zcat hg38.fa.align.gz /hive/data/genomes/grcH38P11/bed/repeatMasker/grcH38P11.fa.align.gz \ | gzip -c > p11/hg38.p11.fa.align.gz # Make new md5sum.txt cd p11 md5sum hg38.* > md5sum.txt # Install cd /hive/data/genomes/hg38/goldenPath/bigZips mkdir initial # For the first update only, move initial release files to initial/. Don't do this next update! mv hg38.* md5sum.txt initial/ ln -s p11/README.txt . for file in p11/hg38.p11* p11/md5sum.txt; do linkName=$(echo $file | sed -e 's/p11.//g') ln -sf $file $linkName done # First time only: update symlinks used for initial downloads. # Won't be necessary for subsequent patches. ln -sf /hive/data/genomes/hg38/hg38.initial.2bit initial/hg38.2bit ln -sf /hive/data/genomes/hg38/bed/rmskCM/hg38.fa.align.gz initial/hg38.fa.align.gz # Edit README.txt ############################################################################# # Extend cytoBand{,Ideo} (DONE 18-08-06 angie) cd /hive/data/genomes/hg38/bed/cytoBand tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcH38P11/chrom.sizes \ > cytoBand.p11.tab hgLoadSqlTab -oldTable hg38 cytoBand - cytoBand.p11.tab hgLoadSqlTab -oldTable hg38 cytoBandIdeo - cytoBand.p11.tab ######################################################################### # Regenerate idKeys with extended hg38 (DONE - 2018-08-06 - Angie) mkdir /hive/data/genomes/hg38/bed/idKeys.p11 cd /hive/data/genomes/hg38/bed/idKeys.p11 # ku down... 
use hgwdev this time: time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \ -twoBit=/hive/data/genomes/hg38/hg38.p11.unmasked.2bit \ -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ -buildDir=`pwd` hg38) > do.log 2>&1 & tail -f do.log #real 1m25.689s cat hg38.keySignature.txt #15f8c2af14b6aaaef08775dbf0c8e900 # Install # For the first update only, move initial release files to .initial. Don't do this next update! mv /hive/data/genomes/hg38/bed/idKeys{,.initial} cd /hive/data/genomes/hg38/bed/ ln -s idKeys.p11 idKeys ############################################################################## # UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 18-08-07 angie) # need to have idKeys for the genbank and refseq assemblies: mkdir -p /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP11 cd /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP11 ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.26_GRCh38.p11/GCA_000001405.26_GRCh38.p11_genomic.fna.gz . 
faToTwoBit GCA_000001405.26_GRCh38.p11_genomic.fna.gz genbankP11.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP11.2bit \ -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ genbankP11) > do.log 2>&1 #real 1m50.109s mkdir /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP11 cd /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP11 ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_genomic.fna.gz ./ faToTwoBit GCF_000001405.37_GRCh38.p11_genomic.fna.gz refseqP11.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP11.2bit \ -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ refseqP11) > do.log 2>&1 #real 1m47.878s # with the three idKeys available, join them to make the table bed files: cd /hive/data/genomes/hg38/bed/ucscToINSDC join -t$'\t' ../idKeys/hg38.idKeys.txt genbankP11/genbankP11.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.p11.bed join -t$'\t' ../idKeys/hg38.idKeys.txt refseqP11/refseqP11.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.p11.bed # loading tables: export db=hg38 export chrSize=`cut -f1 ucscToINSDC.p11.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p11.bed export chrSize=`cut -f1 ucscToRefSeq.p11.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p11.bed # must be exactly 100% coverage featureBits -countGaps ${db} ucscToINSDC #3257347282 bases of 3257347282 (100.000%) 
in intersection featureBits -countGaps ${db} ucscToRefSeq #3257319537 bases of 3257347282 (99.999%) in intersection # NOTE: ucscToRefSeq falls 27745 bases short of 100% (3257347282 - 3257319537); that is exactly the length of chrUn_KI270752v1, which is not in the RefSeq assembly, so this shortfall is expected. # construct chromAlias: cd /hive/data/genomes/hg38/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \ | sort -k1,1 > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \ | sort -k1,1 > ucsc.genbank.tab # add NCBI sequence names from assembly report grep -v ^# \ /hive/data/genomes/grcH38P11/genbank/GCA_000001405.26_GRCh38.p11_assembly_report.txt \ | tawk '{print $5, $1;}' | sort \ > genbankToAssembly.txt tawk '{print $2, $1;}' ucsc.genbank.tab | sort \ | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \ | sort -k1,1 > ucsc.assembly.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > ${db}.chromAlias.tab # verify all there: for t in refseq genbank assembly do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t hg38.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 578 =? 578 OK # checking genbank: 578 =? 578 OK # checking assembly: 578 =? 578 OK hgLoadSqlTab hg38 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.tab ############################################################################## # UCSC to Ensembl (TODO 18-08-06 angie) # doc?? 
############################################################################ # altLocations and patchLocations (DONE - 2018-08-06 - Angie) # indicate corresponding locations between haplotypes and reference mkdir /hive/data/genomes/hg38/bed/altLocations.p11 cd /hive/data/genomes/hg38/bed/altLocations.p11 ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \ /hive/data/genomes/grcH38P11/genbank/GCA_000001405.26_GRCh38.p11_assembly_structure/{ALT_*,PATCHES}/alt_scaffolds/alt_scaffold_placement.txt \ | sort -k1,1 -k2n,2n \ > altAndFixLocations.bed wc -l altAndFixLocations.bed #768 altAndFixLocations.bed grep _alt altAndFixLocations.bed > altLocations.bed grep _fix altAndFixLocations.bed > fixLocations.bed hgLoadBed hg38 altLocations{,.bed} #Read 642 elements of size 4 from altLocations.bed hgLoadBed hg38 fixLocations{,.bed} #Read 128 elements of size 4 from fixLocations.bed featureBits -countGaps hg38 altLocations #196222738 bases of 3253848404 (6.030%) in intersection featureBits -countGaps hg38 fixLocations #60769916 bases of 3253848404 (1.868%) in intersection ############################################################################# # Check for new chrX alts/patches to add to par (DONE 2018-08-06 angie) # Thanks to Hiram for pointing out that intersecting chrX positions in # altLocations and par shows whether a chrX alt overlaps a PAR. 
cd /hive/data/genomes/hg38/bed/par hgsql hg38 -e 'select * from altLocations where chrom like "chrX%"' #+-----+---------------------+------------+----------+------------------------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+---------------------+------------+----------+------------------------+ #| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | #| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | #| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | #| 73 | chrX_KI270880v1_alt | 0 | 284869 | chrX:319338-601516 | #| 73 | chrX_KI270881v1_alt | 0 | 144206 | chrX:79965154-80097082 | #| 73 | chrX_KI270913v1_alt | 0 | 274009 | chrX:326488-601516 | #+-----+---------------------+------------+----------+------------------------+ hgsql hg38 -e 'select * from par where chrom like "chrX%"' #+-----+---------------------+------------+-----------+------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+---------------------+------------+-----------+------+ #| 9 | chrX | 10000 | 2781479 | PAR1 | #| 221 | chrX | 155701382 | 156030895 | PAR2 | #| 73 | chrX_KI270880v1_alt | 0 | 284869 | PAR1 | #| 73 | chrX_KI270913v1_alt | 0 | 274009 | PAR1 | #+-----+-------+------------+-----------+------+ # chrX_KI270881v1_alt is not in either PAR. # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1 -- # and are already in the PAR table, so nothing to add. ############################################################################## # altSeqLiftOver (DONE 19-01-07 Angie) # originally done 18-08-07; redone 18-11-06 with fixed gff3ToPsl to get correct - strand alignments # mainToPatch over.chain regenerated 19-01-07 with fixed pslToChain mkdir /hive/data/genomes/hg38/bed/altSeqLiftOver.p11 cd /hive/data/genomes/hg38/bed/altSeqLiftOver.p11 # Eventually these will be under the /hive/data/genomes/.../genbank/... directory # that points to /hive/data/outside/ncbi/genomes/... 
but at the moment the contents # of the alignments/ directories are not included in the sync. So for now, # manually download them here. # Original alts: mkdir initialAlts cd initialAlts for d in /hive/data/genomes/grcH38P11/genbank/GCA_000001405.26_GRCh38.p11_assembly_structure/ALT*/alt_scaffolds/alignments; do subdir=$(echo $d | sed -re 's@^/hive/data/genomes/grcH38P11/genbank/@@;') wget --timestamping --no-verbose \ ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.26_GRCh38.p11/$subdir/\*.gff done # New alts and patches too: mkdir ../patches cd ../patches wget --timestamping --no-verbose\ ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.26_GRCh38.p11/GCA_000001405.26_GRCh38.p11_assembly_structure/PATCHES/alt_scaffolds/alignments/\*.gff cd .. # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names hgsql hg38 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \ | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed cp /dev/null altToChrom.noScore.psl for f in initialAlts/*.gff patches/*.gff; do e=$(basename $f .gff | sed -e 's/_/|/g;') s=$(grep -E $e gbToUcsc.sed) sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \ | pslPosTarget stdin stdout \ >> altToChrom.noScore.psl done pslCheck altToChrom.noScore.psl #checked: 404 failed: 0 errors: 0 time pslRecalcMatch altToChrom.noScore.psl ../../hg38.2bit{,} altToChrom.psl #202.461u 1.836s 3:24.46 99.9% 0+0k 0+0io 0pf+0w pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \ > altAndPatches.psl grep _alt altAndPatches.psl > altSeqLiftOver.psl grep _fix altAndPatches.psl > fixSeqLiftOver.psl # Load tables # Not reloaded 18-11-06 because tables have been reloaded with alignments through p12, # see patchUpdate.12.txt hgLoadPsl hg38 
-table=altSeqLiftOverPsl altSeqLiftOver.psl hgLoadPsl hg38 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl # Make chrom-to-alt PSL file for genbank process. ln -f -s `pwd`/chromToAlt.psl \ /hive/data/genomes/hg38/jkStuff/hg38.p11.alt.psl # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences # 6/15/18: exclude alts that were already in hg38 before p11 # Redone 1/7/19 after Braney fixed pslToChain cut -f 1 ../../chrom.sizes.initial | grep _ \ | grep -vwf - chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../hg38.2bit{,} ../../jkStuff/hg38.mainToPatch.p11.over.chain #52.068u 1.626s 0:54.43 98.6% 0+0k 15952+0io 2pf+0w # 1/7/19 also make a liftOver that includes the original alts, for tracks that have # annotations only on main chromosomes. Exclude alt-to-fix alignments. # This is necessary only for the first time we add a patch update. awk '($14 !~ /_/)' chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../hg38.2bit{,} ../../jkStuff/hg38.mainToAllAltPatch.p11.over.chain #23.971u 1.400s 0:25.34 100.1% 0+0k 0+0io 0pf+0w ######################################################################### # ncbiRefSeq.p11 Genes (DONE - 2018-08-08 - Angie) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-08-08 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-08-08 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.37_GRCh38.p11 hg38) > do.log 2>&1 & tail -f do.log # *** All done ! Elapsed time: 15m24s #real 15m24.236s cat fb.ncbiRefSeq.hg38.txt #131634821 bases of 3092500061 (4.257%) in intersection ############################################################################## # Extend wgEncodeReg bigWig tracks (DONE 19-01-07 angie) # first done 18-08-10; redone 18-11-06 and 19-01-07 with updated .over.chain file. # NOTE: These tracks have not been liftOver'd to original alts... at least not completely. 
# They were lifted over from hg19, and some happened to be lifted from hg19 main chroms # to hg38 initial assembly alts. While it would be nice to lift from hg38 main chroms # to all alts and patches, that would lose the information that was mapped to some of the # alts in the hg19-to-hg38 lift. So don't use mainToAllAltPatch, use mainToPatch and # accept that the original alts will have a few good mappings from hg19, but mostly missing data. # 18-08-10 with original files: for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do composite=$(basename $dir) echo $composite cd $dir for f in wg*.bigWig; do track=$(basename $f .bigWig) ~/kent/src/hg/utils/liftOverBigWigToPatches $f \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ /hive/data/genomes/hg38/chrom.sizes \ $track.plusP11.bigWig & done wait done # 18-08-10: Install and rename original files for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do composite=$(basename $dir) echo $composite cd $dir for f in wg*.plusP11.bigWig; do track=$(basename $f .plusP11.bigWig) # First time only -- don't mv this in subsequent patches! 
mv $track.bigWig $track.initial.bigWig ln -sf `pwd`/$track.plusP11.bigWig /gbdb/hg38/bbi/wgEncodeReg/$composite/$track.bigWig done done # 18-11-06, 19-01-07: Recompute .plusP11 files from .initial files with updated .chain file for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do composite=$(basename $dir) echo $composite cd $dir for f in wg*.initial.bigWig; do track=$(basename $f .initial.bigWig) ~/kent/src/hg/utils/liftOverBigWigToPatches $f \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ /hive/data/genomes/hg38/chrom.sizes \ $track.plusP11.bigWig & done wait done # Not reinstalled -- used to recompute plusP12 ############################################################################## # Extend wgEncodeRegDnaseClustered (DONE 19-01-08 angie) # first done 18-08-10; redone 18-11-06 and 19-01-08 with updated .over.chain file. # NOTE: this track has not been lifted over to the original alts, apart from items that the hg19 -> hg38 lift mapped from hg19 chromosomes onto hg38 alts. cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase origFile=clusters/uwEnc2DnaseClustered.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ wgEncodeRegDnaseClustered.p11.bed /dev/null sort -k1,1 -k2n,2n $origFile wgEncodeRegDnaseClustered.p11.bed \ > wgEncodeRegDnaseClustered.plusP11.bed hgLoadBed -type=bed5+ -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \ hg38 wgEncodeRegDnaseClustered \ wgEncodeRegDnaseClustered.plusP11.bed # 18-11-06, 19-01-08: Recompute .plusP11 files with updated .chain file. 
cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase origFile=clusters/uwEnc2DnaseClustered.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ wgEncodeRegDnaseClustered.p11.bed /dev/null sort -k1,1 -k2n,2n $origFile wgEncodeRegDnaseClustered.p11.bed \ > wgEncodeRegDnaseClustered.plusP11.bed # Not reinstalled -- used to recompute plusP12 ############################################################################## # Extend wgEncodeRegTfbsClusteredV3 (DONE 19-01-08 angie) # first done 18-08-10; redone 18-11-06 and 19-01-08 with updated .over.chain file. # NOTE: this track has not been lifted over to the original alts, apart from items that the hg19 -> hg38 lift mapped from hg19 chromosomes onto hg38 alts. # 18-08-10 with original files: cd /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3/ origFile=hg38.wgEncodeRegClusteredV3.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ wgEncodeRegTfbsClusteredV3.p11.bed /dev/null sort -k1,1 -k2n,2n $origFile wgEncodeRegTfbsClusteredV3.p11.bed \ > wgEncodeRegTfbsClusteredV3.plusP11.bed hgLoadBed -type=bed5+ -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \ hg38 wgEncodeRegTfbsClusteredV3 wgEncodeRegTfbsClusteredV3.plusP11.bed # 18-11-06, 19-01-08: Recompute .plusP11 files with updated .chain file. 
cd /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3/ origFile=hg38.wgEncodeRegClusteredV3.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ wgEncodeRegTfbsClusteredV3.p11.bed /dev/null sort -k1,1 -k2n,2n $origFile wgEncodeRegTfbsClusteredV3.p11.bed \ > wgEncodeRegTfbsClusteredV3.plusP11.bed # Not reinstalled -- used to recompute plusP12 ############################################################################## # Extend GTEX GENE (DONE 19-01-15 angie) # first done 18-08-10; redone 18-11-06 and 19-01-08 with updated .over.chain file. # gtexGeneModel redone 19-01-15 after implementing liftOver -multiple -genePred. # NOTE: this track has not been lifted over to the original alts, apart from items that the hg19 -> hg38 lift mapped from hg19 chromosomes onto hg38 alts. # I'm not really sure what file(s) are the true source of the latest hg38 GTEX Gene tables, # so I'll just work from the tables. mkdir /hive/data/genomes/hg38/bed/gtex.p11 cd /hive/data/genomes/hg38/bed/gtex.p11 # There is actually no bin column in gtexGene. hgsql hg38 -NBe 'select * from gtexGene' > gtexGene.initial.bed liftOver -multiple -bedPlus=6 -noSerial gtexGene.initial.bed \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ gtexGene.p11.bed /dev/null sort -k1,1 -k2n,2n gtexGene.initial.bed gtexGene.p11.bed \ | hgLoadBed -noBin -type=bed6+ -sqlTable=$HOME/kent/src/hg/lib/gtexGeneBed.sql -renameSqlTable \ hg38 gtexGene stdin # gtexGeneModel does have a bin. hgsql hg38 -NBe 'select * from gtexGeneModel' | cut -f 2- > gtexGeneModel.initial.gp liftOver -multiple -genePred gtexGeneModel.initial.gp \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ gtexGeneModel.p11.gp /dev/null sort -k2,2 -k3n,3n gtexGeneModel.initial.gp gtexGeneModel.p11.gp \ | hgLoadGenePred hg38 gtexGeneModel stdin # 18-11-06, 19-01-08: Recompute .plusP11 files with updated .chain file. # Don't reload tables because we're about to recompute plusP12. 
cd /hive/data/genomes/hg38/bed/gtex.p11 liftOver -multiple -bedPlus=6 -noSerial gtexGene.initial.bed \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ gtexGene.p11.bed /dev/null sort -k1,1 -k2n,2n gtexGene.initial.bed gtexGene.p11.bed \ > gtexGene.plusP11.bed # 19-01-15: recompute now that liftOver -multiple -genePred works. liftOver -multiple -genePred gtexGeneModel.initial.gp \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p11.over.chain \ gtexGeneModel.p11.gp /dev/null sort -k2,2 -k3n,3n gtexGeneModel.initial.gp gtexGeneModel.p11.gp \ > gtexGeneModel.plusP11.gp ############################################################################## # Extend wgEncodeRegDnase (DNase HS) (DONE 19-01-23 angie) # Nothing on the original alts, so use mainToAllAltPatch # 95 Peak view subtracks # I tried in vain to find the files from which the *Peak tables were loaded. # The closest I got were some files like # /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/wgEncodeEH000484.pooled.narrowPeak # a bigBed file which has the same number of rows as wgEncodeRegDnaseUwK562Peak, # and similar but not identical values. # So, first dump values from the database tables. 
mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p11 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p11 for table in $(hgsql hg38 -NBe 'show tables like "wgEncodeRegDnase%Peak";'); do echo $table hgsql hg38 -NBe 'select * from '$table';' | cut -f 2- > $table.initial.bed done for f in *.initial.bed; do track=$(basename $f .initial.bed) echo $track liftOver -multiple -bedPlus=5 -noSerial $f \ /hive/data/genomes/hg38/jkStuff/hg38.mainToAllAltPatch.p11.over.chain \ $track.p11.bed /dev/null sort -k1,1 -k2n,2n $f $track.p11.bed > $track.plusP11.bed done # 95 Hotspots view subtracks mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p11 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p11 # Similar to wgEncodeRegDnaseWig, the original /gbdb/ links point to files with names # that differ from the track names. Save that association and use it to add patches. ls -l /gbdb/hg38/bbi/wgEncodeRegDnase/wgEncodeRegDnase*Hotspot*.bb \ | awk '{print $9 "\t" $11;}' > wgEncodeRegDnaseHotspot.gbdbToOrigFile.tab wc -l wgEncodeRegDnaseHotspot.gbdbToOrigFile.tab #95 wgEncodeRegDnaseHotspot.gbdbToOrigFile.tab cat >runOne <<'_EOF_' #!/bin/bash set -beEu -o pipefail track=$1 origFile=$2 bigBedToBed $origFile stdout \ | liftOver -multiple -bedPlus=6 -noSerial stdin \ /hive/data/genomes/hg38/jkStuff/hg38.mainToAllAltPatch.p11.over.chain \ $track.broadPeak.p11.bed /dev/null sort -k1,1 -k2n,2n <(bigBedToBed $origFile stdout) $track.broadPeak.p11.bed \ > $track.broadPeak.plusP11.bed # Don't make bigBed yet; we'll use the .bed to compute .plusP12 next, and install that. 
#bedToBigBed -as $HOME/kent/src/hg/lib/bigNarrowPeak.as -type=bed6+3 $track.broadPeak.plusP11.bed \ # /hive/data/genomes/hg38/chrom.sizes $track.broadPeak.plusP11.bb _EOF_ chmod a+x runOne while read gbdbPath origFile; do track=$(basename $gbdbPath .broadPeak.bb) echo ./runOne $track $origFile done < wgEncodeRegDnaseHotspot.gbdbToOrigFile.tab > jobList para make jobList para time #Completed: 95 of 95 jobs #CPU time in finished jobs: 153s 2.56m 0.04h 0.00d 0.000 y #IO & Wait Time: 245s 4.08m 0.07h 0.00d 0.000 y #Average job time: 4s 0.07m 0.00h 0.00d #Longest finished job: 8s 0.13m 0.00h 0.00d #Submission to last job: 26s 0.43m 0.01h 0.00d # Don't do wgEncodeRegDnaseSignal view... the data files are same as DnaseWig below! # Don't update /gbdb -- use .plusP11 files to make .plusP12. ############################################################################## # Extend wgEncodeRegDnaseWig (DNase Signal) (DONE 19-01-23 angie) # Nothing on the original alts, so use mainToAllAltPatch mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p11 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p11 # There are 95 subtracks. The original /gbdb/ links point to files with very different # names from the track/link name. For example: # /gbdb/hg38/bbi/wgEncodeRegDnase/wgEncodeRegDnaseUwHelas3Signal.bw -> # /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_normalized/out/wgEncodeEH000495.norm.bw # So for starters, make a file to record the correspondence between track and file. 
ls -l /gbdb/hg38/bbi/wgEncodeRegDnase/wgEncodeRegDnase*Signal*.bw \ | awk '{print $9 "\t" $11;}' > wgEncodeRegDnaseWig.gbdbToOrigFile.tab # Use that file to make a small cluster job to do the liftOvers while read gbdbPath origFile; do track=$(basename $gbdbPath .bw) echo ~/kent/src/hg/utils/liftOverBigWigToPatches $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToAllAltPatch.p11.over.chain \ /hive/data/genomes/hg38/chrom.sizes \ {check out exists $track.plusP11.bw} done < wgEncodeRegDnaseWig.gbdbToOrigFile.tab > jobList para make jobList para time #Completed: 95 of 95 jobs #CPU time in finished jobs: 136582s 2276.37m 37.94h 1.58d 0.004 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 463s 7.72m 0.13h 0.01d #Longest finished job: 884s 14.73m 0.25h 0.01d #Submission to last job: 1684s 28.07m 0.47h 0.02d # Don't update /gbdb -- use .plusP11 files to make .plusP12. +############################################################################# +# Update hg38.p11.chromAlias.txt (DONE 2024-01-17 Angie) + + # In MLQ#32874, the user reported that chrUn_KI270752v1 is missing from hg38.p12.chromAlias.txt. + # That's because when the hg38.{p11,p12,p13}.chromAlias.txt files were initially created + # in patchUpdate.13.txt ("Correctly versioned hg38.chromAlias.txt files in downloads"), + # chrUn_KI270752v1 was omitted because it had been removed from the RefSeq assembly as + # contamination. However, it is confusing for users to have a sequence in the db with no + # aliases despite it having Assembly, Ensembl and INSDC aliases. So add it back. + hgsql hg38 -NBe 'select * from chromAlias where chrom = "chrUn_KI270752v1"' +#HSCHRUN_RANDOM_CTG29 chrUn_KI270752v1 assembly +#KI270752.1 chrUn_KI270752v1 ensembl,genbank + cd /hive/data/genomes/hg38/goldenPath/bigZips + echo -e "chrUn_KI270752v1\tHSCHRUN_RANDOM_CTG29\tKI270752.1\t" >> hg38.p11.chromAlias.txt + # p11/hg38.p11.chromAlias.txt is a symlink to the one in this directory. 
+ # p11/hg38.p11.chromAlias.bb is a regular file (not a symlink) in p11/, but it already has chrUn_KI270752v1, + # so it does not need to be updated, great. + bigBedToBed -chrom=chrUn_KI270752v1 p11/hg38.p11.chromAlias.bb stdout +#chrUn_KI270752v1 0 27745 chrUn_KI270752v1 HSCHRUN_RANDOM_CTG29 KI270752.1 KI270752.1 + + ##############################################################################