ea85b03d619559dd267cc8765ca8810d9e706ccd angie Wed Aug 14 14:48:45 2019 -0700 Added instructions for building new altsAndFixes file for gfServer -perSeqMax. refs #23112 diff --git src/hg/makeDb/doc/hg38/patchUpdate.12.txt src/hg/makeDb/doc/hg38/patchUpdate.12.txt index f26cac2..1868fab 100644 --- src/hg/makeDb/doc/hg38/patchUpdate.12.txt +++ src/hg/makeDb/doc/hg38/patchUpdate.12.txt @@ -1,776 +1,788 @@ # for emacs: -*- mode: sh; -*- # This file describes how hg38 was extended with patch sequences and annotations from grcH38P12, -# after having been extended with grcH38P11 (see patchUpdate.11.txt). +# after having been extended with grcH38P1 (see patchUpdate.11.txt). ############################################################################## # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2018-08-10 - Angie) cd /hive/data/genomes/hg38 # main 2bit time faToTwoBit <(twoBitToFa hg38.2bit stdout) \ <(twoBitToFa /hive/data/genomes/grcH38P12/grcH38P12.2bit stdout) \ hg38.p12.2bit #real 1m33.136s # unmasked 2bit twoBitMask -type=.bed hg38.p12.2bit /dev/null hg38.p12.unmasked.2bit # chrom.sizes sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcH38P12/chrom.sizes > chrom.sizes.p12 # chromInfo cd /hive/data/genomes/hg38/bed/chromInfo awk '{print $1 "\t" $2 "\t/gbdb/hg38/hg38.2bit";}' ../../chrom.sizes.p12 \ > chromInfo.p12.tab wc -l chromInfo*.tab # 578 chromInfo.p11.tab # 595 chromInfo.p12.tab # 455 chromInfo.tab # Install cd /hive/data/genomes/hg38 ln -sf hg38.p12.2bit hg38.2bit ln -sf hg38.p12.unmasked.2bit hg38.unmasked.2bit ln -sf chrom.sizes.p12 chrom.sizes cd /hive/data/genomes/hg38/bed/chromInfo hgLoadSqlTab hg38 chromInfo chromInfo.sql chromInfo.p12.tab ############################################################################## # Extend main database tables for fileless tracks (DONE - 2018-08-10 - Angie) # Just add the patch table rows to the main database tables for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do echo $table hgsql hg38 -e "insert into hg38.$table select * from grcH38P12.$table" done ############################################################################## # Extend main database gc5BaseBw.bw (DONE - 2018-08-10 - Angie) cd /hive/data/genomes/hg38/bed/gc5Base/ # Concatenate original assembly results with grcH38P12 results time (zcat hg38.gc5Base.wigVarStep.gz \ /hive/data/genomes/grcH38P12/bed/gc5Base/grcH38P12.gc5Base.wigVarStep.gz \ | gzip -c \ > hg38.p12.gc5Base.wigVarStep.gz) #real 8m9.913s # Make a new gc5BaseBw.bw time wigToBigWig hg38.p12.gc5Base.wigVarStep.gz ../../chrom.sizes.p12 \ hg38.p12.gc5Base.bw #real 16m33.792s # Install cd /hive/data/genomes/hg38/bed/gc5Base/ ln -sf hg38.p12.gc5Base.wigVarStep.gz hg38.gc5Base.wigVarStep.gz ln -sf hg38.p12.gc5Base.bw hg38.gc5Base.bw ############################################################################## # Extend main database download files (DONE - 2019-07-24 - Angie) # Previously done 2018-11-11 # NOTE FOR NEXT TIME: set up hg38*.chrom.sizes links correctly the first time (see 7/24/19) cd /hive/data/genomes/hg38/goldenPath/bigZips mkdir p12 # hg38.2bit was already extended above. ln -sf /hive/data/genomes/hg38/hg38.p12.2bit p12/ # AGP: zcat hg38.agp.gz \ /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.agp.gz \ | grep -v ^# \ | gzip -c > p12/hg38.p12.agp.gz # FASTA twoBitToFa ../../hg38.p12.2bit stdout \ | gzip -c > p12/hg38.p12.fa.gz faSize p12/hg38.p12.fa.gz #3257347282 bases (161368694 N's 3095978588 real 1483113183 upper 1612865405 lower) in 595 sequences in 1 files #Total size: mean 5474533.2 sd 27729929.3 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 166200 twoBitToFa hg38.2bit stdout \ | maskOutFa stdin hard stdout \ | gzip -c > p12/hg38.p12.fa.masked.gz # RepeatMasker (don't include header of patch file): cat <(zcat hg38.fa.out.gz) \ <(zcat /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.out.gz | tail -n +4) \ | gzip -c > p12/hg38.p12.fa.out.gz # SimpleRepeats/TRF: zcat hg38.trf.bed.gz \ /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.trf.bed.gz \ | gzip -c > p12/hg38.p12.trf.bed.gz # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase: zcat initial/hg38.trf.bed.gz | cut -f 1 | uniq | wc -l #363 zcat p12/hg38.p12.trf.bed.gz | cut -f 1 | uniq | wc -l #502 # hg38 files that are not built by makeDownloads.pl because hg38 is treated as 'scaffold-based': # Per-chrom soft-masked FASTA: rm -rf chroms tar xvzf hg38.chromFa.tar.gz faSplit byname /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.gz chroms/ ls -1 chroms | wc -l #595 tar cvzf p12/hg38.p12.chromFa.tar.gz ./chroms rm -rf chroms # Per-chrom hard-masked FASTA: rm -rf maskedChroms tar xvzf hg38.chromFaMasked.tar.gz faSplit byname /hive/data/genomes/grcH38P12/goldenPath/bigZips/grcH38P12.fa.masked.gz \ maskedChroms/ ls -1 maskedChroms | wc -l #595 tar cvzf p12/hg38.p12.chromFaMasked.tar.gz ./maskedChroms rm -rf maskedChroms # RepeatMasker .align files: zcat hg38.fa.align.gz /hive/data/genomes/grcH38P12/bed/repeatMasker/grcH38P12.fa.align.gz \ | gzip -c > p12/hg38.p12.fa.align.gz # Make new md5sum.txt cd p12 md5sum hg38.* > md5sum.txt # Install # 11/1/18 -- leave bigZips/ top-level files unchanged (links to initial not p12) cd /hive/data/genomes/hg38/goldenPath/bigZips for file in initial/*; do ln -sf $file . done # 4/10/19: make latest a real dir with versionless filenames. rm -rf latest mkdir latest cd latest for file in ../p12/*; do noVersion=$(echo $(basename $file) | sed -e 's/.p12//') ln -s $file $noVersion done rm md5sum.txt md5sum hg38* > md5sum.txt echo GRCh38.p12 > LATEST_VERSION rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p12 ln -s /hive/data/genomes/hg38/goldenPath/bigZips/p12 \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p12 rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest ln -s /hive/data/genomes/hg38/goldenPath/bigZips/latest \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest # 7/24/19: make correct links for chrom.sizes ln -sf /hive/data/genomes/hg38/chrom.sizes.initial \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.chrom.sizes ln -sf /hive/data/genomes/hg38/chrom.sizes.initial \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/initial/hg38.chrom.sizes ln -sf /hive/data/genomes/hg38/chrom.sizes \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest/hg38.chrom.sizes ln -sf /hive/data/genomes/hg38/chrom.sizes.p12 \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p12/hg38.p12.chrom.sizes +############################################################################# +# Build perSeqMax file for gfServer (hgBlat) (DONE 19-08-14 angie) + # When the blat server is restarted with the updated hg38.2bit file, + # hg38.altsAndFixes needs to be copied over along with the new hg38.2bit file, + # and gfServer needs to be restarted with -perSeqMax=hg38.altsAndFixes. + cd /hive/data/genomes/hg38 + cut -f 1 chrom.sizes.p12 \ + | grep -E '_(alt|fix)$' \ + | sed -re 's/^/hg38.2bit:/;' \ + > hg38.altsAndFixes + + ######################################################################### # Regenerate idKeys with extended hg38 (DONE - 2018-08-10 - Angie) mkdir /hive/data/genomes/hg38/bed/idKeys.p12 cd /hive/data/genomes/hg38/bed/idKeys.p12 # ku down... use hgwdev this time: time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \ -twoBit=/hive/data/genomes/hg38/hg38.p12.unmasked.2bit \ -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ -buildDir=`pwd` hg38) > do.log 2>&1 & tail -f do.log #real 1m21.903s cat hg38.keySignature.txt #c9c5d621a52f96886fa9cd785c99248f # Install cd /hive/data/genomes/hg38/bed/ rm idKeys ln -s idKeys.p12 idKeys ############################################################################# # Extend cytoBand{,Ideo} (DONE 2018-08-10 angie) cd /hive/data/genomes/hg38/bed/cytoBand tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcH38P12/chrom.sizes \ > cytoBand.p12.tab # Install hgLoadSqlTab -oldTable hg38 cytoBand - cytoBand.p12.tab hgLoadSqlTab -oldTable hg38 cytoBandIdeo - cytoBand.p12.tab ######################################################################### # ncbiRefSeq.p12 Genes (DONE - 2018-08-10 - Angie) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p12.2018-08-10 cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p12.2018-08-10 # Adding the -toGpWarnOnly flag because there are a handful of cases of CDS extending # beyond exon coordinates. Terence Murphy says they'll eventually fix it but not soon. # So, make sure to check do.log for warnings from gff3ToGenePred: time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -toGpWarnOnly \ refseq vertebrate_mammalian Homo_sapiens \ GCF_000001405.38_GRCh38.p12 hg38) > do.log 2>&1 & tail -f do.log # gff3ToGenePred warnings: #Warning: skipping: no exon in id1912382 contains CDS 555851-556197 #Warning: skipping: no exon in id1790907 contains CDS 22922593-22922913 #Warning: skipping: no exon in id1790877 contains CDS 22906341-22906661 #Warning: skipping: no exon in id1790824 contains CDS 22822981-22823289 #Warning: skipping: no exon in id1365744 contains CDS 106088082-106088428 #5 warnings converting GFF3 file: stdin # *** All done ! Elapsed time: 17m34s #real 17m33.906s cat fb.ncbiRefSeq.hg38.txt #134109466 bases of 3095998939 (4.332%) in intersection ############################################################################# # UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 18-08-10 angie) # need to have idKeys for the genbank and refseq assemblies: mkdir -p /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP12 cd /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP12 ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/GCA_000001405.27_GRCh38.p12_genomic.fna.gz . faToTwoBit GCA_000001405.27_GRCh38.p12_genomic.fna.gz genbankP12.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP12.2bit \ -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ genbankP12) > do.log 2>&1 #real 1m50.109s mkdir /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP12 cd /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP12 ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz ./ faToTwoBit GCF_000001405.38_GRCh38.p12_genomic.fna.gz refseqP12.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP12.2bit \ -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ refseqP12) > do.log 2>&1 #real 1m19.941s # with the three idKeys available, join them to make the table bed files: cd /hive/data/genomes/hg38/bed/ucscToINSDC join -t$'\t' ../idKeys/hg38.idKeys.txt genbankP12/genbankP12.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.p12.bed join -t$'\t' ../idKeys/hg38.idKeys.txt refseqP12/refseqP12.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.p12.bed # loading tables: export db=hg38 export chrSize=`cut -f1 ucscToINSDC.p12.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p12.bed export chrSize=`cut -f1 ucscToRefSeq.p12.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p12.bed # must be exactly 100% coverage featureBits -countGaps ${db} ucscToINSDC #3257347282 bases of 3257347282 (100.000%) in intersection featureBits -countGaps ${db} ucscToRefSeq #3257319537 bases of 3257347282 (99.999%) in intersection # uh-oh! not 100% featureBits -countGaps ${db} \!ucscToRefSeq -bed=stdout #chrUn_KI270752v1 0 27745 chrUn_KI270752v1.1 grep KI270752 \ /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_assembly_report.txt #HSCHRUN_RANDOM_CTG29 unplaced-scaffold na na KI270752.1 <> na Primary Assembly 27745 chrUn_KI270752v1 # Yep, no RefSeq accession there. Guess it was dropped from the RefSeq p12 assembly??? # Will ask Hiram and probably Terence. # construct chromAlias: cd /hive/data/genomes/hg38/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \ | sort -k1,1 > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \ | sort -k1,1 > ucsc.genbank.tab # add NCBI sequence names from assembly report grep -v ^# \ /hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_report.txt \ | tawk '{print $5, $1;}' | sort \ > genbankToAssembly.txt tawk '{print $2, $1;}' ucsc.genbank.tab | sort \ | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \ | sort -k1,1 > ucsc.assembly.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > ${db}.chromAlias.tab # verify all there: for t in refseq genbank assembly do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t hg38.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 594 =? 594 OK # checking genbank: 595 =? 595 OK # checking assembly: 595 =? 595 OK # Note how there's one fewer refseq, consistent with featureBits above. hgLoadSqlTab hg38 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.tab ############################################################################## # UCSC to Ensembl (TODO 18-08-06 angie) # doc?? ############################################################################ # altLocations and patchLocations (DONE - 2018-08-10 - Angie) # indicate corresponding locations between haplotypes and reference mkdir /hive/data/genomes/hg38/bed/altLocations.p12 cd /hive/data/genomes/hg38/bed/altLocations.p12 ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \ /hive/data/genomes/grcH38P12/genbank/GCA_000001405.27_GRCh38.p12_assembly_structure/{ALT_*,PATCHES}/alt_scaffolds/alt_scaffold_placement.txt \ | sort -k1,1 -k2n,2n \ > altAndFixLocations.bed wc -l altAndFixLocations.bed #802 altAndFixLocations.bed grep _alt altAndFixLocations.bed > altLocations.bed grep _fix altAndFixLocations.bed > fixLocations.bed hgLoadBed hg38 altLocations{,.bed} #Read 664 elements of size 4 from altLocations.bed hgLoadBed hg38 fixLocations{,.bed} #Read 140 elements of size 4 from fixLocations.bed featureBits -countGaps hg38 altLocations #200094386 bases of 3257347282 (6.143%) in intersection featureBits -countGaps hg38 fixLocations #63654955 bases of 3257347282 (1.954%) in intersection ############################################################################# # Check for new chrX alts/patches to add to par (DONE 2018-08-10 angie) # Thanks to Hiram for pointing out that intersecting chrX positions in # altLocations and par shows whether a chrX alt overlaps a PAR. cd /hive/data/genomes/hg38/bed/par hgsql hg38 -e 'select * from altLocations where chrom = "chrX"' #+-----+-------+------------+----------+---------------------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+-------+------------+----------+---------------------+ #| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | #| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | #| 77 | chrX | 4950956 | 5129468 | chrX_KV766199v1_alt | #| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | #+-----+-------+------------+----------+---------------------+ hgsql hg38 -e 'select * from par where chrom like "chrX%"' #+-----+---------------------+------------+-----------+------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+---------------------+------------+-----------+------+ #| 9 | chrX | 10000 | 2781479 | PAR1 | #| 221 | chrX | 155701382 | 156030895 | PAR2 | #| 73 | chrX_KI270880v1_alt | 0 | 284869 | PAR1 | #| 73 | chrX_KI270913v1_alt | 0 | 274009 | PAR1 | #+-----+---------------------+------------+-----------+------+ # chrX_KI270881v1_alt and chrX_KV766199v1_alt are not in either PAR. # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1 -- # and are already in the PAR table, so nothing to add. ############################################################################## # altSeqLiftOver (DONE 2018-11-06 Angie) # originally done 2018-08-10; redone 2018-11-06 w/fixed gff3ToPsl to get correct - strand alignments # mainToPatch over.chain regenerated 2018-12-03 w/fixed pslToChain mkdir /hive/data/genomes/hg38/bed/altSeqLiftOver.p12 cd /hive/data/genomes/hg38/bed/altSeqLiftOver.p12 # Eventually these will be under the /hive/data/genomes/.../genbank/... directory # that points to /hive/data/outside/ncbi/genomes/... but at the moment the contents # of the alignments/ directories are not included in the sync. So for now, # manually download them here. # Original alts -- reuse the ones downloaded for p11: ln -s ../altSeqLiftOver.p11/initialAlts . # New alts and patches too: mkdir patches cd patches wget --timestamping --no-verbose\ ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p12/GCA_000001405.27_GRCh38.p12_assembly_structure/PATCHES/alt_scaffolds/alignments/\*.gff cd .. # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names hgsql hg38 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \ | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed wc -l gbToUcsc.sed #595 gbToUcsc.sed cp /dev/null altToChrom.noScore.psl for f in initialAlts/*.gff patches/*.gff; do e=`basename $f .gff | sed -e 's/_/|/g;'` s=`grep -E $e gbToUcsc.sed` sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \ | pslPosTarget stdin stdout \ >> altToChrom.noScore.psl done pslCheck altToChrom.noScore.psl #checked: 421 failed: 0 errors: 0 time pslRecalcMatch altToChrom.noScore.psl ../../hg38.2bit{,} altToChrom.psl #real 0m29.571s pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \ > altAndPatches.psl grep _alt altAndPatches.psl > altSeqLiftOver.psl grep _fix altAndPatches.psl > fixSeqLiftOver.psl # Load tables hgLoadPsl hg38 -table=altSeqLiftOverPsl altSeqLiftOver.psl hgLoadPsl hg38 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl # Make chrom-to-alt PSL file for genbank process. ln -f -s `pwd`/chromToAlt.psl \ /hive/data/genomes/hg38/jkStuff/hg38.p12.alt.psl wc -l /hive/data/genomes/hg38/jkStuff/hg38.p12.alt.psl #421 /hive/data/genomes/hg38/jkStuff/hg38.p12.alt.psl # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences # exclude alts that were already in hg38 before p12 # Redone 12/3/18 after Braney fixed pslToChain cut -f 1 ../../chrom.sizes.p11 | grep _ \ | grep -vwf - chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../hg38.2bit{,} ../../jkStuff/hg38.mainToPatch.p12.over.chain grep chain ../../jkStuff/hg38.mainToPatch.p12.over.chain | wc -l #17 ############################################################################## # Extend wgEncodeReg bigWig tracks (DONE 19-01-08 angie) # originally done 18-08-29; redone 18-11-06, 19-01-08 with recomputed .plusP11 # files and updated .chain #NOTE: this has not been liftOver'd to original alts! # Use the *.plusP11.bigWig files and add p12. for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do composite=$(basename $dir) echo $composite cd $dir for f in wg*.plusP11.bigWig; do track=$(basename $f .plusP11.bigWig) ~/kent/src/hg/utils/liftOverBigWigToPatches $f \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ /hive/data/genomes/hg38/chrom.sizes \ $track.plusP12.bigWig & done wait done # Install (not necessary after updating .plusP12 files, links already point there) for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do composite=$(basename $dir) echo $composite cd $dir for f in wg*.plusP12.bigWig; do track=$(basename $f .plusP12.bigWig) ln -sf `pwd`/$track.plusP12.bigWig /gbdb/hg38/bbi/wgEncodeReg/$composite/$track.bigWig done done ############################################################################## # Extend wgEncodeRegDnase (DONE 19-01-08 angie) # originally done 18-08-28; redone 18-11-08, 19-01-08 with recomputed .plusP11 # files and updated .chain #NOTE: this has not been liftOver'd to original alts! cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase origFile=wgEncodeRegDnaseClustered.plusP11.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ wgEncodeRegDnaseClustered.p12.bed /dev/null sort -k1,1 -k2n,2n $origFile wgEncodeRegDnaseClustered.p12.bed \ > wgEncodeRegDnaseClustered.plusP12.bed hgLoadBed hg38 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.plusP12.bed \ -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \ -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as ############################################################################## # Extend wgEncodeRegTfbsClusteredV3 (DONE 19-01-08 angie) # originally done 18-08-28; redone 18-11-08, 19-01-08 with recomputed .plusP11 # files and updated .chain #NOTE: this has not been liftOver'd to original alts! cd /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3/ origFile=wgEncodeRegTfbsClusteredV3.plusP11.bed liftOver -multiple -bedPlus=5 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ wgEncodeRegTfbsClusteredV3.p12.bed /dev/null sort -k1,1 -k2n,2n $origFile wgEncodeRegTfbsClusteredV3.p12.bed \ > wgEncodeRegTfbsClusteredV3.plusP12.bed hgLoadBed hg38 wgEncodeRegTfbsClusteredV3 wgEncodeRegTfbsClusteredV3.plusP12.bed \ -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \ -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as ############################################################################## # Extend GTEX GENE (DONE 19-01-15 angie) # originally done 18-08-28; redone 18-11-08, 19-01-08 with recomputed .plusP11 # files and updated .chain # gtexGeneModel redone 19-01-15 after implementing liftOver -multiple -genePred. mkdir /hive/data/genomes/hg38/bed/gtex.p12 cd /hive/data/genomes/hg38/bed/gtex.p12 liftOver -multiple -bedPlus=6 -noSerial ../gtex.p11/gtexGene.plusP11.bed \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ gtexGene.p12.bed /dev/null sort -k1,1 -k2n,2n ../gtex.p11/gtexGene.plusP11.bed gtexGene.p12.bed \ > gtexGene.plusP12.bed # There is actually no bin column in gtexGene. hgLoadSqlTab hg38 gtexGene $HOME/kent/src/hg/lib/gtexGeneBed.sql gtexGene.plusP12.bed # Two of the genes fall on inversions in the mapping of chr to alt/fix, so part of a gene # maps on a + chain and part on a - chain. The SQL table has a unique index on # (chr, geneId) so having two results (+ and -) makes it error out. When that happens, # remove the smaller inversion mapping -- the larger gene region is still mapped. grep -v '^chr12_KN538369v1_fix.*-.ENSG00000165714' gtexGene.plusP12.bed \ | grep -v '^chr7_KV880765v1_fix.*+.ENSG00000164597' \ | hgLoadSqlTab hg38 gtexGene $HOME/kent/src/hg/lib/gtexGeneBed.sql stdin # 19-01-15: recompute now that liftOver -multiple -genePred works. liftOver -multiple -genePred ../gtex.p11/gtexGeneModel.plusP11.gp \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ gtexGeneModel.p12.gp /dev/null sort -k2,2 -k3n,3n ../gtex.p11/gtexGeneModel.plusP11.gp gtexGeneModel.p12.gp \ > gtexGeneModel.plusP12.gp hgLoadGenePred hg38 gtexGeneModel gtexGeneModel.plusP12.gp ############################################################################# # UPDATE /scratch/data/ 2bit (DONE 2018-09-26 angie) cp -p /hive/data/genomes/hg38/hg38.p12.2bit /hive/data/staging/data/hg38/ mv /hive/data/staging/data/hg38/hg38.2bit{,.bak} mv /hive/data/staging/data/hg38/hg38{.p12,}.2bit cmp /hive/data/genomes/hg38/hg38.initial.2bit /hive/data/staging/data/hg38/hg38.2bit.bak # No output -- the .bak copy is identical as expected, so clean it up. rm /hive/data/staging/data/hg38/hg38.2bit.bak ############################################################################## # Extend wgEncodeRegDnase (DNase HS) (DONE 19-01-23 angie) # 95 Peak view subtracks mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p12 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p12 for f in ../wgEncodeRegDnaseHS.p11/*.plusP11.bed; do track=$(basename $f .plusP11.bed) liftOver -multiple -bedPlus=5 -noSerial $f \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ $track.p12.bed /dev/null sort -k1,1 -k2n,2n $f $track.p12.bed > $track.plusP12.bed done # Install for f in *.plusP12.bed; do table=$(basename $f .plusP12.bed) echo $table hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ -type=bed6+4 -as=$HOME/kent/src/hg/lib/bigNarrowPeak.as -noNameIx \ hg38 $table $f done rm bed.tab # 95 Hotspots view subtracks mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p12 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p12 cat >runOne <<'_EOF_' #!/bin/bash set -beEu -o pipefail track=$1 origFile=$2 liftOver -multiple -bedPlus=6 -noSerial $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ $track.broadPeak.p12.bed /dev/null sort -k1,1 -k2n,2n $origFile $track.broadPeak.p12.bed > $track.broadPeak.plusP12.bed bedToBigBed -as=$HOME/kent/src/hg/lib/encode/broadPeak.as -type=bed6+3 $track.broadPeak.plusP12.bed \ /hive/data/genomes/hg38/chrom.sizes $track.broadPeak.plusP12.bb _EOF_ chmod a+x runOne cp /dev/null jobList for origFile in ../wgEncodeRegDnaseHotspot.p11/*.broadPeak.plusP11.bed; do track=$(basename $origFile .broadPeak.plusP11.bed) echo ./runOne $track $origFile >> jobList done para make jobList para time #Completed: 95 of 95 jobs #CPU time in finished jobs: 206s 3.43m 0.06h 0.00d 0.000 y #IO & Wait Time: 251s 4.19m 0.07h 0.00d 0.000 y #Average job time: 5s 0.08m 0.00h 0.00d #Longest finished job: 8s 0.13m 0.00h 0.00d #Submission to last job: 22s 0.37m 0.01h 0.00d # Install for f in *.broadPeak.plusP12.bb; do track=$(basename $f .broadPeak.plusP12.bb) ln -sf `pwd`/$f /gbdb/hg38/bbi/wgEncodeRegDnase/$track.broadPeak.bb done # Don't do wgEncodeRegDnaseSignal view... the data files are same as DnaseWig below! ############################################################################# # Extend wgEncodeRegDnaseWig (DNase Signal) (DONE 19-01-23 angie) mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p12 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p12 cp /dev/null jobList for origFile in ../wgEncodeRegDnaseWig.p11/*.plusP11.bw; do track=$(basename $origFile .plusP11.bw) echo ~/kent/src/hg/utils/liftOverBigWigToPatches $origFile \ /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p12.over.chain \ /hive/data/genomes/hg38/chrom.sizes \ {check out exists $track.plusP12.bw} \ >> jobList done para make jobList para time #Completed: 95 of 95 jobs #CPU time in finished jobs: 140913s 2348.55m 39.14h 1.63d 0.004 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 428s 7.14m 0.12h 0.00d #Longest finished job: 828s 13.80m 0.23h 0.01d #Submission to last job: 1534s 25.57m 0.43h 0.02d # Install by updating /gbdb/ links. for f in *.plusP12.bw; do track=$(basename $f .plusP12.bw) ln -sf `pwd`/$f /gbdb/hg38/bbi/wgEncodeRegDnase/$track.bw done ############################################################################# # Rebuild ncbiRefSeqGenomicDiff (DONE 19-01-25 angie) mkdir /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies.p12 cd /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies.p12 db=hg38 pre=ncbiRefSeqGenomicDiff buildDir=/hive/data/genomes/hg38/bed/ncbiRefSeq.p12.2018-08-10 asmId=GCF_000001405.38_GRCh38.p12 time (zcat $buildDir/process/$asmId.rna.cds.gz \ | egrep '[0-9]+\.\.[0-9]+' \ | pslMismatchGapToBed -cdsFile=stdin -db=$db -ignoreQNamePrefix=X \ $buildDir/process/$asmId.$db.psl.gz \ /hive/data/genomes/$db/$db.2bit \ $buildDir/$db.rna.fa \ $pre) #real 0m40.604s bedToBigBed -type=bed9+ -tab -as=$HOME/kent/src/hg/lib/txAliDiff.as $pre.bed \ /hive/data/genomes/$db/chrom.sizes $pre.bb ln -sf `pwd`/$pre.bb /gbdb/hg38/ncbiRefSeq/$pre.bb ############################################################################# # DBSNP B151 / SNP151 (TODO 19-? angie) # b151 is for GRCh38.p7 (orgDir human_9606_b151_GRCh38p7) so it doesn't have # sequences added in p8 and later. How do we do this without disrupting the existing snp151 track? Can we build with a suffix and rename? Or just stop before loading... mkdir -p /hive/data/outside/dbSNP/151/human_hg38 cd /hive/data/outside/dbSNP/151/human_hg38 # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/ # to find the subdir name to use as orgDir below (human_9606_b151_GRCh38p7 in this case). # Go to that subdirectory, then to database/organism_data/ and look for files # whose names start with b151_* and may or may not end with a suffix that identifies # the build assembly version or some annotation version. If there is a suffix shared # by all b151_* files, add that to config.ra as the "buildAssembly". # dbSNP has all NT_/NW_ contig IDs, but our 2bit has UCSC-ified genbank names. # make a lift file. cat > config.ra < /hive/data/outside/dbSNP/151/human_hg38/assemblyLabels.txt # Start the usual pipeline at the loadDbSnp step. ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log #*** b151_ContigInfo_108 has coords for 305 sequences; these have been written to #*** /hive/data/outside/dbSNP/151/human_hg38/suggested.lft . # #*** GCF_000001405.33_GRCh38.p7_assembly_report.txt has mappings for 500 sequences; #*** these have been written to #*** /hive/data/outside/dbSNP/151/human_hg38/suggested.lft . # #*** You must account for all 805 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 805 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp . cp suggested.lft hg38.lft cat >> config.ra <>& do.log & tail -f do.log # *** All done! (through the 'bigBed' step) # Make a liftOver chain from main chromosomes to patches added after p7. # Do the liftOver for all positional snp151* tables, load with -oldTable ############################################################################## # OMIM tracks (TODO 19-? angie) # the otto process builds the omim* tables; edit otto/omim/buildOmimTracks.sh to make sure # the most recent dbSNP version is listed for the db. After the snpNNN table is updated to # include patch sequences, the next otto update will include patches. # omimGene2 is still using refGene, but I think it would be better if it used ncbiRefSeqCurated # if it exists. # TODO: OMIM Genes needs liftOver to new alts and fixes (or redo from ncbiRefSeq). # OMIM Phenotypes needs liftOvers to all alts and fixes. Sometimes it spans a region larger # than an alt/fix, so maybe lower the percentage that has to map? ############################################################################## # GRC Incident Database (TODO - 19-? - Angie) # Wait until the updated hg19 files have been pushed to RR because GRC Incident update is # automated. Then update the file used to map GRC's RefSeq accessions to our names: hgsql hg19 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \ > /hive/data/outside/grc/incidentDb/GRCh38/refSeq.chromNames.tab #############################################################################