4f93d78f61f85b8b7553e1ea8afb033b8db78d03 angie Thu Apr 8 15:23:24 2021 -0700 Adding steps for updating hg13 files & tracks to GRCh38.p13 -- some steps done, most not yet. diff --git src/hg/makeDb/doc/hg38/patchUpdate.13.txt src/hg/makeDb/doc/hg38/patchUpdate.13.txt new file mode 100644 index 0000000..c191056 --- /dev/null +++ src/hg/makeDb/doc/hg38/patchUpdate.13.txt @@ -0,0 +1,771 @@ +# for emacs: -*- mode: sh; -*- + +# This file describes how hg38 was extended with patch sequences and annotations from grcH38P13, +# after having previously been extended with grcH38P12 (see patchUpdate.12.txt). + +############################################################################## +# Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2020-03-13 - Angie) + + cd /hive/data/genomes/hg38 + # main 2bit + time faToTwoBit <(twoBitToFa hg38.2bit stdout) \ + <(twoBitToFa /hive/data/genomes/grcH38P13/grcH38P13.2bit stdout) \ + hg38.p13.2bit +#real 0m44.777s + # unmasked 2bit + twoBitMask -type=.bed hg38.p13.2bit /dev/null hg38.p13.unmasked.2bit + # chrom.sizes + sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcH38P13/chrom.sizes > chrom.sizes.p13 + # chromInfo + cd /hive/data/genomes/hg38/bed/chromInfo + awk '{print $1 "\t" $2 "\t/gbdb/hg38/hg38.2bit";}' ../../chrom.sizes.p13 \ + > chromInfo.p13.tab + wc -l chromInfo*.tab +# 578 chromInfo.p11.tab +# 595 chromInfo.p12.tab +# 640 chromInfo.p13.tab +# 455 chromInfo.tab + + # Install + cd /hive/data/genomes/hg38 + ln -sf hg38.p13.2bit hg38.2bit + ln -sf hg38.p13.unmasked.2bit hg38.unmasked.2bit + ln -sf chrom.sizes.p13 chrom.sizes + + cd /hive/data/genomes/hg38/bed/chromInfo + hgLoadSqlTab hg38 chromInfo chromInfo.sql chromInfo.p13.tab + + +############################################################################## +# Extend main database tables for fileless tracks (DONE - 2020-03-13 - Angie) + + # Just add the patch table rows to the main database tables + for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do + echo $table + hgsql hg38 -e "insert into hg38.$table select * from grcH38P13.$table" + done + + for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do + positionalTblCheck hg38 $table + done + + +############################################################################## +# Extend main database gc5BaseBw.bw (DONE - 2020-03-16 - Angie) + + cd /hive/data/genomes/hg38/bed/gc5Base/ + # Concatenate original assembly results with grcH38P13 results + time (zcat hg38.p12.gc5Base.wigVarStep.gz \ + /hive/data/genomes/grcH38P13/bed/gc5Base/grcH38P13.gc5Base.wigVarStep.gz \ + | gzip -c \ + > hg38.p13.gc5Base.wigVarStep.gz) +#real 6m39.885s + # Make a new gc5BaseBw.bw + time wigToBigWig hg38.p13.gc5Base.wigVarStep.gz ../../chrom.sizes.p13 \ + hg38.p13.gc5Base.bw +#real 11m38.366s + + # Install + cd /hive/data/genomes/hg38/bed/gc5Base/ + ln -sf hg38.p13.gc5Base.wigVarStep.gz hg38.gc5Base.wigVarStep.gz + ln -sf hg38.p13.gc5Base.bw hg38.gc5Base.bw + + +############################################################################## +# Extend main database download files (DONE - 2021-04-08 - Angie) + + cd /hive/data/genomes/hg38/goldenPath/bigZips + mkdir p13 + # hg38.2bit was already extended above. + ln -sf /hive/data/genomes/hg38/hg38.p13.2bit p13/ + + # AGP: + zcat p12/hg38.p12.agp.gz \ + /hive/data/genomes/grcH38P13/goldenPath/bigZips/grcH38P13.agp.gz \ + | grep -v ^# \ + | gzip -c > p13/hg38.p13.agp.gz + + # FASTA + twoBitToFa ../../hg38.p13.2bit stdout \ + | gzip -c > p13/hg38.p13.fa.gz + faSize p13/hg38.p13.fa.gz +#3272116950 bases (161368694 N's 3110748256 real 1489641612 upper 1621106644 lower) in 640 sequences in 1 files +#Total size: mean 5112682.7 sd 26768936.7 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 166743 + + twoBitToFa hg38.2bit stdout \ + | maskOutFa stdin hard stdout \ + | gzip -c > p13/hg38.p13.fa.masked.gz + + # RepeatMasker (don't include header of patch file): + cat <(zcat p12/hg38.p12.fa.out.gz) \ + <(zcat /hive/data/genomes/grcH38P13/goldenPath/bigZips/grcH38P13.fa.out.gz | tail -n +4) \ + | gzip -c > p13/hg38.p13.fa.out.gz + + # SimpleRepeats/TRF: + zcat p12/hg38.p12.trf.bed.gz \ + /hive/data/genomes/grcH38P13/goldenPath/bigZips/grcH38P13.trf.bed.gz \ + | gzip -c > p13/hg38.p13.trf.bed.gz + # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase: + zcat p12/hg38.p12.trf.bed.gz | cut -f 1 | uniq | wc -l +#502 + zcat p13/hg38.p13.trf.bed.gz | cut -f 1 | uniq | wc -l +#547 + + # hg38 files that are not built by makeDownloads.pl because hg38 is treated as 'scaffold-based': + # Per-chrom soft-masked FASTA: + rm -rf chroms + tar xzf p12/hg38.p12.chromFa.tar.gz + faSplit byname /hive/data/genomes/grcH38P13/goldenPath/bigZips/grcH38P13.fa.gz chroms/ + ls -1 chroms | wc -l +#640 + tar czf p13/hg38.p13.chromFa.tar.gz ./chroms + rm -rf chroms + + # Per-chrom hard-masked FASTA: + rm -rf maskedChroms + tar xzf p12/hg38.p12.chromFaMasked.tar.gz + faSplit byname /hive/data/genomes/grcH38P13/goldenPath/bigZips/grcH38P13.fa.masked.gz \ + maskedChroms/ + ls -1 maskedChroms | wc -l +#640 + tar czf p13/hg38.p13.chromFaMasked.tar.gz ./maskedChroms + rm -rf maskedChroms + + # RepeatMasker .align files: + zcat p12/hg38.p12.fa.align.gz \ + /hive/data/genomes/grcH38P13/bed/repeatMasker/grcH38P13.fa.align.gz \ + | gzip -c > p13/hg38.p13.fa.align.gz + + # Make new md5sum.txt + cd p13 + md5sum hg38.* > md5sum.txt + + # Install + cd /hive/data/genomes/hg38/goldenPath/bigZips + rm -rf latest + mkdir latest + cd latest + for file in ../p13/*; do + noVersion=$(echo $(basename $file) | sed -e 's/.p13//') + ln -s $file $noVersion + done + rm md5sum.txt + md5sum hg38* > md5sum.txt + echo GRCh38.p13 > LATEST_VERSION + + rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p13 + ln -s /hive/data/genomes/hg38/goldenPath/bigZips/p13 \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p13 + rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest + ln -s /hive/data/genomes/hg38/goldenPath/bigZips/latest \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/latest + ln -sf /hive/data/genomes/hg38/chrom.sizes.p13 \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/p13/hg38.p13.chrom.sizes + + +############################################################################# +# Build perSeqMax file for gfServer (hgBlat) (TODO - 2020-08-14 angie) + # When the blat server is restarted with the updated hg38.2bit file, + # hg38.altsAndFixes needs to be copied over along with the new hg38.2bit file, + # and gfServer needs to be restarted with -perSeqMax=hg38.altsAndFixes. + cd /hive/data/genomes/hg38 + cut -f 1 chrom.sizes.p13 \ + | grep -E '_(alt|fix)$' \ + | sed -re 's/^/hg38.2bit:/;' \ + > hg38.altsAndFixes.p13 + # Link for blat server installation convenience: + ln -sf hg38.altsAndFixes.p13 altsAndFixes + + +######################################################################### +# Regenerate idKeys with extended hg38 (TODO - 2020-08-10 - Angie) + + mkdir /hive/data/genomes/hg38/bed/idKeys.p13 + cd /hive/data/genomes/hg38/bed/idKeys.p13 + # ku down... use hgwdev this time: + time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \ + -twoBit=/hive/data/genomes/hg38/hg38.p13.unmasked.2bit \ + -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ + -buildDir=`pwd` hg38) > do.log 2>&1 & + tail -f do.log +#real 1m21.903s + cat hg38.keySignature.txt +#c9c5d621a52f96886fa9cd785c99248f + + # Install + cd /hive/data/genomes/hg38/bed/ + rm idKeys + ln -s idKeys.p13 idKeys + + +############################################################################# +# Extend cytoBand{,Ideo} (TODO - 2020-08-10 angie) + cd /hive/data/genomes/hg38/bed/cytoBand + tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcH38P13/chrom.sizes \ + > cytoBand.p13.tab + # Install + hgLoadSqlTab -oldTable hg38 cytoBand - cytoBand.p13.tab + hgLoadSqlTab -oldTable hg38 cytoBandIdeo - cytoBand.p13.tab + + +######################################################################### +# ncbiRefSeq.p13 Genes (TODO - 2020-08-10 - Angie) + + mkdir /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-08-10 + cd /hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-08-10 + + # Adding the -toGpWarnOnly flag because there are a handful of cases of CDS extending + # beyond exon coordinates. Terence Murphy says they'll eventually fix it but not soon. + # So, make sure to check do.log for warnings from gff3ToGenePred: + time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ + -toGpWarnOnly \ + refseq vertebrate_mammalian Homo_sapiens \ + GCF_000001405.38_GRCh38.p13 hg38) > do.log 2>&1 & tail -f do.log + # gff3ToGenePred warnings: +#Warning: skipping: no exon in id1912382 contains CDS 555851-556197 +#Warning: skipping: no exon in id1790907 contains CDS 22922593-22922913 +#Warning: skipping: no exon in id1790877 contains CDS 22906341-22906661 +#Warning: skipping: no exon in id1790824 contains CDS 22822981-22823289 +#Warning: skipping: no exon in id1365744 contains CDS 106088082-106088428 +#5 warnings converting GFF3 file: stdin + +# *** All done ! Elapsed time: 17m34s +#real 17m33.906s + + cat fb.ncbiRefSeq.hg38.txt +#134109466 bases of 3095998939 (4.332%) in intersection + + +############################################################################# +# UCSC to RefSeq, INSDC, Assembly; chromAlias (TODO 18-08-10 angie) + + # need to have idKeys for the genbank and refseq assemblies: + mkdir -p /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP13 + cd /hive/data/genomes/hg38/bed/ucscToINSDC/genbankP13 + ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p13/GCA_000001405.27_GRCh38.p13_genomic.fna.gz . + faToTwoBit GCA_000001405.27_GRCh38.p13_genomic.fna.gz genbankP13.2bit + time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP13.2bit \ + -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ + genbankP13) > do.log 2>&1 +#real 1m50.109s + + mkdir /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP13 + cd /hive/data/genomes/hg38/bed/ucscToINSDC/refseqP13 + ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p13/GCF_000001405.38_GRCh38.p13_genomic.fna.gz ./ + faToTwoBit GCF_000001405.38_GRCh38.p13_genomic.fna.gz refseqP13.2bit + time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP13.2bit \ + -bigClusterHub=hgwdev -smallClusterHub=hgwdev \ + refseqP13) > do.log 2>&1 +#real 1m19.941s + + # with the three idKeys available, join them to make the table bed files: + cd /hive/data/genomes/hg38/bed/ucscToINSDC + join -t$'\t' ../idKeys/hg38.idKeys.txt genbankP13/genbankP13.idKeys.txt \ + | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ + | sort -k1,1 -k2,2n > ucscToINSDC.p13.bed + + join -t$'\t' ../idKeys/hg38.idKeys.txt refseqP13/refseqP13.idKeys.txt \ + | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ + | sort -k1,1 -k2,2n > ucscToRefSeq.p13.bed + + # loading tables: + export db=hg38 + + export chrSize=`cut -f1 ucscToINSDC.p13.bed | awk '{print length($0)}' | sort -n | tail -1` + sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ + | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p13.bed + + export chrSize=`cut -f1 ucscToRefSeq.p13.bed | awk '{print length($0)}' | sort -n | tail -1` + sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ + | sed -e 's/INSDC/RefSeq/g;' \ + | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p13.bed + + # must be exactly 100% coverage + + featureBits -countGaps ${db} ucscToINSDC +#3257347282 bases of 3257347282 (100.000%) in intersection + + featureBits -countGaps ${db} ucscToRefSeq +#3257319537 bases of 3257347282 (99.999%) in intersection + # uh-oh! not 100% + featureBits -countGaps ${db} \!ucscToRefSeq -bed=stdout +#chrUn_KI270752v1 0 27745 chrUn_KI270752v1.1 + grep KI270752 \ + /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.38_GRCh38.p13/GCF_000001405.38_GRCh38.p13_assembly_report.txt +#HSCHRUN_RANDOM_CTG29 unplaced-scaffold na na KI270752.1 <> na Primary Assembly 27745 chrUn_KI270752v1 + # Yep, no RefSeq accession there. Guess it was dropped from the RefSeq p13 assembly??? + # Will ask Hiram and probably Terence. + + # construct chromAlias: + cd /hive/data/genomes/hg38/bed/chromAlias + hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \ + | sort -k1,1 > ucsc.refseq.tab + hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \ + | sort -k1,1 > ucsc.genbank.tab + # add NCBI sequence names from assembly report + grep -v ^# \ + /hive/data/genomes/grcH38P13/genbank/GCA_000001405.27_GRCh38.p13_assembly_report.txt \ + | tawk '{print $5, $1;}' | sort \ + > genbankToAssembly.txt + tawk '{print $2, $1;}' ucsc.genbank.tab | sort \ + | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \ + | sort -k1,1 > ucsc.assembly.tab + + ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ + > ${db}.chromAlias.tab + + # verify all there: + for t in refseq genbank assembly +do + c0=`cat ucsc.$t.tab | wc -l` + c1=`grep $t hg38.chromAlias.tab | wc -l` + ok="OK" + if [ "$c0" -ne "$c1" ]; then + ok="ERROR" + fi + printf "# checking $t: $c0 =? $c1 $ok\n" +done +# checking refseq: 594 =? 594 OK +# checking genbank: 595 =? 595 OK +# checking assembly: 595 =? 595 OK + # Note how there's one fewer refseq, consistent with featureBits above. + + hgLoadSqlTab hg38 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.tab + + +############################################################################## +# UCSC to Ensembl (TODO 18-08-06 angie) +# doc?? + + +############################################################################ +# altLocations and patchLocations (TODO - 2020-08-10 - Angie) + # indicate corresponding locations between haplotypes and reference + mkdir /hive/data/genomes/hg38/bed/altLocations.p13 + cd /hive/data/genomes/hg38/bed/altLocations.p13 + ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \ + /hive/data/genomes/grcH38P13/genbank/GCA_000001405.27_GRCh38.p13_assembly_structure/{ALT_*,PATCHES}/alt_scaffolds/alt_scaffold_placement.txt \ + | sort -k1,1 -k2n,2n \ + > altAndFixLocations.bed + wc -l altAndFixLocations.bed +#802 altAndFixLocations.bed + grep _alt altAndFixLocations.bed > altLocations.bed + grep _fix altAndFixLocations.bed > fixLocations.bed + hgLoadBed hg38 altLocations{,.bed} +#Read 664 elements of size 4 from altLocations.bed + hgLoadBed hg38 fixLocations{,.bed} +#Read 140 elements of size 4 from fixLocations.bed + featureBits -countGaps hg38 altLocations +#200094386 bases of 3257347282 (6.143%) in intersection + featureBits -countGaps hg38 fixLocations +#63654955 bases of 3257347282 (1.954%) in intersection + + +############################################################################# +# Check for new chrX alts/patches to add to par (TODO - 2020-08-10 angie) + +# Thanks to Hiram for pointing out that intersecting chrX positions in +# altLocations and par shows whether a chrX alt overlaps a PAR. + cd /hive/data/genomes/hg38/bed/par + hgsql hg38 -e 'select * from altLocations where chrom = "chrX"' +#+-----+-------+------------+----------+---------------------+ +#| bin | chrom | chromStart | chromEnd | name | +#+-----+-------+------------+----------+---------------------+ +#| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | +#| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | +#| 77 | chrX | 4950956 | 5129468 | chrX_KV766199v1_alt | +#| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | +#+-----+-------+------------+----------+---------------------+ + + hgsql hg38 -e 'select * from par where chrom like "chrX%"' +#+-----+---------------------+------------+-----------+------+ +#| bin | chrom | chromStart | chromEnd | name | +#+-----+---------------------+------------+-----------+------+ +#| 9 | chrX | 10000 | 2781479 | PAR1 | +#| 221 | chrX | 155701382 | 156030895 | PAR2 | +#| 73 | chrX_KI270880v1_alt | 0 | 284869 | PAR1 | +#| 73 | chrX_KI270913v1_alt | 0 | 274009 | PAR1 | +#+-----+---------------------+------------+-----------+------+ + # chrX_KI270881v1_alt and chrX_KV766199v1_alt are not in either PAR. + # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1 -- + # and are already in the PAR table, so nothing to add. + + +############################################################################## +# altSeqLiftOver (TODO - 2020-11-06 Angie) +# originally done 2020-08-10; redone 2020-11-06 w/fixed gff3ToPsl to get correct - strand alignments +# mainToPatch over.chain regenerated 2020-12-03 w/fixed pslToChain + + mkdir /hive/data/genomes/hg38/bed/altSeqLiftOver.p13 + cd /hive/data/genomes/hg38/bed/altSeqLiftOver.p13 + # Eventually these will be under the /hive/data/genomes/.../genbank/... directory + # that points to /hive/data/outside/ncbi/genomes/... but at the moment the contents + # of the alignments/ directories are not included in the sync. So for now, + # manually download them here. + # Original alts -- reuse the ones downloaded for p11: + ln -s ../altSeqLiftOver.p11/initialAlts . + # New alts and patches too: + mkdir patches + cd patches + wget --timestamping --no-verbose\ + ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCA_000001405.27_GRCh38.p13/GCA_000001405.27_GRCh38.p13_assembly_structure/PATCHES/alt_scaffolds/alignments/\*.gff + cd .. + # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names + hgsql hg38 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \ + | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed + wc -l gbToUcsc.sed +#595 gbToUcsc.sed + cp /dev/null altToChrom.noScore.psl + for f in initialAlts/*.gff patches/*.gff; + do + e=`basename $f .gff | sed -e 's/_/|/g;'` + s=`grep -E $e gbToUcsc.sed` + sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \ + | pslPosTarget stdin stdout \ + >> altToChrom.noScore.psl + done + pslCheck altToChrom.noScore.psl +#checked: 421 failed: 0 errors: 0 + time pslRecalcMatch altToChrom.noScore.psl ../../hg38.2bit{,} altToChrom.psl +#real 0m29.571s + pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl + sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \ + > altAndPatches.psl + grep _alt altAndPatches.psl > altSeqLiftOver.psl + grep _fix altAndPatches.psl > fixSeqLiftOver.psl + + # Load tables + hgLoadPsl hg38 -table=altSeqLiftOverPsl altSeqLiftOver.psl + hgLoadPsl hg38 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl + + # Make chrom-to-alt PSL file for genbank process. + ln -f -s `pwd`/chromToAlt.psl \ + /hive/data/genomes/hg38/jkStuff/hg38.p13.alt.psl + wc -l /hive/data/genomes/hg38/jkStuff/hg38.p13.alt.psl +#421 /hive/data/genomes/hg38/jkStuff/hg38.p13.alt.psl + + # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences + # exclude alts that were already in hg38 before p13 + # Redone 12/3/18 after Braney fixed pslToChain + cut -f 1 ../../chrom.sizes.p12 | grep _ \ + | grep -vwf - chromToAlt.psl \ + | pslToChain stdin stdout \ + | chainScore stdin ../../hg38.2bit{,} ../../jkStuff/hg38.mainToPatch.p13.over.chain + grep chain ../../jkStuff/hg38.mainToPatch.p13.over.chain | wc -l +#17 + + +############################################################################## +# Extend wgEncodeReg bigWig tracks (TODO - 2020-01-08 angie) +#NOTE: this has not been liftOver'd to original alts! + + # Use the *.plusP12.bigWig files and add p13. + for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do + composite=$(basename $dir) + echo $composite + cd $dir + for f in wg*.plusP12.bigWig; do + track=$(basename $f .plusP12.bigWig) + ~/kent/src/hg/utils/liftOverBigWigToPatches $f \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + /hive/data/genomes/hg38/chrom.sizes \ + $track.plusP13.bigWig & + done + wait + done + + # Install (not necessary after updating .plusP13 files, links already point there) + for dir in /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/{*Mark*,*Txn}; do + composite=$(basename $dir) + echo $composite + cd $dir + for f in wg*.plusP13.bigWig; do + track=$(basename $f .plusP13.bigWig) + ln -sf `pwd`/$track.plusP13.bigWig /gbdb/hg38/bbi/wgEncodeReg/$composite/$track.bigWig + done + done + + +############################################################################## +# Extend wgEncodeRegDnase (TODO - 2020-01-08 angie) +#NOTE: this has not been liftOver'd to original alts! + cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase + origFile=wgEncodeRegDnaseClustered.plusP12.bed + liftOver -multiple -bedPlus=5 -noSerial $origFile \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + wgEncodeRegDnaseClustered.p13.bed /dev/null + sort -k1,1 -k2n,2n $origFile wgEncodeRegDnaseClustered.p13.bed \ + > wgEncodeRegDnaseClustered.plusP13.bed + + hgLoadBed hg38 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.plusP13.bed \ + -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \ + -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as + + +############################################################################## +# Extend wgEncodeRegTfbsClusteredV3 (TODO - 2020-01-08 angie) +#NOTE: this has not been liftOver'd to original alts! + cd /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3/ + origFile=wgEncodeRegTfbsClusteredV3.plusP12.bed + liftOver -multiple -bedPlus=5 -noSerial $origFile \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + wgEncodeRegTfbsClusteredV3.p13.bed /dev/null + sort -k1,1 -k2n,2n $origFile wgEncodeRegTfbsClusteredV3.p13.bed \ + > wgEncodeRegTfbsClusteredV3.plusP13.bed + hgLoadBed hg38 wgEncodeRegTfbsClusteredV3 wgEncodeRegTfbsClusteredV3.plusP13.bed \ + -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \ + -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as + + +############################################################################## +# Extend GTEX GENE (TODO - 2020-01-15 angie) + mkdir /hive/data/genomes/hg38/bed/gtex.p13 + cd /hive/data/genomes/hg38/bed/gtex.p13 + liftOver -multiple -bedPlus=6 -noSerial ../gtex.p12/gtexGene.plusP12.bed \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + gtexGene.p13.bed /dev/null + sort -k1,1 -k2n,2n ../gtex.p12/gtexGene.plusP12.bed gtexGene.p13.bed \ + > gtexGene.plusP13.bed + # There is actually no bin column in gtexGene. + hgLoadSqlTab hg38 gtexGene $HOME/kent/src/hg/lib/gtexGeneBed.sql gtexGene.plusP13.bed + + # Two of the genes fall on inversions in the mapping of chr to alt/fix, so part of a gene + # maps on a + chain and part on a - chain. The SQL table has a unique index on + # (chr, geneId) so having two results (+ and -) makes it error out. When that happens, + # remove the smaller inversion mapping -- the larger gene region is still mapped. + grep -v '^chr12_KN538369v1_fix.*-.ENSG00000165714' gtexGene.plusP13.bed \ + | grep -v '^chr7_KV880765v1_fix.*+.ENSG00000164597' \ + | hgLoadSqlTab hg38 gtexGene $HOME/kent/src/hg/lib/gtexGeneBed.sql stdin + + # gtexGeneModel + liftOver -multiple -genePred ../gtex.p12/gtexGeneModel.plusP12.gp \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + gtexGeneModel.p13.gp /dev/null + sort -k2,2 -k3n,3n ../gtex.p12/gtexGeneModel.plusP12.gp gtexGeneModel.p13.gp \ + > gtexGeneModel.plusP13.gp + hgLoadGenePred hg38 gtexGeneModel gtexGeneModel.plusP13.gp + + +############################################################################# +# UPDATE /scratch/data/ 2bit (TODO - 2020-09-26 angie) + cp -p /hive/data/genomes/hg38/hg38.p13.2bit /hive/data/staging/data/hg38/ + mv /hive/data/staging/data/hg38/hg38.2bit{,.bak} + mv /hive/data/staging/data/hg38/hg38{.p13,}.2bit + cmp /hive/data/genomes/hg38/hg38.initial.2bit /hive/data/staging/data/hg38/hg38.2bit.bak + # No output -- the .bak copy is identical as expected, so clean it up. + rm /hive/data/staging/data/hg38/hg38.2bit.bak + + +############################################################################## +# Extend wgEncodeRegDnase (DNase HS) (TODO - 2020-01-23 angie) + # 95 Peak view subtracks + mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p13 + cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHS.p13 + for f in ../wgEncodeRegDnaseHS.p12/*.plusP12.bed; do + track=$(basename $f .plusP12.bed) + liftOver -multiple -bedPlus=5 -noSerial $f \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + $track.p13.bed /dev/null + sort -k1,1 -k2n,2n $f $track.p13.bed > $track.plusP13.bed + done + # Install + for f in *.plusP13.bed; do + table=$(basename $f .plusP13.bed) + echo $table + hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ + -type=bed6+4 -as=$HOME/kent/src/hg/lib/bigNarrowPeak.as -noNameIx \ + hg38 $table $f + done + rm bed.tab + + # 95 Hotspots view subtracks + mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p13 + cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseHotspot.p13 + cat >runOne <<'_EOF_' +#!/bin/bash +set -beEu -o pipefail +track=$1 +origFile=$2 + +liftOver -multiple -bedPlus=6 -noSerial $origFile \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + $track.broadPeak.p13.bed /dev/null +sort -k1,1 -k2n,2n $origFile $track.broadPeak.p13.bed > $track.broadPeak.plusP13.bed + +bedToBigBed -as=$HOME/kent/src/hg/lib/encode/broadPeak.as -type=bed6+3 $track.broadPeak.plusP13.bed \ + /hive/data/genomes/hg38/chrom.sizes $track.broadPeak.plusP13.bb +_EOF_ + chmod a+x runOne + cp /dev/null jobList + for origFile in ../wgEncodeRegDnaseHotspot.p12/*.broadPeak.plusP12.bed; do + track=$(basename $origFile .broadPeak.plusP12.bed) + echo ./runOne $track $origFile >> jobList + done + para make jobList + para time +#Completed: 95 of 95 jobs +#CPU time in finished jobs: 206s 3.43m 0.06h 0.00d 0.000 y +#IO & Wait Time: 251s 4.19m 0.07h 0.00d 0.000 y +#Average job time: 5s 0.08m 0.00h 0.00d +#Longest finished job: 8s 0.13m 0.00h 0.00d +#Submission to last job: 22s 0.37m 0.01h 0.00d + + # Install + for f in *.broadPeak.plusP13.bb; do + track=$(basename $f .broadPeak.plusP13.bb) + ln -sf `pwd`/$f /gbdb/hg38/bbi/wgEncodeRegDnase/$track.broadPeak.bb + done + + # Don't do wgEncodeRegDnaseSignal view... the data files are same as DnaseWig below! + + +############################################################################# +# Extend wgEncodeRegDnaseWig (DNase Signal) (TODO - 2020-01-23 angie) + + mkdir /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p13 + cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/wgEncodeRegDnaseWig.p13 + cp /dev/null jobList + for origFile in ../wgEncodeRegDnaseWig.p12/*.plusP12.bw; do + track=$(basename $origFile .plusP12.bw) + echo ~/kent/src/hg/utils/liftOverBigWigToPatches $origFile \ + /hive/data/genomes/hg38/jkStuff/hg38.mainToPatch.p13.over.chain \ + /hive/data/genomes/hg38/chrom.sizes \ + {check out exists $track.plusP13.bw} \ + >> jobList + done + para make jobList + para time +#Completed: 95 of 95 jobs +#CPU time in finished jobs: 140913s 2348.55m 39.14h 1.63d 0.004 y +#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y +#Average job time: 428s 7.14m 0.12h 0.00d +#Longest finished job: 828s 13.80m 0.23h 0.01d +#Submission to last job: 1534s 25.57m 0.43h 0.02d + + # Install by updating /gbdb/ links. + for f in *.plusP13.bw; do + track=$(basename $f .plusP13.bw) + ln -sf `pwd`/$f /gbdb/hg38/bbi/wgEncodeRegDnase/$track.bw + done + + +############################################################################# +# Rebuild ncbiRefSeqGenomicDiff (TODO - 2020-01-25 angie) + mkdir /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies.p13 + cd /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies.p13 + + db=hg38 + pre=ncbiRefSeqGenomicDiff + buildDir=/hive/data/genomes/hg38/bed/ncbiRefSeq.p13.2020-08-10 + asmId=GCF_000001405.38_GRCh38.p13 + + time (zcat $buildDir/process/$asmId.rna.cds.gz \ + | egrep '[0-9]+\.\.[0-9]+' \ + | pslMismatchGapToBed -cdsFile=stdin -db=$db -ignoreQNamePrefix=X \ + $buildDir/process/$asmId.$db.psl.gz \ + /hive/data/genomes/$db/$db.2bit \ + $buildDir/$db.rna.fa \ + $pre) +#real 0m40.604s + + bedToBigBed -type=bed9+ -tab -as=$HOME/kent/src/hg/lib/txAliDiff.as $pre.bed \ + /hive/data/genomes/$db/chrom.sizes $pre.bb + ln -sf `pwd`/$pre.bb /gbdb/hg38/ncbiRefSeq/$pre.bb + + +############################################################################# +# DBSNP B151 / SNP151 (TODO - 2020-? angie) +# b151 is for GRCh38.p7 (orgDir human_9606_b151_GRCh38p7) so it doesn't have +# sequences added in p8 and later. + +How do we do this without disrupting the existing snp151 track? Can we build with a suffix and rename? Or just stop before loading... + + mkdir -p /hive/data/outside/dbSNP/151/human_hg38 + cd /hive/data/outside/dbSNP/151/human_hg38 + # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/organisms/ + # to find the subdir name to use as orgDir below (human_9606_b151_GRCh38p7 in this case). + # Go to that subdirectory, then to database/organism_data/ and look for files + # whose names start with b151_* and may or may not end with a suffix that identifies + # the build assembly version or some annotation version. If there is a suffix shared + # by all b151_* files, add that to config.ra as the "buildAssembly". + # dbSNP has all NT_/NW_ contig IDs, but our 2bit has UCSC-ified genbank names. + # make a lift file. + cat > config.ra < /hive/data/outside/dbSNP/151/human_hg38/assemblyLabels.txt + + # Start the usual pipeline at the loadDbSnp step. + ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & + tail -f do.log +#*** b151_ContigInfo_108 has coords for 305 sequences; these have been written to +#*** /hive/data/outside/dbSNP/151/human_hg38/suggested.lft . +# +#*** GCF_000001405.33_GRCh38.p7_assembly_report.txt has mappings for 500 sequences; +#*** these have been written to +#*** /hive/data/outside/dbSNP/151/human_hg38/suggested.lft . +# +#*** You must account for all 805 contig_acc values in config.ra, +#*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). +#*** Check the auto-generated suggested.lft to see if it covers all +#*** 805 contigs; if it does, add 'liftUp suggested.lft' to config.ra. +#*** Then run again with -continue=loadDbSnp . + cp suggested.lft hg38.lft + cat >> config.ra <>& do.log & + tail -f do.log +# *** All done! (through the 'bigBed' step) + + + + # Make a liftOver chain from main chromosomes to patches added after p7. + + + # Do the liftOver for all positional snp151* tables, load with -oldTable + + +############################################################################## +# OMIM tracks (TODO - 2020-? angie) +# the otto process builds the omim* tables; edit otto/omim/buildOmimTracks.sh to make sure +# the most recent dbSNP version is listed for the db. After the snpNNN table is updated to +# include patch sequences, the next otto update will include patches. +# omimGene2 is still using refGene, but I think it would be better if it used ncbiRefSeqCurated +# if it exists. + +# TODO: OMIM Genes needs liftOver to new alts and fixes (or redo from ncbiRefSeq). +# OMIM Phenotypes needs liftOvers to all alts and fixes. Sometimes it spans a region larger +# than an alt/fix, so maybe lower the percentage that has to map? + + +############################################################################## +# GRC Incident Database (TODO - 2020-? - Angie) + + # Wait until the updated hg19 files have been pushed to RR because GRC Incident update is + # automated. Then update the file used to map GRC's RefSeq accessions to our names: + hgsql hg19 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \ + > /hive/data/outside/grc/incidentDb/GRCh38/refSeq.chromNames.tab + + +#############################################################################