f7577fc37cbcba25eac796f12e81ac0235f77c0c jeltje.van.baren Tue Jan 21 11:03:16 2025 -0800 fixing wrongly named copy of alphaMissense.txt diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt index 20b6e2a715a..1f3dc1234db 100644 --- src/hg/makeDb/doc/hg38/hg38.txt +++ src/hg/makeDb/doc/hg38/hg38.txt @@ -1,14 +1,7390 @@ +# for emacs: -*- mode: sh; -*- + +# This file describes how we made the browser database on +# NCBI build 38 (December 2013 freeze) aka: +# GRCh38 - Genome Reference Consortium Human Reference 38 +# Assembly Accession: GCA_000001405.2 + +############################################################################# +## Download sequence - DONE - 2013-12-24 + mkdir /hive/data/genomes/hg38 + mkdir /hive/data/genomes/hg38/genbank + cd /hive/data/genomes/hg38/genbank + time rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/ ./ +# sent 19643 bytes received 4914689807 bytes 4490369.53 bytes/sec +# total size is 4914019581 speedup is 1.00 + +# real 18m14.497s + +############################################################################# +## convert to UCSC names - DONE - 2013-12-24 +# with this release, NCBI has adopted a naming convention that is similar +# to UCSC. 
The delivered sequence with these names can be found in: +# /hive/data/genomes/hg38/genbank/seqs_for_alignment_pipelines/ +# +# The following scripts reproduce this naming scheme from the separate +# files in the release +# + mkdir /hive/data/genomes/hg38/ucsc + cat << '_EOF_' > ucscCompositeAgp.pl +#!/bin/env perl + +use strict; +use warnings; + +my %accToChr; + +open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or + die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; +while (my $line = <FH>) { + next if ($line =~ m/^#/); + chomp $line; + my ($chrN, $acc) = split('\s+', $line); + $accToChr{$acc} = $chrN; +} +close (FH); + +foreach my $acc (keys %accToChr) { + my $chrN = $accToChr{$acc}; + print "$acc $accToChr{$acc}\n"; + open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; + open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; + while (my $line = <FH>) { + if ($line =~ m/^#/) { + print UC $line; + } else { + $line =~ s/^$acc/chr${chrN}/; + print UC $line; + } + } + close (FH); + close (UC); + open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; + open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; + while (my $line = <FH>) { + if ($line =~ m/^>/) { + printf UC ">chr${chrN}\n"; + } else { + print UC $line; + } + } + close (FH); + close (UC); +} +'_EOF_' + # << happy emacs + chmod +x ucscCompositeAgp.pl + + cat << '_EOF_' > unlocalized.pl +#!/bin/env perl + +use strict; +use warnings; + +my %accToChr; +my %chrNames; + +open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or + die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; +while (my $line = <FH>) { + next if ($line =~ m/^#/); + chomp $line; + my ($chrN, $acc) = split('\s+', $line); + $acc =~ s/\./v/; + $accToChr{$acc} = $chrN; + 
$chrNames{$chrN} += 1; +} +close (FH); + +foreach my $chrN (keys %chrNames) { + my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; + my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; + open (FH, "zcat $agpFile|") or die "can not read $agpFile"; + open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; + while (my $line = <FH>) { + if ($line =~ m/^#/) { + print UC $line; + } else { + chomp $line; + my (@a) = split('\t', $line); + my $acc = $a[0]; + $acc =~ s/\./v/; + die "ERROR: chrN $chrN not correct for $acc" + if ($accToChr{$acc} ne $chrN); + my $ucscName = "chr${chrN}_${acc}_random"; + printf UC "%s", $ucscName; + for (my $i = 1; $i < scalar(@a); ++$i) { + printf UC "\t%s", $a[$i]; + } + printf UC "\n"; + } + } + close (FH); + close (UC); + printf "chr%s\n", $chrN; + open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; + open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; + while (my $line = <FH>) { + if ($line =~ m/^>/) { + chomp $line; + my $acc = $line; + $acc =~ s/.*gb\|//; + $acc =~ s/. 
Homo.*//; + $acc =~ s/\./v/; + die "ERROR: chrN $chrN not correct for $acc" + if ($accToChr{$acc} ne $chrN); + my $ucscName = "chr${chrN}_${acc}_random"; + printf UC ">$ucscName\n"; + } else { + print UC $line; + } + } + close (FH); + close (UC); +} +'_EOF_' + # << happy emacs + chmod +x unlocalized.pl + + cat << '_EOF_' > unplaced.pl +#!/bin/env perl + +use strict; +use warnings; + +my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; +my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; +open (FH, "zcat $agpFile|") or die "can not read $agpFile"; +open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; +while (my $line = <FH>) { + if ($line =~ m/^#/) { + print UC $line; + } else { + $line =~ s/\./v/; + printf UC "chrUn_%s", $line; + } +} +close (FH); +close (UC); + +open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; +open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; +while (my $line = <FH>) { + if ($line =~ m/^>/) { + chomp $line; + $line =~ s/.*gb\|//; + $line =~ s/. 
Homo.*//; + $line =~ s/\./v/; + printf UC ">chrUn_$line\n"; + } else { + print UC $line; + } +} +close (FH); +close (UC); +'_EOF_' + # << happy emacs + chmod +x unplaced.pl + + cat << '_EOF_' > altSequence.pl +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; + +open (AG, ">chrAlt.agp") or die "can not write to chrAlt.agp"; +open (FA, ">chrAlt.fa") or die "can not write to chrAlt.fa"; +open (FH, "find ../genbank/ALT* -type f | grep alt_scaffold_placement.txt|") or die "can not find alt_scaffold_placement.txt files"; +while (my $file = <FH>) { + chomp $file; + my $dirName = dirname($file); + my $agpFile = "$dirName/AGP/alt.scaf.agp.gz"; + my $fastaFile = "$dirName/FASTA/alt.scaf.fa.gz"; + # key is genbank acc name, value is UCSC chr name + my %nameDelta; +# printf STDERR "# %s\n", $file; + open (AL, "<$file") or die "can not read $file"; + while (my $line = <AL>) { + next if ($line =~ m/^#/); + chomp $line; + my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, + $parent_type, $parent_name, $parent_acc, $region_name, $ori, + $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, + $alt_start_tail, $alt_stop_tail) = split('\t', $line); + my $ucscAcc = $alt_scaf_acc; + $ucscAcc =~ s/\./v/; + my $ucscName = sprintf("chr%s_%s_alt", $parent_name, $ucscAcc); + printf "%s %s\n", $alt_scaf_acc, $ucscName; + if (exists ($nameDelta{$alt_scaf_acc})) { + die "duplicate name incorrect ? 
$alt_scaf_acc $nameDelta{$alt_scaf_acc} ne $ucscName" if ($nameDelta{$alt_scaf_acc} ne $ucscName); + } else { + $nameDelta{$alt_scaf_acc} = $ucscName; + } + } + close (AL); + open (AL, "zcat $agpFile|") or die "can not read $agpFile"; + while (my $line = <AL>) { + if ($line =~ m/^#/) { + print AG "$line"; + } else { + my ($acc, $rest) = split('\t', $line, 2); + die "can not find ucsc name for $acc" if (!exists($nameDelta{$acc})); + printf AG "%s\t%s", $nameDelta{$acc}, $rest; + } + } + close (AL); + open (AL, "zcat $fastaFile|") or die "can not read $fastaFile"; + while (my $line = <AL>) { + chomp $line; + if ($line =~ m/^>/) { + $line =~ s/.*gb.//; + $line =~ s/. Homo.*//; + die "can not find ucsc name for $line" if (!exists($nameDelta{$line})); + printf FA ">%s\n", $nameDelta{$line}; + } else { + printf FA "%s\n", $line; + } + } + close (AL); +} +close (FH); +close (AG); +close (FA); +'_EOF_' + # << happy emacs + chmod +x altSequence.pl + + ./ucscCompositeAgp.pl + ./unlocalized.pl + ./unplaced.pl + ./altSequence.pl + + # temporarily verify the fasta and AGP are complete and compatible + faToTwoBit chr*.fa hg38.test.2bit + cat chr*.agp > hg38.agp + checkAgpAndFa hg38.agp hg38.test.2bit 2>&1 | tail -1 +# All AGP and FASTA entries agree - both files are valid + + rm -f hg38.agp hg38.test.2bit + + # comparing faCounts of this 2bit file and the sequences delivered + # in genbank/seqs_for_alignment_pipelines/ + # result in the exact same sequence + +############################################################################# +## initial db build - DONE - 2013-12-24 - Hiram + + cd /hive/data/genomes/hg38 + cat << '_EOF_' > hg38.config.ra +# Config parameters for makeGenomeDb.pl: +db hg38 +scientificName Homo sapiens +commonName Human +assemblyDate Dec. 
2013 +assemblyLabel GRCh38 Genome Reference Consortium Human Reference 38 (GCA_000001405.2) +assemblyShortLabel GRCh38 +orderKey 13 +mitoAcc none +fastaFiles /hive/data/genomes/hg38/ucsc/chr*.fa +agpFiles /hive/data/genomes/hg38/ucsc/chr*.agp +# qualFiles /dev/null +dbDbSpeciesDir human +photoCreditURL http://www.cbse.ucsc.edu/ +photoCreditName Graphic courtesy of CBSE +ncbiGenomeId 51 +ncbiAssemblyId 883148 +ncbiAssemblyName GRCh38 +ncbiBioProject 31257 +genBankAccessionID GCA_000001405.2 +taxId 9606 +'_EOF_' + # << happy emacs + + # step wise to first verify AGP and Fasta files + time makeGenomeDb.pl -stop=agp hg38.config.ra > agp.log 2>&1 + + # looking good, continue: + time makeGenomeDb.pl -continue=db hg38.config.ra > db.log 2>&1 + + # add the files produced by the trackDb build to the source tree + + # this path is fixed in the makeGenomeDb.pl for next time + # honor new convention for bbi location files: + cd /gbdb/hg38/bbi + mkdir gc5BaseBw + mv gc5Base.bw gc5BaseBw + cd gc5BaseBw + # before + hgsql -e 'select * from gc5BaseBw;' hg38 +# +---------------------------+ +# | fileName | +# +---------------------------+ +# | /gbdb/hg38/bbi/gc5Base.bw | +# +---------------------------+ + # and fixed + hgBbiDbLink hg38 gc5BaseBw `pwd`/gc5Base.bw + hgsql -e 'select * from gc5BaseBw;' hg38 +# +-------------------------------------+ +# | fileName | +# +-------------------------------------+ +# | /gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw | +# +-------------------------------------+ + +############################################################################# +## RepeatMasker with CrossMatch - DONE - 2013-12-24,27 - Hiram + mkdir /hive/data/genomes/hg38/bed/repeatMaskerCM + cd /hive/data/genomes/hg38/bed/repeatMaskerCM + # running this step wise so it can be loaded into its own table + time doRepeatMasker.pl -stop=mask -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 + # real 3443m13.026s +# RepeatMasker version June 20 2013 
open-4.0.3 +# Search Engine: cross-match version 1.090518 +# RepeatMasker Database: 20130422 + + # take the install script from this -debug run and alter it to load + # the table into rmskCM + time doRepeatMasker.pl -continue=install -stop=install -debug \ + -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 + cat fb.hg38.rmskCM.txt + # 1586326530 bases of 3209286105 (49.429%) in intersection + + # profile of repeat elements: +# 1852545 rmskClass/SINE.tab +# 1570523 rmskClass/LINE.tab +# 748597 rmskClass/LTR.tab +# 703682 rmskClass/Simple_repeat.tab +# 499108 rmskClass/DNA.tab +# 102856 rmskClass/Low_complexity.tab +# 7962 rmskClass/Satellite.tab +# 5750 rmskClass/Retroposon.tab +# 5667 rmskClass/LTR?.tab +# 5622 rmskClass/Unknown.tab +# 4516 rmskClass/snRNA.tab +# 3294 rmskClass/DNA?.tab +# 2026 rmskClass/tRNA.tab +# 1840 rmskClass/rRNA.tab +# 1784 rmskClass/RC.tab +# 1672 rmskClass/srpRNA.tab +# 1420 rmskClass/scRNA.tab +# 704 rmskClass/RNA.tab +# 411 rmskClass/RC?.tab +# 38 rmskClass/SINE?.tab + + # using this RM result with trfMask for the final masked sequence + cd /hive/data/genomes/hg38 + twoBitMask hg38.rmskCM.2bit -add bed/simpleRepeat/trfMask.bed hg38.2bit + twoBitToFa hg38.2bit stdout | faSize stdin > faSize.hg38.2bit.txt +# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper 1588630985 lower) in 455 sequences in 1 files +# %49.50 masked total, %52.10 masked real + + featureBits -countGaps hg38 rmskCM '!rmskHmmer' -bed=crossMatchUnique.bed + # 24868153 bases of 3209286105 (0.775%) in intersection + hgLoadBed hg38 crossMatchUnique crossMatchUnique.bed + # Read 2352219 elements of size 4 from crossMatchUnique.bed + +############################################################################# +## repeating RepeatMasker Blastn run (DONE - 2014-01-07 - Hiram) + mkdir /hive/data/genomes/hg38/bed/rmskBlastn + cd /hive/data/genomes/hg38/bed/rmskBlastn + + time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + 
-useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ + -stop=mask -buildDir=`pwd` hg38 > mask.log + # real 203m33.670s + +# 3209286105 bases (159970322 N's 3049315783 real 1491207906 upper 1558107877 lower) in 455 sequences in 1 files +# %48.55 masked total, %51.10 masked real + + # install step with debug so the script can be altered to load into + # a specific rmskBlastn table: + + $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ + -continue=install -debug -buildDir=`pwd` hg38 + +############################################################################# +## repeating RepeatMasker cross-match run (DONE - 2014-01-07 - Hiram) + mkdir /hive/data/genomes/hg38/bed/rmskCM + cd /hive/data/genomes/hg38/bed/rmskCM + + # missed recording stderr .... forgot the 2>&1 + time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ + -stop=mask -buildDir=`pwd` hg38 > mask.log + # real 1897m33.517s + # running from Tue Jan 7 16:10:33 PST 2014 thru 08 Jan 23:48 +# *** All done! 
(through the 'mask' step) - Elapsed time: 1897m34s +# *** Steps were performed in /hive/data/genomes/hg38/bed/rmskCM + # running install manually to allow edit of the script to load + # a specific rmskCm table + time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ + -continue=install -stop=install -buildDir=`pwd` hg38 -debug + +############################################################################# +## RepeatMasker with RM Blastn - DONE - 2013-12-24,25 - Hiram + mkdir /hive/data/genomes/hg38/bed/repeatMaskerBlastn + cd /hive/data/genomes/hg38/bed/repeatMaskerBlastn + # running this step wise so it can be loaded into its own table + time doRepeatMasker.pl -stop=mask -useRMBlastn -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 + # real 354m55.842s + + # take the install script from this -debug run and alter it to load + # the table into rmskBlastn + doRepeatMasker.pl -useRMBlastn -bigClusterHub=ku -continue=install \ + -stop=install -debug -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 + # 1560264046 bases of 3209286105 (48.617%) in intersection + # profile of repeat elements: +# 1824560 rmskClass/SINE.tab +# 1552814 rmskClass/LINE.tab +# 738435 rmskClass/LTR.tab +# 715998 rmskClass/Simple_repeat.tab +# 486591 rmskClass/DNA.tab +# 105026 rmskClass/Low_complexity.tab +# 7712 rmskClass/Satellite.tab +# 5638 rmskClass/Retroposon.tab +# 5276 rmskClass/Unknown.tab +# 5100 rmskClass/LTR?.tab +# 4548 rmskClass/snRNA.tab +# 3033 rmskClass/DNA?.tab +# 1987 rmskClass/tRNA.tab +# 1809 rmskClass/rRNA.tab +# 1710 rmskClass/RC.tab +# 1633 rmskClass/srpRNA.tab +# 1428 rmskClass/scRNA.tab +# 614 rmskClass/RNA.tab +# 376 rmskClass/RC?.tab +# 38 rmskClass/SINE?.tab +# 3 rmskClass/Unspecified.tab +# 5464329 total + +############################################################################# +## repeating RepeatMasker run with HMMER - DONE - 2014-01-08 - Hiram + mkdir 
/hive/data/genomes/hg38/bed/rmskHmmer + cd /hive/data/genomes/hg38/bed/rmskHmmer + + # trying cpu=4 and ram=32g + time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + -stop=mask -useHMMER -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 + # 6 jobs required more than 32 Gb of memory to complete, ran them on + # hgwdev to complete, then continuing: + time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + -continue=cat -stop=mask -useHMMER -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > cat.log 2>&1 + # real 24m5.274s +# 3209286105 bases (159970322 N's 3049315783 real 1314916231 upper 1734399552 lower) in 455 sequences in 1 files +# %54.04 masked total, %56.88 masked real + + # running install manually to allow edit of the script to load + # a specific rmskHmmer table + time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ + -continue=install -debug -useHMMER -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 + + time ./doLoad_rmskHmmer.bash > load.log 2>&1 + # real 4m47.432s + + featureBits -countGaps hg38 rmskHmmer > fb.hg38.rmskHmmer.txt 2>&1 + # 1734398971 bases of 3209286105 (54.043%) in intersection + + grep rmskClass hg38.class.profile.txt \ + | sed -e 's#rmskClass/##; s/.tab//;' | sort -rn + # profile of repeat elements: +# 1884179 SINE +# 1702529 LINE +# 805427 LTR +# 636906 Simple_repeat +# 565171 DNA +# 95480 Low_complexity +# 11861 Retroposon +# 10852 Satellite +# 9181 LTR? +# 6783 scRNA +# 4582 DNA? +# 3914 Unknown +# 2059 RC +# 1517 srpRNA +# 1484 RNA +# 970 SINE? +# 806 RC? 
+# 464 rRNA +# 5744165 total + + featureBits -countGaps hg38 rmskHmmer '!rmskCM' -bed=hmmerUnique.bed + # 172940594 bases of 3209286105 (5.389%) in intersection + hgLoadBed hg38 hmmerUnique hmmerUnique.bed + # Read 3099505 elements of size 4 from hmmerUnique.bed + +############################################################################# +## RepeatMasker with HMMER - DONE - 2013-12-24,26 - Hiram + mkdir /hive/data/genomes/hg38/bed/repeatMaskerHMMER + cd /hive/data/genomes/hg38/bed/repeatMaskerHMMER + + time doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 + # take the install script from this -debug run and alter it to load + # the table into rmskHmmer + doRepeatMasker.pl -continue=install -stop=install -useHMMER \ + -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ + -buildDir=`pwd` hg38 > mask.log 2>&1 + # 1702017722 bases of 3209286105 (53.034%) in intersection + # profile of repeat elements: +# 1879864 rmskClass/SINE.tab +# 1678216 rmskClass/LINE.tab +# 794231 rmskClass/LTR.tab +# 651561 rmskClass/Simple_repeat.tab +# 551965 rmskClass/DNA.tab +# 97186 rmskClass/Low_complexity.tab +# 10756 rmskClass/Retroposon.tab +# 10448 rmskClass/Satellite.tab +# 8393 rmskClass/LTR?.tab +# 5849 rmskClass/scRNA.tab +# 4282 rmskClass/Unknown.tab +# 4276 rmskClass/DNA?.tab +# 2000 rmskClass/RC.tab +# 1573 rmskClass/srpRNA.tab +# 1291 rmskClass/RNA.tab +# 906 rmskClass/snRNA.tab +# 747 rmskClass/SINE?.tab +# 723 rmskClass/RC?.tab +# 722 rmskClass/rRNA.tab +# 468 rmskClass/tRNA.tab +# 5705457 total + +############################################################################# +# rmsk from genbank release (DONE - 2014-12-25 - Hiram) + mkdir /hive/data/genomes/hg38/bed/repeatMaskerGenbank + cd /hive/data/genomes/hg38/bed/repeatMaskerGenbank + + head -3 ../repeatMaskerBlastn/hg38.fa.out > genbank.rm.out +find ../../genbank -type f | grep rm.out | grep -v "/placed_scaffolds/" | while read F +do 
+ headRest 3 $F +done | sort -k5,45 -k6,6n >> genbank.rm.out + grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ + | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt + + sed -e "`cat accessionToUcsc.sed.txt`" genbank.rm.out > ucscNames.rm.out + + head -3 ucscNames.rm.out > hg38.sorted.fa.out + tail -n +4 ucscNames.rm.out | sort -k5,5 -k6,6n >> hg38.sorted.fa.out + + hgLoadOut -table=rmskGenbank -nosplit hg38 hg38.sorted.fa.out + hgLoadOut -verbose=2 -tabFile=hg38.rmskGenbank.tab -table=rmskGenbank \ + -nosplit hg38 hg38.sorted.fa.out 2> bad.records.txt + # fixed up one of the masking scripts from the other runs to construct + # the bbi files + + # 1581568556 bases of 3209286105 (49.281%) in intersection + # profile of repeat elements: +# 1849444 rmskClass/SINE.tab +# 1586141 rmskClass/LINE.tab +# 759248 rmskClass/LTR.tab +# 502186 rmskClass/DNA.tab +# 433789 rmskClass/Simple_repeat.tab +# 396378 rmskClass/Low_complexity.tab +# 10198 rmskClass/Satellite.tab +# 5884 rmskClass/LTR?.tab +# 4595 rmskClass/snRNA.tab +# 4163 rmskClass/Retroposon.tab +# 2802 rmskClass/Unknown.tab +# 2157 rmskClass/DNA?.tab +# 2154 rmskClass/tRNA.tab +# 1915 rmskClass/rRNA.tab +# 1860 rmskClass/RC.tab +# 1784 rmskClass/srpRNA.tab +# 1397 rmskClass/scRNA.tab +# 822 rmskClass/RNA.tab +# 488 rmskClass/SINE?.tab +# 445 rmskClass/RC?.tab +# 5567850 total + +############################################################################# +## running TRF simple repeats - DONE - 2013-12-24,29 - Hiram + # this procedure ran into much trouble on this release. The new + # repeat sequences in the centromeres caused trf to run indefinitely. + # I tried different sizes of chunks, working down to 20 Mbase chunks. + # Even still, some jobs would not complete. Those broke down even + # more, eventually to the smallest bit of 30 Kbase that needed to + # run all the way down to 3,000 based chunks with 1,000 base overlaps. 
+ + # this did not work: + screen # use screen to manage this day-long job + mkdir /hive/data/genomes/hg38/bed/simpleRepeat + cd /hive/data/genomes/hg38/bed/simpleRepeat + time doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \ + -smallClusterHub=ku -buildDir=`pwd` hg38 > do.log 2>&1 + cd /hive/data/genomes/hg38/bed + # move it aside: + mv simpleRepeat simpleRepeat.2013-12-24 + + # Instead, something like this: + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap + mkdir -p noGap + + twoBitToFa ../../../hg38.unmasked.2bit stdout \ + | faSplit -lift=noGap.lift gap stdin 5000000 noGap/hg38_ + # make sure nothing has gone missing: + faCount noGap/*.fa > faCount.txt + tail -1 faCount.txt +# total 3068387174 898285419 623727342 626335137 900967885 19071391 30979734 + # compared to the full sequence, same numbers for ACGT: + twoBitToFa ../../../hg38.unmasked.2bit stdout | faCount stdin +# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 + faToTwoBit noGap/*.fa hg38.nogap.2bit + twoBitInfo hg38.nogap.2bit stdout | sort -k2,2nr > hg38.nogap.sizes + + + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M + rm -rf /hive/data/genomes/hg38/TrfPart20M + /cluster/bin/scripts/simplePartition.pl \ +/hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap/hg38.nogap.2bit \ + 20000000 /hive/data/genomes/hg38/TrfPart20M + rm -f /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M + ln -s /hive/data/genomes/hg38/TrfPart20M \ + /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M + ssh ku + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M + gensub2 /hive/data/genomes/hg38/TrfPart20M/partitions.lst single gsub jobList + para create jobList + para push + # 20 jobs would not complete: +# Completed: 143 of 163 jobs +# Jobs currently running: 20 +# CPU time in 
finished jobs: 76994s 1283.24m 21.39h 0.89d 0.002 y +# IO & Wait Time: 1095s 18.24m 0.30h 0.01d 0.000 y +# Time in running jobs: 1807279s 30121.32m 502.02h 20.92d 0.057 y +# Average job time: 546s 9.10m 0.15h 0.01d +# Longest running job: 90422s 1507.03m 25.12h 1.05d +# Longest finished job: 43348s 722.47m 12.04h 0.50d +# Submission to last job: 43363s 722.72m 12.05h 0.50d + # determine which are the last jobs as individual bits: + para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \ + > not.done.list + awk '{print $NF}' not.done.list | sed -e 's/.bed//' | while read F +do + cat $F +done > seq.specs.not.done + + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs + mkdir fasta + for seqSpec in `cat ../seq.specs.not.done` +do + fName=`echo $seqSpec | sed -e 's/.*://'` + echo $fName + twoBitToFa $seqSpec fasta/$fName.fa +done + ls -1S `pwd`/fasta > part.list + cat << '_EOF_' > template +#LOOP +./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + + cat << '_EOF_' > runTrf +#!/bin/bash +set -beEu -o pipefail +export path1=$1 +export inputFN=`basename $1` +export outpath=$2 +export outputFN=`basename $2` +mkdir -p /dev/shm/$outputFN +cp -p $path1 /dev/shm/$outputFN +cd /dev/shm/$outputFN +/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ + $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm +cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs +rm -f $outpath +cp -p /dev/shm/$outputFN/$outputFN $outpath +rm -fr /dev/shm/$outputFN/* +rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN +'_EOF_' + # << happy emacs + chmod +x runTrf + + gensub2 part.list single template jobList + para create jobList + para push + # not all of these jobs will finish either: +# Completed: 85 of 106 jobs +# Jobs currently running: 21 +# CPU time in finished jobs: 58076s 967.93m 16.13h 0.67d 0.002 y 
+# IO & Wait Time: 828s 13.81m 0.23h 0.01d 0.000 y +# Time in running jobs: 1988997s 33149.95m 552.50h 23.02d 0.063 y +# Average job time: 693s 11.55m 0.19h 0.01d +# Longest running job: 94730s 1578.83m 26.31h 1.10d +# Longest finished job: 34216s 570.27m 9.50h 0.40d +# Submission to last job: 34342s 572.37m 9.54h 0.40d + + # can use what we have here: + liftUp result.bed ../../splitGap/noGap.lift error bed/*.bed + # find jobs not done + para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \ + > not.done.list + # splitting up those last jobs: + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits + mkdir noGap + awk '{print $2}' ../lastJobs/not.done.list | while read F +do + cp -p $F ./noGap/ +done + + # split into 1,000,000 chunks with 10,000 overlap: + mkdir -p 1M_10K + +for F in noGap/*.fa +do + B=`basename $F | sed -e 's/.fa//'` + echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_" + faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_ +done + + ls -1S `pwd`/1M_10K/*.fa > part.list + cat << '_EOF_' > runTrf +#!/bin/bash +set -beEu -o pipefail +export path1=$1 +export inputFN=`basename $1` +export outpath=$2 +export outputFN=`basename $2` +mkdir -p /dev/shm/$outputFN +cp -p $path1 /dev/shm/$outputFN +cd /dev/shm/$outputFN +/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ + $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm +cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits +rm -f $outpath +cp -p /dev/shm/$outputFN/$outputFN $outpath +rm -fr /dev/shm/$outputFN/* +rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN +'_EOF_' + # << happy emacs + + cat << '_EOF_' > template +#LOOP +./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + + gensub2 part.list single template jobList + para create jobList + para push + # not all of these jobs will 
complete either: +# Completed: 53 of 96 jobs +# CPU time in finished jobs: 212403s 3540.05m 59.00h 2.46d 0.007 y +# IO & Wait Time: 1851s 30.85m 0.51h 0.02d 0.000 y +# Average job time: 4043s 67.38m 1.12h 0.05d +# Longest finished job: 68726s 1145.43m 19.09h 0.80d +# Submission to last job: 68890s 1148.17m 19.14h 0.80d + # use what results we have here: + cat *.lift | liftUp parts.bed stdin error bed/*.bed + liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed \ + | sort -u | sort -k1,1 -k2,2n > hg38.result.bed + + para status | grep -v -w done | awk '{print $(NF-1)}' > will.not.finish.txt + + # split those last bits: + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits + mkdir splitBits + cat ../splitBits/will.not.finish.txt | while read F +do + cp -p $F splitBits +done + + # 100K chunks with 10K overlap + mkdir -p 100K_10K + +for F in splitBits/*.fa +do + B=`basename $F | sed -e 's/.fa//'` + echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_" + faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_ +done + + cat << '_EOF_' > runTrf +#!/bin/bash +set -beEu -o pipefail +export path1=$1 +export inputFN=`basename $1` +export outpath=$2 +export outputFN=`basename $2` +mkdir -p /dev/shm/$outputFN +cp -p $path1 /dev/shm/$outputFN +cd /dev/shm/$outputFN +/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ + $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm +cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits +rm -f $outpath +cp -p /dev/shm/$outputFN/$outputFN $outpath +rm -fr /dev/shm/$outputFN/* +rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN +'_EOF_' + # << happy emacs + chmod +x runTrf + + cat << '_EOF_' > template +#LOOP +./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + + ls -1S `pwd`/100K_10K/*.fa > part.list + gensub2 
part.list single template jobList + para create jobList + para push + # one last bit does not complete: +# Completed: 420 of 421 jobs +# CPU time in finished jobs: 19862s 331.04m 5.52h 0.23d 0.001 y +# IO & Wait Time: 2360s 39.33m 0.66h 0.03d 0.000 y +# Average job time: 53s 0.88m 0.01h 0.00d +# Longest finished job: 368s 6.13m 0.10h 0.00d +# Submission to last job: 448s 7.47m 0.12h 0.01d + + # can use the results obtained here: + cat *.lift | liftUp splitParts.bed stdin error bed/*.bed + cat ../splitBits/*.lift | liftUp parts.bed stdin error splitParts.bed + liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ + | sort -k1,1 -k2,2n > hg38.result.bed + + para status | grep -v -w done | awk '{print $(NF-1)}' + # last chunk: 100K_10K/hg38_89_2_00.fa + + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K + cp -p ../splitSplitBits/100K_10K/hg38_89_2_00.fa . + + # 20K chunks with 10K overlap: + mkdir -p 20K_10K + +for F in hg38_89_2_00.fa +do + B=`basename $F | sed -e 's/.fa//'` + echo "faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/$B_" + faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_ +done + + ls -1S `pwd`/20K_10K/*.fa > part.list + cat << '_EOF_' > runTrf +#!/bin/bash +set -beEu -o pipefail +export path1=$1 +export inputFN=`basename $1` +export outpath=$2 +export outputFN=`basename $2` +mkdir -p /dev/shm/$outputFN +cp -p $path1 /dev/shm/$outputFN +cd /dev/shm/$outputFN +/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ + $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm +cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K +rm -f $outpath +cp -p /dev/shm/$outputFN/$outputFN $outpath +rm -fr /dev/shm/$outputFN/* +rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN +'_EOF_' + # << happy emacs + chmod +s runTrf + cat << '_EOF_' > template +#LOOP +./runTrf {check in line+ $(path1)} {check out 
line bed/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + + gensub2 part.list single template jobList + para create jobList + para push + # one of these jobs will not finish: +# Completed: 4 of 5 jobs +# CPU time in finished jobs: 10s 0.17m 0.00h 0.00d 0.000 y +# IO & Wait Time: 16s 0.26m 0.00h 0.00d 0.000 y +# Average job time: 7s 0.11m 0.00h 0.00d +# Longest finished job: 8s 0.13m 0.00h 0.00d +# Submission to last job: 16s 0.27m 0.00h 0.00d + + # can use the results we have here: + cat *.lift | liftUp 20Kparts.bed stdin error bed/*.bed + cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kparts.bed + cat ../splitBits/*.lift | liftUp parts.bed stdin error 100Kpart.bed + liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ + | sort -k1,1 -k2,2n > hg38.result.bed + + # finally, what turns out to be the last batch: + mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K + cp -p ../last100K/20K_10K/hg38_89_2_00_3.fa . 
+ + # 2K chunks with 1K overlap + mkdir -p 2K_1K + +for F in hg38_89_2_00_3.fa +do + B=`basename $F | sed -e 's/.fa//'` + echo "faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/$B_" + faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_ +done + + ls -1S `pwd`/2K_1K/*.fa > part.list + cat << '_EOF_' > runTrf +#!/bin/bash +set -beEu -o pipefail +export path1=$1 +export inputFN=`basename $1` +export outpath=$2 +export outputFN=`basename $2` +mkdir -p /dev/shm/$outputFN +cp -p $path1 /dev/shm/$outputFN +cd /dev/shm/$outputFN +/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ + $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm +cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K +rm -f $outpath +cp -p /dev/shm/$outputFN/$outputFN $outpath +rm -fr /dev/shm/$outputFN/* +rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN +'_EOF_' + # << happy emacs + chmod +x runTrf + cat << '_EOF_' > template +#LOOP +./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + + gensub2 part.list single template jobList + para create + para push +# Completed: 15 of 15 jobs +# CPU time in finished jobs: 1s 0.02m 0.00h 0.00d 0.000 y +# IO & Wait Time: 26s 0.43m 0.01h 0.00d 0.000 y +# Average job time: 2s 0.03m 0.00h 0.00d +# Longest finished job: 4s 0.07m 0.00h 0.00d +# Submission to last job: 14s 0.23m 0.00h 0.00d + + cat *.lift | liftUp 2Kparts.bed stdin error bed/*.bed + cat ../last100K/*.lift | liftUp 20Kpart.bed stdin error 2Kparts.bed + cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kpart.bed + cat ../splitBits/*.lift | liftUp parts.bed stdin error 100Kpart.bed + liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ + | sort -k1,1 -k2,2n > hg38.result.bed + + ## To put it all together: + cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M + cat /hive/data/genomes/hg38/TrfPart20M/???/*.bed lastJobs/bed/*.bed \ + splitBits/parts.bed 
splitSplitBits/parts.bed last100K/parts.bed \ + last30K/parts.bed > beforeLift.simpleRepeat.bed + liftUp -type=.bed stdout ../splitGap/noGap.lift error \ + beforeLift.simpleRepeat.bed | sort -u \ + | sort -k1,1 -k2,2n > simpleRepeat.bed + + awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed + + hgLoadBed hg38 simpleRepeat simpleRepeat.bed \ + -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql + featureBits hg38 simpleRepeat > fb.simpleRepeat 2>&1 + cat fb.simpleRepeat +# 146785521 bases of 3049335806 (4.814%) in intersection + + cd /hive/data/genomes/hg38/bed + ln -s simpleRepeat.2013-12-27/run20M simpleRepeat + +############################################################################ + + # WINDOWMASKER - DONE - 2013-12-24 - Hiram + mkdir /hive/data/genomes/hg38/bed/windowMasker + cd /hive/data/genomes/hg38/bed/windowMasker + time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ + -dbHost=hgwdev hg38 > do.log 2>&1 & + +############################################################################ +# Verify all gaps are marked - DONE - 2013-12-24 - Hiram + mkdir /hive/data/genomes/hg38/bed/gap + cd /hive/data/genomes/hg38/bed/gap + time nice -n +19 findMotif -motif=gattaca -verbose=4 \ + -strand=+ ../../hg38.unmasked.2bit > findMotif.txt 2>&1 + # real 0m28.634s + grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed + featureBits hg38 -not gap -bed=notGap.bed + # 3049335806 bases of 3049335806 (100.000%) in intersection + time featureBits hg38 allGaps.bed notGap.bed -bed=new.gaps.bed + # 20023 bases of 3049335806 (0.001%) in intersection + # real 0m20.427s + # this indicates that 20,023 bases are not marked as N's + # with this element size profile: + awk '{print $3-$2}' new.gaps.bed | ave stdin +# Q1 1.000000 +# median 1.000000 +# Q3 100.000000 +# average 44.894619 +# min 1.000000 +# max 1000.000000 +# count 446 +# total 20023.000000 +# standard deviation 81.743447 + + # the four largest ones: +# 1000 chr2 32916625 32917625 
chr2.7 +# 1000 chr2 32867130 32868130 chr2.6 +# 348 chr20 36314371 36314719 chr20.36 +# 200 chr12 123443533 123443733 chr12.10 + +######################################################################### +## CYTOBAND - fixing the ideogram track (DONE - 2014-06-11 - Hiram) + ## the file we used before was broken + mkdir -p /hive/data/outside/ncbi/ideogram/2014-06 + cd /hive/data/outside/ncbi/ideogram/2014-06 + # fetch all the ideogram files: + rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ + mkdir /hive/data/genomes/hg38/bed/cytoBandUpdate + cd /hive/data/genomes/hg38/bed/cytoBandUpdate + + # Create bed file + $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ +/hive/data/outside/ncbi/ideogram/2014-06/ideogram_9606_GCF_000001305.14_850_V1 + + # add in the other genome data: + hgsql -N -e 'select * from cytoBand;' hg38 \ + | egrep "chrU|chrM|_alt|_random" >> cytoBand.bed + + $HOME/kent/src/utils/ncbi/cytoBandVerify.pl + # everything checks out OK on 455 chroms + + # Load the bed file + hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ + hg38 cytoBand cytoBand.bed + cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head + # 23 + sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql + sort -k1,1 -k2,2n cytoBand.bed \ + | hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin + + # Make cytoBandIdeo track for ideogram gif on hgTracks page. + # cytoBandIdeo is just a replicate of the cytoBand track. 
+ hgsql -e "drop table cytoBandIdeo;" hg38 + hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;" + +######################################################################### +## CYTOBAND - ideogram track (DONE - 2014-03-04 - Hiram) + ssh hgwdev + mkdir -p /hive/data/outside/ncbi/ideogram/2014-03 + cd /hive/data/outside/ncbi/ideogram/2014-03 + + # fetch all the ideogram files: + rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ + + mkdir /hive/data/genomes/hg38/bed/cytoBand + cd /hive/data/genomes/hg38/bed/cytoBand + + # Create bed file + $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ +/hive/data/outside/ncbi/ideogram/2014-03/ideogram_9606_GCF_000001305.14_850_V1 + + # add in the other genome data: + hgsql -N -e 'select * from cytoBand;' hg38 > bobTable.bed + + egrep "chrU|chrM|_alt|_random" bobTable.bed >> cytoBand.bed + + ## can now verify before load: + $HOME/kent/src/utils/ncbi/cytoBandVerify.pl + # everything checks out OK on 455 chroms + + # Load the bed file + hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ + hg38 cytoBand cytoBand.bed + cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head + # 23 + sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql + sort -k1,1 -k2,2n cytoBand.bed \ + | hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin + + # Make cytoBandIdeo track for ideogram gif on hgTracks page. + # cytoBandIdeo is just a replicate of the cytoBand track. 
+ hgsql -e "drop table cytoBandIdeo;" hg38
+ hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;"
+
+##########################################################################
+# cytoBandIdeo - (DONE - 2013-12-26 - Hiram)
+ mkdir /hive/data/genomes/hg38/bed/cytoBand
+ cd /hive/data/genomes/hg38/bed/cytoBand
+ makeCytoBandIdeo.csh hg38
+
+#making temporary liftover of items from hg19
+liftOver /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \
+ /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
+ cytobands.bed unMapped
+
+liftOver -minBlocks=0.5 /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \
+ /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
+ cytobands.0.5.bed unMapped0.5
+
+############################### ######################
+# cytoBandIdeo - (reDONE - 2014-02-25 - kuhn)
+
+# adding centromeres to generic cytoBandIdeo table as it exists.
+# (lifted track is already gone)
+
+# get the cen values for hg38
+hgsql -Ne "SELECT DISTINCT chrom FROM centromeres" hg38 | sort > hg38.chroms
+rm -f hg38.cens
+foreach chrom (`cat hg38.chroms`)
+ set cenStart=""
+ set cenEnd=""
+ set cenStart=`hgsql -Ne 'SELECT MIN(chromStart) FROM centromeres WHERE chrom = "'$chrom'"' hg38`
+ set cenEnd=`hgsql -Ne 'SELECT MAX(chromEnd) FROM centromeres WHERE chrom = "'$chrom'"' hg38`
+ echo "$chrom $cenStart $cenEnd" >> hg38.cens
+end
+
+# Modified makeCytoBandIdeo.csh to use this file instead of looking
+# for centromeres in a gap table.
+# Replaced existing cytoBandIdeo table, which was really only a copy
+# of chromInfo. 
+ +########################################################################## +# hg19 <-> hg38 difference tracks (DONE - 2013-12-28 - Hiram) + mkdir /hive/data/genomes/hg19/bed/liftOverHg38 + cd /hive/data/genomes/hg19/bed/liftOverHg38 + + # not needed, but interesting, collect all the fragment + # definitions from the gold tables: + hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \ + | sort > hg19.gold.frags.tab + + hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg38 \ + | sort > hg38.gold.frags.tab + + # construct common and difference listings + comm -12 hg19.gold.frags.tab hg38.gold.frags.tab \ + > identical.hg19.hg38.frags.tab + comm -23 hg19.gold.frags.tab hg38.gold.frags.tab \ + > unique.hg19Only.frags.tab + comm -13 hg19.gold.frags.tab hg38.gold.frags.tab \ + > unique.hg38Only.frags.tab + + # better yet, get full information about each fragment + hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \ + | sort -k6 > hg19.gold.tab + + hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg38 \ + | sort -k6 > hg38.gold.tab + + # construct a single key for each fragment for joining. 
+ # the key is frag,fragStart,fragEnd,strand + awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", + $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \ + > hg19.fragKey.tab + awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", + $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg38.gold.tab | sort \ + > hg38.fragKey.tab + + # now, by joining those keys, we can get exact identicals, and + # the only-in listings as bed files to load as tracks: + join hg19.fragKey.tab hg38.fragKey.tab \ + | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \ + | sort -k1,1 -k2,2n > hg19.hg38.identical.bed + + join hg19.fragKey.tab hg38.fragKey.tab \ + | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \ + | sort -k1,1 -k2,2n > hg38.hg19.identical.bed + + join -v 1 hg19.fragKey.tab hg38.fragKey.tab \ + | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ + | sort -k1,1 -k2,2n > hg19.only.bed + + join -v 2 hg19.fragKey.tab hg38.fragKey.tab \ + | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ + | sort -k1,1 -k2,2n > hg38.only.bed + + hgLoadBed hg19 hg38ContigDiff hg19.only.bed + hgLoadBed hg38 hg19ContigDiff hg38.only.bed + + wc -l hg??.only.bed + # 6097 hg19.only.bed + # 23632 hg38.only.bed + + # this leaves the outstanding question of "why" they might be in + # the only-in listings. 
Some contigs may be different versions,
+ # sometimes different sections of the same contig are used,
+ # and contigs are dropped from hg19 to hg38, or new contigs added
+ # to hg38 to fill in gaps from hg19
+ # Let's see if we can measure some of this:
+ awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list
+ awk '{print $4}' hg38.only.bed | sort -u > hg38.only.ids.list
+
+ # Looks like 5405 identical contigs with different parts used:
+ comm -12 hg19.only.ids.list hg38.only.ids.list > differentPortions.list
+ wc -l differentPortions.list
+ # 5405
+
+ # and perhaps 63 = 5468-5405 of different versions of same contig:
+ sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \
+ > hg19.noVersions.ids.list
+ sed -e "s/\.[0-9]*$//" hg38.only.ids.list | sort -u \
+ > hg38.noVersions.ids.list
+ comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | wc -l
+ # 5468
+ sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \
+ > differentPortions.noVersions.list
+ comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | sort -u \
+ > noVersions.common.list
+ # indeed, 63 contigs of different versions:
+ comm -23 noVersions.common.list differentPortions.noVersions.list \
+ | sort -u > differentVersions.list
+ wc -l differentVersions.list
+ # 63
+
+ # dividing up these items:
+ cat << '_EOF_' > identifyPortions.pl
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my %differentVersions;
+my %differentPortions;
+
+open (FH, "<differentVersions.list") or
+ die "can not read differentVersions.list";
+while (my $line = <FH>) {
+ chomp $line;
+ $differentVersions{$line} = 1;
+}
+close (FH);
+
+open (FH, "<differentPortions.list") or
+ die "can not read differentPortions.list";
+while (my $line = <FH>) {
+ chomp $line;
+ $differentPortions{$line} = 1;
+}
+close (FH);
+
+my %hg19Done;
+open (DP, ">hg19.differentPortions.bed") or die "can not write to hg19.differentPortions.bed";
+open (DV, ">hg19.differentVersions.bed") or die "can not write to hg19.differentVersions.bed";
+open (FH, "<hg19.only.bed") or die "can not read hg19.only.bed";
+while (my $line = <FH>) {
+ chomp $line;
+ my ($chr, $start, $end, $acc, $score, $strand) = 
split('\s+', $line);
+ # assume done while $acc is still complete
+ $hg19Done{$acc} = 1;
+ if (exists($differentPortions{$acc})) {
+ printf DP "%s\n", $line;
+ } else {
+ my $trimAcc = $acc;
+ $trimAcc =~ s/\.[0-9]+$//;
+ if (exists($differentVersions{$trimAcc})) {
+ printf DV "%s\n", $line;
+ } else {
+ # this one does not match
+ $hg19Done{$acc} = 0;
+ }
+ }
+}
+close (FH);
+close (DV);
+close (DP);
+open (DR, ">hg19.dropped.bed") or die "can not write to hg19.dropped.bed";
+open (FH, "<hg19.only.bed") or die "can not read hg19.only.bed";
+while (my $line = <FH>) {
+ chomp $line;
+ my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
+ if (0 == $hg19Done{$acc}) {
+ printf DR "%s\n", $line;
+ }
+}
+close (FH);
+close (DR);
+
+my %hg38Done;
+open (DP, ">hg38.differentPortions.bed") or die "can not write to hg38.differentPortions.bed";
+open (DV, ">hg38.differentVersions.bed") or die "can not write to hg38.differentVersions.bed";
+open (FH, "<hg38.only.bed") or die "can not read hg38.only.bed";
+while (my $line = <FH>) {
+ chomp $line;
+ my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
+ # assume done while $acc is still complete
+ $hg38Done{$acc} = 1;
+ if (exists($differentPortions{$acc})) {
+ printf DP "%s\n", $line;
+ } else {
+ my $trimAcc = $acc;
+ $trimAcc =~ s/\.[0-9]+$//;
+ if (exists($differentVersions{$trimAcc})) {
+ printf DV "%s\n", $line;
+ } else {
+ # this one does not match
+ $hg38Done{$acc} = 0;
+ }
+ }
+}
+close (FH);
+close (DV);
+close (DP);
+open (DR, ">hg38.newTo19.bed") or die "can not write to hg38.newTo19.bed";
+open (FH, "<hg38.only.bed") or die "can not read hg38.only.bed";
+while (my $line = <FH>) {
+ chomp $line;
+ my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
+ if (0 == $hg38Done{$acc}) {
+ printf DR "%s\n", $line;
+ }
+}
+close (FH);
+close (DR);
+'_EOF_'
+ # << happy emacs
+ chmod +x identifyPortions.pl
+ ./identifyPortions.pl
+ # make sure nothing was lost
+ sort hg19.differentVersions.bed hg19.differentPortions.bed \
+ hg19.dropped.bed | sum
+ # 43711 233
+ sort hg19.only.bed | sum
+ # 43711 233
+ sort hg38.differentVersions.bed hg38.differentPortions.bed \
+ hg38.newTo19.bed | sum
+ # 00502 911
+ 
sort hg38.only.bed | sum + # 00502 911 + + sort -k1,1 -k2,2n hg38.differentVersions.bed hg38.differentPortions.bed \ + hg38.newTo19.bed > hg38.itemRgb.bed + sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed \ + hg19.dropped.bed > hg19.itemRgb.bed + + hgLoadBed hg19 hg38ContigDiff hg19.itemRgb.bed + # if you wanted to load the identicals in this track too: + sort -k1,1 -k2,2n hg38.hg19.identical.bed hg38.itemRgb.bed \ + | hgLoadBed hg38 hg38ContigDiff stdin + # but we don't, we deliver only the differences + hgLoadBed hg38 hg38ContigDiff hg38.itemRgb.bed + +######################################################################### +# construct ooc file to be used in blat operations +# DONE - 2012-12-30 - Hiram +# can be done on unmasked sequence the same result as masked: + cd /hive/data/genomes/hg38 + time blat hg38.unmasked.2bit /dev/null /dev/null \ + -tileSize=11 -makeOoc=jkStuff/hg38.11.ooc -repMatch=1024 + + # been confirmed, the 100-base non-bridged gaps are really non-bridged + gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed hg38 \ + jkStuff/hg38.nonBridged.lft + +############################################################################## +# cpgIslands - (DONE - 2014-01-07 - Hiram) + # run on the Hmmer + trfMask sequence + mkdir /hive/data/genomes/hg38/bed/cpgIslands + cd /hive/data/genomes/hg38/bed/cpgIslands + time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ + -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -workhorse=hgwdev -smallClusterHub=ku hg38 > do.log 2>&1 + # real 3m31.684s + # wc -l cpgIsland.bed -> 30456 cpgIsland.bed + cat fb.hg38.cpgIslandExt.txt + # 23654068 bases of 3049335806 (0.776%) in intersection + + # Previously in hg19: + featureBits -countGaps hg19 cpgIslandExt + # 21842742 bases of 3137161264 (0.696%) in intersection + + # when run on Hmmer and Trf masked sequence: + # wc -l cpgIsland.bed -> 30416 cpgIsland.bed + # 23635946 bases of 3049335806 (0.775%) in intersection + + # when run on 
unmasked sequence: + # wc -l cpgIsland.bed -> 55149 cpgIsland.bed + # 33637531 bases of 3049335806 (1.103%) in intersection +############################################################################## +# rerun cpgIslands on contig sequence (DONE - 2014-01-07 - Hiram) + # this is a test of the contig sequence file, + # should get a very similar answer to the above + mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigs + cd /hive/data/genomes/hg38/bed/cpgIslandsContigs + + # run stepwise so the lift can be done on the result before loading + time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ + -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -stop=makeBed -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ + -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1 + # real 9m31.502s + # fails on the bedToBigBed creation since this isn't the actual + # hg38 sequence. + mv cpgIsland.bed cpgIsland.beforeLift.bed + liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \ + cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed + bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \ + cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb + zcat ../cpgIslands/cpgIsland.bed.gz | sort -k1,1 -k2,2n > t.bed + # Surprisingly, a few more are detected, perhaps due to the different + # masking since this contig run is on the final corrected cross-match rmsk + # plus TRF, the above was on the corrupted HMMER+TRF mask: + wc -l cpgIsland.bed t.bed +# 30477 cpgIsland.bed +# 30456 t.bed + # 2,835 different items between the two: + sort t.bed cpgIsland.bed | uniq -c | awk '$1 < 2' | wc -l + # 2835 + # 29.049 identical items + sort t.bed cpgIsland.bed | uniq -c | awk '$1 == 2' | wc -l + # 29049 + cut -f1-3 cpgIsland.bed | sort > contigs.bed + cut -f1-3 t.bed | sort > fullSequence.bed + # 29,339 identical locations: + comm -12 contigs.bed fullSequence.bed | wc -l + # 29339 + + time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ + 
-dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -continue=load -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ + -workhorse=hgwdev -smallClusterHub=ku hg38 > load.log 2>&1 + # real 0m12.056s + + cat fb.hg38.cpgIslandExt.txt + # 23610399 bases of 3049335806 (0.774%) in intersection + +############################################################################## +# rerun cpgIslands on contig UNMASKED sequence (DONE - 2014-01-07 - Hiram) + mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked + cd /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked + + twoBitToFa -noMask ../../hg38.contigs.2bit stdout \ + | faToTwoBit stdin hg38.contigsUnmasked.2bit + + # verify sequence is OK: + twoBitToFa hg38.contigsUnmasked.2bit stdout | faSize stdin +# 3061688741 bases (12372958 N's 3049315783 real 3049315783 upper 0 lower) +# in 733 sequences in 1 files +# %0.00 masked total, %0.00 masked real + twoBitToFa hg38.contigsUnmasked.2bit stdout | faCount stdin | tail -1 +# total 3061688741 898285419 623727342 626335137 900967885 12372958 30979743 + # ACGT CpG same as original hg38.2bit except for the missing N's: +# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 + + # run stepwise so the lift can be done on the result before loading + time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ + -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -stop=makeBed -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \ + -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1 + # real 11m0.690s + # as above, failed on the bedToBigBed step since this isn't the full hg38 + # sequence + mv cpgIsland.bed cpgIsland.beforeLift.bed + liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \ + cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed + bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \ + cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb + # a lot more here that for masked sequence: + wc -l 
cpgIsland.bed ../cpgIslandsContigs/cpgIsland.bed + # 55149 cpgIsland.bed + # 30477 ../cpgIslandsContigs/cpgIsland.bed + featureBits -countGaps hg38 cpgIsland.bed + # 33637531 bases of 3209286105 (1.048%) in intersection + featureBits -countGaps hg38 ../cpgIslandsContigs/cpgIsland.bed + # 23610399 bases of 3209286105 (0.736%) in intersection + + # debug load step so it can be loaded into a separate table: + $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ + -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -debug -continue=load -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \ + -workhorse=hgwdev -smallClusterHub=ku hg38 + + time ./doLoadCpg.csh > load.log 2>&1 + # real 0m2.179s + # 33637531 bases of 3049335806 (1.103%) in intersection + +######################################################################### +# construct liftOver to hg19 (DONE - 2013-12-31 - Hiram) + # it turns out it doesn't matter if the query or target 2bit files + # are masked. This procedure can be done on completely unmasked sequences + # for both, same result masked or not masked + screen -S hg38 # manage this longish running job in a screen + mkdir /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31 + cd /hive/data/genomes/hg38/bed/blat.hg19.2013-06-10 + # this was run in manual steps as experiments were done about the masking + # check it with -debug first to see if it is going to work: + doSameSpeciesLiftOver.pl -stop=net -buildDir=`pwd` -bigClusterHub=ku \ + -dbHost=hgwdev -workhorse=hgwdev -debug \ + -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc hg38 hg19 + # the debug step doesn't actually construct enough files to run the + # steps manually. 
The chaining has an extra procedure that is performed + # while not in 'debug' mode + # the run.blat was operated manually, then chaining: + time doSameSpeciesLiftOver.pl -continue=chain -stop=net -buildDir=`pwd` \ + -bigClusterHub=ku \ + -dbHost=hgwdev -workhorse=hgwdev \ + -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \ + hg38 hg19 > chain.log 2>&1 + # real 22m31.635s + # loading is only a few seconds: + doSameSpeciesLiftOver.pl -continue=load -buildDir=`pwd` \ + -bigClusterHub=ku \ + -dbHost=hgwdev -workhorse=hgwdev \ + -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \ + hg38 hg19 > load.log 2>&1 + + # verify this file exists: + # /gbdb/hg38/liftOver/hg38ToHg19.over.chain.gz + # and try out the conversion on genome-test from hg38 to hg19 + # same file should exist for downloads: + # /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz + +############################################################################ +# marking the PAR regions: (DONE - 2014-01-09 - Hiram) + # after much experimentation with the AGP files and the given NCBI + # files in hg38/genbank/Primary_Assembly/pseudoautosomal_region + # the PAR region definitions can be seen in the par_align.gff file: +# CM000685.2 10001 2781479 -> CM000686.2 10001 2781479 +# CM000685.2 155701383 156030895 -> CM000686.2 56887903 57217415 + # equivalent to: +# chrX 10001 2781479 -> chrY 10001 2781479 +# chrX 155701383 156030895 -> chrY 56887903 57217415 + + # subtract one for the chromStart position: + cat << '_EOF_' > hg38Par.bed4 +chrX 10000 2781479 PAR1 +chrX 155701382 156030895 PAR2 +chrY 10000 2781479 PAR1 +chrY 56887902 57217415 PAR2 +'_EOF_' + # << happy emacs + + hgLoadBed hg38 par hg38Par.bed4 + checkTableCoords hg38 + + # hg19 had: ++-------+------------+-----------+------+ +| chrom | chromStart | chromEnd | name | ++-------+------------+-----------+------+ +| chrX | 60000 | 2699520 | PAR1 | +| chrX | 154931043 | 155260560 | PAR2 | +| chrY | 10000 | 2649520 | PAR1 | 
+| chrY | 59034049 | 59363566 | PAR2 | ++-------+------------+-----------+------+ + + # The AGP files come close to definining the location, but not + # precisely. The first region uses different bits of AC006209.25: +zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ + | grep AC006209.25 +CM000685.2 2665048 2677319 56 F AC006209.25 127483 139754 - +CM000685.2 2677869 2804801 58 F AC006209.25 1 126933 - +zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ + | grep AC006209.25 +CM000686.2 2665048 2677319 56 F AC006209.25 127483 139754 - +CM000686.2 2677869 2781479 58 F AC006209.25 23323 126933 - + + # and the second region uses different bits of AJ271735.1: +zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ + | grep AJ271735.1 | head -1 +CM000685.2 155676925 155719966 3096 O AJ271735.1 44687 87728 + +zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ + | grep AJ271735.1 | head -1 +CM000686.2 56887903 56906486 356 O AJ271735.1 69145 87728 + + + # combining all the contig definitions from each will find all the + # exact identical contig bits: +zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ + | grep -v "^#" | awk '$5 != "N"' \ + | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \ + | sort > chrY.comp.agp.txt +zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ + | grep -v "^#" | awk '$5 != "N"' \ + | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \ + | sort > chrX.comp.agp.txt + join -t'^I' chrY.comp.agp.txt chrX.comp.agp.txt | head + +CM000685.2 10001 44821 CM000686.2 10001 44821 +... +CM000685.2 2677320 2677868 CM000686.2 2677320 2677868 + +CM000685.2 155719967 155720351 CM000686.2 56906487 56906871 +... 
+CM000685.2 155964490 156030895 CM000686.2 57151010 57217415 + +############################################################################ +## altLocations track (DONE - 2014-01-02 - Hiram) + # indicate corresponding locations between haplotypes and reference + mkdir /hive/data/genomes/hg38/bed/altLocations + cd /hive/data/genomes/hg38/bed/altLocations + + find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \ + | while read F +do + grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s\t%d\t%d\tchr%s_%s_alt\n", $6,$12-1,$13,$6, $4}' +done | sort -k1,1 -k2,2n > chrToAlt.bed + + # note silent hidden character in the join -t argument + # explicit as written here + +find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \ + | while read F +do + grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s_%s_alt\tchr%s:%d-%d\n", $6,$4,$6,$12,$13}' +done | sort > altToChr.tab +sort ../../chrom.sizes | join -t'^I' - altToChr.tab \ + | awk '{printf "%s\t0\t%d\t%s\n", $1,$2,$3}' > altToChr.bed + + + hgLoadBed hg38 altLocations chrToAlt.bed altToChr.bed + featureBits -countGaps hg38 altLocations + # 170113652 bases of 3209286105 (5.301%) in intersection + +############################################################################ +## genscan (DONE - 2014-01-07 - Hiram) + mkdir /hive/data/genomes/hg38/bed/genscan + cd /hive/data/genomes/hg38/bed/genscan + + # using the contig sequence + # running stepwise to allow the lifting of the final result + time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \ + -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ + -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + > do.log 2>&1 + # three jobs did not finish due to almost all N's in the sequence, + # just a couple of bases in each piece. Their empty result is good enough. 
+ time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \ + -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ + -continue=makeBed -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev \ + -workhorse=hgwdev > makeBed.log 2>&1 + # real 0m48.161s + + cd lifted + mkdir -p gtf subopt nameFixed/gtf nameFixed/pep newNames pep + for F in ../gtf/000/*.gtf +do + B=`basename $F` + liftUp gtf/${B} ../../../jkStuff/hg38.contigs.lift carry $F + echo $B +done + for F in ../subopt/000/*.bed +do + B=`basename $F` + liftUp subopt/${B} ../../../jkStuff/hg38.contigs.lift carry $F + echo $B +done + + ls gtf/chr*_[0-9][0-9].gtf \ + | sed -e 's/_[0-9][0-9]//; s#gtf/##; s/.gtf//;' | sort -u | while read C +do + cat ../pep/000/${C}_[0-9][0-9].pep > pep/${C}.pep + cat gtf/${C}_[0-9][0-9].gtf | ./gtfFixId.pl ${C} > nameFixed/gtf/${C}.gtf + ./pepNameFix.pl ${C} > nameFixed/pep/${C}.pep +done + + cat nameFixed/gtf/*.gtf > ../hg38.genscan.gtf + ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' | while read C +do + cat gtf/${C} +done >> ../hg38.genscan.gtf + + cat nameFixed/pep/*.pep > ../hg38.genscan.pep + ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' \ + | sed -e 's/.gtf/.pep/' | while read C +do + cat ../pep/000/${C} +done >> ../hg38.genscan.pep + + cd /hive/data/genomes/hg38/bed/genscan + cat lifted/subopt/*.bed | sort -k1,1 -k2,2n > hg38.genscanSubopt.bed + + gtfToGenePred hg38.genscan.gtf hg38.genscan.gp + genePredCheck -db=hg38 hg38.genscan.gp + # checked: 44149 failed: 0 + genePredToBed hg38.genscan.gp hg38.genscan.bed + bedToBigBed hg38.genscan.bed ../../chrom.sizes hg38.genscan.bb + bedToBigBed hg38.genscanSubopt.bed ../../chrom.sizes hg38.genscanSubopt.bb + ldHgGene -gtf hg38 genscan hg38.genscan.gtf +# Read 44149 transcripts in 339212 lines in 1 files +# 44149 groups 345 seqs 1 sources 1 feature types + + cat fb.hg38.genscan.txt + # 58278346 bases of 3049335806 (1.911%) in intersection + cat fb.hg38.genscanSubopt.txt + # 55020514 bases of 3049335806 
(1.804%) in intersection + + # oddly, we are getting half of what hg19 had ? + featureBits hg19 genscan + # 106433874 bases of 2897316137 (3.674%) in intersection + + # This is because hg19 was run on soft-masked sequence and not + # on hard masked sequence + +############################################################################ +## genscan on unmasked sequence experiment (DONE - 2013-12-03 - Hiram) + ## instead, working on unmasked sequence: + mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun + cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun + + mkdir liftSpecs + split -a 3 -d -l 1 ../../../jkStuff/hg38.nonBridged.lift liftSpecs/hg38_ + + mkdir fasta +for F in liftSpecs/hg38_* +do + L=`cut -f2 $F` + echo $L + /cluster/home/hiram/kent/src/hg/utils/lft2BitToFa.pl \ + ../../../hg38.unmasked.2bit $F > fasta/${L}.fa +done + + + cat << '_EOF_' > template +#LOOP +./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + cat << '_EOF_' > runGsBig.bash +#!/bin/bash + +set -beEu -o pipefail + +export seqFile=$1 +export resultGtf=$2 +export resultPep=$3 +export resultSubopt=$4 +/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 +'_EOF_' + # << happy emacs + + ls -1S `pwd`/fasta/*.fa > part.list + gensub2 part.list single template jobList + para create jobList + para push + # several jobs crashed: +# Completed: 726 of 733 jobs +# Crashed: 7 jobs +# CPU time in finished jobs: 62501s 1041.68m 17.36h 0.72d 0.002 y +# IO & Wait Time: 2563s 42.72m 0.71h 0.03d 0.000 y +# Average job time: 90s 1.49m 0.02h 0.00d +# Longest finished job: 3288s 54.80m 0.91h 0.04d +# Submission to last job: 3294s 54.90m 0.92h 0.04d + + para status | grep -v -w done | awk '{print $(NF-3)}' > crashed.job.list + + mkdir 
/hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs + cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs + mkdir splitBits + + for F in chr2.06 chr1.03 chr3.05 chr12.07 chr10.05 chr17.08 chr11.04 +do + faSplit -lift=${F}.lift gap ../fasta/${F}.fa 2000000 splitBits/${F}_ +done + + ls -1S `pwd`/splitBits/*.fa > part.list + cat << '_EOF_' > runGsBig.bash +#!/bin/bash + +set -beEu -o pipefail + +export seqFile=$1 +export resultGtf=$2 +export resultPep=$3 +export resultSubopt=$4 +/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 +'_EOF_' + # << happy emacs + chmod +x runGsBig.bash + + cat << '_EOF_' > template +#LOOP +./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed} +#ENDLOOP +'_EOF_' + # << happy emacs + + gensub2 part.list single template jobList + para create jobList + para push +# Completed: 331 of 334 jobs +# Crashed: 3 jobs +# CPU time in finished jobs: 18097s 301.62m 5.03h 0.21d 0.001 y +# IO & Wait Time: 1085s 18.08m 0.30h 0.01d 0.000 y +# Average job time: 58s 0.97m 0.02h 0.00d +# Longest finished job: 79s 1.32m 0.02h 0.00d +# Submission to last job: 249s 4.15m 0.07h 0.00d + # the last three completed with -window=1600000 + + # lifting results: + cat << '_EOF_' > fixIds.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); + +if ($argc != 1) { + printf STDERR "usage: cat chrN.M.lifted | ./fixIds.pl chrN.M\n"; + exit 255; +} + +my $F=shift; +my $C = $F; +$C =~ s/\.[0-9][0-9]//; + +my $id = 0; +my $prevId = ""; +open (GT, ">${F}.gtf") or die "can not write to ${F}.gtf"; +while (my $line=<>) { + chomp $line; + my $geneId = $line; + $geneId =~ s/^${C}.*gene_id "${C}//; + $geneId =~ s/";.*//; + $id += 1 if ( $prevId ne $geneId); + $line =~ s/${C}[0-9]+.[0-9]+/${F}.$id/g; + printf GT "%s\n", 
$line; + $prevId = $geneId; +} +close (GT); +'_EOF_' + # << happy emacs + chmod +x fixIds.pl + for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 +do + echo "${F}" 1>&2 + cut -f2 ${F}.lift | while read P + do + liftUp -type=.gtf stdout ${F}.lift error gtf/${P}.gtf + done > ${F}.lifted.gtf + cat ${F}.lifted.gtf | ./fixIds.pl ${F} +done + # copied these results to ../gtf/ to get into the final result +# -rw-rw-r-- 1 3349959 Jan 2 15:33 chr1.03.gtf +# -rw-rw-r-- 1 2439182 Jan 2 15:33 chr10.05.gtf +# -rw-rw-r-- 1 1068097 Jan 2 15:33 chr11.04.gtf +# -rw-rw-r-- 1 2392548 Jan 2 15:33 chr12.07.gtf +# -rw-rw-r-- 1 1831336 Jan 2 15:33 chr17.08.gtf +# -rw-rw-r-- 1 3539694 Jan 2 15:33 chr2.06.gtf +# -rw-rw-r-- 1 2309903 Jan 2 15:33 chr3.05.gtf + + for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 +do + echo "${F}" 1>&2 + cut -f2 ${F}.lift | while read P + do + liftUp -type=.bed stdout ${F}.lift error subopt/${P}.bed + done > ${F}.lifted.subopt.bed +done + # copied these results to ../subopt/ to get into the final result +# -rw-rw-r-- 1 3349959 Jan 2 15:33 chr1.03.gtf +# -rw-rw-r-- 1 2439182 Jan 2 15:33 chr10.05.gtf +# -rw-rw-r-- 1 1068097 Jan 2 15:33 chr11.04.gtf +# -rw-rw-r-- 1 2392548 Jan 2 15:33 chr12.07.gtf +# -rw-rw-r-- 1 1831336 Jan 2 15:33 chr17.08.gtf +# -rw-rw-r-- 1 3539694 Jan 2 15:33 chr2.06.gtf +# -rw-rw-r-- 1 2309903 Jan 2 15:33 chr3.05.gtf + + + cat << '_EOF_' > pepNameFix.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +# BIG ASSUMPTION ! ! ! - the peptides are in the same order as +# they are in the GTF file ! ! ! 
+ +my $argc = scalar(@ARGV); + +if ($argc != 1) { + printf STDERR "usage: cat chrN.M.needNameFix.pep | ./pepNameFix.pl chrN.M > chrN.M.pep\n"; + exit 255; +} + +my $C=shift; + +my $id = 1; + +while (my $line = <>) { + if ($line =~ m/^>/) { + printf ">%s.%d\n", $C, $id++; + } else { + print $line; + } +} +'_EOF_' + # << happy emacs + chmod +x pepNameFix.pl + +for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 +do + echo "${F}" 1>&2 + cut -f2 ${F}.lift | while read P + do + cat pep/${P}.pep + done > ${F}.needNameFix.pep + cat ${F}.needNameFix.pep | ./pepNameFix.pl ${F} > ${F}.pep +done + # copied these results to ../pep/ to get into the final result: +# -rw-rw-r-- 1 1592655 Jan 2 15:55 chr1.03.pep +# -rw-rw-r-- 1 1169168 Jan 2 15:55 chr10.05.pep +# -rw-rw-r-- 1 519106 Jan 2 15:55 chr11.04.pep +# -rw-rw-r-- 1 1152111 Jan 2 15:55 chr12.07.pep +# -rw-rw-r-- 1 775052 Jan 2 15:55 chr17.08.pep +# -rw-rw-r-- 1 1799546 Jan 2 15:55 chr2.06.pep +# -rw-rw-r-- 1 1248762 Jan 2 15:55 chr3.05.pep + + # and then, adding in all the results together + + cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun + cat << '_EOF_' > gtfIdFix.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); + +if ($argc != 1) { + printf STDERR "usage: cat lifted/gtf/chrN.gtf | ./gtfIdFix.pl chrN\n"; + exit 255; +} + +my $C=shift; + +my $id = 0; +my $prevId = ""; +open (NM, ">nameFixed/newNames/${C}.tab") or die "can not write to nameFixed/newNames/${C}.tab"; +open (GT, ">nameFixed/gtf/${C}.gtf") or die "can not write to nameFixed/gtf/${C}.gtf"; +while (my $line=<>) { + chomp $line; + my $geneId = $line; + $geneId =~ s/^${C}.*gene_id "//; + $geneId =~ s/";.*//; + if ( $prevId ne $geneId) { + $id += 1; + printf NM "%s\t%s.%d\n", $geneId, $C, $id; + } + $line =~ s/${C}.[0-9]+.[0-9]+/${C}.$id/g; + printf GT "%s\n", $line; + $prevId = $geneId; +} +close (GT); +close (NM); +'_EOF_' + # << happy emacs + chmod +x gtfIdFix.pl + + rm -fr lifted + rm -fr nameFix + mkdir -p 
lifted + mkdir -p lifted/gtf + mkdir -p lifted/pep + mkdir -p lifted/subopt + mkdir -p nameFix + mkdir -p nameFix/gtf + mkdir -p nameFix/newNames + + for F in liftSpecs/hg38_* +do + L=`cut -f2 $F` + C=`cut -f4 $F` + liftUp -type=.gtf stdout ${F} error gtf/${L}.gtf >> lifted/gtf/${C}.gtf + cat pep/${L}.pep >> lifted/pep/${C}.pep + liftUp -type=.bed stdout ${F} error subopt/${L}.bed >> lifted/subopt/${C}.bed +done + + for F in lifted/gtf/*.gtf +do + C=`basename $F | sed -e 's/.gtf//'` + cat $F | ./gtfIdFix.pl $C +done + +mkdir -p nameFixed/pep + + cat << '_EOF_' > pepNameFix.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); +if ($argc != 1) { + printf STDERR "usage: ./pepNameFix.pl chrN > chrN.pep\n"; + exit 255 +} + +my $C = shift; +my %newName; + +open (FH, "<lifted/pep/${C}.pep") or die "can not read lifted/pep/${C}.pep"; +open (NM, "<nameFixed/newNames/${C}.tab") or die "can not read nameFixed/newNames/${C}.tab"; +while (my $line = <NM>) { + chomp $line; + my ($needFix, $fixedName) = split('\t', $line); + $newName{$needFix} = $fixedName; +} +close (NM); + +while (my $line = <FH>) { + if ($line =~m /^>/) { + chomp $line; + $line =~ s/^>//; + die "can not find name to fix $line" if (!exists($newName{$line})); + printf ">%s\n", $newName{$line}; + } else { + print $line; + } +} +close (FH); +'_EOF_' + # << happy emacs + chmod +x pepNameFix.pl + + for F in lifted/pep/*.pep +do + C=`basename $F | sed -e 's/.pep//'` + echo $C + ./pepNameFix.pl $C > nameFixed/pep/$C.pep +done + +############################################################################# +# Mark the new centromere regions (DONE - 2014-01-09 - Hiram) + mkdir /hive/data/genomes/hg38/bed/centromere + cd /hive/data/genomes/hg38/bed/centromere + grep GJ ../../hg38.agp > hg38.centContigs.agp + + awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' hg38.centContigs.agp \ + > hg38.centContigs.bed4 + + hgLoadBed hg38 centromeres hg38.centContigs.bed4 + checkTableCoords hg38 centromeres + +############################################################################# +## alternate sequence/haplotype alignments (DONE - 2014-01-23 - Hiram) + mkdir 
/hive/data/genomes/hg38/bed/lastzAltSequences + cd /hive/data/genomes/hg38/bed/lastzAltSequences + +rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa +mkdir targetFa +mkdir queryFa +touch temp.lift + +cat ../altLocations/chrToAlt.bed | while read L +do + chrName=`echo $L | awk '{print $1}'` + chromSize=`egrep "^$chrName " ../../chrom.sizes | cut -f2` + chrStart=`echo $L | awk '{if (($2-10000)>=0) {printf "%d", $2-10000} else {printf "0"}}'` + chrEnd=`echo $L | awk -v chromSize=$chromSize '{if (($3+10000)<=chromSize) {printf "%d", $3+10000} else {printf "%d", chromSize}}'` + chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'` + queryName=`echo $L | awk '{print $4}'` + partName="${chrName}_${chrStart}_${chrEnd}" + echo $chrName $chrStart $chrEnd $queryName $partName $chromSize + echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift + twoBitToFa ../../hg38.unmasked.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa + twoBitToFa ../../hg38.unmasked.2bit:$queryName queryFa/$queryName.fa +done + +sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift + + # these were run serially on hgwdev, they could be a cluster run: + ssh ku + mkdir /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz + cd /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz + mkdir ../lav ../psl + + # construct the jobList + ls ../targetFa | sed -e 's/.fa//;' | while read partName +do + echo "./runJob.sh ${partName}" +done > jobList + + cat << '_EOF_' > runJob +#!/bin/sh + +export partName=$1 +export target="../targetFa/$partName.fa" +export query="../queryFa/$partName.fa" +export lav="../lav/$partName.lav" +export psl="../psl/$partName.psl" + +/cluster/bin/penn/lastz-distrib-1.03.46/bin/lastz \ + $target $query \ + Y=15000 T=2 M=254 O=600 H=2000 O=600 E=150 K=10000 L=10000 \ + Q=/scratch/data/blastz/human_chimp.v2.q > $lav +lavToPsl $lav stdout | liftUp $psl ../hg38.haplotypes.lift error stdin +'_EOF_' + # << 
happy emacs + + # these were run serially on hgwdev, they could be a cluster run: + time ./jobList > do.log + # real 61m35.898s + + # chaining lastz results: + mkdir -p /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run/chain + cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run + + ls ../../psl/*.psl | while read P +do + B=`basename $P | sed -e 's/.psl//'` + echo $B $P + ls -og $P ../../targetFa/${B}.fa ../../queryFa/${B}.fa + /cluster/home/hiram/kent/src/hg/mouseStuff/axtChain/axtChain \ + -psl -scoreScheme=/scratch/data/blastz/human_chimp.v2.q \ + -minScore=1000 -linearGap=medium $P \ + ../../../../hg38.unmasked.2bit \ + ../../../../hg38.unmasked.2bit stdout \ + | chainAntiRepeat ../../../../hg38.unmasked.2bit \ + ../../../../hg38.unmasked.2bit stdin chain/${B}.chain +done + + # real 7m54.677s + + cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain + find ./run/chain -name "*.chain" | chainMergeSort -inputList=stdin \ + | nice gzip -c > hg38.haplotypes.all.chain.gz + chainPreNet hg38.haplotypes.all.chain.gz ../../../chrom.sizes \ + /hive/data/genomes/hg38/chrom.sizes stdout \ + | chainNet stdin -minSpace=1 ../../../chrom.sizes \ + ../../../chrom.sizes stdout /dev/null \ + | netSyntenic stdin noClass.net + + # Make liftOver chains from chroms to alternates: + netChainSubset -verbose=0 noClass.net hg38.haplotypes.all.chain.gz stdout \ + | chainStitchId stdin stdout | gzip -c > hg38.haplotypes.over.chain.gz + # swap the alignments to get the alternates to chrom mappings: + chainSwap hg38.haplotypes.over.chain.gz stdout \ + | gzip -c > hg38.reference.over.chain.gz + # and put them all together so mappings go both directions + chainMergeSort hg38.haplotypes.over.chain.gz hg38.reference.over.chain.gz \ + | gzip -c > hg38.haploReference.over.chain.gz + + hgLoadChain -tIndex hg38 chainAltSequence hg38.haploReference.over.chain.gz + netClass -verbose=0 -noAr noClass.net hg38 hg38 hg38.hg38AltSequence.net + netFilter -minGap=10 
hg38.hg38AltSequence.net \ + | hgLoadNet -verbose=0 hg38 netAltSequence stdin + + chainToPsl hg38.haploReference.over.chain.gz ../../../chrom.sizes \ + ../../../chrom.sizes \ + /hive/data/genomes/hg38/hg38.unmasked.2bit \ + /hive/data/genomes/hg38/hg38.unmasked.2bit \ + hg38.beforeRecalc.haploReference.over.psl + + pslCheck -targetSizes=../../../chrom.sizes \ + -querySizes=../../../chrom.sizes \ + hg38.beforeRecalc.haploReference.over.psl 2>&1 | tail -1 + # checked: 3092 failed: 57 errors: 57 + + pslRecalcMatch hg38.beforeRecalc.haploReference.over.psl \ + ../../../hg38.unmasked.2bit ../../../hg38.unmasked.2bit \ + hg38.haploReference.over.psl + + pslCheck -targetSizes=../../../chrom.sizes \ + -querySizes=../../../chrom.sizes \ + hg38.haploReference.over.psl 2>&1 | tail -1 + # checked: 3092 failed: 0 errors: 0 + + hgLoadPsl hg38 -table=altSequenceLiftOver hg38.haploReference.over.psl + +############################################################################# +## construct non-bridged contig sequence (DONE - 2014-01-10 - Hiram) + mkdir /hive/data/genomes/hg38/bed/nonBridgedContigs + cd /hive/data/genomes/hg38/bed/nonBridgedContigs + + # only need the actual split chroms in this lift, and the + # _nn name is a bit more convenient than the .nn: + gapToLift -minGap=100 hg38 stdout | sed -e 's/\./_/;' \ + | awk '$1 != 0' > hg38.contigs.lift + # the warnings gapToLift issues are about gaps defined in the table + # that are abutting to each other. 
teleomere gaps are next to contig gaps + # those lifts in the format of a bed file: + awk '{printf "%s\t%d\t%d\t%s\n", $4, $1, $1+$3, $2}' hg38.contigs.lift \ + > hg38.contigs.bed + # the negation of that is the gaps between the contigs + # fixup the .N to _nn with the awk: + featureBits -not -countGaps hg38 hg38.contigs.bed -bed=stdout \ +| awk '{split($4,a,"."); printf "%s\t%d\t%d\t%s_%02d\n", $1,$2,$3,a[1],a[2]}' \ + > hg38.gaps.bed + # 268613637 bases of 3209286105 (8.370%) in intersection + + # together, those two should be %100 of the genome exactly: + featureBits -countGaps -or hg38 hg38.contigs.bed hg38.gaps.bed + # 3209286105 bases of 3209286105 (100.000%) in intersection + + # the list of all those other bits not in the split chroms: + egrep "_alt|chrUn|chrM|_random" hg38.gaps.bed | cut -f1 \ + | sort > other.bits.list + + # extract those chrom pieces and the other bits from the masked sequence: + (twoBitToFa -bed=hg38.contigs.bed ../../hg38.2bit stdout; \ + twoBitToFa -seqList=other.bits.list ../../hg38.2bit stdout) \ + | faToTwoBit stdin hg38.contigs.2bit + twoBitInfo hg38.contigs.2bit stdout | sort -k2nr > hg38.contigs.chrom.sizes + # verify nothing has been lost: + twoBitToFa ../../hg38.2bit stdout | faCount stdin | tail -1 +# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 + twoBitToFa hg38.contigs.2bit stdout | faCount stdin | tail -1 +# total 3061688741 898285419 623727342 626335137 900967885 12372958 30979743 + # the ACGT and CPG counts remain the same, only N's have been lost + + # make a copy of this at the top: + cp -p hg38.contigs.2bit ../.. 
+ cp -p hg38.contigs.lift ../../jkStuff + + # load as a track to be able to see where they are: + egrep "chrUn|chrM|_alt|_random" hg38.contigs.chrom.sizes \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $1}' \ + > fullCoverage.hg38Contigs.bed + cat hg38.contigs.bed >> fullCoverage.hg38Contigs.bed + featureBits -or -countGaps hg38 fullCoverage.hg38Contigs.bed gap + # 3209286105 bases of 3209286105 (100.000%) in intersection + + hgLoadBed hg38 contigAlignmentSegments fullCoverage.hg38Contigs.bed + +############################################################################# +## analysis of repeat elements from each RM run +## (DONE - 2014-01-10 - Hiram) + mkdir /hive/data/genomes/hg38/bed/repeatElementCount + cd /hive/data/genomes/hg38/bed/repeatElementCount + for F in ../rmsk*/hg38.class.profile.txt \ + ../repeatMaskerGenbank/hg38.class.profile.txt +do + D=`dirname $F` + B=`basename $D | sed -e 's/repeatMaskerGenbank/NCBI/; s/rmsk//;'` + echo "==== $B ====" + grep rmskClass $F | sed -e 's#rmskClass/##; s/.tab//;' \ + | awk '{printf "%s\t%d\n", $2, $1}' | sort > ${B}.tab +done + + # Hmmer does not have snRNA and tRNA ? + echo -e "snRNA\t0" >> Hmmer.tab + echo -e "tRNA\t0" >> Hmmer.tab + sort Hmmer.tab > t.tab + mv t.tab Hmmer.tab + + echo "# Repeat Masker item counts" > table.result.txt + echo "# class NCBI cross-match rmblastn HMMER" >> table.result.txt + join NCBI.tab CM.tab | join - Blastn.tab | join - Hmmer.tab \ + | awk '{printf "%-15s\t%7d\t%7d\t%7d\t%7d\n", $1,$2,$3,$4,$5}' \ + | sort -k2,2nr >> table.result.txt + + cat table.result.txt +# Repeat Masker item counts +# class NCBI cross-match rmblastn HMMER +SINE 1849444 1852545 1822406 1884179 +LINE 1586141 1570523 1551012 1702529 +LTR 759248 748597 737799 805427 +DNA 502186 499108 485558 565171 +Simple_repeat 433789 703682 716968 636906 +Low_complexity 396378 102856 105181 95480 +Satellite 10198 7962 7703 10852 +LTR? 
5884 5667 5068 9181 +snRNA 4595 4516 4548 0 +Retroposon 4163 5750 5630 11861 +Unknown 2802 5622 5263 3914 +DNA? 2157 3294 3018 4582 +tRNA 2154 2026 1983 0 +rRNA 1915 1840 1810 464 +RC 1860 1784 1706 2059 +srpRNA 1784 1672 1633 1517 +scRNA 1397 1420 1426 6783 +RNA 822 704 611 1484 +SINE? 488 38 38 970 +RC? 445 411 374 806 + +total 5567850 5520017 5459735 5744165 + +############################################################################# +## blat server turned on (DONE - 2014-01-13 - Hiram) +# After getting a blat server assigned by the Blat Server Gods, + ssh hgwdev + + hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ + VALUES ("hg38", "blat4c", "17780", "1", "0"); \ + INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ + VALUES ("hg38", "blat4c", "17781", "0", "1");' \ + hgcentraltest + # test it with some sequence + +############################################################################ +## reset default position to ABO gene (DONE - 2014-01-13 - Hiram) + ssh hgwdev + hgsql -e 'update dbDb set defaultPos="chr9:133252000-133280861" + where name="hg38";' hgcentraltest + +######################################################################### +## update grp table with new set of standard rows (DONE - 2014-01-29 - Hiram) + hgsql -e 'alter table grp rename grpOriginal;' hg38 + hgsql -e 'drop table grp;' hg38 + hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg19.grp" hg38 + hgsql -e 'delete from grp where name="denisova";' hg38 + hgsql -e 'delete from grp where name="pub";' hg38 + hgsql -e 'delete from grp where name="neandertal";' hg38 + hgsql -e 'update grp set defaultIsClosed=0 where name="map";' hg38 + + hgsql -e 'drop table grpOriginal;' hg38 + +############################################################################ +# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2014-01-21 - Hiram) + ssh ku + mkdir /hive/data/genomes/hg38/bed/linSpecRep + cd /hive/data/genomes/hg38/bed/linSpecRep + # create 
individual .out files from the master record in ../repeatMasker + mkdir splitOut + cat << '_EOF_' > split.csh +#!/bin/csh -fe +set C = $1 +head -3 ../repeatMasker/hg38.sorted.fa.out > splitOut/${C}.out +grep "${C} " ../repeatMasker/hg38.sorted.fa.out >> splitOut/${C}.out +'_EOF_' + # << happy emacs + chmod +x split.csh + + cat << '_EOF_' > template +#LOOP +split.csh $(root1) {check out line+ splitOut/$(root1).out} +#ENDLOOP +'_EOF_' + # << happy emacs + + # small ones first: + cut -f1 ../../chrom.sizes | tac > chrom.list + gensub2 chrom.list single template jobList + para create jobList + para try ... check ... push ... etc... +# Completed: 93 of 93 jobs +# CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y +# IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y +# Average job time: 186s 3.10m 0.05h 0.00d +# Longest finished job: 224s 3.73m 0.06h 0.00d +# Submission to last job: 280s 4.67m 0.08h 0.00d + + # now, we can date and process each of those .out files + # constructing the humanSpecific set of repeats + # this means repeats found in human, and not in others + # using mouse here for 'others' is good enough, a variety + # of other species could be used (rat dog cow) where they all + # produce the same result + mkdir dateRepeats + cd dateRepeats + cat << '_EOF_' > mkLSR +#!/bin/bash +set -beEu -o pipefail +rm -f $1.out_mus-musculus +ln -s ../splitOut/$1.out . +/scratch/data/RepeatMasker/DateRepeats $1.out -query human -comp mouse +rm $1.out +mkdir -p ../humanSpecific +/cluster/bin/scripts/extractRepeats 1 $1.out_mus-musculus \ + > ../humanSpecific/$1.out.spec +'_EOF_' + # << happy emacs + chmod +x mkLSR + + cat << '_EOF_' > template +#LOOP +./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec} +#ENDLOOP +'_EOF_' + # << happy emacs + + gensub2 ../chrom.list single template jobList + para try ... check ... push ... etc... 
+ para time +# Completed: 455 of 455 jobs +# CPU time in finished jobs: 13985s 233.08m 3.88h 0.16d 0.000 y +# IO & Wait Time: 1470s 24.50m 0.41h 0.02d 0.000 y +# Average job time: 34s 0.57m 0.01h 0.00d +# Longest finished job: 111s 1.85m 0.03h 0.00d +# Submission to last job: 1427s 23.78m 0.40h 0.02d + + + # We also need the nibs for blastz runs with lineage specific repeats + mkdir /hive/data/genomes/hg38/bed/nibs + cd /hive/data/genomes/hg38/bed/nibs + cut -f1 ../../chrom.sizes | while read C +do + twoBitToFa -seq=${C} ../../hg38.2bit stdout \ + | faToNib -softMask stdin ${C}.nib + echo "${C} done" +done + + # verify nothing lost + cat ../../chrom.sizes \ + | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \ + | sh | faSize stdin +# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper +# 1588630985 lower) in 455 sequences in 1 files +# Total size: mean 7053376.1 sd 31548372.6 +# min 970 (chrUn_KI270394v1.nib:0-970) +# max 248956422 (chr1.nib:0-248956422) median 161218 +# %49.50 masked total, %52.10 masked real + + mkdir /hive/data/staging/data/hg38/nib + rsync -a --progress ./ /hive/data/staging/data/hg38/nib + +############################################################################# +## GRC Contigs/ctgPos2 track (DONE - 2014-12-25 - Hiram) + # provide mapping of UCSC chrom names to GRC names + mkdir /hive/data/genomes/hg38/bed/ctgPos2 + cd /hive/data/genomes/hg38/bed/ctgPos2 + grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ + | awk '{printf "s/^%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt + + find ../../genbank -type f | grep "/assembled_chromosomes/AGP/" | sed -e 's/.comp//' | while read F +do + if [ -s $F ]; then + zcat $F | grep -v "^#" + fi +done | sed -e "`cat accessionToUcsc.sed.txt`" > ucsc.grch38.agp + + awk '$5 != "N"' ucsc.grch38.agp \ +| awk '{printf "%s\t%d\t%s\t%d\t%d\t%s\n", $6, $3-$2+1, $1, $2-1, $3, $5}' \ + | sort -u | sort -k3,3 -k4,4n > ctgPos2.tab + + + export ctgSize=`awk '{print 
length($1)}' ctgPos2.tab | sort -n | tail -1` + export chrSize=`awk '{print length($3)}' ctgPos2.tab | sort -n | tail -1` + + sed -e "s/20/$ctgSize/; s/16/$chrSize/;" \ + /cluster/home/hiram/kent/src/hg/lib/ctgPos2.sql > hg38.ctgPos2.sql + + hgLoadSqlTab hg38 ctgPos2 hg38.ctgPos2.sql ctgPos2.tab + +############################################################################ +# constructing download files (WORKING - 2014-01-15 - Hiram) + # add hg38 to all.joiner and verify it is clean: + joinerCheck -database=hg38 -keys all.joiner +# Checking keys on database hg38 +# hg38.ucscToINSDC.chrom - hits 455 of 455 (100.000%) ok + # and all table coordinates are OK: + checkTableCoords hg38 + + cd /hive/data/genomes/hg38 + time $HOME/kent/src/hg/utils/automation/makeDownloads.pl \ + -workhorse=hgwdev hg38 + # makeDownloads.pl has made a preliminary set of files + + # need to fixup these names and add chromFa.tar.gz files + cd /hive/data/genomes/hg38/goldenPath/bigZips + + mkdir chroms + mkdir maskedChroms + + faSplit byname hg38.fa.gz chroms/ + faSplit byname hg38.fa.masked.gz maskedChroms/ + + tar cvzf ./hg38.chromFa.tar.gz ./chroms/ + tar cvzf ./hg38.chromFaMasked.tar.gz ./maskedChroms/ + + cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips + ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFa.tar.gz hg38.chromFa.tar.gz + ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFaMasked.tar.gz hg38.chromFaMasked.tar.gz + + #also added entries for above to md5sum.txt and README.txt + +############################################################################ +# LASTZ MOUSE Mm10 (DONE - 2014-01-23,31 - Hiram) + # can no longer use the lineage specific repeats with the new lastz + # use a screen to manage this longish job: + screen -S hg38Mm10 + + mkdir /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 + cd /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 + + # best to always specify an exact path to lastz so we know which one is used + # lastz 
default parameters are human-mouse parameters + + cat << '_EOF_' > DEF +# human vs mouse +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=40000000 +SEQ1_LAP=10000 + +# QUERY: Mouse Mm10 +SEQ2_DIR=/scratch/data/mm10/mm10.2bit +SEQ2_LEN=/scratch/data/mm10/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 +TMPDIR=/dev/shm +'_EOF_' + # << happy emacs + + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + -verbose=2 \ + -stop=net `pwd`/DEF \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -fileServer=hgwdev \ + -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 + # real 1494m26.135s ---- busy cluster + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + -verbose=2 \ + -continue=load `pwd`/DEF \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -fileServer=hgwdev \ + -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 + # Elapsed time: 43m11s + cat fb.hg38.chainMm10Link.txt + # 964465044 bases of 3049335806 (31.629%) in intersection + + # and the swap + mkdir /hive/data/genomes/mm10/bed/blastz.hg38.swap + cd /hive/data/genomes/mm10/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23/DEF \ + -swap -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 + # real 83m28.397s + + cat fb.mm10.chainHg38Link.txt + # 937030766 bases of 2652783500 (35.323%) in intersection + +######################################################################### +# LASTZ Dog CanFam3 (DONE - 
2014-01-26 - Hiram) + mkdir /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 + cd /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 + + cat << '_EOF_' > DEF +# human vs dog +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Dog CanFam3 +SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit +SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 +TMPDIR=/dev/shm +'_EOF_' + # << happy emacs + + # establish a screen to control this job + screen hg38CanFam3 + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ + `pwd`/DEF \ + -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 + # Elapsed time: 1396m22s - busy cluster + cat fb.hg38.chainCanFam3Link.txt + # 1523987456 bases of 3049335806 (49.978%) in intersection + + # running the swap + mkdir /hive/data/genomes/canFam3/bed/blastz.hg38.swap + cd /hive/data/genomes/canFam3/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26/DEF \ + -syntenicNet -swap \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 + # real 107m57.787s + + cat fb.canFam3.chainHg38Link.txt + # 1437624815 bases of 2392715236 (60.083%) in intersection + +######################################################################### +# LASTZ Macaca Mulatta RheMac3 (DONE - 2014-01-27,02-10 - Hiram) + mkdir /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 + cd 
/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 + + # best to always specify an exact path to lastz so we know which one is used + # lastz default parameters are human-mouse parameters + + cat << '_EOF_' > DEF +# human vs macaca mulatta +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz +# maximum M allowed with lastz is only 254 +BLASTZ_M=254 +BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q +BLASTZ_O=600 +BLASTZ_E=150 +# other parameters from panTro2 vs hg18 lastz on advice from Webb +BLASTZ_K=4500 +BLASTZ_Y=15000 +BLASTZ_T=2 + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Macaca Mulatta RheMac3 +SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit +SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_IN_CONTIGS=0 + +BASE=/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 +TMPDIR=/dev/shm +'_EOF_' + # << happy emacs + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + `pwd`/DEF \ + -syntenicNet -fileServer=hgwdev \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 + # Elapsed time: 1426m43s - busy cluster + cat fb.hg38.chainRheMac3Link.txt + # 2431208700 bases of 3049335806 (79.729%) in intersection + + # running the swap + mkdir /hive/data/genomes/rheMac3/bed/blastz.hg38.swap + cd /hive/data/genomes/rheMac3/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27/DEF \ + -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 + # 82m32.329s + cat fb.rheMac3.chainHg38Link.txt + # 2288533769 bases of 2639145830 
(86.715%) in intersection + +######################################################################### +## construct analysis set (DONE - 2014-01-27 - Hiram) + mkdir /hive/data/genomes/hg38/bed/analysisSet + cd /hive/data/genomes/hg38/bed/analysisSet + mkdir -p splitFa + + faToTwoBit \ +../../genbank/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \ + hg38.unmasked.analysisSet.2bit + + faCount splitFa/c*.fa > splitFa.faCount.txt + + egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../rmskCM/hg38.sorted.fa.out \ + > hg38.analysisSet.out + + twoBitMask hg38.unmasked.analysisSet.2bit hg38.analysisSet.out \ + hg38.rmsk.analysisSet.2bit + + egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../simpleRepeat/trfMask.bed \ + > trfMask.analysisSet.bed + + twoBitMask hg38.rmsk.analysisSet.2bit -add trfMask.analysisSet.bed \ + hg38.analysisSet.2bit + + twoBitToFa hg38.unmasked.analysisSet.2bit stdout | faSize stdin +# 3099922541 bases (165046090 N's 2934876451 real 2934876451 upper 0 lower) +# in 195 sequences in 1 files +# Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1) +# max 248956422 (chr1) median 32032 +# %0.00 masked total, %0.00 masked real + + twoBitToFa hg38.analysisSet.2bit stdout | faSize stdin +# 3099922541 bases (165046090 N's 2934876451 real 1409378896 upper 1525497555 +# lower) in 195 sequences in 1 files +# Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1) +# max 248956422 (chr1) median 32032 +# %49.21 masked total, %51.98 masked real + + mkdir hg38.analysisSet.chroms + twoBitToFa hg38.analysisSet.2bit stdout \ + | faSplit byname stdin hg38.analysisSet.chroms/ + + tar cvzf ./hg38.analysisSet.chroms.tar.gz ./hg38.analysisSet.chroms + + ln -s `pwd`/hg38.analysisSet.2bit \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips + ln -s `pwd`/hg38.analysisSet.chroms.tar.gz \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips + # add these md5 sums to md5sum.txt + md5sum hg38.analysisSet.2bit 
hg38.analysisSet.chroms.tar.gz >> \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/md5sum.txt + + cp ../../genbank/README_ANALYSIS_SETS README.analysisSet.txt + # add note at the top of README: + ###################################################################### + UCSC copy of the file from: + + ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/README_ANALYSIS_SETS + + ln -s `pwd`/README.analysisSet.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips + +######################################################################### +# the FULL analysis set (DONE - 2014-03-18 - Hiram) + mkdir /hive/data/genomes/hg38/bed/fullAnalysisSet + cd /hive/data/genomes/hg38/bed/fullAnalysisSet + + mkdir hg38.fullAnalysisSet.chroms + twoBitToFa ../analysisSet/hg38.analysisSet.2bit stdout \ + | faSplit byname stdin hg38.fullAnalysisSet.chroms/ + + grep _alt ../../chrom.sizes | cut -f 1 > alt.list + + twoBitToFa -seqList=alt.list ../../hg38.2bit stdout \ + | faSplit byname stdin hg38.fullAnalysisSet.chroms/ + + faCount hg38.fullAnalysisSet.chroms/chr*.fa > faCount.fullAnalysisSet.txt + + faToTwoBit hg38.fullAnalysisSet.chroms/chr*.fa hg38.fullAnalysisSet.2bit + twoBitInfo hg38.fullAnalysisSet.2bit stdout | sort -k2nr > chrom.sizes + + tar cvzf ./hg38.fullAnalysisSet.chroms.tar.gz ./hg38.fullAnalysisSet.chroms + +######################################################################### +# LASTZ Self/hg38 (DONE - 2014-01-25,02-10 - Hiram) + # can no longer use the lineage specific repeats with the new lastz + # use a screen to manage this longish job: + screen -S hg38Self + + mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 + cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 + # construct the non-bridged contigs sequence to use: + (twoBitToFa ../nonBridgedContigs/hg38.chroms.contigs.2bit stdout; + twoBitToFa ../../hg38.2bit:chrM stdout) | faToTwoBit stdin hg38.self.2bit + 
twoBitInfo hg38.self.2bit stdout | sort -k2nr > hg38.self.chrom.sizes + + # best to always specify an exact path to lastz so we know which one is used + # lastz default parameters are human-mouse parameters + + cat << '_EOF_' > DEF +# human vs human with mouse defaults +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Human Hg38 +SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit +SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes +SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 +TMPDIR=/dev/shm +'_EOF_' + # << happy emacs + + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + -verbose=2 \ + -stop=net `pwd`/DEF \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -fileServer=hgwdev \ + -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 + # real 1518m15.817s -- problems + # there was a problem in the 'part014' batch. 
running that manually: + mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob + cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob + # make 100 jobs out of the 10 parts: + mkdir -p psl + cp ../tParts/part014.lst ./xpart014.lst + split -l 1 xpart014.lst -d -a 3 part + for F in part0* +do + mv $F $F.lst +done + +for T in part0*.lst +do + for Q in part0*.lst + do + mkdir -p psl/${T} + echo /cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T} ${Q} ../../DEF \{check out exists psl/${T}/${T}.${Q}.psl\} + done +done > jobList + para -ram=32g create jobList + para push + # one last failing job: +# Completed: 99 of 100 jobs +# CPU time in finished jobs: 2836s 47.27m 0.79h 0.03d 0.000 y +# IO & Wait Time: 279s 4.65m 0.08h 0.00d 0.000 y +# Average job time: 31s 0.52m 0.01h 0.00d +# Longest finished job: 586s 9.77m 0.16h 0.01d +# Submission to last job: 620s 10.33m 0.17h 0.01d + + mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010 + cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010 + mkdir psl + + twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 part010.fa + + faSplit -lift=split010.lift size part010.fa 169000 split010_ +TOP="/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010" + +for T in split*.fa +do + mkdir -p psl/${T} + echo "${TOP}/${T}" > ${T}.lst + faToTwoBit ${T} ${T}.2bit + for Q in split*.fa + do + echo "/cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T}.lst ${Q}.lst DEF {check out exists psl/${T}/${T}.${Q}.psl}" + done +done > jobList + para -ram=32g create jobList + +# Completed: 100 of 100 jobs +# CPU time in finished jobs: 176579s 2942.99m 49.05h 2.04d 0.006 y +# IO & Wait Time: 1239s 20.64m 0.34h 0.01d 0.000 y +# Average job time: 1778s 29.64m 0.49h 0.02d +# Longest finished job: 29343s 489.05m 8.15h 0.34d 
+# Submission to last job: 29348s 489.13m 8.15h 0.34d + + catDir psl/* | grep -v "^#" > raw.psl + + liftUp -type=.psl stdout split010.lift error raw.psl \ + | liftUp -pslQ -type=.psl chr16_03.psl split010.lift error stdin + + # this combination allowed psl headers to sneak in the middle, + # had to be cleaned: + catDir psl/* | grep -v "^#" > part014.psl + cat split010/chr16_03.psl >> part014.psl + cp -p part014.psl ../../psl/part014.lst/part014.lst_part014.lst.psl + + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + -verbose=2 \ + -continue=cat -stop=net `pwd`/DEF \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -fileServer=hgwdev \ + -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 + # real 43m11.340s + # failed in chaining, running manually on hgwdev + time ./bigJobs.sh > bigJobs.log 2>&1 + # real 468m59.648s + + time ./part014.sh > part014.log 2>&1 + + # real 1319m57.911s + # -rw-rw-r-- 1 3581498246 Feb 8 14:37 part014.lst.chain + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + -verbose=2 \ + -continue=chainMerge -stop=net `pwd`/DEF \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -fileServer=hgwdev \ + -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 + + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + -verbose=2 \ + -continue=load -stop=load `pwd`/DEF \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -fileServer=hgwdev \ + -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 + + hgLoadChain -normScore -tIndex hg38 chainSelf hg38.hg38.all.chain.gz + # Loading 104815249 chains into hg38.chainSelf + + cat fb.hg38.chainSelfLink.txt + # 392419010 bases of 3049335806 (12.869%) in intersection + cd /hive/data/genomes/hg38/bed + ln -s lastzSelf.2014-01-25 lastz.self + ln -s lastzSelf.2014-01-25 lastz.hg38 + +######################################################################### +## 4-Way Multiz for UCSC 
Genes construction (DONE - 2014-02-11 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/hg38/bed/multiz4way + cd /hive/data/genomes/hg38/bed/multiz4way + + # extract our 4 organisms from the 44-way on hg18: + ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh + + /cluster/bin/phast/tree_doctor \ + --prune-all-but hg19,mm10,canFam3,rheMac3 $HOME/kent/src/hg/utils/phyloTrees/120way.nh \ + | sed -e "s/hg19/hg38/" > 4way.nh + + # this looks like: + cat 4way.nh +(((hg38:0.033974,rheMac3:0.037601):0.109934,mm10:0.356483):0.020593,canFam3:0.165928); + + + # Use this specification in the phyloGif tool: + # http://genome.ucsc.edu/cgi-bin/phyloGif + # to obtain a gif image for htdocs/images/phylo/hg38_4way.gif + + /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt + # Use this output to create the table below + grep -y hg38 4way.distances.txt | sort -k3,3n +# +# If you can fill in all the numbers in this table, you are ready for +# the multiple alignment procedure +# +# featureBits chainLink measures +# chainHg38Link chain linearGap +# distance on hg38 on other minScore +# 1 0.071575 - rhesus rheMac3 (% 79.729) (% 86.715) 5000 medium +# 2 0.330429 - dog canFam3 (% 49.978) (% 60.083) 3000 medium +# 3 0.500391 - mouse mm10 (% 31.629) (% 35.323) 3000 medium + + # using the syntenic nets + cd /cluster/data/hg38/bed/multiz4way + mkdir mafLinks + cd mafLinks + mkdir rheMac3 canFam3 mm10 + + for D in mm10 canFam3 rheMac3 +do + ln -s ../../../lastz.${D}/axtChain/hg38.${D}.synNet.maf.gz ./${D}/ +done + + mkdir /hive/data/genomes/hg38/bed/multiz4way/mafSplit + cd /hive/data/genomes/hg38/bed/multiz4way/mafSplit + for D in mm10 canFam3 rheMac3 +do + echo "working: ${D}" + zcat ../mafLinks/${D}/hg38.${D}.synNet.maf.gz > ${D}.maf + mkdir -p ${D} + mafSplit -byTarget -useFullSequenceName /dev/null ${D}/${D}_ ${D}.maf + rm -f ${D}.maf +done + + # determine what is the newest version of multiz and use that + cd /hive/data/genomes/hg38/bed/multiz4way + mkdir penn + cp 
-p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn + + # the autoMultiz cluster run + ssh ku + cd /hive/data/genomes/hg38/bed/multiz4way + + # create species list and stripped down tree for autoMZ + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + 4way.nh > tmp.nh + echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh + sed 's/[()]//g; s/,/ /g' tree.nh > species.lst + + mkdir run maf + cd run + + # NOTE: you need to set the db and multiz dirname properly in this script + cat > autoMultiz << '_EOF_' +#!/bin/csh -ef +set db = hg38 +set c = $1 +set maf = $2 +set binDir = /hive/data/genomes/hg38/bed/multiz4way/penn +set tmp = /dev/shm/$db/multiz.$c +set pairs = /hive/data/genomes/hg38/bed/multiz4way/mafSplit +rm -fr $tmp +mkdir -p $tmp +cp ../{tree.nh,species.lst} $tmp +pushd $tmp +foreach s (`cat species.lst`) + set in = $pairs/$s/${s}_$c.maf + set out = $db.$s.sing.maf + if ($s == $db) then + continue + endif + if (-e $in.gz) then + zcat $in.gz > $out + else if (-e $in) then + cp $in $out + else + echo "##maf version=1 scoring=autoMZ" > $out + endif +end +set path = ($binDir $path); rehash +$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf +popd +cp $tmp/$c.maf $maf +rm -fr $tmp +'_EOF_' + # << happy emacs + chmod +x autoMultiz + +cat << '_EOF_' > template +#LOOP +./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg38/bed/multiz4way/maf/$(root1).maf} +#ENDLOOP +'_EOF_' + # << happy emacs + + cut -f1 /cluster/data/hg38/chrom.sizes > chrom.lst + gensub2 chrom.lst single template jobList + para create jobList + # 455 jobs + para try ... check ... push ... etc ... 
+# Completed: 455 of 455 jobs +# CPU time in finished jobs: 50111s 835.18m 13.92h 0.58d 0.002 y +# IO & Wait Time: 5574s 92.91m 1.55h 0.06d 0.000 y +# Average job time: 122s 2.04m 0.03h 0.00d +# Longest finished job: 4717s 78.62m 1.31h 0.05d +# Submission to last job: 4722s 78.70m 1.31h 0.05d + + # combine results into a single file for loading and gbdb reference + cd /hive/data/genomes/hg38/bed/multiz4way + grep "^#" maf/chr19_GL949749v2_alt.maf | grep -v "eof maf" > multiz4way.maf + grep -h -v "^#" maf/*.maf >> multiz4way.maf + grep "^#" maf/chr19_GL949749v2_alt.maf | grep "eof maf" >> multiz4way.maf + # real 3m27.561s + + # makes a 8.5 Gb file: + # -rw-rw-r-- 1 9044143788 Feb 11 12:51 multiz4way.maf + + # Load into database + ssh hgwdev + cd /hive/data/genomes/hg38/bed/multiz4way + mkdir /gbdb/hg38/multiz4way + ln -s /hive/data/genomes/hg38/bed/multiz4way/multiz4way.maf \ + /gbdb/hg38/multiz4way + # the hgLoadMaf generates huge tmp files, locate them in /dev/shm + cd /dev/shm + time nice -n +19 hgLoadMaf hg38 multiz4way + # Loaded 6141667 mafs in 1 files from /gbdb/hg38/multiz4way + # real 2m2.812s + + cd /hive/data/genomes/hg38/bed/multiz4way + time (cat /gbdb/hg38/multiz4way/*.maf \ + | hgLoadMafSummary -verbose=2 -minSize=10000 \ + -mergeGap=500 -maxSize=50000 hg38 multiz4waySummary stdin) + # Created 1266559 summary blocks from 11780291 components and 6141667 mafs + # real 3m0.791s +# -rw-rw-r-- 1 311246327 Feb 11 12:54 multiz4way.tab +# -rw-rw-r-- 1 58730176 Feb 11 12:58 multiz4waySummary.tab + wc -l multiz4way* + # 6141667 multiz4way.tab + # 1266559 multiz4waySummary.tab + # 7408226 total + +######################################################################### +## RE-load alternate sequence for PSL display (DONE - 2016-01-15 - Hiram) +## The procedure below +## "load alternate sequence for PSL display (DONE - #2014-02-24 - Hiram) +## produced an illegal psl Table altSeqLiftOverPsl: + pslCheck -db=hg38 altSeqLiftOverPsl + checked: 266 failed: 264 
errors: 1046 + +## Since then, the gff3ToPsl command has been updated to be a bit more +## robust, so, the following sequence produces the new alignment file: + mkdir -p /hive/data/genomes/hg38/bed/altAlignments/redo2016 + cd /hive/data/genomes/hg38/bed/altAlignments/redo2016 + +mkdir -p ucscPsl + +awk -F'/' '{printf "s/^%s\t/%s\t/g;\n", $3,$2}' ../accessionToUcsc.sed.txt \ + > ucscToNcbi.sed.txt + +sed -f ucscToNcbi.sed.txt ../../../chrom.sizes > ncbi.chrom.sizes + +paste ncbi.chrom.sizes ../../../chrom.sizes \ + | awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1,$2,$3,$4}' \ + > ncbiToUcsc.lift + +find ../../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ + | while read gff +do + name=`basename $gff | sed -e 's/_.*//;'` + fasta=`dirname $gff | sed -e 's#alignments#FASTA/alt.scaf.fa.gz#;'` + size=`faCount $fasta | grep -w total | cut -f2` + printf "%s\t%d\n" "$name" "$size" > target.sizes + gff3ToPsl ncbi.chrom.sizes target.sizes $gff $name.psl + pslCheck ${name}.psl + liftUp -type=.psl stdout ncbiToUcsc.lift error ${name}.psl \ + | liftUp -type=.psl -pslQ ucscPsl/${name}.psl ncbiToUcsc.lift error stdin + pslCheck ucscPsl/${name}.psl +done + + pslSort dirs altSeqLiftOverPsl.psl ./tmp ucscPsl + pslCheck -db=hg38 altSeqLiftOverPsl.psl + + hgLoadPsl hg38 altSeqLiftOverPsl.psl + pslCheck -db=hg38 altSeqLiftOverPsl + # checked: 266 failed: 0 errors: 0 + +######################################################################### +## load alternate sequence for PSL display (DONE - 2014-02-24 - Hiram) + mkdir /hive/data/genomes/hg38/bed/altAlignments/sequence + cd /hive/data/genomes/hg38/bed/altAlignments/sequence + + rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa + mkdir targetFa + mkdir queryFa + touch temp.lift + + cat ../../altLocations/chrToAlt.bed | while read L +do + chrName=`echo $L | awk '{print $1}'` + chromSize=`egrep "^$chrName " ../../../chrom.sizes | cut -f2` + chrStart=`echo $L | awk '{printf "%d", $2}'` + chrEnd=`echo $L | awk '{printf "%d", 
$3}'` + chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'` + queryName=`echo $L | awk '{print $4}'` + partName="${chrName}_${chrStart}_${chrEnd}" + echo $chrName $chrStart $chrEnd $queryName $partName $chromSize + echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift + twoBitToFa ../../../hg38.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa + twoBitToFa ../../../hg38.2bit:$queryName queryFa/$queryName.fa +done + +sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift + + mkdir /gbdb/hg38/ncbiAltMappings + cd /hive/data/genomes/hg38/bed/altAlignments/sequence/queryFa + ln -s `pwd`/*.fa /gbdb/hg38/ncbiAltMappings + cd /hive/data/genomes/hg38/bed/altAlignments/sequence + hgLoadSeq -drop -seqTbl=seqNcbiAltSequence -extFileTbl=extNcbiAltSequence \ + hg38 /gbdb/hg38/ncbiAltMappings/*.fa + + pslSwap ../altAlignments.psl stdout \ + | pslRecalcMatch stdin ../../../hg38.2bit ../../../hg38.2bit \ + hg38.referenceTarget.psl + + # the table name altSeqLiftOverPsl is recognized in hgc to allow display + # of the details of the alignments + hgLoadPsl hg38 -table=altSeqLiftOverPsl hg38.referenceTarget.psl + +######################################################################### +## alternate sequence alignments EXPERIMENT (DONE - 2014-01-17 - Hiram) + # the lastzAltSequences.2014-01-23 alignment was used for this instead + # of this procedure + mkdir /hive/data/genomes/hg38/bed/altAlignments + cd /hive/data/genomes/hg38/bed/altAlignments + + grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ + | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt + + find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ + | while read F +do + cat $F | sed -f accessionToUcsc.sed.txt \ + | gff3ToPsl ../../chrom.sizes stdin stdout +done > altAlignments.psl + | xargs cat | sed -f accessionToUcsc.sed.txt \ + | gff3ToPsl ../../chrom.sizes stdin altAlignments.psl + + time 
pslRecalcMatch altAlignments.psl ../../hg38.2bit ../../hg38.2bit \ + altRecalcMatch.psl + # real 0m51.122s + + # just to see what they look like in different formats: + pslToChain altRecalcMatch.psl altAlignments.chain + chainToAxt altAlignments.chain ../../hg38.2bit ../../hg38.2bit \ + altAlignments.axt + axtToMaf -score altAlignments.axt ../../chrom.sizes ../../chrom.sizes \ + altAlignments.maf + + mkdir mafSplits + mafSplit /dev/null mafSplits/ altAlignments.maf + # doesn't work: +# Can't find chrom in MAF component src: chr6_GL000250v2_alt + + mkdir splits psl + find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ + | while read F +do + chrAlt=`basename $F | sed -e 's/_.*//' | sed -f accessionToUcsc.sed.txt` + echo $chrAlt + cat $F | sed -f accessionToUcsc.sed.txt \ + | gff3ToPsl ../../chrom.sizes stdin splits/${chrAlt}.psl + pslRecalcMatch splits/${chrAlt}.psl ../../hg38.2bit ../../hg38.2bit \ + psl/${chrAlt}.psl +done + + mkdir swap + mkdir swap/psl swap/chain swap/axt swap/maf swap/anno + for F in psl/*.psl +do + B=`basename $F | sed -e 's/.psl//'` + echo $B + pslSwap $F stdout | pslRecalcMatch stdin ../../hg38.2bit ../../hg38.2bit \ + swap/psl/${B}.psl + pslToChain swap/psl/${B}.psl swap/chain/${B}.chain + chainToAxt swap/chain/${B}.chain ../../hg38.2bit ../../hg38.2bit \ + swap/axt/${B}.axt + axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ + | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > swap/maf/${B}.maf + mafAddIRows -nBeds=nBeds swap/maf/${B}.maf ../../hg38.2bit swap/anno/${B}.maf +done +# axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ +# | sed -e 's/^s chr/s hg38.chr/' > swap/maf/${B}.maf + + twoBitInfo -nBed ../../hg38.2bit ../../hg38.N.bed + ln -s ../../hg38.N.bed hg38.bed + ln -s ../../hg38.N.bed ref38.bed + ln -s ../../hg38.N.bed alt38.bed + echo hg38.bed > nBeds + echo ref38.bed >> nBeds + echo alt38.bed >> nBeds + ln 
-s ../../chrom.sizes hg38.len + ln -s ../../chrom.sizes ref38.len + ln -s ../../chrom.sizes alt38.len + echo hg38.len > sizes + echo ref38.len >> sizes + echo alt38.len >> sizes + + mkdir chain axt maf anno + for F in psl/*.psl +do + B=`basename $F | sed -e 's/.psl//'` + echo $B + pslToChain $F chain/${B}.chain + chainToAxt chain/${B}.chain ../../hg38.2bit ../../hg38.2bit axt/${B}.axt + axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ + | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > maf/${B}.maf + mafAddIRows -nBeds=nBeds maf/${B}.maf ../../hg38.2bit anno/${B}.maf +done + +# axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ +# | sed -e 's/^s chr/s hg38.chr/' > maf/${B}.maf + +############################################################################ +# Liftover Gencode V19 from hg19 (DONE braney 2014-02-14) + +mkdir /cluster/data/hg38/bed/liftOverGencodeV19 +cd /cluster/data/hg38/bed/liftOverGencodeV19 + +echo "show tables like 'wgEncodeGencode%19'" | hgsql hg19 | tail -n +2 > all.gencode.tables +echo " select tableName from trackDb where tableName like 'wgEncodeGencode_%V19';" | hgsql hg19 --skip-column-names > genePred.gencode.tables + +# load the non-genepred table as is. 
This isn't quite the right thing to do +# with exon support, but it's good enough for our purposes at the moment +join -v 1 *.gencode.tables | while read t; do echo "create table $t select * from hg19.$t" | hgsql hg38; echo $t; done + +for i in `cat genePredExt.gencode.tables`; +do + echo "select name,score,name2 from $i" | hgsql hg19 | sort > $i.name2Score.txt; + genePredToFakePsl hg19 $i $i.psl $i.cds; + pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | sort | join /dev/stdin $i.name2Score.txt| tr ' ' '\t' | hgLoadGenePred -genePredExt hg38 $i stdin; + echo $i; +done + +for i in `cat genePred.gencode.tables`; +do + genePredToFakePsl hg19 $i $i.psl $i.cds; + pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | tr ' ' '\t' | hgLoadGenePred hg38 $i stdin; + echo $i; +done + +##################################################################### +## tRNAs track ( 2014-02-18 braney DONE) +## this is a preliminary version for UCSC build. NOT FOR RELEASE! +ssh hgwdev +cd /hive/data/genomes/hg38/bed +mkdir tRNAs +cd tRNAs + +cp /hive/users/pchan/tRNAs/Eukaryota/hg38/hg38-tRNAs.bed . 
+ +hgLoadBed -tab hg38 tRNAs hg38-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql + +## tRNAs track (2015-10-04, Chris FINISHING BUILD FOR RELEASE) + cd /hive/data/genomes/hg38/bed/tRNAs + cat /hive/users/pchan/gtrnadb2/Eukaryota/hg38/hg38-tRNAs.bed | sed 's^^^g' > hg38-tRNAs2.bed + hgsql hg38 -e 'drop table if exists tRNAs' + hgLoadBed -tab hg38 tRNAs hg38-tRNAs2.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql + mkdir gif + cp -p /hive/users/pchan/gtrnadb2/Eukaryota/hg38/images/* gif + cd /hive/data/gbdb/hg38 + ln -s /hive/data/genomes/hg38/bed/tRNAs/gif RNA-img + cd /usr/local/apache/htdocs-ceisenhart/RNA-img + ln -s /gbdb/hg38/RNA-img hg38 + +############################################################################ +# EXONIPHY , lifted from hg19 (DONE - braney 2014-02-19) +# needed for ucscGenes building + # exoniphyHg19.gp is prepared as follows + mkdir /cluster/data/hg38/bed/exoniphy + cd /cluster/data/hg38/bed/exoniphy + hgsql hg19 -e "select * from exoniphy" -N | cut -f 2-16 > exoniphyHg19.gp + time nice -n +19 liftOver -genePred exoniphyHg19.gp \ + /cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz \ + exoniphyHg38.gp unmapped + # real 0m2.015s + # user 0m1.894s + # sys 0m0.076s + + wc -l * + # 186601 exoniphyHg19.gp + # 186533 exoniphyHg38.gp + # 136 unmapped + # 373270 total + + cd /cluster/data/hg38/bed/exoniphy + nice -n +19 hgLoadGenePred -genePredExt hg38 exoniphy exoniphyHg38.gp + nice -n +19 featureBits hg38 exoniphy + # 28807039 bases of 3049335806 (0.945%) in intersection + nice -n +19 featureBits hg19 exoniphy + # 28661160 bases of 2897316137 (0.989%) in intersection + +######################################################################### +# LASTZ Rat Rn5 (DONE - 2014-02-27 - Hiram) + # establish a screen to control this job + screen -S hg38Rn5 + mkdir /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 + cd /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 + + # XXX don't forget to specify the BLASTZ binary: + cat << '_EOF_' > DEF +# 
human vs rat +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Rat Rn5 +SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit +SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes +SEQ2_CHUNK=10000000 +SEQ2_LIMIT=100 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 +TMPDIR=/scratch/tmp +'_EOF_' + # << happy emacs + + time doBlastzChainNet.pl -verbose=2 \ + `pwd`/DEF \ + -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 + + # real 658m53.984s + cat fb.hg38.chainRn5Link.txt + # 938823407 bases of 3049335806 (30.788%) in intersection + + # running the swap + mkdir /hive/data/genomes/rn5/bed/blastz.hg38.swap + cd /hive/data/genomes/rn5/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \ + -swap \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 + # real 66m53.095s + cat fb.rn5.chainHg38Link.txt + # 934256475 bases of 2572853723 (36.312%) in intersection + + # syntenic net for 14-way use 2014-04-02 - Hiram + cd /hive/data/genomes/rn5/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \ + -continue=syntenicNet -syntenicNet -swap \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 + # real 16m54.489s + +############################################################################## +# LASTZ Rat Rn4 (DONE - 2014-02-27 - Hiram) + # establish a screen to control 
this job + screen -S hg38Rn4 + mkdir /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 + cd /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 + + # XXX don't forget to specify the BLASTZ binary: + cat << '_EOF_' > DEF +# human vs rat +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Rat Rn4 +SEQ2_DIR=/hive/data/genomes/rn4/rn4.2bit +SEQ2_LEN=/hive/data/genomes/rn4/chrom.sizes +SEQ2_CHUNK=10000000 +SEQ2_LIMIT=100 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 +TMPDIR=/scratch/tmp +'_EOF_' + # << happy emacs + + doBlastzChainNet.pl -verbose=2 \ + `pwd`/DEF \ + -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 + # real 658m53.984s + + cat fb.hg38.chainRn4Link.txt + # 913992768 bases of 3049335806 (29.974%) in intersection + + # running the swap + mkdir /hive/data/genomes/rn4/bed/blastz.hg38.swap + cd /hive/data/genomes/rn4/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27/DEF \ + -swap \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & + # real 73m5.666s + + cat fb.rn4.chainHg38Link.txt + # 889613774 bases of 2571531505 (34.595%) in intersection + +############################################################################## +# GENEID GENE PREDICTIONS (DONE - 2014-03-07 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/hg38/bed/geneid + cd /hive/data/genomes/hg38/bed/geneid + mkdir download + cd download + for C in `cut -f1 ../../../chrom.sizes` + do + echo $C + wget --timestamping 
\ +http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.gtf3 + wget --timestamping \ +http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.prot + done + + cd .. + cat download/*.gtf | ldHgGene -gtf -genePredExt hg38 geneid stdin + # Read 33428 transcripts in 277332 lines in 1 files + # 33428 groups 92 seqs 1 sources 3 feature types + # 33428 gene predictions + +############################################################################ +# GENEREVIEWS TRACK (DONE 2014-05-17 - Chin) +# This track depends on some tasks completed for hg19, specifically: +# +# $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql +# $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql +# $HOME/kent/src/hg/lib/geneReviewsDetail.sql +# $HOME/kent/src/hg/makeDb/trackDb/human/geneReviews.html +# +# Unlike hg19, this hg38 tracks is generated by the automatic geneReviews +# scripts in +# /hive/data/outside/otto/geneReviews, specifically buildGeneReviews.sh. +# Current data are fetched weekly from NCBI +# ftp://ftp.ncbi.nlm.nih.gov/pub/GeneReviews/ +# to /hive/data/outside/otto/geneReviews/${DATE}. 
+ +########################################################################### +# Chimp Lastz run (DONE - 2014-05-27 - Hiram) + screen -S hg38PanTro4 # use a screen to manage this longish running job + mkdir /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 + cd /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 + + # always set the BLASTZ program so we know what version was used + cat << '_EOF_' > DEF +# human vs chimp +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz +BLASTZ_O=600 +BLASTZ_E=150 +# maximum M allowed with lastz is only 254 +BLASTZ_M=254 + +BLASTZ_T=2 +BLASTZ_Y=15000 +BLASTZ_K=4500 +BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q +# A C G T +# 90 -330 -236 -356 +# -330 100 -318 -236 +# -236 -318 100 -330 +# -356 -236 -330 90 + +# TARGET: Human Hg38 +SEQ1_DIR=/scratch/data/hg38/hg38.2bit +SEQ1_LEN=/scratch/data/hg38/chrom.sizes +SEQ1_CHUNK=10000000 +SEQ1_LAP=10000 +SEQ1_IN_CONTIGS=0 + +# QUERY: Chimp PanTro4 +SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit +SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes +SEQ2_CHUNK=10000000 +SEQ2_LAP=0 +SEQ2_LIMIT=200 +SEQ2_IN_CONTIGS=0 + +BASE=/hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 +TMPDIR=/dev/shm +'_EOF_' + # << emacs + + time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > do.log 2>&1 + # real 154m12.215s + cat fb.hg38.chainPanTro4Link.txt + # 2839294579 bases of 3049335806 (93.112%) in intersection + + # filter with doRecipBest.pl + time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ + hg38 panTro4) > rbest.log 2>&1 + # real 57m55.320s + + # running the swap + mkdir /hive/data/genomes/panTro4/bed/blastz.hg38.swap + cd /hive/data/genomes/panTro4/bed/blastz.hg38.swap + time (doBlastzChainNet.pl -verbose=2 \ + -swap /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27/DEF \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku 
-bigClusterHub=ku \ + -syntenicNet) > swap.log 2>&1 + cat fb.panTro4.chainHg38Link.txt + # 2776497530 bases of 2902338967 (95.664%) in intersection + # real 98m23.729s + + time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ + panTro4 hg38) > rbest.log 2>&1 + # real 64m33.812s + +############################################################################# +# Opossum Lastz run (DONE - 2014-05-27 - Hiram) + screen -S hg38MonDom5 # use a screen to manage this longish running job + mkdir /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 + cd /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 + + # always set the BLASTZ program so we know what version was used + cat << '_EOF_' > DEF +# human vs opossum +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz +BLASTZ_M=50 + +BLASTZ_Y=3400 +BLASTZ_L=6000 +BLASTZ_K=2200 +BLASTZ_Q=/scratch/data/blastz/HoxD55.q +# A C G T +# 91 -90 -25 -100 +# -90 100 -100 -25 +# -25 -100 100 -90 +# -100 -25 -90 91 + +# TARGET: Human Hg38 +SEQ1_DIR=/scratch/data/hg38/hg38.2bit +SEQ1_LEN=/scratch/data/hg38/chrom.sizes +SEQ1_CHUNK=10000000 +SEQ1_LAP=10000 +SEQ1_LIMIT=5 + +# QUERY: Opossum MonDom5 +SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit +SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes +SEQ2_CHUNK=10000000 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 +TMPDIR=/dev/shm +'_EOF_' + # << emacs + + time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=5000 -chainLinearGap=loose \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > do.log 2>&1 + # real 670m13.280s + # one failed chain run for hg19, finished manually on hgwdev, then: + time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > chainMerge.log 2>&1 + # real 164m28.822s + + cat fb.hg38.chainMonDom5Link.txt + # 438195373 bases of 3049335806 (14.370%) in intersection + + # filter 
with doRecipBest.pl + time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ + -dbHost=hgwdev -workhorse=hgwdev hg38 monDom5) > rbest.log 2>&1 + # real 130m22.825s + + # running the swap + mkdir /hive/data/genomes/monDom5/bed/blastz.hg38.swap + cd /hive/data/genomes/monDom5/bed/blastz.hg38.swap + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27/DEF \ + -swap -chainMinScore=5000 -chainLinearGap=loose \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > swap.log 2>&1 + # real 102m41.443s + + cat fb.monDom5.chainHg38Link.txt + # 420069915 bases of 3501660299 (11.996%) in intersection + time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ + -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1 + # real 90m56.189s + +_EOF_ +############################################################################# +# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie) +# Redmine #13359, #24285 -- otto-mate To Do #17877 +# previously done 7/7/14, 9/9/16, 5/30/18 +# THIS IS NOW AN OTTO JOB !! + set today = `date +%Y_%m_%d` + mkdir -p /hive/data/genomes/hg38/bed/lrg/$today + cd /hive/data/genomes/hg38/bed/lrg/$today + wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip + unzip LRG_public_xml_files.zip + + # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts: + # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output + # the four extra fields are identifiers for: + # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein + + ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38 + genePredCheck lrgTranscriptsUnmapped.gp +#Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46 +#checked: 1029 failed: 1 + # If there are complaints e.g. about exonFrame, look for inconsistencies in the + # affected transcript's coding_region/coordinates vs. exon/intron info in xml. 
+ # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background + # (missing exonFrame info doesn't affect our track representation because we end up using + # psl). We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon + # portion is only the stop codon. + + # No longer necessary to filter out alt and fix patches since they have been added to hg38. + + # and we need the transcript plus gene name later: + cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt + + # five extra columns have been added to the genePred (2020-10-05 - Hiram) + # extract them so they can be added to the psl: + awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \ + | join -t$'\t' - transcript.gene.name.txt \ + | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv + + # the five extra fields are identifiers for: + # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein, + # Gene name + + # Load LRG regions: + #bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ + #-tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name + # after ML #29689, added ncbiAcc field, Max, July 1, 2022 + # changed to: + bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ + -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name,ncbiAcc + ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb + hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb + + # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD): + lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl + pslCheck lrg.psl +#checked: 919 failed: 0 errors: 0 + awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes + genePredToFakePsl -chromSize=lrg.sizes placeholder \ + lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds + pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl + 
mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \ + lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp +#Warning: no CDS for LRG_163t1 +#Warning: no CDS for LRG_347t1 + # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*). + grep -l NR_ LRG_163.xml LRG_347.xml +#LRG_163.xml +#LRG_347.xml + + cat lrgCdna.tab | sed -e 's/^/>/;' | tr '\t' '\n' > lrgCdna.fa + # construct bigPsl with five extra fields + pslToBigPsl -fa=lrgCdna.fa -cds=lrgTranscripts.cds \ + lrgTranscriptsHg38.psl bigPsl.txt + + # add the five extra identifiers to the bigPsl file: + join -t$'\t' -1 4 \ + -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15\ +,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6,2.7 \ + <(sort -k4 bigPsl.txt) lrgTransExtraFields.tsv \ + | sort -k1,1 -k2,2n > lrgExtraTranscriptsHg38.bigPsl.bed + + bedToBigBed -as=bigPsl+6.as -type=bed12+19 -tab \ + lrgExtraTranscriptsHg38.bigPsl.bed ../../../chrom.sizes lrgBigPsl.bb + bigBedInfo lrgBigPsl.bb + rm -f /gbdb/hg38/bbi/lrgBigPsl.bb + ln -sf `pwd`/lrgBigPsl.bb /gbdb/hg38/bbi + hgBbiDbLink hg38 lrgBigPsl /gbdb/hg38/bbi/lrgBigPsl.bb + + + # Load PSL, CDS and sequences. 
+ hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl + hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds + hgPepPred hg38 tab lrgCdna lrgCdna.tab + hgPepPred hg38 tab lrgPep lrgPep.tab + + +############################################################################# +## 7-Way Multiz (DONE - 2014-06-02 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/hg38/bed/multiz7way + cd /hive/data/genomes/hg38/bed/multiz7way + + # from the 63-way in the source tree, select out the 7 used here: + /cluster/bin/phast/tree_doctor \ + --prune-all-but hg19,panTro4,rheMac3,mm10,rn5,canFam3,monDom5 \ + /cluster/home/hiram/kent/src/hg/utils/phyloTrees/130way.nh \ + | sed -e 's/hg19/hg38/' > hg38.7way.nh + + # what that looks like: + ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh +# (((((hg38:0.006550, +# panTro4:0.006840):0.027424, +# rheMac3:0.037601):0.109934, +# (mm10:0.084509, +# rn5:0.091589):0.271974):0.020593, +# canFam3:0.165928):0.258392, +# monDom5:0.340786); + + # extract species list from that .nh file + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ + | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt + + # construct db to name translation list: + cat species.list.txt | while read DB +do +hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest +done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ + > db.to.name.txt + + # construct a common name .nh file: + /cluster/bin/phast/tree_doctor --rename \ + "`cat db.to.name.txt`" hg38.7way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ + | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ + > hg38.7way.commonNames.nh + + $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh > t.nh + $HOME/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ + | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ + > hg38.7way.scientificNames.nh + rm -f 
t.nh
+ cat hg38.7way.scientificNames.nh
+# (((((Homo_sapiens:0.00655,
+# Pan_troglodytes:0.00684):0.027424,
+# Macaca_mulatta:0.037601):0.109934,
+# (Mus_musculus:0.084509,
+# Rattus_norvegicus:0.091589):0.271974):0.020593,
+# Canis_lupus_familiaris:0.165928):0.258392,
+# Monodelphis_domestica:0.340786);
+
+ ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.commonNames.nh
+# (((((Human:0.00655,
+# Chimp:0.00684):0.027424,
+# Rhesus:0.037601):0.109934,
+# (Mouse:0.084509,
+# Rat:0.091589):0.271974):0.020593,
+# Dog:0.165928):0.258392,
+# Opossum:0.340786);
+
+ # Use this specification in the phyloGif tool:
+ # http://genome.ucsc.edu/cgi-bin/phyloGif
+ # to obtain a png image for src/hg/htdocs/images/phylo/hg38_7way.png
+
+ /cluster/bin/phast/all_dists hg38.7way.nh | grep hg38 \
+ | sed -e "s/hg38.//" | sort -k2n > 7way.distances.txt
+ # Use this output to create the table below
+ head 7way.distances.txt
+# taeGut1 0.075718
+# melUnd1 0.220312
+# galGal4 0.507021
+# melGal1 0.509140
+# hg19 1.175433
+# mm10 1.383071
+
+ cat << '_EOF_' > sizeStats.pl
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+open (FH, "<7way.distances.txt") or
+ die "can not read 7way.distances.txt";
+
+my $count = 0;
+while (my $line = <FH>) {
+ chomp $line;
+ my ($D, $dist) = split('\s+', $line);
+ my $chain = "chain" . ucfirst($D);
+ my $B="/hive/data/genomes/hg38/bed/lastz.$D/fb.hg38." .
+ $chain . 
"Link.txt"; + my $chainLinkMeasure = + `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; + chomp $chainLinkMeasure; + $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); + $chainLinkMeasure =~ s/\%//; + my $swapFile="/hive/data/genomes/${D}/bed/lastz.hg38/fb.${D}.chainHg38Link.txt"; + my $swapMeasure = "N/A"; + if ( -s $swapFile ) { + $swapMeasure = + `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; + chomp $swapMeasure; + $swapMeasure = 0.0 if (length($swapMeasure) < 1); + $swapMeasure =~ s/\%//; + } + my $orgName= + `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; + chomp $orgName; + if (length($orgName) < 1) { + $orgName="N/A"; + } + ++$count; + printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist, + $chainLinkMeasure, $swapMeasure, $orgName, $D; +} +close (FH); +'_EOF_' + # << happy emacs + chmod +x ./sizeStats.pl + ./sizeStats.pl +# + +# If you can fill in all the numbers in this table, you are ready for +# the multiple alignment procedure + +# featureBits chainLink measures +# chainLink +# N distance on hg38 on other other species +# 01 0.0134 (% 93.112) (% 95.664) - Chimp panTro4 +# 02 0.0716 (% 79.729) (% 86.715) - Rhesus rheMac3 +# 03 0.3304 (% 49.978) (% 60.083) - Dog canFam3 +# 04 0.5004 (% 31.629) (% 35.323) - Mouse mm10 +# 05 0.5075 (% 30.788) (% 36.312) - Rat rn5 +# 06 0.7637 (% 14.370) (% 11.996) - Opossum monDom5 + +# None of this concern for distances matters in building the first step, the +# maf files. + + # create species list and stripped down tree for autoMZ + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh + + sed 's/[()]//g; s/,/ /g' tree.nh > species.list + # hg38 panTro4 rheMac3 mm10 rn5 canFam3 monDom5 + + # bash shell syntax here ... 
+ cd /hive/data/genomes/hg38/bed/multiz7way + export H=/hive/data/genomes/hg38/bed + mkdir mafLinks + # want syntenic net for: panTro4 rheMac3 mm10 rn5 canFam3 + # and unfiltered maf net for: monDom5 + for G in panTro4 rheMac3 mm10 rn5 canFam3 + do + mkdir mafLinks/$G + echo ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G + ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G + done + + mkdir mafLinks/monDom5 + echo ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5 + ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5 + # verify the symLinks are good: + ls -ogrtL mafLinks/*/* +#-rw-rw-r-- 1 709500062 Jan 25 12:15 mafLinks/mm10/hg38.mm10.synNet.maf.gz +#-rw-rw-r-- 1 1089643630 Jan 27 19:15 mafLinks/canFam3/hg38.canFam3.synNet.maf.gz +#-rw-rw-r-- 1 1277455681 Jan 28 21:52 mafLinks/rheMac3/hg38.rheMac3.synNet.maf.gz +#-rw-rw-r-- 1 687500679 Mar 1 12:27 mafLinks/rn5/hg38.rn5.synNet.maf.gz +#-rw-rw-r-- 1 1463969868 May 27 11:41 mafLinks/panTro4/hg38.panTro4.synNet.maf.gz +#-rw-rw-r-- 1 323347908 May 29 12:38 mafLinks/monDom5/hg38.monDom5.net.maf.gz + + # split the maf files into a set of hashed named files + # this hash named split keeps the same chr/contig names in the same + # named hash file. + mkdir /hive/data/genomes/hg38/bed/multiz7way/mafSplit + cd /hive/data/genomes/hg38/bed/multiz7way/mafSplit + for D in `sed -e "s/hg38 //" ../species.list` +do + echo "${D}" + mkdir $D + cd $D + echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz" + mafSplit -byTarget -useHashedName=8 /dev/null . \ + ../../mafLinks/${D}/*.maf.gz + cd .. +done + + # construct a list of all possible maf file names. + # they do not all exist in each of the species directories + find . -type f | wc -l + # 641 + find . 
-type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list + wc -l maf.list + # 118 maf.list + + mkdir /hive/data/genomes/hg38/bed/multiz7way/splitRun + cd /hive/data/genomes/hg38/bed/multiz7way/splitRun + mkdir maf run + cd run + mkdir penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn + + # set the db and pairs directories here + cat > autoMultiz.csh << '_EOF_' +#!/bin/csh -ef +set db = hg38 +set c = $1 +set result = $2 +set run = `/bin/pwd` +set tmp = /dev/shm/$db/multiz.$c +set pairs = /hive/data/genomes/hg38/bed/multiz7way/mafSplit +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +/bin/cp -p ../../tree.nh ../../species.list $tmp +pushd $tmp > /dev/null +foreach s (`/bin/sed -e "s/$db //" species.list`) + set in = $pairs/$s/$c + set out = $db.$s.sing.maf + if (-e $in.gz) then + /bin/zcat $in.gz > $out + if (! -s $out) then + echo "##maf version=1 scoring=autoMZ" > $out + endif + else if (-e $in) then + /bin/ln -s $in $out + else + echo "##maf version=1 scoring=autoMZ" > $out + endif +end +set path = ($run/penn $path); rehash +$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ + > /dev/null +popd > /dev/null +/bin/rm -f $result +/bin/cp -p $tmp/$c $result +/bin/rm -fr $tmp +'_EOF_' +# << happy emacs + chmod +x autoMultiz.csh + + cat << '_EOF_' > template +#LOOP +./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/hg38/bed/multiz7way/splitRun/maf/$(root1).maf} +#ENDLOOP +'_EOF_' +# << happy emacs + + ln -s ../../mafSplit/maf.list maf.list + ssh ku + cd /hive/data/genomes/hg38/bed/multiz7way/splitRun/run + gensub2 maf.list single template stdout > jobList + para -ram=8g create jobList +# Completed: 118 of 118 jobs +# CPU time in finished jobs: 118241s 1970.69m 32.84h 1.37d 0.004 y +# IO & Wait Time: 682s 11.36m 0.19h 0.01d 0.000 y +# Average job time: 1008s 16.80m 0.28h 0.01d +# Longest 
finished job: 10068s 167.80m 2.80h 0.12d +# Submission to last job: 10076s 167.93m 2.80h 0.12d + + # combine into one file (the 1>&2 redirect sends the echo to stderr) + cd /hive/data/genomes/hg38/bed/multiz7way + head -1 splitRun/maf/017.maf > multiz7way.maf + for F in splitRun/maf/*.maf +do + echo "${F}" 1>&2 + egrep -v "^#" ${F} +done >> multiz7way.maf + tail -1 splitRun/maf/017.maf >> multiz7way.maf +# -rw-rw-r-- 1 15635828403 Jun 3 11:49 multiz7way.maf + + # Load into database + ssh hgwdev + cd /hive/data/genomes/hg38/bed/multiz7way + mkdir /gbdb/hg38/multiz7way + ln -s `pwd`/multiz7way.maf /gbdb/hg38/multiz7way + cd /dev/shm + time nice -n +17 hgLoadMaf hg38 multiz7way + # Loaded 10270624 mafs in 1 files from /gbdb/hg38/multiz7way + # real 3m51.265s + + time nice -n +17 hgLoadMafSummary -verbose=2 -minSize=30000 \ + -mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \ + /gbdb/hg38/multiz7way/multiz7way.maf + # Created 1260918 summary blocks from 35384988 components + # and 10270624 mafs from /gbdb/hg38/multiz7way/multiz7way.maf + # real 5m39.388s + + + wc -l multiz7way*.tab + # 10270624 multiz7way.tab + # 1260918 multiz7waySummary.tab + # 11531542 total + + rm multiz7way*.tab + +############################################################################## +# GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2014-06-03 - Hiram) + # mafAddIRows has to be run on single chromosome maf files, it does not + # function correctly when more than one reference sequence + # are in a single file. Need to split of the maf file into individual + # maf files + mkdir -p /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit + cd /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit + + time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \ + /dev/null . ../../multiz7way.maf + # real 4m8.617s + + find . -type f | wc -l + # 353 + + # check for N.bed files everywhere: + cd /hive/data/genomes/hg38/bed/multiz7way/anno + for DB in `cat ../species.list` +do + if [ ! 
-s /hive/data/genomes/${DB}/${DB}.N.bed ]; then + echo "MISS: ${DB}" +# cd /hive/data/genomes/${DB} +# twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed + else + echo " OK: ${DB}" + fi +done + + cd /hive/data/genomes/hg38/bed/multiz7way/anno + for DB in `cat ../species.list` +do + echo "${DB} " + ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed + echo ${DB}.bed >> nBeds + ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len + echo ${DB}.len >> sizes +done + # make sure they all are successful symLinks: + ls -ogrtL + + screen -S hg38 # use a screen to control this longish job + ssh ku + cd /hive/data/genomes/hg38/bed/multiz7way/anno + mkdir result + for D in `ls mafSplit` +do + echo mkdir result/${D} + mkdir result/${D} +done + cat << '_EOF_' > template +#LOOP +mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/hg38/hg38.2bit {check out exists+ result/$(path1)} +#ENDLOOP +'_EOF_' + # << happy emacs + + find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list + gensub2 maf.list single template jobList + # limit jobs on a node with the ram=32g requirement because they go fast + para -ram=32g create jobList + para try ... check ... push ... 
+# Completed: 353 of 353 jobs +# CPU time in finished jobs: 530s 8.83m 0.15h 0.01d 0.000 y +# IO & Wait Time: 1057s 17.62m 0.29h 0.01d 0.000 y +# Average job time: 4s 0.07m 0.00h 0.00d +# Longest finished job: 63s 1.05m 0.02h 0.00d +# Submission to last job: 220s 3.67m 0.06h 0.00d + + # verify all result files have some content, look for 0 size files: + find ./result -type f -size 0 + # should see none + # or in this manner: + find ./result -type f | xargs ls -og | sort -k3nr | tail + + # combine into one file (the 1>&2 redirect sends the echo to stderr) + head -q -n 1 result/0/chr8.maf > hg38.7way.maf + find ./result -type f | while read F +do + echo "${F}" 1>&2 + grep -h -v "^#" ${F} +done >> hg38.7way.maf + + # these maf files do not have the end marker, this does nothing: + # tail -q -n 1 result/0/chr8.maf >> hg38.7way.maf + # How about an official end marker: + echo "##eof maf" >> hg38.7way.maf + ls -og +# -rw-rw-r-- 1 17795297196 Jun 3 14:01 hg38.7way.maf + + du -hsc hg38.7way.maf + # 17G hg38.7way.maf + + # construct symlinks to get the individual maf files into gbdb: + rm /gbdb/hg38/multiz7way/multiz7way.maf # remove previous results + ln -s `pwd`/hg38.7way.maf /gbdb/hg38/multiz7way/multiz7way.maf + + # Load into database + cd /dev/shm + time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg38/multiz7way \ + hg38 multiz7way + # Loaded 10359242 mafs in 1 files from /gbdb/hg38/multiz7way + # real 4m21.862s + + time hgLoadMafSummary -verbose=2 -minSize=30000 \ + -mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \ + /gbdb/hg38/multiz7way/multiz7way.maf +# Created 1260918 summary blocks from 35384988 components +# and 10359242 mafs from /gbdb/hg38/multiz7way/multiz7way.maf +# real 6m6.583s + +# -rw-rw-r-- 1 530538267 Jun 3 14:05 multiz7way.tab +# -rw-rw-r-- 1 60616616 Jun 3 14:15 multiz7waySummary.tab + + rm multiz7way*.tab + +###################################################################### +# MULTIZ7WAY MAF FRAMES (DONE - 2014-06-03 - Hiram) + ssh hgwdev + 
mkdir /hive/data/genomes/hg38/bed/multiz7way/frames + cd /hive/data/genomes/hg38/bed/multiz7way/frames +# survey all the genomes to find out what kinds of gene tracks they have + cat << '_EOF_' > showGenes.csh +#!/bin/csh -fe +foreach db (`cat ../species.list`) + echo -n "${db}: " + set tables = `hgsql $db -N -e "show tables like '%Gene%'"` + foreach table ($tables) + if ($table == "ensGene" || $table == "refGene" || \ + $table == "mgcGenes" || $table == "knownGene" || \ + $table == "xenoRefGene" ) then + set count = `hgsql $db -N -e "select count(*) from $table"` + echo -n "${table}: ${count}, " + endif + end + set orgName = `hgsql hgcentraltest -N -e \ + "select scientificName from dbDb where name='$db'"` + set orgId = `hgsql hg19 -N -e \ + "select id from organism where name='$orgName'"` + if ($orgId == "") then + echo "Mrnas: 0" + else + set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` + echo "Mrnas: ${count}" + endif +end +'_EOF_' + # << happy emacs + chmod +x ./showGenes.csh + time ./showGenes.csh +# hg38: knownGene: 104178, mgcGenes: 34081, refGene: 54852, xenoRefGene: 172740, Mrnas: 10723716 +# panTro4: ensGene: 29160, refGene: 2622, xenoRefGene: 280516, Mrnas: 11163 +# rheMac3: refGene: 6369, xenoRefGene: 275096, Mrnas: 443642 +# mm10: ensGene: 94647, knownGene: 61642, mgcGenes: 26768, refGene: 33765, xenoRefGene: 161178, Mrnas: 5224613 +# rn5: ensGene: 29188, mgcGenes: 6924, refGene: 18567, xenoRefGene: 175416, Mrnas: 1247500 +# canFam3: ensGene: 29884, refGene: 1582, xenoRefGene: 253196, Mrnas: 387195 +# monDom5: ensGene: 24882, refGene: 492, xenoRefGene: 248251, Mrnas: 2461 + + # from that summary, use these gene sets: + # refGene - rheMac3 + # ensGene - panTro4 rn5 canFam3 monDom5 + # knownGene - hg38 mm10 + + mkdir genes + # 1. 
knownGene: hg38 mm10 + for DB in hg38 mm10 +do + hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > genes/${DB}.gp.gz +done + # 2. ensGene: + for DB in panTro4 rn5 canFam3 monDom5 +do +hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /scratch/tmp/${DB}.tmp.gz + mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz + echo "${DB} done" +done + # 3. refGene + for DB in rheMac3 +do +hgsql -N -e "select * from refGene" ${DB} | cut -f2- \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /scratch/tmp/${DB}.tmp.gz + mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz + echo "${DB} done" +done + + # verify counts for genes are reasonable: + for T in genes/*.gz +do + echo -n "# $T: " + zcat $T | cut -f1 | sort | uniq -c | wc -l +done +# genes/canFam3.gp.gz: 19507 +# genes/hg38.gp.gz: 21887 +# genes/mm10.gp.gz: 21013 +# genes/monDom5.gp.gz: 21033 +# genes/panTro4.gp.gz: 18657 +# genes/rheMac3.gp.gz: 5614 +# genes/rn5.gp.gz: 22863 + + time (cat ../anno/hg38.7way.maf \ + | nice -n +19 genePredToMafFrames hg38 stdin stdout \ + `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ + | gzip > multiz7wayFrames.bed.gz) + # real 3m44.591s + + # verify there are frames on everything, should be 7 species: + zcat multiz7wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c +# 265160 canFam3 +# 208941 hg38 +# 253323 mm10 +# 574521 monDom5 +# 200156 panTro4 +# 49802 rheMac3 +# 244731 rn5 + + # load the resulting file + ssh hgwdev + cd /hive/data/genomes/hg38/bed/multiz7way/frames + time hgLoadMafFrames hg38 multiz7wayFrames multiz7wayFrames.bed.gz + # real 0m19.959s + + time featureBits -countGaps hg38 multiz7wayFrames + # 52686177 bases of 3209286105 (1.642%) in intersection + # real 0m12.593s + + # enable the trackDb entries: +# frames 
multiz7wayFrames +# irows on + # appears to work OK + +######################################################################### +# Phylogenetic tree from 7-way (DONE - 2014-06-04 - Hiram) + mkdir /hive/data/genomes/hg38/bed/multiz7way/4d + cd /hive/data/genomes/hg38/bed/multiz7way/4d + + # the annotated maf is: + ../anno/hg38.7way.maf + + # using knownGene for hg38 + hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 > hg38.knownGene.gp + + genePredSingleCover hg38.knownGene.gp stdout | sort > hg38.knownGeneNR.gp + wc -l hg38.knownGeneNR.gp + # 21887 hg38.knownGeneNR.gp + + mkdir annoSplit + cd annoSplit + time mafSplit -verbose=2 -outDirDepth=1 -byTarget -useFullSequenceName \ + /dev/null . ../../anno/hg38.7way.maf + # real 5m14.770s + + find . -type f | wc -l + # 353 + ssh ku + mkdir /hive/data/genomes/hg38/bed/multiz7way/4d/run + cd /hive/data/genomes/hg38/bed/multiz7way/4d/run + mkdir ../mfa + + # newer versions of msa_view have a slightly different operation + # the sed of the gp file inserts the reference species in the chr name + cat << '_EOF_' > 4d.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set r = "/hive/data/genomes/hg38/bed/multiz7way" +set c = $1:r +set infile = $r/4d/annoSplit/$2 +set outDir = $r/4d/mfa/$3:h +set outfile = $r/4d/mfa/$3 +/bin/mkdir -p $outDir +cd /scratch/tmp +/bin/awk -v C=$c '$2 == C {print}' $r/4d/hg38.knownGeneNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp +set NL=`wc -l $c.gp| gawk '{print $1}'` +echo $NL +if ("$NL" != "0") then + $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss + $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile +else + echo "" > $outfile +endif +/bin/rm -f $c.gp $c.ss +'_EOF_' + # << happy emacs + chmod +x 4d.csh + + find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list + + cat << '_EOF_' > template +#LOOP +4d.csh $(file1) $(path1) {check out line+ 
../mfa/$(dir1)/$(root1).mfa} +#ENDLOOP +'_EOF_' + # << happy emacs + + gensub2 maf.list single template jobList + para create jobList + para try ... check + para time +# Completed: 353 of 353 jobs +# CPU time in finished jobs: 836s 13.93m 0.23h 0.01d 0.000 y +# IO & Wait Time: 1172s 19.54m 0.33h 0.01d 0.000 y +# Average job time: 6s 0.09m 0.00h 0.00d +# Longest finished job: 72s 1.20m 0.02h 0.00d +# Submission to last job: 89s 1.48m 0.02h 0.00d + + # Not all results have contents, that is OK + + # combine mfa files + ssh hgwdev + cd /hive/data/genomes/hg38/bed/multiz7way/4d + # remove the broken empty files, size 0 and size 1: + find ./mfa -type f -size 0 | xargs rm -f + # most interesting, this did not identify files of size 1: +# find ./mfa -type f -size 1 + find ./mfa -type f | xargs ls -og | awk '$3 == 1' | awk '{print $NF}' \ + > empty.list + cat empty.list | xargs rm -f + #want comma-less species.list + /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ + --aggregate "`cat ../species.list`" mfa/*/*.mfa | sed s/"> "/">"/ \ + > 4d.all.mfa + # check they are all in there: + grep "^>" 4d.all.mfa + # >hg38 + # >panTro4 + # >rheMac3 + # >mm10 + # >rn5 + # >canFam3 + # >monDom5 + + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + ../hg38.7way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh + # tree_commas.nh looks like: + # (((((hg38,panTro4),rheMac3),(mm10,rn5)),canFam3),monDom5) + # use phyloFit to create tree model (output is phyloFit.mod) + time nice -n +19 \ + /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod REV \ + --tree tree_commas.nh 4d.all.mfa + # real 0m6.583s + + + mv phyloFit.mod all.mod + + grep TREE all.mod +# TREE: (((((hg38:0.00673596,panTro4:0.00686169):0.0248146,rheMac3:0.0357598):0.0970072,(mm10:0.081661,rn5:0.0874126):0.246527):0.0264964,canFam3:0.156769):0.303241,monDom5:0.303241); + + # compare these calculated lengths to the tree 
extracted from 130way: + grep TREE all.mod | sed -e 's/TREE: //' \ + | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 | sort -k3n \ + | sed -e "s/hg38.//; s/^/ # /" + # panTro4 0.013598 + # rheMac3 0.067310 + # canFam3 0.311823 + # mm10 0.456746 + # rn5 0.462497 + # monDom5 0.761536 + + # yes, somewhat similar + /cluster/bin/phast/all_dists ../hg38.7way.nh | grep hg38 \ + | sort -k3n | sed -e "s/hg38.//; s/^/ # /" + # panTro4 0.013390 + # rheMac3 0.071575 + # canFam3 0.330429 + # mm10 0.500391 + # rn5 0.507471 + # monDom5 0.763679 + +######################################################################### +# phastCons 7-way (DONE - 2014-06-04 - Hiram) + # split 7way mafs into 10M chunks and generate sufficient statistics + # files for # phastCons + ssh ku + mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/SS + cd /hive/data/genomes/hg38/bed/multiz7way/cons/SS + mkdir result done + + cat << '_EOF_' > mkSS.csh +#!/bin/csh -ef +set d = $1 +set c = $2 +set doneDir = done/$d +set MAF = /hive/data/genomes/hg38/bed/multiz7way/anno/result/$d/$c.maf +set WINDOWS = /hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$d/$c +set WC = `cat $MAF | wc -l` +set NL = `grep "^#" $MAF | wc -l` +if ( -s $3 ) then + exit 0 +endif +if ( -s $3.running ) then + exit 0 +endif + +/bin/mkdir -p $doneDir +/bin/date >> $3.running + +/bin/rm -fr $WINDOWS +/bin/mkdir -p $WINDOWS +pushd $WINDOWS > /dev/null +if ( $WC != $NL ) then +/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ + $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 +endif +popd > /dev/null +/bin/date >> $3 +/bin/rm -f $3.running +'_EOF_' + # << happy emacs + chmod +x mkSS.csh + + cat << '_EOF_' > template +#LOOP +mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} +#ENDLOOP +'_EOF_' + # << happy emacs + + # do the easy ones first to see some immediate results + find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list + + gensub2 maf.list single template 
jobList + para -ram=32g create jobList + para try ... check ... etc +# Completed: 353 of 353 jobs +# CPU time in finished jobs: 1216s 20.27m 0.34h 0.01d 0.000 y +# IO & Wait Time: 1385s 23.08m 0.38h 0.02d 0.000 y +# Average job time: 7s 0.12m 0.00h 0.00d +# Longest finished job: 111s 1.85m 0.03h 0.00d +# Submission to last job: 189s 3.15m 0.05h 0.00d + + find ./result -type f | wc -l + # 641 + + # Run phastCons + # This job is I/O intensive in its output files, beware where this + # takes place or do not run too many at once. + ssh ku + mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons + cd /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons + + # This is setup for multiple runs based on subsets, but only running + # the 'all' subset here. + # It triggers off of the current working directory + # $cwd:t which is the "grp" in this script. Running: + # all and vertebrates + + cat << '_EOF_' > doPhast.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set c = $1 +set d = $2 +set f = $3 +set len = $4 +set cov = $5 +set rho = $6 +set grp = $cwd:t +set cons = /hive/data/genomes/hg38/bed/multiz7way/cons +set tmp = $cons/tmp/${d}_${c} +mkdir -p $tmp +set ssSrc = $cons/SS/result +set useGrp = "$grp.mod" +if (-s $cons/$grp/$grp.non-inf) then + ln -s $cons/$grp/$grp.mod $tmp + ln -s $cons/$grp/$grp.non-inf $tmp + ln -s $ssSrc/$d/$f $tmp +else + ln -s $ssSrc/$d/$f $tmp + ln -s $cons/$grp/$grp.mod $tmp +endif +pushd $tmp > /dev/null +if (-s $grp.non-inf) then + $PHASTBIN/phastCons $f $useGrp \ + --rho $rho --expected-length $len --target-coverage $cov --quiet \ + --not-informative `cat $grp.non-inf` \ + --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp +else + $PHASTBIN/phastCons $f $useGrp \ + --rho $rho --expected-length $len --target-coverage $cov --quiet \ + --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp +endif +popd > /dev/null +mkdir -p pp/$d bed/$d +sleep 4 +touch pp/$d bed/$d +rm -f 
pp/$d/$c.pp +rm -f bed/$d/$c.bed +mv $tmp/$c.pp pp/$d +mv $tmp/$c.bed bed/$d +rm -fr $tmp +rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h +'_EOF_' + # << happy emacs + chmod +x doPhast.csh + + # this template will serve for all runs + # root1 == chrom name, file1 == ss file name without .ss suffix + cat << '_EOF_' > template +#LOOP +../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} +#ENDLOOP +'_EOF_' + # << happy emacs + + find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list + wc -l ss.list + # 641 ss.list + + # Create parasol batch and run it + # run for all species + cd /hive/data/genomes/hg38/bed/multiz7way/cons + mkdir -p all + cd all + # Using the .mod tree + cp -p ../../4d/all.mod ./all.mod + + gensub2 ../run.cons/ss.list single ../run.cons/template jobList + para -ram=32g create jobList + para try ... check ... + para push +# Completed: 641 of 641 jobs +# CPU time in finished jobs: 6557s 109.29m 1.82h 0.08d 0.000 y +# IO & Wait Time: 4497s 74.94m 1.25h 0.05d 0.000 y +# Average job time: 17s 0.29m 0.00h 0.00d +# Longest finished job: 33s 0.55m 0.01h 0.00d +# Submission to last job: 120s 2.00m 0.03h 0.00d + + # create Most Conserved track + cd /hive/data/genomes/hg38/bed/multiz7way/cons/all + cut -f1 ../../../../chrom.sizes | while read C +do + ls -d bed/?/${C} 2> /dev/null | while read D + do + echo ${D}/${C}*.bed 1>&2 + cat ${D}/${C}*.bed + done | sort -k1,1 -k2,2n \ + | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' +done > tmpMostConserved.bed + + /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed + # -rw-rw-r-- 1 42636652 Jun 4 10:45 tmpMostConserved.bed + # -rw-rw-r-- 1 43721828 Jun 4 10:45 mostConserved.bed + + # load into database + ssh hgwdev + cd /hive/data/genomes/hg38/bed/multiz7way/cons/all + time nice -n +19 hgLoadBed hg38 phastConsElements7way mostConserved.bed + # Read 1234990 elements of size 5 from mostConserved.bed + # real 
0m11.390s + + # on human we often try for 5% overall cov, and 70% CDS cov + # most bets are off here for that goal, these alignments are too few + # and too far between + # --rho 0.3 --expected-length 45 --target-coverage 0.3 + featureBits hg38 -enrichment knownGene:cds phastConsElements7way + # knownGene:cds 1.266%, phastConsElements7way 4.551%, + # both 0.888%, cover 70.16%, enrich 15.42x + + # Create merged posterior probability file and wiggle track data files + cd /hive/data/genomes/hg38/bed/multiz7way/cons/all + mkdir downloads + + # the third sed fixes the chrom names, removing the partition extensions + time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ + | gzip -c > downloads/phastCons7way.wigFix.gz) + # real 37m47.242s + + # check integrity of data with wigToBigWig + time (zcat downloads/phastCons7way.wigFix.gz \ + | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ + phastCons7way.bw) > bigWig.log 2>&1 & + tail bigWig.log + # pid=34733: VmPeak: 33106324 kB + # real 40m53.287s + + bigWigInfo phastCons7way.bw +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 5,675,802,079 +# primaryIndexSize: 92,579,900 +# zoomLevels: 10 +# chromCount: 353 +# basesCovered: 2,898,191,577 +# mean: 0.168088 +# min: 0.000000 +# max: 1.000000 +# std: 0.233827 + + # encode those files into wiggle data + time (zcat downloads/phastCons7way.wigFix.gz \ + | wigEncode stdin phastCons7way.wig phastCons7way.wib) + # Converted stdin, upper limit 1.00, lower limit 0.00 + # real 15m28.525s + + du -hsc *.wi? + # 2.7G phastCons7way.wib + # 282M phastCons7way.wig + # 3.0G total + + # Load gbdb and database with wiggle. 
+ ln -s `pwd`/phastCons7way.wib /gbdb/hg38/multiz7way/phastCons7way.wib + time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way \ + hg38 phastCons7way phastCons7way.wig + # real 0m33.502s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phastCons7way +# db.table min max mean count sumData stdDev viewLimits +hg38.phastCons7way 0 1 0.168088 2898191577 4.87152e+08 0.233827 viewLimits=0:1 + + # Create histogram to get an overview of all the data + time nice -n +19 hgWiggle -doHistogram -db=hg38 \ + -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ + phastCons7way > histogram.data 2>&1 + # real 2m40.179s + + # create plot of histogram: + + cat << '_EOF_' | gnuplot > histo.png +set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff +set size 1.4, 0.8 +set key left box +set grid noxtics +set grid ytics +set title " Human hg38 Histogram phastCons7way track" +set xlabel " phastCons7way score" +set ylabel " Relative Frequency" +set y2label " Cumulative Relative Frequency (CRF)" +set y2range [0:1] +set y2tics +set yrange [0:0.02] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines +'_EOF_' + # << happy emacs + + display histo.png & + +######################################################################### +# phyloP for 7-way (DONE - 2014-06-04 - Hiram) + # run phyloP with score=LRT + ssh ku + mkdir /cluster/data/hg38/bed/multiz7way/consPhyloP + cd /cluster/data/hg38/bed/multiz7way/consPhyloP + + mkdir run.phyloP + cd run.phyloP + # Adjust model file base composition background and rate matrix to be + # representative of the chromosomes in play + grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' + # 0.556 + /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ + ../../cons/all/all.mod 0.556 > all.mod + # verify, the BACKGROUND should now be paired 
up: + grep BACK all.mod + # BACKGROUND: 0.222000 0.278000 0.278000 0.222000 + + cat << '_EOF_' > doPhyloP.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set f = $1 +set d = $f:h +set file1 = $f:t +set out = $2 +set cName = $f:t:r +set grp = $cwd:t +set cons = /hive/data/genomes/hg38/bed/multiz7way/consPhyloP +set tmp = $cons/tmp/$grp/$f +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +set ssSrc = "/hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$f" +set useGrp = "$grp.mod" +/bin/ln -s $cons/run.phyloP/$grp.mod $tmp +pushd $tmp > /dev/null +$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ + -i SS $useGrp $ssSrc.ss > $file1.wigFix +popd > /dev/null +/bin/mkdir -p $out:h +sleep 4 +/bin/touch $out:h +/bin/mv $tmp/$file1.wigFix $out +/bin/rm -fr $tmp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp +'_EOF_' + # << happy emacs + + # Create list of chunks + find ../../cons/SS/result -type f | grep ".ss$" \ + | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list + # make sure the list looks good + wc -l ss.list + # 641 ss.list + + # Create template file + # file1 == $chr/$chunk/file name without .ss suffix + cat << '_EOF_' > template +#LOOP +../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} +#ENDLOOP +'_EOF_' + # << happy emacs + + ###################### Running all species ####################### + # setup run for all species + mkdir /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all + cd /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all + rm -fr wigFix + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + # the -ram=8g will allow only one job per node to slow this down since + # it would run too fast otherwise. 
Either run on one of the small + # klusters or use the -ram=8g on the para create + para -ram=32g create jobList + para try ... check ... push ... etc ... + para time > run.time +# Completed: 641 of 641 jobs +# CPU time in finished jobs: 4755s 79.24m 1.32h 0.06d 0.000 y +# IO & Wait Time: 4343s 72.39m 1.21h 0.05d 0.000 y +# Average job time: 14s 0.24m 0.00h 0.00d +# Longest finished job: 27s 0.45m 0.01h 0.00d +# Submission to last job: 1152s 19.20m 0.32h 0.01d + + # make downloads + mkdir downloads + + time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/phyloP7way.wigFix.gz) & + # real 29m51.665s + + # check integrity of data with wigToBigWig + time (zcat downloads/phyloP7way.wigFix.gz \ + | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ + phyloP7way.bw) > bigWig.log 2>&1 & + egrep "real|VmPeak" bigWig.log + # pid=76577: VmPeak: 33106320 kB + # real 42m53.038s + + + bigWigInfo phyloP7way.bw +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 3,759,451,708 +# primaryIndexSize: 92,579,900 +# zoomLevels: 10 +# chromCount: 353 +# basesCovered: 2,898,191,577 +# mean: 0.074472 +# min: -5.220000 +# max: 1.062000 +# std: 0.545945 + + # encode those files into wiggle data + time (zcat downloads/phyloP7way.wigFix.gz \ + | wigEncode stdin phyloP7way.wig phyloP7way.wib) & + # Converted stdin, upper limit 1.06, lower limit -5.22 + # real 16m11.861s + + + du -hsc *.wi? + # 47M phyloP7way.wib + # 12M phyloP7way.wig + # 58M total + + # Load gbdb and database with wiggle. 
+ ln -s `pwd`/phyloP7way.wib /gbdb/hg38/multiz7way/phyloP7way.wib + nice hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way hg38 \ + phyloP7way phyloP7way.wig + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phyloP7way +# db.table min max mean count sumData +# hg38.phyloP7way -5.22 1.062 0.0744721 2898191577 2.15834e+08 +# stdDev viewLimits +# 0.545945 viewLimits=-2.65525:1.062 + + # that range is: 5.22+1.062 = 6.282 for hBinSize=0.006282 + + # Create histogram to get an overview of all the data + time nice -n +19 hgWiggle -doHistogram \ + -hBinSize=0.006282 -hBinCount=1000 -hMinVal=-5.22 -verbose=2 \ + -db=hg38 phyloP7way > histogram.data 2>&1 + # real 2m55.843s + + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin +# Q1 0.000001 +# median 0.000060 +# Q3 0.000656 +# average 0.001022 +# min 0.000000 +# max 0.065461 +# count 978 +# total 0.999982 +# standard deviation 0.004157 + + # create plot of histogram: + cat << '_EOF_' | gnuplot > histo.png +set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff +set size 1.4, 0.8 +set key left box +set grid noxtics +set grid ytics +set title " Human hg38 Histogram phyloP7way track" +set xlabel " phyloP7way score" +set ylabel " Relative Frequency" +set y2label " Cumulative Relative Frequency (CRF)" +set y2range [0:1] +set y2tics +set yrange [0:0.02] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines +'_EOF_' + # << happy emacs + + display histo.png & + +############################################################################# +# construct download files for 7-way (DONE - 2014-06-05 - Hiram) + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way + mkdir 
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way + mkdir /hive/data/genomes/hg38/bed/multiz7way/downloads + cd /hive/data/genomes/hg38/bed/multiz7way/downloads + mkdir multiz7way phastCons7way phyloP7way + cd multiz7way + time cp -p ../../anno/hg38.7way.maf . + # real 0m55.984s + time gzip *.maf + # real 46m53.149s + + ln -s ../../hg38.7way.nh . + ln -s ../../hg38.7way.commonNames.nh . + time md5sum *.nh *.maf.gz > md5sum.txt + # real 1m55.317s + ln -s `pwd`/* \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way + + du -hsc *.maf.gz ../../anno/hg38.7way.maf + # 3.5G hg38.7way.maf.gz + # 17G ../../anno/hg38.7way.maf + + ##################################################################### + cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phastCons7way + + ln -s ../../cons/all/downloads/phastCons7way.wigFix.gz \ + ./hg38.phastCons7way.wigFix.gz + ln -s ../../cons/all/phastCons7way.bw ./hg38.phastCons7way.bw + ln -s ../../cons/all/all.mod ./hg38.phastCons7way.mod + time md5sum *.gz *.mod *.bw > md5sum.txt + # real 0m37.384s + # obtain the README.txt from petMar2/phastCons7way and update for this + # situation + ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way + + ##################################################################### + cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phyloP7way + + ln -s ../../consPhyloP/all/downloads/phyloP7way.wigFix.gz \ + ./hg38.phyloP7way.wigFix.gz + ln -s ../../consPhyloP/run.phyloP/all.mod hg38.phyloP7way.mod + ln -s ../../consPhyloP/all/phyloP7way.bw hg38.phyloP7way.bw + + time md5sum *.mod *.bw *.gz > md5sum.txt + # real 0m29.431s + + # obtain the README.txt from geoFor1/phyloP7way and update for this + # situation + ln -s `pwd`/* \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way + + ########################################################################### + ## create upstream refGene maf files + cd 
/hive/data/genomes/hg38/bed/multiz7way/downloads/multiz7way + # bash script +#!/bin/sh +export geneTbl="knownGene" +for S in 1000 2000 5000 +do + echo "making upstream${S}.maf" + featureBits hg38 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ + | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ + | /cluster/bin/$MACHTYPE/mafFrags hg38 multiz7way \ + stdin stdout \ + -orgs=/hive/data/genomes/hg38/bed/multiz7way/species.list \ + | gzip -c > upstream${S}.${geneTbl}.maf.gz + echo "done upstream${S}.${geneTbl}.maf.gz" +done + # real 60m16.631s + + md5sum upstream*.gz >> md5sum.txt + + # some other symlinks were already made above + # obtain the README.txt from geoFor1/multiz7way and update for this + # situation + ln -s `pwd`/upstream*.gz README.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way + +############################################################################# +# hgPal downloads (DONE - 2014-06-06 - Hiram) +# FASTA from 7-way for knownGene, refGene and knownCanonical + + ssh hgwdev + screen -S hg38HgPal + mkdir /hive/data/genomes/hg38/bed/multiz7way/pal + cd /hive/data/genomes/hg38/bed/multiz7way/pal + cat ../species.list | tr '[ ]' '[\n]' > order.list + + export mz=multiz7way + export gp=knownGene + export db=hg38 + export I=0 + mkdir exonAA exonNuc + for C in `sort -nk2 ../../../chrom.sizes | cut -f1` + do + I=`echo $I | awk '{print $1+1}'` + echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" + echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" + if [ $I -gt 6 ]; then + echo "date" + echo "wait" + I=0 + fi + done > $gp.jobs + echo "date" >> $gp.jobs + echo "wait" >> $gp.jobs + + time ./$gp.jobs > $gp.jobs.log 2>&1 & + # real 28m46.919s + + time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz + # real 0m23.798s + time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz + # real 1m28.197s + + export mz=multiz7way + 
export gp=knownGene + export db=hg38 + export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments + mkdir -p $pd + md5sum *.fa.gz > md5sum.txt + ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz + ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz + ln -s `pwd`/md5sum.txt $pd/ + + rm -rf exonAA exonNuc + + ### need other gene track alignments also + # running up refGene + cd /hive/data/genomes/hg38/bed/multiz7way/pal + export mz=multiz7way + export gp=refGene + export db=hg38 + export I=0 + mkdir exonAA exonNuc + for C in `sort -nk2 ../../../chrom.sizes | cut -f1` + do + I=`echo $I | awk '{print $1+1}'` + echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" + echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" + if [ $I -gt 6 ]; then + echo "date" + echo "wait" + I=0 + fi + done > $gp.jobs + echo "date" >> $gp.jobs + echo "wait" >> $gp.jobs + + time sh -x $gp.jobs > $gp.jobs.log 2>&1 + # real 15m15.424s + + export mz=multiz7way + export gp=refGene + export db=hg38 + time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz + # real 0m23.119s + time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz + # real 1m15.547s + + du -hsc exonAA exonNuc refGene*.fa.gz + # 59M exonAA + # 101M exonNuc + # 59M refGene.multiz7way.exonAA.fa.gz + # 101M refGene.multiz7way.exonNuc.fa.gz + # 317M total + + rm -rf exonAA exonNuc + + # we're only distributing exons at the moment + export mz=multiz7way + export gp=refGene + export db=hg38 + export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments + mkdir -p $pd + md5sum *.fa.gz > md5sum.txt + ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz + ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz + ln -s `pwd`/md5sum.txt $pd/ + + ### And knownCanonical + cd /hive/data/genomes/hg38/bed/multiz7way/pal + export mz=multiz7way + export gp=knownCanonical + export db=hg38 + mkdir exonAA 
exonNuc ppredAA ppredNuc knownCanonical + + cut -f1 ../../../chrom.sizes | while read C + do + echo $C + hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed + done + + ls knownCanonical/*.known.bed | while read F + do + if [ -s $F ]; then + echo $F | sed -e 's#knownCanonical/##; s/.known.bed//' + fi + done | while read C + do + echo "date" + echo "mafGene -geneBeds=knownCanonical/$C.known.bed $db $mz knownGene order.list stdout | \ + gzip -c > ppredAA/$C.ppredAA.fa.gz" + echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz knownGene order.list stdout | \ + gzip -c > ppredNuc/$C.ppredNuc.fa.gz" + echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \ + gzip -c > exonNuc/$C.exonNuc.fa.gz" + echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \ + gzip -c > exonAA/$C.exonAA.fa.gz" + done > $gp.$mz.jobs + + time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 + # real 72m58.133s + + rm *.known.bed + mz=multiz7way + gp=knownCanonical + db=hg38 + zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz & + zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz & + zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz & + zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz + + rm -rf exonAA exonNuc ppredAA ppredNuc + + mz=multiz7way + gp=knownCanonical + db=hg38 + pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments + mkdir -p $pd + ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz + ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz + cd $pd + md5sum *.exon*.fa.gz > md5sum.txt + +############################################################################# +# wiki page for 7-way (DONE - 2014-06-04 - Hiram) + mkdir /hive/users/hiram/bigWays/hg38.7way + cd /hive/users/hiram/bigWays + echo "hg38" > hg38.7way/ordered.list + awk '{print $1}' 
/hive/data/genomes/hg38/bed/multiz7way/7way.distances.txt \ + >> hg38.7way/ordered.list + + # sizeStats.sh catches up the cached measurements required for data + # in the tables. They may already be done. + ./sizeStats.sh hg38.7way/ordered.list + # dbDb.sh constructs hg38.7way/Hg38_7-way_conservation_alignment.html + ./dbDb.sh hg38 7way + # sizeStats.pl constructs hg38.7way/Hg38_7-way_Genome_size_statistics.html + ./sizeStats.pl hg38 7way + + # defCheck.pl constructs Hg38_7-way_conservation_lastz_parameters.html + ./defCheck.pl hg38 7way + + # this constructs the html pages in hg38.7way/: +# -rw-rw-r-- 1 4153 Jun 5 11:03 Hg38_7-way_conservation_alignment.html +# -rw-rw-r-- 1 5833 Jun 5 11:04 Hg38_7-way_Genome_size_statistics.html +# -rw-rw-r-- 1 3854 Jun 5 11:04 Hg38_7-way_conservation_lastz_parameters.html + + # add those pages to the genomewiki. Their page names are the + # names of the .html files without the .html: +# Hg38_7-way_conservation_alignment +# Hg38_7-way_Genome_size_statistics +# Hg38_7-way_conservation_lastz_parameters + + # when you view the first one you enter, it will have links to the + # missing two. + +############################################################################# +# GRC Incident database (DONE - 2014-06-14 - Hiram) + # this procedure is run as a cron job in Hiram's account: + + # 33 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo + + # data comes from: ftp://ftp.ncbi.nlm.nih.gov/pub/grc/ + # processed by /hive/data/outside/grc/incidentDb/grcUpdate.sh + + # the table in the dataBase is: grcIncidentDb + # which is the URL to the bb file, a single row: + # http://genomewiki.ucsc.edu/images/7/7f/Hg38.grcIncidentDb.bb + +############################################################################# +# RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram) + mkdir /hive/data/genomes/hg38/bed/rmskJoined + cd /hive/data/genomes/hg38/bed/rmskJoined + + ln -s ../repeatMasker/hg38.sorted.fa.out . 
+ ln -s ../repeatMasker/hg38.fa.align.gz . + + # working on fixing this script for the next release of RM + /scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \ + -out hg38.sorted.fa.out -align hg38.fa.align.gz + + hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ + -renameSqlTable -verbose=4 -tab \ + -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \ + rmskJoinedBaseline hg38.sorted.fa.join.bed \ + > loadJoined.log 2>&1 + + hgLoadSqlTab hg38 rmskAlignBaseline \ + /cluster/home/hiram/kent/src/hg/lib/rmskAlign.sql \ + hg38.fa.align.tsv > loadAlign.log 2>&1 + + hgLoadOutJoined -verbose=2 hg38 hg38.sorted.fa.out > loadOut.log 2>&1 + + featureBits -countGaps hg38 rmskJoinedBaseline + # 2716777279 bases of 3209286105 (84.654%) in intersection + +############################################################################## +# LASTZ Macaca Mulatta RheMac2 (DONE - 2014-07-13 - braney) + mkdir /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 + cd /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 + + # best to always specify an exact path to lastz so we know which one is used + # lastz default parameters are human-mouse parameters + + cat << '_EOF_' > DEF +# human vs macaca mulatta +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz +# maximum M allowed with lastz is only 254 +BLASTZ_M=254 +BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q +BLASTZ_O=600 +BLASTZ_E=150 +# other parameters from panTro2 vs hg18 lastz on advice from Webb +BLASTZ_K=4500 +BLASTZ_Y=15000 +BLASTZ_T=2 + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Macaca Mulatta RheMac2 +SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit +SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes 
+SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_IN_CONTIGS=0 + +BASE=/hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 +TMPDIR=/dev/shm +'_EOF_' + # << happy emacs + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + `pwd`/DEF \ + -syntenicNet -fileServer=hgwdev \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 + # Elapsed time: 141m36s + cat fb.hg38.chainRheMac2Link.txt + # 2455106923 bases of 3049335806 (80.513%) in intersection + + # running the swap + mkdir /hive/data/genomes/rheMac2/bed/blastz.hg38.swap + cd /hive/data/genomes/rheMac2/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11/DEF \ + -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 + # 83m26.095s + cat fb.rheMac2.chainHg38Link.txt + # 2313950599 bases of 2646704109 (87.428%) in intersection +# + +######################################################################### +# LASTZ Chlorocebus sabaeus (DONE - 2014-07-13 - braney) + mkdir /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 + cd /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 + + # best to always specify an exact path to lastz so we know which one is used + # lastz default parameters are human-mouse parameters + + cat << '_EOF_' > DEF +# human vs Chlorocebus sabaeus +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz +# maximum M allowed with lastz is only 254 +BLASTZ_M=254 +BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q +BLASTZ_O=600 +BLASTZ_E=150 +# other parameters from panTro2 vs hg18 lastz on advice from Webb +BLASTZ_K=4500 +BLASTZ_Y=15000 +BLASTZ_T=2 + + +# TARGET: Human Hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes 
+SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY Chlorocebus sabaeus chlSab2 +SEQ2_DIR=/scratch/data/chlSab2/chlSab2.2bit +SEQ2_LEN=/scratch/data/chlSab2/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_IN_CONTIGS=0 + +BASE=/hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 +TMPDIR=/dev/shm +'_EOF_' + # << happy emacs + time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ + `pwd`/DEF \ + -syntenicNet -fileServer=hgwdev \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 + # Elapsed time: 142m4s + cat fb.hg38.chainChlSab2Link.txt + # 2573435303 bases of 3049335806 (84.393%) in intersection + + # running the swap + mkdir /hive/data/genomes/chlSab2/bed/blastz.hg38.swap + cd /hive/data/genomes/chlSab2/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11/DEF \ + -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 + # 88m48.411s + cat fb.chlSab2.chainHg38Link.txt + # 2429053010 bases of 2752019208 (88.264%) in intersection + +######################################################################### +# SEGMENTAL DUPLICATIONS (DONE - 2014-08-13 - Hiram) + # redmine issue: refs #13580 + + # file received in email from Archana Natarajan Raja (araja at uw.edu) + mkdir /hive/data/genomes/hg38/bed/genomicSuperDups + cd /hive/data/genomes/hg38/bed/genomicSuperDups +# -rw-r--r-- 1 16478617 Aug 11 16:18 GenomicSuperDup.tab + + # no longer filtering items smaller than 1,000 bases, see note + # in redmine issue refs #13580 +# While the size of the 24 alignments are less than 1000 bases , the size of +# their pairs to which they align are always >1000, you can confirm this by +# looking at the value in column 22 in your table (alignB -ucsc format), will +# always be >1000 bp . 
We are seeing this only now because there are lots of +# new and resolved duplications added to hg38. Hence , I would recommend not +# filtering these items and uploading the current set as is. + + # there is no chrEBV in the browser: + grep -v chrEBV GenomicSuperDup.tab | sed -e 's/\t_\t/\t-\t/;' \ + | hgLoadBed hg38 genomicSuperDups stdin \ + -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql + # Read 69894 elements of size 29 from stdin + + checkTableCoords hg38 genomicSuperDups + # (the chrEBV was found with this check) + + featureBits -countGaps hg38 genomicSuperDups + # 175429664 bases of 3209286105 (5.466%) in intersection + + featureBits -countGaps hg19 genomicSuperDups + # 166092393 bases of 3137161264 (5.294%) in intersection + featureBits -countGaps hg18 genomicSuperDups + # 159204446 bases of 3107677273 (5.123%) in intersection + + featureBits -countGaps mm10 genomicSuperDups + # 214917441 bases of 2730871774 (7.870%) in intersection + featureBits -countGaps mm9 genomicSuperDups + # 208214567 bases of 2725765481 (7.639%) in intersection + +############################################################################## +# cloneEnds (DONE - 2014-08-14 - Hiram) + + mkdir /hive/data/genomes/hg38/bed/cloneEnds + cd /hive/data/genomes/hg38/bed/cloneEnds + + # fetch the NCBI INSDC name correspondence file: + rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.26.assembly.txt ./ + + # fetch the clone reports + mkdir reports + rsync -a -P \ +rsync://ftp.ncbi.nih.gov/repository/clone/reports/Homo_sapiens/*.GCF_000001405.26.106.*.gff \ + ./reports/ + + # script to establish refSeq to UCSC chrom names: + + cat << '_EOF_' > refSeqNames.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +open (FH, ") { + chomp $line; + next if ($line =~ m/^#/); + my @a = split('\t', $line); + my $chrN = $a[2]; + my $refSeq = $a[6]; + my $contig = $a[4]; + my $type = $a[1]; + next if (!defined $type); + next if (!defined $refSeq); + next if 
(!defined $contig); + my $suffix = ""; + if ($type eq "alt-scaffold") { + $suffix = "_alt"; + } elsif ($type eq "unlocalized-scaffold") { + $suffix = "_random"; + } elsif ($type eq "unplaced-scaffold") { + $chrN = "Un"; + } + $chrN = "M" if ($chrN eq "MT"); + if ($a[0] =~ m/_/) { + $contig =~ s/\./v/; + printf "%s\tchr%s_%s%s\n", $refSeq, $chrN, $contig, $suffix; + } else { + printf "%s\tchr%s\n", $refSeq, $chrN; + } +} +close (FH); +'_EOF_' + # << happy emacs + + chmod +x refSeqNames.pl + + ./refSeqNames.pl > refSeq.ucscName.tab + + # establish full library list: + ls reports/*.GCF_000001405.26.106.*.gff | sed -e 's#reports/##' \ + | cut -d"." -f1 | sort -u > library.list.txt + + # a script to scan the GFF files, with the refSeq.ucscName.tab + # name correspondence to construct bed files + + cat << '_EOF_' > hg38.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); + +if ($argc < 1) { + printf STDERR "usage: ./hg38.pl [moreReports.gff]\n"; + exit 255; +} + +my %refSeqToUcsc; # key is refSeq name, value is UCSC chrom name +open (FH, ") { + chomp $line; + my ($refSeq, $ucsc) = split('\t', $line); + $refSeqToUcsc{$refSeq} = $ucsc; +} +close (FH); + +my %chromSizes; # key is UCSC chrom name, key is chrom size +open (FH, ") { + chomp $line; + my ($chr, $size) = split('\t', $line); + $chromSizes{$chr} = $size; +} +close (FH); + +while (my $file = shift) { +my %starts; # key is parent ID, value is start end coordinates start,end +my %ends; # key is parent ID, value is end end coordinates start,end +my %parents; # key is parent ID, value is 1 to signify exists +my %endNames; # key is parent ID, value is the Name of the parent clone_insert + +printf STDERR "# processing $file\n"; + +open (FH, "<$file") or die "can not read $file"; +while (my $line = ) { + chomp $line; + next if ($line=~ m/^#/); + my @a = split('\t', $line); + next if (scalar(@a) < 1); + my $contig = $a[0]; + $contig =~ s/ref.//; + $contig =~ s/\|//; + my $ucscChr = 
$refSeqToUcsc{$contig}; + if (!defined($ucscChr)) { + printf STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n"; + next; + } + next if (! exists($chromSizes{$ucscChr})); + my $chromSize = $chromSizes{$ucscChr}; + my $chromStart = $a[3] - 1; + my $chromEnd = $a[4]; + if ($chromStart > $chromSize) { + printf STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n"; + $chromStart = $chromSize-1; + } + if ($chromEnd > $chromSize) { + my $overRun = $chromEnd - $chromSize; + printf STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n"; + $chromEnd = $chromSize; + } + my $id="notFound"; + my $name="notFound"; + my $parent="notFound"; + my @b = split(';', $a[8]); + for (my $i = 0; $i < scalar(@b); ++$i) { + my ($tag, $value) = split('=', $b[$i]); + if ($tag eq "ID") { + $id = $value; + if ($id !~ m/-/) { + if (exists($parents{$id})) { + printf STDERR "# WARN: duplicate parent: $id"; + } else { + $parents{$id} = $ucscChr; + } + } + } elsif ($tag eq "Parent") { + $parent = $value; + } elsif ($tag eq "Name") { + $name = $value; + } + } + my $type="notFound"; + my $insertType = $a[2]; + if ($insertType =~ m/clone_insert_start/) { + $type = "start"; + if ($parent eq "notFound") { + printf STDERR "# ERR: can not find parent for start $name Ttype $id\n"; + } else { + if (!exists($parents{$parent})) { + printf STDERR "# ERR: start found $name with no parent $parent declared\n"; + } elsif (exists($starts{$parent})) { + printf STDERR "# ERR: duplicate start for $parent\n"; + } elsif ($ucscChr eq $parents{$parent}) { + $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd); + } else { + printf STDERR "# ERR: start on different chrom $ucscChr than parent $parent $parents{$parent}\n"; + } + } + } elsif ($insertType =~ m/clone_insert_end/) { + $type = "end"; + if ($parent eq "notFound") { + printf STDERR "# ERR: can not find parent for end $name Ttype $id\n"; + } else { + if (!exists($parents{$parent})) { + printf STDERR "# ERR: 
end found $name with no parent $parent declared\n"; + } elsif (exists($ends{$parent})) { + printf STDERR "# ERR: duplicate end for $parent\n"; + } elsif ($ucscChr eq $parents{$parent}) { + $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd); + } else { + printf STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n"; + } + } + } elsif ($insertType =~ m/clone_insert/) { + $type = "insert"; + $endNames{$id} = $name; + } + $name =~ s/gi\|//g; + $id =~ s/gi\|//g; + printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6]; +} # while (my $line = ) + +close (FH); + +foreach my $parent (keys %parents) { + if (! exists($starts{$parent}) ) { + printf STDERR "# ERR: no start for $parent\n"; + } elsif (! exists($ends{$parent}) ) { + printf STDERR "# ERR: no end for $parent\n"; + } else { + my $strand = "+"; + my $chrStart = 0; + my $chrEnd = 0; + my $blockStart = 0; + my ($sStart, $sEnd) = split('\t', $starts{$parent}); + my ($eStart, $eEnd) = split('\t', $ends{$parent}); + my $startSize = $sEnd - $sStart; + my $endSize = $eEnd - $eStart; + if ($eStart < $sStart) { + $chrStart = $eStart; + $chrEnd = $sEnd; + $blockStart = $sStart - $chrStart; + $strand = "-"; + $startSize = $eEnd - $eStart; + $endSize = $sEnd - $sStart; + } else { + $chrStart = $sStart; + $chrEnd = $eEnd; + $blockStart = $eStart - $chrStart; + } + if ($startSize > $blockStart) { + printf STDERR "# startSize > blockStart $endNames{$parent}\n"; + } else { + printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart; + } + } +} +} +'_EOF_' + # << happy emacs + + chmod +x hg38.pl + + # process GFF files into bed files into separateLibs/ directory +for L in `cat library.list.txt` +do + export destDir="separateLibs/${L}" + echo "working: ${L}" 1>&1 + mkdir -p "${destDir}" + ./hg38.pl 
reports/${L}.GCF_000001405.26.106.*.gff \ + 2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/hg38.${L}.bed + sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/hg38.${L}.items.bed6 +done + + # use only those libraries with more than 20,000 clone ends + wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 > 20000' \ + | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list + + # note those libraries with less than 20,000 clone ends + wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' | sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list + + # filter out bad ends, length must be <= median size times three + cat libs.over20K.list | while read D +do + if [ ! -s separateLibs/${D}/lengths.txt ]; then + awk '{print $3-$2}' separateLibs/${D}/hg38.${D}.bed \ + > separateLibs/${D}/lengths.txt + fi + median3X=`ave separateLibs/${D}/lengths.txt | grep median | awk '{printf "%d", $2*3}'` + awk '($3-$2) < '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.median3X.bed + awk '($3-$2) >= '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.badMap.bed + before=`cat separateLibs/${D}/hg38.${D}.bed | wc -l` + after=`cat separateLibs/${D}/hg38.median3X.bed | wc -l` + dropped=`echo $before $after | awk '{print $1-$2}'` + perCent=`echo $dropped $before | awk '{printf "%.2f", 100*'$dropped/$before'}'` + echo "$D $before - $after = $dropped -> % $perCent dropped" +done + +# ABC20 24692 - 24474 = 218 -> % 0.88 dropped +# RP11 86660 - 85903 = 757 -> % 0.87 dropped +# CTD 95853 - 94941 = 912 -> % 0.95 dropped +# CH17 105618 - 105060 = 558 -> % 0.53 dropped +# ABC21 182154 - 180973 = 1181 -> % 0.65 dropped +# ABC22 189939 - 188743 = 1196 -> % 0.63 dropped +# COR02 208263 - 206782 = 1481 -> % 0.71 dropped +# ABC18 325080 - 322904 = 2176 -> % 0.67 dropped +# ABC27 334178 - 331822 = 2356 -> % 0.71 dropped +# ABC24 398944 - 395776 = 3168 -> % 0.79 dropped +# ABC23 436965 - 433896 = 3069 -> % 0.70 dropped +# ABC16 452220 - 
449101 = 3119 -> % 0.69 dropped +# COR2A 583008 - 578578 = 4430 -> % 0.76 dropped +# WI2 587165 - 582843 = 4322 -> % 0.74 dropped +# ABC7 649297 - 644071 = 5226 -> % 0.80 dropped +# ABC11 729962 - 724864 = 5098 -> % 0.70 dropped +# ABC9 755994 - 750648 = 5346 -> % 0.71 dropped +# ABC12 777816 - 771827 = 5989 -> % 0.77 dropped +# ABC10 787969 - 781331 = 6638 -> % 0.84 dropped +# ABC13 810822 - 803589 = 7233 -> % 0.89 dropped +# ABC14 845573 - 839126 = 6447 -> % 0.76 dropped +# ABC8 1204275 - 1192784 = 11491 -> % 0.95 dropped + + # loading the median3X files +for L in `cat libs.over20K.list` +do + echo $L 1>&2 + hgLoadBed -type=bed12 hg38 cloneEnd${L} \ + separateLibs/${L}/hg38.median3X.bed \ + > separateLibs/loadBed.${L}.log 2>&1 +done + + # loading the dropped ends: + mkdir /hive/data/genomes/hg38/bed/cloneEnds/droppedTooBig + # link them to here + cat ../libs.over20K.list | while read L +do + ln -s ../separateLibs/${L}/hg38.badMap.bed ${L}.badMap.bed +done + # then load + hgLoadBed -type=bed12 hg38 cloneEndbadEnds *.badMap.bed + + # construct multiple mapped ends: +for L in `cat libs.over20K.list` +do + cat separateLibs/${L}/hg38.median3X.bed +done | sort -k4 > allEnds.bed + + cut -f4 allEnds.bed | sort | uniq -c | sort -rn > allEnds.names.count.txt + + awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' \ + | sort > multiples.names.txt + + join -t' ' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" \ + -2 4 multiples.names.txt allEnds.bed | sort -k1,1 -k2,2n \ + > allEnds.multiple.locations.bed + + hgLoadBed -type=bed12 hg38 cloneEndmultipleMaps \ + allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1 + + awk '$6 == "+"' allEnds.bed | sort -k1,1 -k2,2n \ + | bedItemOverlapCount hg38 stdin > allEnds.forward.bedGraph + + awk '$6 == "-"' allEnds.bed | sort -k1,1 -k2,2n \ + | bedItemOverlapCount hg38 stdin > allEnds.reverse.bedGraph + + bedGraphToBigWig allEnds.forward.bedGraph \ + /hive/data/genomes/hg38/chrom.sizes \ + cloneEndcoverageForward.bw + 
+ bedGraphToBigWig allEnds.reverse.bedGraph \ + /hive/data/genomes/hg38/chrom.sizes \ + cloneEndcoverageReverse.bw + + mkdir /gbdb/hg38/bbi/cloneEnd + ln -s `pwd`/cloneEndcoverageForward.bw /gbdb/hg38/bbi/cloneEnd + ln -s `pwd`/cloneEndcoverageReverse.bw /gbdb/hg38/bbi/cloneEnd + + hgBbiDbLink hg38 cloneEndcoverageForward \ + /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageForward.bw + hgBbiDbLink hg38 cloneEndcoverageReverse \ + /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageReverse.bw + + ### Fixup the scores to indicate how many multiple mappings as mentioned + ### in the hg19 bacEnds description page: one mapping: score = 1000 + ### multiple mappings: score = 1500/count + ### the sort | uniq -c | awk does this score calculation with the name + ### in column 1 + ### The join puts the existing table together with those scores + ### DONE - 2015-06-18 - Hiram + + mkdir /hive/data/genomes/hg38/bed/cloneEnds/addCounts + cd /hive/data/genomes/hg38/bed/cloneEnds/addCounts + mkdir score withScore noScore withScore + for table in cloneEndABC10 cloneEndABC11 cloneEndABC12 cloneEndABC13 \ +cloneEndABC14 cloneEndABC16 cloneEndABC18 cloneEndABC20 cloneEndABC21 \ +cloneEndABC22 cloneEndABC23 cloneEndABC24 cloneEndABC27 cloneEndABC7 \ +cloneEndABC8 cloneEndABC9 cloneEndCH17 cloneEndCOR02 cloneEndCOR2A \ +cloneEndCTD cloneEndRP11 cloneEndWI2 cloneEndbadEnds cloneEndmultipleMaps +do + hgsql -N -e "select name from $table;" hg38 | sort | uniq -c | + awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \ + | sort > score/hg38.$table.score.tab + hgsql -N -e "select * from $table order by name;" hg38 \ + | sort -k5 > noScore/hg38.$table.tab + join -t'^I' -1 5 noScore/hg38.$table.tab score/hg38.$table.score.tab \ + | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \ + | sort -k2,2 -k3,3n > withScore/hg38.$table.withScore.tab + hgsql -e "delete from $table;" hg38 + hgsql -e "load data local infile 
\"withScore/hg38.$table.withScore.tab\" into table $table;" hg38 +done + +############################################################################## +# SIB Transcriptome (DONE 2014-08-27 Steve) + + # Create working directory and download data from where Christian + # Iseli (christian.iseli@unil.ch) put it, and unpack. + mkdir -p /hive/data/genomes/hg38/bed/sibTranscriptome + cd /hive/data/genomes/hg38/bed/sibTranscriptome + wget --timestamping http://ludwig-sun1.unil.ch/~chris/HTr.gtf.gz + wget --timestamping http://ludwig-sun1.unil.ch/~chris/txg.tar.gz + + tar -zxvf txg.tar.gz + + zcat HTr.gtf.gz | ldHgGene hg38 sibGene stdin + # Reading stdin + # Read 208508 transcripts in 2824960 lines in 1 files + # 208508 groups 25 seqs 1 sources 2 feature types + # 208508 gene predictions + + # Do a little data cleanup and transformation and load splice graphs + # into database. + sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql + cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \ + -sqlTable=sibTxGraph.sql hg38 sibTxGraph stdin + # Reading stdin + # Read 47817 elements of size 18 from stdin + # Sorted + # Creating table definition for sibTxGraph from sql: sibTxGraph.sql + # Saving bed.tab + # Loading hg38 + + # Create sibAltEvents track for analyzed alt-splices. 
+ # Not on RR for hg18 and hg19, so do not push it out + cat txg/*.txg | txgAnalyze stdin /cluster/data/hg38/hg38.2bit sibAltEvents.bed + awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed + hgLoadBed hg38 sibAltEvents foo.bed + # Reading foo.bed + # Read 452436 elements of size 6 from foo.bed + # Sorted + # Creating table definition for sibAltEvents, bedSize: 6 + # Saving bed.tab + # Loading hg38 + + # push sibGene and sibTxGraph for hg38 + +############################################################################ +# Orangutan Lastz run (DONE - 2014-05-27 - Hiram) + screen -S hg38PonAbe2 # use a screen to manage this longish running job + mkdir /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 + cd /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 + + # always set the BLASTZ program so we know what version was used + cat << '_EOF_' > DEF +# human vs chimp +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz +BLASTZ_O=600 +BLASTZ_E=150 +# maximum M allowed with lastz is only 254 +BLASTZ_M=254 + +BLASTZ_T=2 +BLASTZ_Y=15000 +BLASTZ_K=4500 +BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q +# A C G T +# 90 -330 -236 -356 +# -330 100 -318 -236 +# -236 -318 100 -330 +# -356 -236 -330 90 + +# TARGET: Human Hg38 +SEQ1_DIR=/scratch/data/hg38/hg38.2bit +SEQ1_LEN=/scratch/data/hg38/chrom.sizes +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 +SEQ1_IN_CONTIGS=0 + +# QUERY: Orangutan PonAbe2 +SEQ2_DIR=/hive/data/genomes/ponAbe2/ponAbe2.2bit +SEQ2_LEN=/hive/data/genomes/ponAbe2/chrom.sizes +SEQ2_CHUNK=10000000 +SEQ2_LAP=0 +SEQ2_LIMIT=100 +SEQ2_IN_CONTIGS=0 + +BASE=/hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 +TMPDIR=/dev/shm +'_EOF_' + + time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > do.log 2>&1 + # real 144m46.575s + cat fb.hg38.chainPonAbe2Link.txt + # 2719618310 bases of 3049335806 (89.187%) in intersection + + # filter with doRecipBest.pl + 
time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ + hg38 ponAbe2) > rbest.log 2>&1 + # real 60m1.060s + time (doRecipBest.pl -load -continue=load -workhorse=hgwdev \ + -buildDir=`pwd` hg38 ponAbe2) > loadRBest.log 2>&1 & + # real 3m35.834s + + cat fb.hg38.chainRBestPonAbe2Link.txt + # 2538296592 bases of 3049335806 (83.241%) in intersection + + # running the swap + mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap + cd /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap + time (doBlastzChainNet.pl -verbose=2 \ + -swap /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02/DEF \ + -chainMinScore=5000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > swap.log 2>&1 + # real 102m27.866s + cat fb.ponAbe2.chainHg38Link.txt + # 2773568958 bases of 3093572278 (89.656%) in intersection + + time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ + ponAbe2 hg38) > rbest.log 2>&1 + # real 78m47.312s + + + + +############################################################################# +# Add chrX alts to par (DONE 2014-10-14 angie) +# Thanks to Hiram for pointing out that intersecting chrX positions in +# altLocations and par shows whether a chrX alt overlaps a PAR. 
+ cd /hive/data/genomes/hg38/bed/par + hgsql hg38 -e 'select * from altLocations where chrom = "chrX"' +#+-----+-------+------------+----------+---------------------+ +#| bin | chrom | chromStart | chromEnd | name | +#+-----+-------+------------+----------+---------------------+ +#| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | +#| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | +#| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | +#+-----+-------+------------+----------+---------------------+ + hgsql hg38 -e 'select * from par where chrom = "chrX"' +#+-----+-------+------------+-----------+------+ +#| bin | chrom | chromStart | chromEnd | name | +#+-----+-------+------------+-----------+------+ +#| 9 | chrX | 10000 | 2781479 | PAR1 | +#| 221 | chrX | 155701382 | 156030895 | PAR2 | +#+-----+-------+------------+-----------+------+ + # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1; + # chrX_KI270881v1_alt is not in either PAR. + hgsql hg38 -e 'select chrom,size from chromInfo \ + where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' +#+---------------------+--------+ +#| chrom | size | +#+---------------------+--------+ +#| chrX_KI270880v1_alt | 284869 | +#| chrX_KI270913v1_alt | 274009 | +#+---------------------+--------+ + # Process that into bed4 with name=PAR1: + hgsql hg38 -NBe 'select chrom, 0, size, "PAR1" from chromInfo \ + where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' \ + >> hg38Par.bed4 + hgLoadBed hg38 par hg38Par.bed4 + checkTableCoords hg38 par + + +############################################################################# +# LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve) + mkdir /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-215 + cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 + + cat << '_EOF_' > DEF +# human vs cow +# maximum M allowed with lastz is only 254 +BLASTZ_M=254 + +# TARGET: Human hg38 +SEQ1_DIR=/scratch/data/hg38/hg38.2bit 
+SEQ1_LEN=/scratch/data/hg38/chrom.sizes +SEQ1_CHUNK=10000000 +SEQ1_LAP=10000 + +# QUERY: Cow bosTau8 +SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit +SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes +SEQ2_CHUNK=10000000 +SEQ2_LAP=0 + + +BASE=/hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 +TMPDIR=/scratch/tmp +'_EOF_' + # << happy emacs + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + `pwd`/DEF \ + -syntenicNet \ + -noLoadChainSplit \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 + # real 602m37.523s + cat fb.hg38.chainBosTau8Link.txt + # 1401921010 bases of 3049335806 (45.975%) in intersection + # Create link + cd /hive/data/genomes/hg38/bed + ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 + + # running the swap + mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap + cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap + time nice -n +19 doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \ + -swap -syntenicNet \ + -noLoadChainSplit \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 + # real 116m32.121s + cat fb.bosTau8.chainHg38Link.txt + # 1336307377 bases of 2649307237 (50.440%) in intersection + cd /hive/data/genomes/bosTau8/bed + ln -s blastz.hg38.swap lastz.hg38 + +############################################################################ +# NCBI ClinVar (new version -DONE - 2014-11-08 - Max) +# see hg19.txt +######################################################################### + +######################################################################## +# CNV Developmental Delay track (2014-11-21 Steve) + + mkdir /hive/data/genomes/hg38/bed/cnvDevDelay + cd /hive/data/genomes/hg38/bed/cnvDevDelay + +wget --timestamping 
'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd100_Coe_et_al_2014/gvf/nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz' +wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd54_Cooper_et_al_2011/gvf/nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz' + +cp /kent/src/hg/utils/automation/gvfToBed8Attrs.pl . +mv gvfToBed8Attrs.pl gvfToBed8AttrsCase.pl +cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl100.pl +cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl54.pl + +# made three local copies of Angie's gvf conversion script - one to include +# only case individuals from nstd100, one to include only control individuals +# from nstd100 and one to include only control individuals from nstd54 + +# had to add an additional elsif statement to the nstd100 scripts to filter +# based on sample_name field: + +# } elsif ($tag eq "sample_name") { +# $sample_name = $val; +# } + +# added line 33/35 to each file: + +# next if ($sample_name eq "Unknown"); # keep only "case" individuals from nstd100 +# next if ($sample_name ne "Unknown"); # keep only "control" individuals from nstd100 +# next if ($phenotype ne "not_reported"); # keep only "control" individuals from nstd54 + +zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsCase.pl > cnvDevDelayAllCase.bed +zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl100.pl > cnvDevDelayAllControl.bed +zcat nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl54.pl >> cnvDevDelayAllControl.bed + +# GRCh38 data from dbVar had different naming scheme for alternate chromosomes +# (e.g., chr1|NT_187515.1 instead of chr1_KI270762v1_alt), so needed to write +# a script to substitute the correct UCSC names + + cat << '_EOF_' > chromXref.pl +#!/usr/bin/env perl + +use strict; +use warnings; + +sub usage() { + printf STDERR "usage: ./chromXref.pl \n" +} + +my $argc = 
scalar(@ARGV); + +if ($argc != 2) { + usage; + exit 255; +} + +open (file1, ") { + chomp($line); + my ($type, $chr, $acc1, $acc2) = split('\t', $line); + ($type, undef) = split('-', $type); + ($acc1, my $version) = split('\.', $acc1); + if ($type eq "unlocalized") { + $type = "random"; + } + my $ucscAcc = "_" . $acc1 . "v" . $version . "_" . $type; + $accArray[$i][0] = $ucscAcc; + $accArray[$i][1] = $acc2; + $i++; +} + +close (file1); + +open (file2, "<$ARGV[0]") or die "cannot read $ARGV[0]"; +open (file3, ">$ARGV[1]") or die "cannot read $ARGV[1]"; +local $/; +my $fileContents = ; +for ($i = 0; $i < scalar(@accArray); $i++) { + my $temp1 = $accArray[$i][1]; + my $temp2 = $accArray[$i][0]; + if ($fileContents =~ m/\|$temp1/) { + $fileContents =~ s/\|$temp1/$temp2/g; + } +} + +print file3 $fileContents; +close (file2); +close (file3); +'_EOF_' + # << happy emacs + +cp /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt . + +cat GCF_000001405.26.assembly.txt | grep -v '^#\|assembled\|unplaced' | awk '{print $2 "\t" $3 "\t" $5 "\t" $7}' > hg38.xref + +chromXref.pl cnvDevDelayAllCase.bed cnvDevDelayAllCaseUcsc.bed +chromXref.pl cnvDevDelayAllControl.bed cnvDevDelayAllControlUcsc.bed + +hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ + -allowStartEqualEnd hg38 cnvDevDelayCase cnvDevDelayAllCaseUcsc.bed + +hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ + -allowStartEqualEnd hg38 cnvDevDelayControl cnvDevDelayAllControlUcsc.bed + + checkTableCoords hg38 cnvDevDelayCase + checkTableCoords hg38 cnvDevDelayControl + + +######################################################################### +# RETROFINDER RETROPOSED GENES ucscRetro track VERSION 9 +# (2015-01-12 - 2015-01-20, hartera, DONE) +ssh hgwdev +mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112 +cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112 + +cat << '_EOF_' > DEF + +RETRO_OPTIONS="-verbose=4 -minAli=0.98 
-nearTop=0.005 " +VERSION=9 +RUNDATE="2015-01-12" +DB=hg38 +SCORETHRESH=550 +GENOMENAME='Homo sapiens' +GBDB=hg +DATE=20150112 +RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE +BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin +KENTDIR=/cluster/home/hartera/kent +KENTBINDIR=/cluster/home/hartera/bin/x86_64 +MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION +TMPMRNA=$RUNDIR/mrnaBlastz/$DB +TMPEST=$RUNDIR/est/$DB +USEALTSEQS=0 +EST=all_est +SPLICED_EST=intronEst +SPLIT_EST=0 +SPLIT_SPLICED_EST=0 +LASTZPROG=/cluster/bin/penn/x86_64/lastz +SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline +GENOME=/hive/data/genomes +RETRODIR=$GENOME/$DB/bed/retro +BASE=$RUNDIR/retro +OUTDIR=${BASE}/version${VERSION}/${DB} +RESULT=$OUTDIR/result +RESULTSPLIT=$OUTDIR/resultSplit +LOG=$OUTDIR/log +OUT=$OUTDIR/out +OVERLAPDIR=$OUTDIR/run.o +TABLE=ucscRetroInfo$VERSION +ORTHOTABLE=ucscRetroOrtho$VERSION +ALIGN=ucscRetroAli$VERSION +LOCAL=/scratch/data/$DB +TWOBIT=$GENOME/$DB/$DB.2bit +RMSK=rmsk +NET1=netMm10 +NET2=netCanFam3 +NET3=netRheMac3 +# these two nets determine which retros are classified as ancient, +# use two farthest nets +ANCIENT1=netMm10 +ANCIENT2=netCanFam3 +GENE1=knownGene +GENE2=refGene +GENE3=wgEncodeGencodeCompV19 +CLUSTER=ku +SPECIES="hg38 mm10" +ROOTDIR="/cluster/home/hartera/public_html/retro/hg38Jun14" +WEBROOT=$ROOTDIR/retro.$VERSION +WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu +SHUFFLEDIR=shuffle +SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR +DUPDIR=dups +DUPROOT=$WEBROOT/$DUPDIR +AGEDIR=age +AGEROOT=$WEBROOT/$AGEDIR +EXPDIR=exp +GENEPFAM=knownGene +PFAM=knownToPfam +PFAMIDFIELD=name +PFAMDOMAIN=value +ALTSPICE= +#ALTSPLICE=sibTxGraph +SPLITBYAGE=$SCRIPT/splitRetrosByAge +PDB=proteins140122 +#ARRAY=gnfAtlas2 +#AFFYPROBE="affyU133A,affyGnf1h" +#ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median +#ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio +#ARRAYABS=hgFixed.gnfHumanAtlas2All +#ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps 
+#ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps +#ARRAYLOOKUP=knownToGnfAtlas2 +#ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl" +'_EOF_' + # << happy emacs +chmod +x DEF + +mkdir -p /hive/data/genomes/hg38/bed/retro +mkdir -p /hive/data/genomes/hg38/bed/mrnaBlastz.9 +mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz +cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz +cp ../DEF . + +# Create S1.len file from chrom.sizes without random chroms or chrM, there are many alt loci also +# in hg38 that were not in hg19 so 285 chroms total. +cat /hive/data/genomes/hg38/chrom.sizes | grep -v random \ + | grep -v chrUn | grep -v chrM > S1.len +cp S1.len /hive/data/genomes/hg38/bed/mrnaBlastz.9 + +screen +# Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree: +retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF +# check cluster jobs on ku +retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF +retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF +#check cluster jobs on ku +retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF +#check cluster jobs on ku + # Load the track +retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF +cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38 +retroFinder/branches/version2/src/pipeline/filterMrna.sh +retroFinder/branches/version2/src/pipeline/filterEst.sh +# Check cluster jobs on ku +retroFinder/branches/version2/src/pipeline/analyseExpress.sh +# Check cluster jobs on ku +#added ucscRetroAli9 to kent/src/hg/makeDb/human/hg38/trackDb.ra +# copied +# /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38/trackDb.retro +# entry to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra and edited it to +# remove the full date and add: +# dataVersion Jan. 
2015 +# Scripts copied ucscRetroAli9.psl, ucscRetroInfo9.bed and ucscRetroCds9.tab +# to /hive/data/genomes/hg38/bed/retro/ + +########## +# Make dbVar chrom to UCSC chrom lift file +# DONE braney 2/12/15 +cd /cluster/data/hg38/jkStuff +sort /cluster/data/hg38/chrom.sizes > tmpChrom +grep -v '^#\|assembled' /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt | awk 'BEGIN {OFS="\t"} {print "chr" $3 "_" $5 "_" $2, "chr" $3 "|"$7}' | sed 's/-scaffold//' | sed 's/unlocalized/random/' | sed 's/_unplaced//' | sed 's/chrna/chrUn/g' | sed 's/\./v/' | sort | join /dev/stdin tmpChrom | awk 'BEGIN {OFS="\t"} {print 0, $2, $3, $1, $3}' > dbVar.lift +awk 'BEGIN {OFS="\t"} {print 0, $1, $2, $1, $2}' /cluster/data/hg38/chrom.sizes >> dbVar.lift +rm tmpChrom + +######################################################################### +# UCSC to RefSeq name correspondence (DONE - 2015-04-13 - Hiram) + + mkdir /hive/data/genomes/hg38/bed/ucscToRefSeq + cd /hive/data/genomes/hg38/bed/ucscToRefSeq + + # columns 5 and 7 are the INSDC and RefSeq names + + grep -v "^#" ../../genbank/GCF_000001405.26.assembly.txt \ + | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' | sort > insdc.refSeq.tab + + hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' hg38 \ + | sort > insdc.ucsc.tab + + join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \ + | cut -f2- > ucsc.refSeq.tab + + + export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1` + sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ + | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql + hgLoadSqlTab hg38 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab + + checkTableCoords hg38 -table=ucscToRefSeq + +######################################################################### +#CREATE MICROSAT TRACK (DONE - 2015-05-22 - Hiram) + ssh hgwdev + mkdir /cluster/data/hg38/bed/microsat + cd /cluster/data/hg38/bed/microsat + + awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 
{printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ + ../simpleRepeat/simpleRepeat.bed > microsat.bed + + hgLoadBed hg38 microsat microsat.bed + +############################################################################# +# ENCODE Regulatory tracks (Kate & Chris) + +# see reg.txt +######################################################################### +# GWIPS-viz Ribo-seq - (DONE - 2016-02-05 - Steve) +# contact Audrey Michel (audreymannion@gmail.com) +# redmine #16765 + +obtained bigWig file from shared Google drive +https://drive.google.com/a/soe.ucsc.edu/folderview?id=0B_xvV_5tXzOGQ1h5NEh4bnhNTDg&usp=sharing_eid + +mkdir /hive/data/genomes/hg38/bed/gwipsvizRiboseq +cp Global_RiboProElong.10_02_2016.bw /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw + +mkdir /gbdb/hg38/bbi/gwipsvizRiboseq +cd /gbdb/hg38/bbi/gwipsvizRiboseq +ln -s /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw gwipsvizRiboseq.bw + +hgsql hg38 +create table gwipsvizRiboseq select * from gc5BaseBw; +update gwipsvizRiboseq set fileName="/gbdb/hg38/bbi/gwipsvizRiboseq/gwipsvizRiboseq.bw" where fileName="/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw"; + +######################################################################### +# COSMIC v81 DONE Chris Eisenhart 2017-05-11 +# Make a new COSMIC track for hg38 +mkdir /hive/data/outside/cosmic/hg38/v81 +cd /hive/data/outside/cosmic/hg38/v81 + +# Get the new data +sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk +# Login to SFTP server then run these commands +get /files/grch38/cosmic/v81/CosmicMutantExport.tsv.gz + +# Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts. +zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv + +# Use a script to convert to bed format. +cosmicToBed cosMut.tsv cosMut.bed +# This many lines were skipped, 131597 for not having genomic coordinate + +# Sort and convert to big bed using the .as file. 
+sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed +bedToBigBed -type=bed4+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V81.bb -tab -extraIndex=name,cosmLabel + +# Link it up so the outside world can see it. +cd /gbdb/hg38/cosmic/ +ln -s /hive/data/outside/cosmic/hg38/v81/cosMutHg38V81.bb . +######################################################################### +# hoffmanMappability hub import (2 super tracks) DONE Chris Eisenhart 2017-05-16 +mkdir /hive/data/outside/hoffmanMappability/hg38 +cd /hive/data/outside/hoffmanMappability/hg38 +wget https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/trackDb.txt +# Get the trackDb file +importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ --test +# Check that the commands are what we want, then run for real +importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ +# View the .ra file to make sure things are ok, here changed the groups to map, +# added the alpha tags, and removed the 'show' from 'superTrack on show' +cp hofMap.ra ~/kent/src/hg/makeDb/trackDb/human/hg38 +# Include hofMap.ra in the trackDb.ra file + +# the importTrackHub failed on redirection, fetch all the files manually: +# 2017-09-15 - Hiram + +cd /hive/data/outside/hoffmanMappability/hg38 + +grep bigDataUrl trackDb.txt | awk '{print $NF}' | sed -e 's#https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/##;' | while read F +do + echo $F + rm -f $F + wget --timestamping "https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/${F}" +done + # real 29m40.429s + +######################################################################### +# tcgaExpr super track Chris Eisenhart, DONE, 2017-05-17 +# tcgaTranscExpr +# TCGA transcript level expression barChart track, from TOIL pipeline recompute (John Vivian) +# 
biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf +mkdir /hive/data/outside/tcgaBarcharts/ +mkdir /hive/data/outside/tcgaBarcharts/transcripts +cd /hive/data/outside/tcgaBarcharts/transcripts + +# Get all the meta data +cp ~max/projects/cirm/datasetPages/tcgaGtex/tcgaMeta.tab . +# Cut out the meta data the script wants, sample name and group. +cut -f 1,5 tcgaMeta.tab | sed 's/ /_/g' > tcgaLargeSamples.tsv + +# Get and clean the matrix +cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.tpm.tab . +# Clean up the transcript names (remove the .#) +cut -f 1 tcga.tpm.tab | cut -f 1 -d "." > tcgaTranscripts.txt +cut -f 2- tcga.tpm.tab > tcgaTpmValues.tsv +paste tcgaTranscripts.txt tcgaTpmValues.tsv > tcgaMatrix.tsv + +# Build a coordinate map +hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene +hgsql hg38 -e "select * from ensemblToGeneName" | sort > ensemblToGeneName +join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed + +# Use the meta data, matrix, and coordinate map to generate a barchart bed +time expMatrixToBarchartBed tcgaLargeSamples.tsv tcgaMatrix.tsv coord.bed tcgaTransExp.bed --groupOrder tcgaGroupOrder.txt + +# NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb. +# The order of the labels in the barChartBars field should match the order of the labels in the +# expScores column in the bed file header. + +# Sort and convert into a bigBed file. +sort -k1,1 -k2,2n tcgaTransExp.bed > sortedTcgaTransExp.bed +bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartTranscExp.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTransExp.bb + +# Link the files into gbdb +cd /gbdb/hgFixed/human/expMatrix +ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaLargeSamples.tsv tcgaLargeSamples.tab +ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaMatrix.tsv tcgaMatrix.tab +ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaTransExp.bb . 
+ +###########3 +# Reload bigBed with a schema that will be shared with genes track, to support +# configuration as subtracks in a composite +# (2007-08-30 kate) +cd /hive/data/outside/tcgaBarcharts/transcripts +bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTranscExpr.hg38.bb +mkdir /gbdb/hg38/tcga +ln -s `pwd`/tcgaTranscExpr.hg38.bb /gbdb/hg38/tcga/tcgaTranscExpr.bb + +# TCGA gene level expression barChart track, from TOIL pipeline recompute (John Vivian) +# tcgaGeneExpr +mkdir ../genes +cd ../genes + +# Get the gene matrix. +cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.geneTpm.tab . + +# Make a coordinate file, the genes in gtexGeneModelV6 have .# versions which are +# removed with the temp fils. +hgsql hg38 -e "select * from hg38.gtexGeneModelV6" | awk '{print $3"\t"$5"\t"$6"\t"$2"\t0\t"$4"\t"$2}' > coord6+1.bed.temp +cut -f 4 coord6+1.bed.temp | cut -f 1 -d "." > foo +cut -f 1-3 coord6+1.bed.temp > foo2 +paste foo2 foo > foo3 +cut -f 5- coord6+1.bed.temp > foo4 +paste foo3 foo4 > coord6+1.bed +# This bed file didn't have the right gene names (ENS rather than Hugo), fix it. +hgsql hg38 -e "select * From knownCanonical" > foo +wc foo +cut -f 6 foo | cut -f 1 -d "." +cut -f 6 foo | cut -f 1 -d "." 
> foo2 +head foo +cut -f 1-3 foo > foo3 +paste foo2 foo3 > foo4 +cut -f 4- coord6+1.bed > foo5 +join <(sort foo5) <(sort foo4) | awk '{print $5"\t"$6"\t"$7"\t"$1"\t0\t"$3"\t"$4}' > coord6+1.3.bed + +# Generate the bed file, can use the same transcript file +time expMatrixToBarchartBed ../transcripts/tcgaLargeSamples.tsv tcga.geneTpm.tab coord6+1.3.bed tcgaGeneExp.bed --groupOrder=../transcripts/tcgaGroupOrder.txt + +# Convert to big bed +sort -k1,1 -k2,2n tcgaGeneExp.bed > sortedTcgaGeneExp.bed +bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExp.as sortedTcgaGeneExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExp.bb + +# Link to gbdb +cd /gbdb/hgFixed/human/expMatrix +ln -s /hive/data/outside/tcgaBarcharts/genes/tcgaGeneExp.bb . +ln -s /hive/data/outside/tcgaBarcharts/genes/tcga.geneTpm.tab tcgaGeneMatrix.tab + +###########3 +# Reload bigBed with a schema that will be shared with transcript track, to support +# configuration as subtracks in a composite +# Apparently Chris actually loaded the #3 file (added gene names, adjusted end coord apparently) +# (2007-08-30 kate) +cd /hive/data/outside/tcgaBarcharts/genes +bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaGeneExp3.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExpr.hg38.bb +mkdir /gbdb/hg38/tcga +ln -s `pwd`/tcgaGeneExpr.hg38.bb /gbdb/hg38/tcga/tcgaGeneExpr.bb + +######################################################################### +# gtexTransExp Chris Eisenhart, done, 2017-05-23 +# TCGA transcript level RNA-seq, from TOIL pipeline recompute (John Vivian) +# biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf +mkdir /hive/data/outside/gtex/barChartTrack +cd /hive/data/outside/gtex/barChartTrack + +# Seems John included some TCGA data (CML) in the GTEx matrix and samples, the cleaning steps remove this. 
+# Make a clean sample file +cat ../johnVivianRecompute/sraToSample.txt | sed 's/ male /\tmale\t/g' | sed 's/ female /\tfemale\t/g' | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' > gtexSampleGroups.txt +cat ../johnVivianRecompute/sraToSample.txt | cut -f 1 -d " " > gtexSampleNames.txt +paste gtexSampleNames.txt gtexSampleGroups.txt > gtexSamples.txt +grep -v '(CML)' gtexSamples.tsv > cleanGtexSamples.tsv + +# Make a clean matrix +cut -f 1 ../johnVivianRecompute/gtex.tpm.tab | cut -f 1 -d "." > gtexTranscripts.txt +cut -f 2- ../johnVivianRecompute/gtex.tpm.tab > gtexTpmValues.tsv +paste gtexTranscripts.txt gtexTpmValues.tsv > gtexMatrix.tsv +rowsToCols gtexMatrix.tsv tspsdGtexMatrix.tsv +sort tspsdGtexMatrix.tsv > sortedTspsdGtexMatrix.tsv +grep -v '(CML)' gtexSamples.tsv | cut -f 1 | sed 's/Run_s/#transcript/g' | sort > sortedCleanGtexSamples.tsv +join sortedCleanGtexSamples.tsv sortedTspsdGtexMatrix.tsv > cleanTspsdGtexMatrix.tsv +rowsToCols cleanTspsdMatrix.tsv cleanGtexMatrix.tsv + +# Build a coordinate map +hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene +hgsql hg38 -e "select * from ensemblToGeneName" | sort > ensemblToGeneName +join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed +# NOTE: CHRISL10-05-2021 - the above ensGene steps weren't actually done or the files were removed, +# there was a coord.tsv which I used instead so the below re-run could work +tawk '{print $1,$2,$3,$4,0,$5,$6}' coord.tsv > coord.bed +# END CHRISL10-05-2021 NOTE) + +# Get the gtex ordering +hgsql hgFixed -e "select * from gtexTissue" | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' | sed '1D' > gtexGroupOrder.txt + +# Use the meta data, matrix, and coordinate map to generate a barchart bed +# NOTE: CHRISL10-05-2021 - re-ran this step to fix float parsing bug: +time expMatrixToBarchartBed cleanGtexSamples.tsv cleanGtexMatrix.tsv coord.bed gtexTransExp.bed --groupOrderFile gtexGroupOrder.txt + +# NOTE: Use the header line of the 
bed file to populate the barChartBars field in the trackDb. +# The order of the labels in the barChartBars field should match the order of the labels in the +# expScores column in the bed file header. + +# Sort and convert into a bigBed file. +sort -k1,1 -k2,2n gtexTransExp.bed > sortedGtexTransExp.bed +# NOTE: CHRISL10-05-2021 - re-ran bedToBigBed step with correct file names +bedToBigBed -as=$HOME/kent/src/hg/lib/barChartBed.as -type=bed6+5 sortedGtexTransExp.bed /hive/data/genomes/hg38/chrom.sizes gtexTranscExpr.bb + +# Link the files into gbdb +cd /gbdb/hgFixed/human/expMatrix +ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexSamples.tsv cleanGtexSamples.tab +ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexMatrix.tsv cleanGtexMatris.tab + +# <2007-08-30 kate) +cd /gbdb/hg38/gtex +ln -s /hive/data/outside/gtex/barChartTrack/gtexTranscExpr.bb . + +######################################################################### +# LASTZ human/hg38 vs. Zebrafish /danRer11 +# (DONE - 2017-06-12 - Chris) + + mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 + cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 + + printf '# human vs zebrafish danRer11 +BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz +BLASTZ_M=254 + +# TARGET: human hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=40000000 +SEQ1_LIMIT=20 +SEQ1_LAP=10000 + +# QUERY: zebrafish danRer11 +SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit +SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LIMIT=200 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 +TMPDIR=/dev/shm +' > DEF + + time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=3000 -chainLinearGap=medium \ + -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ + -noDbNameCheck -syntenicNet) > do.log 2>&1 + # real 3327m39.074s + + cat fb.hg38.chainDanRer11Link.txt + # 41036733 bases of 3049335806 (1.346%) in intersection + + 973293331 bases of 3049335806 (31.918%) in intersection + + time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg38 danRer11) \ + > rbest.log 2>&1 & + + # and for the swap: + mkdir /hive/data/genomes/danRer11/bed/blastz.hg38.swap + cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap + + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \ + -swap -chainMinScore=3000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -noDbNameCheck -syntenicNet) > swap.log 2>&1 + # real 39m24.916s + + cat fb.danRer11.chainHg38Link.txt + # 47869194 bases of 1674677181 (2.858%) in intersection + + time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \ + > rbest.log 2>&1 & + # real 638m45.337s +_EOF_ +######################################################################### +# refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie +# previously done 2017-08-01 by Chris E + +mkdir /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29 +cd /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29 + +# NOTE FOR NEXT TIME: instead of using interim GFF, in the future these annotations might be +# folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by +# doNcbiRefSeq.pl. 
+wget ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/GFF_interim/interim_GRCh38.p11_top_level_2017-06-27.gff3.gz + +# Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to hg38 chrom names +hgsql hg38 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \ +> refSeqToChrom.tab +cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab + +# Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class) +# to identify Functional Elements and swap in hg38 chrom names. +# Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an +# hg38 chrom. Use grep -f chrom.tab to filter out patch contig annotations. +zcat interim_GRCh38.p11_top_level_2017-06-27.gff3.gz \ +| grep -P "(\t(CAAT_signal|GC_rich_promoter_region|TATA_box|enhancer|insulator|locus_control_region|mobile_genetic_element|origin_of_replication|promoter|protein_binding_site|recombination_feature|regulatory_region|repeat_region|sequence_feature|sequence_secondary_structure|silencer|stem_loop)\t|regulatory_class=)" \ +| subColumn -miss=/dev/null 1 stdin refSeqToChrom.tab stdout \ +| grep -f chrom.tab > funcElems.gff +wc -l funcElems.gff +#5756 funcElems.gff + +# Transform GFF to BED+ +~/kent/src/hg/utils/automation/parseRefSeqFuncElems funcElems.gff /dev/stdout \ +| sort -k1,1 -k2n,2n > refSeqFuncElems.bed +wc -l refSeqFuncElems.bed +#5756 refSeqFuncElems.bed + +# Make bigBed and link from /gbdb +bedToBigBed -tab -type=bed9+7 -as=$HOME/kent/src/hg/lib/refSeqFuncElems.as \ + refSeqFuncElems.bed /hive/data/genomes/hg38/chrom.sizes refSeqFuncElems.bb +rm -f /gbdb/hg38/ncbiRefSeq/refSeqFuncElems.bb +ln -s `pwd`/refSeqFuncElems.bb /gbdb/hg38/ncbiRefSeq/ + +################################################################### +# cosmicRegions (DONE 2017-08-03 Chris) +# Make a new COSCMIC track for hg38 v82 +mkdir /hive/data/outside/cosmic/hg38/v82 +cd /hive/data/outside/cosmic/hg38/v82 + +# Get the new data +sftp 
ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk +# Login to SFTP server then run these commands +get /files/grch38/cosmic/v82/CosmicMutantExport.tsv.gz + +# Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts. +zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv + +# Use a script to convert to bed format. +cosmicToBed cosMut.tsv cosMut.bed +# This many lines were skipped, 134601 for not having genomic coordinate + +# Sort and convert to big bed using the .as file. +sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed +bedToBigBed -type=bed8+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V82.bb -tab -extraIndex=name,cosmLabel + + +# Link it up so the outside world can see it. +cd /gbdb/hg38/cosmic/ +ln -s /hive/data/outside/cosmic/hg38/v82/cosMutHg38V82.bb . + +######################################################################### +# RepeatMasker Visualization track update (DONE - 2018-05-04 - ChrisL) + screen -S rmskJoined.2018-05-04 + mkdir /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 + cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 + + ln -s ../repeatMasker/hg38.sorted.fa.out . + ln -s ../repeatMasker/hg38.fa.align.gz . 
+ + # this script points to the most recent RepeatMasker version: + time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ + -out hg38.sorted.fa.out -align hg38.fa.align.gz) > do.log 2>&1 & + + # no differences, forgot to remake rmsk files + # so instead remake the rmsk track and try again + mkdir /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04 + cd /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04 + + # remake the sorted.fa.out and fa.align.gz, stop after masking + # so rmsk table isn't overwritten + time (doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \ + -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38) > mask.log 2>&1 & + # RepeatMasker bug?: Undefined id, line 1440295 of input: + # 10 26.1 0.0 0.0 chr13 114292339 114292382 (71946) C L1P4 LINE/L1 (17) 6149 6106 + # RepeatMasker bug?: Undefined id, line 3529762 of input: + # 992 2.3 0.5 0.0 chr3 180461254 180462048 (17833511) C L1PA3 LINE/L1 (3) 6152 5354 + # RepeatMasker bug?: Undefined id, line 3529763 of input: + # 1153 3.2 0.2 0.0 chr3 180462043 180463006 (17832553) + L1PA3 LINE/L1 4392 5357 (789) + # RepeatMasker bug?: Undefined id, line 5303571 of input: + # 220 22.5 0.0 17.7 chr9 105798076 105799127 (32595590) C SATR2 Satellite (4) 866 1 + # real 643m17.617s + + # get rid of the missing id items: + grep -v "114292339 114292382\|180461254 180462048\|180462043 180463006\|105798076 105799127" \ + hg38.fa.out > clean.hg38.fa.out + mv clean.hg38.fa.out hg38.fa.out + + # finish the last step of doCat.csh: + /cluster/bin/scripts/extractNestedRepeats.pl hg38.fa.out | sort -k1,1 -k2,2n > hg38.nestedRepeats.bed + + cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 + + rm hg38.sorted.fa.out + rm hg38.fa.align.gz + rm *.tsv + ln -s ../repeatMasker.2018-05-04/hg38.sorted.fa.out . + ln -s ../repeatMasker.2018-05-04/hg38.fa.align . 
+ + # and then re-run + time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ + -out hg38.sorted.fa.out -align hg38.fa.align.gz) > rerun.log 2>&1 & + # real 141m7.268s + + # confirm the counts are different from the previous version: + # wc -l ../rmskJoined/hg38.fa.align.tsv ../rmskJoined/hg38.sorted.fa.join.bed ../rmskJoined/hg38.sorted.fa.out.tsv + 7203858 ../rmskJoined/hg38.fa.align.tsv + 4607727 ../rmskJoined/hg38.sorted.fa.join.bed + 5520118 ../rmskJoined/hg38.sorted.fa.out.tsv + 17331703 total + # wc -l *.tsv + 7227245 hg38.fa.align.tsv + 4828114 hg38.sorted.fa.join.tsv + 5916189 hg38.sorted.fa.out.tsv + 17971548 total + + hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ + -renameSqlTable -verbose=4 -tab \ + -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \ + rmskJoinedCurrent hg38.sorted.fa.join.tsv \ + > loadJoined.log 2>&1 + + hgLoadSqlTab hg38 rmskAlignCurrent \ + /cluster/home/chmalee/kent/src/hg/lib/rmskAlign.sql \ + hg38.fa.align.tsv > loadAlign.log 2>&1 + + hgLoadOutJoined -verbose=2 -table=rmskOutCurrent hg38 hg38.sorted.fa.out > loadOut.log 2>&1 + + featureBits -countGaps hg38 rmskJoinedCurrent + # 2796899855 bases of 3209286105 (87.150%) in intersection +######################################################################### +# Hi-C Visualization based on Krietenstein 2019 (DONE - 2019-10-07 - Jonathan) +mkdir -p /hive/data/genomes/hg38/bed/hic +cd /hive/data/genomes/hg38/bed/hic + +# Files are located on 4D Nucleome (data.4dnucleome.org). The URL for the paper on that +# site is https://data.4dnucleome.org/publications/b13590b2-a341-4e5e-ad5e-72e233b32e9d/. 
+# The four file IDs downloaded below are for contact matrix .hic files created for +# different cell-line/protocol combinations +wget 'https://data.4dnucleome.org/files-processed/4DNFI2TK7L2F/@@download/4DNFI2TK7L2F.hic' # H1-hESC Micro-C XL +wget 'https://data.4dnucleome.org/files-processed/4DNFIQYQWPF5/@@download/4DNFIQYQWPF5.hic' # H1-hESC in situ +wget 'https://data.4dnucleome.org/files-processed/4DNFI18Q799K/@@download/4DNFI18Q799K.hic' # HFFc6 Micro-C XL +wget 'https://data.4dnucleome.org/files-processed/4DNFIFLJLIS5/@@download/4DNFIFLJLIS5.hic' # HFFc6 in situ + +printf "All files were downloaded from the 4D Nucleome Data Portal at data.4dnucleome.org. +These are processed contact matrices from Krietenstein et al. (2019) Ultrastructural details +of mammalian chromosme architecture. (https://www.biorxiv.org/content/10.1101/639922v1). + +4DNFI2TK7L2F.hic - Micro-C XL data set on H1-hESC +4DNFIQYQWPF5.hic - in situ Hi-C data set on H1-hESC +4DNFI18Q799K.hic - Micro-C XL data set on HFFc6 +4DNFIFLJLIS5.hic - in situ Hi-C data set on HFFc6" > README.txt + +mkdir -p /gbdb/hg38/bbi/hic +cd /gbdb/hg38/bbi/hic +ln -s /hive/data/genomes/hg38/bed/hic/* . + + +######################################################################### +# LASTZ Self/hg38 (DONE 2020-02-11 - Angie) + # RM #24695 + # Re-run with updated process to include pslDropOverlap . 
+ # Use "contigs" from previous run lastzSelf.2014-01-25/hg38.self.2bit + + screen -S hg38Self -t hg38Self + mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 + cat << _EOF_ > DEF +# human vs human with mouse defaults +BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz + +# TARGET: Human hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit +SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes +SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Human hg38 +SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit +SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes +SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 +TMPDIR=/dev/shm +_EOF_ + + # NOTE FOR NEXT TIME: use -chainMinScore=10000 (at least), not 3000 + + ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -stop=net >& do.log & + tail -f do.log + + + # After two days, 4 jobs are running, one of which (part014.lst vs itself) crashed with + # out-of-mem error. After 3 days, 3 jobs completed but part014.lst runs lastz out of mem. + # Split part014.lst up into components, run on hgwdev (more mem). 
+ mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014 + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014 + mkdir psl + cp /dev/null jobList + for t in $(cat ../tParts/part014.lst); do + tBase=$(basename $t) + for q in $(cat ../tParts/part014.lst); do + qBase=$(basename $q) + echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $t $q ../../DEF {check out exists psl/${tBase}_${qBase}.psl }" >> jobList + done + done + para create jobList + para try, check, push, etc, + # 94 of the jobs ran for 12s or less. The other 6 are chr{X_Y}_00 vs. self & each other, + # chr13_16 vs self and chr16_03 vs self. All but chr16_03 vs self completed in < 6 minutes. +#Completed: 99 of 100 jobs +#Crashed: 1 jobs +#CPU time in finished jobs: 1559s 25.98m 0.43h 0.02d 0.000 y +#IO & Wait Time: 248s 4.14m 0.07h 0.00d 0.000 y +#Average job time: 18s 0.30m 0.01h 0.00d +#Longest finished job: 321s 5.35m 0.09h 0.00d +#Submission to last job: 94681s 1578.02m 26.30h 1.10d + + # Dang, chr16_03 vs. self still runs out of mem even on hgwdev. 
+ mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03 + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03 + twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 \ + chr16_03.fa + faSplit -lift=chr16_03.lift size chr16_03.fa 169000 chr16_03_split_ + faToTwoBit chr16_03_split_*.fa chr16_03_split.2bit + twoBitInfo chr16_03_split.2bit stdout | sort -k2nr > chr16_03_split.sizes + sed -re 's@CTGDIR.*@CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.2bit@; + s@CTGLEN.*@CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.sizes@;' \ + ../../../DEF > DEF.split + mkdir psl + cwd=$(pwd) + while read tBase tSize; do + while read qBase qSize; do + echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $cwd/chr16_03_split.2bit:$tBase:0-$tSize $cwd/chr16_03_split.2bit:$qBase:0-$qSize DEF.split {check out exists psl/${tBase}_${qBase}.psl}" + done < chr16_03_split.sizes + done < chr16_03_split.sizes > jobList + para create jobList + para try, check, push, etc, +#Completed: 100 of 100 jobs +#CPU time in finished jobs: 142614s 2376.89m 39.61h 1.65d 0.005 y +#IO & Wait Time: 167s 2.79m 0.05h 0.00d 0.000 y +#Average job time: 1428s 23.80m 0.40h 0.02d +#Longest finished job: 22861s 381.02m 6.35h 0.26d +#Submission to last job: 22874s 381.23m 6.35h 0.26d + # 6 hours for chr16_03_split_00 vs. itself. ~4.5h for _09 vs _00. + cat psl/*.psl \ + | liftUp -nohead -type=.psl stdout \ + chr16_03.lift error stdin \ + | liftUp -nohead -type=.psl -pslQ \ + ../psl/hg38.self.2bit:chr16_03:0-1689648_hg38.self.2bit:chr16_03:0-1689648.psl \ + chr16_03.lift error stdin + + cd .. 
+ cat psl/* > ../../psl/part014.lst/part014.lst_part014.lst.psl + + # Make run.time file or doBlastzChainNet.pl won't continue: + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz + para time >& run.time + + # Resume doBlastzChainNet.pl: + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 + ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -continue=cat -stop=net >& do2.log & + tail -f do2.log +#Batch failed after 4 tries on chain.csh part016.lst chain/part016.lst.chain +#Command failed: +#ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/doChainRun.csh + + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run + para problems + # mostly these: +#errAbort re-entered due to out-of-memory condition. Exiting. + # one job made it through errAbort: +#needLargeMem: Out of memory - request size 564838920 bytes, errno: 12 + para time +#Completed: 59 of 68 jobs +#Crashed: 9 jobs +#CPU time in finished jobs: 24727s 412.12m 6.87h 0.29d 0.001 y +#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y +#Average job time: 409s 6.82m 0.11h 0.00d +#Longest finished job: 2350s 39.17m 0.65h 0.03d +#Submission to last job: 2462s 41.03m 0.68h 0.03d + para crashed +#chain.csh part012.lst {check out line+ chain/part012.lst.chain} +#chain.csh part017.lst {check out line+ chain/part017.lst.chain} +#chain.csh part016.lst {check out line+ chain/part016.lst.chain} +#chain.csh part015.lst {check out line+ chain/part015.lst.chain} +#chain.csh part014.lst {check out line+ chain/part014.lst.chain} +#chain.csh hg38.self.2bit:chr1_10: {check out line+ chain/hg38.self.2bit:chr1_10:.chain} +#chain.csh hg38.self.2bit:chr10_05: {check out line+ chain/hg38.self.2bit:chr10_05:.chain} +#chain.csh hg38.self.2bit:chr7_00: {check out line+ 
chain/hg38.self.2bit:chr7_00:.chain} + + # Run the jobs outside of parasol (~11h): + csh -efx chain.csh part012.lst chain/part012.lst.chain & + csh -efx chain.csh part017.lst chain/part017.lst.chain & + csh -efx chain.csh part016.lst chain/part016.lst.chain & + csh -efx chain.csh part015.lst chain/part015.lst.chain & + csh -efx chain.csh part014.lst chain/part014.lst.chain & + csh -efx chain.csh hg38.self.2bit:chr1_10: chain/hg38.self.2bit:chr1_10:.chain & + csh -efx chain.csh hg38.self.2bit:chr10_05: chain/hg38.self.2bit:chr10_05:.chain & + csh -efx chain.csh hg38.self.2bit:chr7_00: chain/hg38.self.2bit:chr7_00:.chain & + csh -efx chain.csh hg38.self.2bit:chr16_08: chain/hg38.self.2bit:chr16_08:.chain & + + # Resume doBlastzChainNet.pl again: + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 + ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -continue=chainMerge -stop=net >& do3.log & + tail -f do3.log +# *** All done ! Elapsed time: 19m11s + + # Load track w/new name chainSelfRedo to compare to existing chainSelf: + hgLoadChain -normScore -tIndex hg38 chainSelfRedo axtChain/hg38.hg38.all.chain.gz + + # No idea why but somehow the liftUp seems not to have worked for part012 and part017, + # so the all.chain had chr22_31, chr8_01 etc. :b run again again. + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run + mv chain/part012.lst.chain{,.bak} + mv chain/part017.lst.chain{,.bak} + csh -efx chain.csh part012.lst chain/part012.lst.chain >& part012.log & + csh -efx chain.csh part017.lst chain/part017.lst.chain >& part017.log & + # Those completed successfully. Dunno why the earlier ones didn't get lifted. + cd .. 
 + mv hg38.hg38.all{,.oopsPartUnlifted}.chain.gz + # Reconstruct hg38.hg38.all.chain.gz (the chainMerge step is just this command): + find /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/chain -name "*.chain" \ + | chainMergeSort -inputList=stdin \ + | nice gzip -c \ + > /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/hg38.hg38.all.chain.gz + + # NOTE FOR NEXT TIME: this filtering step will be unnecessary when -minScore=10000 is used + # from the beginning. + # Filter to minScore of 10000 (too much fluff with -minScore=3000) per Jim (see #24695) + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain + mv hg38.hg38.all.chain.gz hg38.hg38.all.unfiltered.chain.gz + chainFilter hg38.hg38.all.unfiltered.chain.gz -minScore=10000 \ + | gzip -c > hg38.hg38.all.chain.gz + hgLoadChain -normScore -tIndex hg38 chainSelfRedo hg38.hg38.all.chain.gz + checkTableCoords hg38 chainSelfRedo + + # Rename to chainSelf and update lastz symlinks and downloads + hgsql hg38 -e 'drop table chainSelf; drop table chainSelfLink; + rename table chainSelfRedo to chainSelf; + rename table chainSelfRedoLink to chainSelfLink;' + cd /hive/data/genomes/hg38/bed + rm lastz.self lastz.hg38 + ln -s lastzSelf.2020-01-27 lastz.self + ln -s lastzSelf.2020-01-27 lastz.hg38 + cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain + cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt . + $EDITOR README.txt + md5sum hg38.hg38.all.chain.gz > md5sum.txt + # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild. + ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ + rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ + cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ + ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} . 
+ + +######################################################################### +# NCBI ReMap alignments (DONE 2020-02-11 Angie) +# RM 24449 + mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap + cd /hive/data/genomes/hg38/bed/chainHg19ReMap + wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff + # We will need to substitute all the RefSeq chrom and contig IDs with our own names. + # The same alt contig can appear in both assemblies with the same name, so replace + # hg19 names at the beginning of the line and hg38 names after "Target=". + hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \ + > hg38.hg19.chromAlias.sed + hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \ + >> hg38.hg19.chromAlias.sed + + # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT. + sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \ + | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \ + | pslPosTarget stdin stdout \ + | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl + + # Convert to chain for browser display. Some of the remap chains have minScore < 1000 and + # by default would be dropped by chainScore... use -minScore=0 to prevent that. 
+ time pslToChain remap.hg38.hg19.psl stdout \ + | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \ + remap.hg38.hg19.chain +#real 9m31.900s +#user 9m1.624s +#sys 0m20.863s + hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain +#Loading 5315 chains into hg38.chainHg19ReMap + time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \ + /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \ + remap.axtChain.hg38.hg19.chain +#real 2m26.333s +#user 2m4.237s +#sys 0m22.071s + hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain +#Loading 2115 chains into hg38.chainHg19ReMapAxtChain + +################################################### +#Agilent SNP/CNV arrays 3/11/21 +#Downloaded by web browser +cd /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto +fetchChromSizes hg38 > hg38.chrom.sizes +bedSort hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed +uniq hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed >hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.uniq.bed +bedToBigBed hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.uniq.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bb +bedSort hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed +uniq hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed > hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.uniq.bed +bedToBigBed hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.uniq.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bb +bedSort hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed 
hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed +uniq hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed > hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.uniq.bed +bedToBigBed hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.uniq.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bb +mkdir -p /gbdb/hg38/snpCnvArrays/agilent +cd /gbdb/hg38/snpCnvArrays/agilent +ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bb +ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bb +ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bb +vi ~/kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra + +######################################################################### +# DECIPHER CNV & SNV - initial build (DONE 2022-04-08 Jonathan) +# RM 29130 + +cd /hive/data/genomes/outside/otto/decipher +mkdir 2022-04-05 +cd 2022-04-05 + +# manually fetch decipher-variants-grch38-2022-04-03.bed from DECIPHER +../buildDecipher decipher-variants-grch38-2022-04-03.bed + +for i in `cat ../decipher.tables` + do + n=$i"New" + o=$i"Old" + hgsqlSwapTables hg38 $n $i $o -dropTable3 + done + +mkdir -p /gbdb/hg38/decipher +cd /gbdb/hg38/decipher +ln -s /hive/data/outside/otto/decipher/2022-04-05/decipherCnv.bb . 
+ +######################################################################### +# COSMIC (DONE 07-11-2023) +# RM 29625 + +#Fetch file +cd /hive/data/outside/cosmic/hg38/v98/ +wget 'https://cog.sanger.ac.uk/cosmic/GRCh38/ucsc/v98/ucsc_export.bed.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1686847188&Signature=4YV3CuFKudxIhqVdWAaCe0CMAiY%3D' -O ucsc_export.bed.gz +wget 'https://cog.sanger.ac.uk/cosmic/GRCh38/ucsc/v98/ucsc_export.bed.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1687525456&Signature=jBdJOlOOaqmMWNnOtJUyNRptVj4%3D' +mv ucsc_export.bed.gz\?AWSAccessKeyId\=KRV7P7QR9DL41J9EWGA2\&Expires\=1687525456\&Signature\=jBdJOlOOaqmMWNnOtJUyNRptVj4\= ucsc_export.bed.gz + +#Reorder to columns to conform to bed 6+3 +zcat ucsc_export.bed.gz | awk -F'\t' -v OFS="\t" '{ print $1, $2, $3, $7, 0, $6, $4, $5, $8 }' | sort -k1,1 -k2,2n > cosmic.bed + +#Tiny bit of python to identify the broken lines in the file where chromStart > chromEnd + +#for line in myFile: +# newLine = line.split("\t") +# if int(newLine[1]) > int(newLine[2]): +# print(line) +# n+=1 +#print(n) + +#remove those broken records from the file +cat cosmic.bed | grep -vf badRecords.bed > cosmic.fixed.bed + +#subtract to conform to bed format for all the items that have same star and endPos + +cat cosmic.fixed.bed | awk 'BEGIN {OFS="\t"} { +if ($2 == $3) + print $1,$2-1,$3,$4,$5,$6,$7,$8,$9; +else + print $0; +}' > cosmic.fixedPos.bed + +bedToBigBed -type=bed6+3 -as=/hive/data/outside/cosmic/hg38/v98/cosmic.as /hive/data/outside/cosmic/hg38/v98/cosmic.fixedPos.bed /hive/data/genomes/hg38/chrom.sizes /hive/data/outside/cosmic/hg38/v98/cosmic.bb -tab + +#make symlink +ln -s /hive/data/outside/cosmic/hg38/v98/cosmic.bb /gbdb/hg38/cosmic/cosmic.bb + +#This data has since been updated, see new makedoc doc/hg38/cosmicV98.txt and rm #32430 + +############################################################################## +# LIFTOVER TO GCA_018873775.2_hg01243.v3.0 (DONE - 2023-08-13 - Hiram) + ssh hgwdev + # going 
to need an ooc for hg38.p14.2bit + cd /hive/data/genomes/hg38 + time blat hg38.p14.2bit /dev/null /dev/null -tileSize=11 \ + -makeOoc=hg38.p14.ooc -repMatch=1024 + # Wrote 36808 overused 11-mers to hg38.p14.ooc + # real 0m50.753s + + # and ooc for this GenArk hub + cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0 + time blat GCA_018873775.2_hg01243.v3.0.2bit /dev/null /dev/null -tileSize=11 \ + -makeOoc=GCA_018873775.2_hg01243.v3.0.ooc -repMatch=1024 +# Wrote 39087 overused 11-mers to GCA_018873775.2_hg01243.v3.0.ooc +# real 0m49.426s + + mkdir /hive/data/genomes/hg38/bed/blat.GCA_018873775.2_hg01243.v3.0.2023-08-13 + cd /hive/data/genomes/hg38/bed/blat.GCA_018873775.2_hg01243.v3.0.2023-08-13 + + doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ + -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ + -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ + -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ + -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ + hg38 GCA_018873775.2 + + # trying -ram=6g to get full use of hgwdev kluster nodes + time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \ + -verbose=2 -buildDir=`pwd` \ + -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ + -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ + -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ + -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ + hg38 GCA_018873775.2) > 
doLiftOverToGCA_018873775.2.log 2>&1 + # real 12654m58.134s + + # broken after the alignment was done, with the parasol endless loop + # error message in the log file: + # select failure in rudp: Invalid argument + # killed that, cleaned the 4Tb log file, and gave up on this alignment + # since the lastz/chain/net is much better + + # see if the liftOver menus function in the browser from hg38 + # to GCA_018873775.2 + +############################################################################## +# LIFTOVER GCA_018873775.2_hg01243.v3.0 to hg38 (DONE - 2023-08-13 - Hiram) + ssh hgwdev + + mkdir /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/trackData/blat.hg38.2023-08-13 + cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/trackData/blat.hg38.2023-08-13 + + doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ + -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ + -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ + -query2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -querySizes=/hive/data/genomes/hg38/chrom.sizes \ + -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.ooc \ + GCA_018873775.2 hg38 + + # trying -ram=6g to get full use of hgwdev kluster nodes + time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \ + -verbose=2 -buildDir=`pwd` \ + -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ + -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ + 
-query2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -querySizes=/hive/data/genomes/hg38/chrom.sizes \ + -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.ooc \ + GCA_018873775.2 hg38) > doLiftOverToHg38.log 2>&1 + + # broken after the alignment was done, with the parasol endless loop + # error message in the log file: + # select failure in rudp: Invalid argument + # killed that, cleaned the 4Tb log file, and gave up on this alignment + # since the lastz/chain/net is much better + # real 193m24.137s + + # see if the liftOver menus function in the browser from GCA_018873775.2 + # to hg38 + +############################################################################## +# LIFTOVER TO GCA_018503275.1_NA19240.pri.mat.f1_v2 (TBD - 2023-08-14 - Hiram) + ssh hgwdev + + # ooc for this GenArk hub + cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2 + time blat GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit /dev/null /dev/null \ + -tileSize=11 -repMatch=1024 \ + -makeOoc=GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc + # Wrote 35866 overused 11-mers to GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc + # real 0m32.298s + + mkdir /hive/data/genomes/hg38/bed/blat.GCA_018503275.1_NA19240.pri.mat.f1_v2.2023-08-14 + cd /hive/data/genomes/hg38/bed/blat.GCA_018503275.1_NA19240.pri.mat.f1_v2.2023-08-14 + + ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \ + -buildDir=`pwd` -ram=4g -chainRam=16g \ + -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ + -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ + 
-querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ + -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ + hg38 GCA_018503275.1 + + # trying -ram=4g to get full use of hgwdev kluster nodes + time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \ + -verbose=2 -buildDir=`pwd` -ram=4g -chainRam=16g \ + -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ + -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ + -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ + -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ + hg38 GCA_018503275.1) > doLiftOverToGCA_018503275.1.log 2>&1 + # real 11370m18.026s + + # broken after the alignment was done, with the parasol endless loop + # error message in the log file: + # select failure in rudp: Invalid argument + # killed that, cleaned the 4Tb log file, and gave up on this alignment + # since the lastz/chain/net is much better + # -rw-rw-r-- 1 4363949695640 Aug 22 09:16 doLiftOverToGCA_018503275.1.log + + # see if the liftOver menus function in the browser from hg38 + # to GCA_018503275.1 + +############################################################################## +# LIFTOVER GCA_018503275.1_NA19240.pri.mat.f1_v2 to hg38 (DONE - 2023-08-14 - Hiram) + ssh hgwdev + + mkdir /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/trackData/blat.hg38.2023-08-14 + cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/trackData/blat.hg38.2023-08-14 + + ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \ + -buildDir=`pwd` -ram=4g 
-chainRam=16g \ + -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ + -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ + -query2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -querySizes=/hive/data/genomes/hg38/chrom.sizes \ + -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc \ + GCA_018503275.1 hg38 + + time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \ + -buildDir=`pwd` -ram=4g -chainRam=16g \ + -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ + -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ + -query2Bit=/hive/data/genomes/hg38/hg38.2bit \ + -querySizes=/hive/data/genomes/hg38/chrom.sizes \ + -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc \ + GCA_018503275.1 hg38) > liftOverToHg38.log 2>&1 + # real 5082m17.500s + + # this is interesting, this alignment completed and actually has good + # coverage: + cat fb.GCA_018503275.1.chain.Hg38Link.txt + # 2928654519 bases of 3032066086 (96.589%) in intersection + + # see if the liftOver menus function in the browser from GCA_018503275.1 + # to hg38 + +############################################################################## +## update grp table add new row for HPRC (DONE - 2023-08-29 - Hiram) +## existing structure: + + hgsql -e 'desc grp;' hg38 + 
++-----------------+-----------+------+-----+---------+-------+ +| Field | Type | Null | Key | Default | Extra | ++-----------------+-----------+------+-----+---------+-------+ +| name | char(255) | NO | PRI | | | +| label | char(255) | NO | | | | +| priority | float | NO | | 0 | | +| defaultIsClosed | int(11) | YES | | NULL | | ++-----------------+-----------+------+-----+---------+-------+ + + # add one new row: + hgsql hg38 \ + -e "INSERT INTO grp VALUES ('hprc', 'Human Pangenome - HPRC', 3.6, 0);" + + # resulting table: + + hgsql -e 'select * from grp order by priority;' hg38 ++------------+------------------------------------+----------+-----------------+ +| name | label | priority | defaultIsClosed | ++------------+------------------------------------+----------+-----------------+ +| user | Custom Tracks | 1 | 0 | +| remc | Reference Epigenome Mapping Center | 1.2 | 1 | +| map | Mapping and Sequencing | 2 | 0 | +| genes | Genes and Gene Predictions | 3 | 0 | +| phenDis | Phenotype and Literature | 3.4 | 0 | +| pub | Literature | 3.5 | 0 | +| hprc | Human Pangenome - HPRC | 3.6 | 0 | +| covid | COVID-19 | 3.6 | 0 | +| singleCell | Single Cell RNA-seq | 3.7 | 0 | +| rna | mRNA and EST | 4 | 0 | +| expression | Expression | 4.5 | 0 | +| regulation | Regulation | 5 | 0 | +| compGeno | Comparative Genomics | 6 | 0 | +| varRep | Variation | 7 | 0 | +| rep | Repeats | 8 | 0 | +| x | Experimental | 10 | 1 | ++------------+------------------------------------+----------+-----------------+ + +############################################################################## +# Affy CytoScan HD track, refs #32856 (2024-01-23 Gerardo) +cd /hive/data/genomes/hg38/bed/ +mkdir genotypeArrays; cd genotypeArrays +#The user sent Gerardo a direct email with a shared folder link. Gerardo downloaded the bed files and made them available on dev. +#The user provided two bed files (https://hgwdev-gperez2.gi.ucsc.edu/~gperez2/mlq/mlq_32791/). 
Gerardo used the version 2 bed file for the track. +wget https://hgwdev-gperez2.gi.ucsc.edu/~gperez2/mlq/mlq_32791/CytoScanHD_Accel_Array.na36.bed.zip +unzip CytoScanHD_Accel_Array.na36.bed.zip +# Removed header and sorted the file +grep -v 'track' CytoScanHD_Accel_Array.na36.bed | bedSort stdin stdout > affyCytoScanHD.bed +bedToBigBed -tab -type=bed12 affyCytoScanHD.bed https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes affyCytoScanHD.bb +cd /gbdb/hg38 +mkdir genotypeArrays; cd genotypeArrays +# Making symlink for big file and raw bed file +ln -s /hive/data/genomes/hg38/bed/genotypeArrays/affyCytoScanHD.bb +ln -s /hive/data/genomes/hg38/bed/genotypeArrays/CytoScanHD_Accel_Array.na36.bed.zip +cd ~/kent/src/hg/makeDb/trackDb/human/hg38 +vi trackDb.ra + +############################################################################## +# LASTZ Human Hg38 vs. California sea lion GCF_009762305.2 +# (DONE - 2024-03-06 - jairo) + + mkdir /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06 + cd /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06 + + printf '# California sea lion GCF_009762305.2 vs. 
Human Hg38 +BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz + +# TARGET: Human hg38 +SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit +SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 +SEQ1_LIMIT=40 + +# QUERY: California sea lion 2020-07-14 GCF_009762305.2_mZalCal1.pri.v2 +SEQ2_DIR=/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit +SEQ2_LEN=/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_LIMIT=100 + +BASE=/hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06 +TMPDIR=/dev/shm + +' > DEF + + time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl -trackHub -noDbNameCheck -verbose=2 `pwd`/DEF -syntenicNet \ + -qAsmId GCF_009762305.2_mZalCal1.pri.v2 -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 + grep -w real do.log | sed -e 's/^/ # /;' + # real 1018m28.119s + + sed -e 's/^/ # /;' fb.hg38.chainGCF_009762305.2Link.txt + # 1633315994 bases of 3299210039 (49.506%) in intersection + sed -e 's/^/ # /;' fb.hg38.chainSynGCF_009762305.2Link.txt + # 1564193911 bases of 3299210039 (47.411%) in intersection + + time (~/kent/src/hg/utils/automation/doRecipBest.pl -trackHub -load -workhorse=hgwdev -buildDir=`pwd` \ + \ + -query2Bit="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit" \ +-querySizes="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt" \ + hg38 GCF_009762305.2) > rbest.log 2>&1 + + grep -w real rbest.log | sed -e 's/^/ # /;' + # real 303m36.739s + + sed -e 's/^/ # /;' fb.hg38.chainRBest.GCF_009762305.2.txt + # 1461974620 bases of 3299210039 (44.313%) in intersection + + ### and for the swap + + cd /hive/data/genomes/asmHubs/allBuild/GCF/009/762/305/GCF_009762305.2_mZalCal1.pri.v2/trackData/blastz.hg38.swap + + time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl 
-trackHub -noDbNameCheck -swap -verbose=2 \ + -qAsmId GCF_009762305.2_mZalCal1.pri.v2 /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06/DEF -swapDir=`pwd` \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 + + grep -w real swap.log | sed -e 's/^/ # /;' + # real 103m25.220s + + sed -e 's/^/ # /;' fb.GCF_009762305.2.chainHg38Link.txt + # 1493183463 bases of 2409685272 (61.966%) in intersection + sed -e 's/^/ # /;' fb.GCF_009762305.2.chainSynHg38Link.txt + # 1457122207 bases of 2409685272 (60.469%) in intersection +\ time (~/kent/src/hg/utils/automation/doRecipBest.pl -trackHub -load -workhorse=hgwdev -buildDir=`pwd` \ + \ + -target2bit="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit" \ +-targetSizes="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt" \ + GCF_009762305.2 hg38) > rbest.log 2>&1 + + grep -w real rbest.log | sed -e 's/^/ # /;' + # real 286m31.189s + + sed -e 's/^/ # /;' fb.GCF_009762305.2.chainRBest.Hg38.txt + # 1461710350 bases of 2409685272 (60.660%) in intersection + +############################################################################## +# hg38.chromAlias.bb was incorrectly built without indexes so it will not +# work with bedToBigBed 2024-04-08 markd + +cd /hive/data/genomes/hg38/goldenPath/bigZips/initial +mv hg38.chromAlias.bb hg38.chromAlias.noindexes.bb +bigBedInfo -asOut hg38.chromAlias.noindexes.bb >hg38.chromAlias.as +bigBedToBed hg38.chromAlias.noindexes.bb hg38.chromAlias.bed +bedToBigBed -tab -type=bed3+ -as=hg38.chromAlias.as hg38.chromAlias.bed -sizesIs2Bit -extraIndex=ucsc,assembly,ensembl,genbank,refseq hg38.2bit hg38.chromAlias.bb + +############################################################################## + +# ENCODE 4 TF rPeak Clusters - RM #34930 - Lou 12/19/24 + +mkdir /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks +cd 
/hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks +hubClone -download https://users.wenglab.org/gaomingshi/TF.rpeak.test.txt +ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.rPeaks.bb /gbdb/hg38/bbi/ENCODE4/TFrPeakClusters.bb +ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.decorator.bb /gbdb/hg38/bbi/ENCODE4/TFrPeakClustersDecorator.bb +# Then just moved the files to the ENCODEv4TFrPeaks dir, moved/tweaked HTML and trackDb -# alphaMissense ticket #32269 (Jeltje, Jan 2025) -mkdir -p /hive/data/genomes/hg38/bed/alphaMissense/ -cd /hive/data/genomes/hg38/bed/alphaMissense -wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz -time python ~/kent/src/hg/makeDb/outside/alphaMissense/alphaMissenseToWig.py AlphaMissense_hg38.tsv.gz -wigToBigWig a.wig ../../chrom.sizes a.bw & -wigToBigWig c.wig ../../chrom.sizes c.bw & -wigToBigWig g.wig ../../chrom.sizes g.bw & -wigToBigWig t.wig ../../chrom.sizes t.bw & -wait - -##Colors were added using the script -#kent/src/hg/makeDb/scripts/wigColorByColors/makeWigColorByRevelCadd.py