c11dda620b2fd111b46877fa8b05e7250a61b9f8 angie Tue Feb 11 12:50:33 2020 -0800 Instructions for loading up NCBI's ReMap alignments as a chain track. refs #24449 diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt index 425d388..88a1e90 100644 --- src/hg/makeDb/doc/hg38/hg38.txt +++ src/hg/makeDb/doc/hg38/hg38.txt @@ -1,6883 +1,6927 @@ # for emacs: -*- mode: sh; -*- # This file describes how we made the browser database on # NCBI build 38 (December 2013 freeze) aka: # GRCh38 - Genome Reference Consortium Human Reference 38 # Assembly Accession: GCA_000001405.2 ############################################################################# ## Download sequence - DONE - 2013-12-24 mkdir /hive/data/genomes/hg38 mkdir /hive/data/genomes/hg38/genbank cd /hive/data/genomes/hg38/genbank time rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/ ./ # sent 19643 bytes received 4914689807 bytes 4490369.53 bytes/sec # total size is 4914019581 speedup is 1.00 # real 18m14.497s ############################################################################# ## convert to UCSC names - DONE - 2013-12-24 # with this release, NCBI has adopted a naming convention that is similar # to UCSC. 
The delivered sequence with these names can be found in: # /hive/data/genomes/hg38/genbank/seqs_for_alignment_pipelines/ # # The following scripts reproduce this naming scheme from the separate # files in the release # mkdir /hive/data/genomes/hg38/ucsc cat << '_EOF_' > ucscCompositeAgp.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x ucscCompositeAgp.pl cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $acc =~ s/\./v/; $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = 
"../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; $acc =~ s/\./v/; die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${acc}_random"; printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/. 
Homo.*//; $acc =~ s/\./v/; die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${acc}_random"; printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\./v/; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/. 
Homo.*//; $line =~ s/\./v/; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl cat << '_EOF_' > altSequence.pl #!/usr/bin/env perl use strict; use warnings; use File::Basename; open (AG, ">chrAlt.agp") or die "can not write to chrAlt.agp"; open (FA, ">chrAlt.fa") or die "can not write to chrAlt.fa"; open (FH, "find ../genbank/ALT* -type f | grep alt_scaffold_placement.txt|") or die "can not find alt_scaffold_placement.txt files"; while (my $file = <FH>) { chomp $file; my $dirName = dirname($file); my $agpFile = "$dirName/AGP/alt.scaf.agp.gz"; my $fastaFile = "$dirName/FASTA/alt.scaf.fa.gz"; # key is genbank acc name, value is UCSC chr name my %nameDelta; # printf STDERR "# %s\n", $file; open (AL, "<$file") or die "can not read $file"; while (my $line = <AL>) { next if ($line =~ m/^#/); chomp $line; my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, $parent_type, $parent_name, $parent_acc, $region_name, $ori, $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, $alt_start_tail, $alt_stop_tail) = split('\t', $line); my $ucscAcc = $alt_scaf_acc; $ucscAcc =~ s/\./v/; my $ucscName = sprintf("chr%s_%s_alt", $parent_name, $ucscAcc); printf "%s %s\n", $alt_scaf_acc, $ucscName; if (exists ($nameDelta{$alt_scaf_acc})) { die "duplicate name incorrect ? $alt_scaf_acc $nameDelta{$alt_scaf_acc} ne $ucscName" if ($nameDelta{$alt_scaf_acc} ne $ucscName); } else { $nameDelta{$alt_scaf_acc} = $ucscName; } } close (AL); open (AL, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <AL>) { if ($line =~ m/^#/) { print AG "$line"; } else { my ($acc, $rest) = split('\t', $line, 2); die "can not find ucsc name for $acc" if (!exists($nameDelta{$acc})); printf AG "%s\t%s", $nameDelta{$acc}, $rest; } } close (AL); open (AL, "zcat $fastaFile|") or die "can not read $fastaFile"; while (my $line = <AL>) { chomp $line; if ($line =~ m/^>/) { $line =~ s/.*gb.//; $line =~ s/. 
Homo.*//; die "can not find ucsc name for $line" if (!exists($nameDelta{$line})); printf FA ">%s\n", $nameDelta{$line}; } else { printf FA "%s\n", $line; } } close (AL); } close (FH); close (AG); close (FA); '_EOF_' # << happy emacs chmod +x altSequence.pl ./ucscCompositeAgp.pl ./unlocalized.pl ./unplaced.pl ./altSequence.pl # temporarily verify the fasta and AGP are complete and compatible faToTwoBit chr*.fa hg38.test.2bit cat chr*.agp > hg38.agp checkAgpAndFa hg38.agp hg38.test.2bit 2>&1 | tail -1 # All AGP and FASTA entries agree - both files are valid rm -f hg38.agp hg38.test.2bit # comparing faCounts of this 2bit file and the sequences delivered # in genbank/seqs_for_alignment_pipelines/ # result in the exact same sequence ############################################################################# ## initial db build - DONE - 2013-12-24 - Hiram cd /hive/data/genomes/hg38 cat << '_EOF_' > hg38.config.ra # Config parameters for makeGenomeDb.pl: db hg38 scientificName Homo sapiens commonName Human assemblyDate Dec. 
2013 assemblyLabel GRCh38 Genome Reference Consortium Human Reference 38 (GCA_000001405.2) assemblyShortLabel GRCh38 orderKey 13 mitoAcc none fastaFiles /hive/data/genomes/hg38/ucsc/chr*.fa agpFiles /hive/data/genomes/hg38/ucsc/chr*.agp # qualFiles /dev/null dbDbSpeciesDir human photoCreditURL http://www.cbse.ucsc.edu/ photoCreditName Graphic courtesy of CBSE ncbiGenomeId 51 ncbiAssemblyId 883148 ncbiAssemblyName GRCh38 ncbiBioProject 31257 genBankAccessionID GCA_000001405.2 taxId 9606 '_EOF_' # << happy emacs # step wise to first verify AGP and Fasta files time makeGenomeDb.pl -stop=agp hg38.config.ra > agp.log 2>&1 # looking good, continue: time makeGenomeDb.pl -continue=db hg38.config.ra > db.log 2>&1 # add the files produced by the trackDb build to the source tree # this path is fixed in the makeGenomeDb.pl for next time # honor new convention for bbi location files: cd /gbdb/hg38/bbi mkdir gc5BaseBw mv gc5Base.bw gc5BaseBw cd gc5BaseBw # before hgsql -e 'select * from gc5BaseBw;' hg38 # +---------------------------+ # | fileName | # +---------------------------+ # | /gbdb/hg38/bbi/gc5Base.bw | # +---------------------------+ # and fixed hgBbiDbLink hg38 gc5BaseBw `pwd`/gc5Base.bw hgsql -e 'select * from gc5BaseBw;' hg38 # +-------------------------------------+ # | fileName | # +-------------------------------------+ # | /gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw | # +-------------------------------------+ ############################################################################# ## RepeatMasker with CrossMatch - DONE - 2013-12-24,27 - Hiram mkdir /hive/data/genomes/hg38/bed/repeatMaskerCM cd /hive/data/genomes/hg38/bed/repeatMaskerCM # running this step wise so it can be loaded into its own table time doRepeatMasker.pl -stop=mask -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 # real 3443m13.026s # RepeatMasker version June 20 2013 open-4.0.3 # Search Engine: cross-match version 1.090518 # RepeatMasker Database: 20130422 
# take the install script from this -debug run and alter it to load # the table into rmskCM time doRepeatMasker.pl -continue=install -stop=install -debug \ -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 cat fb.hg38.rmskCM.txt # 1586326530 bases of 3209286105 (49.429%) in intersection # profile of repeat elements: # 1852545 rmskClass/SINE.tab # 1570523 rmskClass/LINE.tab # 748597 rmskClass/LTR.tab # 703682 rmskClass/Simple_repeat.tab # 499108 rmskClass/DNA.tab # 102856 rmskClass/Low_complexity.tab # 7962 rmskClass/Satellite.tab # 5750 rmskClass/Retroposon.tab # 5667 rmskClass/LTR?.tab # 5622 rmskClass/Unknown.tab # 4516 rmskClass/snRNA.tab # 3294 rmskClass/DNA?.tab # 2026 rmskClass/tRNA.tab # 1840 rmskClass/rRNA.tab # 1784 rmskClass/RC.tab # 1672 rmskClass/srpRNA.tab # 1420 rmskClass/scRNA.tab # 704 rmskClass/RNA.tab # 411 rmskClass/RC?.tab # 38 rmskClass/SINE?.tab # using this RM result with trfMask for the final masked sequence cd /hive/data/genomes/hg38 twoBitMask hg38.rmskCM.2bit -add bed/simpleRepeat/trfMask.bed hg38.2bit twoBitToFa hg38.2bit stdout | faSize stdin > faSize.hg38.2bit.txt # 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper 1588630985 lower) in 455 sequences in 1 files # %49.50 masked total, %52.10 masked real featureBits -countGaps hg38 rmskCM '!rmskHmmer' -bed=crossMatchUnique.bed # 24868153 bases of 3209286105 (0.775%) in intersection hgLoadBed hg38 crossMatchUnique crossMatchUnique.bed # Read 2352219 elements of size 4 from crossMatchUnique.bed ############################################################################# ## repeating RepeatMasker Blastn run (DONE - 2014-01-07 - Hiram) mkdir /hive/data/genomes/hg38/bed/rmskBlastn cd /hive/data/genomes/hg38/bed/rmskBlastn time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ -stop=mask -buildDir=`pwd` hg38 > mask.log # real 203m33.670s # 3209286105 bases (159970322 N's 3049315783 real 
1491207906 upper 1558107877 lower) in 455 sequences in 1 files # %48.55 masked total, %51.10 masked real # install step with debug so the script can be altered to load into # a specific rmskBlastn table: $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ -continue=install -debug -buildDir=`pwd` hg38 ############################################################################# ## repeating RepeatMasker cross-match run (DONE - 2014-01-07 - Hiram) mkdir /hive/data/genomes/hg38/bed/rmskCM cd /hive/data/genomes/hg38/bed/rmskCM # missed recording stderr .... forgot the 2>&1 time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ -stop=mask -buildDir=`pwd` hg38 > mask.log # real 1897m33.517s # running from Tue Jan 7 16:10:33 PST 2014 thru 08 Jan 23:48 # *** All done! (through the 'mask' step) - Elapsed time: 1897m34s # *** Steps were performed in /hive/data/genomes/hg38/bed/rmskCM # running install manually to allow edit of the script to load # a specific rmskCm table time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ -continue=install -stop=install -buildDir=`pwd` hg38 -debug ############################################################################# ## RepeatMasker with RM Blastn - DONE - 2013-12-24,25 - Hiram mkdir /hive/data/genomes/hg38/bed/repeatMaskerBlastn cd /hive/data/genomes/hg38/bed/repeatMaskerBlastn # running this step wise so it can be loaded into its own table time doRepeatMasker.pl -stop=mask -useRMBlastn -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 # real 354m55.842s # take the install script from this -debug run and alter it to load # the table into rmskBlastn doRepeatMasker.pl -useRMBlastn -bigClusterHub=ku -continue=install \ -stop=install -debug -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 # 1560264046 bases 
of 3209286105 (48.617%) in intersection # profile of repeat elements: # 1824560 rmskClass/SINE.tab # 1552814 rmskClass/LINE.tab # 738435 rmskClass/LTR.tab # 715998 rmskClass/Simple_repeat.tab # 486591 rmskClass/DNA.tab # 105026 rmskClass/Low_complexity.tab # 7712 rmskClass/Satellite.tab # 5638 rmskClass/Retroposon.tab # 5276 rmskClass/Unknown.tab # 5100 rmskClass/LTR?.tab # 4548 rmskClass/snRNA.tab # 3033 rmskClass/DNA?.tab # 1987 rmskClass/tRNA.tab # 1809 rmskClass/rRNA.tab # 1710 rmskClass/RC.tab # 1633 rmskClass/srpRNA.tab # 1428 rmskClass/scRNA.tab # 614 rmskClass/RNA.tab # 376 rmskClass/RC?.tab # 38 rmskClass/SINE?.tab # 3 rmskClass/Unspecified.tab # 5464329 total ############################################################################# ## repeating RepeatMasker run with HMMER - DONE - 2014-01-08 - Hiram mkdir /hive/data/genomes/hg38/bed/rmskHmmer cd /hive/data/genomes/hg38/bed/rmskHmmer # trying cpu=4 and ram=32g time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -stop=mask -useHMMER -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 # 6 jobs required more than 32 Gb of memory to complete, ran them on # hgwdev to complete, then continuing: time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -continue=cat -stop=mask -useHMMER -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > cat.log 2>&1 # real 24m5.274s # 3209286105 bases (159970322 N's 3049315783 real 1314916231 upper 1734399552 lower) in 455 sequences in 1 files # %54.04 masked total, %56.88 masked real # running install manually to allow edit of the script to load # a specific rmskHmmer table time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ -continue=install -debug -useHMMER -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 time ./doLoad_rmskHmmer.bash > load.log 2>&1 # real 4m47.432s featureBits -countGaps hg38 rmskHmmer > fb.hg38.rmskHmmer.txt 2>&1 # 1734398971 bases of 3209286105 
(54.043%) in intersection grep rmskClass hg38.class.profile.txt \ | sed -e 's#rmskClass/##; s/.tab//;' | sort -rn # profile of repeat elements: # 1884179 SINE # 1702529 LINE # 805427 LTR # 636906 Simple_repeat # 565171 DNA # 95480 Low_complexity # 11861 Retroposon # 10852 Satellite # 9181 LTR? # 6783 scRNA # 4582 DNA? # 3914 Unknown # 2059 RC # 1517 srpRNA # 1484 RNA # 970 SINE? # 806 RC? # 464 rRNA # 5744165 total featureBits -countGaps hg38 rmskHmmer '!rmskCM' -bed=hmmerUnique.bed # 172940594 bases of 3209286105 (5.389%) in intersection hgLoadBed hg38 hmmerUnique hmmerUnique.bed # Read 3099505 elements of size 4 from hmmerUnique.bed ############################################################################# ## RepeatMasker with HMMER - DONE - 2013-12-24,26 - Hiram mkdir /hive/data/genomes/hg38/bed/repeatMaskerHMMER cd /hive/data/genomes/hg38/bed/repeatMaskerHMMER time doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 # take the install script from this -debug run and alter it to load # the table into rmskHmmer doRepeatMasker.pl -continue=install -stop=install -useHMMER \ -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ -buildDir=`pwd` hg38 > mask.log 2>&1 # 1702017722 bases of 3209286105 (53.034%) in intersection # profile of repeat elements: # 1879864 rmskClass/SINE.tab # 1678216 rmskClass/LINE.tab # 794231 rmskClass/LTR.tab # 651561 rmskClass/Simple_repeat.tab # 551965 rmskClass/DNA.tab # 97186 rmskClass/Low_complexity.tab # 10756 rmskClass/Retroposon.tab # 10448 rmskClass/Satellite.tab # 8393 rmskClass/LTR?.tab # 5849 rmskClass/scRNA.tab # 4282 rmskClass/Unknown.tab # 4276 rmskClass/DNA?.tab # 2000 rmskClass/RC.tab # 1573 rmskClass/srpRNA.tab # 1291 rmskClass/RNA.tab # 906 rmskClass/snRNA.tab # 747 rmskClass/SINE?.tab # 723 rmskClass/RC?.tab # 722 rmskClass/rRNA.tab # 468 rmskClass/tRNA.tab # 5705457 total 
############################################################################# # rmsk from genbank release (DONE - 2014-12-25 - Hiram) mkdir /hive/data/genomes/hg38/bed/repeatMaskerGenbank cd /hive/data/genomes/hg38/bed/repeatMaskerGenbank head -3 ../repeatMaskerBlastn/hg38.fa.out > genbank.rm.out find ../../genbank -type f | grep rm.out | grep -v "/placed_scaffolds/" | while read F do headRest 3 $F done | sort -k5,45 -k6,6n >> genbank.rm.out grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt sed -e "`cat accessionToUcsc.sed.txt`" genbank.rm.out > ucscNames.rm.out head -3 ucscNames.rm.out > hg38.sorted.fa.out tail -n +4 ucscNames.rm.out | sort -k5,5 -k6,6n >> hg38.sorted.fa.out hgLoadOut -table=rmskGenbank -nosplit hg38 hg38.sorted.fa.out hgLoadOut -verbose=2 -tabFile=hg38.rmskGenbank.tab -table=rmskGenbank \ -nosplit hg38 hg38.sorted.fa.out 2> bad.records.txt # fixed up one of the masking scripts from the other runs to construct # the bbi files # 1581568556 bases of 3209286105 (49.281%) in intersection # profile of repeat elements: # 1849444 rmskClass/SINE.tab # 1586141 rmskClass/LINE.tab # 759248 rmskClass/LTR.tab # 502186 rmskClass/DNA.tab # 433789 rmskClass/Simple_repeat.tab # 396378 rmskClass/Low_complexity.tab # 10198 rmskClass/Satellite.tab # 5884 rmskClass/LTR?.tab # 4595 rmskClass/snRNA.tab # 4163 rmskClass/Retroposon.tab # 2802 rmskClass/Unknown.tab # 2157 rmskClass/DNA?.tab # 2154 rmskClass/tRNA.tab # 1915 rmskClass/rRNA.tab # 1860 rmskClass/RC.tab # 1784 rmskClass/srpRNA.tab # 1397 rmskClass/scRNA.tab # 822 rmskClass/RNA.tab # 488 rmskClass/SINE?.tab # 445 rmskClass/RC?.tab # 5567850 total ############################################################################# ## running TRF simple repeats - DONE - 2013-12-24,29 - Hiram # this procedure ran into much trouble on this release. The new # repeat sequences in the centromeres caused trf to run indefinitely. 
# I tried different sizes of chunks, working down to 20 Mbase chunks. # Even still, some jobs would not complete. Those broke down even # more, eventually to the smallest bit of 30 Kbase that needed to # run all the way down to 3,000 based chunks with 1,000 base overlaps. # this did not work: screen # use screen to manage this day-long job mkdir /hive/data/genomes/hg38/bed/simpleRepeat cd /hive/data/genomes/hg38/bed/simpleRepeat time doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \ -smallClusterHub=ku -buildDir=`pwd` hg38 > do.log 2>&1 cd /hive/data/genomes/hg38/bed # move it aside: mv simpleRepeat simpleRepeat.2013-12-24 # Instead, something like this: mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap mkdir -p noGap twoBitToFa ../../../hg38.unmasked.2bit stdout \ | faSplit -lift=noGap.lift gap stdin 5000000 noGap/hg38_ # make sure nothing has gone missing: faCount noGap/*.fa > faCount.txt tail -1 faCount.txt # total 3068387174 898285419 623727342 626335137 900967885 19071391 30979734 # compared to the full sequence, same numbers for ACGT: twoBitToFa ../../../hg38.unmasked.2bit stdout | faCount stdin # total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 faToTwoBit noGap/*.fa hg38.nogap.2bit twoBitInfo hg38.nogap.2bit stdout | sort -k2,2nr > hg38.nogap.sizes mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M rm -rf /hive/data/genomes/hg38/TrfPart20M /cluster/bin/scripts/simplePartition.pl \ /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap/hg38.nogap.2bit \ 20000000 /hive/data/genomes/hg38/TrfPart20M rm -f /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M ln -s /hive/data/genomes/hg38/TrfPart20M \ /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M ssh ku cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M gensub2 
/hive/data/genomes/hg38/TrfPart20M/partitions.lst single gsub jobList para create jobList para push # 20 jobs would not complete: # Completed: 143 of 163 jobs # Jobs currently running: 20 # CPU time in finished jobs: 76994s 1283.24m 21.39h 0.89d 0.002 y # IO & Wait Time: 1095s 18.24m 0.30h 0.01d 0.000 y # Time in running jobs: 1807279s 30121.32m 502.02h 20.92d 0.057 y # Average job time: 546s 9.10m 0.15h 0.01d # Longest running job: 90422s 1507.03m 25.12h 1.05d # Longest finished job: 43348s 722.47m 12.04h 0.50d # Submission to last job: 43363s 722.72m 12.05h 0.50d # determine which are the last jobs as individual bits: para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \ > not.done.list awk '{print $NF}' not.done.list | sed -e 's/.bed//' | while read F do cat $F done > seq.specs.not.done mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs mkdir fasta for seqSpec in `cat ../seq.specs.not.done` do fName=`echo $seqSpec | sed -e 's/.*://'` echo $fName twoBitToFa $seqSpec fasta/$fName.fa done ls -1S `pwd`/fasta > part.list cat << '_EOF_' > template #LOOP ./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > runTrf #!/bin/bash set -beEu -o pipefail export path1=$1 export inputFN=`basename $1` export outpath=$2 export outputFN=`basename $2` mkdir -p /dev/shm/$outputFN cp -p $path1 /dev/shm/$outputFN cd /dev/shm/$outputFN /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs rm -f $outpath cp -p /dev/shm/$outputFN/$outputFN $outpath rm -fr /dev/shm/$outputFN/* rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN '_EOF_' # << happy emacs chmod +x runTrf gensub2 part.list single template jobList para create jobList para push # not all of these jobs will finish either: # 
Completed: 85 of 106 jobs # Jobs currently running: 21 # CPU time in finished jobs: 58076s 967.93m 16.13h 0.67d 0.002 y # IO & Wait Time: 828s 13.81m 0.23h 0.01d 0.000 y # Time in running jobs: 1988997s 33149.95m 552.50h 23.02d 0.063 y # Average job time: 693s 11.55m 0.19h 0.01d # Longest running job: 94730s 1578.83m 26.31h 1.10d # Longest finished job: 34216s 570.27m 9.50h 0.40d # Submission to last job: 34342s 572.37m 9.54h 0.40d # can use what we have here: liftUp result.bed ../../splitGap/noGap.lift error bed/*.bed # find jobs not done para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \ > not.done.list # splitting up those last jobs: mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits mkdir noGap awk '{print $2}' ../lastJobs/not.done.list | while read F do cp -p $F ./noGap/ done # split into 1,000,000 chunks with 10,000 overlap: mkdir -p 1M_10K for F in noGap/*.fa do B=`basename $F | sed -e 's/.fa//'` echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_" faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_ done ls -1S `pwd`/1M_10K/*.fa > part.list cat << '_EOF_' > runTrf #!/bin/bash set -beEu -o pipefail export path1=$1 export inputFN=`basename $1` export outpath=$2 export outputFN=`basename $2` mkdir -p /dev/shm/$outputFN cp -p $path1 /dev/shm/$outputFN cd /dev/shm/$outputFN /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits rm -f $outpath cp -p /dev/shm/$outputFN/$outputFN $outpath rm -fr /dev/shm/$outputFN/* rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN '_EOF_' # << happy emacs cat << '_EOF_' > template #LOOP ./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 part.list single template jobList para create jobList para push # 
not all of these jobs will complete either: # Completed: 53 of 96 jobs # CPU time in finished jobs: 212403s 3540.05m 59.00h 2.46d 0.007 y # IO & Wait Time: 1851s 30.85m 0.51h 0.02d 0.000 y # Average job time: 4043s 67.38m 1.12h 0.05d # Longest finished job: 68726s 1145.43m 19.09h 0.80d # Submission to last job: 68890s 1148.17m 19.14h 0.80d # use what results we have here: cat *.lift | liftUp parts.bed stdin error bed/*.bed liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed \ | sort -u | sort -k1,1 -k2,2n > hg38.result.bed para status | grep -v -w done | awk '{print $(NF-1)}' > will.not.finish.txt # split those last bits: mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits mkdir splitBits cat ../splitBits/will.not.finish.txt | while read F do cp -p $F splitBits done # 100K chunks with 10K overlap mkdir -p 100K_10K for F in splitBits/*.fa do B=`basename $F | sed -e 's/.fa//'` echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_" faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_ done cat << '_EOF_' > runTrf #!/bin/bash set -beEu -o pipefail export path1=$1 export inputFN=`basename $1` export outpath=$2 export outputFN=`basename $2` mkdir -p /dev/shm/$outputFN cp -p $path1 /dev/shm/$outputFN cd /dev/shm/$outputFN /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits rm -f $outpath cp -p /dev/shm/$outputFN/$outputFN $outpath rm -fr /dev/shm/$outputFN/* rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN '_EOF_' # << happy emacs chmod +x runTrf cat << '_EOF_' > template #LOOP ./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs ls -1S `pwd`/100K_10K/*.fa > part.list gensub2 part.list single template jobList para create jobList para push # 
one last bit does not complete: # Completed: 420 of 421 jobs # CPU time in finished jobs: 19862s 331.04m 5.52h 0.23d 0.001 y # IO & Wait Time: 2360s 39.33m 0.66h 0.03d 0.000 y # Average job time: 53s 0.88m 0.01h 0.00d # Longest finished job: 368s 6.13m 0.10h 0.00d # Submission to last job: 448s 7.47m 0.12h 0.01d # can use the results obtained here: cat *.lift | liftUp splitParts.bed stdin error bed/*.bed cat ../splitBits/*.lift | liftUp parts.bed stdin error splitParts.bed liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ | sort -k1,1 -k2,2n > hg38.result.bed para status | grep -v -w done | awk '{print $(NF-1)}' # last chunk: 100K_10K/hg38_89_2_00.fa mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K cp -p ../splitSplitBits/100K_10K/hg38_89_2_00.fa . # 20K chunks with 10K overlap: mkdir -p 20K_10K for F in hg38_89_2_00.fa do B=`basename $F | sed -e 's/.fa//'` echo "faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/$B_" faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_ done ls -1S `pwd`/20K_10K/*.fa > part.list cat << '_EOF_' > runTrf #!/bin/bash set -beEu -o pipefail export path1=$1 export inputFN=`basename $1` export outpath=$2 export outputFN=`basename $2` mkdir -p /dev/shm/$outputFN cp -p $path1 /dev/shm/$outputFN cd /dev/shm/$outputFN /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K rm -f $outpath cp -p /dev/shm/$outputFN/$outputFN $outpath rm -fr /dev/shm/$outputFN/* rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN '_EOF_' # << happy emacs chmod +s runTrf cat << '_EOF_' > template #LOOP ./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 part.list single template jobList para create jobList para push # one of these jobs will 
not finish: # Completed: 4 of 5 jobs # CPU time in finished jobs: 10s 0.17m 0.00h 0.00d 0.000 y # IO & Wait Time: 16s 0.26m 0.00h 0.00d 0.000 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 8s 0.13m 0.00h 0.00d # Submission to last job: 16s 0.27m 0.00h 0.00d # can use the results we have here: cat *.lift | liftUp 20Kparts.bed stdin error bed/*.bed cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kparts.bed cat ../splitBits/*.lift | liftUp parts.bed stdin error 100Kpart.bed liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ | sort -k1,1 -k2,2n > hg38.result.bed # finally, what turns out to be the last batch: mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K cp -p ../last100K/20K_10K/hg38_89_2_00_3.fa . # 2K chunks with 1K overlap mkdir -p 2K_1K for F in hg38_89_2_00_3.fa do B=`basename $F | sed -e 's/.fa//'` echo "faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/$B_" faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_ done ls -1S `pwd`/2K_1K/*.fa > part.list cat << '_EOF_' > runTrf #!/bin/bash set -beEu -o pipefail export path1=$1 export inputFN=`basename $1` export outpath=$2 export outputFN=`basename $2` mkdir -p /dev/shm/$outputFN cp -p $path1 /dev/shm/$outputFN cd /dev/shm/$outputFN /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K rm -f $outpath cp -p /dev/shm/$outputFN/$outputFN $outpath rm -fr /dev/shm/$outputFN/* rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN '_EOF_' # << happy emacs chmod +x runTrf cat << '_EOF_' > template #LOOP ./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 part.list single template jobList para create para push # Completed: 15 of 15 jobs # CPU time in finished jobs: 1s 0.02m 
0.00h 0.00d 0.000 y # IO & Wait Time: 26s 0.43m 0.01h 0.00d 0.000 y # Average job time: 2s 0.03m 0.00h 0.00d # Longest finished job: 4s 0.07m 0.00h 0.00d # Submission to last job: 14s 0.23m 0.00h 0.00d cat *.lift | liftUp 2Kparts.bed stdin error bed/*.bed cat ../last100K/*.lift | liftUp 20Kpart.bed stdin error 2Kparts.bed cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kpart.bed cat ../splitBits/*.lift | liftUp parts.bed stdin error 100Kpart.bed liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ | sort -k1,1 -k2,2n > hg38.result.bed ## To put it all together: cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M cat /hive/data/genomes/hg38/TrfPart20M/???/*.bed lastJobs/bed/*.bed \ splitBits/parts.bed splitSplitBits/parts.bed last100K/parts.bed \ last30K/parts.bed > beforeLift.simpleRepeat.bed liftUp -type=.bed stdout ../splitGap/noGap.lift error \ beforeLift.simpleRepeat.bed | sort -u \ | sort -k1,1 -k2,2n > simpleRepeat.bed awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed hgLoadBed hg38 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits hg38 simpleRepeat > fb.simpleRepeat 2>&1 cat fb.simpleRepeat # 146785521 bases of 3049335806 (4.814%) in intersection cd /hive/data/genomes/hg38/bed ln -s simpleRepeat.2013-12-27/run20M simpleRepeat ############################################################################ # WINDOWMASKER - DONE - 2013-12-24 - Hiram mkdir /hive/data/genomes/hg38/bed/windowMasker cd /hive/data/genomes/hg38/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev hg38 > do.log 2>&1 & ############################################################################ # Verify all gaps are marked - DONE - 2013-12-24 - Hiram mkdir /hive/data/genomes/hg38/bed/gap cd /hive/data/genomes/hg38/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../hg38.unmasked.2bit > findMotif.txt 2>&1 # 
real 0m28.634s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits hg38 -not gap -bed=notGap.bed # 3049335806 bases of 3049335806 (100.000%) in intersection time featureBits hg38 allGaps.bed notGap.bed -bed=new.gaps.bed # 20023 bases of 3049335806 (0.001%) in intersection # real 0m20.427s # this indicates that 20,023 bases are not marked as N's # with this element size profile: awk '{print $3-$2}' new.gaps.bed | ave stdin # Q1 1.000000 # median 1.000000 # Q3 100.000000 # average 44.894619 # min 1.000000 # max 1000.000000 # count 446 # total 20023.000000 # standard deviation 81.743447 # the four largest ones: # 1000 chr2 32916625 32917625 chr2.7 # 1000 chr2 32867130 32868130 chr2.6 # 348 chr20 36314371 36314719 chr20.36 # 200 chr12 123443533 123443733 chr12.10 ######################################################################### ## CYTOBAND - fixing the ideogram track (DONE - 2014-06-11 - Hiram) ## the file we used before was broken mkdir -p /hive/data/outside/ncbi/ideogram/2014-06 cd /hive/data/outside/ncbi/ideogram/2014-06 # fetch all the ideogram files: rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ mkdir /hive/data/genomes/hg38/bed/cytoBandUpdate cd /hive/data/genomes/hg38/bed/cytoBandUpdate # Create bed file $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ /hive/data/outside/ncbi/ideogram/2014-06/ideogram_9606_GCF_000001305.14_850_V1 # add in the other genome data: hgsql -N -e 'select * from cytoBand;' hg38 \ | egrep "chrU|chrM|_alt|_random" >> cytoBand.bed $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 455 chroms # Load the bed file hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ hg38 cytoBand cytoBand.bed cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head # 23 sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql sort -k1,1 -k2,2n cytoBand.bed \ | hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin # Make cytoBandIdeo track for ideogram gif on 
hgTracks page. # cytoBandIdeo is just a replicate of the cytoBand track. hgsql -e "drop table cytoBandIdeo;" hg38 hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;" ######################################################################### ## CYTOBAND - ideogram track (DONE - 2014-03-04 - Hiram) ssh hgwdev mkdir -p /hive/data/outside/ncbi/ideogram/2014-03 cd /hive/data/outside/ncbi/ideogram/2014-03 # fetch all the ideogram files: rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ mkdir /hive/data/genomes/hg38/bed/cytoBand cd /hive/data/genomes/hg38/bed/cytoBand # Create bed file $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ /hive/data/outside/ncbi/ideogram/2014-03/ideogram_9606_GCF_000001305.14_850_V1 # add in the other genome data: hgsql -N -e 'select * from cytoBand;' hg38 > bobTable.bed egrep "chrU|chrM|_alt|_random" bobTable.bed >> cytoBand.bed ## can now verify before load: $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 455 chroms # Load the bed file hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ hg38 cytoBand cytoBand.bed cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head # 23 sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql sort -k1,1 -k2,2n cytoBand.bed \ | hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin # Make cytoBandIdeo track for ideogram gif on hgTracks page. # cytoBandIdeo is just a replicate of the cytoBand track. 
hgsql -e "drop table cytoBandIdeo;" hg38 hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;" ########################################################################## # cytoBandIdeo - (DONE - 2013-12-26 - Hiram) mkdir /hive/data/genomes/hg38/bed/cytoBand cd /hive/data/genomes/hg38/bed/cytoBand makeCytoBandIdeo.csh hg38 #making temporary liftover of items from hg19 liftOver /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \ /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \ cytobands.bed unMapped liftOver -minBlocks=0.5 /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \ /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \ cytobands.0.5.bed unMapped0.5 ############################### ###################### # cytoBandIdeo - (reDONE - 2014-02-25 - kuhn) # adding centromeres to generic cytoBandIdeo table as it exists. # (lifted track is already gone) # get the cen values for hg38 hgsql -Ne "SELECT DISTINCT chrom FROM centromeres" hg38 | sort > hg38.chroms rm -f hg38.cens foreach chrom (`cat hg38.chroms`) set cenStart="" set cenEnd="" set cenStart=`hgsql -Ne 'SELECT MIN(chromStart) FROM centromeres WHERE chrom = "'$chrom'"' hg38` set cenEnd=`hgsql -Ne 'SELECT MAX(chromEnd) FROM centromeres WHERE chrom = "'$chrom'"' hg38` echo "$chrom $cenStart $cenEnd" >> hg38.cens end # Modified makeCytoBandIdeo.csh to use this file instead of looking # for centromeres in a gap table. # Replaced existing cytoBandIdeo table, which was really only a copy # of chromInfo. 
########################################################################## # hg19 <-> hg38 difference tracks (DONE - 2013-12-28 - Hiram) mkdir /hive/data/genomes/hg19/bed/liftOverHg38 cd /hive/data/genomes/hg19/bed/liftOverHg38 # not needed, but interesting, collect all the fragment # definitions from the gold tables: hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \ | sort > hg19.gold.frags.tab hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg38 \ | sort > hg38.gold.frags.tab # construct common and difference listings comm -12 hg19.gold.frags.tab hg38.gold.frags.tab \ > identical.hg19.hg38.frags.tab comm -23 hg19.gold.frags.tab hg38.gold.frags.tab \ > unique.hg19Only.frags.tab comm -13 hg19.gold.frags.tab hg38.gold.frags.tab \ > unique.hg38Only.frags.tab # better yet, get full information about each fragment hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \ | sort -k6 > hg19.gold.tab hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg38 \ | sort -k6 > hg38.gold.tab # construct a single key for each fragment for joining. 
# the key is frag,fragStart,fragEnd,strand awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \ > hg19.fragKey.tab awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg38.gold.tab | sort \ > hg38.fragKey.tab # now, by joining those keys, we can get exact identicals, and # the only-in listings as bed files to load as tracks: join hg19.fragKey.tab hg38.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \ | sort -k1,1 -k2,2n > hg19.hg38.identical.bed join hg19.fragKey.tab hg38.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \ | sort -k1,1 -k2,2n > hg38.hg19.identical.bed join -v 1 hg19.fragKey.tab hg38.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ | sort -k1,1 -k2,2n > hg19.only.bed join -v 2 hg19.fragKey.tab hg38.fragKey.tab \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ | sort -k1,1 -k2,2n > hg38.only.bed hgLoadBed hg19 hg38ContigDiff hg19.only.bed hgLoadBed hg38 hg19ContigDiff hg38.only.bed wc -l hg??.only.bed # 6097 hg19.only.bed # 23632 hg38.only.bed # this leaves the outstanding question of "why" they might be in # the only-in listings. 
Some contigs may be different versions, # sometimes different sections of the same contig are used, # and contigs are dropped from hg19 to hg38, or new contigs added # to hg38 to fill in gaps from hg19 # Let's see if we can measure some of this: awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list awk '{print $4}' hg38.only.bed | sort -u > hg38.only.ids.list # Looks like 5405 idential contigs with different parts used: comm -12 hg19.only.ids.list hg38.only.ids.list > differentPortions.list wc -l differentPortions.list # 5405 # and perhaps 63 = 5468-5405 of different versions of same contig: sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \ > hg19.noVersions.ids.list sed -e "s/\.[0-9]*$//" hg38.only.ids.list | sort -u \ > hg38.noVersions.ids.list comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | wc -l # 5468 sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \ > differentPortions.noVersions.list comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | sort -u \ > noVersions.common.list # indeed, 63 contigs of different versions: comm -23 noVersions.common.list differentPortions.noVersions.list \ | sort -u > differentVersions.list wc -l differentVersions.list # 63 # dividing up these items: cat << '_EOF_' > identifyPortions.pl #!/usr/bin/env perl use strict; use warnings; my %differentVersions; my %differentPortions; open (FH, ") { chomp $line; $differentVersions{$line} = 1; } close (FH); open (FH, "differentPortions.list" ) or die "can not read differentPortions.list"; while (my $line = ) { chomp $line; $differentPortions{$line} = 1; } close (FH); my %hg19Done; open (DP, ">hg19.differentPortions.bed") or die "can not write to hg19.differentPortions.bed"; open (DV, ">hg19.differentVersions.bed") or die "can not write to hg19.differentVersions.bed"; open (FH, ") { chomp $line; my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); # assume done while $acc is still complete $hg19Done{$acc} = 1; if 
(exists($differentPortions{$acc})) { printf DP "%s\n", $line; } else { my $trimAcc = $acc; $trimAcc =~ s/\.[0-9]+$//; if (exists($differentVersions{$trimAcc})) { printf DV "%s\n", $line; } else { # this one does not match $hg19Done{$acc} = 0; } } } close (FH); close (DV); close (DP); open (DR, ">hg19.dropped.bed") or die "can not write to hg19.dropped.bed"; open (FH, ") { chomp $line; my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); if (0 == $hg19Done{$acc}) { printf DR "%s\n", $line; } } close (FH); close (DR); my %hg38Done; open (DP, ">hg38.differentPortions.bed") or die "can not write to hg38.differentPortions.bed"; open (DV, ">hg38.differentVersions.bed") or die "can not write to hg38.differentVersions.bed"; open (FH, ") { chomp $line; my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); # assume done while $acc is still complete $hg38Done{$acc} = 1; if (exists($differentPortions{$acc})) { printf DP "%s\n", $line; } else { my $trimAcc = $acc; $trimAcc =~ s/\.[0-9]+$//; if (exists($differentVersions{$trimAcc})) { printf DV "%s\n", $line; } else { # this one does not match $hg38Done{$acc} = 0; } } } close (FH); close (DV); close (DP); open (DR, ">hg38.newTo19.bed") or die "can not write to hg38.newTo19.bed"; open (FH, ") { chomp $line; my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); if (0 == $hg38Done{$acc}) { printf DR "%s\n", $line; } } close (FH); close (DR); '_EOF_' # << happy emacs chmod +x identifyPortions.pl ./identifyPortions.pl # make sure nothing was lost sort hg19.differentVersions.bed hg19.differentPortions.bed \ hg19.dropped.bed | sum # 43711 233 sort hg19.only.bed | sum # 43711 233 sort hg38.differentVersions.bed hg38.differentPortions.bed \ hg38.newTo19.bed | sum # 00502 911 sort hg38.only.bed | sum # 00502 911 sort -k1,1 -k2,2n hg38.differentVersions.bed hg38.differentPortions.bed \ hg38.newTo19.bed > hg38.itemRgb.bed sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed 
\ hg19.dropped.bed > hg19.itemRgb.bed hgLoadBed hg19 hg38ContigDiff hg19.itemRgb.bed # if you wanted to load the identicals in this track too: sort -k1,1 -k2,2n hg38.hg19.identical.bed hg38.itemRgb.bed \ | hgLoadBed hg38 hg38ContigDiff stdin # but we don't, we deliver only the differences hgLoadBed hg38 hg38ContigDiff hg38.itemRgb.bed ######################################################################### # construct ooc file to be used in blat operations # DONE - 2012-12-30 - Hiram # can be done on unmasked sequence the same result as masked: cd /hive/data/genomes/hg38 time blat hg38.unmasked.2bit /dev/null /dev/null \ -tileSize=11 -makeOoc=jkStuff/hg38.11.ooc -repMatch=1024 # been confirmed, the 100-base non-bridged gaps are really non-bridged gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed hg38 \ jkStuff/hg38.nonBridged.lft ############################################################################## # cpgIslands - (DONE - 2014-01-07 - Hiram) # run on the Hmmer + trfMask sequence mkdir /hive/data/genomes/hg38/bed/cpgIslands cd /hive/data/genomes/hg38/bed/cpgIslands time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -workhorse=hgwdev -smallClusterHub=ku hg38 > do.log 2>&1 # real 3m31.684s # wc -l cpgIsland.bed -> 30456 cpgIsland.bed cat fb.hg38.cpgIslandExt.txt # 23654068 bases of 3049335806 (0.776%) in intersection # Previously in hg19: featureBits -countGaps hg19 cpgIslandExt # 21842742 bases of 3137161264 (0.696%) in intersection # when run on Hmmer and Trf masked sequence: # wc -l cpgIsland.bed -> 30416 cpgIsland.bed # 23635946 bases of 3049335806 (0.775%) in intersection # when run on unmasked sequence: # wc -l cpgIsland.bed -> 55149 cpgIsland.bed # 33637531 bases of 3049335806 (1.103%) in intersection ############################################################################## # rerun cpgIslands on contig sequence (DONE - 2014-01-07 - Hiram) # this is a test of the contig sequence 
file, # should get a very similar answer to the above mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigs cd /hive/data/genomes/hg38/bed/cpgIslandsContigs # run stepwise so the lift can be done on the result before loading time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -stop=makeBed -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1 # real 9m31.502s # fails on the bedToBigBed creation since this isn't the actual # hg38 sequence. mv cpgIsland.bed cpgIsland.beforeLift.bed liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \ cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \ cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb zcat ../cpgIslands/cpgIsland.bed.gz | sort -k1,1 -k2,2n > t.bed # Surprisingly, a few more are detected, perhaps due to the different # masking since this contig run is on the final corrected cross-match rmsk # plus TRF, the above was on the corrupted HMMER+TRF mask: wc -l cpgIsland.bed t.bed # 30477 cpgIsland.bed # 30456 t.bed # 2,835 different items between the two: sort t.bed cpgIsland.bed | uniq -c | awk '$1 < 2' | wc -l # 2835 # 29.049 identical items sort t.bed cpgIsland.bed | uniq -c | awk '$1 == 2' | wc -l # 29049 cut -f1-3 cpgIsland.bed | sort > contigs.bed cut -f1-3 t.bed | sort > fullSequence.bed # 29,339 identical locations: comm -12 contigs.bed fullSequence.bed | wc -l # 29339 time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -continue=load -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ -workhorse=hgwdev -smallClusterHub=ku hg38 > load.log 2>&1 # real 0m12.056s cat fb.hg38.cpgIslandExt.txt # 23610399 bases of 3049335806 (0.774%) in intersection ############################################################################## # rerun cpgIslands on 
contig UNMASKED sequence (DONE - 2014-01-07 - Hiram) mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked cd /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked twoBitToFa -noMask ../../hg38.contigs.2bit stdout \ | faToTwoBit stdin hg38.contigsUnmasked.2bit # verify sequence is OK: twoBitToFa hg38.contigsUnmasked.2bit stdout | faSize stdin # 3061688741 bases (12372958 N's 3049315783 real 3049315783 upper 0 lower) # in 733 sequences in 1 files # %0.00 masked total, %0.00 masked real twoBitToFa hg38.contigsUnmasked.2bit stdout | faCount stdin | tail -1 # total 3061688741 898285419 623727342 626335137 900967885 12372958 30979743 # ACGT CpG same as original hg38.2bit except for the missing N's: # total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 # run stepwise so the lift can be done on the result before loading time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -stop=makeBed -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1 # real 11m0.690s # as above, failed on the bedToBigBed step since this isn't the full hg38 # sequence mv cpgIsland.bed cpgIsland.beforeLift.bed liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \ cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \ cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb # a lot more here that for masked sequence: wc -l cpgIsland.bed ../cpgIslandsContigs/cpgIsland.bed # 55149 cpgIsland.bed # 30477 ../cpgIslandsContigs/cpgIsland.bed featureBits -countGaps hg38 cpgIsland.bed # 33637531 bases of 3209286105 (1.048%) in intersection featureBits -countGaps hg38 ../cpgIslandsContigs/cpgIsland.bed # 23610399 bases of 3209286105 (0.736%) in intersection # debug load step so it can be loaded into a separate table: $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ -dbHost=hgwdev 
-bigClusterHub=ku -buildDir=`pwd` \ -debug -continue=load -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku hg38 time ./doLoadCpg.csh > load.log 2>&1 # real 0m2.179s # 33637531 bases of 3049335806 (1.103%) in intersection ######################################################################### # construct liftOver to hg19 (DONE - 2013-12-31 - Hiram) # it turns out it doesn't matter if the query or target 2bit files # are masked. This procedure can be done on completely unmasked sequences # for both, same result masked or not masked screen -S hg38 # manage this longish running job in a screen mkdir /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31 cd /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31 # this was run in manual steps as experiments were done about the masking # check it with -debug first to see if it is going to work: doSameSpeciesLiftOver.pl -stop=net -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -debug \ -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc hg38 hg19 # the debug step doesn't actually construct enough files to run the # steps manually. 
The chaining has an extra procedure that is performed # while not in 'debug' mode # the run.blat was operated manually, then chaining: time doSameSpeciesLiftOver.pl -continue=chain -stop=net -buildDir=`pwd` \ -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \ hg38 hg19 > chain.log 2>&1 # real 22m31.635s # loading is only a few seconds: doSameSpeciesLiftOver.pl -continue=load -buildDir=`pwd` \ -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \ hg38 hg19 > load.log 2>&1 # verify this file exists: # /gbdb/hg38/liftOver/hg38ToHg19.over.chain.gz # and try out the conversion on genome-test from hg38 to hg19 # same file should exist for downloads: # /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz ############################################################################ # marking the PAR regions: (DONE - 2014-01-09 - Hiram) # after much experimentation with the AGP files and the given NCBI # files in hg38/genbank/Primary_Assembly/pseudoautosomal_region # the PAR region definitions can be seen in the par_align.gff file: # CM000685.2 10001 2781479 -> CM000686.2 10001 2781479 # CM000685.2 155701383 156030895 -> CM000686.2 56887903 57217415 # equivalent to: # chrX 10001 2781479 -> chrY 10001 2781479 # chrX 155701383 156030895 -> chrY 56887903 57217415 # subtract one for the chromStart position: cat << '_EOF_' > hg38Par.bed4 chrX 10000 2781479 PAR1 chrX 155701382 156030895 PAR2 chrY 10000 2781479 PAR1 chrY 56887902 57217415 PAR2 '_EOF_' # << happy emacs hgLoadBed hg38 par hg38Par.bed4 checkTableCoords hg38 # hg19 had: +-------+------------+-----------+------+ | chrom | chromStart | chromEnd | name | +-------+------------+-----------+------+ | chrX | 60000 | 2699520 | PAR1 | | chrX | 154931043 | 155260560 | PAR2 | | chrY | 10000 | 2649520 | PAR1 | | chrY | 59034049 | 59363566 | PAR2 | +-------+------------+-----------+------+ # 
The AGP files come close to definining the location, but not # precisely. The first region uses different bits of AC006209.25: zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ | grep AC006209.25 CM000685.2 2665048 2677319 56 F AC006209.25 127483 139754 - CM000685.2 2677869 2804801 58 F AC006209.25 1 126933 - zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ | grep AC006209.25 CM000686.2 2665048 2677319 56 F AC006209.25 127483 139754 - CM000686.2 2677869 2781479 58 F AC006209.25 23323 126933 - # and the second region uses different bits of AJ271735.1: zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ | grep AJ271735.1 | head -1 CM000685.2 155676925 155719966 3096 O AJ271735.1 44687 87728 + zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ | grep AJ271735.1 | head -1 CM000686.2 56887903 56906486 356 O AJ271735.1 69145 87728 + # combining all the contig definitions from each will find all the # exact identical contig bits: zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ | grep -v "^#" | awk '$5 != "N"' \ | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \ | sort > chrY.comp.agp.txt zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ | grep -v "^#" | awk '$5 != "N"' \ | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \ | sort > chrX.comp.agp.txt join -t'^I' chrY.comp.agp.txt chrX.comp.agp.txt | head CM000685.2 10001 44821 CM000686.2 10001 44821 ... CM000685.2 2677320 2677868 CM000686.2 2677320 2677868 CM000685.2 155719967 155720351 CM000686.2 56906487 56906871 ... 
CM000685.2 155964490 156030895 CM000686.2 57151010 57217415 ############################################################################ ## altLocations track (DONE - 2014-01-02 - Hiram) # indicate corresponding locations between haplotypes and reference mkdir /hive/data/genomes/hg38/bed/altLocations cd /hive/data/genomes/hg38/bed/altLocations find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \ | while read F do grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s\t%d\t%d\tchr%s_%s_alt\n", $6,$12-1,$13,$6, $4}' done | sort -k1,1 -k2,2n > chrToAlt.bed # note silent hidden character in the join -t argument # explicit as written here find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \ | while read F do grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s_%s_alt\tchr%s:%d-%d\n", $6,$4,$6,$12,$13}' done | sort > altToChr.tab sort ../../chrom.sizes | join -t'^I' - altToChr.tab \ | awk '{printf "%s\t0\t%d\t%s\n", $1,$2,$3}' > altToChr.bed hgLoadBed hg38 altLocations chrToAlt.bed altToChr.bed featureBits -countGaps hg38 altLocations # 170113652 bases of 3209286105 (5.301%) in intersection ############################################################################ ## genscan (DONE - 2014-01-07 - Hiram) mkdir /hive/data/genomes/hg38/bed/genscan cd /hive/data/genomes/hg38/bed/genscan # using the contig sequence # running stepwise to allow the lifting of the final result time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \ -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ > do.log 2>&1 # three jobs did not finish due to almost all N's in the sequence, # just a couple of bases in each piece. Their empty result is good enough. 
time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \ -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ -continue=makeBed -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev \ -workhorse=hgwdev > makeBed.log 2>&1 # real 0m48.161s cd lifted mkdir -p gtf subopt nameFixed/gtf nameFixed/pep newNames pep for F in ../gtf/000/*.gtf do B=`basename $F` liftUp gtf/${B} ../../../jkStuff/hg38.contigs.lift carry $F echo $B done for F in ../subopt/000/*.bed do B=`basename $F` liftUp subopt/${B} ../../../jkStuff/hg38.contigs.lift carry $F echo $B done ls gtf/chr*_[0-9][0-9].gtf \ | sed -e 's/_[0-9][0-9]//; s#gtf/##; s/.gtf//;' | sort -u | while read C do cat ../pep/000/${C}_[0-9][0-9].pep > pep/${C}.pep cat gtf/${C}_[0-9][0-9].gtf | ./gtfFixId.pl ${C} > nameFixed/gtf/${C}.gtf ./pepNameFix.pl ${C} > nameFixed/pep/${C}.pep done cat nameFixed/gtf/*.gtf > ../hg38.genscan.gtf ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' | while read C do cat gtf/${C} done >> ../hg38.genscan.gtf cat nameFixed/pep/*.pep > ../hg38.genscan.pep ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' \ | sed -e 's/.gtf/.pep/' | while read C do cat ../pep/000/${C} done >> ../hg38.genscan.pep cd /hive/data/genomes/hg38/bed/genscan cat lifted/subopt/*.bed | sort -k1,1 -k2,2n > hg38.genscanSubopt.bed gtfToGenePred hg38.genscan.gtf hg38.genscan.gp genePredCheck -db=hg38 hg38.genscan.gp # checked: 44149 failed: 0 genePredToBed hg38.genscan.gp hg38.genscan.bed bedToBigBed hg38.genscan.bed ../../chrom.sizes hg38.genscan.bb bedToBigBed hg38.genscanSubopt.bed ../../chrom.sizes hg38.genscanSubopt.bb ldHgGene -gtf hg38 genscan hg38.genscan.gtf # Read 44149 transcripts in 339212 lines in 1 files # 44149 groups 345 seqs 1 sources 1 feature types cat fb.hg38.genscan.txt # 58278346 bases of 3049335806 (1.911%) in intersection cat fb.hg38.genscanSubopt.txt # 55020514 bases of 3049335806 (1.804%) in intersection # oddly, we are getting half of what hg19 had ? 
featureBits hg19 genscan # 106433874 bases of 2897316137 (3.674%) in intersection # This is because hg19 was run on soft-masked sequence and not # on hard masked sequence ############################################################################ ## genscan on unmasked sequence experiment (DONE - 2013-12-03 - Hiram) ## instead, working on unmasked sequence: mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun mkdir liftSpecs split -a 3 -d -l 1 ../../../jkStuff/hg38.nonBridged.lift liftSpecs/hg38_ mkdir fasta for F in liftSpecs/hg38_* do L=`cut -f2 $F` echo $L /cluster/home/hiram/kent/src/hg/utils/lft2BitToFa.pl \ ../../../hg38.unmasked.2bit $F > fasta/${L}.fa done cat << '_EOF_' > template #LOOP ./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > runGsBig.bash #!/bin/bash set -beEu -o pipefail export seqFile=$1 export resultGtf=$2 export resultPep=$3 export resultSubopt=$4 /cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 '_EOF_' # << happy emacs ls -1S `pwd`/fasta/*.fa > part.list gensub2 part.list single template jobList para create jobList para push # several jobs crashed: # Completed: 726 of 733 jobs # Crashed: 7 jobs # CPU time in finished jobs: 62501s 1041.68m 17.36h 0.72d 0.002 y # IO & Wait Time: 2563s 42.72m 0.71h 0.03d 0.000 y # Average job time: 90s 1.49m 0.02h 0.00d # Longest finished job: 3288s 54.80m 0.91h 0.04d # Submission to last job: 3294s 54.90m 0.92h 0.04d para status | grep -v -w done | awk '{print $(NF-3)}' > crashed.job.list mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs mkdir splitBits for F in chr2.06 chr1.03 chr3.05 chr12.07 chr10.05 
chr17.08 chr11.04 do faSplit -lift=${F}.lift gap ../fasta/${F}.fa 2000000 splitBits/${F}_ done ls -1S `pwd`/splitBits/*.fa > part.list cat << '_EOF_' > runGsBig.bash #!/bin/bash set -beEu -o pipefail export seqFile=$1 export resultGtf=$2 export resultPep=$3 export resultSubopt=$4 /cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 '_EOF_' # << happy emacs chmod +x runGsBig.bash cat << '_EOF_' > template #LOOP ./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed} #ENDLOOP '_EOF_' # << happy emacs gensub2 part.list single template jobList para create jobList para push # Completed: 331 of 334 jobs # Crashed: 3 jobs # CPU time in finished jobs: 18097s 301.62m 5.03h 0.21d 0.001 y # IO & Wait Time: 1085s 18.08m 0.30h 0.01d 0.000 y # Average job time: 58s 0.97m 0.02h 0.00d # Longest finished job: 79s 1.32m 0.02h 0.00d # Submission to last job: 249s 4.15m 0.07h 0.00d # the last three completed with -window=1600000 # lifting results: cat << '_EOF_' > fixIds.pl #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: cat chrN.M.lifted | ./fixIds.pl chrN.M\n"; exit 255; } my $F=shift; my $C = $F; $C =~ s/\.[0-9][0-9]//; my $id = 0; my $prevId = ""; open (GT, ">${F}.gtf") or die "can not write to ${F}.gtf"; while (my $line=<>) { chomp $line; my $geneId = $line; $geneId =~ s/^${C}.*gene_id "${C}//; $geneId =~ s/";.*//; $id += 1 if ( $prevId ne $geneId); $line =~ s/${C}[0-9]+.[0-9]+/${F}.$id/g; printf GT "%s\n", $line; $prevId = $geneId; } close (GT); '_EOF_' # << happy emacs chmod +x fixIds.pl for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 do echo "${F}" 1>&2 cut -f2 ${F}.lift | while read P do liftUp -type=.gtf stdout ${F}.lift error gtf/${P}.gtf done > ${F}.lifted.gtf cat 
${F}.lifted.gtf | ./fixIds.pl ${F} done # copied these results to ../gtf/ to get into the final result # -rw-rw-r-- 1 3349959 Jan 2 15:33 chr1.03.gtf # -rw-rw-r-- 1 2439182 Jan 2 15:33 chr10.05.gtf # -rw-rw-r-- 1 1068097 Jan 2 15:33 chr11.04.gtf # -rw-rw-r-- 1 2392548 Jan 2 15:33 chr12.07.gtf # -rw-rw-r-- 1 1831336 Jan 2 15:33 chr17.08.gtf # -rw-rw-r-- 1 3539694 Jan 2 15:33 chr2.06.gtf # -rw-rw-r-- 1 2309903 Jan 2 15:33 chr3.05.gtf for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 do echo "${F}" 1>&2 cut -f2 ${F}.lift | while read P do liftUp -type=.bed stdout ${F}.lift error subopt/${P}.bed done > ${F}.lifted.subopt.bed done # copied these results to ../subopt/ to get into the final result # -rw-rw-r-- 1 3349959 Jan 2 15:33 chr1.03.gtf # -rw-rw-r-- 1 2439182 Jan 2 15:33 chr10.05.gtf # -rw-rw-r-- 1 1068097 Jan 2 15:33 chr11.04.gtf # -rw-rw-r-- 1 2392548 Jan 2 15:33 chr12.07.gtf # -rw-rw-r-- 1 1831336 Jan 2 15:33 chr17.08.gtf # -rw-rw-r-- 1 3539694 Jan 2 15:33 chr2.06.gtf # -rw-rw-r-- 1 2309903 Jan 2 15:33 chr3.05.gtf cat << '_EOF_' > pepNameFix.pl #!/usr/bin/env perl use strict; use warnings; # BIG ASSUMPTION ! ! ! - the peptides are in the same order as # they are in the GTF file ! ! ! 
my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: cat chrN.M.needNameFix.pep | ./pepNameFix.pl chrN.M > chrN.M.pep\n"; exit 255; } my $C=shift; my $id = 1; while (my $line = <>) { if ($line =~ m/^>/) { printf ">%s.%d\n", $C, $id++; } else { print $line; } } '_EOF_' # << happy emacs chmod +x pepNameFix.pl for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 do echo "${F}" 1>&2 cut -f2 ${F}.lift | while read P do cat pep/${P}.pep done > ${F}.needNameFix.pep cat ${F}.needNameFix.pep | ./pepNameFix.pl ${F} > ${F}.pep done # copied these results to ../pep/ to get into the final result: # -rw-rw-r-- 1 1592655 Jan 2 15:55 chr1.03.pep # -rw-rw-r-- 1 1169168 Jan 2 15:55 chr10.05.pep # -rw-rw-r-- 1 519106 Jan 2 15:55 chr11.04.pep # -rw-rw-r-- 1 1152111 Jan 2 15:55 chr12.07.pep # -rw-rw-r-- 1 775052 Jan 2 15:55 chr17.08.pep # -rw-rw-r-- 1 1799546 Jan 2 15:55 chr2.06.pep # -rw-rw-r-- 1 1248762 Jan 2 15:55 chr3.05.pep # and then, adding in all the results together cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun cat << '_EOF_' > gtfIdFix.pl #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: cat lifted/gtf/chrN.gtf | ./gtfIdFix.pl chrN\n"; exit 255; } my $C=shift; my $id = 0; my $prevId = ""; open (NM, ">nameFixed/newNames/${C}.tab") or die "can not write to nameFixed/newNames/${C}.tab"; open (GT, ">nameFixed/gtf/${C}.gtf") or die "can not write to nameFixed/gtf/${C}.gtf"; while (my $line=<>) { chomp $line; my $geneId = $line; $geneId =~ s/^${C}.*gene_id "//; $geneId =~ s/";.*//; if ( $prevId ne $geneId) { $id += 1; printf NM "%s\t%s.%d\n", $geneId, $C, $id; } $line =~ s/${C}.[0-9]+.[0-9]+/${C}.$id/g; printf GT "%s\n", $line; $prevId = $geneId; } close (GT); close (NM); '_EOF_' # << happy emacs chmod +x gtfIdFix.pl rm -fr lifted rm -fr nameFix mkdir -p lifted mkdir -p lifted/gtf mkdir -p lifted/pep mkdir -p lifted/subopt mkdir -p nameFix mkdir -p nameFix/gtf mkdir -p nameFix/newNames 
for F in liftSpecs/hg38_* do L=`cut -f2 $F` C=`cut -f4 $F` liftUp -type=.gtf stdout ${F} error gtf/${L}.gtf >> lifted/gtf/${C}.gtf cat pep/${L}.pep >> lifted/pep/${C}.pep liftUp -type=.bed stdout ${F} error subopt/${L}.bed >> lifted/subopt/${C}.bed done for F in lifted/gtf/*.gtf do C=`basename $F | sed -e 's/.gtf//'` cat $F | ./gtfIdFix.pl $C done mkdir -p nameFixed/pep cat << '_EOF_' > pepNameFix.pl #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: ./pepNameFix.pl chrN > chrN.pep\n"; exit 255 } my $C = shift; my %newName; open (FH, "<lifted/pep/${C}.pep") or die "can not read lifted/pep/${C}.pep"; open (NM, "<nameFixed/newNames/${C}.tab") or die "can not read nameFixed/newNames/${C}.tab"; while (my $line = <NM>) { chomp $line; my ($needFix, $fixedName) = split('\t', $line); $newName{$needFix} = $fixedName; } close (NM); while (my $line = <FH>) { if ($line =~m /^>/) { chomp $line; $line =~ s/^>//; die "can not find name to fix $line" if (!exists($newName{$line})); printf ">%s\n", $newName{$line}; } else { print $line; } } close (FH); '_EOF_' # << happy emacs chmod +x pepNameFix.pl for F in lifted/pep/*.pep do C=`basename $F | sed -e 's/.pep//'` echo $C ./pepNameFix.pl $C > nameFixed/pep/$C.pep done ############################################################################# # Mark the new centromere regions (DONE - 2014-01-09 - Hiram) mkdir /hive/data/genomes/hg38/bed/centromere cd /hive/data/genomes/hg38/bed/centromere grep GJ ../../hg38.agp > hg38.centContigs.agp awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' hg38.centContigs.agp \ > hg38.centContigs.bed4 hgLoadBed hg38 centromeres hg38.centContigs.bed4 checkTableCoords hg38 centromeres ############################################################################# ## alternate sequence/haplotype alignments (DONE - 2014-01-23 - Hiram) mkdir /hive/data/genomes/hg38/bed/lastzAltSequences cd /hive/data/genomes/hg38/bed/lastzAltSequences rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa mkdir targetFa mkdir queryFa touch temp.lift cat ../altLocations/chrToAlt.bed | while read L do chrName=`echo $L | awk '{print $1}'`
chromSize=`egrep "^$chrName " ../../chrom.sizes | cut -f2` chrStart=`echo $L | awk '{if (($2-10000)>=0) {printf "%d", $2-10000} else {printf "0"}}'` chrEnd=`echo $L | awk -v chromSize=$chromSize '{if (($3+10000)<=chromSize) {printf "%d", $3+10000} else {printf "%d", chromSize}}'` chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'` queryName=`echo $L | awk '{print $4}'` partName="${chrName}_${chrStart}_${chrEnd}" echo $chrName $chrStart $chrEnd $queryName $partName $chromSize echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift twoBitToFa ../../hg38.unmasked.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa twoBitToFa ../../hg38.unmasked.2bit:$queryName queryFa/$queryName.fa done sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift # these were run serially on hgwdev, they could be a cluster run: ssh ku mkdir /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz cd /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz mkdir ../lav ../psl # construct the jobList ls ../targetFa | sed -e 's/.fa//;' | while read partName do echo "./runJob.sh ${partName}" done > jobList cat << '_EOF_' > runJob #!/bin/sh export partName=$1 export target="../targetFa/$partName.fa" export query="../queryFa/$partName.fa" export lav="../lav/$partName.lav" export psl="../psl/$partName.psl" /cluster/bin/penn/lastz-distrib-1.03.46/bin/lastz \ $target $query \ Y=15000 T=2 M=254 O=600 H=2000 O=600 E=150 K=10000 L=10000 \ Q=/scratch/data/blastz/human_chimp.v2.q > $lav lavToPsl $lav stdout | liftUp $psl ../hg38.haplotypes.lift error stdin '_EOF_' # << happy emacs # these were run serially on hgwdev, they could be a cluster run: time ./jobList > do.log # real 61m35.898s # chaining lastz results: mkdir -p /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run/chain cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run ls ../../psl/*.psl | while read P do B=`basename $P | sed -e 's/.psl//'` echo $B $P ls 
-og $P ../../targetFa/${B}.fa ../../queryFa/${B}.fa /cluster/home/hiram/kent/src/hg/mouseStuff/axtChain/axtChain \ -psl -scoreScheme=/scratch/data/blastz/human_chimp.v2.q \ -minScore=1000 -linearGap=medium $P \ ../../../../hg38.unmasked.2bit \ ../../../../hg38.unmasked.2bit stdout \ | chainAntiRepeat ../../../../hg38.unmasked.2bit \ ../../../../hg38.unmasked.2bit stdin chain/${B}.chain done # real 7m54.677s cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain find ./run/chain -name "*.chain" | chainMergeSort -inputList=stdin \ | nice gzip -c > hg38.haplotypes.all.chain.gz chainPreNet hg38.haplotypes.all.chain.gz ../../../chrom.sizes \ /hive/data/genomes/hg38/chrom.sizes stdout \ | chainNet stdin -minSpace=1 ../../../chrom.sizes \ ../../../chrom.sizes stdout /dev/null \ | netSyntenic stdin noClass.net # Make liftOver chains from chroms to alternates: netChainSubset -verbose=0 noClass.net hg38.haplotypes.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg38.haplotypes.over.chain.gz # swap the alignments to get the alternates to chrom mappings: chainSwap hg38.haplotypes.over.chain.gz stdout \ | gzip -c > hg38.reference.over.chain.gz # and put them all together so mappings go both directions chainMergeSort hg38.haplotypes.over.chain.gz hg38.reference.over.chain.gz \ | gzip -c > hg38.haploReference.over.chain.gz hgLoadChain -tIndex hg38 chainAltSequence hg38.haploReference.over.chain.gz netClass -verbose=0 -noAr noClass.net hg38 hg38 hg38.hg38AltSequence.net netFilter -minGap=10 hg38.hg38AltSequence.net \ | hgLoadNet -verbose=0 hg38 netAltSequence stdin chainToPsl hg38.haploReference.over.chain.gz ../../../chrom.sizes \ ../../../chrom.sizes \ /hive/data/genomes/hg38/hg38.unmasked.2bit \ /hive/data/genomes/hg38/hg38.unmasked.2bit \ hg38.beforeRecalc.haploReference.over.psl pslCheck -targetSizes=../../../chrom.sizes \ -querySizes=../../../chrom.sizes \ hg38.beforeRecalc.haploReference.over.psl 2>&1 | tail -1 # checked: 3092 failed: 57 errors: 57 
pslRecalcMatch hg38.beforeRecalc.haploReference.over.psl \ ../../../hg38.unmasked.2bit ../../../hg38.unmasked.2bit \ hg38.haploReference.over.psl pslCheck -targetSizes=../../../chrom.sizes \ -querySizes=../../../chrom.sizes \ hg38.haploReference.over.psl 2>&1 | tail -1 # checked: 3092 failed: 0 errors: 0 hgLoadPsl hg38 -table=altSequenceLiftOver hg38.haploReference.over.psl ############################################################################# ## construct non-bridged contig sequence (DONE - 2014-01-10 - Hiram) mkdir /hive/data/genomes/hg38/bed/nonBridgedContigs cd /hive/data/genomes/hg38/bed/nonBridgedContigs # only need the actual split chroms in this lift, and the # _nn name is a bit more convenient than the .nn: gapToLift -minGap=100 hg38 stdout | sed -e 's/\./_/;' \ | awk '$1 != 0' > hg38.contigs.lift # the warnings gapToLift issues are about gaps defined in the table # that are abutting to each other. telomere gaps are next to contig gaps # those lifts in the format of a bed file: awk '{printf "%s\t%d\t%d\t%s\n", $4, $1, $1+$3, $2}' hg38.contigs.lift \ > hg38.contigs.bed # the negation of that is the gaps between the contigs # fixup the .N to _nn with the awk: featureBits -not -countGaps hg38 hg38.contigs.bed -bed=stdout \ | awk '{split($4,a,"."); printf "%s\t%d\t%d\t%s_%02d\n", $1,$2,$3,a[1],a[2]}' \ > hg38.gaps.bed # 268613637 bases of 3209286105 (8.370%) in intersection # together, those two should be %100 of the genome exactly: featureBits -countGaps -or hg38 hg38.contigs.bed hg38.gaps.bed # 3209286105 bases of 3209286105 (100.000%) in intersection # the list of all those other bits not in the split chroms: egrep "_alt|chrUn|chrM|_random" hg38.gaps.bed | cut -f1 \ | sort > other.bits.list # extract those chrom pieces and the other bits from the masked sequence: (twoBitToFa -bed=hg38.contigs.bed ../../hg38.2bit stdout; \ twoBitToFa -seqList=other.bits.list ../../hg38.2bit stdout) \ | faToTwoBit stdin hg38.contigs.2bit twoBitInfo hg38.contigs.2bit
stdout | sort -k2nr > hg38.contigs.chrom.sizes # verify nothing has been lost: twoBitToFa ../../hg38.2bit stdout | faCount stdin | tail -1 # total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 twoBitToFa hg38.contigs.2bit stdout | faCount stdin | tail -1 # total 3061688741 898285419 623727342 626335137 900967885 12372958 30979743 # the ACGT and CPG counts remain the same, only N's have been lost # make a copy of this at the top: cp -p hg38.contigs.2bit ../.. cp -p hg38.contigs.lift ../../jkStuff # load as a track to be able to see where they are: egrep "chrUn|chrM|_alt|_random" hg38.contigs.chrom.sizes \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $1}' \ > fullCoverage.hg38Contigs.bed cat hg38.contigs.bed >> fullCoverage.hg38Contigs.bed featureBits -or -countGaps hg38 fullCoverage.hg38Contigs.bed gap # 3209286105 bases of 3209286105 (100.000%) in intersection hgLoadBed hg38 contigAlignmentSegments fullCoverage.hg38Contigs.bed ############################################################################# ## analysis of repeat elements from each RM run ## (DONE - 2014-01-10 - Hiram) mkdir /hive/data/genomes/hg38/bed/repeatElementCount cd /hive/data/genomes/hg38/bed/repeatElementCount for F in ../rmsk*/hg38.class.profile.txt \ ../repeatMaskerGenbank/hg38.class.profile.txt do D=`dirname $F` B=`basename $D | sed -e 's/repeatMaskerGenbank/NCBI/; s/rmsk//;'` echo "==== $B ====" grep rmskClass $F | sed -e 's#rmskClass/##; s/.tab//;' \ | awk '{printf "%s\t%d\n", $2, $1}' | sort > ${B}.tab done # Hmmer does not have snRNA and tRNA ? 
echo -e "snRNA\t0" >> Hmmer.tab echo -e "tRNA\t0" >> Hmmer.tab sort Hmmer.tab > t.tab mv t.tab Hmmer.tab echo "# Repeat Masker item counts" > table.result.txt echo "# class NCBI cross-match rmblastn HMMER" >> table.result.txt join NCBI.tab CM.tab | join - Blastn.tab | join - Hmmer.tab \ | awk '{printf "%-15s\t%7d\t%7d\t%7d\t%7d\n", $1,$2,$3,$4,$5}' \ | sort -k2,2nr >> table.result.txt cat table.result.txt # Repeat Masker item counts # class NCBI cross-match rmblastn HMMER SINE 1849444 1852545 1822406 1884179 LINE 1586141 1570523 1551012 1702529 LTR 759248 748597 737799 805427 DNA 502186 499108 485558 565171 Simple_repeat 433789 703682 716968 636906 Low_complexity 396378 102856 105181 95480 Satellite 10198 7962 7703 10852 LTR? 5884 5667 5068 9181 snRNA 4595 4516 4548 0 Retroposon 4163 5750 5630 11861 Unknown 2802 5622 5263 3914 DNA? 2157 3294 3018 4582 tRNA 2154 2026 1983 0 rRNA 1915 1840 1810 464 RC 1860 1784 1706 2059 srpRNA 1784 1672 1633 1517 scRNA 1397 1420 1426 6783 RNA 822 704 611 1484 SINE? 488 38 38 970 RC? 
445 411 374 806 total 5567850 5520017 5459735 5744165 ############################################################################# ## blat server turned on (DONE - 2014-01-13 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hg38", "blat4c", "17780", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("hg38", "blat4c", "17781", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to ABO gene (DONE - 2014-01-13 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr9:133252000-133280861" where name="hg38";' hgcentraltest ######################################################################### ## update grp table with new set of standard rows (DONE - 2014-01-29 - Hiram) hgsql -e 'alter table grp rename grpOriginal;' hg38 hgsql -e 'drop table grp;' hg38 hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg19.grp" hg38 hgsql -e 'delete from grp where name="denisova";' hg38 hgsql -e 'delete from grp where name="pub";' hg38 hgsql -e 'delete from grp where name="neandertal";' hg38 hgsql -e 'update grp set defaultIsClosed=0 where name="map";' hg38 hgsql -e 'drop table grpOriginal;' hg38 ############################################################################ # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2014-01-21 - Hiram) ssh ku mkdir /hive/data/genomes/hg38/bed/linSpecRep cd /hive/data/genomes/hg38/bed/linSpecRep # create individual .out files from the master record in ../repeatMasker mkdir splitOut cat << '_EOF_' > split.csh #!/bin/csh -fe set C = $1 head -3 ../repeatMasker/hg38.sorted.fa.out > splitOut/${C}.out grep "${C} " ../repeatMasker/hg38.sorted.fa.out >> splitOut/${C}.out '_EOF_' # << happy emacs chmod +x split.csh cat << '_EOF_' > template #LOOP split.csh $(root1) {check out line+ 
splitOut/$(root1).out} #ENDLOOP '_EOF_' # << happy emacs # small ones first: cut -f1 ../../chrom.sizes | tac > chrom.list gensub2 chrom.list single template jobList para create jobList para try ... check ... push ... etc... # Completed: 93 of 93 jobs # CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y # IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y # Average job time: 186s 3.10m 0.05h 0.00d # Longest finished job: 224s 3.73m 0.06h 0.00d # Submission to last job: 280s 4.67m 0.08h 0.00d # now, we can date and process each of those .out files # constructing the humanSpecific set of repeats # this means repeats found in human, and not in others # using mouse here for 'others' is good enough, a variety # of other species could be used (rat dog cow) where they all # produce the same result mkdir dateRepeats cd dateRepeats cat << '_EOF_' > mkLSR #!/bin/bash set -beEu -o pipefail rm -f $1.out_mus-musculus ln -s ../splitOut/$1.out . /scratch/data/RepeatMasker/DateRepeats $1.out -query human -comp mouse rm $1.out mkdir -p ../humanSpecific /cluster/bin/scripts/extractRepeats 1 $1.out_mus-musculus \ > ../humanSpecific/$1.out.spec '_EOF_' # << happy emacs chmod +x mkLSR cat << '_EOF_' > template #LOOP ./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec} #ENDLOOP '_EOF_' # << happy emacs gensub2 ../chrom.list single template jobList para try ... check ... push ... etc... 
para time # Completed: 455 of 455 jobs # CPU time in finished jobs: 13985s 233.08m 3.88h 0.16d 0.000 y # IO & Wait Time: 1470s 24.50m 0.41h 0.02d 0.000 y # Average job time: 34s 0.57m 0.01h 0.00d # Longest finished job: 111s 1.85m 0.03h 0.00d # Submission to last job: 1427s 23.78m 0.40h 0.02d # We also need the nibs for blastz runs with lineage specific repeats mkdir /hive/data/genomes/hg38/bed/nibs cd /hive/data/genomes/hg38/bed/nibs cut -f1 ../../chrom.sizes | while read C do twoBitToFa -seq=${C} ../../hg38.2bit stdout \ | faToNib -softMask stdin ${C}.nib echo "${C} done" done # verify nothing lost cat ../../chrom.sizes \ | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \ | sh | faSize stdin # 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper # 1588630985 lower) in 455 sequences in 1 files # Total size: mean 7053376.1 sd 31548372.6 # min 970 (chrUn_KI270394v1.nib:0-970) # max 248956422 (chr1.nib:0-248956422) median 161218 # %49.50 masked total, %52.10 masked real mkdir /hive/data/staging/data/hg38/nib rsync -a --progress ./ /hive/data/staging/data/hg38/nib ############################################################################# ## GRC Contigs/ctgPos2 track (DONE - 2014-12-25 - Hiram) # provide mapping of UCSC chrom names to GRC names mkdir /hive/data/genomes/hg38/bed/ctgPos2 cd /hive/data/genomes/hg38/bed/ctgPos2 grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ | awk '{printf "s/^%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt find ../../genbank -type f | grep "/assembled_chromosomes/AGP/" | sed -e 's/.comp//' | while read F do if [ -s $F ]; then zcat $F | grep -v "^#" fi done | sed -e "`cat accessionToUcsc.sed.txt`" > ucsc.grch38.agp awk '$5 != "N"' ucsc.grch38.agp \ | awk '{printf "%s\t%d\t%s\t%d\t%d\t%s\n", $6, $3-$2+1, $1, $2-1, $3, $5}' \ | sort -u | sort -k3,3 -k4,4n > ctgPos2.tab export ctgSize=`awk '{print length($1)}' ctgPos2.tab | sort -n | tail -1` export chrSize=`awk '{print length($3)}' 
ctgPos2.tab | sort -n | tail -1` sed -e "s/20/$ctgSize/; s/16/$chrSize/;" \ /cluster/home/hiram/kent/src/hg/lib/ctgPos2.sql > hg38.ctgPos2.sql hgLoadSqlTab hg38 ctgPos2 hg38.ctgPos2.sql ctgPos2.tab ############################################################################ # constructing download files (WORKING - 2014-01-15 - Hiram) # add hg38 to all.joiner and verify it is clean: joinerCheck -database=hg38 -keys all.joiner # Checking keys on database hg38 # hg38.ucscToINSDC.chrom - hits 455 of 455 (100.000%) ok # and all table coordinates are OK: checkTableCoords hg38 cd /hive/data/genomes/hg38 time $HOME/kent/src/hg/utils/automation/makeDownloads.pl \ -workhorse=hgwdev hg38 # makeDownloads.pl has made a preliminary set of files # need to fixup these names and add chromFa.tar.gz files cd /hive/data/genomes/hg38/goldenPath/bigZips mkdir chroms mkdir maskedChroms faSplit byname hg38.fa.gz chroms/ faSplit byname hg38.fa.masked.gz maskedChroms/ tar cvzf ./hg38.chromFa.tar.gz ./chroms/ tar cvzf ./hg38.chromFaMasked.tar.gz ./maskedChroms/ cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFa.tar.gz hg38.chromFa.tar.gz ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFaMasked.tar.gz hg38.chromFaMasked.tar.gz #also added entries for above to md5sum.txt and README.txt ############################################################################ # LASTZ MOUSE Mm10 (DONE - 2014-01-23,31 - Hiram) # can no longer use the lineage specific repeats with the new lastz # use a screen to manage this longish job: screen -S hg38Mm10 mkdir /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 cd /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 # best to always specify an exact path to lastz so we know which one is used # lastz default parameters are human-mouse parameters cat << '_EOF_' > DEF # human vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Human Hg38 
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=40000000 SEQ1_LAP=10000 # QUERY: Mouse Mm10 SEQ2_DIR=/scratch/data/mm10/mm10.2bit SEQ2_LEN=/scratch/data/mm10/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 TMPDIR=/dev/shm '_EOF_' # << happy emacs time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ -stop=net `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 # real 1494m26.135s ---- busy cluster time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ -continue=load `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 # Elapsed time: 43m11s cat fb.hg38.chainMm10Link.txt # 964465044 bases of 3049335806 (31.629%) in intersection # and the swap mkdir /hive/data/genomes/mm10/bed/blastz.hg38.swap cd /hive/data/genomes/mm10/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 83m28.397s cat fb.mm10.chainHg38Link.txt # 937030766 bases of 2652783500 (35.323%) in intersection ######################################################################### # LASTZ Dog CanFam3 (DONE - 2014-01-26 - Hiram) mkdir /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 cd /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 cat << '_EOF_' > DEF # human vs dog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Human Hg38 
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Dog CanFam3 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 TMPDIR=/dev/shm '_EOF_' # << happy emacs # establish a screen to control this job screen hg38CanFam3 time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # Elapsed time: 1396m22s - busy cluster cat fb.hg38.chainCanFam3Link.txt # 1523987456 bases of 3049335806 (49.978%) in intersection # running the swap mkdir /hive/data/genomes/canFam3/bed/blastz.hg38.swap cd /hive/data/genomes/canFam3/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26/DEF \ -syntenicNet -swap \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 107m57.787s cat fb.canFam3.chainHg38Link.txt # 1437624815 bases of 2392715236 (60.083%) in intersection ######################################################################### # LASTZ Macaca Mulatta RheMac3 (DONE - 2014-01-27,02-10 - Hiram) mkdir /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 cd /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 # best to always specify an exact path to lastz so we know which one is used # lastz default parameters are human-mouse parameters cat << '_EOF_' > DEF # human vs macaca mulatta BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from panTro2 vs hg18 lastz on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Macaca Mulatta RheMac3 SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 TMPDIR=/dev/shm '_EOF_' # << happy emacs time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # Elapsed time: 1426m43s - busy cluster cat fb.hg38.chainRheMac3Link.txt # 2431208700 bases of 3049335806 (79.729%) in intersection # running the swap mkdir /hive/data/genomes/rheMac3/bed/blastz.hg38.swap cd /hive/data/genomes/rheMac3/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27/DEF \ -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # 82m32.329s cat fb.rheMac3.chainHg38Link.txt # 2288533769 bases of 2639145830 (86.715%) in intersection ######################################################################### ## construct analysis set (DONE - 2014-01-27 - Hiram) mkdir /hive/data/genomes/hg38/bed/analysisSet cd /hive/data/genomes/hg38/bed/analysisSet mkdir -p splitFa faToTwoBit \ ../../genbank/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \ hg38.unmasked.analysisSet.2bit faCount splitFa/c*.fa > 
splitFa.faCount.txt egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../rmskCM/hg38.sorted.fa.out \ > hg38.analysisSet.out twoBitMask hg38.unmasked.analysisSet.2bit hg38.analysisSet.out \ hg38.rmsk.analysisSet.2bit egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../simpleRepeat/trfMask.bed \ > trfMask.analysisSet.bed twoBitMask hg38.rmsk.analysisSet.2bit -add trfMask.analysisSet.bed \ hg38.analysisSet.2bit twoBitToFa hg38.unmasked.analysisSet.2bit stdout | faSize stdin # 3099922541 bases (165046090 N's 2934876451 real 2934876451 upper 0 lower) # in 195 sequences in 1 files # Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1) # max 248956422 (chr1) median 32032 # %0.00 masked total, %0.00 masked real twoBitToFa hg38.analysisSet.2bit stdout | faSize stdin # 3099922541 bases (165046090 N's 2934876451 real 1409378896 upper 1525497555 # lower) in 195 sequences in 1 files # Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1) # max 248956422 (chr1) median 32032 # %49.21 masked total, %51.98 masked real mkdir hg38.analysisSet.chroms twoBitToFa hg38.analysisSet.2bit stdout \ | faSplit byname stdin hg38.analysisSet.chroms/ tar cvzf ./hg38.analysisSet.chroms.tar.gz ./hg38.analysisSet.chroms ln -s `pwd`/hg38.analysisSet.2bit \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips ln -s `pwd`/hg38.analysisSet.chroms.tar.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips # add these md5 sums to md5sum.txt md5sum hg38.analysisSet.2bit hg38.analysisSet.chroms.tar.gz >> \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/md5sum.txt cp ../../genbank/README_ANALYSIS_SETS README.analysisSet.txt # add note at the top of README: ###################################################################### UCSC copy of the file from: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/README_ANALYSIS_SETS ln -s `pwd`/README.analysisSet.txt \ 
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips ######################################################################### # the FULL analysis set (DONE - 2014-03-18 - Hiram mkdir /hive/data/genomes/hg38/bed/fullAnalysisSet cd /hive/data/genomes/hg38/bed/fullAnalysisSet mkdir hg38.fullAnalysisSet.chroms twoBitToFa ../analysisSet/hg38.analysisSet.2bit stdout \ | faSplit byname stdin hg38.fullAnalysisSet.chroms/ grep _alt ../../chrom.sizes | cut -f 1 > alt.list twoBitToFa -seqList=alt.list ../../hg38.2bit stdout \ | faSplit byname stdin hg38.fullAnalysisSet.chroms/ faCount hg38.fullAnalysisSet.chroms/chr*.fa > faCount.fullAnalysisSet.txt faToTwoBit hg38.fullAnalysisSet.chroms/chr*.fa hg38.fullAnalysisSet.2bit twoBitInfo hg38.fullAnalysisSet.2bit stdout | sort -k2nr > chrom.sizes tar cvzf ./hg38.fullAnalysisSet.chroms.tar.gz ./hg38.fullAnalysisSet.chroms ######################################################################### # LASTZ Self/hg38 (DONE - 2014-01-25,02-10 - Hiram) # can no longer use the lineage specific repeats with the new lastz # use a screen to manage this longish job: screen -S hg38Self mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 # construct the non-bridged contigs sequence to use: (twoBitToFa ../nonBridgedContigs/hg38.chroms.contigs.2bit stdout; twoBitToFa ../../hg38.2bit:chrM stdout) | faToTwoBit stdin hg38.self.2bit twoBitInfo hg38.self.2bit stdout | sort -k2nr > hg38.self.chrom.sizes # best to always specify an exact path to lastz so we know which one is used # lastz default parameters are human-mouse parameters cat << '_EOF_' > DEF # human vs human with mouse defaults BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Human Hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit 
SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Human Hg38 SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 TMPDIR=/dev/shm '_EOF_' _EOF_ time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ -stop=net `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 # real 1518m15.817s -- problems # there was a problem in the 'part014' batch. running that manually: mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob # make 100 jobs out of the 10 parts: mkdir -p psl cp ../tParts/part014.lst ./xpart014.lst split -l 1 xpart014.lst -d -a 3 part for F in part0* do mv $F $F.lst done for T in part0*.lst do for Q in part0*.lst do mkdir -p psl/${T} echo /cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T} ${Q} ../../DEF \{check out exists psl/${T}/${T}.${Q}.psl\} done done > jobList para -ram=32g create jobList para push # one last failing job: # Completed: 99 of 100 jobs # CPU time in finished jobs: 2836s 47.27m 0.79h 0.03d 0.000 y # IO & Wait Time: 279s 4.65m 0.08h 0.00d 0.000 y # Average job time: 31s 0.52m 0.01h 0.00d # Longest finished job: 586s 9.77m 0.16h 0.01d # Submission to last job: 620s 10.33m 0.17h 0.01d mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010 cd 
/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010 mkdir psl twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 part010.fa faSplit -lift=split010.lift size part010.fa 169000 split010_ TOP="/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010" for T in split*.fa do mkdir -p psl/${T} echo "${TOP}/${T}" > ${T}.lst faToTwoBit ${T} ${T}.2bit for Q in split*.fa do echo "/cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T}.lst ${Q}.lst DEF {check out exists psl/${T}/${T}.${Q}.psl}" done done > jobList para -ram=32g create jobList # Completed: 100 of 100 jobs # CPU time in finished jobs: 176579s 2942.99m 49.05h 2.04d 0.006 y # IO & Wait Time: 1239s 20.64m 0.34h 0.01d 0.000 y # Average job time: 1778s 29.64m 0.49h 0.02d # Longest finished job: 29343s 489.05m 8.15h 0.34d # Submission to last job: 29348s 489.13m 8.15h 0.34d catDir psl/* | grep -v "^#" > raw.psl liftUp -type=.psl stdout split010.lift error raw.psl \ | liftUp -pslQ -type=.psl chr16_03.psl split010.lift error stdin # this combination allowed psl headers to sneak in the middle, # had to be cleaned: catDir psl/* | grep -v "^#" > part014.psl cat split010/chr16_03.psl >> part014.psl cp -p part014.psl ../../psl/part014.lst/part014.lst_part014.lst.psl time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ -continue=cat -stop=net `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 # real 43m11.340s # failed in chaining, running manually on hgwdev time ./bigJobs.sh > bigJobs.log 2>&1 # real 468m59.648s time ./part014.sh > part014.log 2>&1 # real 1319m57.911s # -rw-rw-r-- 1 3581498246 Feb 8 14:37 part014.lst.chain time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ -continue=chainMerge -stop=net `pwd`/DEF \ -syntenicNet -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ -verbose=2 \ -continue=load -stop=load `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 hgLoadChain -normScore -tIndex hg38 chainSelf hg38.hg38.all.chain.gz # Loading 104815249 chains into hg38.chainSelf cat fb.hg38.chainSelfLink.txt # 392419010 bases of 3049335806 (12.869%) in intersection cd /hive/data/genomes/hg38/bed ln -s lastzSelf.2014-01-25 lastz.self ln -s lastzSelf.2014-01-25 lastz.hg38 ######################################################################### ## 4-Way Multiz for UCSC Genes construction (DONE - 2014-02-11 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/multiz4way cd /hive/data/genomes/hg38/bed/multiz4way # extract our 4 organisms from the 44-way on hg18: ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh /cluster/bin/phast/tree_doctor \ --prune-all-but hg19,mm10,canFam3,rheMac3 $HOME/kent/src/hg/utils/phyloTrees/120way.nh \ | sed -e "s/hg19/hg38/" > 4way.nh # this looks like: cat 4way.nh (((hg38:0.033974,rheMac3:0.037601):0.109934,mm10:0.356483):0.020593,canFam3:0.165928); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/hg38_4way.gif /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt # Use this output to create the table below grep -y hg38 4way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainHg38Link chain linearGap # distance on hg38 on other minScore # 1 0.071575 - rhesus rheMac3 (% 79.729) (% 86.715) 5000 medium # 2 0.330429 - dog canFam3 (% 49.978) (% 60.083) 3000 medium # 3 0.500391 - mouse mm10 
(% 31.629) (% 35.323) 3000 medium # using the syntenic nets cd /cluster/data/hg38/bed/multiz4way mkdir mafLinks cd mafLinks mkdir rheMac3 canFam3 mm10 for D in mm10 canFam3 rheMac3 do ln -s ../../../lastz.${D}/axtChain/hg38.${D}.synNet.maf.gz ./${D}/ done mkdir /hive/data/genomes/hg38/bed/multiz4way/mafSplit cd /hive/data/genomes/hg38/bed/multiz4way/mafSplit for D in mm10 canFam3 rheMac3 do echo "working: ${D}" zcat ../mafLinks/${D}/hg38.${D}.synNet.maf.gz > ${D}.maf mkdir -p ${D} mafSplit -byTarget -useFullSequenceName /dev/null ${D}/${D}_ ${D}.maf rm -f ${D}.maf done # determine what is the newest version of multiz and use that cd /hive/data/genomes/hg38/bed/multiz4way mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # the autoMultiz cluster run ssh ku cd /hive/data/genomes/hg38/bed/multiz4way # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 4way.nh > tmp.nh echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = hg38 set c = $1 set maf = $2 set binDir = /hive/data/genomes/hg38/bed/multiz4way/penn set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/hg38/bed/multiz4way/mafSplit rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/${s}_$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # 
<< happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg38/bed/multiz4way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/hg38/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 455 jobs para try ... check ... push ... etc ... # Completed: 455 of 455 jobs # CPU time in finished jobs: 50111s 835.18m 13.92h 0.58d 0.002 y # IO & Wait Time: 5574s 92.91m 1.55h 0.06d 0.000 y # Average job time: 122s 2.04m 0.03h 0.00d # Longest finished job: 4717s 78.62m 1.31h 0.05d # Submission to last job: 4722s 78.70m 1.31h 0.05d # combine results into a single file for loading and gbdb reference cd /hive/data/genomes/hg38/bed/multiz4way grep "^#" maf/chr19_GL949749v2_alt.maf | grep -v "eof maf" > multiz4way.maf grep -h -v "^#" maf/*.maf >> multiz4way.maf grep "^#" maf/chr19_GL949749v2_alt.maf | grep "eof maf" >> multiz4way.maf # real 3m27.561s # makes a 8.5 Gb file: # -rw-rw-r-- 1 9044143788 Feb 11 12:51 multiz4way.maf # Load into database ssh hgwdev cd /hive/data/genomes/hg38/bed/multiz4way mkdir /gbdb/hg38/multiz4way ln -s /hive/data/genomes/hg38/bed/multiz4way/multiz4way.maf \ /gbdb/hg38/multiz4way # the hgLoadMaf generates huge tmp files, locate them in /dev/shm cd /dev/shm time nice -n +19 hgLoadMaf hg38 multiz4way # Loaded 6141667 mafs in 1 files from /gbdb/hg38/multiz4way # real 2m2.812s cd /hive/data/genomes/hg38/bed/multiz4way time (cat /gbdb/hg38/multiz4way/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=10000 \ -mergeGap=500 -maxSize=50000 hg38 multiz4waySummary stdin) # Created 1266559 summary blocks from 11780291 components and 6141667 mafs # real 3m0.791s # -rw-rw-r-- 1 311246327 Feb 11 12:54 multiz4way.tab # -rw-rw-r-- 1 58730176 Feb 11 12:58 multiz4waySummary.tab wc -l multiz4way* # 6141667 multiz4way.tab # 1266559 multiz4waySummary.tab # 7408226 total ######################################################################### ## RE-load 
alternate sequence for PSL display (DONE - 2016-01-15 - Hiram) ## The procedure below ## "load alternate sequence for PSL display (DONE - #2014-02-24 - Hiram) ## produced an illegal psl Table altSeqLiftOverPsl: pslCheck -db=hg38 altSeqLiftOverPsl checked: 266 failed: 264 errors: 1046 ## Since then, the gff3ToPsl command has been updated to be a bit more ## robust, so, the following sequence produces the new alignment file: mkdir -p /hive/data/genomes/hg38/bed/altAlignments/redo2016 cd /hive/data/genomes/hg38/bed/altAlignments/redo2016 mkdir -p ucscPsl awk -F'/' '{printf "s/^%s\t/%s\t/g;\n", $3,$2}' ../accessionToUcsc.sed.txt \ > ucscToNcbi.sed.txt sed -f ucscToNcbi.sed.txt ../../../chrom.sizes > ncbi.chrom.sizes paste ncbi.chrom.sizes ../../../chrom.sizes \ | awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1,$2,$3,$4}' \ > ncbiToUcsc.lift find ../../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ | while read gff do name=`basename $gff | sed -e 's/_.*//;'` fasta=`dirname $gff | sed -e 's#alignments#FASTA/alt.scaf.fa.gz#;'` size=`faCount $fasta | grep -w total | cut -f2` printf "%s\t%d\n" "$name" "$size" > target.sizes gff3ToPsl ncbi.chrom.sizes target.sizes $gff $name.psl pslCheck ${name}.psl liftUp -type=.psl stdout ncbiToUcsc.lift error ${name}.psl \ | liftUp -type=.psl -pslQ ucscPsl/${name}.psl ncbiToUcsc.lift error stdin pslCheck ucscPsl/${name}.psl done pslSort dirs altSeqLiftOverPsl.psl ./tmp ucscPsl pslCheck -db=hg38 altSeqLiftOverPsl.psl hgLoadPsl hg38 altSeqLiftOverPsl.psl pslCheck -db=hg38 altSeqLiftOverPsl # checked: 266 failed: 0 errors: 0 ######################################################################### ## load alternate sequence for PSL display (DONE - 2014-02-24 - Hiram) mkdir /hive/data/genomes/hg38/bed/altAlignments/sequence cd /hive/data/genomes/hg38/bed/altAlignments/sequence rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa mkdir targetFa mkdir queryFa touch temp.lift cat ../../altLocations/chrToAlt.bed | while read L do 
chrName=`echo $L | awk '{print $1}'` chromSize=`egrep "^$chrName " ../../../chrom.sizes | cut -f2` chrStart=`echo $L | awk '{printf "%d", $2}'` chrEnd=`echo $L | awk '{printf "%d", $3}'` chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'` queryName=`echo $L | awk '{print $4}'` partName="${chrName}_${chrStart}_${chrEnd}" echo $chrName $chrStart $chrEnd $queryName $partName $chromSize echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift twoBitToFa ../../../hg38.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa twoBitToFa ../../../hg38.2bit:$queryName queryFa/$queryName.fa done sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift mkdir /gbdb/hg38/ncbiAltMappings cd /hive/data/genomes/hg38/bed/altAlignments/sequence/queryFa ln -s `pwd`/*.fa /gbdb/hg38/ncbiAltMappings cd /hive/data/genomes/hg38/bed/altAlignments/sequence hgLoadSeq -drop -seqTbl=seqNcbiAltSequence -extFileTbl=extNcbiAltSequence \ hg38 /gbdb/hg38/ncbiAltMappings/*.fa pslSwap ../altAlignments.psl stdout \ | pslRecalcMatch stdin ../../../hg38.2bit ../../../hg38.2bit \ hg38.referenceTarget.psl # the table name altSeqLiftOverPsl is recognized in hgc to allow display # of the details of the alignments hgLoadPsl hg38 -table=altSeqLiftOverPsl hg38.referenceTarget.psl ######################################################################### ## alternate sequence alignments EXPERIMENT (DONE - 2014-01-17 - Hiram) # the lastzAltSequences.2014-01-23 alignment was used for this instead # of this procedure mkdir /hive/data/genomes/hg38/bed/altAlignments cd /hive/data/genomes/hg38/bed/altAlignments grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ | while read F do cat $F | sed -f accessionToUcsc.sed.txt \ | gff3ToPsl ../../chrom.sizes stdin stdout done > altAlignments.psl | xargs cat | 
sed -f accessionToUcsc.sed.txt \ | gff3ToPsl ../../chrom.sizes stdin altAlignments.psl time pslRecalcMatch altAlignments.psl ../../hg38.2bit ../../hg38.2bit \ altRecalcMatch.psl # real 0m51.122s # just to see what they look like in different formats: pslToChain altRecalcMatch.psl altAlignments.chain chainToAxt altAlignments.chain ../../hg38.2bit ../../hg38.2bit \ altAlignments.axt axtToMaf -score altAlignments.axt ../../chrom.sizes ../../chrom.sizes \ altAlignments.maf mkdir mafSplits mafSplit /dev/null mafSplits/ altAlignments.maf # doesn't work: # Can't find chrom in MAF component src: chr6_GL000250v2_alt mkdir splits psl find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ | while read F do chrAlt=`basename $F | sed -e 's/_.*//' | sed -f accessionToUcsc.sed.txt` echo $chrAlt cat $F | sed -f accessionToUcsc.sed.txt \ | gff3ToPsl ../../chrom.sizes stdin splits/${chrAlt}.psl pslRecalcMatch splits/${chrAlt}.psl ../../hg38.2bit ../../hg38.2bit \ psl/${chrAlt}.psl done mkdir swap mkdir swap/psl swap/chain swap/axt swap/maf swap/anno for F in psl/*.psl do B=`basename $F | sed -e 's/.psl//'` echo $B pslSwap $F stdout | pslRecalcMatch stdin ../../hg38.2bit ../../hg38.2bit \ swap/psl/${B}.psl pslToChain swap/psl/${B}.psl swap/chain/${B}.chain chainToAxt swap/chain/${B}.chain ../../hg38.2bit ../../hg38.2bit \ swap/axt/${B}.axt axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > swap/maf/${B}.maf mafAddIRows -nBeds=nBeds swap/maf/${B}.maf ../../hg38.2bit swap/anno/${B}.maf done # axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ # | sed -e 's/^s chr/s hg38.chr/' > swap/maf/${B}.maf twoBitInfo -nBed ../../hg38.2bit ../../hg38.N.bed ln -s ../../hg38.N.bed hg38.bed ln -s ../../hg38.N.bed ref38.bed ln -s ../../hg38.N.bed alt38.bed echo hg38.bed > nBeds echo ref38.bed >> nBeds echo alt38.bed >> nBeds ln -s 
../../chrom.sizes hg38.len ln -s ../../chrom.sizes ref38.len ln -s ../../chrom.sizes alt38.len echo hg38.len > sizes echo ref38.len >> sizes echo alt38.len >> sizes mkdir chain axt maf anno for F in psl/*.psl do B=`basename $F | sed -e 's/.psl//'` echo $B pslToChain $F chain/${B}.chain chainToAxt chain/${B}.chain ../../hg38.2bit ../../hg38.2bit axt/${B}.axt axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > maf/${B}.maf mafAddIRows -nBeds=nBeds maf/${B}.maf ../../hg38.2bit anno/${B}.maf done # axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ # | sed -e 's/^s chr/s hg38.chr/' > maf/${B}.maf ############################################################################ # Liftover Gencode V19 from hg19 (DONE braney 2014-02-14) mkdir /cluster/data/hg38/bed/liftOverGencodeV19 cd /cluster/data/hg38/bed/liftOverGencodeV19 echo "show tables like 'wgEncodeGencode%19'" | hgsql hg19 | tail -n +2 > all.gencode.tables echo " select tableName from trackDb where tableName like 'wgEncodeGencode_%V19';" | hgsql hg19 --skip-column-names > genePred.gencode.tables # load the non-genepred table as is. 
This isn't quite the right thing to do # with exon support, but it's good enough for our purposes at the moment join -v 1 *.gencode.tables | while read t; do echo "create table $t select * from hg19.$t" | hgsql hg38; echo $t; done for i in `cat genePredExt.gencode.tables`; do echo "select name,score,name2 from $i" | hgsql hg19 | sort > $i.name2Score.txt; genePredToFakePsl hg19 $i $i.psl $i.cds; pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | sort | join /dev/stdin $i.name2Score.txt| tr ' ' '\t' | hgLoadGenePred -genePredExt hg38 $i stdin; echo $i; done for i in `cat genePred.gencode.tables`; do genePredToFakePsl hg19 $i $i.psl $i.cds; pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | tr ' ' '\t' | hgLoadGenePred hg38 $i stdin; echo $i; done ##################################################################### ## tRNAs track ( 2014-02-18 braney DONE) ## this is a preliminary version for UCSC build. NOT FOR RELEASE! ssh hgwdev cd /hive/data/genomes/hg38/bed mkdir tRNAs cd tRNAs cp /hive/users/pchan/tRNAs/Eukaryota/hg38/hg38-tRNAs.bed . 
hgLoadBed -tab hg38 tRNAs hg38-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql ## tRNAs track (2015-10-04, Chris FINISHING BUILD FOR RELEASE) cd /hive/data/genomes/hg38/bed/tRNAs cat /hive/users/pchan/gtrnadb2/Eukaryota/hg38/hg38-tRNAs.bed | sed 's^^^g' | > hg38-tRNAs2.bed hgsql hg38 -e 'drop table if exists tRNAs' hgLoadBed -tab hg38 tRNAs hg38-tRNAs2.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql mkdir gif cp -p /hive/users/pchan/gtrnadb2/Eukaryota/hg38/images/* gif cd /hive/data/gbdb/hg38 ln -s /hive/data/genomes/hg38/bed/tRNAs/gif RNA-img cd /usr/local/apache/htdocs-ceisenhart/RNA-img ln -s /gbdb/hg38/RNA-img hg38 ############################################################################ # EXONIPHY , lifted from hg19 (DONE - braney 2014-02-19) # needed for ucscGenes building # exoniphyHg19.gp is prepared as follows mkdir /cluster/data/hg38/bed/exoniphy cd /cluster/data/hg38/bed/exoniphy hgsql hg19 -e "select * from exoniphy" -N | cut -f 2-16 > exoniphyHg19.gp time nice -n +19 liftOver -genePred exoniphyHg19.gp \ /cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz \ exoniphyHg38.gp unmapped # real 0m2.015s # user 0m1.894s # sys 0m0.076s wc -l * # 186601 exoniphyHg19.gp # 186533 exoniphyHg38.gp # 136 unmapped # 373270 total cd /cluster/data/hg38/bed/exoniphy nice -n +19 hgLoadGenePred -genePredExt hg38 exoniphy exoniphyHg38.gp nice -n +19 featureBits hg38 exoniphy # 28807039 bases of 3049335806 (0.945%) in intersection nice -n +19 featureBits hg19 exoniphy # 28661160 bases of 2897316137 (0.989%) in intersection ######################################################################### # LASTZ Rat Rn5 (DONE - 2014-02-27 - Hiram) # establish a screen to control this job screen -S hg38Rn5 mkdir /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 cd /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 # XXX don't forget to specify the BLASTZ binary: cat << '_EOF_' > DEF # human vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Human Hg38 
SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn5 SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 658m53.984s cat fb.hg38.chainRn5Link.txt # 938823407 bases of 3049335806 (30.788%) in intersection # running the swap mkdir /hive/data/genomes/rn5/bed/blastz.hg38.swap cd /hive/data/genomes/rn5/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \ -swap \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 66m53.095s cat fb.rn5.chainHg38Link.txt # 934256475 bases of 2572853723 (36.312%) in intersection # syntenic net for 14-way use 2014-04-02 - Hiram cd /hive/data/genomes/rn5/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \ -continue=syntenicNet -syntenicNet -swap \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 # real 16m54.489s ############################################################################## # LASTZ Rat Rn4 (DONE - 2014-02-27 - Hiram) # establish a screen to control this job screen -S hg38Rn4 mkdir /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 cd /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 # XXX don't forget to specify the BLASTZ binary: cat 
<< '_EOF_' > DEF # human vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Human Hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 SEQ2_DIR=/hive/data/genomes/rn4/rn4.2bit SEQ2_LEN=/hive/data/genomes/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 658m53.984s cat fb.hg38.chainRn4Link.txt # 913992768 bases of 3049335806 (29.974%) in intersection # running the swap mkdir /hive/data/genomes/rn4/bed/blastz.hg38.swap cd /hive/data/genomes/rn4/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27/DEF \ -swap \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 73m5.666s cat fb.rn4.chainHg38Link.txt # 889613774 bases of 2571531505 (34.595%) in intersection ############################################################################## # GENEID GENE PREDICTIONS (DONE - 2014-03-07 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/geneid cd /hive/data/genomes/hg38/bed/geneid mkdir download cd download for C in `cut -f1 ../../../chrom.sizes` do echo $C wget --timestamping \ http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.gtf3 wget --timestamping \ http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.prot done cd .. 
cat download/*.gtf | ldHgGene -gtf -genePredExt hg38 geneid stdin # Read 33428 transcripts in 277332 lines in 1 files # 33428 groups 92 seqs 1 sources 3 feature types # 33428 gene predictions ############################################################################ # GENEREVIEWS TRACK (DONE 2014-05-17 - Chin) # This track depends on some tasks completed for hg19, specifically: # # $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql # $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql # $HOME/kent/src/hg/lib/geneReviewsDetail.sql # $HOME/kent/src/hg/makeDb/trackDb/human/geneReviews.html # # Unlike hg19, this hg38 tracks is generated by the automatic geneReviews # scripts in # /hive/data/outside/otto/geneReviews, specifically buildGeneReviews.sh. # Current data are fetched weekly from NCBI # ftp://ftp.ncbi.nlm.nih.gov/pub/GeneReviews/ # to /hive/data/outside/otto/geneReviews/${DATE}. ########################################################################### # Chimp Lastz run (DONE - 2014-05-27 - Hiram) screen -S hg38PanTro4 # use a screen to manage this longish running job mkdir /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 cd /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 # always set the BLASTZ program so we know what version was used cat << '_EOF_' > DEF # human vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_O=600 BLASTZ_E=150 # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_T=2 BLASTZ_Y=15000 BLASTZ_K=4500 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # A C G T # 90 -330 -236 -356 # -330 100 -318 -236 # -236 -318 100 -330 # -356 -236 -330 90 # TARGET: Human Hg38 SEQ1_DIR=/scratch/data/hg38/hg38.2bit SEQ1_LEN=/scratch/data/hg38/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Chimp PanTro4 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 SEQ2_IN_CONTIGS=0 
BASE=/hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 TMPDIR=/dev/shm '_EOF_' # << emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 154m12.215s cat fb.hg38.chainPanTro4Link.txt # 2839294579 bases of 3049335806 (93.112%) in intersection # filter with doRecipBest.pl time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ hg38 panTro4) > rbest.log 2>&1 # real 57m55.320s # running the swap mkdir /hive/data/genomes/panTro4/bed/blastz.hg38.swap cd /hive/data/genomes/panTro4/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27/DEF \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 cat fb.panTro4.chainHg38Link.txt # 2776497530 bases of 2902338967 (95.664%) in intersection # real 98m23.729s time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ panTro4 hg38) > rbest.log 2>&1 # real 64m33.812s ############################################################################# # Opossum Lastz run (DONE - 2014-05-27 - Hiram) screen -S hg38MonDom5 # use a screen to manage this longish running job mkdir /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 cd /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 # always set the BLASTZ program so we know what version was used cat << '_EOF_' > DEF # human vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_M=50 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Human Hg38 SEQ1_DIR=/scratch/data/hg38/hg38.2bit SEQ1_LEN=/scratch/data/hg38/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: Opossum MonDom5 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit 
SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 TMPDIR=/dev/shm '_EOF_' # << emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 670m13.280s # one failed chain run for hg19, finished manually on hgwdev, then: time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > chainMerge.log 2>&1 # real 164m28.822s cat fb.hg38.chainMonDom5Link.txt # 438195373 bases of 3049335806 (14.370%) in intersection # filter with doRecipBest.pl time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ -dbHost=hgwdev -workhorse=hgwdev hg38 monDom5) > rbest.log 2>&1 # real 130m22.825s # running the swap mkdir /hive/data/genomes/monDom5/bed/blastz.hg38.swap cd /hive/data/genomes/monDom5/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 102m41.443s cat fb.monDom5.chainHg38Link.txt # 420069915 bases of 3501660299 (11.996%) in intersection time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1 # real 90m56.189s _EOF_ ############################################################################# # LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie) # Redmine #13359, #24285 -- otto-mate To Do #17877 # previously done 7/7/14, 9/9/16, 5/30/18 set today = `date +%Y_%m_%d` mkdir -p /hive/data/genomes/hg38/bed/lrg/$today cd /hive/data/genomes/hg38/bed/lrg/$today wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip unzip 
LRG_public_xml_files.zip # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts: ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38 genePredCheck lrgTranscriptsUnmapped.gp #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46 #checked: 1029 failed: 1 # If there are complaints e.g. about exonFrame, look for inconsistencies in the # affected transcript's coding_region/coordinates vs. exon/intron info in xml. # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background # (missing exonFrame info doesn't affect our track representation because we end up using # psl). We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon # portion is only the stop codon. # No longer necessary to filter out alt and fix patches since they have been added to hg38. # Load LRG regions: bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD): lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl pslCheck lrg.psl #checked: 919 failed: 0 errors: 0 awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes genePredToFakePsl -chromSize=lrg.sizes placeholder \ lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \ lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp #Warning: no CDS for LRG_163t1 #Warning: no CDS for LRG_347t1 # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*). grep -l NR_ LRG_163.xml LRG_347.xml #LRG_163.xml #LRG_347.xml # Load PSL, CDS and sequences. 
hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds hgPepPred hg38 tab lrgCdna lrgCdna.tab hgPepPred hg38 tab lrgPep lrgPep.tab ############################################################################# ## 7-Way Multiz (DONE - 2014-06-02 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/multiz7way cd /hive/data/genomes/hg38/bed/multiz7way # from the 63-way in the source tree, select out the 7 used here: /cluster/bin/phast/tree_doctor \ --prune-all-but hg19,panTro4,rheMac3,mm10,rn5,canFam3,monDom5 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/130way.nh \ | sed -e 's/hg19/hg38/' > hg38.7way.nh # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh # (((((hg38:0.006550, # panTro4:0.006840):0.027424, # rheMac3:0.037601):0.109934, # (mm10:0.084509, # rn5:0.091589):0.271974):0.020593, # canFam3:0.165928):0.258392, # monDom5:0.340786); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" hg38.7way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > hg38.7way.commonNames.nh $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh > t.nh $HOME/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > hg38.7way.scientificNames.nh rm -f t.nh cat hg38.7way.scientificNames.nh # (((((Homo_sapiens:0.00655, # 
Pan_troglodytes:0.00684):0.027424, # Macaca_mulatta:0.037601):0.109934, # (Mus_musculus:0.084509, # Rattus_norvegicus:0.091589):0.271974):0.020593, # Canis_lupus_familiaris:0.165928):0.258392, # Monodelphis_domestica:0.340786); ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.commonNames.nh # (((((Human:0.00655, # Chimp:0.00684):0.027424, # Rhesus:0.037601):0.109934, # (Mouse:0.084509, # Rat:0.091589):0.271974):0.020593, # Dog:0.165928):0.258392, # Opossum:0.340786); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/hg38_7way.png /cluster/bin/phast/all_dists hg38.7way.nh | grep hg38 \ | sed -e "s/hg38.//" | sort -k2n > 7way.distances.txt # Use this output to create the table below head 7way.distances.txt # taeGut1 0.075718 # melUnd1 0.220312 # galGal4 0.507021 # melGal1 0.509140 # hg19 1.175433 # mm10 1.383071 cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "<7way.distances.txt") or die "can not read 7way.distances.txt"; my $count = 0; while (my $line = <FH>) { chomp $line; my ($D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/hg38/bed/lastz.$D/fb.hg38." . $chain . 
"Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.hg38/fb.${D}.chainHg38Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainLink # N distance on hg38 on other other species # 01 0.0134 (% 93.112) (% 95.664) - Chimp panTro4 # 02 0.0716 (% 79.729) (% 86.715) - Rhesus rheMac3 # 03 0.3304 (% 49.978) (% 60.083) - Dog canFam3 # 04 0.5004 (% 31.629) (% 35.323) - Mouse mm10 # 05 0.5075 (% 30.788) (% 36.312) - Rat rn5 # 06 0.7637 (% 14.370) (% 11.996) - Opossum monDom5 # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # hg38 panTro4 rheMac3 mm10 rn5 canFam3 monDom5 # bash shell syntax here ... 
cd /hive/data/genomes/hg38/bed/multiz7way export H=/hive/data/genomes/hg38/bed mkdir mafLinks # want syntenic net for: panTro4 rheMac3 mm10 rn5 canFam3 # and unfiltered maf net for: monDom5 for G in panTro4 rheMac3 mm10 rn5 canFam3 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G done mkdir mafLinks/monDom5 echo ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5 ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5 # verify the symLinks are good: ls -ogrtL mafLinks/*/* #-rw-rw-r-- 1 709500062 Jan 25 12:15 mafLinks/mm10/hg38.mm10.synNet.maf.gz #-rw-rw-r-- 1 1089643630 Jan 27 19:15 mafLinks/canFam3/hg38.canFam3.synNet.maf.gz #-rw-rw-r-- 1 1277455681 Jan 28 21:52 mafLinks/rheMac3/hg38.rheMac3.synNet.maf.gz #-rw-rw-r-- 1 687500679 Mar 1 12:27 mafLinks/rn5/hg38.rn5.synNet.maf.gz #-rw-rw-r-- 1 1463969868 May 27 11:41 mafLinks/panTro4/hg38.panTro4.synNet.maf.gz #-rw-rw-r-- 1 323347908 May 29 12:38 mafLinks/monDom5/hg38.monDom5.net.maf.gz # split the maf files into a set of hashed named files # this hash named split keeps the same chr/contig names in the same # named hash file. mkdir /hive/data/genomes/hg38/bed/multiz7way/mafSplit cd /hive/data/genomes/hg38/bed/multiz7way/mafSplit for D in `sed -e "s/hg38 //" ../species.list` do echo "${D}" mkdir $D cd $D echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz" mafSplit -byTarget -useHashedName=8 /dev/null . \ ../../mafLinks/${D}/*.maf.gz cd .. done # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 641 find . 
-type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 118 maf.list mkdir /hive/data/genomes/hg38/bed/multiz7way/splitRun cd /hive/data/genomes/hg38/bed/multiz7way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = hg38 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/hg38/bed/multiz7way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/hg38/bed/multiz7way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ln -s ../../mafSplit/maf.list maf.list ssh ku cd /hive/data/genomes/hg38/bed/multiz7way/splitRun/run gensub2 maf.list single template stdout > jobList para -ram=8g create jobList # Completed: 118 of 118 jobs # CPU time in finished jobs: 118241s 1970.69m 32.84h 1.37d 0.004 y # IO & Wait Time: 682s 11.36m 0.19h 0.01d 0.000 y # Average job time: 1008s 16.80m 0.28h 0.01d # Longest finished job: 10068s 167.80m 2.80h 0.12d # Submission to last job: 10076s 167.93m 2.80h 0.12d # combine into 
one file (the 1>&2 redirect sends the echo to stderr) cd /hive/data/genomes/hg38/bed/multiz7way head -1 splitRun/maf/017.maf > multiz7way.maf for F in splitRun/maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} done >> multiz7way.maf tail -1 splitRun/maf/017.maf >> multiz7way.maf # -rw-rw-r-- 1 15635828403 Jun 3 11:49 multiz7way.maf # Load into database ssh hgwdev cd /hive/data/genomes/hg38/bed/multiz7way mkdir /gbdb/hg38/multiz7way ln -s `pwd`/multiz7way.maf /gbdb/hg38/multiz7way cd /dev/shm time nice -n +17 hgLoadMaf hg38 multiz7way # Loaded 10270624 mafs in 1 files from /gbdb/hg38/multiz7way # real 3m51.265s time nice -n +17 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \ /gbdb/hg38/multiz7way/multiz7way.maf # Created 1260918 summary blocks from 35384988 components # and 10270624 mafs from /gbdb/hg38/multiz7way/multiz7way.maf # real 5m39.388s wc -l multiz7way*.tab # 10270624 multiz7way.tab # 1260918 multiz7waySummary.tab # 11531542 total rm multiz7way*.tab ############################################################################## # GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2014-06-03 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. Need to split of the maf file into individual # maf files mkdir -p /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit cd /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \ /dev/null . ../../multiz7way.maf # real 4m8.617s find . -type f | wc -l # 353 # check for N.bed files everywhere: cd /hive/data/genomes/hg38/bed/multiz7way/anno for DB in `cat ../species.list` do if [ ! 
-s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" # cd /hive/data/genomes/${DB} # twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/hg38/bed/multiz7way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S hg38 # use a screen to control this longish job ssh ku cd /hive/data/genomes/hg38/bed/multiz7way/anno mkdir result for D in `ls mafSplit` do echo mkdir result/${D} mkdir result/${D} done cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/hg38/hg38.2bit {check out exists+ result/$(path1)} #ENDLOOP '_EOF_' # << happy emacs find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list gensub2 maf.list single template jobList # limit jobs on a node with the ram=32g requirement because they go fast para -ram=32g create jobList para try ... check ... push ... 
# Completed: 353 of 353 jobs # CPU time in finished jobs: 530s 8.83m 0.15h 0.01d 0.000 y # IO & Wait Time: 1057s 17.62m 0.29h 0.01d 0.000 y # Average job time: 4s 0.07m 0.00h 0.00d # Longest finished job: 63s 1.05m 0.02h 0.00d # Submission to last job: 220s 3.67m 0.06h 0.00d # verify all result files have some content, look for 0 size files: find ./result -type f -size 0 # should see none # or in this manner: find ./result -type f | xargs ls -og | sort -k3nr | tail # combine into one file (the 1>&2 redirect sends the echo to stderr) head -q -n 1 result/0/chr8.maf > hg38.7way.maf find ./result -type f | while read F do echo "${F}" 1>&2 grep -h -v "^#" ${F} done >> hg38.7way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/0/chr8.maf >> hg38.7way.maf # How about an official end marker: echo "##eof maf" >> hg38.7way.maf ls -og # -rw-rw-r-- 1 17795297196 Jun 3 14:01 hg38.7way.maf du -hsc hg38.7way.maf # 17G hg38.7way.maf # construct symlinks to get the individual maf files into gbdb: rm /gbdb/hg38/multiz7way/multiz7way.maf # remove previous results ln -s `pwd`/hg38.7way.maf /gbdb/hg38/multiz7way/multiz7way.maf # Load into database cd /dev/shm time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg38/multiz7way \ hg38 multiz7way # Loaded 10359242 mafs in 1 files from /gbdb/hg38/multiz7way # real 4m21.862s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \ /gbdb/hg38/multiz7way/multiz7way.maf # Created 1260918 summary blocks from 35384988 components # and 10359242 mafs from /gbdb/hg38/multiz7way/multiz7way.maf # real 6m6.583s # -rw-rw-r-- 1 530538267 Jun 3 14:05 multiz7way.tab # -rw-rw-r-- 1 60616616 Jun 3 14:15 multiz7waySummary.tab rm multiz7way*.tab ###################################################################### # MULTIZ7WAY MAF FRAMES (DONE - 2014-06-03 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/multiz7way/frames cd /hive/data/genomes/hg38/bed/multiz7way/frames 
# survey all the genomes to find out what kinds of gene tracks they have cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "mgcGenes" || $table == "knownGene" || \ $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg19 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh time ./showGenes.csh # hg38: knownGene: 104178, mgcGenes: 34081, refGene: 54852, xenoRefGene: 172740, Mrnas: 10723716 # panTro4: ensGene: 29160, refGene: 2622, xenoRefGene: 280516, Mrnas: 11163 # rheMac3: refGene: 6369, xenoRefGene: 275096, Mrnas: 443642 # mm10: ensGene: 94647, knownGene: 61642, mgcGenes: 26768, refGene: 33765, xenoRefGene: 161178, Mrnas: 5224613 # rn5: ensGene: 29188, mgcGenes: 6924, refGene: 18567, xenoRefGene: 175416, Mrnas: 1247500 # canFam3: ensGene: 29884, refGene: 1582, xenoRefGene: 253196, Mrnas: 387195 # monDom5: ensGene: 24882, refGene: 492, xenoRefGene: 248251, Mrnas: 2461 # from that summary, use these gene sets: # refGene - rheMac3 # ensGene - panTro4 rn5 canFam3 monDom5 # knownGene - hg38 mm10 mkdir genes # 1. knownGene: hg38 mm10 for DB in hg38 mm10 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${DB}.gp.gz done # 2. 
ensGene: for DB in panTro4 rn5 canFam3 monDom5 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 3. refGene for DB in rheMac3 do hgsql -N -e "select * from refGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/canFam3.gp.gz: 19507 # genes/hg38.gp.gz: 21887 # genes/mm10.gp.gz: 21013 # genes/monDom5.gp.gz: 21033 # genes/panTro4.gp.gz: 18657 # genes/rheMac3.gp.gz: 5614 # genes/rn5.gp.gz: 22863 time (cat ../anno/hg38.7way.maf \ | nice -n +19 genePredToMafFrames hg38 stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ | gzip > multiz7wayFrames.bed.gz) # real 3m44.591s # verify there are frames on everything, should be 7 species: zcat multiz7wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c # 265160 canFam3 # 208941 hg38 # 253323 mm10 # 574521 monDom5 # 200156 panTro4 # 49802 rheMac3 # 244731 rn5 # load the resulting file ssh hgwdev cd /hive/data/genomes/hg38/bed/multiz7way/frames time hgLoadMafFrames hg38 multiz7wayFrames multiz7wayFrames.bed.gz # real 0m19.959s time featureBits -countGaps hg38 multiz7wayFrames # 52686177 bases of 3209286105 (1.642%) in intersection # real 0m12.593s # enable the trackDb entries: # frames multiz7wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 7-way (DONE - 2014-06-04 - Hiram) mkdir /hive/data/genomes/hg38/bed/multiz7way/4d cd /hive/data/genomes/hg38/bed/multiz7way/4d # the annotated maf is: ../anno/hg38.7way.maf # using knownGene for hg38 hgsql -N 
-e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 > hg38.knownGene.gp genePredSingleCover hg38.knownGene.gp stdout | sort > hg38.knownGeneNR.gp wc -l hg38.knownGeneNR.gp # 21887 hg38.knownGeneNR.gp mkdir annoSplit cd annoSplit time mafSplit -verbose=2 -outDirDepth=1 -byTarget -useFullSequenceName \ /dev/null . ../../anno/hg38.7way.maf # real 5m14.770s find . -type f | wc -l # 353 ssh ku mkdir /hive/data/genomes/hg38/bed/multiz7way/4d/run cd /hive/data/genomes/hg38/bed/multiz7way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/hg38/bed/multiz7way" set c = $1:r set infile = $r/4d/annoSplit/$2 set outDir = $r/4d/mfa/$3:h set outfile = $r/4d/mfa/$3 /bin/mkdir -p $outDir cd /scratch/tmp /bin/awk -v C=$c '$2 == C {print}' $r/4d/hg38.knownGeneNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` echo $NL if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile else echo "" > $outfile endif /bin/rm -f $c.gp $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template jobList para create jobList para try ... 
check para time # Completed: 353 of 353 jobs # CPU time in finished jobs: 836s 13.93m 0.23h 0.01d 0.000 y # IO & Wait Time: 1172s 19.54m 0.33h 0.01d 0.000 y # Average job time: 6s 0.09m 0.00h 0.00d # Longest finished job: 72s 1.20m 0.02h 0.00d # Submission to last job: 89s 1.48m 0.02h 0.00d # Not all results have contents, that is OK # combine mfa files ssh hgwdev cd /hive/data/genomes/hg38/bed/multiz7way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # most interesting, this did not identify files of size 1: # find ./mfa -type f -size 1 find ./mfa -type f | xargs ls -og | awk '$3 == 1' | awk '{print $NF}' \ > empty.list cat empty.list | xargs rm -f #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >hg38 # >panTro4 # >rheMac3 # >mm10 # >rn5 # >canFam3 # >monDom5 sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../hg38.7way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh # tree_commas.nh looks like: # (((((hg38,panTro4),rheMac3),(mm10,rn5)),canFam3),monDom5) # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree_commas.nh 4d.all.mfa # real 0m6.583s mv phyloFit.mod all.mod grep TREE all.mod # TREE: (((((hg38:0.00673596,panTro4:0.00686169):0.0248146,rheMac3:0.0357598):0.0970072,(mm10:0.081661,rn5:0.0874126):0.246527):0.0264964,canFam3:0.156769):0.303241,monDom5:0.303241); # compare these calculated lengths to the tree extracted from 130way: grep TREE all.mod | sed -e 's/TREE: //' \ | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 | sort -k3n \ | sed -e "s/hg38.//; s/^/ # /" # panTro4 0.013598 # rheMac3 0.067310 # canFam3 0.311823 # mm10 0.456746 # rn5 
0.462497 # monDom5 0.761536 # yes, somewhat similar /cluster/bin/phast/all_dists ../hg38.7way.nh | grep hg38 \ | sort -k3n | sed -e "s/hg38.//; s/^/ # /" # panTro4 0.013390 # rheMac3 0.071575 # canFam3 0.330429 # mm10 0.500391 # rn5 0.507471 # monDom5 0.763679 ######################################################################### # phastCons 7-way (DONE - 2014-06-04 - Hiram) # split 7way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/SS cd /hive/data/genomes/hg38/bed/multiz7way/cons/SS mkdir result done cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set d = $1 set c = $2 set doneDir = done/$d set MAF = /hive/data/genomes/hg38/bed/multiz7way/anno/result/$d/$c.maf set WINDOWS = /hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$d/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $3 ) then exit 0 endif if ( -s $3.running ) then exit 0 endif /bin/mkdir -p $doneDir /bin/date >> $3.running /bin/rm -fr $WINDOWS /bin/mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null /bin/date >> $3 /bin/rm -f $3.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list gensub2 maf.list single template jobList para -ram=32g create jobList para try ... check ... 
etc # Completed: 353 of 353 jobs # CPU time in finished jobs: 1216s 20.27m 0.34h 0.01d 0.000 y # IO & Wait Time: 1385s 23.08m 0.38h 0.02d 0.000 y # Average job time: 7s 0.12m 0.00h 0.00d # Longest finished job: 111s 1.85m 0.03h 0.00d # Submission to last job: 189s 3.15m 0.05h 0.00d find ./result -type f | wc -l # 641 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons cd /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set d = $2 set f = $3 set len = $4 set cov = $5 set rho = $6 set grp = $cwd:t set cons = /hive/data/genomes/hg38/bed/multiz7way/cons set tmp = $cons/tmp/${d}_${c} mkdir -p $tmp set ssSrc = $cons/SS/result set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$d/$f $tmp else ln -s $ssSrc/$d/$f $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp else $PHASTBIN/phastCons $f $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp endif popd > /dev/null mkdir -p pp/$d bed/$d sleep 4 touch pp/$d bed/$d rm -f pp/$d/$c.pp rm -f bed/$d/$c.bed mv $tmp/$c.pp pp/$d mv $tmp/$c.bed bed/$d rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h '_EOF_' # << happy 
emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list wc -l ss.list # 641 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/hg38/bed/multiz7way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=32g create jobList para try ... check ... para push # Completed: 641 of 641 jobs # CPU time in finished jobs: 6557s 109.29m 1.82h 0.08d 0.000 y # IO & Wait Time: 4497s 74.94m 1.25h 0.05d 0.000 y # Average job time: 17s 0.29m 0.00h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 120s 2.00m 0.03h 0.00d # create Most Conserved track cd /hive/data/genomes/hg38/bed/multiz7way/cons/all cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/?/${C} 2> /dev/null | while read D do echo ${D}/${C}*.bed 1>&2 cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # -rw-rw-r-- 1 42636652 Jun 4 10:45 tmpMostConserved.bed # -rw-rw-r-- 1 43721828 Jun 4 10:45 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/hg38/bed/multiz7way/cons/all time nice -n +19 hgLoadBed hg38 phastConsElements7way mostConserved.bed # Read 1234990 elements of size 5 from mostConserved.bed # real 0m11.390s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits hg38 -enrichment knownGene:cds 
phastConsElements7way # knownGene:cds 1.266%, phastConsElements7way 4.551%, # both 0.888%, cover 70.16%, enrich 15.42x # Create merged posterior probability file and wiggle track data files cd /hive/data/genomes/hg38/bed/multiz7way/cons/all mkdir downloads # the third sed fixes the chrom names, removing the partition extensions time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ | gzip -c > downloads/phastCons7way.wigFix.gz) # real 37m47.242s # check integrity of data with wigToBigWig time (zcat downloads/phastCons7way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ phastCons7way.bw) > bigWig.log 2>&1 & tail bigWig.log # pid=34733: VmPeak: 33106324 kB # real 40m53.287s bigWigInfo phastCons7way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 5,675,802,079 # primaryIndexSize: 92,579,900 # zoomLevels: 10 # chromCount: 353 # basesCovered: 2,898,191,577 # mean: 0.168088 # min: 0.000000 # max: 1.000000 # std: 0.233827 # encode those files into wiggle data time (zcat downloads/phastCons7way.wigFix.gz \ | wigEncode stdin phastCons7way.wig phastCons7way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 15m28.525s du -hsc *.wi? # 2.7G phastCons7way.wib # 282M phastCons7way.wig # 3.0G total # Load gbdb and database with wiggle. 
ln -s `pwd`/phastCons7way.wib /gbdb/hg38/multiz7way/phastCons7way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way \ hg38 phastCons7way phastCons7way.wig # real 0m33.502s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh hg38 phastCons7way # db.table min max mean count sumData stdDev viewLimits hg38.phastCons7way 0 1 0.168088 2898191577 4.87152e+08 0.233827 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=hg38 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons7way > histogram.data 2>&1 # real 2m40.179s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human hg38 Histogram phastCons7way track" set xlabel " phastCons7way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # phyloP for 7-way (DONE - 2014-06-04 - Hiram) # run phyloP with score=LRT ssh ku mkdir /cluster/data/hg38/bed/multiz7way/consPhyloP cd /cluster/data/hg38/bed/multiz7way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.556 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.556 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.222000 0.278000 0.278000 0.222000 cat << '_EOF_' > 
doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/hg38/bed/multiz7way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 641 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all cd /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # the -ram=8g will allow only one job per node to slow this down since # it would run too fast otherwise. Either run on one of the small # klusters or use the -ram=8g on the para create para -ram=32g create jobList para try ... check ... push ... etc ... 
para time > run.time # Completed: 641 of 641 jobs # CPU time in finished jobs: 4755s 79.24m 1.32h 0.06d 0.000 y # IO & Wait Time: 4343s 72.39m 1.21h 0.05d 0.000 y # Average job time: 14s 0.24m 0.00h 0.00d # Longest finished job: 27s 0.45m 0.01h 0.00d # Submission to last job: 1152s 19.20m 0.32h 0.01d # make downloads mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP7way.wigFix.gz) & # real 29m51.665s # check integrity of data with wigToBigWig time (zcat downloads/phyloP7way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ phyloP7way.bw) > bigWig.log 2>&1 & egrep "real|VmPeak" bigWig.log # pid=76577: VmPeak: 33106320 kB # real 42m53.038s bigWigInfo phyloP7way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 3,759,451,708 # primaryIndexSize: 92,579,900 # zoomLevels: 10 # chromCount: 353 # basesCovered: 2,898,191,577 # mean: 0.074472 # min: -5.220000 # max: 1.062000 # std: 0.545945 # encode those files into wiggle data time (zcat downloads/phyloP7way.wigFix.gz \ | wigEncode stdin phyloP7way.wig phyloP7way.wib) & # Converted stdin, upper limit 1.06, lower limit -5.22 # real 16m11.861s du -hsc *.wi? # 47M phyloP7way.wib # 12M phyloP7way.wig # 58M total # Load gbdb and database with wiggle. 
ln -s `pwd`/phyloP7way.wib /gbdb/hg38/multiz7way/phyloP7way.wib nice hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way hg38 \ phyloP7way phyloP7way.wig # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh hg38 phyloP7way # db.table min max mean count sumData # hg38.phyloP7way -5.22 1.062 0.0744721 2898191577 2.15834e+08 # stdDev viewLimits # 0.545945 viewLimits=-2.65525:1.062 # that range is: 5.22+1.062 = 6.282 for hBinSize=0.006282 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.006282 -hBinCount=1000 -hMinVal=-5.22 -verbose=2 \ -db=hg38 phyloP7way > histogram.data 2>&1 # real 2m55.843s # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin # Q1 0.000001 # median 0.000060 # Q3 0.000656 # average 0.001022 # min 0.000000 # max 0.065461 # count 978 # total 0.999982 # standard deviation 0.004157 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human hg38 Histogram phyloP7way track" set xlabel " phyloP7way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # construct download files for 7-way (DONE - 2014-06-05 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way mkdir /hive/data/genomes/hg38/bed/multiz7way/downloads cd 
/hive/data/genomes/hg38/bed/multiz7way/downloads mkdir multiz7way phastCons7way phyloP7way cd multiz7way time cp -p ../../anno/hg38.7way.maf . # real 0m55.984s time gzip *.maf # real 46m53.149s ln -s ../../hg38.7way.nh . ln -s ../../hg38.7way.commonNames.nh . time md5sum *.nh *.maf.gz > md5sum.txt # real 1m55.317s ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way du -hsc *.maf.gz ../../anno/hg38.7way.maf # 3.5G hg38.7way.maf.gz # 17G ../../anno/hg38.7way.maf ##################################################################### cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phastCons7way ln -s ../../cons/all/downloads/phastCons7way.wigFix.gz \ ./hg38.phastCons7way.wigFix.gz ln -s ../../cons/all/phastCons7way.bw ./hg38.phastCons7way.bw ln -s ../../cons/all/all.mod ./hg38.phastCons7way.mod time md5sum *.gz *.mod *.bw > md5sum.txt # real 0m37.384s # obtain the README.txt from petMar2/phastCons7way and update for this # situation ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way ##################################################################### cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phyloP7way ln -s ../../consPhyloP/all/downloads/phyloP7way.wigFix.gz \ ./hg38.phyloP7way.wigFix.gz ln -s ../../consPhyloP/run.phyloP/all.mod hg38.phyloP7way.mod ln -s ../../consPhyloP/all/phyloP7way.bw hg38.phyloP7way.bw time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m29.431s # obtain the README.txt from geoFor1/phyloP7way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way ########################################################################### ## create upstream refGene maf files cd /hive/data/genomes/hg38/bed/multiz7way/downloads/multiz7way # bash script #!/bin/sh export geneTbl="knownGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits hg38 ${geneTbl}:upstream:${S} -fa=/dev/null 
-bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags hg38 multiz7way \ stdin stdout \ -orgs=/hive/data/genomes/hg38/bed/multiz7way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 60m16.631s md5sum upstream*.gz >> md5sum.txt # some other symlinks were already made above # obtain the README.txt from geoFor1/multiz7way and update for this # situation ln -s `pwd`/upstream*.gz README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way ############################################################################# # hgPal downloads (DONE - 2014-06-06 - Hiram) # FASTA from 7-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S hg38HgPal mkdir /hive/data/genomes/hg38/bed/multiz7way/pal cd /hive/data/genomes/hg38/bed/multiz7way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz7way export gp=knownGene export db=hg38 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time ./$gp.jobs > $gp.jobs.log 2>&1 & # real 28m46.919s time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m23.798s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 1m28.197s export mz=multiz7way export gp=knownGene export db=hg38 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ### need other gene track alignments 
also # running up refGene cd /hive/data/genomes/hg38/bed/multiz7way/pal export mz=multiz7way export gp=refGene export db=hg38 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 # real 15m15.424s export mz=multiz7way export gp=refGene export db=hg38 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m23.119s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 1m15.547s du -hsc exonAA exonNuc refGene*.fa.gz # 59M exonAA # 101M exonNuc # 59M refGene.multiz7way.exonAA.fa.gz # 101M refGene.multiz7way.exonNuc.fa.gz # 317M total rm -rf exonAA exonNuc # we're only distributing exons at the moment export mz=multiz7way export gp=refGene export db=hg38 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ ### And knownCanonical cd /hive/data/genomes/hg38/bed/multiz7way/pal export mz=multiz7way export gp=knownCanonical export db=hg38 mkdir exonAA exonNuc ppredAA ppredNuc knownCanonical cut -f1 ../../../chrom.sizes | while read C do echo $C hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed done ls knownCanonical/*.known.bed | while read F do if [ -s $F ]; then echo $F | sed -e 's#knownCanonical/##; s/.known.bed//' fi done | while read C do echo "date" echo "mafGene -geneBeds=knownCanonical/$C.known.bed $db $mz knownGene order.list stdout | \ gzip -c 
> ppredAA/$C.ppredAA.fa.gz" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz knownGene order.list stdout | \ gzip -c > ppredNuc/$C.ppredNuc.fa.gz" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \ gzip -c > exonNuc/$C.exonNuc.fa.gz" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \ gzip -c > exonAA/$C.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 # real 72m58.133s rm *.known.bed mz=multiz7way gp=knownCanonical db=hg38 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz & zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz & zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz & zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz7way gp=knownCanonical db=hg38 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz cd $pd md5sum *.exon*.fa.gz > md5sum.txt ############################################################################# # wiki page for 7-way (DONE - 2014-06-04 - Hiram) mkdir /hive/users/hiram/bigWays/hg38.7way cd /hive/users/hiram/bigWays echo "hg38" > hg38.7way/ordered.list awk '{print $1}' /hive/data/genomes/hg38/bed/multiz7way/7way.distances.txt \ >> hg38.7way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They may already be done. 
./sizeStats.sh hg38.7way/ordered.list # dbDb.sh constructs hg38.7way/Hg38_7-way_conservation_alignment.html ./dbDb.sh hg38 7way # sizeStats.pl constructs hg38.7way/Hg38_7-way_Genome_size_statistics.html ./sizeStats.pl hg38 7way # defCheck.pl constructs Hg38_7-way_conservation_lastz_parameters.html ./defCheck.pl hg38 7way # this constructs the html pages in hg38.7way/: # -rw-rw-r-- 1 4153 Jun 5 11:03 Hg38_7-way_conservation_alignment.html # -rw-rw-r-- 1 5833 Jun 5 11:04 Hg38_7-way_Genome_size_statistics.html # -rw-rw-r-- 1 3854 Jun 5 11:04 Hg38_7-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # Hg38_7-way_conservation_alignment # Hg38_7-way_Genome_size_statistics # Hg38_7-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. ############################################################################# # GRC Incident database (DONE - 2014-06-14 - Hiram) # this procedure is run as a cron job in Hiram's account: # 33 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo # data comes from: ftp://ftp.ncbi.nlm.nih.gov/pub/grc/ # processed by /hive/data/outside/grc/incidentDb/grcUpdate.sh # the table in the dataBase is: grcIncidentDb # which is the URL to the bb file, a single row: # http://genomewiki.ucsc.edu/images/7/7f/Hg38.grcIncidentDb.bb ############################################################################# # RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram) mkdir /hive/data/genomes/hg38/bed/rmskJoined cd /hive/data/genomes/hg38/bed/rmskJoined ln -s ../repeatMasker/hg38.sorted.fa.out . ln -s ../repeatMasker/hg38.fa.align.gz . 
# working on fixing this script for the next release of RM /scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \ -out hg38.sorted.fa.out -align hg38.fa.align.gz hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ -renameSqlTable -verbose=4 -tab \ -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \ rmskJoinedBaseline hg38.sorted.fa.join.bed \ > loadJoined.log 2>&1 hgLoadSqlTab hg38 rmskAlignBaseline \ /cluster/home/hiram/kent/src/hg/lib/rmskAlign.sql \ hg38.fa.align.tsv > loadAlign.log 2>&1 hgLoadOutJoined -verbose=2 hg38 hg38.sorted.fa.out > loadOut.log 2>&1 featureBits -countGaps hg38 rmskJoinedBaseline # 2716777279 bases of 3209286105 (84.654%) in intersection ############################################################################## # LASTZ Macaca Mulatta RheMac2 (DONE - 2014-07-13 - braney) mkdir /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 cd /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 # best to always specify an exact path to lastz so we know which one is used # lastz default parameters are human-mouse parameters cat << '_EOF_' > DEF # human vs macaca mulatta BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from panTro2 vs hg18 lastz on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Macaca Mulatta RheMac2 SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 TMPDIR=/dev/shm '_EOF_' # << happy 
emacs time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # Elapsed time: 141m36s cat fb.hg38.chainRheMac2Link.txt # 2455106923 bases of 3049335806 (80.513%) in intersection # running the swap mkdir /hive/data/genomes/rheMac2/bed/blastz.hg38.swap cd /hive/data/genomes/rheMac2/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11/DEF \ -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # 83m26.095s cat fb.rheMac2.chainHg38Link.txt # 2313950599 bases of 2646704109 (87.428%) in intersection # ######################################################################### # LASTZ Chlorocebus sabaeus (DONE - 2014-07-13 - braney) mkdir /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 cd /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 # best to always specify an exact path to lastz so we know which one is used # lastz default parameters are human-mouse parameters cat << '_EOF_' > DEF # human vs Chlorocebus sabaeus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q BLASTZ_O=600 BLASTZ_E=150 # other parameters from panTro2 vs hg18 lastz on advice from Webb BLASTZ_K=4500 BLASTZ_Y=15000 BLASTZ_T=2 # TARGET: Human Hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY Chlorocebus sabaeus chlSab2 SEQ2_DIR=/scratch/data/chlSab2/chlSab2.2bit SEQ2_LEN=/scratch/data/chlSab2/chrom.sizes 
SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 TMPDIR=/dev/shm '_EOF_' # << happy emacs time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # Elapsed time: 142m4s cat fb.hg38.chainChlSab2Link.txt # 2573435303 bases of 3049335806 (84.393%) in intersection # running the swap mkdir /hive/data/genomes/chlSab2/bed/blastz.hg38.swap cd /hive/data/genomes/chlSab2/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11/DEF \ -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # 88m48.411s cat fb.chlSab2.chainHg38Link.txt # 2429053010 bases of 2752019208 (88.264%) in intersection ######################################################################### # SEGMENTAL DUPLICATIONS (DONE - 2014-08-13 - Hiram) # redmine issue: refs #13580 # file received in email from Archana Natarajan Raja (araja at uw.edu) mkdir /hive/data/genomes/hg38/bed/genomicSuperDups cd /hive/data/genomes/hg38/bed/genomicSuperDups # -rw-r--r-- 1 16478617 Aug 11 16:18 GenomicSuperDup.tab # no longer filtering items smaller than 1,000 bases, see note # in redmine issue refs #13580 # While the size of the 24 alignments are less than 1000 bases , the size of # their pairs to which they align are always >1000, you can confirm this by # looking at the value in column 22 in your table (alignB -ucsc format), will # always be >1000 bp . We are seeing this only now because there are lots of # new and resolved duplications added to hg38. Hence , I would recommend not # filtering these items and uploading the current set as is. 
# there is no chrEBV in the browser: grep -v chrEBV GenomicSuperDup.tab | sed -e 's/\t_\t/\t-\t/;' \ | hgLoadBed hg38 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql # Read 69894 elements of size 29 from stdin checkTableCoords hg38 genomicSuperDups # (the chrEBV was found with this check) featureBits -countGaps hg38 genomicSuperDups # 175429664 bases of 3209286105 (5.466%) in intersection featureBits -countGaps hg19 genomicSuperDups # 166092393 bases of 3137161264 (5.294%) in intersection featureBits -countGaps hg18 genomicSuperDups # 159204446 bases of 3107677273 (5.123%) in intersection featureBits -countGaps mm10 genomicSuperDups # 214917441 bases of 2730871774 (7.870%) in intersection featureBits -countGaps mm9 genomicSuperDups # 208214567 bases of 2725765481 (7.639%) in intersection ############################################################################## # cloneEnds (DONE - 2014-08-14 - Hiram) mkdir /hive/data/genomes/hg38/bed/cloneEnds cd /hive/data/genomes/hg38/bed/cloneEnds # fetch the NCBI INSDC name correspondence file: rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.26.assembly.txt ./ # fetch the clone reports mkdir reports rsync -a -P \ rsync://ftp.ncbi.nih.gov/repository/clone/reports/Homo_sapiens/*.GCF_000001405.26.106.*.gff \ ./reports/ # script to establish refSeq to UCSC chrom names: cat << '_EOF_' > refSeqNames.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "<GCF_000001405.26.assembly.txt") or die "can not read GCF_000001405.26.assembly.txt"; while (my $line = <FH>) { chomp $line; next if ($line =~ m/^#/); my @a = split('\t', $line); my $chrN = $a[2]; my $refSeq = $a[6]; my $contig = $a[4]; my $type = $a[1]; next if (!defined $type); next if (!defined $refSeq); next if (!defined $contig); my $suffix = ""; if ($type eq "alt-scaffold") { $suffix = "_alt"; } elsif ($type eq "unlocalized-scaffold") { $suffix = "_random"; } elsif ($type eq "unplaced-scaffold") { $chrN = "Un"; } $chrN = "M" if ($chrN eq "MT"); if ($a[0] =~ m/_/) { $contig =~ s/\./v/; printf "%s\tchr%s_%s%s\n",
$refSeq, $chrN, $contig, $suffix; } else { printf "%s\tchr%s\n", $refSeq, $chrN; } } close (FH); '_EOF_' # << happy emacs chmod +x refSeqNames.pl ./refSeqNames.pl > refSeq.ucscName.tab # establish full library list: ls reports/*.GCF_000001405.26.106.*.gff | sed -e 's#reports/##' \ | cut -d"." -f1 | sort -u > library.list.txt # a script to scan the GFF files, with the refSeq.ucscName.tab # name correspondence to construct bed files cat << '_EOF_' > hg38.pl #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc < 1) { printf STDERR "usage: ./hg38.pl <report.gff> [moreReports.gff]\n"; exit 255; } my %refSeqToUcsc; # key is refSeq name, value is UCSC chrom name open (FH, "<refSeq.ucscName.tab") or die "can not read refSeq.ucscName.tab"; while (my $line = <FH>) { chomp $line; my ($refSeq, $ucsc) = split('\t', $line); $refSeqToUcsc{$refSeq} = $ucsc; } close (FH); my %chromSizes; # key is UCSC chrom name, value is chrom size open (FH, "<../../chrom.sizes") or die "can not read ../../chrom.sizes"; while (my $line = <FH>) { chomp $line; my ($chr, $size) = split('\t', $line); $chromSizes{$chr} = $size; } close (FH); while (my $file = shift) { my %starts; # key is parent ID, value is start end coordinates start,end my %ends; # key is parent ID, value is end end coordinates start,end my %parents; # key is parent ID, value is 1 to signify exists my %endNames; # key is parent ID, value is the Name of the parent clone_insert printf STDERR "# processing $file\n"; open (FH, "<$file") or die "can not read $file"; while (my $line = <FH>) { chomp $line; next if ($line=~ m/^#/); my @a = split('\t', $line); next if (scalar(@a) < 1); my $contig = $a[0]; $contig =~ s/ref.//; $contig =~ s/\|//; my $ucscChr = $refSeqToUcsc{$contig}; if (!defined($ucscChr)) { printf STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n"; next; } next if (!
exists($chromSizes{$ucscChr})); my $chromSize = $chromSizes{$ucscChr}; my $chromStart = $a[3] - 1; my $chromEnd = $a[4]; if ($chromStart > $chromSize) { printf STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n"; $chromStart = $chromSize-1; } if ($chromEnd > $chromSize) { my $overRun = $chromEnd - $chromSize; printf STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n"; $chromEnd = $chromSize; } my $id="notFound"; my $name="notFound"; my $parent="notFound"; my @b = split(';', $a[8]); for (my $i = 0; $i < scalar(@b); ++$i) { my ($tag, $value) = split('=', $b[$i]); if ($tag eq "ID") { $id = $value; if ($id !~ m/-/) { if (exists($parents{$id})) { printf STDERR "# WARN: duplicate parent: $id"; } else { $parents{$id} = $ucscChr; } } } elsif ($tag eq "Parent") { $parent = $value; } elsif ($tag eq "Name") { $name = $value; } } my $type="notFound"; my $insertType = $a[2]; if ($insertType =~ m/clone_insert_start/) { $type = "start"; if ($parent eq "notFound") { printf STDERR "# ERR: can not find parent for start $name Ttype $id\n"; } else { if (!exists($parents{$parent})) { printf STDERR "# ERR: start found $name with no parent $parent declared\n"; } elsif (exists($starts{$parent})) { printf STDERR "# ERR: duplicate start for $parent\n"; } elsif ($ucscChr eq $parents{$parent}) { $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd); } else { printf STDERR "# ERR: start on different chrom $ucscChr than parent $parent $parents{$parent}\n"; } } } elsif ($insertType =~ m/clone_insert_end/) { $type = "end"; if ($parent eq "notFound") { printf STDERR "# ERR: can not find parent for end $name Ttype $id\n"; } else { if (!exists($parents{$parent})) { printf STDERR "# ERR: end found $name with no parent $parent declared\n"; } elsif (exists($ends{$parent})) { printf STDERR "# ERR: duplicate end for $parent\n"; } elsif ($ucscChr eq $parents{$parent}) { $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd); } else { printf 
STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n"; } } } elsif ($insertType =~ m/clone_insert/) { $type = "insert"; $endNames{$id} = $name; } $name =~ s/gi\|//g; $id =~ s/gi\|//g; printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6]; } # while (my $line = ) close (FH); foreach my $parent (keys %parents) { if (! exists($starts{$parent}) ) { printf STDERR "# ERR: no start for $parent\n"; } elsif (! exists($ends{$parent}) ) { printf STDERR "# ERR: no end for $parent\n"; } else { my $strand = "+"; my $chrStart = 0; my $chrEnd = 0; my $blockStart = 0; my ($sStart, $sEnd) = split('\t', $starts{$parent}); my ($eStart, $eEnd) = split('\t', $ends{$parent}); my $startSize = $sEnd - $sStart; my $endSize = $eEnd - $eStart; if ($eStart < $sStart) { $chrStart = $eStart; $chrEnd = $sEnd; $blockStart = $sStart - $chrStart; $strand = "-"; $startSize = $eEnd - $eStart; $endSize = $sEnd - $sStart; } else { $chrStart = $sStart; $chrEnd = $eEnd; $blockStart = $eStart - $chrStart; } if ($startSize > $blockStart) { printf STDERR "# startSize > blockStart $endNames{$parent}\n"; } else { printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart; } } } } '_EOF_' # << happy emacs chmod +x hg38.pl # process GFF files into bed files into separateLibs/ directory for L in `cat library.list.txt` do export destDir="separateLibs/${L}" echo "working: ${L}" 1>&1 mkdir -p "${destDir}" ./hg38.pl reports/${L}.GCF_000001405.26.106.*.gff \ 2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/hg38.${L}.bed sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/hg38.${L}.items.bed6 done # use only those libraries with more than 20,000 clone ends wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 > 20000' \ | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list # note those 
libraries with less than 20,000 clone ends wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' | sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list # filter out bad ends, length must be <= median size times three cat libs.over20K.list | while read D do if [ ! -s separateLibs/${D}/lengths.txt ]; then awk '{print $3-$2}' separateLibs/${D}/hg38.${D}.bed \ > separateLibs/${D}/lengths.txt fi median3X=`ave separateLibs/${D}/lengths.txt | grep median | awk '{printf "%d", $2*3}'` awk '($3-$2) < '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.median3X.bed awk '($3-$2) >= '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.badMap.bed before=`cat separateLibs/${D}/hg38.${D}.bed | wc -l` after=`cat separateLibs/${D}/hg38.median3X.bed | wc -l` dropped=`echo $before $after | awk '{print $1-$2}'` perCent=`echo $dropped $before | awk '{printf "%.2f", 100*'$dropped/$before'}'` echo "$D $before - $after = $dropped -> % $perCent dropped" done # ABC20 24692 - 24474 = 218 -> % 0.88 dropped # RP11 86660 - 85903 = 757 -> % 0.87 dropped # CTD 95853 - 94941 = 912 -> % 0.95 dropped # CH17 105618 - 105060 = 558 -> % 0.53 dropped # ABC21 182154 - 180973 = 1181 -> % 0.65 dropped # ABC22 189939 - 188743 = 1196 -> % 0.63 dropped # COR02 208263 - 206782 = 1481 -> % 0.71 dropped # ABC18 325080 - 322904 = 2176 -> % 0.67 dropped # ABC27 334178 - 331822 = 2356 -> % 0.71 dropped # ABC24 398944 - 395776 = 3168 -> % 0.79 dropped # ABC23 436965 - 433896 = 3069 -> % 0.70 dropped # ABC16 452220 - 449101 = 3119 -> % 0.69 dropped # COR2A 583008 - 578578 = 4430 -> % 0.76 dropped # WI2 587165 - 582843 = 4322 -> % 0.74 dropped # ABC7 649297 - 644071 = 5226 -> % 0.80 dropped # ABC11 729962 - 724864 = 5098 -> % 0.70 dropped # ABC9 755994 - 750648 = 5346 -> % 0.71 dropped # ABC12 777816 - 771827 = 5989 -> % 0.77 dropped # ABC10 787969 - 781331 = 6638 -> % 0.84 dropped # ABC13 810822 - 803589 = 7233 -> % 0.89 dropped # ABC14 845573 - 839126 = 6447 -> % 0.76 
dropped # ABC8 1204275 - 1192784 = 11491 -> % 0.95 dropped # loading the median3X files for L in `cat libs.over20K.list` do echo $L 1>&2 hgLoadBed -type=bed12 hg38 cloneEnd${L} \ separateLibs/${L}/hg38.median3X.bed \ > separateLibs/loadBed.${L}.log 2>&1 done # loading the dropped ends: mkdir /hive/data/genomes/hg38/bed/cloneEnds/droppedTooBig # link them to here cat ../libs.over20K.list | while read L do ln -s ../separateLibs/${L}/hg38.badMap.bed ${L}.badMap.bed done # then load hgLoadBed -type=bed12 hg38 cloneEndbadEnds *.badMap.bed # construct multiple mapped ends: for L in `cat libs.over20K.list` do cat separateLibs/${L}/hg38.median3X.bed done | sort -k4 > allEnds.bed cut -f4 allEnds.bed | sort | uniq -c | sort -rn > allEnds.names.count.txt awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' \ | sort > multiples.names.txt join -t' ' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" \ -2 4 multiples.names.txt allEnds.bed | sort -k1,1 -k2,2n \ > allEnds.multiple.locations.bed hgLoadBed -type=bed12 hg38 cloneEndmultipleMaps \ allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1 awk '$6 == "+"' allEnds.bed | sort -k1,1 -k2,2n \ | bedItemOverlapCount hg38 stdin > allEnds.forward.bedGraph awk '$6 == "-"' allEnds.bed | sort -k1,1 -k2,2n \ | bedItemOverlapCount hg38 stdin > allEnds.reverse.bedGraph bedGraphToBigWig allEnds.forward.bedGraph \ /hive/data/genomes/hg38/chrom.sizes \ cloneEndcoverageForward.bw bedGraphToBigWig allEnds.reverse.bedGraph \ /hive/data/genomes/hg38/chrom.sizes \ cloneEndcoverageReverse.bw mkdir /gbdb/hg38/bbi/cloneEnd ln -s `pwd`/cloneEndcoverageForward.bw /gbdb/hg38/bbi/cloneEnd ln -s `pwd`/cloneEndcoverageReverse.bw /gbdb/hg38/bbi/cloneEnd hgBbiDbLink hg38 cloneEndcoverageForward \ /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageForward.bw hgBbiDbLink hg38 cloneEndcoverageReverse \ /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageReverse.bw ### Fixup the scores to indicate how many multiple mappings as mentioned ### in the hg19 bacEnds 
description page: one mapping: score = 1000 ### multiple mappings: score = 1500/count ### the sort | uniq -c | awk does this score calculation with the name ### in column 1 ### The join puts the existing table together with those scores ### DONE - 2015-06-18 - Hiram mkdir /hive/data/genomes/hg38/bed/cloneEnds/addCounts cd /hive/data/genomes/hg38/bed/cloneEnds/addCounts mkdir score withScore noScore withScore for table in cloneEndABC10 cloneEndABC11 cloneEndABC12 cloneEndABC13 \ cloneEndABC14 cloneEndABC16 cloneEndABC18 cloneEndABC20 cloneEndABC21 \ cloneEndABC22 cloneEndABC23 cloneEndABC24 cloneEndABC27 cloneEndABC7 \ cloneEndABC8 cloneEndABC9 cloneEndCH17 cloneEndCOR02 cloneEndCOR2A \ cloneEndCTD cloneEndRP11 cloneEndWI2 cloneEndbadEnds cloneEndmultipleMaps do hgsql -N -e "select name from $table;" hg38 | sort | uniq -c | awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \ | sort > score/hg38.$table.score.tab hgsql -N -e "select * from $table order by name;" hg38 \ | sort -k5 > noScore/hg38.$table.tab join -t'^I' -1 5 noScore/hg38.$table.tab score/hg38.$table.score.tab \ | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \ | sort -k2,2 -k3,3n > withScore/hg38.$table.withScore.tab hgsql -e "delete from $table;" hg38 hgsql -e "load data local infile \"withScore/hg38.$table.withScore.tab\" into table $table;" hg38 done ############################################################################## # SIB Transcriptome (DONE 2014-08-27 Steve) # Create working directory and download data from where Christian # Iseli (christian.iseli@unil.ch) put it, and unpack. 
mkdir -p /hive/data/genomes/hg38/bed/sibTranscriptome cd /hive/data/genomes/hg38/bed/sibTranscriptome wget --timestamping http://ludwig-sun1.unil.ch/~chris/HTr.gtf.gz wget --timestamping http://ludwig-sun1.unil.ch/~chris/txg.tar.gz tar -zxvf txg.tar.gz zcat HTr.gtf.gz | ldHgGene hg38 sibGene stdin # Reading stdin # Read 208508 transcripts in 2824960 lines in 1 files # 208508 groups 25 seqs 1 sources 2 feature types # 208508 gene predictions # Do a little data cleanup and transformation and load splice graphs # into database. sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \ -sqlTable=sibTxGraph.sql hg38 sibTxGraph stdin # Reading stdin # Read 47817 elements of size 18 from stdin # Sorted # Creating table definition for sibTxGraph from sql: sibTxGraph.sql # Saving bed.tab # Loading hg38 # Create sibAltEvents track for analyzed alt-splices. # Not on RR for hg18 and hg19, so do not push it out cat txg/*.txg | txgAnalyze stdin /cluster/data/hg38/hg38.2bit sibAltEvents.bed awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed hgLoadBed hg38 sibAltEvents foo.bed # Reading foo.bed # Read 452436 elements of size 6 from foo.bed # Sorted # Creating table definition for sibAltEvents, bedSize: 6 # Saving bed.tab # Loading hg38 # push sibGene and sibTxGraph for hg38 ############################################################################ # Orangutan Lastz run (DONE - 2014-05-27 - Hiram) screen -S hg38PonAbe2 # use a screen to manage this longish running job mkdir /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 cd /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 # always set the BLASTZ program so we know what version was used cat << '_EOF_' > DEF # human vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_O=600 BLASTZ_E=150 # maximum M allowed with lastz is only 254 BLASTZ_M=254 BLASTZ_T=2 BLASTZ_Y=15000 BLASTZ_K=4500 
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # A C G T # 90 -330 -236 -356 # -330 100 -318 -236 # -236 -318 100 -330 # -356 -236 -330 90 # TARGET: Human Hg38 SEQ1_DIR=/scratch/data/hg38/hg38.2bit SEQ1_LEN=/scratch/data/hg38/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_IN_CONTIGS=0 # QUERY: Orangutan PonAbe2 SEQ2_DIR=/hive/data/genomes/ponAbe2/ponAbe2.2bit SEQ2_LEN=/hive/data/genomes/ponAbe2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 SEQ2_IN_CONTIGS=0 BASE=/hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 TMPDIR=/dev/shm '_EOF_' time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 144m46.575s cat fb.hg38.chainPonAbe2Link.txt # 2719618310 bases of 3049335806 (89.187%) in intersection # filter with doRecipBest.pl time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ hg38 ponAbe2) > rbest.log 2>&1 # real 60m1.060s time (doRecipBest.pl -load -continue=load -workhorse=hgwdev \ -buildDir=`pwd` hg38 ponAbe2) > loadRBest.log 2>&1 & # real 3m35.834s cat fb.hg38.chainRBestPonAbe2Link.txt # 2538296592 bases of 3049335806 (83.241%) in intersection # running the swap mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap cd /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02/DEF \ -chainMinScore=5000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 102m27.866s cat fb.ponAbe2.chainHg38Link.txt # 2773568958 bases of 3093572278 (89.656%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ ponAbe2 hg38) > rbest.log 2>&1 # real 78m47.312s ############################################################################# # Add chrX alts to par (DONE 2014-10-14 angie) # Thanks to Hiram for pointing out that intersecting chrX positions in # 
altLocations and par shows whether a chrX alt overlaps a PAR. cd /hive/data/genomes/hg38/bed/par hgsql hg38 -e 'select * from altLocations where chrom = "chrX"' #+-----+-------+------------+----------+---------------------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+-------+------------+----------+---------------------+ #| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | #| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | #| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | #+-----+-------+------------+----------+---------------------+ hgsql hg38 -e 'select * from par where chrom = "chrX"' #+-----+-------+------------+-----------+------+ #| bin | chrom | chromStart | chromEnd | name | #+-----+-------+------------+-----------+------+ #| 9 | chrX | 10000 | 2781479 | PAR1 | #| 221 | chrX | 155701382 | 156030895 | PAR2 | #+-----+-------+------------+-----------+------+ # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1; # chrX_KI270881v1_alt is not in either PAR. 
hgsql hg38 -e 'select chrom,size from chromInfo \ where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' #+---------------------+--------+ #| chrom | size | #+---------------------+--------+ #| chrX_KI270880v1_alt | 284869 | #| chrX_KI270913v1_alt | 274009 | #+---------------------+--------+ # Process that into bed4 with name=PAR1: hgsql hg38 -NBe 'select chrom, 0, size, "PAR1" from chromInfo \ where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' \ >> hg38Par.bed4 hgLoadBed hg38 par hg38Par.bed4 checkTableCoords hg38 par ############################################################################# # LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve) mkdir /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 cat << '_EOF_' > DEF # human vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Human hg38 SEQ1_DIR=/scratch/data/hg38/hg38.2bit SEQ1_LEN=/scratch/data/hg38/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau8 SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 602m37.523s cat fb.hg38.chainBosTau8Link.txt # 1401921010 bases of 3049335806 (45.975%) in intersection # Create link cd /hive/data/genomes/hg38/bed ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 # running the swap mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku 
-bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 116m32.121s cat fb.bosTau8.chainHg38Link.txt # 1336307377 bases of 2649307237 (50.440%) in intersection cd /hive/data/genomes/bosTau8/bed ln -s blastz.hg38.swap lastz.hg38 ############################################################################ # NCBI ClinVar (new version -DONE - 2014-11-08 - Max) # see hg19.txt ######################################################################### ######################################################################## # CNV Developmental Delay track (2014-11-21 Steve) mkdir /hive/data/genomes/hg38/bed/cnvDevDelay cd /hive/data/genomes/hg38/bed/cnvDevDelay wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd100_Coe_et_al_2014/gvf/nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz' wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd54_Cooper_et_al_2011/gvf/nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz' cp /kent/src/hg/utils/automation/gvfToBed8Attrs.pl . 
mv gvfToBed8Attrs.pl gvfToBed8AttrsCase.pl cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl100.pl cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl54.pl # made three local copies of Angie's gvf conversion script - one to include # only case individuals from nstd100, one to include only control individuals # from nstd100 and one to include only control individuals from nstd54 # had to add an additional elsif statement to the nstd100 scripts to filter # based on sample_name field: # } elsif ($tag eq "sample_name") { # $sample_name = $val; # } # added line 33/35 to each file: # next if ($sample_name eq "Unknown"); # keep only "case" individuals from nstd100 # next if ($sample_name ne "Unknown"); # keep only "control" individuals from nstd100 # next if ($phenotype ne "not_reported"); # keep only "control" individuals from nstd54 zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsCase.pl > cnvDevDelayAllCase.bed zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl100.pl > cnvDevDelayAllControl.bed zcat nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl54.pl >> cnvDevDelayAllControl.bed # GRCh38 data from dbVar had different naming scheme for alternate chromosomes # (e.g., chr1|NT_187515.1 instead of chr1_KI270762v1_alt), so needed to write # a script to substitute the correct UCSC names cat << '_EOF_' > chromXref.pl #!/usr/bin/env perl use strict; use warnings; sub usage() { printf STDERR "usage: ./chromXref.pl <inFile> <outFile>\n" } my $argc = scalar(@ARGV); if ($argc != 2) { usage; exit 255; } open (file1, "<hg38.xref") or die "cannot read hg38.xref"; my @accArray; my $i = 0; while (my $line = <file1>) { chomp($line); my ($type, $chr, $acc1, $acc2) = split('\t', $line); ($type, undef) = split('-', $type); ($acc1, my $version) = split('\.', $acc1); if ($type eq "unlocalized") { $type = "random"; } my $ucscAcc = "_" . $acc1 . "v" . $version . "_" . 
$type; $accArray[$i][0] = $ucscAcc; $accArray[$i][1] = $acc2; $i++; } close (file1); open (file2, "<$ARGV[0]") or die "cannot read $ARGV[0]"; open (file3, ">$ARGV[1]") or die "cannot read $ARGV[1]"; local $/; my $fileContents = <file2>; for ($i = 0; $i < scalar(@accArray); $i++) { my $temp1 = $accArray[$i][1]; my $temp2 = $accArray[$i][0]; if ($fileContents =~ m/\|$temp1/) { $fileContents =~ s/\|$temp1/$temp2/g; } } print file3 $fileContents; close (file2); close (file3); '_EOF_' # << happy emacs cp /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt . cat GCF_000001405.26.assembly.txt | grep -v '^#\|assembled\|unplaced' | awk '{print $2 "\t" $3 "\t" $5 "\t" $7}' > hg38.xref chromXref.pl cnvDevDelayAllCase.bed cnvDevDelayAllCaseUcsc.bed chromXref.pl cnvDevDelayAllControl.bed cnvDevDelayAllControlUcsc.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg38 cnvDevDelayCase cnvDevDelayAllCaseUcsc.bed hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ -allowStartEqualEnd hg38 cnvDevDelayControl cnvDevDelayAllControlUcsc.bed checkTableCoords hg38 cnvDevDelayCase checkTableCoords hg38 cnvDevDelayControl ######################################################################### # RETROFINDER RETROPOSED GENES ucscRetro track VERSION 9 # (2015-01-12 - 2015-01-20, hartera, DONE) ssh hgwdev mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112 cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112 cat << '_EOF_' > DEF RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 " VERSION=9 RUNDATE="2015-01-12" DB=hg38 SCORETHRESH=550 GENOMENAME='Homo sapiens' GBDB=hg DATE=20150112 RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin KENTDIR=/cluster/home/hartera/kent KENTBINDIR=/cluster/home/hartera/bin/x86_64 MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION TMPMRNA=$RUNDIR/mrnaBlastz/$DB 
TMPEST=$RUNDIR/est/$DB USEALTSEQS=0 EST=all_est SPLICED_EST=intronEst SPLIT_EST=0 SPLIT_SPLICED_EST=0 LASTZPROG=/cluster/bin/penn/x86_64/lastz SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline GENOME=/hive/data/genomes RETRODIR=$GENOME/$DB/bed/retro BASE=$RUNDIR/retro OUTDIR=${BASE}/version${VERSION}/${DB} RESULT=$OUTDIR/result RESULTSPLIT=$OUTDIR/resultSplit LOG=$OUTDIR/log OUT=$OUTDIR/out OVERLAPDIR=$OUTDIR/run.o TABLE=ucscRetroInfo$VERSION ORTHOTABLE=ucscRetroOrtho$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB TWOBIT=$GENOME/$DB/$DB.2bit RMSK=rmsk NET1=netMm10 NET2=netCanFam3 NET3=netRheMac3 # these two nets determine which retros are classified as ancient, # use two farthest nets ANCIENT1=netMm10 ANCIENT2=netCanFam3 GENE1=knownGene GENE2=refGene GENE3=wgEncodeGencodeCompV19 CLUSTER=ku SPECIES="hg38 mm10" ROOTDIR="/cluster/home/hartera/public_html/retro/hg38Jun14" WEBROOT=$ROOTDIR/retro.$VERSION WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu SHUFFLEDIR=shuffle SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR DUPDIR=dups DUPROOT=$WEBROOT/$DUPDIR AGEDIR=age AGEROOT=$WEBROOT/$AGEDIR EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ALTSPICE= #ALTSPLICE=sibTxGraph SPLITBYAGE=$SCRIPT/splitRetrosByAge PDB=proteins140122 #ARRAY=gnfAtlas2 #AFFYPROBE="affyU133A,affyGnf1h" #ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median #ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio #ARRAYABS=hgFixed.gnfHumanAtlas2All #ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps #ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps #ARRAYLOOKUP=knownToGnfAtlas2 #ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl" '_EOF_' # << happy emacs chmod +x DEF mkdir -p /hive/data/genomes/hg38/bed/retro mkdir -p /hive/data/genomes/hg38/bed/mrnaBlastz.9 mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz cp ../DEF . 
# Create S1.len file from chrom.sizes without random chroms or chrM, there are many alt loci also # in hg38 that were not in hg19 so 285 chroms total. cat /hive/data/genomes/hg38/chrom.sizes | grep -v random \ | grep -v chrUn | grep -v chrM > S1.len cp S1.len /hive/data/genomes/hg38/bed/mrnaBlastz.9 screen # Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree: retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF # check cluster jobs on ku retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF #check cluster jobs on ku retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF #check cluster jobs on ku # Load the track retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38 retroFinder/branches/version2/src/pipeline/filterMrna.sh retroFinder/branches/version2/src/pipeline/filterEst.sh # Check cluster jobs on ku retroFinder/branches/version2/src/pipeline/analyseExpress.sh # Check cluster jobs on ku #added ucscRetroAli9 to kent/src/hg/makeDb/human/hg38/trackDb.ra # copied # /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38/trackDb.retro # entry to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra and edited it to # remove the full date and add: # dataVersion Jan. 
2015 # Scripts copied ucscRetroAli9.psl, ucscRetroInfo9.bed and ucscRetroCds9.tab # to /hive/data/genomes/hg38/bed/retro/ ########## # Make dbVar chrom to UCSC chrom lift file # DONE braney 2/12/15 cd /cluster/data/hg38/jkStuff sort /cluster/data/hg38/chrom.sizes > tmpChrom grep -v '^#\|assembled' /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt | awk 'BEGIN {OFS="\t"} {print "chr" $3 "_" $5 "_" $2, "chr" $3 "|"$7}' | sed 's/-scaffold//' | sed 's/unlocalized/random/' | sed 's/_unplaced//' | sed 's/chrna/chrUn/g' | sed 's/\./v/' | sort | join /dev/stdin tmpChrom | awk 'BEGIN {OFS="\t"} {print 0, $2, $3, $1, $3}' > dbVar.lift awk 'BEGIN {OFS="\t"} {print 0, $1, $2, $1, $2}' /cluster/data/hg38/chrom.sizes >> dbVar.lift rm tmpChrom ######################################################################### # UCSC to RefSeq name correspondence (DONE - 2015-04-13 - Hiram) mkdir /hive/data/genomes/hg38/bed/ucscToRefSeq cd /hive/data/genomes/hg38/bed/ucscToRefSeq # columns 5 and 7 are the INSDC and RefSeq names grep -v "^#" ../../genbank/GCF_000001405.26.assembly.txt \ | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' | sort > insdc.refSeq.tab hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' hg38 \ | sort > insdc.ucsc.tab join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \ | cut -f2- > ucsc.refSeq.tab export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab hg38 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab checkTableCoords hg38 -table=ucscToRefSeq ######################################################################### #CREATE MICROSAT TRACK (DONE - 2015-05-22 - Hiram) ssh hgwdev mkdir /cluster/data/hg38/bed/microsat cd /cluster/data/hg38/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ 
../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed hg38 microsat microsat.bed ############################################################################# # ENCODE Regulatory tracks (Kate & Chris) # see reg.txt ######################################################################### # GWIPS-viz Ribo-seq - (DONE - 2016-02-05 - Steve) # contact Audrey Michel (audreymannion@gmail.com) # redmine #16765 obtained bigWig file from shared Google drive https://drive.google.com/a/soe.ucsc.edu/folderview?id=0B_xvV_5tXzOGQ1h5NEh4bnhNTDg&usp=sharing_eid mkdir /hive/data/genomes/hg38/bed/gwipsvizRiboseq cp Global_RiboProElong.10_02_2016.bw /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw mkdir /gbdb/hg38/bbi/gwipsvizRiboseq cd /gbdb/hg38/bbi/gwipsvizRiboseq ln -s /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw gwipsvizRiboseq.bw hgsql hg38 create table gwipsvizRiboseq select * from gc5BaseBw; update gwipsvizRiboseq set fileName="/gbdb/hg38/bbi/gwipsvizRiboseq/gwipsvizRiboseq.bw" where fileName="/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw"; ######################################################################### # COSMIC v81 DONE Chris Eisenhart 2017-05-11 # Make a new COSMIC track for hg38 mkdir /hive/data/outside/cosmic/hg38/v81 cd /hive/data/outside/cosmic/hg38/v81 # Get the new data sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk # Login to SFTP server then run these commands get /files/grch38/cosmic/v81/CosmicMutantExport.tsv.gz # Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts. zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv # Use a script to convert to bed format. cosmicToBed cosMut.tsv cosMut.bed # This many lines were skipped, 131597 for not having genomic coordinate # Sort and convert to big bed using the .as file. 
sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed bedToBigBed -type=bed4+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V81.bb -tab -extraIndex=name,cosmLabel # Link it up so the outside world can see it. cd /gbdb/hg38/cosmic/ ln -s /hive/data/outside/cosmic/hg38/v81/cosMutHg38V81.bb . ######################################################################### # hoffmanMappability hub import (2 super tracks) DONE Chris Eisenhart 2017-05-16 mkdir /hive/data/outside/hoffmanMappability/hg38 cd /hive/data/outside/hoffmanMappability/hg38 wget https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/trackDb.txt # Get the trackDb file importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ --test # Check that the commands are what we want, then run for real importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ # View the .ra file to make sure things are ok, here changed the groups to map, # added the alpha tags, and removed the 'show' from 'superTrack on show' cp hofMap.ra ~/kent/src/hg/makeDb/trackDb/human/hg38 # Include hofMap.ra in the trackDb.ra file # the importTrackHub failed on redirection, fetch all the files manually: # 2017-09-15 - Hiram cd /hive/data/outside/hoffmanMappability/hg38 grep bigDataUrl trackDb.txt | awk '{print $NF}' | sed -e 's#https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/##;' | while read F do echo $F rm -f $F wget --timestamping "https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/${F}" done # real 29m40.429s ######################################################################### # tcgaExpr super track Chris Eisenhart, DONE, 2017-05-17 # tcgaTranscExpr # TCGA transcript level expression barChart track, from TOIL pipeline recompute (John Vivian) # biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf mkdir /hive/data/outside/tcgaBarcharts/ 
mkdir /hive/data/outside/tcgaBarcharts/transcripts cd /hive/data/outside/tcgaBarcharts/transcripts # Get all the meta data cp ~max/projects/cirm/datasetPages/tcgaGtex/tcgaMeta.tab . # Cut out the meta data the script wants, sample name and group. cut -f 1,5 tcgaMeta.tab | sed 's/ /_/g' > tcgaLargeSamples.tsv # Get and clean the matrix cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.tpm.tab . # Clean up the transcript names (remove the .#) cut -f 1 tcga.tpm.tab | cut -f 1 -d "." > tcgaTranscripts.txt cut -f 2- tcga.tpm.tab > tcgaTpmValues.tsv paste tcgaTranscripts.txt tcgaTpmValues.tsv > tcgaMatrix.tsv # Build a coordinate map hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene hgsql hg38 -e "select * from ensemblToGeneName" | sort > ensemblToGeneName join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed # Use the meta data, matrix, and coordinate map to generate a barchart bed time expMatrixToBarchartBed tcgaLargeSamples.tsv tcgaMatrix.tsv coord.bed tcgaTransExp.bed --groupOrder tcgaGroupOrder.txt # NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb. # The order of the labels in the barChartBars field should match the order of the labels in the # expScores column in the bed file header. # Sort and convert into a bigBed file. sort -k1,1 -k2,2n tcgaTransExp.bed > sortedTcgaTransExp.bed bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartTranscExp.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTransExp.bb # Link the files into gbdb cd /gbdb/hgFixed/human/expMatrix ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaLargeSamples.tsv tcgaLargeSamples.tab ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaMatrix.tsv tcgaMatrix.tab ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaTransExp.bb . 
###########3 # Reload bigBed with a schema that will be shared with genes track, to support # configuration as subtracks in a composite # (2017-08-30 kate) cd /hive/data/outside/tcgaBarcharts/transcripts bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTranscExpr.hg38.bb mkdir /gbdb/hg38/tcga ln -s `pwd`/tcgaTranscExpr.hg38.bb /gbdb/hg38/tcga/tcgaTranscExpr.bb # TCGA gene level expression barChart track, from TOIL pipeline recompute (John Vivian) # tcgaGeneExpr mkdir ../genes cd ../genes # Get the gene matrix. cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.geneTpm.tab . # Make a coordinate file, the genes in gtexGeneModelV6 have .# versions which are # removed with the temp files. hgsql hg38 -e "select * from hg38.gtexGeneModelV6" | awk '{print $3"\t"$5"\t"$6"\t"$2"\t0\t"$4"\t"$2}' > coord6+1.bed.temp cut -f 4 coord6+1.bed.temp | cut -f 1 -d "." > foo cut -f 1-3 coord6+1.bed.temp > foo2 paste foo2 foo > foo3 cut -f 5- coord6+1.bed.temp > foo4 paste foo3 foo4 > coord6+1.bed # This bed file didn't have the right gene names (ENS rather than Hugo), fix it. hgsql hg38 -e "select * From knownCanonical" > foo wc foo cut -f 6 foo | cut -f 1 -d "." cut -f 6 foo | cut -f 1 -d "." 
> foo2 head foo cut -f 1-3 foo > foo3 paste foo2 foo3 > foo4 cut -f 4- coord6+1.bed > foo5 join <(sort foo5) <(sort foo4) | awk '{print $5"\t"$6"\t"$7"\t"$1"\t0\t"$3"\t"$4}' > coord6+1.3.bed # Generate the bed file, can use the same transcript file time expMatrixToBarchartBed ../transcripts/tcgaLargeSamples.tsv tcga.geneTpm.tab coord6+1.3.bed tcgaGeneExp.bed --groupOrder=../transcripts/tcgaGroupOrder.txt # Convert to big bed sort -k1,1 -k2,2n tcgaGeneExp.bed > sortedTcgaGeneExp.bed bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExp.as sortedTcgaGeneExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExp.bb # Link to gbdb cd /gbdb/hgFixed/human/expMatrix ln -s /hive/data/outside/tcgaBarcharts/genes/tcgaGeneExp.bb . ln -s /hive/data/outside/tcgaBarcharts/genes/tcga.geneTpm.tab tcgaGeneMatrix.tab ###########3 # Reload bigBed with a schema that will be shared with transcript track, to support # configuration as subtracks in a composite # Apparently Chris actually loaded the #3 file (added gene names, adjusted end coord apparently) # (2017-08-30 kate) cd /hive/data/outside/tcgaBarcharts/genes bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaGeneExp3.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExpr.hg38.bb mkdir /gbdb/hg38/tcga ln -s `pwd`/tcgaGeneExpr.hg38.bb /gbdb/hg38/tcga/tcgaGeneExpr.bb ######################################################################### # gtexTransExp Chris Eisenhart, done, 2017-05-23 # TCGA transcript level RNA-seq, from TOIL pipeline recompute (John Vivian) # biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf mkdir /hive/data/outside/gtex/barChartTrack cd /hive/data/outside/gtex/barChartTrack # Seems John included some TCGA data (CML) in the GTEx matrix and samples, the cleaning steps remove this. 
# Make a clean sample file cat ../johnVivianRecompute/sraToSample.txt | sed 's/ male /\tmale\t/g' | sed 's/ female /\tfemale\t/g' | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' > gtexSampleGroups.txt cat ../johnVivianRecompute/sraToSample.txt | cut -f 1 -d " " > gtexSampleNames.txt paste gtexSampleNames.txt gtexSampleGroups.txt > gtexSamples.txt grep -v '(CML)' gtexSamples.tsv > cleanGtexSamples.tsv # Make a clean matrix cut -f 1 ../johnVivianRecompute/gtex.tpm.tab | cut -f 1 -d "." > gtexTranscripts.txt cut -f 2- ../johnVivianRecompute/gtex.tpm.tab > gtexTpmValues.tsv paste gtexTranscripts.txt gtexTpmValues.tsv > gtexMatrix.tsv rowsToCols gtexMatrix.tsv tspsdGtexMatrix.tsv sort tspsdGtexMatrix.tsv > sortedTspsdGtexMatrix.tsv grep -v '(CML)' gtexSamples.tsv | cut -f 1 | sed 's/Run_s/#transcript/g' | sort > sortedCleanGtexSamples.tsv join sortedCleanGtexSamples.tsv sortedTspsdGtexMatrix.tsv > cleanTspsdGtexMatrix.tsv rowsToCols cleanTspsdMatrix.tsv cleanGtexMatrix.tsv # Build a coordinate map hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene hgsql hg38 -e "select * from ensemblToGeneName" | sort > ensemblToGeneName join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed # Get the gtex ordering hgsql hgFixed -e "select * from gtexTissue" | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' | sed '1D' > gtexGroupOrder.txt # Use the meta data, matrix, and coordinate map to generate a barchart bed time expMatrixToBarchartBed cleanGtexSamples.tsv cleanGtexMatrix.tsv coord.bed gtexTransExp.bed --groupOrderFile gtexGroupOrder.txt # NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb. # The order of the labels in the barChartBars field should match the order of the labels in the # expScores column in the bed file header. # Sort and convert into a bigBed file. 
sort -k1,1 -k2,2n gtexTransExp.bed > sortedGtexTransExp.bed bedToBigBed -as=$HOME/kent/src/hg/lib/barChartTranscExp.as -type=bed6+5 sortedGtexTransExp.bed /hive/data/genomes/hg38/chrom.sizes gtexTranscExpr.bb # Link the files into gbdb cd /gbdb/hgFixed/human/expMatrix ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexSamples.tsv cleanGtexSamples.tab ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexMatrix.tsv cleanGtexMatris.tab # (2017-08-30 kate) cd /gbdb/hg38/gtex ln -s /hive/data/outside/gtex/barChartTrack/gtexTranscExpr.bb . ######################################################################### # LASTZ human/hg38 vs. Zebrafish /danRer11 # (DONE - 2017-06-12 - Chris) mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 printf '# human vs zebrafish danRer11 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: human hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=40000000 SEQ1_LIMIT=20 SEQ1_LAP=10000 # QUERY: zebrafish danRer11 SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -noDbNameCheck -syntenicNet) > do.log 2>&1 # real 3327m39.074s cat fb.hg38.chainDanRer11Link.txt # 41036733 bases of 3049335806 (1.346%) in intersection 973293331 bases of 3049335806 (31.918%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg38 danRer11) \ > rbest.log 2>&1 & # and for the swap: mkdir 
/hive/data/genomes/danRer11/bed/blastz.hg38.swap cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -noDbNameCheck -syntenicNet) > swap.log 2>&1 # real 39m24.916s cat fb.danRer11.chainHg38Link.txt # 47869194 bases of 1674677181 (2.858%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \ > rbest.log 2>&1 & # real 638m45.337s _EOF_ ######################################################################### # refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie # previously done 2017-08-01 by Chris E mkdir /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29 cd /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29 # NOTE FOR NEXT TIME: instead of using interim GFF, in the future these annotations might be # folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by # doNcbiRefSeq.pl. wget ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/GFF_interim/interim_GRCh38.p11_top_level_2017-06-27.gff3.gz # Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to hg38 chrom names hgsql hg38 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \ > refSeqToChrom.tab cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab # Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class) # to identify Functional Elements and swap in hg38 chrom names. # Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an # hg38 chrom. Use grep -f chrom.tab to filter out patch contig annotations. 
zcat interim_GRCh38.p11_top_level_2017-06-27.gff3.gz \ | grep -P "(\t(CAAT_signal|GC_rich_promoter_region|TATA_box|enhancer|insulator|locus_control_region|mobile_genetic_element|origin_of_replication|promoter|protein_binding_site|recombination_feature|regulatory_region|repeat_region|sequence_feature|sequence_secondary_structure|silencer|stem_loop)\t|regulatory_class=)" \ | subColumn -miss=/dev/null 1 stdin refSeqToChrom.tab stdout \ | grep -f chrom.tab > funcElems.gff wc -l funcElems.gff #5756 funcElems.gff # Transform GFF to BED+ ~/kent/src/hg/utils/automation/parseRefSeqFuncElems funcElems.gff /dev/stdout \ | sort -k1,1 -k2n,2n > refSeqFuncElems.bed wc -l refSeqFuncElems.bed #5756 refSeqFuncElems.bed # Make bigBed and link from /gbdb bedToBigBed -tab -type=bed9+7 -as=$HOME/kent/src/hg/lib/refSeqFuncElems.as \ refSeqFuncElems.bed /hive/data/genomes/hg38/chrom.sizes refSeqFuncElems.bb rm -f /gbdb/hg38/ncbiRefSeq/refSeqFuncElems.bb ln -s `pwd`/refSeqFuncElems.bb /gbdb/hg38/ncbiRefSeq/ ################################################################### # cosmicRegions (DONE 2017-08-03 Chris) # Make a new COSMIC track for hg38 v82 mkdir /hive/data/outside/cosmic/hg38/v82 cd /hive/data/outside/cosmic/hg38/v82 # Get the new data sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk # Login to SFTP server then run these commands get /files/grch38/cosmic/v82/CosmicMutantExport.tsv.gz # Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts. zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv # Use a script to convert to bed format. cosmicToBed cosMut.tsv cosMut.bed # This many lines were skipped, 134601 for not having genomic coordinate # Sort and convert to big bed using the .as file. sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed bedToBigBed -type=bed8+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V82.bb -tab -extraIndex=name,cosmLabel # Link it up so the outside world can see it. 
cd /gbdb/hg38/cosmic/ ln -s /hive/data/outside/cosmic/hg38/v82/cosMutHg38V82.bb . ######################################################################### # RepeatMasker Visualization track update (DONE - 2018-05-04 - ChrisL) screen -S rmskJoined.2018-05-04 mkdir /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 ln -s ../repeatMasker/hg38.sorted.fa.out . ln -s ../repeatMasker/hg38.fa.align.gz . # this script points to the most recent RepeatMasker version: time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ -out hg38.sorted.fa.out -align hg38.fa.align.gz) > do.log 2>&1 & # no differences, forgot to remake rmsk files # so instead remake the rmsk track and try again mkdir /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04 cd /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04 # remake the sorted.fa.out and fa.align.gz, stop after masking # so rmsk table isn't overwritten time (doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38) > mask.log 2>&1 & # RepeatMasker bug?: Undefined id, line 1440295 of input: # 10 26.1 0.0 0.0 chr13 114292339 114292382 (71946) C L1P4 LINE/L1 (17) 6149 6106 # RepeatMasker bug?: Undefined id, line 3529762 of input: # 992 2.3 0.5 0.0 chr3 180461254 180462048 (17833511) C L1PA3 LINE/L1 (3) 6152 5354 # RepeatMasker bug?: Undefined id, line 3529763 of input: # 1153 3.2 0.2 0.0 chr3 180462043 180463006 (17832553) + L1PA3 LINE/L1 4392 5357 (789) # RepeatMasker bug?: Undefined id, line 5303571 of input: # 220 22.5 0.0 17.7 chr9 105798076 105799127 (32595590) C SATR2 Satellite (4) 866 1 # real 643m17.617s # get rid of the missing id items: grep -v "114292339 114292382\|180461254 180462048\|180462043 180463006\|105798076 105799127" \ hg38.fa.out > clean.hg38.fa.out mv clean.hg38.fa.out hg38.fa.out # finish the last step of doCat.csh: /cluster/bin/scripts/extractNestedRepeats.pl hg38.fa.out | sort -k1,1 -k2,2n > 
hg38.nestedRepeats.bed cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 rm hg38.sorted.fa.out rm hg38.fa.align.gz rm *.tsv ln -s ../repeatMasker.2018-05-04/hg38.sorted.fa.out . ln -s ../repeatMasker.2018-05-04/hg38.fa.align . # and then re-run time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ -out hg38.sorted.fa.out -align hg38.fa.align.gz) > rerun.log 2>&1 & # real 141m7.268s # confirm the counts are different from the previous version: # wc -l ../rmskJoined/hg38.fa.align.tsv ../rmskJoined/hg38.sorted.fa.join.bed ../rmskJoined/hg38.sorted.fa.out.tsv 7203858 ../rmskJoined/hg38.fa.align.tsv 4607727 ../rmskJoined/hg38.sorted.fa.join.bed 5520118 ../rmskJoined/hg38.sorted.fa.out.tsv 17331703 total # wc -l *.tsv 7227245 hg38.fa.align.tsv 4828114 hg38.sorted.fa.join.tsv 5916189 hg38.sorted.fa.out.tsv 17971548 total hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ -renameSqlTable -verbose=4 -tab \ -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \ rmskJoinedCurrent hg38.sorted.fa.join.tsv \ > loadJoined.log 2>&1 hgLoadSqlTab hg38 rmskAlignCurrent \ /cluster/home/chmalee/kent/src/hg/lib/rmskAlign.sql \ hg38.fa.align.tsv > loadAlign.log 2>&1 hgLoadOutJoined -verbose=2 -table=rmskOutCurrent hg38 hg38.sorted.fa.out > loadOut.log 2>&1 featureBits -countGaps hg38 rmskJoinedCurrent # 2796899855 bases of 3209286105 (87.150%) in intersection ######################################################################### # Hi-C Visualization based on Krietenstein 2019 (DONE - 2019-10-07 - Jonathan) mkdir -p /hive/data/genomes/hg38/bed/hic cd /hive/data/genomes/hg38/bed/hic # Files are located on 4D Nucleome (data.4dnucleome.org). The URL for the paper on that # site is https://data.4dnucleome.org/publications/b13590b2-a341-4e5e-ad5e-72e233b32e9d/. 
# The four file IDs downloaded below are for contact matrix .hic files created for # different cell-line/protocol combinations wget 'https://data.4dnucleome.org/files-processed/4DNFI2TK7L2F/@@download/4DNFI2TK7L2F.hic' # H1-hESC Micro-C XL wget 'https://data.4dnucleome.org/files-processed/4DNFIQYQWPF5/@@download/4DNFIQYQWPF5.hic' # H1-hESC in situ wget 'https://data.4dnucleome.org/files-processed/4DNFI18Q799K/@@download/4DNFI18Q799K.hic' # HFFc6 Micro-C XL wget 'https://data.4dnucleome.org/files-processed/4DNFIFLJLIS5/@@download/4DNFIFLJLIS5.hic' # HFFc6 in situ printf "All files were downloaded from the 4D Nucleome Data Portal at data.4dnucleome.org. These are processed contact matrices from Krietenstein et al. (2019) Ultrastructural details of mammalian chromosome architecture. (https://www.biorxiv.org/content/10.1101/639922v1). 4DNFI2TK7L2F.hic - Micro-C XL data set on H1-hESC 4DNFIQYQWPF5.hic - in situ Hi-C data set on H1-hESC 4DNFI18Q799K.hic - Micro-C XL data set on HFFc6 4DNFIFLJLIS5.hic - in situ Hi-C data set on HFFc6" > README.txt mkdir -p /gbdb/hg38/bbi/hic cd /gbdb/hg38/bbi/hic ln -s /hive/data/genomes/hg38/bed/hic/* . ######################################################################### # LASTZ Self/hg38 (DONE 2020-02-11 - Angie) # RM #24695 # Re-run with updated process to include pslDropOverlap . 
# Use "contigs" from previous run lastzSelf.2014-01-25/hg38.self.2bit screen -S hg38Self -t hg38Self mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 cat << _EOF_ > DEF # human vs human with mouse defaults BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Human hg38 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Human hg38 SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift SEQ2_CHUNK=20000000 SEQ2_LAP=0 BASE=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 TMPDIR=/dev/shm _EOF_ # NOTE FOR NEXT TIME: use -chainMinScore=10000 (at least), not 3000 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ -stop=net >& do.log & tail -f do.log # After two days, 4 jobs are running, one of which (part014.lst vs itself) crashed with # out-of-mem error. After 3 days, 3 jobs completed but part014.lst runs lastz out of mem. # Split part014.lst up into components, run on hgwdev (more mem). 
mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014 cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014 mkdir psl cp /dev/null jobList for t in $(cat ../tParts/part014.lst); do tBase=$(basename $t) for q in $(cat ../tParts/part014.lst); do qBase=$(basename $q) echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $t $q ../../DEF {check out exists psl/${tBase}_${qBase}.psl }" >> jobList done done para create jobList para try, check, push, etc, # 94 of the jobs ran for 12s or less. The other 6 are chr{X_Y}_00 vs. self & each other, # chr13_16 vs self and chr16_03 vs self. All but chr16_03 vs self completed in < 6 minutes. #Completed: 99 of 100 jobs #Crashed: 1 jobs #CPU time in finished jobs: 1559s 25.98m 0.43h 0.02d 0.000 y #IO & Wait Time: 248s 4.14m 0.07h 0.00d 0.000 y #Average job time: 18s 0.30m 0.01h 0.00d #Longest finished job: 321s 5.35m 0.09h 0.00d #Submission to last job: 94681s 1578.02m 26.30h 1.10d # Dang, chr16_03 vs. self still runs out of mem even on hgwdev. 
mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03 cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03 twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 \ chr16_03.fa faSplit -lift=chr16_03.lift size chr16_03.fa 169000 chr16_03_split_ faToTwoBit chr16_03_split_*.fa chr16_03_split.2bit twoBitInfo chr16_03_split.2bit stdout | sort -k2nr > chr16_03_split.sizes sed -re 's@CTGDIR.*@CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.2bit@; s@CTGLEN.*@CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.sizes@;' \ ../../../DEF > DEF.split mkdir psl cwd=$(pwd) while read tBase tSize; do while read qBase qSize; do echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $cwd/chr16_03_split.2bit:$tBase:0-$tSize $cwd/chr16_03_split.2bit:$qBase:0-$qSize DEF.split {check out exists psl/${tBase}_${qBase}.psl}" done < chr16_03_split.sizes done < chr16_03_split.sizes > jobList para create jobList para try, check, push, etc, #Completed: 100 of 100 jobs #CPU time in finished jobs: 142614s 2376.89m 39.61h 1.65d 0.005 y #IO & Wait Time: 167s 2.79m 0.05h 0.00d 0.000 y #Average job time: 1428s 23.80m 0.40h 0.02d #Longest finished job: 22861s 381.02m 6.35h 0.26d #Submission to last job: 22874s 381.23m 6.35h 0.26d # 6 hours for chr16_03_split_00 vs. itself. ~4.5h for _09 vs _00. cat psl/*.psl \ | liftUp -nohead -type=.psl stdout \ chr16_03.lift error stdin \ | liftUp -nohead -type=.psl -pslQ \ ../psl/hg38.self.2bit:chr16_03:0-1689648_hg38.self.2bit:chr16_03:0-1689648.psl \ chr16_03.lift error stdin cd .. 
cat psl/* > ../../psl/part014.lst/part014.lst_part014.lst.psl # Make run.time file or doBlastzChainNet.pl won't continue: cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz para time >& run.time # Resume doBlastzChainNet.pl: cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ -continue=cat -stop=net >& do2.log & tail -f do2.log #Batch failed after 4 tries on chain.csh part016.lst chain/part016.lst.chain #Command failed: #ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/doChainRun.csh cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run para problems # mostly these: #errAbort re-entered due to out-of-memory condition. Exiting. # one job made it through errAbort: #needLargeMem: Out of memory - request size 564838920 bytes, errno: 12 para time #Completed: 59 of 68 jobs #Crashed: 9 jobs #CPU time in finished jobs: 24727s 412.12m 6.87h 0.29d 0.001 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 409s 6.82m 0.11h 0.00d #Longest finished job: 2350s 39.17m 0.65h 0.03d #Submission to last job: 2462s 41.03m 0.68h 0.03d para crashed #chain.csh part012.lst {check out line+ chain/part012.lst.chain} #chain.csh part017.lst {check out line+ chain/part017.lst.chain} #chain.csh part016.lst {check out line+ chain/part016.lst.chain} #chain.csh part015.lst {check out line+ chain/part015.lst.chain} #chain.csh part014.lst {check out line+ chain/part014.lst.chain} #chain.csh hg38.self.2bit:chr1_10: {check out line+ chain/hg38.self.2bit:chr1_10:.chain} #chain.csh hg38.self.2bit:chr10_05: {check out line+ chain/hg38.self.2bit:chr10_05:.chain} #chain.csh hg38.self.2bit:chr7_00: {check out line+ chain/hg38.self.2bit:chr7_00:.chain} # Run the jobs outside of parasol (~11h): csh 
-efx chain.csh part012.lst chain/part012.lst.chain & csh -efx chain.csh part017.lst chain/part017.lst.chain & csh -efx chain.csh part016.lst chain/part016.lst.chain & csh -efx chain.csh part015.lst chain/part015.lst.chain & csh -efx chain.csh part014.lst chain/part014.lst.chain & csh -efx chain.csh hg38.self.2bit:chr1_10: chain/hg38.self.2bit:chr1_10:.chain & csh -efx chain.csh hg38.self.2bit:chr10_05: chain/hg38.self.2bit:chr10_05:.chain & csh -efx chain.csh hg38.self.2bit:chr7_00: chain/hg38.self.2bit:chr7_00:.chain & csh -efx chain.csh hg38.self.2bit:chr16_08: chain/hg38.self.2bit:chr16_08:.chain & # Resume doBlastzChainNet.pl again: cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ -continue=chainMerge -stop=net >& do3.log & tail -f do3.log # *** All done ! Elapsed time: 19m11s # Load track w/new name chainSelfRedo to compare to existing chainSelf: hgLoadChain -normScore -tIndex hg38 chainSelfRedo axtChain/hg38.hg38.all.chain.gz # No idea why but somehow the liftUp seems not to have worked for part012 and part017, # so the all.chain had chr22_31, chr8_01 etc.; run those jobs again. cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run mv chain/part012.lst.chain{,.bak} mv chain/part017.lst.chain{,.bak} csh -efx chain.csh part012.lst chain/part012.lst.chain >& part012.log & csh -efx chain.csh part017.lst chain/part017.lst.chain >& part017.log & # Those completed successfully. Dunno why the earlier ones didn't get lifted. cd .. 
mv hg38.hg38.all{,.oopsPartUnlifted}.chain.gz # Reconstruct hg38.hg38.all.chain.gz (the chainMerge step is just this command): find /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/chain -name "*.chain" \ | chainMergeSort -inputList=stdin \ | nice gzip -c \ > /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/hg38.hg38.all.chain.gz # NOTE FOR NEXT TIME: this filtering step will be unnecessary when -minScore=10000 is used # from the beginning. # Filter to minScore of 10000 (too much fluff with -minScore=3000) per Jim (see #24695) cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain mv hg38.hg38.all.chain.gz hg38.hg38.all.unfiltered.chain.gz chainFilter hg38.hg38.all.unfiltered.chain.gz -minScore=10000 \ | gzip -c > hg38.hg38.all.chain.gz hgLoadChain -normScore -tIndex hg38 chainSelfRedo hg38.hg38.all.chain.gz checkTableCoords hg38 chainSelfRedo # Rename to chainSelf and update lastz symlinks and downloads hgsql hg38 -e 'drop table chainSelf; drop table chainSelfLink; rename table chainSelfRedo to chainSelf; rename table chainSelfRedoLink to chainSelfLink;' cd /hive/data/genomes/hg38/bed rm lastz.self lastz.hg38 ln -s lastzSelf.2020-01-27 lastz.self ln -s lastzSelf.2020-01-27 lastz.hg38 cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt . $EDITOR README.txt md5sum hg38.hg38.all.chain.gz > md5sum.txt # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild. ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} . 
######################################################################### +# NCBI ReMap alignments (DONE 2020-02-11 Angie) +# RM 24449 + mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap + cd /hive/data/genomes/hg38/bed/chainHg19ReMap + wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff + # We will need to substitute all the RefSeq chrom and contig IDs with our own names. + # The same alt contig can appear in both assemblies with the same name, so replace + # hg19 names at the beginning of the line and hg38 names after "Target=". + hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \ + > hg38.hg19.chromAlias.sed + hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ + | sed -re 's/\./\\./;' \ + | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \ + >> hg38.hg19.chromAlias.sed + + # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT. + sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \ + | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \ + | pslPosTarget stdin stdout \ + | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl + + # Convert to chain for browser display. Some of the remap chains have minScore < 1000 and + # by default would be dropped by chainScore... use -minScore=0 to prevent that. 
+ time pslToChain remap.hg38.hg19.psl stdout \ + | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \ + remap.hg38.hg19.chain +#real 9m31.900s +#user 9m1.624s +#sys 0m20.863s + hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain +#Loading 5315 chains into hg38.chainHg19ReMap + time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \ + /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \ + remap.axtChain.hg38.hg19.chain +#real 2m26.333s +#user 2m4.237s +#sys 0m22.071s + hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain +#Loading 2115 chains into hg38.chainHg19ReMapAxtChain + + +#########################################################################