ee98569fb749e16cf3e4601a7cef331432d062f8 jeltje.van.baren Tue Jan 21 10:25:19 2025 -0800 adding alphaMissense diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt index 1f3dc1234db..20b6e2a715a 100644 --- src/hg/makeDb/doc/hg38/hg38.txt +++ src/hg/makeDb/doc/hg38/hg38.txt @@ -1,7390 +1,14 @@ -# for emacs: -*- mode: sh; -*- - -# This file describes how we made the browser database on -# NCBI build 38 (December 2013 freeze) aka: -# GRCh38 - Genome Reference Consortium Human Reference 38 -# Assembly Accession: GCA_000001405.2 - -############################################################################# -## Download sequence - DONE - 2013-12-24 - mkdir /hive/data/genomes/hg38 - mkdir /hive/data/genomes/hg38/genbank - cd /hive/data/genomes/hg38/genbank - time rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/ ./ -# sent 19643 bytes received 4914689807 bytes 4490369.53 bytes/sec -# total size is 4914019581 speedup is 1.00 - -# real 18m14.497s - -############################################################################# -## convert to UCSC names - DONE - 2013-12-24 -# with this release, NCBI has adopted a naming convention that is similar -# to UCSC. 
The delivered sequence with these names can be found in: -# /hive/data/genomes/hg38/genbank/seqs_for_alignment_pipelines/ -# -# The following scripts reproduce this naming scheme from the separate -# files in the release -# - mkdir /hive/data/genomes/hg38/ucsc - cat << '_EOF_' > ucscCompositeAgp.pl -#!/bin/env perl - -use strict; -use warnings; - -my %accToChr; - -open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or - die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; -while (my $line = <FH>) { - next if ($line =~ m/^#/); - chomp $line; - my ($chrN, $acc) = split('\s+', $line); - $accToChr{$acc} = $chrN; -} -close (FH); - -foreach my $acc (keys %accToChr) { - my $chrN = $accToChr{$acc}; - print "$acc $accToChr{$acc}\n"; - open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; - open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; - while (my $line = <FH>) { - if ($line =~ m/^#/) { - print UC $line; - } else { - $line =~ s/^$acc/chr${chrN}/; - print UC $line; - } - } - close (FH); - close (UC); - open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; - open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; - while (my $line = <FH>) { - if ($line =~ m/^>/) { - printf UC ">chr${chrN}\n"; - } else { - print UC $line; - } - } - close (FH); - close (UC); -} -'_EOF_' - # << happy emacs - chmod +x ucscCompositeAgp.pl - - cat << '_EOF_' > unlocalized.pl -#!/bin/env perl - -use strict; -use warnings; - -my %accToChr; -my %chrNames; - -open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or - die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; -while (my $line = <FH>) { - next if ($line =~ m/^#/); - chomp $line; - my ($chrN, $acc) = split('\s+', $line); - $acc =~ s/\./v/; - 
$accToChr{$acc} = $chrN; - $chrNames{$chrN} += 1; -} -close (FH); - -foreach my $chrN (keys %chrNames) { - my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; - my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; - open (FH, "zcat $agpFile|") or die "can not read $agpFile"; - open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; - while (my $line = <FH>) { - if ($line =~ m/^#/) { - print UC $line; - } else { - chomp $line; - my (@a) = split('\t', $line); - my $acc = $a[0]; - $acc =~ s/\./v/; - die "ERROR: chrN $chrN not correct for $acc" - if ($accToChr{$acc} ne $chrN); - my $ucscName = "chr${chrN}_${acc}_random"; - printf UC "%s", $ucscName; - for (my $i = 1; $i < scalar(@a); ++$i) { - printf UC "\t%s", $a[$i]; - } - printf UC "\n"; - } - } - close (FH); - close (UC); - printf "chr%s\n", $chrN; - open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; - open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; - while (my $line = <FH>) { - if ($line =~ m/^>/) { - chomp $line; - my $acc = $line; - $acc =~ s/.*gb\|//; - $acc =~ s/. 
Homo.*//; - $acc =~ s/\./v/; - die "ERROR: chrN $chrN not correct for $acc" - if ($accToChr{$acc} ne $chrN); - my $ucscName = "chr${chrN}_${acc}_random"; - printf UC ">$ucscName\n"; - } else { - print UC $line; - } - } - close (FH); - close (UC); -} -'_EOF_' - # << happy emacs - chmod +x unlocalized.pl - - cat << '_EOF_' > unplaced.pl -#!/bin/env perl - -use strict; -use warnings; - -my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; -my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; -open (FH, "zcat $agpFile|") or die "can not read $agpFile"; -open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; -while (my $line = <FH>) { - if ($line =~ m/^#/) { - print UC $line; - } else { - $line =~ s/\./v/; - printf UC "chrUn_%s", $line; - } -} -close (FH); -close (UC); - -open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; -open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; -while (my $line = <FH>) { - if ($line =~ m/^>/) { - chomp $line; - $line =~ s/.*gb\|//; - $line =~ s/. 
Homo.*//; - $line =~ s/\./v/; - printf UC ">chrUn_$line\n"; - } else { - print UC $line; - } -} -close (FH); -close (UC); -'_EOF_' - # << happy emacs - chmod +x unplaced.pl - - cat << '_EOF_' > altSequence.pl -#!/usr/bin/env perl - -use strict; -use warnings; -use File::Basename; - -open (AG, ">chrAlt.agp") or die "can not write to chrAlt.agp"; -open (FA, ">chrAlt.fa") or die "can not write to chrAlt.fa"; -open (FH, "find ../genbank/ALT* -type f | grep alt_scaffold_placement.txt|") or die "can not find alt_scaffold_placement.txt files"; -while (my $file = <FH>) { - chomp $file; - my $dirName = dirname($file); - my $agpFile = "$dirName/AGP/alt.scaf.agp.gz"; - my $fastaFile = "$dirName/FASTA/alt.scaf.fa.gz"; - # key is genbank acc name, value is UCSC chr name - my %nameDelta; -# printf STDERR "# %s\n", $file; - open (AL, "<$file") or die "can not read $file"; - while (my $line = <AL>) { - next if ($line =~ m/^#/); - chomp $line; - my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, - $parent_type, $parent_name, $parent_acc, $region_name, $ori, - $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, - $alt_start_tail, $alt_stop_tail) = split('\t', $line); - my $ucscAcc = $alt_scaf_acc; - $ucscAcc =~ s/\./v/; - my $ucscName = sprintf("chr%s_%s_alt", $parent_name, $ucscAcc); - printf "%s %s\n", $alt_scaf_acc, $ucscName; - if (exists ($nameDelta{$alt_scaf_acc})) { - die "duplicate name incorrect ? 
$alt_scaf_acc $nameDelta{$alt_scaf_acc} ne $ucscName" if ($nameDelta{$alt_scaf_acc} ne $ucscName); - } else { - $nameDelta{$alt_scaf_acc} = $ucscName; - } - } - close (AL); - open (AL, "zcat $agpFile|") or die "can not read $agpFile"; - while (my $line = <AL>) { - if ($line =~ m/^#/) { - print AG "$line"; - } else { - my ($acc, $rest) = split('\t', $line, 2); - die "can not find ucsc name for $acc" if (!exists($nameDelta{$acc})); - printf AG "%s\t%s", $nameDelta{$acc}, $rest; - } - } - close (AL); - open (AL, "zcat $fastaFile|") or die "can not read $fastaFile"; - while (my $line = <AL>) { - chomp $line; - if ($line =~ m/^>/) { - $line =~ s/.*gb.//; - $line =~ s/. Homo.*//; - die "can not find ucsc name for $line" if (!exists($nameDelta{$line})); - printf FA ">%s\n", $nameDelta{$line}; - } else { - printf FA "%s\n", $line; - } - } - close (AL); -} -close (FH); -close (AG); -close (FA); -'_EOF_' - # << happy emacs - chmod +x altSequence.pl - - ./ucscCompositeAgp.pl - ./unlocalized.pl - ./unplaced.pl - ./altSequence.pl - - # temporarily verify the fasta and AGP are complete and compatible - faToTwoBit chr*.fa hg38.test.2bit - cat chr*.agp > hg38.agp - checkAgpAndFa hg38.agp hg38.test.2bit 2>&1 | tail -1 -# All AGP and FASTA entries agree - both files are valid - - rm -f hg38.agp hg38.test.2bit - - # comparing faCounts of this 2bit file and the sequences delivered - # in genbank/seqs_for_alignment_pipelines/ - # result in the exact same sequence - -############################################################################# -## initial db build - DONE - 2013-12-24 - Hiram - - cd /hive/data/genomes/hg38 - cat << '_EOF_' > hg38.config.ra -# Config parameters for makeGenomeDb.pl: -db hg38 -scientificName Homo sapiens -commonName Human -assemblyDate Dec. 
2013 -assemblyLabel GRCh38 Genome Reference Consortium Human Reference 38 (GCA_000001405.2) -assemblyShortLabel GRCh38 -orderKey 13 -mitoAcc none -fastaFiles /hive/data/genomes/hg38/ucsc/chr*.fa -agpFiles /hive/data/genomes/hg38/ucsc/chr*.agp -# qualFiles /dev/null -dbDbSpeciesDir human -photoCreditURL http://www.cbse.ucsc.edu/ -photoCreditName Graphic courtesy of CBSE -ncbiGenomeId 51 -ncbiAssemblyId 883148 -ncbiAssemblyName GRCh38 -ncbiBioProject 31257 -genBankAccessionID GCA_000001305.2 -taxId 9606 -'_EOF_' - # << happy emacs - - # step wise to first verify AGP and Fasta files - time makeGenomeDb.pl -stop=agp hg38.config.ra > agp.log 2>&1 - - # looking good, continue: - time makeGenomeDb.pl -continue=db hg38.config.ra > db.log 2>&1 - - # add the files produced by the trackDb build to the source tree - - # this path is fixed in the makeGenomeDb.pl for next time - # honor new convention for bbi location files: - cd /gbdb/hg38/bbi - mkdir gc5BaseBw - mv gc5Base.bw gc5BaseBw - cd gc5BaseBw - # before - hgsql -e 'select * from gc5BaseBw;' hg38 -# +---------------------------+ -# | fileName | -# +---------------------------+ -# | /gbdb/hg38/bbi/gc5Base.bw | -# +---------------------------+ - # and fixed - hgBbiDbLink hg38 gc5BaseBw `pwd`/gc5Base.bw - hgsql -e 'select * from gc5BaseBw;' hg38 -# +-------------------------------------+ -# | fileName | -# +-------------------------------------+ -# | /gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw | -# +-------------------------------------+ - -############################################################################# -## RepeatMasker with CrossMatch - DONE - 2013-12-24,27 - Hiram - mkdir /hive/data/genomes/hg38/bed/repeatMaskerCM - cd /hive/data/genomes/hg38/bed/repeatMaskerCM - # running this step wise so it can be loaded into its own table - time doRepeatMasker.pl -stop=mask -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 - # real 3443m13.026s -# RepeatMasker version June 20 2013 
open-4.0.3 -# Search Engine: cross-match version 1.090518 -# RepeatMasker Database: 20130422 - - # take the install script from this -debug run and alter it to load - # the table into rmskCM - time doRepeatMasker.pl -continue=install -stop=install -debug \ - -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 - cat fb.hg38.rmskCM.txt - # 1586326530 bases of 3209286105 (49.429%) in intersection - - # profile of repeat elements: -# 1852545 rmskClass/SINE.tab -# 1570523 rmskClass/LINE.tab -# 748597 rmskClass/LTR.tab -# 703682 rmskClass/Simple_repeat.tab -# 499108 rmskClass/DNA.tab -# 102856 rmskClass/Low_complexity.tab -# 7962 rmskClass/Satellite.tab -# 5750 rmskClass/Retroposon.tab -# 5667 rmskClass/LTR?.tab -# 5622 rmskClass/Unknown.tab -# 4516 rmskClass/snRNA.tab -# 3294 rmskClass/DNA?.tab -# 2026 rmskClass/tRNA.tab -# 1840 rmskClass/rRNA.tab -# 1784 rmskClass/RC.tab -# 1672 rmskClass/srpRNA.tab -# 1420 rmskClass/scRNA.tab -# 704 rmskClass/RNA.tab -# 411 rmskClass/RC?.tab -# 38 rmskClass/SINE?.tab - - # using this RM result with trfMask for the final masked sequence - cd /hive/data/genomes/hg38 - twoBitMask hg38.rmskCM.2bit -add bed/simpleRepeat/trfMask.bed hg38.2bit - twoBitToFa hg38.2bit stdout | faSize stdin > faSize.hg38.2bit.txt -# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper 1588630985 lower) in 455 sequences in 1 files -# %49.50 masked total, %52.10 masked real - - featureBits -countGaps hg38 rmskCM '!rmskHmmer' -bed=crossMatchUnique.bed - # 24868153 bases of 3209286105 (0.775%) in intersection - hgLoadBed hg38 crossMatchUnique crossMatchUnique.bed - # Read 2352219 elements of size 4 from crossMatchUnique.bed - -############################################################################# -## repeating RepeatMasker Blastn run (DONE - 2014-01-07 - Hiram) - mkdir /hive/data/genomes/hg38/bed/rmskBlastn - cd /hive/data/genomes/hg38/bed/rmskBlastn - - time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - 
-useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ - -stop=mask -buildDir=`pwd` hg38 > mask.log - # real 203m33.670s - -# 3209286105 bases (159970322 N's 3049315783 real 1491207906 upper 1558107877 lower) in 455 sequences in 1 files -# %48.55 masked total, %51.10 masked real - - # install step with debug so the script can be altered to load into - # a specific rmskBlastn table: - - $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ - -continue=install -debug -buildDir=`pwd` hg38 - -############################################################################# -## repeating RepeatMasker cross-match run (DONE - 2014-01-07 - Hiram) - mkdir /hive/data/genomes/hg38/bed/rmskCM - cd /hive/data/genomes/hg38/bed/rmskCM - - # missed recording stderr .... forgot the 2>&1 - time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ - -stop=mask -buildDir=`pwd` hg38 > mask.log - # real 1897m33.517s - # running from Tue Jan 7 16:10:33 PST 2014 thru 08 Jan 23:48 -# *** All done! 
(through the 'mask' step) - Elapsed time: 1897m34s -# *** Steps were performed in /hive/data/genomes/hg38/bed/rmskCM - # running install manually to allow edit of the script to load - # a specific rmskCm table - time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ - -continue=install -stop=install -buildDir=`pwd` hg38 -debug - -############################################################################# -## RepeatMasker with RM Blastn - DONE - 2013-12-24,25 - Hiram - mkdir /hive/data/genomes/hg38/bed/repeatMaskerBlastn - cd /hive/data/genomes/hg38/bed/repeatMaskerBlastn - # running this step wise so it can be loaded into its own table - time doRepeatMasker.pl -stop=mask -useRMBlastn -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 - # real 354m55.842s - - # take the install script from this -debug run and alter it to load - # the table into rmskBlastn - doRepeatMasker.pl -useRMBlastn -bigClusterHub=ku -continue=install \ - -stop=install -debug -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 - # 1560264046 bases of 3209286105 (48.617%) in intersection - # profile of repeat elements: -# 1824560 rmskClass/SINE.tab -# 1552814 rmskClass/LINE.tab -# 738435 rmskClass/LTR.tab -# 715998 rmskClass/Simple_repeat.tab -# 486591 rmskClass/DNA.tab -# 105026 rmskClass/Low_complexity.tab -# 7712 rmskClass/Satellite.tab -# 5638 rmskClass/Retroposon.tab -# 5276 rmskClass/Unknown.tab -# 5100 rmskClass/LTR?.tab -# 4548 rmskClass/snRNA.tab -# 3033 rmskClass/DNA?.tab -# 1987 rmskClass/tRNA.tab -# 1809 rmskClass/rRNA.tab -# 1710 rmskClass/RC.tab -# 1633 rmskClass/srpRNA.tab -# 1428 rmskClass/scRNA.tab -# 614 rmskClass/RNA.tab -# 376 rmskClass/RC?.tab -# 38 rmskClass/SINE?.tab -# 3 rmskClass/Unspecified.tab -# 5464329 total - -############################################################################# -## repeating RepeatMasker run with HMMER - DONE - 2014-01-08 - Hiram - mkdir 
/hive/data/genomes/hg38/bed/rmskHmmer - cd /hive/data/genomes/hg38/bed/rmskHmmer - - # trying cpu=4 and ram=32g - time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - -stop=mask -useHMMER -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 - # 6 jobs required more than 32 Gb of memory to complete, ran them on - # hgwdev to complete, then continuing: - time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - -continue=cat -stop=mask -useHMMER -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > cat.log 2>&1 - # real 24m5.274s -# 3209286105 bases (159970322 N's 3049315783 real 1314916231 upper 1734399552 lower) in 455 sequences in 1 files -# %54.04 masked total, %56.88 masked real - - # running install manually to allow edit of the script to load - # a specific rmskHmmer table - time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \ - -continue=install -debug -useHMMER -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 - - time ./doLoad_rmskHmmer.bash > load.log 2>&1 - # real 4m47.432s - - featureBits -countGaps hg38 rmskHmmer > fb.hg38.rmskHmmer.txt 2>&1 - # 1734398971 bases of 3209286105 (54.043%) in intersection - - grep rmskClass hg38.class.profile.txt \ - | sed -e 's#rmskClass/##; s/.tab//;' | sort -rn - # profile of repeat elements: -# 1884179 SINE -# 1702529 LINE -# 805427 LTR -# 636906 Simple_repeat -# 565171 DNA -# 95480 Low_complexity -# 11861 Retroposon -# 10852 Satellite -# 9181 LTR? -# 6783 scRNA -# 4582 DNA? -# 3914 Unknown -# 2059 RC -# 1517 srpRNA -# 1484 RNA -# 970 SINE? -# 806 RC? 
-# 464 rRNA -# 5744165 total - - featureBits -countGaps hg38 rmskHmmer '!rmskCM' -bed=hmmerUnique.bed - # 172940594 bases of 3209286105 (5.389%) in intersection - hgLoadBed hg38 hmmerUnique hmmerUnique.bed - # Read 3099505 elements of size 4 from hmmerUnique.bed - -############################################################################# -## RepeatMasker with HMMER - DONE - 2013-12-24,26 - Hiram - mkdir /hive/data/genomes/hg38/bed/repeatMaskerHMMER - cd /hive/data/genomes/hg38/bed/repeatMaskerHMMER - - time doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1 - # take the install script from this -debug run and alter it to load - # the table into rmskHmmer - doRepeatMasker.pl -continue=install -stop=install -useHMMER \ - -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \ - -buildDir=`pwd` hg38 > mask.log 2>&1 - # 1702017722 bases of 3209286105 (53.034%) in intersection - # profile of repeat elements: -# 1879864 rmskClass/SINE.tab -# 1678216 rmskClass/LINE.tab -# 794231 rmskClass/LTR.tab -# 651561 rmskClass/Simple_repeat.tab -# 551965 rmskClass/DNA.tab -# 97186 rmskClass/Low_complexity.tab -# 10756 rmskClass/Retroposon.tab -# 10448 rmskClass/Satellite.tab -# 8393 rmskClass/LTR?.tab -# 5849 rmskClass/scRNA.tab -# 4282 rmskClass/Unknown.tab -# 4276 rmskClass/DNA?.tab -# 2000 rmskClass/RC.tab -# 1573 rmskClass/srpRNA.tab -# 1291 rmskClass/RNA.tab -# 906 rmskClass/snRNA.tab -# 747 rmskClass/SINE?.tab -# 723 rmskClass/RC?.tab -# 722 rmskClass/rRNA.tab -# 468 rmskClass/tRNA.tab -# 5705457 total - -############################################################################# -# rmsk from genbank release (DONE - 2014-12-25 - Hiram) - mkdir /hive/data/genomes/hg38/bed/repeatMaskerGenbank - cd /hive/data/genomes/hg38/bed/repeatMaskerGenbank - - head -3 ../repeatMaskerBlastn/hg38.fa.out > genbank.rm.out -find ../../genbank -type f | grep rm.out | grep -v "/placed_scaffolds/" | while read F -do 
- headRest 3 $F -done | sort -k5,45 -k6,6n >> genbank.rm.out - grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ - | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt - - sed -e "`cat accessionToUcsc.sed.txt`" genbank.rm.out > ucscNames.rm.out - - head -3 ucscNames.rm.out > hg38.sorted.fa.out - tail -n +4 ucscNames.rm.out | sort -k5,5 -k6,6n >> hg38.sorted.fa.out - - hgLoadOut -table=rmskGenbank -nosplit hg38 hg38.sorted.fa.out - hgLoadOut -verbose=2 -tabFile=hg38.rmskGenbank.tab -table=rmskGenbank \ - -nosplit hg38 hg38.sorted.fa.out 2> bad.records.txt - # fixed up one of the masking scripts from the other runs to construct - # the bbi files - - # 1581568556 bases of 3209286105 (49.281%) in intersection - # profile of repeat elements: -# 1849444 rmskClass/SINE.tab -# 1586141 rmskClass/LINE.tab -# 759248 rmskClass/LTR.tab -# 502186 rmskClass/DNA.tab -# 433789 rmskClass/Simple_repeat.tab -# 396378 rmskClass/Low_complexity.tab -# 10198 rmskClass/Satellite.tab -# 5884 rmskClass/LTR?.tab -# 4595 rmskClass/snRNA.tab -# 4163 rmskClass/Retroposon.tab -# 2802 rmskClass/Unknown.tab -# 2157 rmskClass/DNA?.tab -# 2154 rmskClass/tRNA.tab -# 1915 rmskClass/rRNA.tab -# 1860 rmskClass/RC.tab -# 1784 rmskClass/srpRNA.tab -# 1397 rmskClass/scRNA.tab -# 822 rmskClass/RNA.tab -# 488 rmskClass/SINE?.tab -# 445 rmskClass/RC?.tab -# 5567850 total - -############################################################################# -## running TRF simple repeats - DONE - 2013-12-24,29 - Hiram - # this procedure ran into much trouble on this release. The new - # repeat sequences in the centromeres caused trf to run indefinitely. - # I tried different sizes of chunks, working down to 20 Mbase chunks. - # Even still, some jobs would not complete. Those broke down even - # more, eventually to the smallest bit of 30 Kbase that needed to - # run all the way down to 3,000 based chunks with 1,000 base overlaps. 
- - # this did not work: - screen # use screen to manage this day-long job - mkdir /hive/data/genomes/hg38/bed/simpleRepeat - cd /hive/data/genomes/hg38/bed/simpleRepeat - time doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \ - -smallClusterHub=ku -buildDir=`pwd` hg38 > do.log 2>&1 - cd /hive/data/genomes/hg38/bed - # move it aside: - mv simpleRepeat simpleRepeat.2013-12-24 - - # Instead, something like this: - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap - mkdir -p noGap - - twoBitToFa ../../../hg38.unmasked.2bit stdout \ - | faSplit -lift=noGap.lift gap stdin 5000000 noGap/hg38_ - # make sure nothing has gone missing: - faCount noGap/*.fa > faCount.txt - tail -1 faCount.txt -# total 3068387174 898285419 623727342 626335137 900967885 19071391 30979734 - # compared to the full sequence, same numbers for ACGT: - twoBitToFa ../../../hg38.unmasked.2bit stdout | faCount stdin -# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 - faToTwoBit noGap/*.fa hg38.nogap.2bit - twoBitInfo hg38.nogap.2bit stdout | sort -k2,2nr > hg38.nogap.sizes - - - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M - rm -rf /hive/data/genomes/hg38/TrfPart20M - /cluster/bin/scripts/simplePartition.pl \ -/hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap/hg38.nogap.2bit \ - 20000000 /hive/data/genomes/hg38/TrfPart20M - rm -f /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M - ln -s /hive/data/genomes/hg38/TrfPart20M \ - /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M - ssh ku - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M - gensub2 /hive/data/genomes/hg38/TrfPart20M/partitions.lst single gsub jobList - para create jobList - para push - # 20 jobs would not complete: -# Completed: 143 of 163 jobs -# Jobs currently running: 20 -# CPU time in 
finished jobs: 76994s 1283.24m 21.39h 0.89d 0.002 y -# IO & Wait Time: 1095s 18.24m 0.30h 0.01d 0.000 y -# Time in running jobs: 1807279s 30121.32m 502.02h 20.92d 0.057 y -# Average job time: 546s 9.10m 0.15h 0.01d -# Longest running job: 90422s 1507.03m 25.12h 1.05d -# Longest finished job: 43348s 722.47m 12.04h 0.50d -# Submission to last job: 43363s 722.72m 12.05h 0.50d - # determine which are the last jobs as individual bits: - para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \ - > not.done.list - awk '{print $NF}' not.done.list | sed -e 's/.bed//' | while read F -do - cat $F -done > seq.specs.not.done - - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs - mkdir fasta - for seqSpec in `cat ../seq.specs.not.done` -do - fName=`echo $seqSpec | sed -e 's/.*://'` - echo $fName - twoBitToFa $seqSpec fasta/$fName.fa -done - ls -1S `pwd`/fasta > part.list - cat << '_EOF_' > template -#LOOP -./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - - cat << '_EOF_' > runTrf -#!/bin/bash -set -beEu -o pipefail -export path1=$1 -export inputFN=`basename $1` -export outpath=$2 -export outputFN=`basename $2` -mkdir -p /dev/shm/$outputFN -cp -p $path1 /dev/shm/$outputFN -cd /dev/shm/$outputFN -/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ - $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm -cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs -rm -f $outpath -cp -p /dev/shm/$outputFN/$outputFN $outpath -rm -fr /dev/shm/$outputFN/* -rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN -'_EOF_' - # << happy emacs - chmod +x runTrf - - gensub2 part.list single template jobList - para create jobList - para push - # not all of these jobs will finish either: -# Completed: 85 of 106 jobs -# Jobs currently running: 21 -# CPU time in finished jobs: 58076s 967.93m 16.13h 0.67d 0.002 y 
-# IO & Wait Time: 828s 13.81m 0.23h 0.01d 0.000 y -# Time in running jobs: 1988997s 33149.95m 552.50h 23.02d 0.063 y -# Average job time: 693s 11.55m 0.19h 0.01d -# Longest running job: 94730s 1578.83m 26.31h 1.10d -# Longest finished job: 34216s 570.27m 9.50h 0.40d -# Submission to last job: 34342s 572.37m 9.54h 0.40d - - # can use what we have here: - liftUp result.bed ../../splitGap/noGap.lift error bed/*.bed - # find jobs not done - para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \ - > not.done.list - # splitting up those last jobs: - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits - mkdir noGap - awk '{print $2}' ../lastJobs/not.done.list | while read F -do - cp -p $F ./noGap/ -done - - # split into 1,000,000 chunks with 10,000 overlap: - mkdir -p 1M_10K - -for F in noGap/*.fa -do - B=`basename $F | sed -e 's/.fa//'` - echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_" - faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_ -done - - ls -1S `pwd`/1M_10K/*.fa > part.list - cat << '_EOF_' > runTrf -#!/bin/bash -set -beEu -o pipefail -export path1=$1 -export inputFN=`basename $1` -export outpath=$2 -export outputFN=`basename $2` -mkdir -p /dev/shm/$outputFN -cp -p $path1 /dev/shm/$outputFN -cd /dev/shm/$outputFN -/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ - $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm -cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits -rm -f $outpath -cp -p /dev/shm/$outputFN/$outputFN $outpath -rm -fr /dev/shm/$outputFN/* -rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN -'_EOF_' - # << happy emacs - - cat << '_EOF_' > template -#LOOP -./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - - gensub2 part.list single template jobList - para create jobList - para push - # not all of these jobs will 
complete either: -# Completed: 53 of 96 jobs -# CPU time in finished jobs: 212403s 3540.05m 59.00h 2.46d 0.007 y -# IO & Wait Time: 1851s 30.85m 0.51h 0.02d 0.000 y -# Average job time: 4043s 67.38m 1.12h 0.05d -# Longest finished job: 68726s 1145.43m 19.09h 0.80d -# Submission to last job: 68890s 1148.17m 19.14h 0.80d - # use what results we have here: - cat *.lift | liftUp parts.bed stdin error bed/*.bed - liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed \ - | sort -u | sort -k1,1 -k2,2n > hg38.result.bed - - para status | grep -v -w done | awk '{print $(NF-1)}' > will.not.finish.txt - - # split those last bits: - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits - mkdir splitBits - cat ../splitBits/will.not.finish.txt | while read F -do - cp -p $F splitBits -done - - # 100K chunks with 10K overlap - mkdir -p 100K_10K - -for F in splitBits/*.fa -do - B=`basename $F | sed -e 's/.fa//'` - echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/$B_" - faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_ -done - - cat << '_EOF_' > runTrf -#!/bin/bash -set -beEu -o pipefail -export path1=$1 -export inputFN=`basename $1` -export outpath=$2 -export outputFN=`basename $2` -mkdir -p /dev/shm/$outputFN -cp -p $path1 /dev/shm/$outputFN -cd /dev/shm/$outputFN -/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ - $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm -cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits -rm -f $outpath -cp -p /dev/shm/$outputFN/$outputFN $outpath -rm -fr /dev/shm/$outputFN/* -rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN -'_EOF_' - # << happy emacs - chmod +x runTrf - - cat << '_EOF_' > template -#LOOP -./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - - ls -1S `pwd`/100K_10K/*.fa > part.list - gensub2 
part.list single template jobList - para create jobList - para push - # one last bit does not complete: -# Completed: 420 of 421 jobs -# CPU time in finished jobs: 19862s 331.04m 5.52h 0.23d 0.001 y -# IO & Wait Time: 2360s 39.33m 0.66h 0.03d 0.000 y -# Average job time: 53s 0.88m 0.01h 0.00d -# Longest finished job: 368s 6.13m 0.10h 0.00d -# Submission to last job: 448s 7.47m 0.12h 0.01d - - # can use the results obtained here: - cat *.lift | liftUp splitParts.bed stdin error bed/*.bed - cat ../splitBits/*.lift | liftUp parts.bed stdin error splitParts.bed - liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ - | sort -k1,1 -k2,2n > hg38.result.bed - - para status | grep -v -w done | awk '{print $(NF-1)}' - # last chunk: 100K_10K/hg38_89_2_00.fa - - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K - cp -p ../splitSplitBits/100K_10K/hg38_89_2_00.fa . - - # 20K chunks with 10K overlap: - mkdir -p 20K_10K - -for F in hg38_89_2_00.fa -do - B=`basename $F | sed -e 's/.fa//'` - echo "faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/$B_" - faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_ -done - - ls -1S `pwd`/20K_10K/*.fa > part.list - cat << '_EOF_' > runTrf -#!/bin/bash -set -beEu -o pipefail -export path1=$1 -export inputFN=`basename $1` -export outpath=$2 -export outputFN=`basename $2` -mkdir -p /dev/shm/$outputFN -cp -p $path1 /dev/shm/$outputFN -cd /dev/shm/$outputFN -/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ - $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm -cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K -rm -f $outpath -cp -p /dev/shm/$outputFN/$outputFN $outpath -rm -fr /dev/shm/$outputFN/* -rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN -'_EOF_' - # << happy emacs - chmod +s runTrf - cat << '_EOF_' > template -#LOOP -./runTrf {check in line+ $(path1)} {check out 
line bed/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - - gensub2 part.list single template jobList - para create jobList - para push - # one of these jobs will not finish: -# Completed: 4 of 5 jobs -# CPU time in finished jobs: 10s 0.17m 0.00h 0.00d 0.000 y -# IO & Wait Time: 16s 0.26m 0.00h 0.00d 0.000 y -# Average job time: 7s 0.11m 0.00h 0.00d -# Longest finished job: 8s 0.13m 0.00h 0.00d -# Submission to last job: 16s 0.27m 0.00h 0.00d - - # can use the results we have here: - cat *.lift | liftUp 20Kparts.bed stdin error bed/*.bed - cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kparts.bed - cat ../splitBits/*.lift | liftUp parts.bed stdin error 100Kpart.bed - liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ - | sort -k1,1 -k2,2n > hg38.result.bed - - # finally, what turns out to be the last batch: - mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K - cp -p ../last100K/20K_10K/hg38_89_2_00_3.fa . 
- - # 2K chunks with 1K overlap - mkdir -p 2K_1K - -for F in hg38_89_2_00_3.fa -do - B=`basename $F | sed -e 's/.fa//'` - echo "faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/$B_" - faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_ -done - - ls -1S `pwd`/2K_1K/*.fa > part.list - cat << '_EOF_' > runTrf -#!/bin/bash -set -beEu -o pipefail -export path1=$1 -export inputFN=`basename $1` -export outpath=$2 -export outputFN=`basename $2` -mkdir -p /dev/shm/$outputFN -cp -p $path1 /dev/shm/$outputFN -cd /dev/shm/$outputFN -/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \ - $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm -cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K -rm -f $outpath -cp -p /dev/shm/$outputFN/$outputFN $outpath -rm -fr /dev/shm/$outputFN/* -rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN -'_EOF_' - # << happy emacs - chmod +x runTrf - cat << '_EOF_' > template -#LOOP -./runTrf {check in line+ $(path1)} {check out line bed/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - - gensub2 part.list single template jobList - para create - para push -# Completed: 15 of 15 jobs -# CPU time in finished jobs: 1s 0.02m 0.00h 0.00d 0.000 y -# IO & Wait Time: 26s 0.43m 0.01h 0.00d 0.000 y -# Average job time: 2s 0.03m 0.00h 0.00d -# Longest finished job: 4s 0.07m 0.00h 0.00d -# Submission to last job: 14s 0.23m 0.00h 0.00d - - cat *.lift | liftUp 2Kparts.bed stdin error bed/*.bed - cat ../last100K/*.lift | liftUp 20Kpart.bed stdin error 2Kparts.bed - cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kpart.bed - cat ../splitBits/*.lift | liftUp parts.bed stdin error 100Kpart.bed - liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \ - | sort -k1,1 -k2,2n > hg38.result.bed - - ## To put it all together: - cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M - cat /hive/data/genomes/hg38/TrfPart20M/???/*.bed lastJobs/bed/*.bed \ - splitBits/parts.bed 
splitSplitBits/parts.bed last100K/parts.bed \ - last30K/parts.bed > beforeLift.simpleRepeat.bed - liftUp -type=.bed stdout ../splitGap/noGap.lift error \ - beforeLift.simpleRepeat.bed | sort -u \ - | sort -k1,1 -k2,2n > simpleRepeat.bed - - awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed - - hgLoadBed hg38 simpleRepeat simpleRepeat.bed \ - -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql - featureBits hg38 simpleRepeat > fb.simpleRepeat 2>&1 - cat fb.simpleRepeat -# 146785521 bases of 3049335806 (4.814%) in intersection - - cd /hive/data/genomes/hg38/bed - ln -s simpleRepeat.2013-12-27/run20M simpleRepeat - -############################################################################ - - # WINDOWMASKER - DONE - 2013-12-24 - Hiram - mkdir /hive/data/genomes/hg38/bed/windowMasker - cd /hive/data/genomes/hg38/bed/windowMasker - time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ - -dbHost=hgwdev hg38 > do.log 2>&1 & - -############################################################################ -# Verify all gaps are marked - DONE - 2013-12-24 - Hiram - mkdir /hive/data/genomes/hg38/bed/gap - cd /hive/data/genomes/hg38/bed/gap - time nice -n +19 findMotif -motif=gattaca -verbose=4 \ - -strand=+ ../../hg38.unmasked.2bit > findMotif.txt 2>&1 - # real 0m28.634s - grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed - featureBits hg38 -not gap -bed=notGap.bed - # 3049335806 bases of 3049335806 (100.000%) in intersection - time featureBits hg38 allGaps.bed notGap.bed -bed=new.gaps.bed - # 20023 bases of 3049335806 (0.001%) in intersection - # real 0m20.427s - # this indicates that 20,023 bases are not marked as N's - # with this element size profile: - awk '{print $3-$2}' new.gaps.bed | ave stdin -# Q1 1.000000 -# median 1.000000 -# Q3 100.000000 -# average 44.894619 -# min 1.000000 -# max 1000.000000 -# count 446 -# total 20023.000000 -# standard deviation 81.743447 - - # the four largest ones: -# 1000 chr2 32916625 32917625 
chr2.7 -# 1000 chr2 32867130 32868130 chr2.6 -# 348 chr20 36314371 36314719 chr20.36 -# 200 chr12 123443533 123443733 chr12.10 - -######################################################################### -## CYTOBAND - fixing the ideogram track (DONE - 2014-06-11 - Hiram) - ## the file we used before was broken - mkdir -p /hive/data/outside/ncbi/ideogram/2014-06 - cd /hive/data/outside/ncbi/ideogram/2014-06 - # fetch all the ideogram files: - rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ - mkdir /hive/data/genomes/hg38/bed/cytoBandUpdate - cd /hive/data/genomes/hg38/bed/cytoBandUpdate - - # Create bed file - $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ -/hive/data/outside/ncbi/ideogram/2014-06/ideogram_9606_GCF_000001305.14_850_V1 - - # add in the other genome data: - hgsql -N -e 'select * from cytoBand;' hg38 \ - | egrep "chrU|chrM|_alt|_random" >> cytoBand.bed - - $HOME/kent/src/utils/ncbi/cytoBandVerify.pl - # everything checks out OK on 455 chroms - - # Load the bed file - hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ - hg38 cytoBand cytoBand.bed - cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head - # 23 - sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql - sort -k1,1 -k2,2n cytoBand.bed \ - | hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin - - # Make cytoBandIdeo track for ideogram gif on hgTracks page. - # cytoBandIdeo is just a replicate of the cytoBand track. 
- hgsql -e "drop table cytoBandIdeo;" hg38 - hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;" - -######################################################################### -## CYTOBAND - ideogram track (DONE - 2014-03-04 - Hiram) - ssh hgwdev - mkdir -p /hive/data/outside/ncbi/ideogram/2014-03 - cd /hive/data/outside/ncbi/ideogram/2014-03 - - # fetch all the ideogram files: - rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ - - mkdir /hive/data/genomes/hg38/bed/cytoBand - cd /hive/data/genomes/hg38/bed/cytoBand - - # Create bed file - $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ -/hive/data/outside/ncbi/ideogram/2014-03/ideogram_9606_GCF_000001305.14_850_V1 - - # add in the other genome data: - hgsql -N -e 'select * from cytoBand;' hg38 > bobTable.bed - - egrep "chrU|chrM|_alt|_random" bobTable.bed >> cytoBand.bed - - ## can now verify before load: - $HOME/kent/src/utils/ncbi/cytoBandVerify.pl - # everything checks out OK on 455 chroms - - # Load the bed file - hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ - hg38 cytoBand cytoBand.bed - cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head - # 23 - sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql - sort -k1,1 -k2,2n cytoBand.bed \ - | hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin - - # Make cytoBandIdeo track for ideogram gif on hgTracks page. - # cytoBandIdeo is just a replicate of the cytoBand track. 
- hgsql -e "drop table cytoBandIdeo;" hg38 - hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;" - -########################################################################## -# cytoBandIdeo - (DONE - 2013-12-26 - Hiram) - mkdir /hive/data/genomes/hg38/bed/cytoBand - cd /hive/data/genomes/hg38/bed/cytoBand - makeCytoBandIdeo.csh hg38 - -#making temporary liftover of items from hg19 -liftOver /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \ - /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \ - cytobands.bed unMapped - -liftOver -minBlocks=0.5 /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \ - /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \ - cytobands.0.5.bed unMapped0.5 - -############################### ###################### -# cytoBandIdeo - (reDONE - 2014-02-25 - kuhn) - -# adding centromeres to generic cytoBandIdeo table as it exists. -# (lifted track is already gone) - -# get the cen values for hg38 -hgsql -Ne "SELECT DISTINCT chrom FROM centromeres" hg38 | sort > hg38.chroms -rm -f hg38.cens -foreach chrom (`cat hg38.chroms`) - set cenStart="" - set cenEnd="" - set cenStart=`hgsql -Ne 'SELECT MIN(chromStart) FROM centromeres WHERE chrom = "'$chrom'"' hg38` - set cenEnd=`hgsql -Ne 'SELECT MAX(chromEnd) FROM centromeres WHERE chrom = "'$chrom'"' hg38` - echo "$chrom $cenStart $cenEnd" >> hg38.cens -end - -# Modified makeCytoBandIdeo.csh to use this file instead of looking -# for centromeres in a gap table. -# Replaced existing cytoBandIdeo table, which was really only a copy -# of chromInfo. 
- -########################################################################## -# hg19 <-> hg38 difference tracks (DONE - 2013-12-28 - Hiram) - mkdir /hive/data/genomes/hg19/bed/liftOverHg38 - cd /hive/data/genomes/hg19/bed/liftOverHg38 - - # not needed, but interesting, collect all the fragment - # definitions from the gold tables: - hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \ - | sort > hg19.gold.frags.tab - - hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg38 \ - | sort > hg38.gold.frags.tab - - # construct common and difference listings - comm -12 hg19.gold.frags.tab hg38.gold.frags.tab \ - > identical.hg19.hg38.frags.tab - comm -23 hg19.gold.frags.tab hg38.gold.frags.tab \ - > unique.hg19Only.frags.tab - comm -13 hg19.gold.frags.tab hg38.gold.frags.tab \ - > unique.hg38Only.frags.tab - - # better yet, get full information about each fragment - hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \ - | sort -k6 > hg19.gold.tab - - hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg38 \ - | sort -k6 > hg38.gold.tab - - # construct a single key for each fragment for joining. 
- # the key is frag,fragStart,fragEnd,strand - awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", - $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \ - > hg19.fragKey.tab - awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n", - $6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg38.gold.tab | sort \ - > hg38.fragKey.tab - - # now, by joining those keys, we can get exact identicals, and - # the only-in listings as bed files to load as tracks: - join hg19.fragKey.tab hg38.fragKey.tab \ - | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \ - | sort -k1,1 -k2,2n > hg19.hg38.identical.bed - - join hg19.fragKey.tab hg38.fragKey.tab \ - | awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \ - | sort -k1,1 -k2,2n > hg38.hg19.identical.bed - - join -v 1 hg19.fragKey.tab hg38.fragKey.tab \ - | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ - | sort -k1,1 -k2,2n > hg19.only.bed - - join -v 2 hg19.fragKey.tab hg38.fragKey.tab \ - | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \ - | sort -k1,1 -k2,2n > hg38.only.bed - - hgLoadBed hg19 hg38ContigDiff hg19.only.bed - hgLoadBed hg38 hg19ContigDiff hg38.only.bed - - wc -l hg??.only.bed - # 6097 hg19.only.bed - # 23632 hg38.only.bed - - # this leaves the outstanding question of "why" they might be in - # the only-in listings. 
Some contigs may be different versions, - # sometimes different sections of the same contig are used, - # and contigs are dropped from hg19 to hg38, or new contigs added - # to hg38 to fill in gaps from hg19 - # Let's see if we can measure some of this: - awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list - awk '{print $4}' hg38.only.bed | sort -u > hg38.only.ids.list - - # Looks like 5405 idential contigs with different parts used: - comm -12 hg19.only.ids.list hg38.only.ids.list > differentPortions.list - wc -l differentPortions.list - # 5405 - - # and perhaps 63 = 5468-5405 of different versions of same contig: - sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \ - > hg19.noVersions.ids.list - sed -e "s/\.[0-9]*$//" hg38.only.ids.list | sort -u \ - > hg38.noVersions.ids.list - comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | wc -l - # 5468 - sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \ - > differentPortions.noVersions.list - comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | sort -u \ - > noVersions.common.list - # indeed, 63 contigs of different versions: - comm -23 noVersions.common.list differentPortions.noVersions.list \ - | sort -u > differentVersions.list - wc -l differentVersions.list - # 63 - - # dividing up these items: - cat << '_EOF_' > identifyPortions.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -my %differentVersions; -my %differentPortions; - -open (FH, "<differentVersions.list" ) or - die "can not read differentVersions.list"; -while (my $line = <FH>) { - chomp $line; - $differentVersions{$line} = 1; -} -close (FH); - -open (FH, "differentPortions.list" ) or - die "can not read differentPortions.list"; -while (my $line = <FH>) { - chomp $line; - $differentPortions{$line} = 1; -} -close (FH); - -my %hg19Done; -open (DP, ">hg19.differentPortions.bed") or die "can not write to hg19.differentPortions.bed"; -open (DV, ">hg19.differentVersions.bed") or die "can not write to 
hg19.differentVersions.bed"; -open (FH, "<hg19.only.bed" ) or die "can not read hg19.only.bed"; -while (my $line = <FH>) { - chomp $line; - my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); - # assume done while $acc is still complete - $hg19Done{$acc} = 1; - if (exists($differentPortions{$acc})) { - printf DP "%s\n", $line; - } else { - my $trimAcc = $acc; - $trimAcc =~ s/\.[0-9]+$//; - if (exists($differentVersions{$trimAcc})) { - printf DV "%s\n", $line; - } else { - # this one does not match - $hg19Done{$acc} = 0; - } - } -} -close (FH); -close (DV); -close (DP); -open (DR, ">hg19.dropped.bed") or die "can not write to hg19.dropped.bed"; -open (FH, "<hg19.only.bed" ) or die "can not read hg19.only.bed"; -while (my $line = <FH>) { - chomp $line; - my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); - if (0 == $hg19Done{$acc}) { - printf DR "%s\n", $line; - } -} -close (FH); -close (DR); - -my %hg38Done; -open (DP, ">hg38.differentPortions.bed") or die "can not write to hg38.differentPortions.bed"; -open (DV, ">hg38.differentVersions.bed") or die "can not write to hg38.differentVersions.bed"; -open (FH, "<hg38.only.bed" ) or die "can not read hg38.only.bed"; -while (my $line = <FH>) { - chomp $line; - my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); - # assume done while $acc is still complete - $hg38Done{$acc} = 1; - if (exists($differentPortions{$acc})) { - printf DP "%s\n", $line; - } else { - my $trimAcc = $acc; - $trimAcc =~ s/\.[0-9]+$//; - if (exists($differentVersions{$trimAcc})) { - printf DV "%s\n", $line; - } else { - # this one does not match - $hg38Done{$acc} = 0; - } - } -} -close (FH); -close (DV); -close (DP); -open (DR, ">hg38.newTo19.bed") or die "can not write to hg38.newTo19.bed"; -open (FH, "<hg38.only.bed" ) or die "can not read hg38.only.bed"; -while (my $line = <FH>) { - chomp $line; - my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line); - if (0 == $hg38Done{$acc}) 
{ - printf DR "%s\n", $line; - } -} -close (FH); -close (DR); -'_EOF_' - # << happy emacs - chmod +x identifyPortions.pl - ./identifyPortions.pl - # make sure nothing was lost - sort hg19.differentVersions.bed hg19.differentPortions.bed \ - hg19.dropped.bed | sum - # 43711 233 - sort hg19.only.bed | sum - # 43711 233 - sort hg38.differentVersions.bed hg38.differentPortions.bed \ - hg38.newTo19.bed | sum - # 00502 911 - sort hg38.only.bed | sum - # 00502 911 - - sort -k1,1 -k2,2n hg38.differentVersions.bed hg38.differentPortions.bed \ - hg38.newTo19.bed > hg38.itemRgb.bed - sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed \ - hg19.dropped.bed > hg19.itemRgb.bed - - hgLoadBed hg19 hg38ContigDiff hg19.itemRgb.bed - # if you wanted to load the identicals in this track too: - sort -k1,1 -k2,2n hg38.hg19.identical.bed hg38.itemRgb.bed \ - | hgLoadBed hg38 hg38ContigDiff stdin - # but we don't, we deliver only the differences - hgLoadBed hg38 hg38ContigDiff hg38.itemRgb.bed - -######################################################################### -# construct ooc file to be used in blat operations -# DONE - 2012-12-30 - Hiram -# can be done on unmasked sequence the same result as masked: - cd /hive/data/genomes/hg38 - time blat hg38.unmasked.2bit /dev/null /dev/null \ - -tileSize=11 -makeOoc=jkStuff/hg38.11.ooc -repMatch=1024 - - # been confirmed, the 100-base non-bridged gaps are really non-bridged - gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed hg38 \ - jkStuff/hg38.nonBridged.lft - -############################################################################## -# cpgIslands - (DONE - 2014-01-07 - Hiram) - # run on the Hmmer + trfMask sequence - mkdir /hive/data/genomes/hg38/bed/cpgIslands - cd /hive/data/genomes/hg38/bed/cpgIslands - time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ - -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ - -workhorse=hgwdev -smallClusterHub=ku hg38 > do.log 2>&1 - # real 3m31.684s - # wc -l 
cpgIsland.bed -> 30456 cpgIsland.bed - cat fb.hg38.cpgIslandExt.txt - # 23654068 bases of 3049335806 (0.776%) in intersection - - # Previously in hg19: - featureBits -countGaps hg19 cpgIslandExt - # 21842742 bases of 3137161264 (0.696%) in intersection - - # when run on Hmmer and Trf masked sequence: - # wc -l cpgIsland.bed -> 30416 cpgIsland.bed - # 23635946 bases of 3049335806 (0.775%) in intersection - - # when run on unmasked sequence: - # wc -l cpgIsland.bed -> 55149 cpgIsland.bed - # 33637531 bases of 3049335806 (1.103%) in intersection -############################################################################## -# rerun cpgIslands on contig sequence (DONE - 2014-01-07 - Hiram) - # this is a test of the contig sequence file, - # should get a very similar answer to the above - mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigs - cd /hive/data/genomes/hg38/bed/cpgIslandsContigs - - # run stepwise so the lift can be done on the result before loading - time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ - -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ - -stop=makeBed -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ - -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1 - # real 9m31.502s - # fails on the bedToBigBed creation since this isn't the actual - # hg38 sequence. 
- mv cpgIsland.bed cpgIsland.beforeLift.bed - liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \ - cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed - bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \ - cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb - zcat ../cpgIslands/cpgIsland.bed.gz | sort -k1,1 -k2,2n > t.bed - # Surprisingly, a few more are detected, perhaps due to the different - # masking since this contig run is on the final corrected cross-match rmsk - # plus TRF, the above was on the corrupted HMMER+TRF mask: - wc -l cpgIsland.bed t.bed -# 30477 cpgIsland.bed -# 30456 t.bed - # 2,835 different items between the two: - sort t.bed cpgIsland.bed | uniq -c | awk '$1 < 2' | wc -l - # 2835 - # 29,049 identical items - sort t.bed cpgIsland.bed | uniq -c | awk '$1 == 2' | wc -l - # 29049 - cut -f1-3 cpgIsland.bed | sort > contigs.bed - cut -f1-3 t.bed | sort > fullSequence.bed - # 29,339 identical locations: - comm -12 contigs.bed fullSequence.bed | wc -l - # 29339 - - time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ - -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ - -continue=load -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ - -workhorse=hgwdev -smallClusterHub=ku hg38 > load.log 2>&1 - # real 0m12.056s - - cat fb.hg38.cpgIslandExt.txt - # 23610399 bases of 3049335806 (0.774%) in intersection - -############################################################################## -# rerun cpgIslands on contig UNMASKED sequence (DONE - 2014-01-07 - Hiram) - mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked - cd /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked - - twoBitToFa -noMask ../../hg38.contigs.2bit stdout \ - | faToTwoBit stdin hg38.contigsUnmasked.2bit - - # verify sequence is OK: - twoBitToFa hg38.contigsUnmasked.2bit stdout | faSize stdin -# 3061688741 bases (12372958 N's 3049315783 real 3049315783 upper 0 lower) -# in 733 sequences in 1 files -# %0.00 masked total, 
%0.00 masked real - twoBitToFa hg38.contigsUnmasked.2bit stdout | faCount stdin | tail -1 -# total 3061688741 898285419 623727342 626335137 900967885 12372958 30979743 - # ACGT CpG same as original hg38.2bit except for the missing N's: -# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 - - # run stepwise so the lift can be done on the result before loading - time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ - -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ - -stop=makeBed -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \ - -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1 - # real 11m0.690s - # as above, failed on the bedToBigBed step since this isn't the full hg38 - # sequence - mv cpgIsland.bed cpgIsland.beforeLift.bed - liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \ - cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed - bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \ - cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb - # a lot more here than for masked sequence: - wc -l cpgIsland.bed ../cpgIslandsContigs/cpgIsland.bed - # 55149 cpgIsland.bed - # 30477 ../cpgIslandsContigs/cpgIsland.bed - featureBits -countGaps hg38 cpgIsland.bed - # 33637531 bases of 3209286105 (1.048%) in intersection - featureBits -countGaps hg38 ../cpgIslandsContigs/cpgIsland.bed - # 23610399 bases of 3209286105 (0.736%) in intersection - - # debug load step so it can be loaded into a separate table: - $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \ - -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ - -debug -continue=load -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \ - -workhorse=hgwdev -smallClusterHub=ku hg38 - - time ./doLoadCpg.csh > load.log 2>&1 - # real 0m2.179s - # 33637531 bases of 3049335806 (1.103%) in intersection - -######################################################################### -# construct liftOver to hg19 (DONE - 2013-12-31 - Hiram) - # it turns out it 
doesn't matter if the query or target 2bit files - # are masked. This procedure can be done on completely unmasked sequences - # for both, same result masked or not masked - screen -S hg38 # manage this longish running job in a screen - mkdir /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31 - cd /hive/data/genomes/hg38/bed/blat.hg19.2013-06-10 - # this was run in manual steps as experiments were done about the masking - # check it with -debug first to see if it is going to work: - doSameSpeciesLiftOver.pl -stop=net -buildDir=`pwd` -bigClusterHub=ku \ - -dbHost=hgwdev -workhorse=hgwdev -debug \ - -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc hg38 hg19 - # the debug step doesn't actually construct enough files to run the - # steps manually. The chaining has an extra procedure that is performed - # while not in 'debug' mode - # the run.blat was operated manually, then chaining: - time doSameSpeciesLiftOver.pl -continue=chain -stop=net -buildDir=`pwd` \ - -bigClusterHub=ku \ - -dbHost=hgwdev -workhorse=hgwdev \ - -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \ - hg38 hg19 > chain.log 2>&1 - # real 22m31.635s - # loading is only a few seconds: - doSameSpeciesLiftOver.pl -continue=load -buildDir=`pwd` \ - -bigClusterHub=ku \ - -dbHost=hgwdev -workhorse=hgwdev \ - -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \ - hg38 hg19 > load.log 2>&1 - - # verify this file exists: - # /gbdb/hg38/liftOver/hg38ToHg19.over.chain.gz - # and try out the conversion on genome-test from hg38 to hg19 - # same file should exist for downloads: - # /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz - -############################################################################ -# marking the PAR regions: (DONE - 2014-01-09 - Hiram) - # after much experimentation with the AGP files and the given NCBI - # files in hg38/genbank/Primary_Assembly/pseudoautosomal_region - # the PAR region definitions can be seen in the par_align.gff file: -# CM000685.2 10001 
2781479 -> CM000686.2 10001 2781479 -# CM000685.2 155701383 156030895 -> CM000686.2 56887903 57217415 - # equivalent to: -# chrX 10001 2781479 -> chrY 10001 2781479 -# chrX 155701383 156030895 -> chrY 56887903 57217415 - - # subtract one for the chromStart position: - cat << '_EOF_' > hg38Par.bed4 -chrX 10000 2781479 PAR1 -chrX 155701382 156030895 PAR2 -chrY 10000 2781479 PAR1 -chrY 56887902 57217415 PAR2 -'_EOF_' - # << happy emacs - - hgLoadBed hg38 par hg38Par.bed4 - checkTableCoords hg38 - - # hg19 had: -+-------+------------+-----------+------+ -| chrom | chromStart | chromEnd | name | -+-------+------------+-----------+------+ -| chrX | 60000 | 2699520 | PAR1 | -| chrX | 154931043 | 155260560 | PAR2 | -| chrY | 10000 | 2649520 | PAR1 | -| chrY | 59034049 | 59363566 | PAR2 | -+-------+------------+-----------+------+ - - # The AGP files come close to defining the location, but not - # precisely. The first region uses different bits of AC006209.25: -zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ - | grep AC006209.25 -CM000685.2 2665048 2677319 56 F AC006209.25 127483 139754 - -CM000685.2 2677869 2804801 58 F AC006209.25 1 126933 - -zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ - | grep AC006209.25 -CM000686.2 2665048 2677319 56 F AC006209.25 127483 139754 - -CM000686.2 2677869 2781479 58 F AC006209.25 23323 126933 - - - # and the second region uses different bits of AJ271735.1: -zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ - | grep AJ271735.1 | head -1 -CM000685.2 155676925 155719966 3096 O AJ271735.1 44687 87728 + -zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ - | grep AJ271735.1 | head -1 -CM000686.2 56887903 56906486 356 O AJ271735.1 69145 87728 + - - # combining all the contig definitions from each will find all the - # exact identical contig bits: -zcat 
../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\ - | grep -v "^#" | awk '$5 != "N"' \ - | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \ - | sort > chrY.comp.agp.txt -zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\ - | grep -v "^#" | awk '$5 != "N"' \ - | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \ - | sort > chrX.comp.agp.txt - join -t'^I' chrY.comp.agp.txt chrX.comp.agp.txt | head - -CM000685.2 10001 44821 CM000686.2 10001 44821 -... -CM000685.2 2677320 2677868 CM000686.2 2677320 2677868 - -CM000685.2 155719967 155720351 CM000686.2 56906487 56906871 -... -CM000685.2 155964490 156030895 CM000686.2 57151010 57217415 - -############################################################################ -## altLocations track (DONE - 2014-01-02 - Hiram) - # indicate corresponding locations between haplotypes and reference - mkdir /hive/data/genomes/hg38/bed/altLocations - cd /hive/data/genomes/hg38/bed/altLocations - - find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \ - | while read F -do - grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s\t%d\t%d\tchr%s_%s_alt\n", $6,$12-1,$13,$6, $4}' -done | sort -k1,1 -k2,2n > chrToAlt.bed - - # note silent hidden <tab> character in the join -t argument - # explicit as written here - -find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \ - | while read F -do - grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s_%s_alt\tchr%s:%d-%d\n", $6,$4,$6,$12,$13}' -done | sort > altToChr.tab -sort ../../chrom.sizes | join -t'^I' - altToChr.tab \ - | awk '{printf "%s\t0\t%d\t%s\n", $1,$2,$3}' > altToChr.bed - - - hgLoadBed hg38 altLocations chrToAlt.bed altToChr.bed - featureBits -countGaps hg38 altLocations - # 170113652 bases of 3209286105 (5.301%) in intersection - -############################################################################ -## genscan (DONE - 2014-01-07 - Hiram) - mkdir 
/hive/data/genomes/hg38/bed/genscan - cd /hive/data/genomes/hg38/bed/genscan - - # using the contig sequence - # running stepwise to allow the lifting of the final result - time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \ - -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ - -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ - > do.log 2>&1 - # three jobs did not finish due to almost all N's in the sequence, - # just a couple of bases in each piece. Their empty result is good enough. - time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \ - -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \ - -continue=makeBed -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev \ - -workhorse=hgwdev > makeBed.log 2>&1 - # real 0m48.161s - - cd lifted - mkdir -p gtf subopt nameFixed/gtf nameFixed/pep newNames pep - for F in ../gtf/000/*.gtf -do - B=`basename $F` - liftUp gtf/${B} ../../../jkStuff/hg38.contigs.lift carry $F - echo $B -done - for F in ../subopt/000/*.bed -do - B=`basename $F` - liftUp subopt/${B} ../../../jkStuff/hg38.contigs.lift carry $F - echo $B -done - - ls gtf/chr*_[0-9][0-9].gtf \ - | sed -e 's/_[0-9][0-9]//; s#gtf/##; s/.gtf//;' | sort -u | while read C -do - cat ../pep/000/${C}_[0-9][0-9].pep > pep/${C}.pep - cat gtf/${C}_[0-9][0-9].gtf | ./gtfFixId.pl ${C} > nameFixed/gtf/${C}.gtf - ./pepNameFix.pl ${C} > nameFixed/pep/${C}.pep -done - - cat nameFixed/gtf/*.gtf > ../hg38.genscan.gtf - ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' | while read C -do - cat gtf/${C} -done >> ../hg38.genscan.gtf - - cat nameFixed/pep/*.pep > ../hg38.genscan.pep - ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' \ - | sed -e 's/.gtf/.pep/' | while read C -do - cat ../pep/000/${C} -done >> ../hg38.genscan.pep - - cd /hive/data/genomes/hg38/bed/genscan - cat lifted/subopt/*.bed | sort -k1,1 -k2,2n > hg38.genscanSubopt.bed - - gtfToGenePred hg38.genscan.gtf hg38.genscan.gp - genePredCheck -db=hg38 
hg38.genscan.gp - # checked: 44149 failed: 0 - genePredToBed hg38.genscan.gp hg38.genscan.bed - bedToBigBed hg38.genscan.bed ../../chrom.sizes hg38.genscan.bb - bedToBigBed hg38.genscanSubopt.bed ../../chrom.sizes hg38.genscanSubopt.bb - ldHgGene -gtf hg38 genscan hg38.genscan.gtf -# Read 44149 transcripts in 339212 lines in 1 files -# 44149 groups 345 seqs 1 sources 1 feature types - - cat fb.hg38.genscan.txt - # 58278346 bases of 3049335806 (1.911%) in intersection - cat fb.hg38.genscanSubopt.txt - # 55020514 bases of 3049335806 (1.804%) in intersection - - # oddly, we are getting half of what hg19 had ? - featureBits hg19 genscan - # 106433874 bases of 2897316137 (3.674%) in intersection - - # This is because hg19 was run on soft-masked sequence and not - # on hard masked sequence - -############################################################################ -## genscan on unmasked sequence experiment (DONE - 2013-12-03 - Hiram) - ## instead, working on unmasked sequence: - mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun - cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun - - mkdir liftSpecs - split -a 3 -d -l 1 ../../../jkStuff/hg38.nonBridged.lift liftSpecs/hg38_ - - mkdir fasta -for F in liftSpecs/hg38_* -do - L=`cut -f2 $F` - echo $L - /cluster/home/hiram/kent/src/hg/utils/lft2BitToFa.pl \ - ../../../hg38.unmasked.2bit $F > fasta/${L}.fa -done - - - cat << '_EOF_' > template -#LOOP -./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - cat << '_EOF_' > runGsBig.bash -#!/bin/bash - -set -beEu -o pipefail - -export seqFile=$1 -export resultGtf=$2 -export resultPep=$3 -export resultSubopt=$4 -/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 -'_EOF_' - # << happy emacs - - ls -1S 
`pwd`/fasta/*.fa > part.list - gensub2 part.list single template jobList - para create jobList - para push - # several jobs crashed: -# Completed: 726 of 733 jobs -# Crashed: 7 jobs -# CPU time in finished jobs: 62501s 1041.68m 17.36h 0.72d 0.002 y -# IO & Wait Time: 2563s 42.72m 0.71h 0.03d 0.000 y -# Average job time: 90s 1.49m 0.02h 0.00d -# Longest finished job: 3288s 54.80m 0.91h 0.04d -# Submission to last job: 3294s 54.90m 0.92h 0.04d - - para status | grep -v -w done | awk '{print $(NF-3)}' > crashed.job.list - - mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs - cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs - mkdir splitBits - - for F in chr2.06 chr1.03 chr3.05 chr12.07 chr10.05 chr17.08 chr11.04 -do - faSplit -lift=${F}.lift gap ../fasta/${F}.fa 2000000 splitBits/${F}_ -done - - ls -1S `pwd`/splitBits/*.fa > part.list - cat << '_EOF_' > runGsBig.bash -#!/bin/bash - -set -beEu -o pipefail - -export seqFile=$1 -export resultGtf=$2 -export resultPep=$3 -export resultSubopt=$4 -/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 -'_EOF_' - # << happy emacs - chmod +x runGsBig.bash - - cat << '_EOF_' > template -#LOOP -./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed} -#ENDLOOP -'_EOF_' - # << happy emacs - - gensub2 part.list single template jobList - para create jobList - para push -# Completed: 331 of 334 jobs -# Crashed: 3 jobs -# CPU time in finished jobs: 18097s 301.62m 5.03h 0.21d 0.001 y -# IO & Wait Time: 1085s 18.08m 0.30h 0.01d 0.000 y -# Average job time: 58s 0.97m 0.02h 0.00d -# Longest finished job: 79s 1.32m 0.02h 0.00d -# Submission to last job: 249s 4.15m 0.07h 0.00d - # the last three completed with -window=1600000 - - # lifting results: - cat << '_EOF_' > fixIds.pl -#!/usr/bin/env perl 
- -use strict; -use warnings; - -my $argc = scalar(@ARGV); - -if ($argc != 1) { - printf STDERR "usage: cat chrN.M.lifted | ./fixIds.pl chrN.M\n"; - exit 255; -} - -my $F=shift; -my $C = $F; -$C =~ s/\.[0-9][0-9]//; - -my $id = 0; -my $prevId = ""; -open (GT, ">${F}.gtf") or die "can not write to ${F}.gtf"; -while (my $line=<>) { - chomp $line; - my $geneId = $line; - $geneId =~ s/^${C}.*gene_id "${C}//; - $geneId =~ s/";.*//; - $id += 1 if ( $prevId ne $geneId); - $line =~ s/${C}[0-9]+.[0-9]+/${F}.$id/g; - printf GT "%s\n", $line; - $prevId = $geneId; -} -close (GT); -'_EOF_' - # << happy emacs - chmod +x fixIds.pl - for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 -do - echo "${F}" 1>&2 - cut -f2 ${F}.lift | while read P - do - liftUp -type=.gtf stdout ${F}.lift error gtf/${P}.gtf - done > ${F}.lifted.gtf - cat ${F}.lifted.gtf | ./fixIds.pl ${F} -done - # copied these results to ../gtf/ to get into the final result -# -rw-rw-r-- 1 3349959 Jan 2 15:33 chr1.03.gtf -# -rw-rw-r-- 1 2439182 Jan 2 15:33 chr10.05.gtf -# -rw-rw-r-- 1 1068097 Jan 2 15:33 chr11.04.gtf -# -rw-rw-r-- 1 2392548 Jan 2 15:33 chr12.07.gtf -# -rw-rw-r-- 1 1831336 Jan 2 15:33 chr17.08.gtf -# -rw-rw-r-- 1 3539694 Jan 2 15:33 chr2.06.gtf -# -rw-rw-r-- 1 2309903 Jan 2 15:33 chr3.05.gtf - - for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 -do - echo "${F}" 1>&2 - cut -f2 ${F}.lift | while read P - do - liftUp -type=.bed stdout ${F}.lift error subopt/${P}.bed - done > ${F}.lifted.subopt.bed -done - # copied these results to ../subopt/ to get into the final result -# -rw-rw-r-- 1 3349959 Jan 2 15:33 chr1.03.gtf -# -rw-rw-r-- 1 2439182 Jan 2 15:33 chr10.05.gtf -# -rw-rw-r-- 1 1068097 Jan 2 15:33 chr11.04.gtf -# -rw-rw-r-- 1 2392548 Jan 2 15:33 chr12.07.gtf -# -rw-rw-r-- 1 1831336 Jan 2 15:33 chr17.08.gtf -# -rw-rw-r-- 1 3539694 Jan 2 15:33 chr2.06.gtf -# -rw-rw-r-- 1 2309903 Jan 2 15:33 chr3.05.gtf - - - cat << '_EOF_' > pepNameFix.pl -#!/usr/bin/env perl - -use 
strict; -use warnings; - -# BIG ASSUMPTION ! ! ! - the peptides are in the same order as -# they are in the GTF file ! ! ! - -my $argc = scalar(@ARGV); - -if ($argc != 1) { - printf STDERR "usage: cat chrN.M.needNameFix.pep | ./pepNameFix.pl chrN.M > chrN.M.pep\n"; - exit 255; -} - -my $C=shift; - -my $id = 1; - -while (my $line = <>) { - if ($line =~ m/^>/) { - printf ">%s.%d\n", $C, $id++; - } else { - print $line; - } -} -'_EOF_' - # << happy emacs - chmod +x pepNameFix.pl - -for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05 -do - echo "${F}" 1>&2 - cut -f2 ${F}.lift | while read P - do - cat pep/${P}.pep - done > ${F}.needNameFix.pep - cat ${F}.needNameFix.pep | ./pepNameFix.pl ${F} > ${F}.pep -done - # copied these results to ../pep/ to get into the final result: -# -rw-rw-r-- 1 1592655 Jan 2 15:55 chr1.03.pep -# -rw-rw-r-- 1 1169168 Jan 2 15:55 chr10.05.pep -# -rw-rw-r-- 1 519106 Jan 2 15:55 chr11.04.pep -# -rw-rw-r-- 1 1152111 Jan 2 15:55 chr12.07.pep -# -rw-rw-r-- 1 775052 Jan 2 15:55 chr17.08.pep -# -rw-rw-r-- 1 1799546 Jan 2 15:55 chr2.06.pep -# -rw-rw-r-- 1 1248762 Jan 2 15:55 chr3.05.pep - - # and then, adding in all the results together - - cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun - cat << '_EOF_' > gtfIdFix.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -my $argc = scalar(@ARGV); - -if ($argc != 1) { - printf STDERR "usage: cat lifted/gtf/chrN.gtf | ./gtfIdFix.pl chrN\n"; - exit 255; -} - -my $C=shift; - -my $id = 0; -my $prevId = ""; -open (NM, ">nameFixed/newNames/${C}.tab") or die "can not write to nameFixed/newNames/${C}.tab"; -open (GT, ">nameFixed/gtf/${C}.gtf") or die "can not write to nameFixed/gtf/${C}.gtf"; -while (my $line=<>) { - chomp $line; - my $geneId = $line; - $geneId =~ s/^${C}.*gene_id "//; - $geneId =~ s/";.*//; - if ( $prevId ne $geneId) { - $id += 1; - printf NM "%s\t%s.%d\n", $geneId, $C, $id; - } - $line =~ s/${C}.[0-9]+.[0-9]+/${C}.$id/g; - printf GT "%s\n", $line; - $prevId = $geneId; -} 
-close (GT); -close (NM); -'_EOF_' - # << happy emacs - chmod +x gtfIdFix.pl - - rm -fr lifted - rm -fr nameFix - mkdir -p lifted - mkdir -p lifted/gtf - mkdir -p lifted/pep - mkdir -p lifted/subopt - mkdir -p nameFix - mkdir -p nameFix/gtf - mkdir -p nameFix/newNames - - for F in liftSpecs/hg38_* -do - L=`cut -f2 $F` - C=`cut -f4 $F` - liftUp -type=.gtf stdout ${F} error gtf/${L}.gtf >> lifted/gtf/${C}.gtf - cat pep/${L}.pep >> lifted/pep/${C}.pep - liftUp -type=.bed stdout ${F} error subopt/${L}.bed >> lifted/subopt/${C}.bed -done - - for F in lifted/gtf/*.gtf -do - C=`basename $F | sed -e 's/.gtf//'` - cat $F | ./gtfIdFix.pl $C -done - -mkdir -p nameFixed/pep - - cat << '_EOF_' > pepNameFix.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -my $argc = scalar(@ARGV); -if ($argc != 1) { - printf STDERR "usage: ./pepNameFix.pl chrN > chrN.pep\n"; - exit 255 -} - -my $C = shift; -my %newName; - -open (FH, "<lifted/pep/$C.pep") or die "can not read <lifted/pep/$C.pep"; -open (NM, "<nameFixed/newNames/$C.tab") or die "can not read nameFixed/newNames/$C.tab"; -while (my $line = <NM>) { - chomp $line; - my ($needFix, $fixedName) = split('\t', $line); - $newName{$needFix} = $fixedName; -} -close (NM); - -while (my $line = <FH>) { - if ($line =~m /^>/) { - chomp $line; - $line =~ s/^>//; - die "can not find name to fix $line" if (!exists($newName{$line})); - printf ">%s\n", $newName{$line}; - } else { - print $line; - } -} -close (FH); -'_EOF_' - # << happy emacs - chmod +x pepNameFix.pl - - for F in lifted/pep/*.pep -do - C=`basename $F | sed -e 's/.pep//'` - echo $C - ./pepNameFix.pl $C > nameFixed/pep/$C.pep -done - -############################################################################# -# Mark the new centromere regions (DONE - 2014-01-09 - Hiram) - mkdir /hive/data/genomes/hg38/bed/centromere - cd /hive/data/genomes/hg38/bed/centromere - grep GJ ../../hg38.agp > hg38.centContigs.agp - - awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' 
hg38.centContigs.agp \ - > hg38.centContigs.bed4 - - hgLoadBed hg38 centromeres hg38.centContigs.bed4 - checkTableCoords hg38 centromeres - -############################################################################# -## alternate sequence/haplotype alignments (DONE - 2014-01-23 - Hiram) - mkdir /hive/data/genomes/hg38/bed/lastzAltSequences - cd /hive/data/genomes/hg38/bed/lastzAltSequences - -rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa -mkdir targetFa -mkdir queryFa -touch temp.lift - -cat ../altLocations/chrToAlt.bed | while read L -do - chrName=`echo $L | awk '{print $1}'` - chromSize=`egrep "^$chrName " ../../chrom.sizes | cut -f2` - chrStart=`echo $L | awk '{if (($2-10000)>=0) {printf "%d", $2-10000} else {printf "0"}}'` - chrEnd=`echo $L | awk -v chromSize=$chromSize '{if (($3+10000)<=chromSize) {printf "%d", $3+10000} else {printf "%d", chromSize}}'` - chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'` - queryName=`echo $L | awk '{print $4}'` - partName="${chrName}_${chrStart}_${chrEnd}" - echo $chrName $chrStart $chrEnd $queryName $partName $chromSize - echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift - twoBitToFa ../../hg38.unmasked.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa - twoBitToFa ../../hg38.unmasked.2bit:$queryName queryFa/$queryName.fa -done - -sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift - - # these were run serially on hgwdev, they could be a cluster run: - ssh ku - mkdir /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz - cd /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz - mkdir ../lav ../psl - - # construct the jobList - ls ../targetFa | sed -e 's/.fa//;' | while read partName -do - echo "./runJob.sh ${partName}" -done > jobList - - cat << '_EOF_' > runJob -#!/bin/sh - -export partName=$1 -export target="../targetFa/$partName.fa" -export query="../queryFa/$partName.fa" -export lav="../lav/$partName.lav" -export 
psl="../psl/$partName.psl" - -/cluster/bin/penn/lastz-distrib-1.03.46/bin/lastz \ - $target $query \ - Y=15000 T=2 M=254 O=600 H=2000 O=600 E=150 K=10000 L=10000 \ - Q=/scratch/data/blastz/human_chimp.v2.q > $lav -lavToPsl $lav stdout | liftUp $psl ../hg38.haplotypes.lift error stdin -'_EOF_' - # << happy emacs - - # these were run serially on hgwdev, they could be a cluster run: - time ./jobList > do.log - # real 61m35.898s - - # chaining lastz results: - mkdir -p /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run/chain - cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run - - ls ../../psl/*.psl | while read P -do - B=`basename $P | sed -e 's/.psl//'` - echo $B $P - ls -og $P ../../targetFa/${B}.fa ../../queryFa/${B}.fa - /cluster/home/hiram/kent/src/hg/mouseStuff/axtChain/axtChain \ - -psl -scoreScheme=/scratch/data/blastz/human_chimp.v2.q \ - -minScore=1000 -linearGap=medium $P \ - ../../../../hg38.unmasked.2bit \ - ../../../../hg38.unmasked.2bit stdout \ - | chainAntiRepeat ../../../../hg38.unmasked.2bit \ - ../../../../hg38.unmasked.2bit stdin chain/${B}.chain -done - - # real 7m54.677s - - cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain - find ./run/chain -name "*.chain" | chainMergeSort -inputList=stdin \ - | nice gzip -c > hg38.haplotypes.all.chain.gz - chainPreNet hg38.haplotypes.all.chain.gz ../../../chrom.sizes \ - /hive/data/genomes/hg38/chrom.sizes stdout \ - | chainNet stdin -minSpace=1 ../../../chrom.sizes \ - ../../../chrom.sizes stdout /dev/null \ - | netSyntenic stdin noClass.net - - # Make liftOver chains from chroms to alternates: - netChainSubset -verbose=0 noClass.net hg38.haplotypes.all.chain.gz stdout \ - | chainStitchId stdin stdout | gzip -c > hg38.haplotypes.over.chain.gz - # swap the alignments to get the alternates to chrom mappings: - chainSwap hg38.haplotypes.over.chain.gz stdout \ - | gzip -c > hg38.reference.over.chain.gz - # and put them all together so mappings go both directions - chainMergeSort 
hg38.haplotypes.over.chain.gz hg38.reference.over.chain.gz \ - | gzip -c > hg38.haploReference.over.chain.gz - - hgLoadChain -tIndex hg38 chainAltSequence hg38.haploReference.over.chain.gz - netClass -verbose=0 -noAr noClass.net hg38 hg38 hg38.hg38AltSequence.net - netFilter -minGap=10 hg38.hg38AltSequence.net \ - | hgLoadNet -verbose=0 hg38 netAltSequence stdin - - chainToPsl hg38.haploReference.over.chain.gz ../../../chrom.sizes \ - ../../../chrom.sizes \ - /hive/data/genomes/hg38/hg38.unmasked.2bit \ - /hive/data/genomes/hg38/hg38.unmasked.2bit \ - hg38.beforeRecalc.haploReference.over.psl - - pslCheck -targetSizes=../../../chrom.sizes \ - -querySizes=../../../chrom.sizes \ - hg38.beforeRecalc.haploReference.over.psl 2>&1 | tail -1 - # checked: 3092 failed: 57 errors: 57 - - pslRecalcMatch hg38.beforeRecalc.haploReference.over.psl \ - ../../../hg38.unmasked.2bit ../../../hg38.unmasked.2bit \ - hg38.haploReference.over.psl - - pslCheck -targetSizes=../../../chrom.sizes \ - -querySizes=../../../chrom.sizes \ - hg38.haploReference.over.psl 2>&1 | tail -1 - # checked: 3092 failed: 0 errors: 0 - - hgLoadPsl hg38 -table=altSequenceLiftOver hg38.haploReference.over.psl - -############################################################################# -## construct non-bridged contig sequence (DONE - 2014-01-10 - Hiram) - mkdir /hive/data/genomes/hg38/bed/nonBridgedContigs - cd /hive/data/genomes/hg38/bed/nonBridgedContigs - - # only need the actual split chroms in this lift, and the - # _nn name is a bit more convenient than the .nn: - gapToLift -minGap=100 hg38 stdout | sed -e 's/\./_/;' \ - | awk '$1 != 0' > hg38.contigs.lift - # the warnings gapToLift issues are about gaps defined in the table - # that are abutting to each other. 
telomere gaps are next to contig gaps - # those lifts in the format of a bed file: - awk '{printf "%s\t%d\t%d\t%s\n", $4, $1, $1+$3, $2}' hg38.contigs.lift \ - > hg38.contigs.bed - # the negation of that is the gaps between the contigs - # fixup the .N to _nn with the awk: - featureBits -not -countGaps hg38 hg38.contigs.bed -bed=stdout \ -| awk '{split($4,a,"."); printf "%s\t%d\t%d\t%s_%02d\n", $1,$2,$3,a[1],a[2]}' \ - > hg38.gaps.bed - # 268613637 bases of 3209286105 (8.370%) in intersection - - # together, those two should be 100% of the genome exactly: - featureBits -countGaps -or hg38 hg38.contigs.bed hg38.gaps.bed - # 3209286105 bases of 3209286105 (100.000%) in intersection - - # the list of all those other bits not in the split chroms: - egrep "_alt|chrUn|chrM|_random" hg38.gaps.bed | cut -f1 \ - | sort > other.bits.list - - # extract those chrom pieces and the other bits from the masked sequence: - (twoBitToFa -bed=hg38.contigs.bed ../../hg38.2bit stdout; \ - twoBitToFa -seqList=other.bits.list ../../hg38.2bit stdout) \ - | faToTwoBit stdin hg38.contigs.2bit - twoBitInfo hg38.contigs.2bit stdout | sort -k2nr > hg38.contigs.chrom.sizes - # verify nothing has been lost: - twoBitToFa ../../hg38.2bit stdout | faCount stdin | tail -1 -# total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743 - twoBitToFa hg38.contigs.2bit stdout | faCount stdin | tail -1 -# total 3061688741 898285419 623727342 626335137 900967885 12372958 30979743 - # the ACGT and CpG counts remain the same, only N's have been lost - - # make a copy of this at the top: - cp -p hg38.contigs.2bit ../..
- cp -p hg38.contigs.lift ../../jkStuff - - # load as a track to be able to see where they are: - egrep "chrUn|chrM|_alt|_random" hg38.contigs.chrom.sizes \ - | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $1}' \ - > fullCoverage.hg38Contigs.bed - cat hg38.contigs.bed >> fullCoverage.hg38Contigs.bed - featureBits -or -countGaps hg38 fullCoverage.hg38Contigs.bed gap - # 3209286105 bases of 3209286105 (100.000%) in intersection - - hgLoadBed hg38 contigAlignmentSegments fullCoverage.hg38Contigs.bed - -############################################################################# -## analysis of repeat elements from each RM run -## (DONE - 2014-01-10 - Hiram) - mkdir /hive/data/genomes/hg38/bed/repeatElementCount - cd /hive/data/genomes/hg38/bed/repeatElementCount - for F in ../rmsk*/hg38.class.profile.txt \ - ../repeatMaskerGenbank/hg38.class.profile.txt -do - D=`dirname $F` - B=`basename $D | sed -e 's/repeatMaskerGenbank/NCBI/; s/rmsk//;'` - echo "==== $B ====" - grep rmskClass $F | sed -e 's#rmskClass/##; s/.tab//;' \ - | awk '{printf "%s\t%d\n", $2, $1}' | sort > ${B}.tab -done - - # Hmmer does not have snRNA and tRNA ? - echo -e "snRNA\t0" >> Hmmer.tab - echo -e "tRNA\t0" >> Hmmer.tab - sort Hmmer.tab > t.tab - mv t.tab Hmmer.tab - - echo "# Repeat Masker item counts" > table.result.txt - echo "# class NCBI cross-match rmblastn HMMER" >> table.result.txt - join NCBI.tab CM.tab | join - Blastn.tab | join - Hmmer.tab \ - | awk '{printf "%-15s\t%7d\t%7d\t%7d\t%7d\n", $1,$2,$3,$4,$5}' \ - | sort -k2,2nr >> table.result.txt - - cat table.result.txt -# Repeat Masker item counts -# class NCBI cross-match rmblastn HMMER -SINE 1849444 1852545 1822406 1884179 -LINE 1586141 1570523 1551012 1702529 -LTR 759248 748597 737799 805427 -DNA 502186 499108 485558 565171 -Simple_repeat 433789 703682 716968 636906 -Low_complexity 396378 102856 105181 95480 -Satellite 10198 7962 7703 10852 -LTR? 
5884 5667 5068 9181 -snRNA 4595 4516 4548 0 -Retroposon 4163 5750 5630 11861 -Unknown 2802 5622 5263 3914 -DNA? 2157 3294 3018 4582 -tRNA 2154 2026 1983 0 -rRNA 1915 1840 1810 464 -RC 1860 1784 1706 2059 -srpRNA 1784 1672 1633 1517 -scRNA 1397 1420 1426 6783 -RNA 822 704 611 1484 -SINE? 488 38 38 970 -RC? 445 411 374 806 - -total 5567850 5520017 5459735 5744165 - -############################################################################# -## blat server turned on (DONE - 2014-01-13 - Hiram) -# After getting a blat server assigned by the Blat Server Gods, - ssh hgwdev - - hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ - VALUES ("hg38", "blat4c", "17780", "1", "0"); \ - INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ - VALUES ("hg38", "blat4c", "17781", "0", "1");' \ - hgcentraltest - # test it with some sequence - -############################################################################ -## reset default position to ABO gene (DONE - 2014-01-13 - Hiram) - ssh hgwdev - hgsql -e 'update dbDb set defaultPos="chr9:133252000-133280861" - where name="hg38";' hgcentraltest - -######################################################################### -## update grp table with new set of standard rows (DONE - 2014-01-29 - Hiram) - hgsql -e 'alter table grp rename grpOriginal;' hg38 - hgsql -e 'drop table grp;' hg38 - hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg19.grp" hg38 - hgsql -e 'delete from grp where name="denisova";' hg38 - hgsql -e 'delete from grp where name="pub";' hg38 - hgsql -e 'delete from grp where name="neandertal";' hg38 - hgsql -e 'update grp set defaultIsClosed=0 where name="map";' hg38 - - hgsql -e 'drop table grpOriginal;' hg38 - -############################################################################ -# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2014-01-21 - Hiram) - ssh ku - mkdir /hive/data/genomes/hg38/bed/linSpecRep - cd /hive/data/genomes/hg38/bed/linSpecRep - # create 
individual .out files from the master record in ../repeatMasker - mkdir splitOut - cat << '_EOF_' > split.csh -#!/bin/csh -fe -set C = $1 -head -3 ../repeatMasker/hg38.sorted.fa.out > splitOut/${C}.out -grep "${C} " ../repeatMasker/hg38.sorted.fa.out >> splitOut/${C}.out -'_EOF_' - # << happy emacs - chmod +x split.csh - - cat << '_EOF_' > template -#LOOP -split.csh $(root1) {check out line+ splitOut/$(root1).out} -#ENDLOOP -'_EOF_' - # << happy emacs - - # small ones first: - cut -f1 ../../chrom.sizes | tac > chrom.list - gensub2 chrom.list single template jobList - para create jobList - para try ... check ... push ... etc... -# Completed: 93 of 93 jobs -# CPU time in finished jobs: 127s 2.12m 0.04h 0.00d 0.000 y -# IO & Wait Time: 17154s 285.90m 4.76h 0.20d 0.001 y -# Average job time: 186s 3.10m 0.05h 0.00d -# Longest finished job: 224s 3.73m 0.06h 0.00d -# Submission to last job: 280s 4.67m 0.08h 0.00d - - # now, we can date and process each of those .out files - # constructing the humanSpecific set of repeats - # this means repeats found in human, and not in others - # using mouse here for 'others' is good enough, a variety - # of other species could be used (rat dog cow) where they all - # produce the same result - mkdir dateRepeats - cd dateRepeats - cat << '_EOF_' > mkLSR -#!/bin/bash -set -beEu -o pipefail -rm -f $1.out_mus-musculus -ln -s ../splitOut/$1.out . -/scratch/data/RepeatMasker/DateRepeats $1.out -query human -comp mouse -rm $1.out -mkdir -p ../humanSpecific -/cluster/bin/scripts/extractRepeats 1 $1.out_mus-musculus \ - > ../humanSpecific/$1.out.spec -'_EOF_' - # << happy emacs - chmod +x mkLSR - - cat << '_EOF_' > template -#LOOP -./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec} -#ENDLOOP -'_EOF_' - # << happy emacs - - gensub2 ../chrom.list single template jobList - para try ... check ... push ... etc... 
- para time -# Completed: 455 of 455 jobs -# CPU time in finished jobs: 13985s 233.08m 3.88h 0.16d 0.000 y -# IO & Wait Time: 1470s 24.50m 0.41h 0.02d 0.000 y -# Average job time: 34s 0.57m 0.01h 0.00d -# Longest finished job: 111s 1.85m 0.03h 0.00d -# Submission to last job: 1427s 23.78m 0.40h 0.02d - - - # We also need the nibs for blastz runs with lineage specific repeats - mkdir /hive/data/genomes/hg38/bed/nibs - cd /hive/data/genomes/hg38/bed/nibs - cut -f1 ../../chrom.sizes | while read C -do - twoBitToFa -seq=${C} ../../hg38.2bit stdout \ - | faToNib -softMask stdin ${C}.nib - echo "${C} done" -done - - # verify nothing lost - cat ../../chrom.sizes \ - | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \ - | sh | faSize stdin -# 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper -# 1588630985 lower) in 455 sequences in 1 files -# Total size: mean 7053376.1 sd 31548372.6 -# min 970 (chrUn_KI270394v1.nib:0-970) -# max 248956422 (chr1.nib:0-248956422) median 161218 -# %49.50 masked total, %52.10 masked real - - mkdir /hive/data/staging/data/hg38/nib - rsync -a --progress ./ /hive/data/staging/data/hg38/nib - -############################################################################# -## GRC Contigs/ctgPos2 track (DONE - 2014-12-25 - Hiram) - # provide mapping of UCSC chrom names to GRC names - mkdir /hive/data/genomes/hg38/bed/ctgPos2 - cd /hive/data/genomes/hg38/bed/ctgPos2 - grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ - | awk '{printf "s/^%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt - - find ../../genbank -type f | grep "/assembled_chromosomes/AGP/" | sed -e 's/.comp//' | while read F -do - if [ -s $F ]; then - zcat $F | grep -v "^#" - fi -done | sed -e "`cat accessionToUcsc.sed.txt`" > ucsc.grch38.agp - - awk '$5 != "N"' ucsc.grch38.agp \ -| awk '{printf "%s\t%d\t%s\t%d\t%d\t%s\n", $6, $3-$2+1, $1, $2-1, $3, $5}' \ - | sort -u | sort -k3,3 -k4,4n > ctgPos2.tab - - - export ctgSize=`awk '{print 
length($1)}' ctgPos2.tab | sort -n | tail -1` - export chrSize=`awk '{print length($3)}' ctgPos2.tab | sort -n | tail -1` - - sed -e "s/20/$ctgSize/; s/16/$chrSize/;" \ - /cluster/home/hiram/kent/src/hg/lib/ctgPos2.sql > hg38.ctgPos2.sql - - hgLoadSqlTab hg38 ctgPos2 hg38.ctgPos2.sql ctgPos2.tab - -############################################################################ -# constructing download files (WORKING - 2014-01-15 - Hiram) - # add hg38 to all.joiner and verify it is clean: - joinerCheck -database=hg38 -keys all.joiner -# Checking keys on database hg38 -# hg38.ucscToINSDC.chrom - hits 455 of 455 (100.000%) ok - # and all table coordinates are OK: - checkTableCoords hg38 - - cd /hive/data/genomes/hg38 - time $HOME/kent/src/hg/utils/automation/makeDownloads.pl \ - -workhorse=hgwdev hg38 - # makeDownloads.pl has made a preliminary set of files - - # need to fixup these names and add chromFa.tar.gz files - cd /hive/data/genomes/hg38/goldenPath/bigZips - - mkdir chroms - mkdir maskedChroms - - faSplit byname hg38.fa.gz chroms/ - faSplit byname hg38.fa.masked.gz maskedChroms/ - - tar cvzf ./hg38.chromFa.tar.gz ./chroms/ - tar cvzf ./hg38.chromFaMasked.tar.gz ./maskedChroms/ - - cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips - ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFa.tar.gz hg38.chromFa.tar.gz - ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFaMasked.tar.gz hg38.chromFaMasked.tar.gz - - #also added entries for above to md5sum.txt and README.txt - -############################################################################ -# LASTZ MOUSE Mm10 (DONE - 2014-01-23,31 - Hiram) - # can no longer use the lineage specific repeats with the new lastz - # use a screen to manage this longish job: - screen -S hg38Mm10 - - mkdir /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 - cd /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 - - # best to always specify an exact path to lastz so we know which one is used - # lastz 
default parameters are human-mouse parameters - - cat << '_EOF_' > DEF -# human vs mouse -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=40000000 -SEQ1_LAP=10000 - -# QUERY: Mouse Mm10 -SEQ2_DIR=/scratch/data/mm10/mm10.2bit -SEQ2_LEN=/scratch/data/mm10/chrom.sizes -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23 -TMPDIR=/dev/shm -'_EOF_' - # << happy emacs - - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - -verbose=2 \ - -stop=net `pwd`/DEF \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -fileServer=hgwdev \ - -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 - # real 1494m26.135s ---- busy cluster - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - -verbose=2 \ - -continue=load `pwd`/DEF \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -fileServer=hgwdev \ - -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 - # Elapsed time: 43m11s - cat fb.hg38.chainMm10Link.txt - # 964465044 bases of 3049335806 (31.629%) in intersection - - # and the swap - mkdir /hive/data/genomes/mm10/bed/blastz.hg38.swap - cd /hive/data/genomes/mm10/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23/DEF \ - -swap -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 - # real 83m28.397s - - cat fb.mm10.chainHg38Link.txt - # 937030766 bases of 2652783500 (35.323%) in intersection - -######################################################################### -# LASTZ Dog CanFam3 (DONE - 
2014-01-26 - Hiram) - mkdir /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 - cd /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 - - cat << '_EOF_' > DEF -# human vs dog -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Dog CanFam3 -SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit -SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26 -TMPDIR=/dev/shm -'_EOF_' - # << happy emacs - - # establish a screen to control this job - screen hg38CanFam3 - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \ - `pwd`/DEF \ - -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 - # Elapsed time: 1396m22s - busy cluster - cat fb.hg38.chainCanFam3Link.txt - # 1523987456 bases of 3049335806 (49.978%) in intersection - - # running the swap - mkdir /hive/data/genomes/canFam3/bed/blastz.hg38.swap - cd /hive/data/genomes/canFam3/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26/DEF \ - -syntenicNet -swap \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 - # real 107m57.787s - - cat fb.canFam3.chainHg38Link.txt - # 1437624815 bases of 2392715236 (60.083%) in intersection - -######################################################################### -# LASTZ Macaca Mulatta RheMac3 (DONE - 2014-01-27,02-10 - Hiram) - mkdir /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 - cd 
/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 - - # best to always specify an exact path to lastz so we know which one is used - # lastz default parameters are human-mouse parameters - - cat << '_EOF_' > DEF -# human vs macaca mulatta -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz -# maximum M allowed with lastz is only 254 -BLASTZ_M=254 -BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q -BLASTZ_O=600 -BLASTZ_E=150 -# other parameters from panTro2 vs hg18 lastz on advice from Webb -BLASTZ_K=4500 -BLASTZ_Y=15000 -BLASTZ_T=2 - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Macaca Mulatta RheMac3 -SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit -SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 -SEQ2_IN_CONTIGS=0 - -BASE=/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27 -TMPDIR=/dev/shm -'_EOF_' - # << happy emacs - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - `pwd`/DEF \ - -syntenicNet -fileServer=hgwdev \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 - # Elapsed time: 1426m43s - busy cluster - cat fb.hg38.chainRheMac3Link.txt - # 2431208700 bases of 3049335806 (79.729%) in intersection - - # running the swap - mkdir /hive/data/genomes/rheMac3/bed/blastz.hg38.swap - cd /hive/data/genomes/rheMac3/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27/DEF \ - -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 - # 82m32.329s - cat fb.rheMac3.chainHg38Link.txt - # 2288533769 bases of 2639145830 
(86.715%) in intersection - -######################################################################### -## construct analysis set (DONE - 2014-01-27 - Hiram) - mkdir /hive/data/genomes/hg38/bed/analysisSet - cd /hive/data/genomes/hg38/bed/analysisSet - mkdir -p splitFa - - faToTwoBit \ -../../genbank/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \ - hg38.unmasked.analysisSet.2bit - - faCount splitFa/c*.fa > splitFa.faCount.txt - - egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../rmskCM/hg38.sorted.fa.out \ - > hg38.analysisSet.out - - twoBitMask hg38.unmasked.analysisSet.2bit hg38.analysisSet.out \ - hg38.rmsk.analysisSet.2bit - - egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../simpleRepeat/trfMask.bed \ - > trfMask.analysisSet.bed - - twoBitMask hg38.rmsk.analysisSet.2bit -add trfMask.analysisSet.bed \ - hg38.analysisSet.2bit - - twoBitToFa hg38.unmasked.analysisSet.2bit stdout | faSize stdin -# 3099922541 bases (165046090 N's 2934876451 real 2934876451 upper 0 lower) -# in 195 sequences in 1 files -# Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1) -# max 248956422 (chr1) median 32032 -# %0.00 masked total, %0.00 masked real - - twoBitToFa hg38.analysisSet.2bit stdout | faSize stdin -# 3099922541 bases (165046090 N's 2934876451 real 1409378896 upper 1525497555 -# lower) in 195 sequences in 1 files -# Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1) -# max 248956422 (chr1) median 32032 -# %49.21 masked total, %51.98 masked real - - mkdir hg38.analysisSet.chroms - twoBitToFa hg38.analysisSet.2bit stdout \ - | faSplit byname stdin hg38.analysisSet.chroms/ - - tar cvzf ./hg38.analysisSet.chroms.tar.gz ./hg38.analysisSet.chroms - - ln -s `pwd`/hg38.analysisSet.2bit \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips - ln -s `pwd`/hg38.analysisSet.chroms.tar.gz \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips - # add these md5 sums to md5sum.txt - md5sum hg38.analysisSet.2bit 
hg38.analysisSet.chroms.tar.gz >> \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/md5sum.txt - - cp ../../genbank/README_ANALYSIS_SETS README.analysisSet.txt - # add note at the top of README: - ###################################################################### - UCSC copy of the file from: - - ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/README_ANALYSIS_SETS - - ln -s `pwd`/README.analysisSet.txt \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips - -######################################################################### -# the FULL analysis set (DONE - 2014-03-18 - Hiram - mkdir /hive/data/genomes/hg38/bed/fullAnalysisSet - cd /hive/data/genomes/hg38/bed/fullAnalysisSet - - mkdir hg38.fullAnalysisSet.chroms - twoBitToFa ../analysisSet/hg38.analysisSet.2bit stdout \ - | faSplit byname stdin hg38.fullAnalysisSet.chroms/ - - grep _alt ../../chrom.sizes | cut -f 1 > alt.list - - twoBitToFa -seqList=alt.list ../../hg38.2bit stdout \ - | faSplit byname stdin hg38.fullAnalysisSet.chroms/ - - faCount hg38.fullAnalysisSet.chroms/chr*.fa > faCount.fullAnalysisSet.txt - - faToTwoBit hg38.fullAnalysisSet.chroms/chr*.fa hg38.fullAnalysisSet.2bit - twoBitInfo hg38.fullAnalysisSet.2bit stdout | sort -k2nr > chrom.sizes - - tar cvzf ./hg38.fullAnalysisSet.chroms.tar.gz ./hg38.fullAnalysisSet.chroms - -######################################################################### -# LASTZ Self/hg38 (DONE - 2014-01-25,02-10 - Hiram) - # can no longer use the lineage specific repeats with the new lastz - # use a screen to manage this longish job: - screen -S hg38Self - - mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 - cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 - # construct the non-bridged contigs sequence to use: - (twoBitToFa ../nonBridgedContigs/hg38.chroms.contigs.2bit stdout; - twoBitToFa ../../hg38.2bit:chrM stdout) | faToTwoBit stdin hg38.self.2bit - 
twoBitInfo hg38.self.2bit stdout | sort -k2nr > hg38.self.chrom.sizes - - # best to always specify an exact path to lastz so we know which one is used - # lastz default parameters are human-mouse parameters - - cat << '_EOF_' > DEF -# human vs human with mouse defaults -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Human Hg38 -SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit -SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes -SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25 -TMPDIR=/dev/shm -'_EOF_' -_EOF_ - - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - -verbose=2 \ - -stop=net `pwd`/DEF \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -fileServer=hgwdev \ - -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 - # real 1518m15.817s -- problems - # there was a problem in the 'part014' batch. 
running that manually: - mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob - cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob - # make 100 jobs out of the 10 parts: - mkdir -p psl - cp ../tParts/part014.lst ./xpart014.lst - split -l 1 xpart014.lst -d -a 3 part - for F in part0* -do - mv $F $F.lst -done - -for T in part0*.lst -do - for Q in part0*.lst - do - mkdir -p psl/${T} - echo /cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T} ${Q} ../../DEF \{check out exists psl/${T}/${T}.${Q}.psl\} - done -done > jobList - para -ram=32g create jobList - para push - # one last failing job: -# Completed: 99 of 100 jobs -# CPU time in finished jobs: 2836s 47.27m 0.79h 0.03d 0.000 y -# IO & Wait Time: 279s 4.65m 0.08h 0.00d 0.000 y -# Average job time: 31s 0.52m 0.01h 0.00d -# Longest finished job: 586s 9.77m 0.16h 0.01d -# Submission to last job: 620s 10.33m 0.17h 0.01d - - mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010 - cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010 - mkdir psl - - twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 part010.fa - - faSplit -lift=split010.lift size part010.fa 169000 split010_ -TOP="/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010" - -for T in split*.fa -do - mkdir -p psl/${T} - echo "${TOP}/${T}" > ${T}.lst - faToTwoBit ${T} ${T}.2bit - for Q in split*.fa - do - echo "/cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T}.lst ${Q}.lst DEF {check out exists psl/${T}/${T}.${Q}.psl}" - done -done > jobList - para -ram=32g create jobList - -# Completed: 100 of 100 jobs -# CPU time in finished jobs: 176579s 2942.99m 49.05h 2.04d 0.006 y -# IO & Wait Time: 1239s 20.64m 0.34h 0.01d 0.000 y -# Average job time: 1778s 29.64m 0.49h 0.02d -# Longest finished job: 29343s 489.05m 8.15h 0.34d 
-# Submission to last job: 29348s 489.13m 8.15h 0.34d - - catDir psl/* | grep -v "^#" > raw.psl - - liftUp -type=.psl stdout split010.lift error raw.psl \ - | liftUp -pslQ -type=.psl chr16_03.psl split010.lift error stdin - - # this combination allowed psl headers to sneak in the middle, - # had to be cleaned: - catDir psl/* | grep -v "^#" > part014.psl - cat split010/chr16_03.psl >> part014.psl - cp -p part014.psl ../../psl/part014.lst/part014.lst_part014.lst.psl - - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - -verbose=2 \ - -continue=cat -stop=net `pwd`/DEF \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -fileServer=hgwdev \ - -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 - # real 43m11.340s - # failed in chaining, running manually on hgwdev - time ./bigJobs.sh > bigJobs.log 2>&1 - # real 468m59.648s - - time ./part014.sh > part014.log 2>&1 - - # real 1319m57.911s - # -rw-rw-r-- 1 3581498246 Feb 8 14:37 part014.lst.chain - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - -verbose=2 \ - -continue=chainMerge -stop=net `pwd`/DEF \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -fileServer=hgwdev \ - -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 - - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - -verbose=2 \ - -continue=load -stop=load `pwd`/DEF \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -fileServer=hgwdev \ - -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1 - - hgLoadChain -normScore -tIndex hg38 chainSelf hg38.hg38.all.chain.gz - # Loading 104815249 chains into hg38.chainSelf - - cat fb.hg38.chainSelfLink.txt - # 392419010 bases of 3049335806 (12.869%) in intersection - cd /hive/data/genomes/hg38/bed - ln -s lastzSelf.2014-01-25 lastz.self - ln -s lastzSelf.2014-01-25 lastz.hg38 - -######################################################################### -## 4-Way Multiz for UCSC 
Genes construction (DONE - 2014-02-11 - Hiram) - ssh hgwdev - mkdir /hive/data/genomes/hg38/bed/multiz4way - cd /hive/data/genomes/hg38/bed/multiz4way - - # extract our 4 organisms from the 44-way on hg18: - ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh - - /cluster/bin/phast/tree_doctor \ - --prune-all-but hg19,mm10,canFam3,rheMac3 $HOME/kent/src/hg/utils/phyloTrees/120way.nh \ - | sed -e "s/hg19/hg38/" > 4way.nh - - # this looks like: - cat 4way.nh -(((hg38:0.033974,rheMac3:0.037601):0.109934,mm10:0.356483):0.020593,canFam3:0.165928); - - - # Use this specification in the phyloGif tool: - # http://genome.ucsc.edu/cgi-bin/phyloGif - # to obtain a gif image for htdocs/images/phylo/hg38_4way.gif - - /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt - # Use this output to create the table below - grep -y hg38 4way.distances.txt | sort -k3,3n -# -# If you can fill in all the numbers in this table, you are ready for -# the multiple alignment procedure -# -# featureBits chainLink measures -# chainHg38Link chain linearGap -# distance on hg38 on other minScore -# 1 0.071575 - rhesus rheMac3 (% 79.729) (% 86.715) 5000 medium -# 2 0.330429 - dog canFam3 (% 49.978) (% 60.083) 3000 medium -# 3 0.500391 - mouse mm10 (% 31.629) (% 35.323) 3000 medium - - # using the syntenic nets - cd /cluster/data/hg38/bed/multiz4way - mkdir mafLinks - cd mafLinks - mkdir rheMac3 canFam3 mm10 - - for D in mm10 canFam3 rheMac3 -do - ln -s ../../../lastz.${D}/axtChain/hg38.${D}.synNet.maf.gz ./${D}/ -done - - mkdir /hive/data/genomes/hg38/bed/multiz4way/mafSplit - cd /hive/data/genomes/hg38/bed/multiz4way/mafSplit - for D in mm10 canFam3 rheMac3 -do - echo "working: ${D}" - zcat ../mafLinks/${D}/hg38.${D}.synNet.maf.gz > ${D}.maf - mkdir -p ${D} - mafSplit -byTarget -useFullSequenceName /dev/null ${D}/${D}_ ${D}.maf - rm -f ${D}.maf -done - - # determine what is the newest version of multiz and use that - cd /hive/data/genomes/hg38/bed/multiz4way - mkdir penn - cp 
-p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn - cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn - cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn - - # the autoMultiz cluster run - ssh ku - cd /hive/data/genomes/hg38/bed/multiz4way - - # create species list and stripped down tree for autoMZ - sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ - 4way.nh > tmp.nh - echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh - sed 's/[()]//g; s/,/ /g' tree.nh > species.lst - - mkdir run maf - cd run - - # NOTE: you need to set the db and multiz dirname properly in this script - cat > autoMultiz << '_EOF_' -#!/bin/csh -ef -set db = hg38 -set c = $1 -set maf = $2 -set binDir = /hive/data/genomes/hg38/bed/multiz4way/penn -set tmp = /dev/shm/$db/multiz.$c -set pairs = /hive/data/genomes/hg38/bed/multiz4way/mafSplit -rm -fr $tmp -mkdir -p $tmp -cp ../{tree.nh,species.lst} $tmp -pushd $tmp -foreach s (`cat species.lst`) - set in = $pairs/$s/${s}_$c.maf - set out = $db.$s.sing.maf - if ($s == $db) then - continue - endif - if (-e $in.gz) then - zcat $in.gz > $out - else if (-e $in) then - cp $in $out - else - echo "##maf version=1 scoring=autoMZ" > $out - endif -end -set path = ($binDir $path); rehash -$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf -popd -cp $tmp/$c.maf $maf -rm -fr $tmp -'_EOF_' - # << happy emacs - chmod +x autoMultiz - -cat << '_EOF_' > template -#LOOP -./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg38/bed/multiz4way/maf/$(root1).maf} -#ENDLOOP -'_EOF_' - # << happy emacs - - cut -f1 /cluster/data/hg38/chrom.sizes > chrom.lst - gensub2 chrom.lst single template jobList - para create jobList - # 455 jobs - para try ... check ... push ... etc ... 
-# Completed: 455 of 455 jobs -# CPU time in finished jobs: 50111s 835.18m 13.92h 0.58d 0.002 y -# IO & Wait Time: 5574s 92.91m 1.55h 0.06d 0.000 y -# Average job time: 122s 2.04m 0.03h 0.00d -# Longest finished job: 4717s 78.62m 1.31h 0.05d -# Submission to last job: 4722s 78.70m 1.31h 0.05d - - # combine results into a single file for loading and gbdb reference - cd /hive/data/genomes/hg38/bed/multiz4way - grep "^#" maf/chr19_GL949749v2_alt.maf | grep -v "eof maf" > multiz4way.maf - grep -h -v "^#" maf/*.maf >> multiz4way.maf - grep "^#" maf/chr19_GL949749v2_alt.maf | grep "eof maf" >> multiz4way.maf - # real 3m27.561s - - # makes a 8.5 Gb file: - # -rw-rw-r-- 1 9044143788 Feb 11 12:51 multiz4way.maf - - # Load into database - ssh hgwdev - cd /hive/data/genomes/hg38/bed/multiz4way - mkdir /gbdb/hg38/multiz4way - ln -s /hive/data/genomes/hg38/bed/multiz4way/multiz4way.maf \ - /gbdb/hg38/multiz4way - # the hgLoadMaf generates huge tmp files, locate them in /dev/shm - cd /dev/shm - time nice -n +19 hgLoadMaf hg38 multiz4way - # Loaded 6141667 mafs in 1 files from /gbdb/hg38/multiz4way - # real 2m2.812s - - cd /hive/data/genomes/hg38/bed/multiz4way - time (cat /gbdb/hg38/multiz4way/*.maf \ - | hgLoadMafSummary -verbose=2 -minSize=10000 \ - -mergeGap=500 -maxSize=50000 hg38 multiz4waySummary stdin) - # Created 1266559 summary blocks from 11780291 components and 6141667 mafs - # real 3m0.791s -# -rw-rw-r-- 1 311246327 Feb 11 12:54 multiz4way.tab -# -rw-rw-r-- 1 58730176 Feb 11 12:58 multiz4waySummary.tab - wc -l multiz4way* - # 6141667 multiz4way.tab - # 1266559 multiz4waySummary.tab - # 7408226 total - -######################################################################### -## RE-load alternate sequence for PSL display (DONE - 2016-01-15 - Hiram) -## The procedure below -## "load alternate sequence for PSL display (DONE - #2014-02-24 - Hiram) -## produced an illegal psl Table altSeqLiftOverPsl: - pslCheck -db=hg38 altSeqLiftOverPsl - checked: 266 failed: 264 
errors: 1046 - -## Since then, the gff3ToPsl command has been updated to be a bit more -## robust, so, the following sequence produces the new alignment file: - mkdir -p /hive/data/genomes/hg38/bed/altAlignments/redo2016 - cd /hive/data/genomes/hg38/bed/altAlignments/redo2016 - -mkdir -p ucscPsl - -awk -F'/' '{printf "s/^%s\t/%s\t/g;\n", $3,$2}' ../accessionToUcsc.sed.txt \ - > ucscToNcbi.sed.txt - -sed -f ucscToNcbi.sed.txt ../../../chrom.sizes > ncbi.chrom.sizes - -paste ncbi.chrom.sizes ../../../chrom.sizes \ - | awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1,$2,$3,$4}' \ - > ncbiToUcsc.lift - -find ../../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ - | while read gff -do - name=`basename $gff | sed -e 's/_.*//;'` - fasta=`dirname $gff | sed -e 's#alignments#FASTA/alt.scaf.fa.gz#;'` - size=`faCount $fasta | grep -w total | cut -f2` - printf "%s\t%d\n" "$name" "$size" > target.sizes - gff3ToPsl ncbi.chrom.sizes target.sizes $gff $name.psl - pslCheck ${name}.psl - liftUp -type=.psl stdout ncbiToUcsc.lift error ${name}.psl \ - | liftUp -type=.psl -pslQ ucscPsl/${name}.psl ncbiToUcsc.lift error stdin - pslCheck ucscPsl/${name}.psl -done - - pslSort dirs altSeqLiftOverPsl.psl ./tmp ucscPsl - pslCheck -db=hg38 altSeqLiftOverPsl.psl - - hgLoadPsl hg38 altSeqLiftOverPsl.psl - pslCheck -db=hg38 altSeqLiftOverPsl - # checked: 266 failed: 0 errors: 0 - -######################################################################### -## load alternate sequence for PSL display (DONE - 2014-02-24 - Hiram) - mkdir /hive/data/genomes/hg38/bed/altAlignments/sequence - cd /hive/data/genomes/hg38/bed/altAlignments/sequence - - rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa - mkdir targetFa - mkdir queryFa - touch temp.lift - - cat ../../altLocations/chrToAlt.bed | while read L -do - chrName=`echo $L | awk '{print $1}'` - chromSize=`egrep "^$chrName " ../../../chrom.sizes | cut -f2` - chrStart=`echo $L | awk '{printf "%d", $2}'` - chrEnd=`echo $L | awk '{printf "%d", 
$3}'` - chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'` - queryName=`echo $L | awk '{print $4}'` - partName="${chrName}_${chrStart}_${chrEnd}" - echo $chrName $chrStart $chrEnd $queryName $partName $chromSize - echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift - twoBitToFa ../../../hg38.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa - twoBitToFa ../../../hg38.2bit:$queryName queryFa/$queryName.fa -done - -sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift - - mkdir /gbdb/hg38/ncbiAltMappings - cd /hive/data/genomes/hg38/bed/altAlignments/sequence/queryFa - ln -s `pwd`/*.fa /gbdb/hg38/ncbiAltMappings - cd /hive/data/genomes/hg38/bed/altAlignments/sequence - hgLoadSeq -drop -seqTbl=seqNcbiAltSequence -extFileTbl=extNcbiAltSequence \ - hg38 /gbdb/hg38/ncbiAltMappings/*.fa - - pslSwap ../altAlignments.psl stdout \ - | pslRecalcMatch stdin ../../../hg38.2bit ../../../hg38.2bit \ - hg38.referenceTarget.psl - - # the table name altSeqLiftOverPsl is recognized in hgc to allow display - # of the details of the alignments - hgLoadPsl hg38 -table=altSeqLiftOverPsl hg38.referenceTarget.psl - -######################################################################### -## alternate sequence alignments EXPERIMENT (DONE - 2014-01-17 - Hiram) - # the lastzAltSequences.2014-01-23 alignment was used for this instead - # of this procedure - mkdir /hive/data/genomes/hg38/bed/altAlignments - cd /hive/data/genomes/hg38/bed/altAlignments - - grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \ - | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt - - find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ - | while read F -do - cat $F | sed -f accessionToUcsc.sed.txt \ - | gff3ToPsl ../../chrom.sizes stdin stdout -done > altAlignments.psl - | xargs cat | sed -f accessionToUcsc.sed.txt \ - | gff3ToPsl ../../chrom.sizes stdin altAlignments.psl - - time 
pslRecalcMatch altAlignments.psl ../../hg38.2bit ../../hg38.2bit \ - altRecalcMatch.psl - # real 0m51.122s - - # just to see what they look like in different formats: - pslToChain altRecalcMatch.psl altAlignments.chain - chainToAxt altAlignments.chain ../../hg38.2bit ../../hg38.2bit \ - altAlignments.axt - axtToMaf -score altAlignments.axt ../../chrom.sizes ../../chrom.sizes \ - altAlignments.maf - - mkdir mafSplits - mafSplit /dev/null mafSplits/ altAlignments.maf - # doesn't work: -# Can't find chrom in MAF component src: chr6_GL000250v2_alt - - mkdir splits psl - find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \ - | while read F -do - chrAlt=`basename $F | sed -e 's/_.*//' | sed -f accessionToUcsc.sed.txt` - echo $chrAlt - cat $F | sed -f accessionToUcsc.sed.txt \ - | gff3ToPsl ../../chrom.sizes stdin splits/${chrAlt}.psl - pslRecalcMatch splits/${chrAlt}.psl ../../hg38.2bit ../../hg38.2bit \ - psl/${chrAlt}.psl -done - - mkdir swap - mkdir swap/psl swap/chain swap/axt swap/maf swap/anno - for F in psl/*.psl -do - B=`basename $F | sed -e 's/.psl//'` - echo $B - pslSwap $F stdout | pslRecalcMatch stdin ../../hg38.2bit ../../hg38.2bit \ - swap/psl/${B}.psl - pslToChain swap/psl/${B}.psl swap/chain/${B}.chain - chainToAxt swap/chain/${B}.chain ../../hg38.2bit ../../hg38.2bit \ - swap/axt/${B}.axt - axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ - | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > swap/maf/${B}.maf - mafAddIRows -nBeds=nBeds swap/maf/${B}.maf ../../hg38.2bit swap/anno/${B}.maf -done -# axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ -# | sed -e 's/^s chr/s hg38.chr/' > swap/maf/${B}.maf - - twoBitInfo -nBed ../../hg38.2bit ../../hg38.N.bed - ln -s ../../hg38.N.bed hg38.bed - ln -s ../../hg38.N.bed ref38.bed - ln -s ../../hg38.N.bed alt38.bed - echo hg38.bed > nBeds - echo ref38.bed >> nBeds - echo alt38.bed >> nBeds - ln 
-s ../../chrom.sizes hg38.len - ln -s ../../chrom.sizes ref38.len - ln -s ../../chrom.sizes alt38.len - echo hg38.len > sizes - echo ref38.len >> sizes - echo alt38.len >> sizes - - mkdir chain axt maf anno - for F in psl/*.psl -do - B=`basename $F | sed -e 's/.psl//'` - echo $B - pslToChain $F chain/${B}.chain - chainToAxt chain/${B}.chain ../../hg38.2bit ../../hg38.2bit axt/${B}.axt - axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ - | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > maf/${B}.maf - mafAddIRows -nBeds=nBeds maf/${B}.maf ../../hg38.2bit anno/${B}.maf -done - -# axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \ -# | sed -e 's/^s chr/s hg38.chr/' > maf/${B}.maf - -############################################################################ -# Liftover Gencode V19 from hg19 (DONE braney 2014-02-14) - -mkdir /cluster/data/hg38/bed/liftOverGencodeV19 -cd /cluster/data/hg38/bed/liftOverGencodeV19 - -echo "show tables like 'wgEncodeGencode%19'" | hgsql hg19 | tail -n +2 > all.gencode.tables -echo " select tableName from trackDb where tableName like 'wgEncodeGencode_%V19';" | hgsql hg19 --skip-column-names > genePred.gencode.tables - -# load the non-genepred table as is. 
This isn't quite the right thing to do -# with exon support, but it's good enough for our purposes at the moment -join -v 1 *.gencode.tables | while read t; do echo "create table $t select * from hg19.$t" | hgsql hg38; echo $t; done - -for i in `cat genePredExt.gencode.tables`; -do - echo "select name,score,name2 from $i" | hgsql hg19 | sort > $i.name2Score.txt; - genePredToFakePsl hg19 $i $i.psl $i.cds; - pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | sort | join /dev/stdin $i.name2Score.txt| tr ' ' '\t' | hgLoadGenePred -genePredExt hg38 $i stdin; - echo $i; -done - -for i in `cat genePred.gencode.tables`; -do - genePredToFakePsl hg19 $i $i.psl $i.cds; - pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout | sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | tr ' ' '\t' | hgLoadGenePred hg38 $i stdin; - echo $i; -done - -##################################################################### -## tRNAs track ( 2014-02-18 braney DONE) -## this is a preliminary version for UCSC build. NOT FOR RELEASE! -ssh hgwdev -cd /hive/data/genomes/hg38/bed -mkdir tRNAs -cd tRNAs - -cp /hive/users/pchan/tRNAs/Eukaryota/hg38/hg38-tRNAs.bed . 
- -hgLoadBed -tab hg38 tRNAs hg38-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql - -## tRNAs track (2015-10-04, Chris FINISHING BUILD FOR RELEASE) - cd /hive/data/genomes/hg38/bed/tRNAs - cat /hive/users/pchan/gtrnadb2/Eukaryota/hg38/hg38-tRNAs.bed | sed 's^</BLOCKQUOTE>^^g' | > hg38-tRNAs2.bed - hgsql hg38 -e 'drop table if exists tRNAs' - hgLoadBed -tab hg38 tRNAs hg38-tRNAs2.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql - mkdir gif - cp -p /hive/users/pchan/gtrnadb2/Eukaryota/hg38/images/* gif - cd /hive/data/gbdb/hg38 - ln -s /hive/data/genomes/hg38/bed/tRNAs/gif RNA-img - cd /usr/local/apache/htdocs-ceisenhart/RNA-img - ln -s /gbdb/hg38/RNA-img hg38 - -############################################################################ -# EXONIPHY , lifted from hg19 (DONE - braney 2014-02-19) -# needed for ucscGenes building - # exoniphyHg19.gp is prepared as follows - mkdir /cluster/data/hg38/bed/exoniphy - cd /cluster/data/hg38/bed/exoniphy - hgsql hg19 -e "select * from exoniphy" -N | cut -f 2-16 > exoniphyHg19.gp - time nice -n +19 liftOver -genePred exoniphyHg19.gp \ - /cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz \ - exoniphyHg38.gp unmapped - # real 0m2.015s - # user 0m1.894s - # sys 0m0.076s - - wc -l * - # 186601 exoniphyHg19.gp - # 186533 exoniphyHg38.gp - # 136 unmapped - # 373270 total - - cd /cluster/data/hg38/bed/exoniphy - nice -n +19 hgLoadGenePred -genePredExt hg38 exoniphy exoniphyHg38.gp - nice -n +19 featureBits hg38 exoniphy - # 28807039 bases of 3049335806 (0.945%) in intersection - nice -n +19 featureBits hg19 exoniphy - # 28661160 bases of 2897316137 (0.989%) in intersection - -######################################################################### -# LASTZ Rat Rn5 (DONE - 2014-02-27 - Hiram) - # establish a screen to control this job - screen -S hg38Rn5 - mkdir /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 - cd /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 - - # XXX don't forget to specify the BLASTZ binary: - cat << 
'_EOF_' > DEF -# human vs rat -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Rat Rn5 -SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit -SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes -SEQ2_CHUNK=10000000 -SEQ2_LIMIT=100 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27 -TMPDIR=/scratch/tmp -'_EOF_' - # << happy emacs - - time doBlastzChainNet.pl -verbose=2 \ - `pwd`/DEF \ - -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 - - # real 658m53.984s - cat fb.hg38.chainRn5Link.txt - # 938823407 bases of 3049335806 (30.788%) in intersection - - # running the swap - mkdir /hive/data/genomes/rn5/bed/blastz.hg38.swap - cd /hive/data/genomes/rn5/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \ - -swap \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 - # real 66m53.095s - cat fb.rn5.chainHg38Link.txt - # 934256475 bases of 2572853723 (36.312%) in intersection - - # syntenic net for 14-way use 2014-04-02 - Hiram - cd /hive/data/genomes/rn5/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \ - -continue=syntenicNet -syntenicNet -swap \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 - # real 16m54.489s - -############################################################################## -# LASTZ Rat Rn4 (DONE - 2014-02-27 - Hiram) - # establish a 
screen to control this job - screen -S hg38Rn4 - mkdir /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 - cd /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 - - # XXX don't forget to specify the BLASTZ binary: - cat << '_EOF_' > DEF -# human vs rat -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Rat Rn4 -SEQ2_DIR=/hive/data/genomes/rn4/rn4.2bit -SEQ2_LEN=/hive/data/genomes/rn4/chrom.sizes -SEQ2_CHUNK=10000000 -SEQ2_LIMIT=100 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27 -TMPDIR=/scratch/tmp -'_EOF_' - # << happy emacs - - doBlastzChainNet.pl -verbose=2 \ - `pwd`/DEF \ - -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 - # real 658m53.984s - - cat fb.hg38.chainRn4Link.txt - # 913992768 bases of 3049335806 (29.974%) in intersection - - # running the swap - mkdir /hive/data/genomes/rn4/bed/blastz.hg38.swap - cd /hive/data/genomes/rn4/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27/DEF \ - -swap \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & - # real 73m5.666s - - cat fb.rn4.chainHg38Link.txt - # 889613774 bases of 2571531505 (34.595%) in intersection - -############################################################################## -# GENEID GENE PREDICTIONS (DONE - 2014-03-07 - Hiram) - ssh hgwdev - mkdir /hive/data/genomes/hg38/bed/geneid - cd /hive/data/genomes/hg38/bed/geneid - mkdir download - cd download - for C in `cut -f1 ../../../chrom.sizes` - do - echo $C - 
wget --timestamping \ -http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.gtf3 - wget --timestamping \ -http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.prot - done - - cd .. - cat download/*.gtf | ldHgGene -gtf -genePredExt hg38 geneid stdin - # Read 33428 transcripts in 277332 lines in 1 files - # 33428 groups 92 seqs 1 sources 3 feature types - # 33428 gene predictions - -############################################################################ -# GENEREVIEWS TRACK (DONE 2014-05-17 - Chin) -# This track depends on some tasks completed for hg19, specifically: -# -# $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql -# $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql -# $HOME/kent/src/hg/lib/geneReviewsDetail.sql -# $HOME/kent/src/hg/makeDb/trackDb/human/geneReviews.html -# -# Unlike hg19, this hg38 tracks is generated by the automatic geneReviews -# scripts in -# /hive/data/outside/otto/geneReviews, specifically buildGeneReviews.sh. -# Current data are fetched weekly from NCBI -# ftp://ftp.ncbi.nlm.nih.gov/pub/GeneReviews/ -# to /hive/data/outside/otto/geneReviews/${DATE}. 
- -########################################################################### -# Chimp Lastz run (DONE - 2014-05-27 - Hiram) - screen -S hg38PanTro4 # use a screen to manage this longish running job - mkdir /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 - cd /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 - - # always set the BLASTZ program so we know what version was used - cat << '_EOF_' > DEF -# human vs chimp -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz -BLASTZ_O=600 -BLASTZ_E=150 -# maximum M allowed with lastz is only 254 -BLASTZ_M=254 - -BLASTZ_T=2 -BLASTZ_Y=15000 -BLASTZ_K=4500 -BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q -# A C G T -# 90 -330 -236 -356 -# -330 100 -318 -236 -# -236 -318 100 -330 -# -356 -236 -330 90 - -# TARGET: Human Hg38 -SEQ1_DIR=/scratch/data/hg38/hg38.2bit -SEQ1_LEN=/scratch/data/hg38/chrom.sizes -SEQ1_CHUNK=10000000 -SEQ1_LAP=10000 -SEQ1_IN_CONTIGS=0 - -# QUERY: Chimp PanTro4 -SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit -SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes -SEQ2_CHUNK=10000000 -SEQ2_LAP=0 -SEQ2_LIMIT=200 -SEQ2_IN_CONTIGS=0 - -BASE=/hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27 -TMPDIR=/dev/shm -'_EOF_' - # << emacs - - time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > do.log 2>&1 - # real 154m12.215s - cat fb.hg38.chainPanTro4Link.txt - # 2839294579 bases of 3049335806 (93.112%) in intersection - - # filter with doRecipBest.pl - time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ - hg38 panTro4) > rbest.log 2>&1 - # real 57m55.320s - - # running the swap - mkdir /hive/data/genomes/panTro4/bed/blastz.hg38.swap - cd /hive/data/genomes/panTro4/bed/blastz.hg38.swap - time (doBlastzChainNet.pl -verbose=2 \ - -swap /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27/DEF \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku 
-bigClusterHub=ku \ - -syntenicNet) > swap.log 2>&1 - cat fb.panTro4.chainHg38Link.txt - # 2776497530 bases of 2902338967 (95.664%) in intersection - # real 98m23.729s - - time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ - panTro4 hg38) > rbest.log 2>&1 - # real 64m33.812s - -############################################################################# -# Opossum Lastz run (DONE - 2014-05-27 - Hiram) - screen -S hg38MonDom5 # use a screen to manage this longish running job - mkdir /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 - cd /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 - - # always set the BLASTZ program so we know what version was used - cat << '_EOF_' > DEF -# human vs chimp -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz -BLASTZ_M=50 - -BLASTZ_Y=3400 -BLASTZ_L=6000 -BLASTZ_K=2200 -BLASTZ_Q=/scratch/data/blastz/HoxD55.q -# A C G T -# 91 -90 -25 -100 -# -90 100 -100 -25 -# -25 -100 100 -90 -# -100 -25 -90 91 - -# TARGET: Human Hg38 -SEQ1_DIR=/scratch/data/hg38/hg38.2bit -SEQ1_LEN=/scratch/data/hg38/chrom.sizes -SEQ1_CHUNK=10000000 -SEQ1_LAP=10000 -SEQ1_LIMIT=5 - -# QUERY: Opossum MonDom5 -SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit -SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes -SEQ2_CHUNK=10000000 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27 -TMPDIR=/dev/shm -'_EOF_' - # << emacs - - time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=5000 -chainLinearGap=loose \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > do.log 2>&1 - # real 670m13.280s - # one failed chain run for hg19, finished manually on hgwdev, then: - time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > chainMerge.log 2>&1 - # real 164m28.822s - - cat fb.hg38.chainMonDom5Link.txt - # 438195373 bases of 3049335806 (14.370%) in intersection - - # filter 
with doRecipBest.pl - time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ - -dbHost=hgwdev -workhorse=hgwdev hg38 monDom5) > rbest.log 2>&1 - # real 130m22.825s - - # running the swap - mkdir /hive/data/genomes/monDom5/bed/blastz.hg38.swap - cd /hive/data/genomes/monDom5/bed/blastz.hg38.swap - time (doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27/DEF \ - -swap -chainMinScore=5000 -chainLinearGap=loose \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > swap.log 2>&1 - # real 102m41.443s - - cat fb.monDom5.chainHg38Link.txt - # 420069915 bases of 3501660299 (11.996%) in intersection - time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \ - -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1 - # real 90m56.189s - -_EOF_ -############################################################################# -# LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie) -# Redmine #13359, #24285 -- otto-mate To Do #17877 -# previously done 7/7/14, 9/9/16, 5/30/18 -# THIS IS NOW AN OTTO JOB !! - set today = `date +%Y_%m_%d` - mkdir -p /hive/data/genomes/hg38/bed/lrg/$today - cd /hive/data/genomes/hg38/bed/lrg/$today - wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip - unzip LRG_public_xml_files.zip - - # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts: - # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output - # the four extra fields are identifiers for: - # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein - - ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38 - genePredCheck lrgTranscriptsUnmapped.gp -#Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46 -#checked: 1029 failed: 1 - # If there are complaints e.g. about exonFrame, look for inconsistencies in the - # affected transcript's coding_region/coordinates vs. exon/intron info in xml. 
- # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background - # (missing exonFrame info doesn't affect our track representation because we end up using - # psl). We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon - # portion is only the stop codon. - - # No longer necessary to filter out alt and fix patches since they have been added to hg38. - - # and we need the transcript plus gene name later: - cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt - - # five extra columns have been added to the genePred (2020-10-05 - Hiram) - # extract them so they can be added to the psl: - awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \ - | join -t$'\t' - transcript.gene.name.txt \ - | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv - - # the five extra fields are identifiers for: - # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein, - # Gene name - - # Load LRG regions: - #bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ - #-tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name - # after ML #29689, added ncbiAcc field, Max, July 1, 2022 - # changed to: - bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \ - -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name,ncbiAcc - ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb - hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb - - # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD): - lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl - pslCheck lrg.psl -#checked: 919 failed: 0 errors: 0 - awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes - genePredToFakePsl -chromSize=lrg.sizes placeholder \ - lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds - pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl - 
mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \ - lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp -#Warning: no CDS for LRG_163t1 -#Warning: no CDS for LRG_347t1 - # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*). - grep -l NR_ LRG_163.xml LRG_347.xml -#LRG_163.xml -#LRG_347.xml - - cat lrgCdna.tab | sed -e 's/^/>/;' | tr '\t' '\n' > lrgCdna.fa - # construct bigPsl with five extra fields - pslToBigPsl -fa=lrgCdna.fa -cds=lrgTranscripts.cds \ - lrgTranscriptsHg38.psl bigPsl.txt - - # add the five extra identifiers to the bigPsl file: - join -t$'\t' -1 4 \ - -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15\ -,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6,2.7 \ - <(sort -k4 bigPsl.txt) lrgTransExtraFields.tsv \ - | sort -k1,1 -k2,2n > lrgExtraTranscriptsHg38.bigPsl.bed - - bedToBigBed -as=bigPsl+6.as -type=bed12+19 -tab \ - lrgExtraTranscriptsHg38.bigPsl.bed ../../../chrom.sizes lrgBigPsl.bb - bigBedInfo lrgBigPsl.bb - rm -f /gbdb/hg38/bbi/lrgBigPsl.bb - ln -sf `pwd`/lrgBigPsl.bb /gbdb/hg38/bbi - hgBbiDbLink hg38 lrgBigPsl /gbdb/hg38/bbi/lrgBigPsl.bb - - - # Load PSL, CDS and sequences. 
- hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl - hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds - hgPepPred hg38 tab lrgCdna lrgCdna.tab - hgPepPred hg38 tab lrgPep lrgPep.tab - - -############################################################################# -## 7-Way Multiz (DONE - 2014-06-02 - Hiram) - ssh hgwdev - mkdir /hive/data/genomes/hg38/bed/multiz7way - cd /hive/data/genomes/hg38/bed/multiz7way - - # from the 63-way in the source tree, select out the 7 used here: - /cluster/bin/phast/tree_doctor \ - --prune-all-but hg19,panTro4,rheMac3,mm10,rn5,canFam3,monDom5 \ - /cluster/home/hiram/kent/src/hg/utils/phyloTrees/130way.nh \ - | sed -e 's/hg19/hg38/' > hg38.7way.nh - - # what that looks like: - ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh -# (((((hg38:0.006550, -# panTro4:0.006840):0.027424, -# rheMac3:0.037601):0.109934, -# (mm10:0.084509, -# rn5:0.091589):0.271974):0.020593, -# canFam3:0.165928):0.258392, -# monDom5:0.340786); - - # extract species list from that .nh file - sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ - hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ - | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt - - # construct db to name translation list: - cat species.list.txt | while read DB -do -hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest -done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ - > db.to.name.txt - - # construct a common name .nh file: - /cluster/bin/phast/tree_doctor --rename \ - "`cat db.to.name.txt`" hg38.7way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ - | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ - > hg38.7way.commonNames.nh - - $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh > t.nh - $HOME/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ - | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ - > hg38.7way.scientificNames.nh - rm -f 
t.nh - cat hg38.7way.scientificNames.nh -# (((((Homo_sapiens:0.00655, -# Pan_troglodytes:0.00684):0.027424, -# Macaca_mulatta:0.037601):0.109934, -# (Mus_musculus:0.084509, -# Rattus_norvegicus:0.091589):0.271974):0.020593, -# Canis_lupus_familiaris:0.165928):0.258392, -# Monodelphis_domestica:0.340786); - - ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.commonNames.nh -# (((((Human:0.00655, -# Chimp:0.00684):0.027424, -# Rhesus:0.037601):0.109934, -# (Mouse:0.084509, -# Rat:0.091589):0.271974):0.020593, -# Dog:0.165928):0.258392, -# Opossum:0.340786); - - # Use this specification in the phyloGif tool: - # http://genome.ucsc.edu/cgi-bin/phyloGif - # to obtain a png image for src/hg/htdocs/images/phylo/hg38_7way.png - - /cluster/bin/phast/all_dists hg38.7way.nh | grep hg38 \ - | sed -e "s/hg38.//" | sort -k2n > 7way.distances.txt - # Use this output to create the table below - head 7way.distances.txt -# taeGut1 0.075718 -# melUnd1 0.220312 -# galGal4 0.507021 -# melGal1 0.509140 -# hg19 1.175433 -# mm10 1.383071 - - cat << '_EOF_' > sizeStats.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -open (FH, "<7way.distances.txt") or - die "can not read 7way.distances.txt"; - -my $count = 0; -while (my $line = <FH>) { - chomp $line; - my ($D, $dist) = split('\s+', $line); - my $chain = "chain" . ucfirst($D); - my $B="/hive/data/genomes/hg38/bed/lastz.$D/fb.hg38." . - $chain . 
"Link.txt"; - my $chainLinkMeasure = - `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; - chomp $chainLinkMeasure; - $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); - $chainLinkMeasure =~ s/\%//; - my $swapFile="/hive/data/genomes/${D}/bed/lastz.hg38/fb.${D}.chainHg38Link.txt"; - my $swapMeasure = "N/A"; - if ( -s $swapFile ) { - $swapMeasure = - `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; - chomp $swapMeasure; - $swapMeasure = 0.0 if (length($swapMeasure) < 1); - $swapMeasure =~ s/\%//; - } - my $orgName= - `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; - chomp $orgName; - if (length($orgName) < 1) { - $orgName="N/A"; - } - ++$count; - printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist, - $chainLinkMeasure, $swapMeasure, $orgName, $D; -} -close (FH); -'_EOF_' - # << happy emacs - chmod +x ./sizeStats.pl - ./sizeStats.pl -# - -# If you can fill in all the numbers in this table, you are ready for -# the multiple alignment procedure - -# featureBits chainLink measures -# chainLink -# N distance on hg38 on other other species -# 01 0.0134 (% 93.112) (% 95.664) - Chimp panTro4 -# 02 0.0716 (% 79.729) (% 86.715) - Rhesus rheMac3 -# 03 0.3304 (% 49.978) (% 60.083) - Dog canFam3 -# 04 0.5004 (% 31.629) (% 35.323) - Mouse mm10 -# 05 0.5075 (% 30.788) (% 36.312) - Rat rn5 -# 06 0.7637 (% 14.370) (% 11.996) - Opossum monDom5 - -# None of this concern for distances matters in building the first step, the -# maf files. - - # create species list and stripped down tree for autoMZ - sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ - hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh - - sed 's/[()]//g; s/,/ /g' tree.nh > species.list - # hg38 panTro4 rheMac3 mm10 rn5 canFam3 monDom5 - - # bash shell syntax here ... 
- cd /hive/data/genomes/hg38/bed/multiz7way - export H=/hive/data/genomes/hg38/bed - mkdir mafLinks - # want syntenic net for: panTro4 rheMac3 mm10 rn5 canFam3 - # and unfiltered maf net for: monDom5 - for G in panTro4 rheMac3 mm10 rn5 canFam3 - do - mkdir mafLinks/$G - echo ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G - ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G - done - - mkdir mafLinks/monDom5 - echo ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5 - ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5 - # verify the symLinks are good: - ls -ogrtL mafLinks/*/* -#-rw-rw-r-- 1 709500062 Jan 25 12:15 mafLinks/mm10/hg38.mm10.synNet.maf.gz -#-rw-rw-r-- 1 1089643630 Jan 27 19:15 mafLinks/canFam3/hg38.canFam3.synNet.maf.gz -#-rw-rw-r-- 1 1277455681 Jan 28 21:52 mafLinks/rheMac3/hg38.rheMac3.synNet.maf.gz -#-rw-rw-r-- 1 687500679 Mar 1 12:27 mafLinks/rn5/hg38.rn5.synNet.maf.gz -#-rw-rw-r-- 1 1463969868 May 27 11:41 mafLinks/panTro4/hg38.panTro4.synNet.maf.gz -#-rw-rw-r-- 1 323347908 May 29 12:38 mafLinks/monDom5/hg38.monDom5.net.maf.gz - - # split the maf files into a set of hashed named files - # this hash named split keeps the same chr/contig names in the same - # named hash file. - mkdir /hive/data/genomes/hg38/bed/multiz7way/mafSplit - cd /hive/data/genomes/hg38/bed/multiz7way/mafSplit - for D in `sed -e "s/hg38 //" ../species.list` -do - echo "${D}" - mkdir $D - cd $D - echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz" - mafSplit -byTarget -useHashedName=8 /dev/null . \ - ../../mafLinks/${D}/*.maf.gz - cd .. -done - - # construct a list of all possible maf file names. - # they do not all exist in each of the species directories - find . -type f | wc -l - # 641 - find . 
-type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list - wc -l maf.list - # 118 maf.list - - mkdir /hive/data/genomes/hg38/bed/multiz7way/splitRun - cd /hive/data/genomes/hg38/bed/multiz7way/splitRun - mkdir maf run - cd run - mkdir penn - cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn - cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn - cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn - - # set the db and pairs directories here - cat > autoMultiz.csh << '_EOF_' -#!/bin/csh -ef -set db = hg38 -set c = $1 -set result = $2 -set run = `/bin/pwd` -set tmp = /dev/shm/$db/multiz.$c -set pairs = /hive/data/genomes/hg38/bed/multiz7way/mafSplit -/bin/rm -fr $tmp -/bin/mkdir -p $tmp -/bin/cp -p ../../tree.nh ../../species.list $tmp -pushd $tmp > /dev/null -foreach s (`/bin/sed -e "s/$db //" species.list`) - set in = $pairs/$s/$c - set out = $db.$s.sing.maf - if (-e $in.gz) then - /bin/zcat $in.gz > $out - if (! -s $out) then - echo "##maf version=1 scoring=autoMZ" > $out - endif - else if (-e $in) then - /bin/ln -s $in $out - else - echo "##maf version=1 scoring=autoMZ" > $out - endif -end -set path = ($run/penn $path); rehash -$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ - > /dev/null -popd > /dev/null -/bin/rm -f $result -/bin/cp -p $tmp/$c $result -/bin/rm -fr $tmp -'_EOF_' -# << happy emacs - chmod +x autoMultiz.csh - - cat << '_EOF_' > template -#LOOP -./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/hg38/bed/multiz7way/splitRun/maf/$(root1).maf} -#ENDLOOP -'_EOF_' -# << happy emacs - - ln -s ../../mafSplit/maf.list maf.list - ssh ku - cd /hive/data/genomes/hg38/bed/multiz7way/splitRun/run - gensub2 maf.list single template stdout > jobList - para -ram=8g create jobList -# Completed: 118 of 118 jobs -# CPU time in finished jobs: 118241s 1970.69m 32.84h 1.37d 0.004 y -# IO & Wait Time: 682s 11.36m 0.19h 0.01d 0.000 y -# Average job time: 1008s 16.80m 0.28h 0.01d -# Longest 
finished job: 10068s 167.80m 2.80h 0.12d -# Submission to last job: 10076s 167.93m 2.80h 0.12d - - # combine into one file (the 1>&2 redirect sends the echo to stderr) - cd /hive/data/genomes/hg38/bed/multiz7way - head -1 splitRun/maf/017.maf > multiz7way.maf - for F in splitRun/maf/*.maf -do - echo "${F}" 1>&2 - egrep -v "^#" ${F} -done >> multiz7way.maf - tail -1 splitRun/maf/017.maf >> multiz7way.maf -# -rw-rw-r-- 1 15635828403 Jun 3 11:49 multiz7way.maf - - # Load into database - ssh hgwdev - cd /hive/data/genomes/hg38/bed/multiz7way - mkdir /gbdb/hg38/multiz7way - ln -s `pwd`/multiz7way.maf /gbdb/hg38/multiz7way - cd /dev/shm - time nice -n +17 hgLoadMaf hg38 multiz7way - # Loaded 10270624 mafs in 1 files from /gbdb/hg38/multiz7way - # real 3m51.265s - - time nice -n +17 hgLoadMafSummary -verbose=2 -minSize=30000 \ - -mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \ - /gbdb/hg38/multiz7way/multiz7way.maf - # Created 1260918 summary blocks from 35384988 components - # and 10270624 mafs from /gbdb/hg38/multiz7way/multiz7way.maf - # real 5m39.388s - - - wc -l multiz7way*.tab - # 10270624 multiz7way.tab - # 1260918 multiz7waySummary.tab - # 11531542 total - - rm multiz7way*.tab - -############################################################################## -# GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2014-06-03 - Hiram) - # mafAddIRows has to be run on single chromosome maf files, it does not - # function correctly when more than one reference sequence - # are in a single file. Need to split of the maf file into individual - # maf files - mkdir -p /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit - cd /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit - - time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \ - /dev/null . ../../multiz7way.maf - # real 4m8.617s - - find . -type f | wc -l - # 353 - - # check for N.bed files everywhere: - cd /hive/data/genomes/hg38/bed/multiz7way/anno - for DB in `cat ../species.list` -do - if [ ! 
-s /hive/data/genomes/${DB}/${DB}.N.bed ]; then - echo "MISS: ${DB}" -# cd /hive/data/genomes/${DB} -# twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed - else - echo " OK: ${DB}" - fi -done - - cd /hive/data/genomes/hg38/bed/multiz7way/anno - for DB in `cat ../species.list` -do - echo "${DB} " - ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed - echo ${DB}.bed >> nBeds - ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len - echo ${DB}.len >> sizes -done - # make sure they all are successful symLinks: - ls -ogrtL - - screen -S hg38 # use a screen to control this longish job - ssh ku - cd /hive/data/genomes/hg38/bed/multiz7way/anno - mkdir result - for D in `ls mafSplit` -do - echo mkdir result/${D} - mkdir result/${D} -done - cat << '_EOF_' > template -#LOOP -mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/hg38/hg38.2bit {check out exists+ result/$(path1)} -#ENDLOOP -'_EOF_' - # << happy emacs - - find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list - gensub2 maf.list single template jobList - # limit jobs on a node with the ram=32g requirement because they go fast - para -ram=32g create jobList - para try ... check ... push ... 
-# Completed: 353 of 353 jobs -# CPU time in finished jobs: 530s 8.83m 0.15h 0.01d 0.000 y -# IO & Wait Time: 1057s 17.62m 0.29h 0.01d 0.000 y -# Average job time: 4s 0.07m 0.00h 0.00d -# Longest finished job: 63s 1.05m 0.02h 0.00d -# Submission to last job: 220s 3.67m 0.06h 0.00d - - # verify all result files have some content, look for 0 size files: - find ./result -type f -size 0 - # should see none - # or in this manner: - find ./result -type f | xargs ls -og | sort -k3nr | tail - - # combine into one file (the 1>&2 redirect sends the echo to stderr) - head -q -n 1 result/0/chr8.maf > hg38.7way.maf - find ./result -type f | while read F -do - echo "${F}" 1>&2 - grep -h -v "^#" ${F} -done >> hg38.7way.maf - - # these maf files do not have the end marker, this does nothing: - # tail -q -n 1 result/0/chr8.maf >> hg38.7way.maf - # How about an official end marker: - echo "##eof maf" >> hg38.7way.maf - ls -og -# -rw-rw-r-- 1 17795297196 Jun 3 14:01 hg38.7way.maf - - du -hsc hg38.7way.maf - # 17G hg38.7way.maf - - # construct symlinks to get the individual maf files into gbdb: - rm /gbdb/hg38/multiz7way/multiz7way.maf # remove previous results - ln -s `pwd`/hg38.7way.maf /gbdb/hg38/multiz7way/multiz7way.maf - - # Load into database - cd /dev/shm - time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg38/multiz7way \ - hg38 multiz7way - # Loaded 10359242 mafs in 1 files from /gbdb/hg38/multiz7way - # real 4m21.862s - - time hgLoadMafSummary -verbose=2 -minSize=30000 \ - -mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \ - /gbdb/hg38/multiz7way/multiz7way.maf -# Created 1260918 summary blocks from 35384988 components -# and 10359242 mafs from /gbdb/hg38/multiz7way/multiz7way.maf -# real 6m6.583s - -# -rw-rw-r-- 1 530538267 Jun 3 14:05 multiz7way.tab -# -rw-rw-r-- 1 60616616 Jun 3 14:15 multiz7waySummary.tab - - rm multiz7way*.tab - -###################################################################### -# MULTIZ7WAY MAF FRAMES (DONE - 2014-06-03 - Hiram) - ssh hgwdev - 
mkdir /hive/data/genomes/hg38/bed/multiz7way/frames - cd /hive/data/genomes/hg38/bed/multiz7way/frames -# survey all the genomes to find out what kinds of gene tracks they have - cat << '_EOF_' > showGenes.csh -#!/bin/csh -fe -foreach db (`cat ../species.list`) - echo -n "${db}: " - set tables = `hgsql $db -N -e "show tables like '%Gene%'"` - foreach table ($tables) - if ($table == "ensGene" || $table == "refGene" || \ - $table == "mgcGenes" || $table == "knownGene" || \ - $table == "xenoRefGene" ) then - set count = `hgsql $db -N -e "select count(*) from $table"` - echo -n "${table}: ${count}, " - endif - end - set orgName = `hgsql hgcentraltest -N -e \ - "select scientificName from dbDb where name='$db'"` - set orgId = `hgsql hg19 -N -e \ - "select id from organism where name='$orgName'"` - if ($orgId == "") then - echo "Mrnas: 0" - else - set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` - echo "Mrnas: ${count}" - endif -end -'_EOF_' - # << happy emacs - chmod +x ./showGenes.csh - time ./showGenes.csh -# hg38: knownGene: 104178, mgcGenes: 34081, refGene: 54852, xenoRefGene: 172740, Mrnas: 10723716 -# panTro4: ensGene: 29160, refGene: 2622, xenoRefGene: 280516, Mrnas: 11163 -# rheMac3: refGene: 6369, xenoRefGene: 275096, Mrnas: 443642 -# mm10: ensGene: 94647, knownGene: 61642, mgcGenes: 26768, refGene: 33765, xenoRefGene: 161178, Mrnas: 5224613 -# rn5: ensGene: 29188, mgcGenes: 6924, refGene: 18567, xenoRefGene: 175416, Mrnas: 1247500 -# canFam3: ensGene: 29884, refGene: 1582, xenoRefGene: 253196, Mrnas: 387195 -# monDom5: ensGene: 24882, refGene: 492, xenoRefGene: 248251, Mrnas: 2461 - - # from that summary, use these gene sets: - # refGene - rheMac3 - # ensGene - panTro4 rn5 canFam3 monDom5 - # knownGene - hg38 mm10 - - mkdir genes - # 1. 
knownGene: hg38 mm10 - for DB in hg38 mm10 -do - hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ - | genePredSingleCover stdin stdout | gzip -2c \ - > genes/${DB}.gp.gz -done - # 2. ensGene: - for DB in panTro4 rn5 canFam3 monDom5 -do -hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \ - | genePredSingleCover stdin stdout | gzip -2c \ - > /scratch/tmp/${DB}.tmp.gz - mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz - echo "${DB} done" -done - # 3. refGene - for DB in rheMac3 -do -hgsql -N -e "select * from refGene" ${DB} | cut -f2- \ - | genePredSingleCover stdin stdout | gzip -2c \ - > /scratch/tmp/${DB}.tmp.gz - mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz - echo "${DB} done" -done - - # verify counts for genes are reasonable: - for T in genes/*.gz -do - echo -n "# $T: " - zcat $T | cut -f1 | sort | uniq -c | wc -l -done -# genes/canFam3.gp.gz: 19507 -# genes/hg38.gp.gz: 21887 -# genes/mm10.gp.gz: 21013 -# genes/monDom5.gp.gz: 21033 -# genes/panTro4.gp.gz: 18657 -# genes/rheMac3.gp.gz: 5614 -# genes/rn5.gp.gz: 22863 - - time (cat ../anno/hg38.7way.maf \ - | nice -n +19 genePredToMafFrames hg38 stdin stdout \ - `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ - | gzip > multiz7wayFrames.bed.gz) - # real 3m44.591s - - # verify there are frames on everything, should be 7 species: - zcat multiz7wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c -# 265160 canFam3 -# 208941 hg38 -# 253323 mm10 -# 574521 monDom5 -# 200156 panTro4 -# 49802 rheMac3 -# 244731 rn5 - - # load the resulting file - ssh hgwdev - cd /hive/data/genomes/hg38/bed/multiz7way/frames - time hgLoadMafFrames hg38 multiz7wayFrames multiz7wayFrames.bed.gz - # real 0m19.959s - - time featureBits -countGaps hg38 multiz7wayFrames - # 52686177 bases of 3209286105 (1.642%) in intersection - # real 0m12.593s - - # enable the trackDb entries: -# frames 
multiz7wayFrames -# irows on - # appears to work OK - -######################################################################### -# Phylogenetic tree from 7-way (DONE - 2014-06-04 - Hiram) - mkdir /hive/data/genomes/hg38/bed/multiz7way/4d - cd /hive/data/genomes/hg38/bed/multiz7way/4d - - # the annotated maf is: - ../anno/hg38.7way.maf - - # using knownGene for hg38 - hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 > hg38.knownGene.gp - - genePredSingleCover hg38.knownGene.gp stdout | sort > hg38.knownGeneNR.gp - wc -l hg38.knownGeneNR.gp - # 21887 hg38.knownGeneNR.gp - - mkdir annoSplit - cd annoSplit - time mafSplit -verbose=2 -outDirDepth=1 -byTarget -useFullSequenceName \ - /dev/null . ../../anno/hg38.7way.maf - # real 5m14.770s - - find . -type f | wc -l - # 353 - ssh ku - mkdir /hive/data/genomes/hg38/bed/multiz7way/4d/run - cd /hive/data/genomes/hg38/bed/multiz7way/4d/run - mkdir ../mfa - - # newer versions of msa_view have a slightly different operation - # the sed of the gp file inserts the reference species in the chr name - cat << '_EOF_' > 4d.csh -#!/bin/csh -fe -set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin -set r = "/hive/data/genomes/hg38/bed/multiz7way" -set c = $1:r -set infile = $r/4d/annoSplit/$2 -set outDir = $r/4d/mfa/$3:h -set outfile = $r/4d/mfa/$3 -/bin/mkdir -p $outDir -cd /scratch/tmp -/bin/awk -v C=$c '$2 == C {print}' $r/4d/hg38.knownGeneNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp -set NL=`wc -l $c.gp| gawk '{print $1}'` -echo $NL -if ("$NL" != "0") then - $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss - $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile -else - echo "" > $outfile -endif -/bin/rm -f $c.gp $c.ss -'_EOF_' - # << happy emacs - chmod +x 4d.csh - - find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list - - cat << '_EOF_' > template -#LOOP -4d.csh $(file1) $(path1) {check out line+ 
../mfa/$(dir1)/$(root1).mfa} -#ENDLOOP -'_EOF_' - # << happy emacs - - gensub2 maf.list single template jobList - para create jobList - para try ... check - para time -# Completed: 353 of 353 jobs -# CPU time in finished jobs: 836s 13.93m 0.23h 0.01d 0.000 y -# IO & Wait Time: 1172s 19.54m 0.33h 0.01d 0.000 y -# Average job time: 6s 0.09m 0.00h 0.00d -# Longest finished job: 72s 1.20m 0.02h 0.00d -# Submission to last job: 89s 1.48m 0.02h 0.00d - - # Not all results have contents, that is OK - - # combine mfa files - ssh hgwdev - cd /hive/data/genomes/hg38/bed/multiz7way/4d - # remove the broken empty files, size 0 and size 1: - find ./mfa -type f -size 0 | xargs rm -f - # most interesting, this did not identify files of size 1: -# find ./mfa -type f -size 1 - find ./mfa -type f | xargs ls -og | awk '$3 == 1' | awk '{print $NF}' \ - > empty.list - cat empty.list | xargs rm -f - #want comma-less species.list - /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ - --aggregate "`cat ../species.list`" mfa/*/*.mfa | sed s/"> "/">"/ \ - > 4d.all.mfa - # check they are all in there: - grep "^>" 4d.all.mfa - # >hg38 - # >panTro4 - # >rheMac3 - # >mm10 - # >rn5 - # >canFam3 - # >monDom5 - - sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ - ../hg38.7way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh - # tree_commas.nh looks like: - # (((((hg38,panTro4),rheMac3),(mm10,rn5)),canFam3),monDom5) - # use phyloFit to create tree model (output is phyloFit.mod) - time nice -n +19 \ - /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ - --EM --precision MED --msa-format FASTA --subst-mod REV \ - --tree tree_commas.nh 4d.all.mfa - # real 0m6.583s - - - mv phyloFit.mod all.mod - - grep TREE all.mod -# TREE: (((((hg38:0.00673596,panTro4:0.00686169):0.0248146,rheMac3:0.0357598):0.0970072,(mm10:0.081661,rn5:0.0874126):0.246527):0.0264964,canFam3:0.156769):0.303241,monDom5:0.303241); - - # compare these calculated lengths to the tree 
extracted from 130way: - grep TREE all.mod | sed -e 's/TREE: //' \ - | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 | sort -k3n \ - | sed -e "s/hg38.//; s/^/ # /" - # panTro4 0.013598 - # rheMac3 0.067310 - # canFam3 0.311823 - # mm10 0.456746 - # rn5 0.462497 - # monDom5 0.761536 - - # yes, somewhat similar - /cluster/bin/phast/all_dists ../hg38.7way.nh | grep hg38 \ - | sort -k3n | sed -e "s/hg38.//; s/^/ # /" - # panTro4 0.013390 - # rheMac3 0.071575 - # canFam3 0.330429 - # mm10 0.500391 - # rn5 0.507471 - # monDom5 0.763679 - -######################################################################### -# phastCons 7-way (DONE - 2014-06-04 - Hiram) - # split 7way mafs into 10M chunks and generate sufficient statistics - # files for # phastCons - ssh ku - mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/SS - cd /hive/data/genomes/hg38/bed/multiz7way/cons/SS - mkdir result done - - cat << '_EOF_' > mkSS.csh -#!/bin/csh -ef -set d = $1 -set c = $2 -set doneDir = done/$d -set MAF = /hive/data/genomes/hg38/bed/multiz7way/anno/result/$d/$c.maf -set WINDOWS = /hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$d/$c -set WC = `cat $MAF | wc -l` -set NL = `grep "^#" $MAF | wc -l` -if ( -s $3 ) then - exit 0 -endif -if ( -s $3.running ) then - exit 0 -endif - -/bin/mkdir -p $doneDir -/bin/date >> $3.running - -/bin/rm -fr $WINDOWS -/bin/mkdir -p $WINDOWS -pushd $WINDOWS > /dev/null -if ( $WC != $NL ) then -/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ - $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 -endif -popd > /dev/null -/bin/date >> $3 -/bin/rm -f $3.running -'_EOF_' - # << happy emacs - chmod +x mkSS.csh - - cat << '_EOF_' > template -#LOOP -mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} -#ENDLOOP -'_EOF_' - # << happy emacs - - # do the easy ones first to see some immediate results - find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list - - gensub2 maf.list single template 
jobList - para -ram=32g create jobList - para try ... check ... etc -# Completed: 353 of 353 jobs -# CPU time in finished jobs: 1216s 20.27m 0.34h 0.01d 0.000 y -# IO & Wait Time: 1385s 23.08m 0.38h 0.02d 0.000 y -# Average job time: 7s 0.12m 0.00h 0.00d -# Longest finished job: 111s 1.85m 0.03h 0.00d -# Submission to last job: 189s 3.15m 0.05h 0.00d - - find ./result -type f | wc -l - # 641 - - # Run phastCons - # This job is I/O intensive in its output files, beware where this - # takes place or do not run too many at once. - ssh ku - mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons - cd /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons - - # This is setup for multiple runs based on subsets, but only running - # the 'all' subset here. - # It triggers off of the current working directory - # $cwd:t which is the "grp" in this script. Running: - # all and vertebrates - - cat << '_EOF_' > doPhast.csh -#!/bin/csh -fe -set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin -set c = $1 -set d = $2 -set f = $3 -set len = $4 -set cov = $5 -set rho = $6 -set grp = $cwd:t -set cons = /hive/data/genomes/hg38/bed/multiz7way/cons -set tmp = $cons/tmp/${d}_${c} -mkdir -p $tmp -set ssSrc = $cons/SS/result -set useGrp = "$grp.mod" -if (-s $cons/$grp/$grp.non-inf) then - ln -s $cons/$grp/$grp.mod $tmp - ln -s $cons/$grp/$grp.non-inf $tmp - ln -s $ssSrc/$d/$f $tmp -else - ln -s $ssSrc/$d/$f $tmp - ln -s $cons/$grp/$grp.mod $tmp -endif -pushd $tmp > /dev/null -if (-s $grp.non-inf) then - $PHASTBIN/phastCons $f $useGrp \ - --rho $rho --expected-length $len --target-coverage $cov --quiet \ - --not-informative `cat $grp.non-inf` \ - --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp -else - $PHASTBIN/phastCons $f $useGrp \ - --rho $rho --expected-length $len --target-coverage $cov --quiet \ - --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp -endif -popd > /dev/null -mkdir -p pp/$d bed/$d -sleep 4 -touch pp/$d bed/$d -rm -f 
pp/$d/$c.pp -rm -f bed/$d/$c.bed -mv $tmp/$c.pp pp/$d -mv $tmp/$c.bed bed/$d -rm -fr $tmp -rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h -'_EOF_' - # << happy emacs - chmod +x doPhast.csh - - # this template will serve for all runs - # root1 == chrom name, file1 == ss file name without .ss suffix - cat << '_EOF_' > template -#LOOP -../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} -#ENDLOOP -'_EOF_' - # << happy emacs - - find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list - wc -l ss.list - # 641 ss.list - - # Create parasol batch and run it - # run for all species - cd /hive/data/genomes/hg38/bed/multiz7way/cons - mkdir -p all - cd all - # Using the .mod tree - cp -p ../../4d/all.mod ./all.mod - - gensub2 ../run.cons/ss.list single ../run.cons/template jobList - para -ram=32g create jobList - para try ... check ... - para push -# Completed: 641 of 641 jobs -# CPU time in finished jobs: 6557s 109.29m 1.82h 0.08d 0.000 y -# IO & Wait Time: 4497s 74.94m 1.25h 0.05d 0.000 y -# Average job time: 17s 0.29m 0.00h 0.00d -# Longest finished job: 33s 0.55m 0.01h 0.00d -# Submission to last job: 120s 2.00m 0.03h 0.00d - - # create Most Conserved track - cd /hive/data/genomes/hg38/bed/multiz7way/cons/all - cut -f1 ../../../../chrom.sizes | while read C -do - ls -d bed/?/${C} 2> /dev/null | while read D - do - echo ${D}/${C}*.bed 1>&2 - cat ${D}/${C}*.bed - done | sort -k1,1 -k2,2n \ - | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' -done > tmpMostConserved.bed - - /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed - # -rw-rw-r-- 1 42636652 Jun 4 10:45 tmpMostConserved.bed - # -rw-rw-r-- 1 43721828 Jun 4 10:45 mostConserved.bed - - # load into database - ssh hgwdev - cd /hive/data/genomes/hg38/bed/multiz7way/cons/all - time nice -n +19 hgLoadBed hg38 phastConsElements7way mostConserved.bed - # Read 1234990 elements of size 5 from mostConserved.bed - # real 
0m11.390s - - # on human we often try for 5% overall cov, and 70% CDS cov - # most bets are off here for that goal, these alignments are too few - # and too far between - # --rho 0.3 --expected-length 45 --target-coverage 0.3 - featureBits hg38 -enrichment knownGene:cds phastConsElements7way - # knownGene:cds 1.266%, phastConsElements7way 4.551%, - # both 0.888%, cover 70.16%, enrich 15.42x - - # Create merged posterior probability file and wiggle track data files - cd /hive/data/genomes/hg38/bed/multiz7way/cons/all - mkdir downloads - - # the third sed fixes the chrom names, removing the partition extensions - time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ - | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ - | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ - | gzip -c > downloads/phastCons7way.wigFix.gz) - # real 37m47.242s - - # check integrity of data with wigToBigWig - time (zcat downloads/phastCons7way.wigFix.gz \ - | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ - phastCons7way.bw) > bigWig.log 2>&1 & - tail bigWig.log - # pid=34733: VmPeak: 33106324 kB - # real 40m53.287s - - bigWigInfo phastCons7way.bw -# version: 4 -# isCompressed: yes -# isSwapped: 0 -# primaryDataSize: 5,675,802,079 -# primaryIndexSize: 92,579,900 -# zoomLevels: 10 -# chromCount: 353 -# basesCovered: 2,898,191,577 -# mean: 0.168088 -# min: 0.000000 -# max: 1.000000 -# std: 0.233827 - - # encode those files into wiggle data - time (zcat downloads/phastCons7way.wigFix.gz \ - | wigEncode stdin phastCons7way.wig phastCons7way.wib) - # Converted stdin, upper limit 1.00, lower limit 0.00 - # real 15m28.525s - - du -hsc *.wi? - # 2.7G phastCons7way.wib - # 282M phastCons7way.wig - # 3.0G total - - # Load gbdb and database with wiggle. 
- ln -s `pwd`/phastCons7way.wib /gbdb/hg38/multiz7way/phastCons7way.wib - time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way \ - hg38 phastCons7way phastCons7way.wig - # real 0m33.502s - - # use to set trackDb.ra entries for wiggle min and max - # and verify table is loaded correctly - - wigTableStats.sh hg38 phastCons7way -# db.table min max mean count sumData stdDev viewLimits -hg38.phastCons7way 0 1 0.168088 2898191577 4.87152e+08 0.233827 viewLimits=0:1 - - # Create histogram to get an overview of all the data - time nice -n +19 hgWiggle -doHistogram -db=hg38 \ - -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ - phastCons7way > histogram.data 2>&1 - # real 2m40.179s - - # create plot of histogram: - - cat << '_EOF_' | gnuplot > histo.png -set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff -set size 1.4, 0.8 -set key left box -set grid noxtics -set grid ytics -set title " Human hg38 Histogram phastCons7way track" -set xlabel " phastCons7way score" -set ylabel " Relative Frequency" -set y2label " Cumulative Relative Frequency (CRF)" -set y2range [0:1] -set y2tics -set yrange [0:0.02] - -plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ - "histogram.data" using 2:7 axes x1y2 title " CRF" with lines -'_EOF_' - # << happy emacs - - display histo.png & - -######################################################################### -# phyloP for 7-way (DONE - 2014-06-04 - Hiram) - # run phyloP with score=LRT - ssh ku - mkdir /cluster/data/hg38/bed/multiz7way/consPhyloP - cd /cluster/data/hg38/bed/multiz7way/consPhyloP - - mkdir run.phyloP - cd run.phyloP - # Adjust model file base composition background and rate matrix to be - # representative of the chromosomes in play - grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' - # 0.556 - /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ - ../../cons/all/all.mod 0.556 > all.mod - # verify, the BACKGROUND should now be paired 
up: - grep BACK all.mod - # BACKGROUND: 0.222000 0.278000 0.278000 0.222000 - - cat << '_EOF_' > doPhyloP.csh -#!/bin/csh -fe -set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin -set f = $1 -set d = $f:h -set file1 = $f:t -set out = $2 -set cName = $f:t:r -set grp = $cwd:t -set cons = /hive/data/genomes/hg38/bed/multiz7way/consPhyloP -set tmp = $cons/tmp/$grp/$f -/bin/rm -fr $tmp -/bin/mkdir -p $tmp -set ssSrc = "/hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$f" -set useGrp = "$grp.mod" -/bin/ln -s $cons/run.phyloP/$grp.mod $tmp -pushd $tmp > /dev/null -$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ - -i SS $useGrp $ssSrc.ss > $file1.wigFix -popd > /dev/null -/bin/mkdir -p $out:h -sleep 4 -/bin/touch $out:h -/bin/mv $tmp/$file1.wigFix $out -/bin/rm -fr $tmp -/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d -/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h -/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp -/bin/rmdir --ignore-fail-on-non-empty $cons/tmp -'_EOF_' - # << happy emacs - - # Create list of chunks - find ../../cons/SS/result -type f | grep ".ss$" \ - | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list - # make sure the list looks good - wc -l ss.list - # 641 ss.list - - # Create template file - # file1 == $chr/$chunk/file name without .ss suffix - cat << '_EOF_' > template -#LOOP -../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} -#ENDLOOP -'_EOF_' - # << happy emacs - - ###################### Running all species ####################### - # setup run for all species - mkdir /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all - cd /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all - rm -fr wigFix - mkdir wigFix - - gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList - # the -ram=8g will allow only one job per node to slow this down since - # it would run too fast otherwise. 
Either run on one of the small - # klusters or use the -ram=8g on the para create - para -ram=32g create jobList - para try ... check ... push ... etc ... - para time > run.time -# Completed: 641 of 641 jobs -# CPU time in finished jobs: 4755s 79.24m 1.32h 0.06d 0.000 y -# IO & Wait Time: 4343s 72.39m 1.21h 0.05d 0.000 y -# Average job time: 14s 0.24m 0.00h 0.00d -# Longest finished job: 27s 0.45m 0.01h 0.00d -# Submission to last job: 1152s 19.20m 0.32h 0.01d - - # make downloads - mkdir downloads - - time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ - | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ - | gzip -c > downloads/phyloP7way.wigFix.gz) & - # real 29m51.665s - - # check integrity of data with wigToBigWig - time (zcat downloads/phyloP7way.wigFix.gz \ - | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ - phyloP7way.bw) > bigWig.log 2>&1 & - egrep "real|VmPeak" bigWig.log - # pid=76577: VmPeak: 33106320 kB - # real 42m53.038s - - - bigWigInfo phyloP7way.bw -# version: 4 -# isCompressed: yes -# isSwapped: 0 -# primaryDataSize: 3,759,451,708 -# primaryIndexSize: 92,579,900 -# zoomLevels: 10 -# chromCount: 353 -# basesCovered: 2,898,191,577 -# mean: 0.074472 -# min: -5.220000 -# max: 1.062000 -# std: 0.545945 - - # encode those files into wiggle data - time (zcat downloads/phyloP7way.wigFix.gz \ - | wigEncode stdin phyloP7way.wig phyloP7way.wib) & - # Converted stdin, upper limit 1.06, lower limit -5.22 - # real 16m11.861s - - - du -hsc *.wi? - # 47M phyloP7way.wib - # 12M phyloP7way.wig - # 58M total - - # Load gbdb and database with wiggle. 
- ln -s `pwd`/phyloP7way.wib /gbdb/hg38/multiz7way/phyloP7way.wib - nice hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way hg38 \ - phyloP7way phyloP7way.wig - - # use to set trackDb.ra entries for wiggle min and max - # and verify table is loaded correctly - - wigTableStats.sh hg38 phyloP7way -# db.table min max mean count sumData -# hg38.phyloP7way -5.22 1.062 0.0744721 2898191577 2.15834e+08 -# stdDev viewLimits -# 0.545945 viewLimits=-2.65525:1.062 - - # that range is: 5.22+1.062 = 6.282 for hBinSize=0.006282 - - # Create histogram to get an overview of all the data - time nice -n +19 hgWiggle -doHistogram \ - -hBinSize=0.006282 -hBinCount=1000 -hMinVal=-5.22 -verbose=2 \ - -db=hg38 phyloP7way > histogram.data 2>&1 - # real 2m55.843s - - - # find out the range for the 2:5 graph - grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin -# Q1 0.000001 -# median 0.000060 -# Q3 0.000656 -# average 0.001022 -# min 0.000000 -# max 0.065461 -# count 978 -# total 0.999982 -# standard deviation 0.004157 - - # create plot of histogram: - cat << '_EOF_' | gnuplot > histo.png -set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff -set size 1.4, 0.8 -set key left box -set grid noxtics -set grid ytics -set title " Human hg38 Histogram phyloP7way track" -set xlabel " phyloP7way score" -set ylabel " Relative Frequency" -set y2label " Cumulative Relative Frequency (CRF)" -set y2range [0:1] -set y2tics -set yrange [0:0.02] - -plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ - "histogram.data" using 2:7 axes x1y2 title " CRF" with lines -'_EOF_' - # << happy emacs - - display histo.png & - -############################################################################# -# construct download files for 7-way (DONE - 2014-06-05 - Hiram) - mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way - mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way - mkdir 
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way - mkdir /hive/data/genomes/hg38/bed/multiz7way/downloads - cd /hive/data/genomes/hg38/bed/multiz7way/downloads - mkdir multiz7way phastCons7way phyloP7way - cd multiz7way - time cp -p ../../anno/hg38.7way.maf . - # real 0m55.984s - time gzip *.maf - # real 46m53.149s - - ln -s ../../hg38.7way.nh . - ln -s ../../hg38.7way.commonNames.nh . - time md5sum *.nh *.maf.gz > md5sum.txt - # real 1m55.317s - ln -s `pwd`/* \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way - - du -hsc *.maf.gz ../../anno/hg38.7way.maf - # 3.5G hg38.7way.maf.gz - # 17G ../../anno/hg38.7way.maf - - ##################################################################### - cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phastCons7way - - ln -s ../../cons/all/downloads/phastCons7way.wigFix.gz \ - ./hg38.phastCons7way.wigFix.gz - ln -s ../../cons/all/phastCons7way.bw ./hg38.phastCons7way.bw - ln -s ../../cons/all/all.mod ./hg38.phastCons7way.mod - time md5sum *.gz *.mod *.bw > md5sum.txt - # real 0m37.384s - # obtain the README.txt from petMar2/phastCons7way and update for this - # situation - ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way - - ##################################################################### - cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phyloP7way - - ln -s ../../consPhyloP/all/downloads/phyloP7way.wigFix.gz \ - ./hg38.phyloP7way.wigFix.gz - ln -s ../../consPhyloP/run.phyloP/all.mod hg38.phyloP7way.mod - ln -s ../../consPhyloP/all/phyloP7way.bw hg38.phyloP7way.bw - - time md5sum *.mod *.bw *.gz > md5sum.txt - # real 0m29.431s - - # obtain the README.txt from geoFor1/phyloP7way and update for this - # situation - ln -s `pwd`/* \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way - - ########################################################################### - ## create upstream refGene maf files - cd 
/hive/data/genomes/hg38/bed/multiz7way/downloads/multiz7way - # bash script -#!/bin/sh -export geneTbl="knownGene" -for S in 1000 2000 5000 -do - echo "making upstream${S}.maf" - featureBits hg38 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ - | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ - | /cluster/bin/$MACHTYPE/mafFrags hg38 multiz7way \ - stdin stdout \ - -orgs=/hive/data/genomes/hg38/bed/multiz7way/species.list \ - | gzip -c > upstream${S}.${geneTbl}.maf.gz - echo "done upstream${S}.${geneTbl}.maf.gz" -done - # real 60m16.631s - - md5sum upstream*.gz >> md5sum.txt - - # some other symlinks were already made above - # obtain the README.txt from geoFor1/multiz7way and update for this - # situation - ln -s `pwd`/upstream*.gz README.txt \ - /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way - -############################################################################# -# hgPal downloads (DONE - 2014-06-06 - Hiram) -# FASTA from 7-way for knownGene, refGene and knownCanonical - - ssh hgwdev - screen -S hg38HgPal - mkdir /hive/data/genomes/hg38/bed/multiz7way/pal - cd /hive/data/genomes/hg38/bed/multiz7way/pal - cat ../species.list | tr '[ ]' '[\n]' > order.list - - export mz=multiz7way - export gp=knownGene - export db=hg38 - export I=0 - mkdir exonAA exonNuc - for C in `sort -nk2 ../../../chrom.sizes | cut -f1` - do - I=`echo $I | awk '{print $1+1}'` - echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" - echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" - if [ $I -gt 6 ]; then - echo "date" - echo "wait" - I=0 - fi - done > $gp.jobs - echo "date" >> $gp.jobs - echo "wait" >> $gp.jobs - - time ./$gp.jobs > $gp.jobs.log 2>&1 & - # real 28m46.919s - - time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz - # real 0m23.798s - time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz - # real 1m28.197s - - export mz=multiz7way - 
export gp=knownGene - export db=hg38 - export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments - mkdir -p $pd - md5sum *.fa.gz > md5sum.txt - ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz - ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz - ln -s `pwd`/md5sum.txt $pd/ - - rm -rf exonAA exonNuc - - ### need other gene track alignments also - # running up refGene - cd /hive/data/genomes/hg38/bed/multiz7way/pal - export mz=multiz7way - export gp=refGene - export db=hg38 - export I=0 - mkdir exonAA exonNuc - for C in `sort -nk2 ../../../chrom.sizes | cut -f1` - do - I=`echo $I | awk '{print $1+1}'` - echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" - echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" - if [ $I -gt 6 ]; then - echo "date" - echo "wait" - I=0 - fi - done > $gp.jobs - echo "date" >> $gp.jobs - echo "wait" >> $gp.jobs - - time sh -x $gp.jobs > $gp.jobs.log 2>&1 - # real 15m15.424s - - export mz=multiz7way - export gp=refGene - export db=hg38 - time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz - # real 0m23.119s - time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz - # real 1m15.547s - - du -hsc exonAA exonNuc refGene*.fa.gz - # 59M exonAA - # 101M exonNuc - # 59M refGene.multiz7way.exonAA.fa.gz - # 101M refGene.multiz7way.exonNuc.fa.gz - # 317M total - - rm -rf exonAA exonNuc - - # we're only distributing exons at the moment - export mz=multiz7way - export gp=refGene - export db=hg38 - export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments - mkdir -p $pd - md5sum *.fa.gz > md5sum.txt - ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz - ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz - ln -s `pwd`/md5sum.txt $pd/ - - ### And knownCanonical - cd /hive/data/genomes/hg38/bed/multiz7way/pal - export mz=multiz7way - export gp=knownCanonical - export db=hg38 - mkdir exonAA 
exonNuc ppredAA ppredNuc knownCanonical - - cut -f1 ../../../chrom.sizes | while read C - do - echo $C - hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed - done - - ls knownCanonical/*.known.bed | while read F - do - if [ -s $F ]; then - echo $F | sed -e 's#knownCanonical/##; s/.known.bed//' - fi - done | while read C - do - echo "date" - echo "mafGene -geneBeds=knownCanonical/$C.known.bed $db $mz knownGene order.list stdout | \ - gzip -c > ppredAA/$C.ppredAA.fa.gz" - echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz knownGene order.list stdout | \ - gzip -c > ppredNuc/$C.ppredNuc.fa.gz" - echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \ - gzip -c > exonNuc/$C.exonNuc.fa.gz" - echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \ - gzip -c > exonAA/$C.exonAA.fa.gz" - done > $gp.$mz.jobs - - time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 - # real 72m58.133s - - rm *.known.bed - mz=multiz7way - gp=knownCanonical - db=hg38 - zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz & - zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz & - zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz & - zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz - - rm -rf exonAA exonNuc ppredAA ppredNuc - - mz=multiz7way - gp=knownCanonical - db=hg38 - pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments - mkdir -p $pd - ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz - ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz - cd $pd - md5sum *.exon*.fa.gz > md5sum.txt - -############################################################################# -# wiki page for 7-way (DONE - 2014-06-04 - Hiram) - mkdir /hive/users/hiram/bigWays/hg38.7way - cd /hive/users/hiram/bigWays - echo "hg38" > hg38.7way/ordered.list - awk '{print $1}' 
/hive/data/genomes/hg38/bed/multiz7way/7way.distances.txt \ - >> hg38.7way/ordered.list - - # sizeStats.sh catches up the cached measurements required for data - # in the tables. They may already be done. - ./sizeStats.sh hg38.7way/ordered.list - # dbDb.sh constructs hg38.7way/Hg38_7-way_conservation_alignment.html - ./dbDb.sh hg38 7way - # sizeStats.pl constructs hg38.7way/Hg38_7-way_Genome_size_statistics.html - ./sizeStats.pl hg38 7way - - # defCheck.pl constructs Hg38_7-way_conservation_lastz_parameters.html - ./defCheck.pl hg38 7way - - # this constructs the html pages in hg38.7way/: -# -rw-rw-r-- 1 4153 Jun 5 11:03 Hg38_7-way_conservation_alignment.html -# -rw-rw-r-- 1 5833 Jun 5 11:04 Hg38_7-way_Genome_size_statistics.html -# -rw-rw-r-- 1 3854 Jun 5 11:04 Hg38_7-way_conservation_lastz_parameters.html - - # add those pages to the genomewiki. Their page names are the - # names of the .html files without the .html: -# Hg38_7-way_conservation_alignment -# Hg38_7-way_Genome_size_statistics -# Hg38_7-way_conservation_lastz_parameters - - # when you view the first one you enter, it will have links to the - # missing two. - -############################################################################# -# GRC Incident database (DONE - 2014-06-14 - Hiram) - # this procedure is run as a cron job in Hiram's account: - - # 33 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo - - # data comes from: ftp://ftp.ncbi.nlm.nih.gov/pub/grc/ - # processed by /hive/data/outside/grc/incidentDb/grcUpdate.sh - - # the table in the dataBase is: grcIncidentDb - # which is the URL to the bb file, a single row: - # http://genomewiki.ucsc.edu/images/7/7f/Hg38.grcIncidentDb.bb - -############################################################################# -# RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram) - mkdir /hive/data/genomes/hg38/bed/rmskJoined - cd /hive/data/genomes/hg38/bed/rmskJoined - - ln -s ../repeatMasker/hg38.sorted.fa.out . 
- ln -s ../repeatMasker/hg38.fa.align.gz . - - # working on fixing this script for the next release of RM - /scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \ - -out hg38.sorted.fa.out -align hg38.fa.align.gz - - hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ - -renameSqlTable -verbose=4 -tab \ - -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \ - rmskJoinedBaseline hg38.sorted.fa.join.bed \ - > loadJoined.log 2>&1 - - hgLoadSqlTab hg38 rmskAlignBaseline \ - /cluster/home/hiram/kent/src/hg/lib/rmskAlign.sql \ - hg38.fa.align.tsv > loadAlign.log 2>&1 - - hgLoadOutJoined -verbose=2 hg38 hg38.sorted.fa.out > loadOut.log 2>&1 - - featureBits -countGaps hg38 rmskJoinedBaseline - # 2716777279 bases of 3209286105 (84.654%) in intersection - -############################################################################## -# LASTZ Macaca Mulatta RheMac2 (DONE - 2014-07-13 - braney) - mkdir /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 - cd /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 - - # best to always specify an exact path to lastz so we know which one is used - # lastz default parameters are human-mouse parameters - - cat << '_EOF_' > DEF -# human vs macaca mulatta -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz -# maximum M allowed with lastz is only 254 -BLASTZ_M=254 -BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q -BLASTZ_O=600 -BLASTZ_E=150 -# other parameters from panTro2 vs hg18 lastz on advice from Webb -BLASTZ_K=4500 -BLASTZ_Y=15000 -BLASTZ_T=2 - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Macaca Mulatta RheMac2 -SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit -SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes 
-SEQ2_CHUNK=20000000 -SEQ2_LAP=0 -SEQ2_IN_CONTIGS=0 - -BASE=/hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11 -TMPDIR=/dev/shm -'_EOF_' - # << happy emacs - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - `pwd`/DEF \ - -syntenicNet -fileServer=hgwdev \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 - # Elapsed time: 141m36s - cat fb.hg38.chainRheMac2Link.txt - # 2455106923 bases of 3049335806 (80.513%) in intersection - - # running the swap - mkdir /hive/data/genomes/rheMac2/bed/blastz.hg38.swap - cd /hive/data/genomes/rheMac2/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11/DEF \ - -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 - # 83m26.095s - cat fb.rheMac2.chainHg38Link.txt - # 2313950599 bases of 2646704109 (87.428%) in intersection -# - -######################################################################### -# LASTZ Chlorocebus sabaeus (DONE - 2014-07-13 - braney) - mkdir /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 - cd /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 - - # best to always specify an exact path to lastz so we know which one is used - # lastz default parameters are human-mouse parameters - - cat << '_EOF_' > DEF -# human vs Chlorocebus sabaeus -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz -# maximum M allowed with lastz is only 254 -BLASTZ_M=254 -BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q -BLASTZ_O=600 -BLASTZ_E=150 -# other parameters from panTro2 vs hg18 lastz on advice from Webb -BLASTZ_K=4500 -BLASTZ_Y=15000 -BLASTZ_T=2 - - -# TARGET: Human Hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes 
-SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY Chlorocebus sabaeus chlSab2 -SEQ2_DIR=/scratch/data/chlSab2/chlSab2.2bit -SEQ2_LEN=/scratch/data/chlSab2/chrom.sizes -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 -SEQ2_IN_CONTIGS=0 - -BASE=/hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11 -TMPDIR=/dev/shm -'_EOF_' - # << happy emacs - time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \ - `pwd`/DEF \ - -syntenicNet -fileServer=hgwdev \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 - # Elapsed time: 142m4s - cat fb.hg38.chainChlSab2Link.txt - # 2573435303 bases of 3049335806 (84.393%) in intersection - - # running the swap - mkdir /hive/data/genomes/chlSab2/bed/blastz.hg38.swap - cd /hive/data/genomes/chlSab2/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11/DEF \ - -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 - # 88m48.411s - cat fb.chlSab2.chainHg38Link.txt - # 2429053010 bases of 2752019208 (88.264%) in intersection - -######################################################################### -# SEGMENTAL DUPLICATIONS (DONE - 2014-08-13 - Hiram) - # redmine issue: refs #13580 - - # file received in email from Archana Natarajan Raja (araja at uw.edu) - mkdir /hive/data/genomes/hg38/bed/genomicSuperDups - cd /hive/data/genomes/hg38/bed/genomicSuperDups -# -rw-r--r-- 1 16478617 Aug 11 16:18 GenomicSuperDup.tab - - # no longer filtering items smaller than 1,000 bases, see note - # in redmine issue refs #13580 -# While the size of the 24 alignments are less than 1000 bases , the size of -# their pairs to which they align are always >1000, you can confirm this by -# looking at the value in column 22 in your table (alignB -ucsc format), will -# always be >1000 bp . 
We are seeing this only now because there are lots of -# new and resolved duplications added to hg38. Hence , I would recommend not -# filtering these items and uploading the current set as is. - - # there is no chrEBV in the browser: - grep -v chrEBV GenomicSuperDup.tab | sed -e 's/\t_\t/\t-\t/;' \ - | hgLoadBed hg38 genomicSuperDups stdin \ - -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql - # Read 69894 elements of size 29 from stdin - - checkTableCoords hg38 genomicSuperDups - # <silence> (the chrEBV was found with this check) - - featureBits -countGaps hg38 genomicSuperDups - # 175429664 bases of 3209286105 (5.466%) in intersection - - featureBits -countGaps hg19 genomicSuperDups - # 166092393 bases of 3137161264 (5.294%) in intersection - featureBits -countGaps hg18 genomicSuperDups - # 159204446 bases of 3107677273 (5.123%) in intersection - - featureBits -countGaps mm10 genomicSuperDups - # 214917441 bases of 2730871774 (7.870%) in intersection - featureBits -countGaps mm9 genomicSuperDups - # 208214567 bases of 2725765481 (7.639%) in intersection - -############################################################################## -# cloneEnds (DONE - 2014-08-14 - Hiram) - - mkdir /hive/data/genomes/hg38/bed/cloneEnds - cd /hive/data/genomes/hg38/bed/cloneEnds - - # fetch the NCBI INSDC name correspondence file: - rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.26.assembly.txt ./ - - # fetch the clone reports - mkdir reports - rsync -a -P \ -rsync://ftp.ncbi.nih.gov/repository/clone/reports/Homo_sapiens/*.GCF_000001405.26.106.*.gff \ - ./reports/ - - # script to establish refSeq to UCSC chrom names: - - cat << '_EOF_' > refSeqNames.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -open (FH, "<GCF_000001405.26.assembly.txt") or die "can not read GCF_000001405.26.assembly.txt"; -while (my $line = <FH>) { - chomp $line; - next if ($line =~ m/^#/); - my @a = split('\t', $line); - my $chrN = $a[2]; - my $refSeq = 
$a[6]; - my $contig = $a[4]; - my $type = $a[1]; - next if (!defined $type); - next if (!defined $refSeq); - next if (!defined $contig); - my $suffix = ""; - if ($type eq "alt-scaffold") { - $suffix = "_alt"; - } elsif ($type eq "unlocalized-scaffold") { - $suffix = "_random"; - } elsif ($type eq "unplaced-scaffold") { - $chrN = "Un"; - } - $chrN = "M" if ($chrN eq "MT"); - if ($a[0] =~ m/_/) { - $contig =~ s/\./v/; - printf "%s\tchr%s_%s%s\n", $refSeq, $chrN, $contig, $suffix; - } else { - printf "%s\tchr%s\n", $refSeq, $chrN; - } -} -close (FH); -'_EOF_' - # << happy emacs - - chmod +x refSeqNames.pl - - ./refSeqNames.pl > refSeq.ucscName.tab - - # establish full library list: - ls reports/*.GCF_000001405.26.106.*.gff | sed -e 's#reports/##' \ - | cut -d"." -f1 | sort -u > library.list.txt - - # a script to scan the GFF files, with the refSeq.ucscName.tab - # name correspondence to construct bed files - - cat << '_EOF_' > hg38.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -my $argc = scalar(@ARGV); - -if ($argc < 1) { - printf STDERR "usage: ./hg38.pl <report.gff> [moreReports.gff]\n"; - exit 255; -} - -my %refSeqToUcsc; # key is refSeq name, value is UCSC chrom name -open (FH, "<refSeq.ucscName.tab") or die "can not read refSeq.ucscName.tab"; -while (my $line = <FH>) { - chomp $line; - my ($refSeq, $ucsc) = split('\t', $line); - $refSeqToUcsc{$refSeq} = $ucsc; -} -close (FH); - -my %chromSizes; # key is UCSC chrom name, key is chrom size -open (FH, "</hive/data/genomes/hg38/chrom.sizes") or die "can not read hg38/chrom.sizes"; -while (my $line = <FH>) { - chomp $line; - my ($chr, $size) = split('\t', $line); - $chromSizes{$chr} = $size; -} -close (FH); - -while (my $file = shift) { -my %starts; # key is parent ID, value is start end coordinates start,end -my %ends; # key is parent ID, value is end end coordinates start,end -my %parents; # key is parent ID, value is 1 to signify exists -my %endNames; # key is parent ID, value is the Name of the parent 
clone_insert - -printf STDERR "# processing $file\n"; - -open (FH, "<$file") or die "can not read $file"; -while (my $line = <FH>) { - chomp $line; - next if ($line=~ m/^#/); - my @a = split('\t', $line); - next if (scalar(@a) < 1); - my $contig = $a[0]; - $contig =~ s/ref.//; - $contig =~ s/\|//; - my $ucscChr = $refSeqToUcsc{$contig}; - if (!defined($ucscChr)) { - printf STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n"; - next; - } - next if (! exists($chromSizes{$ucscChr})); - my $chromSize = $chromSizes{$ucscChr}; - my $chromStart = $a[3] - 1; - my $chromEnd = $a[4]; - if ($chromStart > $chromSize) { - printf STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n"; - $chromStart = $chromSize-1; - } - if ($chromEnd > $chromSize) { - my $overRun = $chromEnd - $chromSize; - printf STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n"; - $chromEnd = $chromSize; - } - my $id="notFound"; - my $name="notFound"; - my $parent="notFound"; - my @b = split(';', $a[8]); - for (my $i = 0; $i < scalar(@b); ++$i) { - my ($tag, $value) = split('=', $b[$i]); - if ($tag eq "ID") { - $id = $value; - if ($id !~ m/-/) { - if (exists($parents{$id})) { - printf STDERR "# WARN: duplicate parent: $id"; - } else { - $parents{$id} = $ucscChr; - } - } - } elsif ($tag eq "Parent") { - $parent = $value; - } elsif ($tag eq "Name") { - $name = $value; - } - } - my $type="notFound"; - my $insertType = $a[2]; - if ($insertType =~ m/clone_insert_start/) { - $type = "start"; - if ($parent eq "notFound") { - printf STDERR "# ERR: can not find parent for start $name Ttype $id\n"; - } else { - if (!exists($parents{$parent})) { - printf STDERR "# ERR: start found $name with no parent $parent declared\n"; - } elsif (exists($starts{$parent})) { - printf STDERR "# ERR: duplicate start for $parent\n"; - } elsif ($ucscChr eq $parents{$parent}) { - $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd); - } else { - printf STDERR "# ERR: start on 
different chrom $ucscChr than parent $parent $parents{$parent}\n"; - } - } - } elsif ($insertType =~ m/clone_insert_end/) { - $type = "end"; - if ($parent eq "notFound") { - printf STDERR "# ERR: can not find parent for end $name Ttype $id\n"; - } else { - if (!exists($parents{$parent})) { - printf STDERR "# ERR: end found $name with no parent $parent declared\n"; - } elsif (exists($ends{$parent})) { - printf STDERR "# ERR: duplicate end for $parent\n"; - } elsif ($ucscChr eq $parents{$parent}) { - $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd); - } else { - printf STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n"; - } - } - } elsif ($insertType =~ m/clone_insert/) { - $type = "insert"; - $endNames{$id} = $name; - } - $name =~ s/gi\|//g; - $id =~ s/gi\|//g; - printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6]; -} # while (my $line = <FH>) - -close (FH); - -foreach my $parent (keys %parents) { - if (! exists($starts{$parent}) ) { - printf STDERR "# ERR: no start for $parent\n"; - } elsif (! 
exists($ends{$parent}) ) { - printf STDERR "# ERR: no end for $parent\n"; - } else { - my $strand = "+"; - my $chrStart = 0; - my $chrEnd = 0; - my $blockStart = 0; - my ($sStart, $sEnd) = split('\t', $starts{$parent}); - my ($eStart, $eEnd) = split('\t', $ends{$parent}); - my $startSize = $sEnd - $sStart; - my $endSize = $eEnd - $eStart; - if ($eStart < $sStart) { - $chrStart = $eStart; - $chrEnd = $sEnd; - $blockStart = $sStart - $chrStart; - $strand = "-"; - $startSize = $eEnd - $eStart; - $endSize = $sEnd - $sStart; - } else { - $chrStart = $sStart; - $chrEnd = $eEnd; - $blockStart = $eStart - $chrStart; - } - if ($startSize > $blockStart) { - printf STDERR "# startSize > blockStart $endNames{$parent}\n"; - } else { - printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart; - } - } -} -} -'_EOF_' - # << happy emacs - - chmod +x hg38.pl - - # process GFF files into bed files into separateLibs/ directory -for L in `cat library.list.txt` -do - export destDir="separateLibs/${L}" - echo "working: ${L}" 1>&1 - mkdir -p "${destDir}" - ./hg38.pl reports/${L}.GCF_000001405.26.106.*.gff \ - 2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/hg38.${L}.bed - sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/hg38.${L}.items.bed6 -done - - # use only those libraries with more than 20,000 clone ends - wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 > 20000' \ - | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list - - # note those libraries with less than 20,000 clone ends - wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' | sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list - - # filter out bad ends, length must be <= median size times three - cat libs.over20K.list | while read D -do - if [ ! 
-s separateLibs/${D}/lengths.txt ]; then - awk '{print $3-$2}' separateLibs/${D}/hg38.${D}.bed \ - > separateLibs/${D}/lengths.txt - fi - median3X=`ave separateLibs/${D}/lengths.txt | grep median | awk '{printf "%d", $2*3}'` - awk '($3-$2) < '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.median3X.bed - awk '($3-$2) >= '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.badMap.bed - before=`cat separateLibs/${D}/hg38.${D}.bed | wc -l` - after=`cat separateLibs/${D}/hg38.median3X.bed | wc -l` - dropped=`echo $before $after | awk '{print $1-$2}'` - perCent=`echo $dropped $before | awk '{printf "%.2f", 100*'$dropped/$before'}'` - echo "$D $before - $after = $dropped -> % $perCent dropped" -done - -# ABC20 24692 - 24474 = 218 -> % 0.88 dropped -# RP11 86660 - 85903 = 757 -> % 0.87 dropped -# CTD 95853 - 94941 = 912 -> % 0.95 dropped -# CH17 105618 - 105060 = 558 -> % 0.53 dropped -# ABC21 182154 - 180973 = 1181 -> % 0.65 dropped -# ABC22 189939 - 188743 = 1196 -> % 0.63 dropped -# COR02 208263 - 206782 = 1481 -> % 0.71 dropped -# ABC18 325080 - 322904 = 2176 -> % 0.67 dropped -# ABC27 334178 - 331822 = 2356 -> % 0.71 dropped -# ABC24 398944 - 395776 = 3168 -> % 0.79 dropped -# ABC23 436965 - 433896 = 3069 -> % 0.70 dropped -# ABC16 452220 - 449101 = 3119 -> % 0.69 dropped -# COR2A 583008 - 578578 = 4430 -> % 0.76 dropped -# WI2 587165 - 582843 = 4322 -> % 0.74 dropped -# ABC7 649297 - 644071 = 5226 -> % 0.80 dropped -# ABC11 729962 - 724864 = 5098 -> % 0.70 dropped -# ABC9 755994 - 750648 = 5346 -> % 0.71 dropped -# ABC12 777816 - 771827 = 5989 -> % 0.77 dropped -# ABC10 787969 - 781331 = 6638 -> % 0.84 dropped -# ABC13 810822 - 803589 = 7233 -> % 0.89 dropped -# ABC14 845573 - 839126 = 6447 -> % 0.76 dropped -# ABC8 1204275 - 1192784 = 11491 -> % 0.95 dropped - - # loading the median3X files -for L in `cat libs.over20K.list` -do - echo $L 1>&2 - hgLoadBed -type=bed12 hg38 cloneEnd${L} \ - separateLibs/${L}/hg38.median3X.bed \ - 
> separateLibs/loadBed.${L}.log 2>&1 -done - - # loading the dropped ends: - mkdir /hive/data/genomes/hg38/bed/cloneEnds/droppedTooBig - # link them to here - cat ../libs.over20K.list | while read L -do - ln -s ../separateLibs/${L}/hg38.badMap.bed ${L}.badMap.bed -done - # then load - hgLoadBed -type=bed12 hg38 cloneEndbadEnds *.badMap.bed - - # construct multiple mapped ends: -for L in `cat libs.over20K.list` -do - cat separateLibs/${L}/hg38.median3X.bed -done | sort -k4 > allEnds.bed - - cut -f4 allEnds.bed | sort | uniq -c | sort -rn > allEnds.names.count.txt - - awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' \ - | sort > multiples.names.txt - - join -t' ' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" \ - -2 4 multiples.names.txt allEnds.bed | sort -k1,1 -k2,2n \ - > allEnds.multiple.locations.bed - - hgLoadBed -type=bed12 hg38 cloneEndmultipleMaps \ - allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1 - - awk '$6 == "+"' allEnds.bed | sort -k1,1 -k2,2n \ - | bedItemOverlapCount hg38 stdin > allEnds.forward.bedGraph - - awk '$6 == "-"' allEnds.bed | sort -k1,1 -k2,2n \ - | bedItemOverlapCount hg38 stdin > allEnds.reverse.bedGraph - - bedGraphToBigWig allEnds.forward.bedGraph \ - /hive/data/genomes/hg38/chrom.sizes \ - cloneEndcoverageForward.bw - - bedGraphToBigWig allEnds.reverse.bedGraph \ - /hive/data/genomes/hg38/chrom.sizes \ - cloneEndcoverageReverse.bw - - mkdir /gbdb/hg38/bbi/cloneEnd - ln -s `pwd`/cloneEndcoverageForward.bw /gbdb/hg38/bbi/cloneEnd - ln -s `pwd`/cloneEndcoverageReverse.bw /gbdb/hg38/bbi/cloneEnd - - hgBbiDbLink hg38 cloneEndcoverageForward \ - /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageForward.bw - hgBbiDbLink hg38 cloneEndcoverageReverse \ - /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageReverse.bw - - ### Fixup the scores to indicate how many multiple mappings as mentioned - ### in the hg19 bacEnds description page: one mapping: score = 1000 - ### multiple mappings: score = 1500/count - ### the sort | uniq -c | awk 
does this score calculation with the name - ### in column 1 - ### The join puts the existing table together with those scores - ### DONE - 2015-06-18 - Hiram - - mkdir /hive/data/genomes/hg38/bed/cloneEnds/addCounts - cd /hive/data/genomes/hg38/bed/cloneEnds/addCounts - mkdir score withScore noScore withScore - for table in cloneEndABC10 cloneEndABC11 cloneEndABC12 cloneEndABC13 \ -cloneEndABC14 cloneEndABC16 cloneEndABC18 cloneEndABC20 cloneEndABC21 \ -cloneEndABC22 cloneEndABC23 cloneEndABC24 cloneEndABC27 cloneEndABC7 \ -cloneEndABC8 cloneEndABC9 cloneEndCH17 cloneEndCOR02 cloneEndCOR2A \ -cloneEndCTD cloneEndRP11 cloneEndWI2 cloneEndbadEnds cloneEndmultipleMaps -do - hgsql -N -e "select name from $table;" hg38 | sort | uniq -c | - awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \ - | sort > score/hg38.$table.score.tab - hgsql -N -e "select * from $table order by name;" hg38 \ - | sort -k5 > noScore/hg38.$table.tab - join -t'^I' -1 5 noScore/hg38.$table.tab score/hg38.$table.score.tab \ - | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \ - | sort -k2,2 -k3,3n > withScore/hg38.$table.withScore.tab - hgsql -e "delete from $table;" hg38 - hgsql -e "load data local infile \"withScore/hg38.$table.withScore.tab\" into table $table;" hg38 -done - -############################################################################## -# SIB Transcriptome (DONE 2014-08-27 Steve) - - # Create working directory and download data from where Christian - # Iseli (christian.iseli@unil.ch) put it, and unpack. 
- mkdir -p /hive/data/genomes/hg38/bed/sibTranscriptome - cd /hive/data/genomes/hg38/bed/sibTranscriptome - wget --timestamping http://ludwig-sun1.unil.ch/~chris/HTr.gtf.gz - wget --timestamping http://ludwig-sun1.unil.ch/~chris/txg.tar.gz - - tar -zxvf txg.tar.gz - - zcat HTr.gtf.gz | ldHgGene hg38 sibGene stdin - # Reading stdin - # Read 208508 transcripts in 2824960 lines in 1 files - # 208508 groups 25 seqs 1 sources 2 feature types - # 208508 gene predictions - - # Do a little data cleanup and transformation and load splice graphs - # into database. - sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql - cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \ - -sqlTable=sibTxGraph.sql hg38 sibTxGraph stdin - # Reading stdin - # Read 47817 elements of size 18 from stdin - # Sorted - # Creating table definition for sibTxGraph from sql: sibTxGraph.sql - # Saving bed.tab - # Loading hg38 - - # Create sibAltEvents track for analyzed alt-splices. - # Not on RR for hg18 and hg19, so do not push it out - cat txg/*.txg | txgAnalyze stdin /cluster/data/hg38/hg38.2bit sibAltEvents.bed - awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed - hgLoadBed hg38 sibAltEvents foo.bed - # Reading foo.bed - # Read 452436 elements of size 6 from foo.bed - # Sorted - # Creating table definition for sibAltEvents, bedSize: 6 - # Saving bed.tab - # Loading hg38 - - # push sibGene and sibTxGraph for hg38 - -############################################################################ -# Orangutan Lastz run (DONE - 2014-05-27 - Hiram) - screen -S hg38PonAbe2 # use a screen to manage this longish running job - mkdir /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 - cd /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 - - # always set the BLASTZ program so we know what version was used - cat << '_EOF_' > DEF -# human vs chimp -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz -BLASTZ_O=600 -BLASTZ_E=150 -# maximum M allowed with lastz is 
only 254 -BLASTZ_M=254 - -BLASTZ_T=2 -BLASTZ_Y=15000 -BLASTZ_K=4500 -BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q -# A C G T -# 90 -330 -236 -356 -# -330 100 -318 -236 -# -236 -318 100 -330 -# -356 -236 -330 90 - -# TARGET: Human Hg38 -SEQ1_DIR=/scratch/data/hg38/hg38.2bit -SEQ1_LEN=/scratch/data/hg38/chrom.sizes -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 -SEQ1_IN_CONTIGS=0 - -# QUERY: Orangutan PonAbe2 -SEQ2_DIR=/hive/data/genomes/ponAbe2/ponAbe2.2bit -SEQ2_LEN=/hive/data/genomes/ponAbe2/chrom.sizes -SEQ2_CHUNK=10000000 -SEQ2_LAP=0 -SEQ2_LIMIT=100 -SEQ2_IN_CONTIGS=0 - -BASE=/hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02 -TMPDIR=/dev/shm -'_EOF_' - - time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > do.log 2>&1 - # real 144m46.575s - cat fb.hg38.chainPonAbe2Link.txt - # 2719618310 bases of 3049335806 (89.187%) in intersection - - # filter with doRecipBest.pl - time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ - hg38 ponAbe2) > rbest.log 2>&1 - # real 60m1.060s - time (doRecipBest.pl -load -continue=load -workhorse=hgwdev \ - -buildDir=`pwd` hg38 ponAbe2) > loadRBest.log 2>&1 & - # real 3m35.834s - - cat fb.hg38.chainRBestPonAbe2Link.txt - # 2538296592 bases of 3049335806 (83.241%) in intersection - - # running the swap - mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap - cd /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap - time (doBlastzChainNet.pl -verbose=2 \ - -swap /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02/DEF \ - -chainMinScore=5000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > swap.log 2>&1 - # real 102m27.866s - cat fb.ponAbe2.chainHg38Link.txt - # 2773568958 bases of 3093572278 (89.656%) in intersection - - time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ - ponAbe2 hg38) > rbest.log 2>&1 - # real 78m47.312s - - - - 
-############################################################################# -# Add chrX alts to par (DONE 2014-10-14 angie) -# Thanks to Hiram for pointing out that intersecting chrX positions in -# altLocations and par shows whether a chrX alt overlaps a PAR. - cd /hive/data/genomes/hg38/bed/par - hgsql hg38 -e 'select * from altLocations where chrom = "chrX"' -#+-----+-------+------------+----------+---------------------+ -#| bin | chrom | chromStart | chromEnd | name | -#+-----+-------+------------+----------+---------------------+ -#| 73 | chrX | 319337 | 601516 | chrX_KI270880v1_alt | -#| 73 | chrX | 326487 | 601516 | chrX_KI270913v1_alt | -#| 149 | chrX | 79965153 | 80097082 | chrX_KI270881v1_alt | -#+-----+-------+------------+----------+---------------------+ - hgsql hg38 -e 'select * from par where chrom = "chrX"' -#+-----+-------+------------+-----------+------+ -#| bin | chrom | chromStart | chromEnd | name | -#+-----+-------+------------+-----------+------+ -#| 9 | chrX | 10000 | 2781479 | PAR1 | -#| 221 | chrX | 155701382 | 156030895 | PAR2 | -#+-----+-------+------------+-----------+------+ - # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1; - # chrX_KI270881v1_alt is not in either PAR. 
- hgsql hg38 -e 'select chrom,size from chromInfo \ - where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' -#+---------------------+--------+ -#| chrom | size | -#+---------------------+--------+ -#| chrX_KI270880v1_alt | 284869 | -#| chrX_KI270913v1_alt | 274009 | -#+---------------------+--------+ - # Process that into bed4 with name=PAR1: - hgsql hg38 -NBe 'select chrom, 0, size, "PAR1" from chromInfo \ - where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' \ - >> hg38Par.bed4 - hgLoadBed hg38 par hg38Par.bed4 - checkTableCoords hg38 par - - -############################################################################# -# LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve) - mkdir /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-215 - cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 - - cat << '_EOF_' > DEF -# human vs cow -# maximum M allowed with lastz is only 254 -BLASTZ_M=254 - -# TARGET: Human hg38 -SEQ1_DIR=/scratch/data/hg38/hg38.2bit -SEQ1_LEN=/scratch/data/hg38/chrom.sizes -SEQ1_CHUNK=10000000 -SEQ1_LAP=10000 - -# QUERY: Cow bosTau8 -SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit -SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes -SEQ2_CHUNK=10000000 -SEQ2_LAP=0 - - -BASE=/hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15 -TMPDIR=/scratch/tmp -'_EOF_' - # << happy emacs - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - `pwd`/DEF \ - -syntenicNet \ - -noLoadChainSplit \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 - # real 602m37.523s - cat fb.hg38.chainBosTau8Link.txt - # 1401921010 bases of 3049335806 (45.975%) in intersection - # Create link - cd /hive/data/genomes/hg38/bed - ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 - - # running the swap - mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap - cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap - time nice -n +19 doBlastzChainNet.pl -verbose=2 \ - 
/hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \ - -swap -syntenicNet \ - -noLoadChainSplit \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 - # real 116m32.121s - cat fb.bosTau8.chainHg38Link.txt - # 1336307377 bases of 2649307237 (50.440%) in intersection - cd /hive/data/genomes/bosTau8/bed - ln -s blastz.hg38.swap lastz.hg38 - -############################################################################ -# NCBI ClinVar (new version -DONE - 2014-11-08 - Max) -# see hg19.txt -######################################################################### - -######################################################################## -# CNV Developmental Delay track (2014-11-21 Steve) - - mkdir /hive/data/genomes/hg38/bed/cnvDevDelay - cd /hive/data/genomes/hg38/bed/cnvDevDelay - -wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd100_Coe_et_al_2014/gvf/nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz' -wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd54_Cooper_et_al_2011/gvf/nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz' - -cp /kent/src/hg/utils/automation/gvfToBed8Attrs.pl . 
-mv gvfToBed8Attrs.pl gvfToBed8AttrsCase.pl -cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl100.pl -cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl54.pl - -# made three local copies of Angie's gvf conversion script - one to include -# only case individuals from nstd100, one to include only control individuals -# from nstd100 and one to include only control individuals from nstd54 - -# had to add an additional elsif statement to the nstd100 scripts to filter -# based on sample_name field: - -# } elsif ($tag eq "sample_name") { -# $sample_name = $val; -# } - -# added line 33/35 to each file: - -# next if ($sample_name eq "Unknown"); # keep only "case" individuals from nstd100 -# next if ($sample_name ne "Unknown"); # keep only "control" individuals from nstd100 -# next if ($phenotype ne "not_reported"); # keep only "control" individuals from nstd54 - -zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsCase.pl > cnvDevDelayAllCase.bed -zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl100.pl > cnvDevDelayAllControl.bed -zcat nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl54.pl >> cnvDevDelayAllControl.bed - -# GRCh38 data from dbVar had different naming scheme for alternate chromosomes -# (e.g., chr1|NT_187515.1 instead of chr1_KI270762v1_alt), so needed to write -# a script to substitute the correct UCSC names - - cat << '_EOF_' > chromXref.pl -#!/usr/bin/env perl - -use strict; -use warnings; - -sub usage() { - printf STDERR "usage: ./chromXref.pl <infile> <outfile>\n" -} - -my $argc = scalar(@ARGV); - -if ($argc != 2) { - usage; - exit 255; -} - -open (file1, "<hg38.xref") or die "cannot read hg38.xref"; - -my @accArray = (); -my $i = 0; -while (my $line = <file1>) { - chomp($line); - my ($type, $chr, $acc1, $acc2) = split('\t', $line); - ($type, undef) = split('-', $type); - ($acc1, my $version) = split('\.', $acc1); - if ($type eq "unlocalized") { - $type = 
"random"; - } - my $ucscAcc = "_" . $acc1 . "v" . $version . "_" . $type; - $accArray[$i][0] = $ucscAcc; - $accArray[$i][1] = $acc2; - $i++; -} - -close (file1); - -open (file2, "<$ARGV[0]") or die "cannot read $ARGV[0]"; -open (file3, ">$ARGV[1]") or die "cannot read $ARGV[1]"; -local $/; -my $fileContents = <file2>; -for ($i = 0; $i < scalar(@accArray); $i++) { - my $temp1 = $accArray[$i][1]; - my $temp2 = $accArray[$i][0]; - if ($fileContents =~ m/\|$temp1/) { - $fileContents =~ s/\|$temp1/$temp2/g; - } -} - -print file3 $fileContents; -close (file2); -close (file3); -'_EOF_' - # << happy emacs - -cp /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt . - -cat GCF_000001405.26.assembly.txt | grep -v '^#\|assembled\|unplaced' | awk '{print $2 "\t" $3 "\t" $5 "\t" $7}' > hg38.xref - -chromXref.pl cnvDevDelayAllCase.bed cnvDevDelayAllCaseUcsc.bed -chromXref.pl cnvDevDelayAllControl.bed cnvDevDelayAllControlUcsc.bed - -hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ - -allowStartEqualEnd hg38 cnvDevDelayCase cnvDevDelayAllCaseUcsc.bed - -hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \ - -allowStartEqualEnd hg38 cnvDevDelayControl cnvDevDelayAllControlUcsc.bed - - checkTableCoords hg38 cnvDevDelayCase - checkTableCoords hg38 cnvDevDelayControl - - -######################################################################### -# RETROFINDER RETROPOSED GENES ucscRetro track VERSION 9 -# (2015-01-12 - 2015-01-20, hartera, DONE) -ssh hgwdev -mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112 -cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112 - -cat << '_EOF_' > DEF - -RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 " -VERSION=9 -RUNDATE="2015-01-12" -DB=hg38 -SCORETHRESH=550 -GENOMENAME='Homo sapiens' -GBDB=hg -DATE=20150112 -RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE -BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin 
-KENTDIR=/cluster/home/hartera/kent -KENTBINDIR=/cluster/home/hartera/bin/x86_64 -MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION -TMPMRNA=$RUNDIR/mrnaBlastz/$DB -TMPEST=$RUNDIR/est/$DB -USEALTSEQS=0 -EST=all_est -SPLICED_EST=intronEst -SPLIT_EST=0 -SPLIT_SPLICED_EST=0 -LASTZPROG=/cluster/bin/penn/x86_64/lastz -SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline -GENOME=/hive/data/genomes -RETRODIR=$GENOME/$DB/bed/retro -BASE=$RUNDIR/retro -OUTDIR=${BASE}/version${VERSION}/${DB} -RESULT=$OUTDIR/result -RESULTSPLIT=$OUTDIR/resultSplit -LOG=$OUTDIR/log -OUT=$OUTDIR/out -OVERLAPDIR=$OUTDIR/run.o -TABLE=ucscRetroInfo$VERSION -ORTHOTABLE=ucscRetroOrtho$VERSION -ALIGN=ucscRetroAli$VERSION -LOCAL=/scratch/data/$DB -TWOBIT=$GENOME/$DB/$DB.2bit -RMSK=rmsk -NET1=netMm10 -NET2=netCanFam3 -NET3=netRheMac3 -# these two nets determine which retros are classified as ancient, -# use two farthest nets -ANCIENT1=netMm10 -ANCIENT2=netCanFam3 -GENE1=knownGene -GENE2=refGene -GENE3=wgEncodeGencodeCompV19 -CLUSTER=ku -SPECIES="hg38 mm10" -ROOTDIR="/cluster/home/hartera/public_html/retro/hg38Jun14" -WEBROOT=$ROOTDIR/retro.$VERSION -WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu -SHUFFLEDIR=shuffle -SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR -DUPDIR=dups -DUPROOT=$WEBROOT/$DUPDIR -AGEDIR=age -AGEROOT=$WEBROOT/$AGEDIR -EXPDIR=exp -GENEPFAM=knownGene -PFAM=knownToPfam -PFAMIDFIELD=name -PFAMDOMAIN=value -ALTSPICE= -#ALTSPLICE=sibTxGraph -SPLITBYAGE=$SCRIPT/splitRetrosByAge -PDB=proteins140122 -#ARRAY=gnfAtlas2 -#AFFYPROBE="affyU133A,affyGnf1h" -#ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median -#ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio -#ARRAYABS=hgFixed.gnfHumanAtlas2All -#ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps -#ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps -#ARRAYLOOKUP=knownToGnfAtlas2 -#ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl" -'_EOF_' - # << happy emacs -chmod +x DEF - -mkdir -p 
/hive/data/genomes/hg38/bed/retro -mkdir -p /hive/data/genomes/hg38/bed/mrnaBlastz.9 -mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz -cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz -cp ../DEF . - -# Create S1.len file -rom.sizes without random chroms or chrM, there are many alt loci also -# in hg38 that were not in hg19 so 285 chroms total. -cat /hive/data/genomes/hg38/chrom.sizes | grep -v random \ - | grep -v chrUn | grep -v chrM > S1.len -cp S1.len /hive/data/genomes/hg38/bed/mrnaBlastz.9 - -screen -# Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree: -retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF -# check cluster jobs on ku -retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF -retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF -#check cluster jobs on ku -retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF -#check cluster jobs on ku - # Load the track -retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF -cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38 -retroFinder/branches/version2/src/pipeline/filterMrna.sh -retroFinder/branches/version2/src/pipeline/filterEst.sh -# Check cluster jobs on ku -retroFinder/branches/version2/src/pipeline/analyseExpress.sh -# Check cluster jobs on ku -#added ucscRetroAli9 to kent/src/hg/makeDb/human/hg38/trackDb.ra -# copied -# /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38/trackDb.retro -# entry to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra and edited it to -# remove the full date and add: -# dataVersion Jan. 
2015 -# Scripts copied ucscRetroAli9.psl, ucscRetroInfo9.bed and ucscRetroCds9.tab -# to /hive/data/genomes/hg38/bed/retro/ - -########## -# Make dbVar chrom to UCSC chrom lift file -# DONE braney 2/12/15 -cd /cluster/data/hg38/jkStuff -sort /cluster/data/hg38/chrom.sizes > tmpChrom -grep -v '^#\|assembled' /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt | awk 'BEGIN {OFS="\t"} {print "chr" $3 "_" $5 "_" $2, "chr" $3 "|"$7}' | sed 's/-scaffold//' | sed 's/unlocalized/random/' | sed 's/_unplaced//' | sed 's/chrna/chrUn/g' | sed 's/\./v/' | sort | join /dev/stdin tmpChrom | awk 'BEGIN {OFS="\t"} {print 0, $2, $3, $1, $3}' > dbVar.lift -awk 'BEGIN {OFS="\t"} {print 0, $1, $2, $1, $2}' /cluster/data/hg38/chrom.sizes >> dbVar.lift -rm tmpChrom - -######################################################################### -# UCSC to RefSeq name correspondence (DONE - 2015-04-13 - Hiram) - - mkdir /hive/data/genomes/hg38/bed/ucscToRefSeq - cd /hive/data/genomes/hg38/bed/ucscToRefSeq - - # columns 5 and 7 are the INSDC and RefSeq names - - grep -v "^#" ../../genbank/GCF_000001405.26.assembly.txt \ - | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' | sort > insdc.refSeq.tab - - hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' hg38 \ - | sort > insdc.ucsc.tab - - join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \ - | cut -f2- > ucsc.refSeq.tab - - - export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1` - sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ - | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql - hgLoadSqlTab hg38 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab - - checkTableCoords hg38 -table=ucscToRefSeq - -######################################################################### -#CREATE MICROSAT TRACK (DONE - 2015-05-22 - Hiram) - ssh hgwdev - mkdir /cluster/data/hg38/bed/microsat - cd /cluster/data/hg38/bed/microsat - - awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 
{printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ - ../simpleRepeat/simpleRepeat.bed > microsat.bed - - hgLoadBed hg38 microsat microsat.bed - -############################################################################# -# ENCODE Regulatory tracks (Kate & Chris) - -# see reg.txt -######################################################################### -# GWIPS-viz Ribo-seq - (DONE - 2016-02-05 - Steve) -# contact Audrey Michel (audreymannion@gmail.com) -# redmine #16765 - -obtained bigWig file from shared Google drive -https://drive.google.com/a/soe.ucsc.edu/folderview?id=0B_xvV_5tXzOGQ1h5NEh4bnhNTDg&usp=sharing_eid - -mkdir /hive/data/genomes/hg38/bed/gwipsvizRiboseq -cp Global_RiboProElong.10_02_2016.bw /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw - -mkdir /gbdb/hg38/bbi/gwipsvizRiboseq -cd /gbdb/hg38/bbi/gwipsvizRiboseq -ln -s /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw gwipsvizRiboseq.bw - -hgsql hg38 -create table gwipsvizRiboseq select * from gc5BaseBw; -update gwipsvizRiboseq set fileName="/gbdb/hg38/bbi/gwipsvizRiboseq/gwipsvizRiboseq.bw" where fileName="/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw"; - -######################################################################### -# COSMIC v81 DONE Chris Eisenhart 2017-05-11 -# Make a new COSCMIC track for hg19 -mkdir /hive/data/outside/cosmic/hg38/v81 -cd /hive/data/outside/cosmic/hg38/v81 - -# Get the new data -sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk -# Login to SFTP server then run these commands -get /files/grch38/cosmic/v81/CosmicMutantExport.tsv.gz - -# Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts. -zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv - -# Use a script to convert to bed format. -cosmicToBed cosMut.tsv cosMut.bed -# This many lines were skipped, 131597 for not having genomic coordinate - -# Sort and convert to big bed using the .as file. 
-sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed -bedToBigBed -type=bed4+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V81.bb -tab -extraIndex=name,cosmLabel - -# Link it up so the outside world can see it. -cd /gbdb/hg38/cosmic/ -ln -s /hive/data/outside/cosmic/hg38/v81/cosMutHg38V81.bb . -######################################################################### -# hoffmanMappability hub import (2 super tracks) DONE Chris Eisenhart 2017-05-16 -mkdir /hive/data/outside/hoffmanMappability/hg38 -cd /hive/data/outside/hoffmanMappability/hg38 -wget https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/trackDb.txt -# Get the trackDb file -importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ --test -# Check that the commands are what we want, then run for real -importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ -# View the .ra file to make sure things are ok, here changed the groups to map, -# added the alpha tags, and removed the 'show' from 'superTrack on show' -cp hofMap.ra ~/kent/src/hg/makeDb/trackDb/human/hg38 -# Include hofMap.ra in the trackDb.ra file - -# the importTrackHub failed on redirection, fetch all the files manually: -# 2017-09-15 - Hiram - -cd /hive/data/outside/hoffmanMappability/hg38 - -grep bigDataUrl trackDb.txt | awk '{print $NF}' | sed -e 's#https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/##;' | while read F -do - echo $F - rm -f $F - wget --timestamping "https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/${F}" -done - # real 29m40.429s - -######################################################################### -# tcgaExpr super track Chris Eisenhart, DONE, 2017-05-17 -# tcgaTranscExpr -# TCGA transcript level expression barChart track, from TOIL pipeline recompute (John Vivian) -# 
biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf -mkdir /hive/data/outside/tcgaBarcharts/ -mkdir /hive/data/outside/tcgaBarcharts/transcripts -cd /hive/data/outside/tcgaBarcharts/transcripts - -# Get all the meta data -cp ~max/projects/cirm/datasetPages/tcgaGtex/tcgaMeta.tab . -# Cut out the meta data the script wants, sample name and group. -cut -f 1,5 tcgaMeta.tab | sed 's/ /_/g' > tcgaLargeSamples.tsv - -# Get and clean the matrix -cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.tpm.tab . -# Clean up the transcript names (remove the .#) -cut -f 1 tcga.tpm.tab | cut -f 1 -d "." > tcgaTranscripts.txt -cut -f 2- tcga.tpm.tab > tcgaTpmValues.tsv -paste tcgaTranscripts.txt tcgaTpmValues.tsv > tcgaMatrix.tsv - -# Build a coordinate map -hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene -hgsql hg38 -e "select * from ensemblToGeneName" | sort > ensemblToGeneName -join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed - -# Use the meta data, matrix, and coordinate map to generate a barchart bed -time expMatrixToBarchartBed tcgaLargeSamples.tsv tcgaMatrix.tsv coord.bed tcgaTransExp.bed --groupOrder tcgaGroupOrder.txt - -# NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb. -# The order of the labels in the barChartBars field should match the order of the labels in the -# expScores column in the bed file header. - -# Sort and convert into a bigBed file. -sort -k1,1 -k2,2n tcgaTransExp.bed > sortedTcgaTransExp.bed -bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartTranscExp.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTransExp.bb - -# Link the files into gbdb -cd /gbdb/hgFixed/human/expMatrix -ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaLargeSamples.tsv tcgaLargeSamples.tab -ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaMatrix.tsv tcgaMatrix.tab -ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaTransExp.bb . 
- -###########3 -# Reload bigBed with a schema that will be shared with genes track, to support -# configuration as subtracks in a composite -# (2007-08-30 kate) -cd /hive/data/outside/tcgaBarcharts/transcripts -bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTranscExpr.hg38.bb -mkdir /gbdb/hg38/tcga -ln -s `pwd`/tcgaTranscExpr.hg38.bb /gbdb/hg38/tcga/tcgaTranscExpr.bb - -# TCGA gene level expression barChart track, from TOIL pipeline recompute (John Vivian) -# tcgaGeneExpr -mkdir ../genes -cd ../genes - -# Get the gene matrix. -cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.geneTpm.tab . - -# Make a coordinate file, the genes in gtexGeneModelV6 have .# versions which are -# removed with the temp fils. -hgsql hg38 -e "select * from hg38.gtexGeneModelV6" | awk '{print $3"\t"$5"\t"$6"\t"$2"\t0\t"$4"\t"$2}' > coord6+1.bed.temp -cut -f 4 coord6+1.bed.temp | cut -f 1 -d "." > foo -cut -f 1-3 coord6+1.bed.temp > foo2 -paste foo2 foo > foo3 -cut -f 5- coord6+1.bed.temp > foo4 -paste foo3 foo4 > coord6+1.bed -# This bed file didn't have the right gene names (ENS rather than Hugo), fix it. -hgsql hg38 -e "select * From knownCanonical" > foo -wc foo -cut -f 6 foo | cut -f 1 -d "." -cut -f 6 foo | cut -f 1 -d "." 
> foo2 -head foo -cut -f 1-3 foo > foo3 -paste foo2 foo3 > foo4 -cut -f 4- coord6+1.bed > foo5 -join <(sort foo5) <(sort foo4) | awk '{print $5"\t"$6"\t"$7"\t"$1"\t0\t"$3"\t"$4}' > coord6+1.3.bed - -# Generate the bed file, can use the same transcript file -time expMatrixToBarchartBed ../transcripts/tcgaLargeSamples.tsv tcga.geneTpm.tab coord6+1.3.bed tcgaGeneExp.bed --groupOrder=../transcripts/tcgaGroupOrder.txt - -# Convert to big bed -sort -k1,1 -k2,2n tcgaGeneExp.bed > sortedTcgaGeneExp.bed -bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExp.as sortedTcgaGeneExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExp.bb - -# Link to gbdb -cd /gbdb/hgFixed/human/expMatrix -ln -s /hive/data/outside/tcgaBarcharts/genes/tcgaGeneExp.bb . -ln -s /hive/data/outside/tcgaBarcharts/genes/tcga.geneTpm.tab tcgaGeneMatrix.tab - -###########3 -# Reload bigBed with a schema that will be shared with transcript track, to support -# configuration as subtracks in a composite -# Apparently Chris actually loaded the #3 file (added gene names, adjusted end coord apparently) -# (2007-08-30 kate) -cd /hive/data/outside/tcgaBarcharts/genes -bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaGeneExp3.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExpr.hg38.bb -mkdir /gbdb/hg38/tcga -ln -s `pwd`/tcgaGeneExpr.hg38.bb /gbdb/hg38/tcga/tcgaGeneExpr.bb - -######################################################################### -# gtexTransExp Chris Eisenhart, done, 2017-05-23 -# TCGA transcript level RNA-seq, from TOIL pipeline recompute (John Vivian) -# biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf -mkdir /hive/data/outside/gtex/barChartTrack -cd /hive/data/outside/gtex/barChartTrack - -# Seems John included some TCGA data (CML) in the GTEx matrix and samples, the cleaning steps remove this. 
-# Make a clean sample file -cat ../johnVivianRecompute/sraToSample.txt | sed 's/ male /\tmale\t/g' | sed 's/ female /\tfemale\t/g' | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' > gtexSampleGroups.txt -cat ../johnVivianRecompute/sraToSample.txt | cut -f 1 -d " " > gtexSampleNames.txt -paste gtexSampleNames.txt gtexSampleGroups.txt > gtexSamples.txt -grep -v '(CML)' gtexSamples.tsv > cleanGtexSamples.tsv - -# Make a clean matrix -cut -f 1 ../johnVivianRecompute/gtex.tpm.tab | cut -f 1 -d "." > gtexTranscripts.txt -cut -f 2- ../johnVivianRecompute/gtex.tpm.tab > gtexTpmValues.tsv -paste gtexTranscripts.txt gtexTpmValues.tsv > gtexMatrix.tsv -rowsToCols gtexMatrix.tsv tspsdGtexMatrix.tsv -sort tspsdGtexMatrix.tsv > sortedTspsdGtexMatrix.tsv -grep -v '(CML)' gtexSamples.tsv | cut -f 1 | sed 's/Run_s/#transcript/g' | sort > sortedCleanGtexSamples.tsv -join sortedCleanGtexSamples.tsv sortedTspsdGtexMatrix.tsv > cleanTspsdGtexMatrix.tsv -rowsToCols cleanTspsdMatrix.tsv cleanGtexMatrix.tsv - -# Build a coordinate map -hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene -hgsql hg38 -e "select * from ensemblToGeneName" | sort > ensemblToGeneName -join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed -# NOTE: CHRISL10-05-2021 - the above ensGene steps weren't actually done or the files were removed, -# there was a coord.tsv which I used instead so the below re-run could work -tawk '{print $1,$2,$3,$4,0,$5,$6}' coord.tsv > coord.bed -# END CHRISL10-05-2021 NOTE) - -# Get the gtex ordering -hgsql hgFixed -e "select * from gtexTissue" | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' | sed '1D' > gtexGroupOrder.txt - -# Use the meta data, matrix, and coordinate map to generate a barchart bed -# NOTE: CHRISL10-05-2021 - re-ran this step to fix float parsing bug: -time expMatrixToBarchartBed cleanGtexSamples.tsv cleanGtexMatrix.tsv coord.bed gtexTransExp.bed --groupOrderFile gtexGroupOrder.txt - -# NOTE: Use the header line of the 
bed file to populate the barChartBars field in the trackDb. -# The order of the labels in the barChartBars field should match the order of the labels in the -# expScores column in the bed file header. - -# Sort and convert into a bigBed file. -sort -k1,1 -k2,2n gtexTransExp.bed > sortedGtexTransExp.bed -# NOTE: CHRISL10-05-2021 - re-ran bedToBigBed step with correct file names -bedToBigBed -as=$HOME/kent/src/hg/lib/barChartBed.as -type=bed6+5 sortedGtexTransExp.bed /hive/data/genomes/hg38/chrom.sizes gtexTranscExpr.bb - -# Link the files into gbdb -cd /gbdb/hgFixed/human/expMatrix -ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexSamples.tsv cleanGtexSamples.tab -ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexMatrix.tsv cleanGtexMatris.tab - -# <2007-08-30 kate) -cd /gbdb/hg38/gtex -ln -s /hive/data/outside/gtex/barChartTrack/gtexTranscExpr.bb . - -######################################################################### -# LASTZ human/hg38 vs. Zebrafish /danRer11 -# (DONE - 2017-06-12 - Chris) - - mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 - cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 - - printf '# human vs zebrafish danRer11 -BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz -BLASTZ_M=254 - -# TARGET: human hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=40000000 -SEQ1_LIMIT=20 -SEQ1_LAP=10000 - -# QUERY: zebrafish danRer11 -SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit -SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes -SEQ2_CHUNK=20000000 -SEQ2_LIMIT=200 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 -TMPDIR=/dev/shm -' > DEF - - time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=3000 -chainLinearGap=medium \ - -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ - -noDbNameCheck -syntenicNet) > do.log 2>&1 - # real 3327m39.074s - - cat fb.hg38.chainDanRer11Link.txt - # 41036733 bases of 3049335806 (1.346%) in intersection - - 973293331 bases of 3049335806 (31.918%) in intersection - - time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg38 danRer11) \ - > rbest.log 2>&1 & - - # and for the swap: - mkdir /hive/data/genomes/danRer11/bed/blastz.hg38.swap - cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap - - time (doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \ - -swap -chainMinScore=3000 -chainLinearGap=medium \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -noDbNameCheck -syntenicNet) > swap.log 2>&1 - # real 39m24.916s - - cat fb.danRer11.chainHg38Link.txt - # 47869194 bases of 1674677181 (2.858%) in intersection - - time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \ - > rbest.log 2>&1 & - # real 638m45.337s -_EOF_ -######################################################################### -# refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie -# previously done 2017-08-01 by Chris E - -mkdir /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29 -cd /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29 - -# NOTE FOR NEXT TIME: instead of using interim GFF, in the future these annotations might be -# folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by -# doNcbiRefSeq.pl. 
-wget ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/GFF_interim/interim_GRCh38.p11_top_level_2017-06-27.gff3.gz
-
-# Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to hg38 chrom names
-hgsql hg38 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \
-> refSeqToChrom.tab
-cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab
-
-# Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class)
-# to identify Functional Elements and swap in hg38 chrom names.
-# Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an
-# hg38 chrom. Use grep -f chrom.tab to filter out patch contig annotations.
-zcat interim_GRCh38.p11_top_level_2017-06-27.gff3.gz \
-| grep -P "(\t(CAAT_signal|GC_rich_promoter_region|TATA_box|enhancer|insulator|locus_control_region|mobile_genetic_element|origin_of_replication|promoter|protein_binding_site|recombination_feature|regulatory_region|repeat_region|sequence_feature|sequence_secondary_structure|silencer|stem_loop)\t|regulatory_class=)" \
-| subColumn -miss=/dev/null 1 stdin refSeqToChrom.tab stdout \
-| grep -f chrom.tab > funcElems.gff
-wc -l funcElems.gff
-#5756 funcElems.gff
-
-# Transform GFF to BED+
-~/kent/src/hg/utils/automation/parseRefSeqFuncElems funcElems.gff /dev/stdout \
-| sort -k1,1 -k2n,2n > refSeqFuncElems.bed
-wc -l refSeqFuncElems.bed
-#5756 refSeqFuncElems.bed
-
-# Make bigBed and link from /gbdb
-bedToBigBed -tab -type=bed9+7 -as=$HOME/kent/src/hg/lib/refSeqFuncElems.as \
-  refSeqFuncElems.bed /hive/data/genomes/hg38/chrom.sizes refSeqFuncElems.bb
-rm -f /gbdb/hg38/ncbiRefSeq/refSeqFuncElems.bb
-ln -s `pwd`/refSeqFuncElems.bb /gbdb/hg38/ncbiRefSeq/
-
-###################################################################
-# cosmicRegions (DONE 2017-08-03 Chris)
-# Make a new COSMIC track for hg38 v82
-mkdir /hive/data/outside/cosmic/hg38/v82
-cd /hive/data/outside/cosmic/hg38/v82
-
-# Get the new data
-sftp 
ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk -# Login to SFTP server then run these commands -get /files/grch38/cosmic/v82/CosmicMutantExport.tsv.gz - -# Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts. -zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv - -# Use a script to convert to bed format. -cosmicToBed cosMut.tsv cosMut.bed -# This many lines were skipped, 134601 for not having genomic coordinate - -# Sort and convert to big bed using the .as file. -sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed -bedToBigBed -type=bed8+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V82.bb -tab -extraIndex=name,cosmLabel - - -# Link it up so the outside world can see it. -cd /gbdb/hg38/cosmic/ -ln -s /hive/data/outside/cosmic/hg38/v82/cosMutHg38V82.bb . - -######################################################################### -# RepeatMasker Visualization track update (DONE - 2018-05-04 - ChrisL) - screen -S rmskJoined.2018-05-04 - mkdir /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 - cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 - - ln -s ../repeatMasker/hg38.sorted.fa.out . - ln -s ../repeatMasker/hg38.fa.align.gz . 
- - # this script points to the most recent RepeatMasker version: - time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ - -out hg38.sorted.fa.out -align hg38.fa.align.gz) > do.log 2>&1 & - - # no differences, forgot to remake rmsk files - # so instead remake the rmsk track and try again - mkdir /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04 - cd /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04 - - # remake the sorted.fa.out and fa.align.gz, stop after masking - # so rmsk table isn't overwritten - time (doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \ - -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38) > mask.log 2>&1 & - # RepeatMasker bug?: Undefined id, line 1440295 of input: - # 10 26.1 0.0 0.0 chr13 114292339 114292382 (71946) C L1P4 LINE/L1 (17) 6149 6106 - # RepeatMasker bug?: Undefined id, line 3529762 of input: - # 992 2.3 0.5 0.0 chr3 180461254 180462048 (17833511) C L1PA3 LINE/L1 (3) 6152 5354 - # RepeatMasker bug?: Undefined id, line 3529763 of input: - # 1153 3.2 0.2 0.0 chr3 180462043 180463006 (17832553) + L1PA3 LINE/L1 4392 5357 (789) - # RepeatMasker bug?: Undefined id, line 5303571 of input: - # 220 22.5 0.0 17.7 chr9 105798076 105799127 (32595590) C SATR2 Satellite (4) 866 1 - # real 643m17.617s - - # get rid of the missing id items: - grep -v "114292339 114292382\|180461254 180462048\|180462043 180463006\|105798076 105799127" \ - hg38.fa.out > clean.hg38.fa.out - mv clean.hg38.fa.out hg38.fa.out - - # finish the last step of doCat.csh: - /cluster/bin/scripts/extractNestedRepeats.pl hg38.fa.out | sort -k1,1 -k2,2n > hg38.nestedRepeats.bed - - cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04 - - rm hg38.sorted.fa.out - rm hg38.fa.align.gz - rm *.tsv - ln -s ../repeatMasker.2018-05-04/hg38.sorted.fa.out . - ln -s ../repeatMasker.2018-05-04/hg38.fa.align . 
- - # and then re-run - time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ - -out hg38.sorted.fa.out -align hg38.fa.align.gz) > rerun.log 2>&1 & - # real 141m7.268s - - # confirm the counts are different from the previous version: - # wc -l ../rmskJoined/hg38.fa.align.tsv ../rmskJoined/hg38.sorted.fa.join.bed ../rmskJoined/hg38.sorted.fa.out.tsv - 7203858 ../rmskJoined/hg38.fa.align.tsv - 4607727 ../rmskJoined/hg38.sorted.fa.join.bed - 5520118 ../rmskJoined/hg38.sorted.fa.out.tsv - 17331703 total - # wc -l *.tsv - 7227245 hg38.fa.align.tsv - 4828114 hg38.sorted.fa.join.tsv - 5916189 hg38.sorted.fa.out.tsv - 17971548 total - - hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ - -renameSqlTable -verbose=4 -tab \ - -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \ - rmskJoinedCurrent hg38.sorted.fa.join.tsv \ - > loadJoined.log 2>&1 - - hgLoadSqlTab hg38 rmskAlignCurrent \ - /cluster/home/chmalee/kent/src/hg/lib/rmskAlign.sql \ - hg38.fa.align.tsv > loadAlign.log 2>&1 - - hgLoadOutJoined -verbose=2 -table=rmskOutCurrent hg38 hg38.sorted.fa.out > loadOut.log 2>&1 - - featureBits -countGaps hg38 rmskJoinedCurrent - # 2796899855 bases of 3209286105 (87.150%) in intersection -######################################################################### -# Hi-C Visualization based on Krietenstein 2019 (DONE - 2019-10-07 - Jonathan) -mkdir -p /hive/data/genomes/hg38/bed/hic -cd /hive/data/genomes/hg38/bed/hic - -# Files are located on 4D Nucleome (data.4dnucleome.org). The URL for the paper on that -# site is https://data.4dnucleome.org/publications/b13590b2-a341-4e5e-ad5e-72e233b32e9d/. 
-# The four file IDs downloaded below are for contact matrix .hic files created for
-# different cell-line/protocol combinations
-wget 'https://data.4dnucleome.org/files-processed/4DNFI2TK7L2F/@@download/4DNFI2TK7L2F.hic' # H1-hESC Micro-C XL
-wget 'https://data.4dnucleome.org/files-processed/4DNFIQYQWPF5/@@download/4DNFIQYQWPF5.hic' # H1-hESC in situ
-wget 'https://data.4dnucleome.org/files-processed/4DNFI18Q799K/@@download/4DNFI18Q799K.hic' # HFFc6 Micro-C XL
-wget 'https://data.4dnucleome.org/files-processed/4DNFIFLJLIS5/@@download/4DNFIFLJLIS5.hic' # HFFc6 in situ
-
-printf "All files were downloaded from the 4D Nucleome Data Portal at data.4dnucleome.org.
-These are processed contact matrices from Krietenstein et al. (2019) Ultrastructural details
-of mammalian chromosome architecture. (https://www.biorxiv.org/content/10.1101/639922v1).
-
-4DNFI2TK7L2F.hic - Micro-C XL data set on H1-hESC
-4DNFIQYQWPF5.hic - in situ Hi-C data set on H1-hESC
-4DNFI18Q799K.hic - Micro-C XL data set on HFFc6
-4DNFIFLJLIS5.hic - in situ Hi-C data set on HFFc6" > README.txt
-
-mkdir -p /gbdb/hg38/bbi/hic
-cd /gbdb/hg38/bbi/hic
-ln -s /hive/data/genomes/hg38/bed/hic/* .
-
-
-#########################################################################
-# LASTZ Self/hg38 (DONE 2020-02-11 - Angie)
-    # RM #24695
-    # Re-run with updated process to include pslDropOverlap . 
- # Use "contigs" from previous run lastzSelf.2014-01-25/hg38.self.2bit - - screen -S hg38Self -t hg38Self - mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 - cat << _EOF_ > DEF -# human vs human with mouse defaults -BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz - -# TARGET: Human hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit -SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes -SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 - -# QUERY: Human hg38 -SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit -SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes -SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 - -BASE=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 -TMPDIR=/dev/shm -_EOF_ - - # NOTE FOR NEXT TIME: use -chainMinScore=10000 (at least), not 3000 - - ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ - -stop=net >& do.log & - tail -f do.log - - - # After two days, 4 jobs are running, one of which (part014.lst vs itself) crashed with - # out-of-mem error. After 3 days, 3 jobs completed but part014.lst runs lastz out of mem. - # Split part014.lst up into components, run on hgwdev (more mem). 
- mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014 - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014 - mkdir psl - cp /dev/null jobList - for t in $(cat ../tParts/part014.lst); do - tBase=$(basename $t) - for q in $(cat ../tParts/part014.lst); do - qBase=$(basename $q) - echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $t $q ../../DEF {check out exists psl/${tBase}_${qBase}.psl }" >> jobList - done - done - para create jobList - para try, check, push, etc, - # 94 of the jobs ran for 12s or less. The other 6 are chr{X_Y}_00 vs. self & each other, - # chr13_16 vs self and chr16_03 vs self. All but chr16_03 vs self completed in < 6 minutes. -#Completed: 99 of 100 jobs -#Crashed: 1 jobs -#CPU time in finished jobs: 1559s 25.98m 0.43h 0.02d 0.000 y -#IO & Wait Time: 248s 4.14m 0.07h 0.00d 0.000 y -#Average job time: 18s 0.30m 0.01h 0.00d -#Longest finished job: 321s 5.35m 0.09h 0.00d -#Submission to last job: 94681s 1578.02m 26.30h 1.10d - - # Dang, chr16_03 vs. self still runs out of mem even on hgwdev. 
- mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03 - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03 - twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 \ - chr16_03.fa - faSplit -lift=chr16_03.lift size chr16_03.fa 169000 chr16_03_split_ - faToTwoBit chr16_03_split_*.fa chr16_03_split.2bit - twoBitInfo chr16_03_split.2bit stdout | sort -k2nr > chr16_03_split.sizes - sed -re 's@CTGDIR.*@CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.2bit@; - s@CTGLEN.*@CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.sizes@;' \ - ../../../DEF > DEF.split - mkdir psl - cwd=$(pwd) - while read tBase tSize; do - while read qBase qSize; do - echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $cwd/chr16_03_split.2bit:$tBase:0-$tSize $cwd/chr16_03_split.2bit:$qBase:0-$qSize DEF.split {check out exists psl/${tBase}_${qBase}.psl}" - done < chr16_03_split.sizes - done < chr16_03_split.sizes > jobList - para create jobList - para try, check, push, etc, -#Completed: 100 of 100 jobs -#CPU time in finished jobs: 142614s 2376.89m 39.61h 1.65d 0.005 y -#IO & Wait Time: 167s 2.79m 0.05h 0.00d 0.000 y -#Average job time: 1428s 23.80m 0.40h 0.02d -#Longest finished job: 22861s 381.02m 6.35h 0.26d -#Submission to last job: 22874s 381.23m 6.35h 0.26d - # 6 hours for chr16_03_split_00 vs. itself. ~4.5h for _09 vs _00. - cat psl/*.psl \ - | liftUp -nohead -type=.psl stdout \ - chr16_03.lift error stdin \ - | liftUp -nohead -type=.psl -pslQ \ - ../psl/hg38.self.2bit:chr16_03:0-1689648_hg38.self.2bit:chr16_03:0-1689648.psl \ - chr16_03.lift error stdin - - cd .. 
- cat psl/* > ../../psl/part014.lst/part014.lst_part014.lst.psl - - # Make run.time file or doBlastzChainNet.pl won't continue: - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz - para time >& run.time - - # Resume doBlastzChainNet.pl: - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 - ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ - -continue=cat -stop=net >& do2.log & - tail -f do2.log -#Batch failed after 4 tries on chain.csh part016.lst chain/part016.lst.chain -#Command failed: -#ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/doChainRun.csh - - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run - para problems - # mostly these: -#errAbort re-entered due to out-of-memory condition. Exiting. - # one job made it through errAbort: -#needLargeMem: Out of memory - request size 564838920 bytes, errno: 12 - para time -#Completed: 59 of 68 jobs -#Crashed: 9 jobs -#CPU time in finished jobs: 24727s 412.12m 6.87h 0.29d 0.001 y -#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y -#Average job time: 409s 6.82m 0.11h 0.00d -#Longest finished job: 2350s 39.17m 0.65h 0.03d -#Submission to last job: 2462s 41.03m 0.68h 0.03d - para crashed -#chain.csh part012.lst {check out line+ chain/part012.lst.chain} -#chain.csh part017.lst {check out line+ chain/part017.lst.chain} -#chain.csh part016.lst {check out line+ chain/part016.lst.chain} -#chain.csh part015.lst {check out line+ chain/part015.lst.chain} -#chain.csh part014.lst {check out line+ chain/part014.lst.chain} -#chain.csh hg38.self.2bit:chr1_10: {check out line+ chain/hg38.self.2bit:chr1_10:.chain} -#chain.csh hg38.self.2bit:chr10_05: {check out line+ chain/hg38.self.2bit:chr10_05:.chain} -#chain.csh hg38.self.2bit:chr7_00: {check out line+ 
chain/hg38.self.2bit:chr7_00:.chain} - - # Run the jobs outside of parasol (~11h): - csh -efx chain.csh part012.lst chain/part012.lst.chain & - csh -efx chain.csh part017.lst chain/part017.lst.chain & - csh -efx chain.csh part016.lst chain/part016.lst.chain & - csh -efx chain.csh part015.lst chain/part015.lst.chain & - csh -efx chain.csh part014.lst chain/part014.lst.chain & - csh -efx chain.csh hg38.self.2bit:chr1_10: chain/hg38.self.2bit:chr1_10:.chain & - csh -efx chain.csh hg38.self.2bit:chr10_05: chain/hg38.self.2bit:chr10_05:.chain & - csh -efx chain.csh hg38.self.2bit:chr7_00: chain/hg38.self.2bit:chr7_00:.chain & - csh -efx chain.csh hg38.self.2bit:chr16_08: chain/hg38.self.2bit:chr16_08:.chain & - - # Resume doBlastzChainNet.pl again: - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27 - ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ - -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \ - -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ - -continue=chainMerge -stop=net >& do3.log & - tail -f do3.log -# *** All done ! Elapsed time: 19m11s - - # Load track w/new name chainSelfRedo to compare to existing chainSelf: - hgLoadChain -normScore -tIndex hg38 chainSelfRedo axtChain/hg38.hg38.all.chain.gz - - # No idea why but somehow the liftUp seems not to have worked for part012 and part017, - # so the all.chain had chr22_31, chr8_01 etc. :b run again again. - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run - mv chain/part012.lst.chain{,.bak} - mv chain/part017.lst.chain{,.bak} - csh -efx chain.csh part012.lst chain/part012.lst.chain >& part012.log & - csh -efx chain.csh part017.lst chain/part017.lst.chain >& part017.log & - # Those completed successfully. Dunno why the earlier ones didn't get lifted. - cd .. 
- mv hg38.hg38.all{,.oopsPartUnlifted}.chain.gz - # Reconstruct hg38.hg38.all.chain.gz (the chainMerge step is just this command): - find /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/chain -name "*.chain" \ - | chainMergeSort -inputList=stdin \ - | nice gzip -c \ - > /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/hg38.hg38.all.chain.gz - - # NOTE FOR NEXT TIME: this filtering step will be unnecessary when -minScore=10000 is used - # from the beginning. - # Filter to minScore of 10000 (too much fluff with -minScore=3000) per Jim (see #24695) - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain - mv hg38.hg38.all.chain.gz hg38.hg38.all.unfiltered.chain.gz - chainFilter hg38.hg38.all.unfiltered chain.gz -minScore=10000 \ - | gzip -c > hg38.hg38.all.chain.gz - hgLoadChain -normScore -tIndex hg38 chainSelfRedo hg38.hg38.all.chain.gz - checkTableCoords hg38 chainSelfRedo - - # Rename to chainSelf and update lastz symlinks and downloads - hgsql hg38 -e 'drop table chainSelf; drop table chainSelfLink; - rename table chainSelfRedo to chainSelf; - rename table chainSelfRedoLink to chainSelfLink;' - cd /hive/data/genomes/hg38/bed - rm lastz.self lastz.hg38 - ln -s lastzSelf.2020-01-27 lastz.self - ln -s lastzSelf.2020-01-27 lastz.hg38 - cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain - cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt . - $EDITOR README.txt - md5sum hg38.hg38.all.chain.gz > md5sum.txt - # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild. - ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ - rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ - mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ - cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/ - ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} . 
- - -######################################################################### -# NCBI ReMap alignments (DONE 2020-02-11 Angie) -# RM 24449 - mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap - cd /hive/data/genomes/hg38/bed/chainHg19ReMap - wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff - # We will need to substitute all the RefSeq chrom and contig IDs with our own names. - # The same alt contig can appear in both assemblies with the same name, so replace - # hg19 names at the beginning of the line and hg38 names after "Target=". - hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ - | sed -re 's/\./\\./;' \ - | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \ - > hg38.hg19.chromAlias.sed - hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \ - | sed -re 's/\./\\./;' \ - | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \ - >> hg38.hg19.chromAlias.sed - - # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT. - sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \ - | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \ - | pslPosTarget stdin stdout \ - | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl - - # Convert to chain for browser display. Some of the remap chains have minScore < 1000 and - # by default would be dropped by chainScore... use -minScore=0 to prevent that. 
- time pslToChain remap.hg38.hg19.psl stdout \ - | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \ - remap.hg38.hg19.chain -#real 9m31.900s -#user 9m1.624s -#sys 0m20.863s - hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain -#Loading 5315 chains into hg38.chainHg19ReMap - time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \ - /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \ - remap.axtChain.hg38.hg19.chain -#real 2m26.333s -#user 2m4.237s -#sys 0m22.071s - hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain -#Loading 2115 chains into hg38.chainHg19ReMapAxtChain - -################################################### -#Agilent SNP/CNV arrays 3/11/21 -#Downloaded by web browser -cd /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto -fetchChromSizes hg38 > hg38.chrom.sizes -bedSort hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed -uniq hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed >hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.uniq.bed -bedToBigBed hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.uniq.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bb -bedSort hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed -uniq hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed > hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.uniq.bed -bedToBigBed hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.uniq.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bb -bedSort hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed 
hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed -uniq hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed > hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.uniq.bed -bedToBigBed hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.uniq.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bb -mkdir -p /gbdb/hg38/snpCnvArrays/agilent -cd /gbdb/hg38/snpCnvArrays/agilent -ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bb -ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bb -ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bb -vi ~/kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra - -######################################################################### -# DECIPHER CNV & SNV - initial build (DONE 2022-04-08 Jonathan) -# RM 29130 - -cd /hive/data/genomes/outside/otto/decipher -mkdir 2022-04-05 -cd 2022-04-05 - -# manually fetch decipher-variants-grch38-2022-04-03.bed from DECIPHER -../buildDecipher decipher-variants-grch38-2022-04-03.bed - -for i in `cat ../decipher.tables` - do - n=$i"New" - o=$i"Old" - hgsqlSwapTables hg38 $n $i $o -dropTable3 - done - -mkdir -p /gbdb/hg38/decipher -cd /gbdb/hg38/decipher -ln -s /hive/data/outside/otto/decipher/2022-04-05/decipherCnv.bb . 
-
-#########################################################################
-# COSMIC (DONE 07-11-2023)
-# RM 29625
-
-#Fetch file
-cd /hive/data/outside/cosmic/hg38/v98/
-wget 'https://cog.sanger.ac.uk/cosmic/GRCh38/ucsc/v98/ucsc_export.bed.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1686847188&Signature=4YV3CuFKudxIhqVdWAaCe0CMAiY%3D' -O ucsc_export.bed.gz
-wget 'https://cog.sanger.ac.uk/cosmic/GRCh38/ucsc/v98/ucsc_export.bed.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1687525456&Signature=jBdJOlOOaqmMWNnOtJUyNRptVj4%3D'
-mv ucsc_export.bed.gz\?AWSAccessKeyId\=KRV7P7QR9DL41J9EWGA2\&Expires\=1687525456\&Signature\=jBdJOlOOaqmMWNnOtJUyNRptVj4\= ucsc_export.bed.gz
-
-#Reorder to columns to conform to bed 6+3
-zcat ucsc_export.bed.gz | awk -F'\t' -v OFS="\t" '{ print $1, $2, $3, $7, 0, $6, $4, $5, $8 }' | sort -k1,1 -k2,2n > cosmic.bed
-
-#Tiny bit of python to identify the broken lines in the file where chromStart > chromEnd
-
-#for line in myFile:
-#    newLine = line.split("\t")
-#    if int(newLine[1]) > int(newLine[2]):
-#        print(line)
-#        n+=1
-#print(n)
-
-#remove those broken records from the file
-cat cosmic.bed | grep -vf badRecords.bed > cosmic.fixed.bed
-
-#subtract to conform to bed format for all the items that have same start and endPos
-
-cat cosmic.fixed.bed | awk 'BEGIN {OFS="\t"} {
-if ($2 == $3)
-    print $1,$2-1,$3,$4,$5,$6,$7,$8,$9;
-else
-    print $0;
-}' > cosmic.fixedPos.bed
-
-bedToBigBed -type=bed6+3 -as=/hive/data/outside/cosmic/hg38/v98/cosmic.as /hive/data/outside/cosmic/hg38/v98/cosmic.fixedPos.bed /hive/data/genomes/hg38/chrom.sizes /hive/data/outside/cosmic/hg38/v98/cosmic.bb -tab
-
-#make symlink
-ln -s /hive/data/outside/cosmic/hg38/v98/cosmic.bb /gbdb/hg38/cosmic/cosmic.bb
-
-#This data has since been updated, see new makedoc doc/hg38/cosmicV98.txt and rm #32430
-
-##############################################################################
-# LIFTOVER TO GCA_018873775.2_hg01243.v3.0 (DONE - 2023-08-13 - Hiram)
-    ssh hgwdev
-    # going 
to need an ooc for hg38.p14.2bit - cd /hive/data/genomes/hg38 - time blat hg38.p14.2bit /dev/null /dev/null -tileSize=11 \ - -makeOoc=hg38.p14.ooc -repMatch=1024 - # Wrote 36808 overused 11-mers to hg38.p14.ooc - # real 0m50.753s - - # and ooc for this GenArk hub - cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0 - time blat GCA_018873775.2_hg01243.v3.0.2bit /dev/null /dev/null -tileSize=11 \ - -makeOoc=GCA_018873775.2_hg01243.v3.0.ooc -repMatch=1024 -# Wrote 39087 overused 11-mers to GCA_018873775.2_hg01243.v3.0.ooc -# real 0m49.426s - - mkdir /hive/data/genomes/hg38/bed/blat.GCA_018873775.2_hg01243.v3.0.2023-08-13 - cd /hive/data/genomes/hg38/bed/blat.GCA_018873775.2_hg01243.v3.0.2023-08-13 - - doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ - -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ - -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ - -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ - -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ - hg38 GCA_018873775.2 - - # trying -ram=6g to get full use of hgwdev kluster nodes - time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \ - -verbose=2 -buildDir=`pwd` \ - -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ - -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ - -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ - -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ - hg38 GCA_018873775.2) > 
doLiftOverToGCA_018873775.2.log 2>&1 - # real 12654m58.134s - - # broken after the alignment was done, with the parasol endless loop - # error message in the log file: - # select failure in rudp: Invalid argument - # killed that, cleaned the 4Tb log file, and gave up on this alignment - # since the lastz/chain/net is much better - - # see if the liftOver menus function in the browser from hg38 - # to GCA_018873775.2 - -############################################################################## -# LIFTOVER GCA_018873775.2_hg01243.v3.0 to hg38 (DONE - 2023-08-13 - Hiram) - ssh hgwdev - - mkdir /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/trackData/blat.hg38.2023-08-13 - cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/trackData/blat.hg38.2023-08-13 - - doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ - -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ - -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ - -query2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -querySizes=/hive/data/genomes/hg38/chrom.sizes \ - -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.ooc \ - GCA_018873775.2 hg38 - - # trying -ram=6g to get full use of hgwdev kluster nodes - time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \ - -verbose=2 -buildDir=`pwd` \ - -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.2bit \ - -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.chrom.sizes \ - 
-query2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -querySizes=/hive/data/genomes/hg38/chrom.sizes \ - -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/873/775/GCA_018873775.2_hg01243.v3.0/GCA_018873775.2_hg01243.v3.0.ooc \ - GCA_018873775.2 hg38) > doLiftOverToHg38.log 2>&1 - - # broken after the alignment was done, with the parasol endless loop - # error message in the log file: - # select failure in rudp: Invalid argument - # killed that, cleaned the 4Tb log file, and gave up on this alignment - # since the lastz/chain/net is much better - # real 193m24.137s - - # see if the liftOver menus function in the browser from GCA_018873775.2 - # to hg38 - -############################################################################## -# LIFTOVER TO GCA_018503275.1_NA19240.pri.mat.f1_v2 (TBD - 2023-08-14 - Hiram) - ssh hgwdev - - # ooc for this GenArk hub - cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2 - time blat GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit /dev/null /dev/null \ - -tileSize=11 -repMatch=1024 \ - -makeOoc=GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc - # Wrote 35866 overused 11-mers to GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc - # real 0m32.298s - - mkdir /hive/data/genomes/hg38/bed/blat.GCA_018503275.1_NA19240.pri.mat.f1_v2.2023-08-14 - cd /hive/data/genomes/hg38/bed/blat.GCA_018503275.1_NA19240.pri.mat.f1_v2.2023-08-14 - - ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \ - -buildDir=`pwd` -ram=4g -chainRam=16g \ - -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ - -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ - 
-querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ - -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ - hg38 GCA_018503275.1 - - # trying -ram=4g to get full use of hgwdev kluster nodes - time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \ - -verbose=2 -buildDir=`pwd` -ram=4g -chainRam=16g \ - -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -targetSizes=/hive/data/genomes/hg38/chrom.sizes \ - -query2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ - -querySizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ - -ooc=/hive/data/genomes/hg38/hg38.p14.ooc \ - hg38 GCA_018503275.1) > doLiftOverToGCA_018503275.1.log 2>&1 - # real 11370m18.026s - - # broken after the alignment was done, with the parasol endless loop - # error message in the log file: - # select failure in rudp: Invalid argument - # killed that, cleaned the 4Tb log file, and gave up on this alignment - # since the lastz/chain/net is much better - # -rw-rw-r-- 1 4363949695640 Aug 22 09:16 doLiftOverToGCA_018503275.1.log - - # see if the liftOver menus function in the browser from hg38 - # to GCA_018503275.1 - -############################################################################## -# LIFTOVER GCA_018503275.1_NA19240.pri.mat.f1_v2 to hg38 (DONE - 2023-08-14 - Hiram) - ssh hgwdev - - mkdir /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/trackData/blat.hg38.2023-08-14 - cd /hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/trackData/blat.hg38.2023-08-14 - - ~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \ - -buildDir=`pwd` -ram=4g 
-chainRam=16g \ - -debug -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ - -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ - -query2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -querySizes=/hive/data/genomes/hg38/chrom.sizes \ - -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc \ - GCA_018503275.1 hg38 - - time (~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl -verbose=2 \ - -buildDir=`pwd` -ram=4g -chainRam=16g \ - -bigClusterHub=hgwdev -dbHost=hgwdev -workhorse=hgwdev \ - -target2Bit=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.2bit \ - -targetSizes=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.chrom.sizes \ - -query2Bit=/hive/data/genomes/hg38/hg38.2bit \ - -querySizes=/hive/data/genomes/hg38/chrom.sizes \ - -ooc=/hive/data/genomes/asmHubs/genbankBuild/GCA/018/503/275/GCA_018503275.1_NA19240.pri.mat.f1_v2/GCA_018503275.1_NA19240.pri.mat.f1_v2.11.ooc \ - GCA_018503275.1 hg38) > liftOverToHg38.log 2>&1 - # real 5082m17.500s - - # this is interesting, this alignment completed and actually has good - # coverage: - cat fb.GCA_018503275.1.chain.Hg38Link.txt - # 2928654519 bases of 3032066086 (96.589%) in intersection - - # see if the liftOver menus function in the browser from GCA_018503275.1 - # to hg38 - -############################################################################## -## update grp table add new row for HPRC (DONE - 2023-08-29 - Hiram) -## existing structure: - - hgsql -e 'desc grp;' hg38 - 
-+-----------------+-----------+------+-----+---------+-------+ -| Field | Type | Null | Key | Default | Extra | -+-----------------+-----------+------+-----+---------+-------+ -| name | char(255) | NO | PRI | | | -| label | char(255) | NO | | | | -| priority | float | NO | | 0 | | -| defaultIsClosed | int(11) | YES | | NULL | | -+-----------------+-----------+------+-----+---------+-------+ - - # add one new row: - hgsql hg38 \ - -e "INSERT INTO grp VALUES ('hprc', 'Human Pangenome - HPRC', 3.6, 0);" - - # resulting table: - - hgsql -e 'select * from grp order by priority;' hg38 -+------------+------------------------------------+----------+-----------------+ -| name | label | priority | defaultIsClosed | -+------------+------------------------------------+----------+-----------------+ -| user | Custom Tracks | 1 | 0 | -| remc | Reference Epigenome Mapping Center | 1.2 | 1 | -| map | Mapping and Sequencing | 2 | 0 | -| genes | Genes and Gene Predictions | 3 | 0 | -| phenDis | Phenotype and Literature | 3.4 | 0 | -| pub | Literature | 3.5 | 0 | -| hprc | Human Pangenome - HPRC | 3.6 | 0 | -| covid | COVID-19 | 3.6 | 0 | -| singleCell | Single Cell RNA-seq | 3.7 | 0 | -| rna | mRNA and EST | 4 | 0 | -| expression | Expression | 4.5 | 0 | -| regulation | Regulation | 5 | 0 | -| compGeno | Comparative Genomics | 6 | 0 | -| varRep | Variation | 7 | 0 | -| rep | Repeats | 8 | 0 | -| x | Experimental | 10 | 1 | -+------------+------------------------------------+----------+-----------------+ - -############################################################################## -# Affy CytoScan HD track, refs #32856 (2024-01-23 Gerardo) -cd /hive/data/genomes/hg38/bed/ -mkdir genotypeArrays; cd genotypeArrays -#The user sent Gerardo a direct email with a shared folder link. Gerardo downloaded the bed files and made them available on dev. -#The user provided two bed files (https://hgwdev-gperez2.gi.ucsc.edu/~gperez2/mlq/mlq_32791/). 
Gerardo used the version 2 bed file for the track. -wget https://hgwdev-gperez2.gi.ucsc.edu/~gperez2/mlq/mlq_32791/CytoScanHD_Accel_Array.na36.bed.zip -unzip CytoScanHD_Accel_Array.na36.bed.zip -# Removed header and sorted the file -grep -v 'track' CytoScanHD_Accel_Array.na36.bed | bedSort stdin stdout > affyCytoScanHD.bed -bedToBigBed -tab -type=bed12 affyCytoScanHD.bed https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes affyCytoScanHD.bb -cd /gbdb/hg38 -mkdir genotypeArrays; cd genotypeArrays -# Making symlink for big file and raw bed file -ln -s /hive/data/genomes/hg38/bed/genotypeArrays/affyCytoScanHD.bb -ln -s /hive/data/genomes/hg38/bed/genotypeArrays/CytoScanHD_Accel_Array.na36.bed.zip -cd ~/kent/src/hg/makeDb/trackDb/human/hg38 -vi trackDb.ra - -############################################################################## -# LASTZ Human Hg38 vs. California sea lion GCF_009762305.2 -# (DONE - 2024-03-06 - jairo) - - mkdir /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06 - cd /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06 - - printf '# California sea lion GCF_009762305.2 vs. 
Human Hg38 -BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz - -# TARGET: Human hg38 -SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit -SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes -SEQ1_CHUNK=20000000 -SEQ1_LAP=10000 -SEQ1_LIMIT=40 - -# QUERY: California sea lion 2020-07-14 GCF_009762305.2_mZalCal1.pri.v2 -SEQ2_DIR=/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit -SEQ2_LEN=/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt -SEQ2_CHUNK=20000000 -SEQ2_LAP=0 -SEQ2_LIMIT=100 - -BASE=/hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06 -TMPDIR=/dev/shm - -' > DEF - - time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl -trackHub -noDbNameCheck -verbose=2 `pwd`/DEF -syntenicNet \ - -qAsmId GCF_009762305.2_mZalCal1.pri.v2 -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 - grep -w real do.log | sed -e 's/^/ # /;' - # real 1018m28.119s - - sed -e 's/^/ # /;' fb.hg38.chainGCF_009762305.2Link.txt - # 1633315994 bases of 3299210039 (49.506%) in intersection - sed -e 's/^/ # /;' fb.hg38.chainSynGCF_009762305.2Link.txt - # 1564193911 bases of 3299210039 (47.411%) in intersection - - time (~/kent/src/hg/utils/automation/doRecipBest.pl -trackHub -load -workhorse=hgwdev -buildDir=`pwd` \ - \ - -query2Bit="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit" \ --querySizes="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt" \ - hg38 GCF_009762305.2) > rbest.log 2>&1 - - grep -w real rbest.log | sed -e 's/^/ # /;' - # real 303m36.739s - - sed -e 's/^/ # /;' fb.hg38.chainRBest.GCF_009762305.2.txt - # 1461974620 bases of 3299210039 (44.313%) in intersection - - ### and for the swap - - cd /hive/data/genomes/asmHubs/allBuild/GCF/009/762/305/GCF_009762305.2_mZalCal1.pri.v2/trackData/blastz.hg38.swap - - time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl 
-trackHub -noDbNameCheck -swap -verbose=2 \ - -qAsmId GCF_009762305.2_mZalCal1.pri.v2 /hive/data/genomes/hg38/bed/lastzGCF_009762305.2.2024-03-06/DEF -swapDir=`pwd` \ - -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ - -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 - - grep -w real swap.log | sed -e 's/^/ # /;' - # real 103m25.220s - - sed -e 's/^/ # /;' fb.GCF_009762305.2.chainHg38Link.txt - # 1493183463 bases of 2409685272 (61.966%) in intersection - sed -e 's/^/ # /;' fb.GCF_009762305.2.chainSynHg38Link.txt - # 1457122207 bases of 2409685272 (60.469%) in intersection -\ time (~/kent/src/hg/utils/automation/doRecipBest.pl -trackHub -load -workhorse=hgwdev -buildDir=`pwd` \ - \ - -target2bit="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.2bit" \ --targetSizes="/hive/data/genomes/asmHubs/GCF/009/762/305/GCF_009762305.2/GCF_009762305.2.chrom.sizes.txt" \ - GCF_009762305.2 hg38) > rbest.log 2>&1 - - grep -w real rbest.log | sed -e 's/^/ # /;' - # real 286m31.189s - - sed -e 's/^/ # /;' fb.GCF_009762305.2.chainRBest.Hg38.txt - # 1461710350 bases of 2409685272 (60.660%) in intersection - -############################################################################## -# hg38.chromAlias.bb was incorrectly built without indexes so it will not -# work with bedToBigBed 2024-04-08 markd - -cd /hive/data/genomes/hg38/goldenPath/bigZips/initial -mv hg38.chromAlias.bb hg38.chromAlias.noindexes.bb -bigBedInfo -asOut hg38.chromAlias.noindexes.bb >hg38.chromAlias.as -bigBedToBed hg38.chromAlias.noindexes.bb hg38.chromAlias.bed -bedToBigBed -tab -type=bed3+ -as=hg38.chromAlias.as hg38.chromAlias.bed -sizesIs2Bit -extraIndex=ucsc,assembly,ensembl,genbank,refseq hg38.2bit hg38.chromAlias.bb - -############################################################################## - -# ENCODE 4 TF rPeak Clusters - RM #34930 - Lou 12/19/24 - -mkdir /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks -cd 
/hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks -hubClone -download https://users.wenglab.org/gaomingshi/TF.rpeak.test.txt -ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.rPeaks.bb /gbdb/hg38/bbi/ENCODE4/TFrPeakClusters.bb -ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.decorator.bb /gbdb/hg38/bbi/ENCODE4/TFrPeakClustersDecorator.bb -# Then just moved the files to the ENCODEv4TFrPeaks dir, moved/tweaked HTML and trackDb +# alphaMissense ticket #32269 (Jeltje, Jan 2025) +mkdir -p /hive/data/genomes/hg38/bed/alphaMissense/ +cd /hive/data/genomes/hg38/bed/alphaMissense +wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz +time python ~/kent/src/hg/makeDb/outside/alphaMissense/alphaMissenseToWig.py AlphaMissense_hg38.tsv.gz +wigToBigWig a.wig ../../chrom.sizes a.bw & +wigToBigWig c.wig ../../chrom.sizes c.bw & +wigToBigWig g.wig ../../chrom.sizes g.bw & +wigToBigWig t.wig ../../chrom.sizes t.bw & +wait + +##Colors were added using the script +#kent/src/hg/makeDb/scripts/wigColorByColors/makeWigColorByRevelCadd.py