src/hg/makeDb/doc/hg18.txt 1.378
1.378 2009/08/23 04:14:05 hartera
Documented adding new code to handle Vega Genes track and loaded a vegaGtp table.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.377
retrieving revision 1.378
diff -b -B -U 1000000 -r1.377 -r1.378
--- src/hg/makeDb/doc/hg18.txt 18 Aug 2009 22:47:49 -0000 1.377
+++ src/hg/makeDb/doc/hg18.txt 23 Aug 2009 04:14:05 -0000 1.378
@@ -1,28663 +1,28703 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 36 (October 2005 freeze)
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +---------------+-------------------------------------+
# | tableName | type |
# +---------------+-------------------------------------+
# | knownGene | genePred knownGenePep knownGeneMrna |
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | mgcGenes | genePred |
# | ensGene | genePred ensPep |
# | nscanGene | genePred nscanPep |
# | sgpGene | genePred sgpPep |
# | geneid | genePred geneidPep |
# | genscan | genePred genscanPep |
# | exonWalk | genePred |
# | ecoresTetNig1 | genePred |
# +---------------+-------------------------------------+
# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# 10/06/2005
# Make gs.19 directory, gs.19/build36 directory, and gs.19/ffa directory.
ssh kkstore02
mkdir /cluster/store11/gs.19
mkdir /cluster/store11/gs.19/build36
mkdir /cluster/store11/gs.19/agp
mkdir /cluster/store11/gs.19/ffa
# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later ?)
cd /cluster/store1
ln -s /cluster/store11/gs.19 ./gs.19
ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18
# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
cd
ln -s /cluster/store11/gs.19/build36 ~/oo
# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login. Your $HOME/.netrc file
# should be protected with 'chmod 600 .netrc' so that no one else
# can read the login credentials. (There were some early files
# that later moved into an OLD subdirectory. They were broken.)
# 11/16/2005
# Received answer from Greg to go ahead with the new build.
ssh kkstore02
mkdir /cluster/store11/gs.19/ncbi
cd /cluster/store11/gs.19/ncbi
bash
wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/*
# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence. The NC_ prefix is new to the process
# and will have to be accounted for below. The other two special
# prefixes are similar to what was seen before:
# from DR52.agp NG_002392
# Homo sapiens major histocompatibility complex, class II,
# DR52 haplotype (DR52) on chromosome 6
# and from DR53.agp NG_002433
# Homo sapiens major histocompatibility complex, class II,
# DR53 haplotype (DR53) on chromosome 6
# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera
# genome assembly. Filter those out. I don't know what the
# NT_07959[0-7] items are, but there are no definitions for them
# in the agp files and no sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
# get the seq_contig.md file Craig just made for us on 11/28/05.
cd /cluster/store11/gs.19/ncbi
wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/seq_contig.md
# remove Celera and Toronto entries
# and replace chrom number for those haplotypes
ssh hgwdev
cd /cluster/store11/gs.19/build36
egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md |grep -v CRA_TCA >seq_contig0.tab
hgsql hg18 -e 'drop table seq_contig0'
hgsql hg18 <~/src/hg/lib/seq_contig0.sql
hgsql hg18 -e 'load data local infile "seq_contig0.tab" into table seq_contig0'
# fix seq_contig and
# get the randoms sorted in proper order. The createNcbiLifts script
# does not work correctly if the randoms are not grouped together
# by chromosome.
fixMd0 hg18 |sed -e "s/6_qbl_hap1/6_qbl_hap2/"| sed -e "s/MT/M/" | grep -v "|" >seq_contig1.tab
hgsql hg18 -e 'drop table seq_contig1'
hgsql hg18 <~/src/hg/lib/seq_contig1.sql
hgsql hg18 -e 'load data local infile "seq_contig1.tab" into table seq_contig1'
fixMd hg18 seq_contig1 >seq_contig.md
# This pulls out all the randoms and groups them within the
# same chrom while leaving them in the same order as they originally
# appeared (warning: this is BASH code ...)
bash
grep "|" seq_contig0.tab | awk -F"|" '{print $1}' | \
awk '{print $2}' | sort -n -u | while read CHR
do
grep "[^0-9]${CHR}|" seq_contig0.tab
done >> seq_contig.md
exit
hgsql hg18 -e 'drop table seq_contig'
hgsql hg18 <~/src/hg/lib/seq_contig.sql
hgsql hg18 -e 'load data local infile "seq_contig.md" into table seq_contig'
# FYI: agp file format documented at:
# http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html
# fixup a couple of names for our own purposes here
cd /cluster/store11/gs.19/agp
ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz .
sed -e "s#MT/NC_001807#NC_001807#" ../ncbi/chrMT.agp > chrM.agp
cat ../ncbi/c22_H2.agp > chr22_h2_hap1.agp
cat ../ncbi/c5_H2.agp > chr5_h2_hap1.agp
cat ../ncbi/c6_COX.agp > chr6_cox_hap1.agp
cat ../ncbi/c6_QBL.agp > chr6_qbl_hap2.agp
cp -p ../ncbi/c22_H2.fa.gz chr22_h2_hap1.fa.gz
cp -p ../ncbi/c5_H2.fa.gz chr5_h2_hap1.fa.gz
cp -p ../ncbi/c6_COX.fa.gz chr6_cox_hap1.fa.gz
cp -p ../ncbi/c6_QBL.fa.gz chr6_qbl_hap2.fa.gz
mkdir sav
cp -p *hap*.agp sav
# fix hap type agp files that have multiple contigs.
fixAgp hg18 sav/chr5_h2_hap1.agp chr5_h2_hap1.agp
fixAgp hg18 sav/chr6_qbl_hap2.agp chr6_qbl_hap2.agp
# PLEASE NOTE THAT THESE TWO CORRECTED .agp FILES ABOVE ARE USED LATER,
# NOT BY THE NEXT STEP IMMEDIATELY.
# Put all the agp files together into one.
# The chrM sequence now has its own agp, remove it from
# ref_placed.agp
# sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp
# PLEASE NOTE THAT THE ORIGINAL NCBI .agp FILES FOR THOSE
# SPECIAL HAP TYPE SEQUENCES ARE USED, NOT THE CORRECTED ONES.
cd /cluster/store11/gs.19/build36
cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
../ncbi/c22_H2.agp \
../ncbi/c5_H2.agp \
../ncbi/c6_COX.agp \
../ncbi/c6_QBL.agp \
../ncbi/PAR.agp > ncbi_build36.agp
# cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
# ../agp/chr22_h2_hap1.agp ../agp/chr5_h2_hap1.agp \
# ../agp/chr6_cox_hap1.agp ../agp/chr6_qbl_hap2.agp \
# ../ncbi/PAR.agp > ncbi_build36.agp
zcat ../ncbi/chrMT.fa.gz | \
sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \
gzip > chrM.fa.gz
# and into ffa
cd /cluster/store11/gs.19/ffa
# NO LONGER TRUE FOR GS19!
# There is a single bogus line at the end of ref_placed.fa.gz
# declaring the NC_001807 MT sequence; this was later replaced by
# chrMT.fa.gz, so remove that one line:
zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \
gzip > ref_placed.fa.gz
# (That's a 40 minute job)
# sequence.inf is usually here, symlink it
#ln -s ../ncbi/sequence.inf
ln -s ../ncbi/chromosome_extents.inf
# put all the fa.gz files together in one big fa.gz
# time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \
time zcat ../ncbi/ref_placed.fa.gz ../ncbi/ref_unplaced.fa.gz \
../agp/*hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \
> ncbi_build36.fa.gz
# Make a listing of all the fasta record headers, just FYI:
cd /cluster/store11/gs.19
zcat ffa/ncbi_build36.fa.gz | grep "^>" > ncbi.fa.headers
# Sanity check, checkYbr was updated to handle the NC_ identifier
cd /cluster/store11/gs.19/build36
zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/checkYbr ncbi_build36.agp stdin seq_contig.md >check.seq_contig
# result should be clean:
cat check.seq_contig
# Read 378 contigs from ncbi_build36.agp
# Verifying sequence sizes in stdin
# 0 problems detected
# Convert fa files into UCSC style fa files and place in "contigs"
# directory inside the gs.19/build36 directory
# (a check that can be done here is make a list of the contigs
# in this ./contigs directory before and compare it with the
# list of distributed contigs created after they have been
# disbursed.)
# faNcbiToUcsc was fixed to handle the NC_ identifier
cd /cluster/store11/gs.19/build36
# We've been through this often
# mv contigs contigs.0
zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/faNcbiToUcsc \
-split -ntLast stdin contigs
# If you want to compare anything to previous work, check now, then:
# rm -fr contigs.0
# Determine the chromosome sizes from agps
# Watch carefully how chrY gets constructed. I'm not sure
# this chrom_sizes represents the whole length of chrY with
# the PAR added. We will see about that.
# Script updated to handle new chrom names:
# my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2');
cd /cluster/store11/gs.19/build36
/cluster/bin/scripts/getChromSizes ../agp
# Create chrom.lst list for use in foreach() loops
awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst
# Create lift files (this will create chromosome directory structure) and
# inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .
# Create contig agp files (will create contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build36.agp .
# Create chromosome random agp files.
/cluster/bin/scripts/createNcbiChrAgp -randomonly .
# Copy the original chrN.agp files from the gs.19/agp directory
# into each of the chromosome directories since they contain better
# gap information. Delete the comments at top from these.
cd /cluster/store11/gs.19/build36
foreach c ( `cat chrom.lst` )
sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp
end
# chrM needs a name fixup
sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).
# Create inserts file from agp and lift files (new - added by Terry, 2004-07-12)
/cluster/bin/scripts/createInserts /cluster/data/hg18 > /cluster/data/hg18/inserts
# create global data link for everyone. No more home directory
# links required.
ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18
cd /cluster/data/hg18
/cluster/bin/scripts/distNcbiCtgFa contigs .
# Verify that everything was moved properly, the contigs directory
# should be empty:
ls contigs
# Nothing there, then remove it
rmdir contigs
# Make a list of the contigs for use later
rm contig.lst
touch contig.lst
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "${chrom}/${contig}/${contig}.fa" >> contig.lst
end
end
# For later comparisons, this is how many contigs we have:
wc -l contig.lst
# 378 contig.lst
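# (Optional sanity check, not part of the original procedure: verify
# that every path listed in contig.lst actually exists on disk.)
foreach f ( `cat contig.lst` )
if (! -e $f) echo "MISSING: $f"
end
# no output expected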
# Note 2004-06-30 - there are some clone numbers left in some of
# the NCBI files that are incorrect. Due to version number
# changes, more than one version is listed. Namely for accession
# numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654
# The AGP files are correct, the sequence.inf file lists these
# twice: AC004491.1 AC004491.2
# AC004921.1 AC004921.2 AC004983.2 AC004983.3
# AC005088.2 AC005088.3 AC006014.2 AC006014.3
# AC099654.4 AC099654.5
# for hg18, NCBI did not provide the seq.inf file.
# FILES ARE NOW READY FOR REPEAT MASKING - start that process as
# other steps here can proceed in parallel.
# Previous practice used to copy everything over for jkStuff from a
# previous build. Rather than do that, pick up whatever is needed
# at the time it is needed and verify that it is going to do what
# you expect.
cd /cluster/data/hg18
mkdir jkStuff
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file
# This is properly done below with a combination of psLayout
# alignments to create the contig_overlaps.agp file
# /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files
# jkStuff/liftGl.csh contig.gl
# CREATING DATABASE (DONE - 2005-11-30 - Fan)
ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 1.3T 356G 79% /var/lib/mysql
# Create the database.
hgsql -e 'create database hg18' mysql
# Copy over grp table (for track grouping) from another database:
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" hg18
# The DB updates to grp below are not needed since we copied from hg17.
# ENCODE groups
# Added 2005-08-16 kate
echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg18
echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg18
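# (Optional, not part of the original steps: eyeball the resulting
# track group ordering.)
hgsql hg18 -e 'select name, label, priority from grp order by priority'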
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS
# (DONE - 2005-12-02 - Fan)
# Make nib/, unmasked until RepeatMasker and TRF steps are done.
# Do this now so that the chromInfo table will exist and thus the
# trackDb tables can be built in the next step.
# These unmasked nibs will be replaced by the masked nibs after
# repeat mask and trf are done.
ssh kkstore02
cd /cluster/data/hg18
cp /cluster/data/hg17/jkStuff/chrFa.csh jkStuff -p
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg17/jkStuff, renamed it to chrFa.csh
bash
time ./jkStuff/chrFa.csh
# real 2m34.406s
# user 1m17.405s
# sys 0m16.730s
exit
mkdir nib
foreach c (`cat chrom.lst`)
foreach f ($c/chr${c}{,_random}.fa)
if (-e $f) then
echo "nibbing $f"
/cluster/bin/i386/faToNib $f nib/$f:t:r.nib
endif
end
end
# Make symbolic links from /gbdb/hg18/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg18/nib
ln -s /cluster/data/hg18/nib/chr*.nib /gbdb/hg18/nib
# Load /gbdb/hg18/nib paths into database and save size info.
cd /cluster/data/hg18
hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \
> chrom.sizes
# You can compare this chrom.sizes with the previously created
# chrom_sizes. Should be no difference
sort chrom_sizes > s0
sort chrom.sizes | grep -v random > s1
diff s0 s1
rm s0 s1
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2005-12-06 - Fan)
# dbDb orderKey updated 2005-12-06 - Fan
ssh hgwdev
# reset dbDb orderKey - these have never been ordered properly
# before; this will get them into a sensible order.
hgsql -e 'update dbDb set orderKey=11 where name = "hg17";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=12 where name = "hg16";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=13 where name = "hg15";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=14 where name = "hg13";' \
-h genome-testdb hgcentraltest
# Enter hg18 into hgcentraltest.dbDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
"chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \
"/gbdb/hg18/html/description.html", 0, 0, "NCBI Build 36.1");' \
-h genome-testdb hgcentraltest
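# (Optional sanity check, not in the original log: confirm the human
# assemblies now sort in the intended order.)
hgsql -h genome-testdb hgcentraltest \
-e 'select name, orderKey, description from dbDb where genome = "Human" order by orderKey'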
# Make trackDb table so browser knows what tracks to expect:
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit the makefile to add hg18 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2005-12-07 Fan)
cd /cluster/data/hg18
mkdir -p jkStuff
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
# mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# XXXX - appears to be unused, not done - Hiram
# REPEAT MASKING (DONE - 2005-12-09 - Fan)
# Record the RM version here:
# as this changes over time and there is no record in the results
ls -l /cluster/bluearc/RepeatMasker
# lrwxrwxrwx 1 angie protein 18 Nov 3 10:40
# /cluster/bluearc/RepeatMasker -> RepeatMasker051101
# beware that you can not actually include the precise single line output
# by this command since it is a CVS ident line and it will get
# changed as this file is checked into CVS. Remove the Id and
# dollar sign business to allow it to stay as it is here.
/cluster/bluearc/RepeatMasker/RepeatMasker | head -1
# RepeatMasker version development-:
# RepeatMasker,v 1.10 2005/11/03 18:39:27 angie Exp
cat /cluster/bluearc/RepeatMasker051101/Libraries/version
# RepBase Update 9.11, RM database version 20050112
# Split contigs, run RepeatMasker, lift results
# This split takes a few minutes
ssh kkstore02
cd /cluster/data/hg18
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "splitting ${chrom}/${contig}/${contig}.fa"
faSplit size ${chrom}/${contig}/$contig.fa 500000 \
${chrom}/${contig}/${contig}_ \
-lift=${chrom}/${contig}/$contig.lft -maxN=500000
end
end
#- Make the run directory and job list:
cd /cluster/data/hg18
mkdir -p jkStuff
# According to RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human)
# This run script saves the .tbl file to be sent to Arian. He uses
# those for his analysis. Sometimes he needs the .cat and .align files for
# checking problems. Krish needs the .align files; they are large.
cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg18/$2
/bin/cp $2 /tmp/hg18/$2/
cd /tmp/hg18/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg18/$2/$2.out ./
if (-e /tmp/hg18/$2/$2.align) /bin/cp /tmp/hg18/$2/$2.align ./
if (-e /tmp/hg18/$2/$2.tbl) /bin/cp /tmp/hg18/$2/$2.tbl ./
# if (-e /tmp/hg18/$2/$2.cat) /bin/cp /tmp/hg18/$2/$2.cat ./
/bin/rm -fr /tmp/hg18/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg18/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg18
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMHuman
ssh kkstore02
cd /cluster/data/hg18
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach d ( `cat chrom.lst` )
foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa )
set f = $c:t
set cc = $c:h
set contig = $cc:t
echo /cluster/store11/gs.19/build36/jkStuff/RMHuman \
/cluster/store11/gs.19/build36/${d}/${contig} $f \
'{'check out line+ /cluster/store11/gs.19/build36/${d}/${contig}/$f.out'}' \
>> RMRun/RMJobs
end
end
# We have 5990 jobs in RMJobs:
wc RMRun/RMJobs
# 5990 41930 1127992 RMRun/RMJobs
#- Do the run
ssh pk
cd /cluster/data/hg18/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#- While that is running, you can run TRF (simpleRepeat) on the small
# cluster. See SIMPLE REPEAT section below
# Completed: 5990 of 5990 jobs
# CPU time in finished jobs: 30661460s 511024.34m 8517.07h 354.88d 0.972 y
# IO & Wait Time: 38038s 633.96m 10.57h 0.44d 0.001 y
# Average job time: 5125s 85.42m 1.42h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 6693s 111.55m 1.86h 0.08d
# Submission to last job: 86532s 1442.20m 24.04h 1.00d
# Lift up the split-contig .out's to contig-level .out's
#
# If a mistake is made in the following it would be possible to
# destroy all the RM output. So, just to be paranoid, save all
# the RM output in bluearc for the time being:
ssh kkstore02
cd /cluster/data/hg18
mkdir /cluster/bluearc/hg18/RMOutput
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
set T = /cluster/bluearc/hg18/RMOutput/${d}
mkdir -p ${T}
cd ${d}
set contig = $d:t
cp -p ${contig}_?{,?,??}.fa.out ${T}
cd ../..
echo "${d} done"
end
end
# Make sure we got them all:
# (this count won't work later since there are more *.fa.out files
# after the lifting; to find just the split-contig outputs explicitly:
# find . -name "N?_*_*.fa.out" -print | wc -l )
find . -name "*.fa.out" -print | wc -l
# 5990
find /cluster/bluearc/hg18/RMOutput -type f | wc -l
# 5990
# same count
# OK, now you can try this operation, do it in a script like this
# and save the output of the script for a record of what happened.
cat << '_EOF_' > jkStuff/liftRM.csh
#!/bin/csh -fe
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
cd ../..
end
end
'_EOF_'
chmod +x jkStuff/liftRM.csh
mkdir scriptsOutput
script lift.log
bash
time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1
exit
exit
# Check that they all were done:
grep "fa.out" scriptsOutput/liftRM.1 | wc -l
# 5990
# same count as above
#- Lift up RepeatMasker .out files to chromosome coordinates:
# picked up jkStuff/liftOut2.sh from the hg17 build, renamed it to
# liftOut2.csh, and changed the line that does the chrom listing
bash
time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1
# real 0m30.488s
# user 0m24.670s
# sys 0m2.797s
# seems much faster than hg17 ???
# hg17 numbers:
# real 9m46.780s
# user 1m18.900s
# sys 7m33.990s
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg18
bash
time hgLoadOut hg18 ?/*.fa.out ??/*.fa.out *hap*/*.fa.out > \
scriptsOutput/hgLoadOut 2>&1
# real 9m9.045s
# user 2m19.500s
# sys 0m24.440s
# errors during this load: (there are always a couple of these)
# Strange perc. field -1.2 line 153851 of 2/chr2.fa.out
# Strange perc. field -10423.3 line 174747 of 3/chr3.fa.out
# Strange perc. field -5635.9 line 174747 of 3/chr3.fa.out
# Strange perc. field -259.3 line 174747 of 3/chr3.fa.out
# Strange perc. field -1.4 line 205545 of 4/chr4.fa.out
# Strange perc. field -0.1 line 167690 of 7/chr7.fa.out
# Strange perc. field -1331.2 line 198656 of 7/chr7.fa.out
# Strange perc. field -1460.4 line 198656 of 7/chr7.fa.out
# Strange perc. field -4.2 line 223183 of 7/chr7.fa.out
# Strange perc. field -3192.0 line 60424 of 8/chr8.fa.out
# Strange perc. field -423.4 line 60424 of 8/chr8.fa.out
# Strange perc. field -784.0 line 60424 of 8/chr8.fa.out
# Strange perc. field -0.1 line 52020 of X/chrX.fa.out
# Strange perc. field -4526.7 line 190254 of X/chrX.fa.out
# Strange perc. field -3757.2 line 190254 of X/chrX.fa.out
# Strange perc. field -597.2 line 190254 of X/chrX.fa.out
# Strange perc. field -13030.4 line 137624 of 16/chr16.fa.out
# Strange perc. field -1359.8 line 137624 of 16/chr16.fa.out
# Strange perc. field -2223.5 line 137624 of 16/chr16.fa.out
# Strange perc. field -1.3 line 11573 of 22/chr22.fa.out
# Strange perc. field -12.7 line 69873 of 22/chr22.fa.out
# Verify we have similar results to previous assembly:
# featureBits hg18 rmsk
# 1406290513 bases of 3107677273 (45.252%) in intersection
# featureBits -countGaps hg17 rmsk
# 1390952984 bases of 3095016460 (44.942%) in intersection
# featureBits hg17 rmsk
# 1391378842 bases of 2867328468 (48.525%) in intersection
# featureBits hg16 rmsk
# 1388770568 bases of 2865248791 (48.469%) in intersection
# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# following the SIMPLE REPEAT sections below
# let Rachel know that RepeatMask is done.
# SIMPLE REPEAT [TRF] TRACK (DONE - 2005-12-07 - Fan)
# Copy the contigs, first to the bluearc, then to /iscratch/i
ssh kkstore02
mkdir /cluster/bluearc/hg18
mkdir /cluster/bluearc/hg18/contigs
cd /cluster/data/hg18
foreach ctg ( `cat contig.lst` )
set c = $ctg:t
echo "$ctg > /cluster/bluearc/hg18/contigs/$c"
cp -p $ctg /cluster/bluearc/hg18/contigs/$c
end
# Check how much is there:
# du -hsc /cluster/bluearc/hg18/contigs
# 2.8G /cluster/bluearc/hg18/contigs
exit
# Distribute contigs to /iscratch/i
ssh pk
mkdir -p /san/sanvol1/scratch/hg18/unmaskedContigs
cd /san/sanvol1/scratch/hg18/unmaskedContigs
cp -p /cluster/bluearc/hg18/contigs/* .
ls .
# Verify same amount made it there:
# du -hsc /san/sanvol1/scratch/hg18/unmaskedContigs
# 2.9G /san/sanvol1/scratch/hg18/unmaskedContigs
# Then send them to the other 7 Iservers
# /cluster/bin/iSync
# Go to the small cluster for this business:
ssh pk
mkdir -p /cluster/data/hg18/bed/simpleRepeat
cd /cluster/data/hg18/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /san/sanvol1/scratch/hg18/unmaskedContigs/*.fa > genome.lst
gensub2 genome.lst single gsub jobList
para create jobList
para try
para check
para push
para check
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 18956s 315.93m 5.27h 0.22d 0.001 y
# IO & Wait Time: 2519s 41.98m 0.70h 0.03d 0.000 y
# Average job time: 57s 0.95m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2345s 39.08m 0.65h 0.03d
# Submission to last job: 2427s 40.45m 0.67h 0.03d
bash
liftUp simpleRepeat.bed /cluster/data/hg18/jkStuff/liftAll.lft \
warn trf/*.bed > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/hg18/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed hg18 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 629076 elements of size 16
# Compare with previous assembly
featureBits hg18 simpleRepeat
# 56164158 bases of 3107677273 (1.807%) in intersection
# featureBits hg17 simpleRepeat
# 54952425 bases of 2866216770 (1.917%) in intersection
# featureBits hg16 simpleRepeat
# 54320136 bases of 2865248791 (1.896%) in intersection
# GAPS weren't in hg18 yet at this point, after gaps added:
# featureBits hg18 simpleRepeat
# 54964044 bases of 2867328468 (1.917%) in intersection
# featureBits -countGaps hg18 simpleRepeat
# 54964044 bases of 3096628158 (1.775%) in intersection
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir microsat
cd microsat
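# The awk filter below keeps perfect di- and tri-nucleotide repeats:
# in the 16-column simpleRepeat bed (fields as in simpleRepeat.sql,
# which has no bin column in the bed file), $5 = period size,
# $6 = copy count, $8 = perMatch, $9 = perIndel and $16 = repeat motif,
# so this selects period 2 or 3, >= 15 copies, 100% match, no indels,
# and names each item e.g. "21xCA".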
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed hg18 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-12-09 - Fan)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/simpleRepeat
cd /cluster/data/hg18/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
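# (Quick check, a sketch: there should be one filtered file per input
# bed -- 378 files in each directory.)
ls trf | wc -l
ls trfMask | wc -l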
# The 4 lines below were left over from makeHg17.doc.
# EXPERIMENT, at a filter of <= 12, we have coverage:
# 20904399 bases of 2867328468 (0.729%) in intersection
# at a filter of <= 9, we have coverage:
# 19271270 bases of 2867328468 (0.672%) in intersection
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/hg18
mkdir bed/simpleRepeat/trfMaskChrom
foreach c ( `cat chrom.lst` )
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2005-12-09, Fan)
# This used to be done right after RepeatMasking. Now, we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above,
# and after Repeat Masker is complete.
ssh kkstore02
cd /cluster/data/hg18
# Make chr*.fa from contig .fa
# chrFa.csh was already copied from hg17/jkStuff
bash
time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 &
# real 2m35.734s
# user 1m18.351s
# sys 0m16.596s
# much faster than hg17 numbers as shown below. ???
# old hg17 numbers:
# real 13m18.512s
# user 9m1.670s
# sys 1m7.290s
#- Soft-mask (lower-case) the contig and chr .fa's
time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1
# real 8m47.289s
# user 3m45.698s
# sys 1m44.416s
# old hg17 numbers:
# real 29m31.623s
# user 13m49.700s
# sys 5m58.750s
#- Make hard-masked .fa.masked files as well:
time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1
# real 5m48.833s
# user 1m41.926s
# sys 0m52.084s
#- Create the bothMasksNib/ directory
time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1
# real 2m23.280s
# user 1m6.462s
# sys 0m19.795s
# old hg17 numbers:
# real 14m41.694s
# user 6m28.000s
# sys 1m42.500s
# Make symbolic links from /gbdb/hg18/nib to the real nibs.
ssh hgwdev
cd /cluster/store11/gs.19/build36
mv nib nib.raw
mv bothMasksNib nib
rm /gbdb/hg18/nib/*.nib
ln -s `pwd`/nib/* /gbdb/hg18/nib
# Load /gbdb/hg18/nib paths into database and save size info.
cd /cluster/data/hg18
hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa
# 3107677273 total bases
# Should be the same size as before
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \
> chrom.sizes.masked
diff chrom.sizes chrom.sizes.masked
# should be no output at all, thus:
rm chrom.sizes.masked
# Copy the masked contig fa to /scratch and /iscratch
# And everything else we will need for blastz runs, etc ...
# Best to do this sequence first to /cluster/bluearc/scratch,
# which is going to be the source for the /scratch copy.
# And then from there to the /iscratch
# Make sure you are on the fileserver for the original source:
ssh kkstore02
mkdir -p /cluster/bluearc/scratch/hg/gs.19/build36
cd /cluster/bluearc/scratch/hg/gs.19/build36
# these copies take less than 2 minutes each
mkdir bothMaskedNibs
cp -p /cluster/data/hg18/nib/*.nib ./bothMaskedNibs
mkdir maskedContigs
foreach chrom ( `cat /cluster/data/hg18/chrom.lst` )
cp -p /cluster/data/hg18/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \
./maskedContigs
echo "done ${chrom}"
end
# make sure you have them all:
ls maskedContigs | wc -l
# 378
wc -l /cluster/data/hg18/contig.lst
# 378
mkdir rmsk
foreach chrom ( `cat /cluster/data/hg18/chrom.lst` )
cp -p /cluster/data/hg18/${chrom}/*.out ./rmsk
echo "done ${chrom}"
end
# Now, go to the destination for /iscratch and copy from the
# bluearc
ssh kkr1u00
mkdir -p /iscratch/i/gs.19/build36
cd /iscratch/i/gs.19/build36
# This takes about 5 minutes
rsync -arlv /cluster/bluearc/scratch/hg/gs.19/build36/ .
bash
time /cluster/bin/iSync
# real 7m27.649s
# request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch
# Ask sysadmin to bring up BLAT server.
# update central dbDb table to add the new blat server entry
echo 'INSERT INTO blatServers (db, host, port, isTrans) \
VALUES ("hg18", "blat19", "17778", "1"); \
INSERT INTO blatServers (db, host, port, isTrans) \
VALUES ("hg18", "blat19", "17779", "0");' \
| hgsql -h genome-testdb hgcentraltest
# LOAD ctgPos table - Contig position track
# After fixing up hgCtgPos to accept the -chromLst argument, simply:
cd /cluster/data/hg18
hgCtgPos -chromLst=chrom.lst hg18 .
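# (Optional check, not in the original log: ctgPos is expected to have
# one row per contig, matching the 378 contigs counted earlier.)
hgsql -N -e 'select count(*) from ctgPos' hg18
wc -l contig.lst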
# GOLD AND GAP TRACKS (DONE - 2005-12-10 - Fan)
# (RE-DONE - 2006-04-06 - Fan)
ssh hgwdev
cd /cluster/data/hg18
# manually edit the 4 haplotype .agp files to change the first col from
# contig IDs into chrom name.
hgGoldGapGl -noGl -chromLst=chrom.lst hg18 /cluster/data/hg18 .
# Disappointing to see this create so many tables ...
# _gap and _gold for each chrom
# contig.gl ... section skipped for the time being. (Fan 2005-12-13).
#############################################################################
# GC5BASE (DONE - 2005-12-13 - Fan)
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/gc5Base
cd /cluster/data/hg18/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 hg18 \
/cluster/data/hg18/nib | wigEncode stdin gc5Base.wig gc5Base.wib
# runs for about 17 minutes
# load database
ssh hgwdev
cd /cluster/data/hg18/bed/gc5Base
mkdir /gbdb/hg18/wib
ln -s `pwd`/gc5Base.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 gc5Base gc5Base.wig
# verify index is correct:
hgsql hg18 -e "show index from gc5Base;"
# should see good numbers in Cardinality column
#########################################################################
# GENBANK auto update (DONE 2005-12-13 Fan)
# align with revised genbank process. drop xeno ESTs.
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add hg18
# hg18
hg18.serverGenome = /cluster/data/hg18/nib/chr*.nib
hg18.clusterGenome = /scratch/hg/gs.18/build36/bothMaskedNibs/chr*.nib
hg18.ooc = /scratch/hg/h/11.ooc
hg18.lift = /cluster/store11/gs.19/build36/jkStuff/liftAll.lft
hg18.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
hg18.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg18.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg18.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
#hg18.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
#hg18.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter}
#hg18.genbank.est.xeno.load = yes
hg18.refseq.mrna.xeno.load = yes
hg18.refseq.mrna.xeno.loadDesc = yes
hg18.mgcTables.default = full
hg18.mgcTables.mgc = all
hg18.downloadDir = hg18
### NOTE: in the future, enable orfeome tracks as part of this (markd)
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial hg18 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad hg18&
# CPGISLANDS (DONE - 2005-12-14 - Fan)
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/cpgIsland
cd /cluster/data/hg18/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal at watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
mv cpglh.exe /cluster/data/hg18/bed/cpgIsland/
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh kkstore02
cd /cluster/data/hg18/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# the warnings:
# Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random
# Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
# Input columns:
# chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected
# chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94
# Output columns:
# chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp
# chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
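# (Quick sanity check, a sketch: the awk reformatting should neither
# drop nor add lines.)
cat chr*.cpg | wc -l
wc -l cpgIsland.bed
# the two counts should agree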
ssh hgwdev
cd /cluster/data/hg18/bed/cpgIsland
hgLoadBed hg18 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg18
########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE - 2005-12-16 - Fan)
# RELOADED PEPTIDE TABLE, GENSCANPEP (DONE, 2006-07-11, hartera)
ssh hgwdev
mkdir /cluster/data/hg18/bed/genscan
cd /cluster/data/hg18/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh kkstore02
cd /cluster/data/hg18/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not entirely Ns* (pure-N contigs, due to heterochromatin or
# unsequenceable stuff, would make genscan run forever).
rm -f genome.list
bash
for f in `cat /cluster/data/hg18/contig.lst`
do
egrep '[ACGT]' /cluster/data/hg18/$f.masked > /dev/null
if [ $? = 0 ]; then
echo /cluster/data/hg18/$f.masked >> genome.list
fi
done
# exit your bash shell if you are [t]csh ...
# This egrep matched all the contigs in hg18; evidently none of
# them are entirely Ns at this point.
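# (Optional check, a sketch: since no contig was all Ns, genome.list
# should cover all 378 contigs.)
wc -l genome.list /cluster/data/hg18/contig.lst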
# Log into kki (not kk!). kki is the driver node for the small
# cluster (kkr2u00-kkr8u00). Genscan has problems running on the
# big cluster due to limited memory and swap space on each
# processing node.
ssh kki
cd /cluster/data/hg18/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 377 of 378 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 78976s 1316.27m 21.94h 0.91d 0.003 y
# IO & Wait Time: 4961s 82.68m 1.38h 0.06d 0.000 y
# Average job time: 223s 3.71m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3491s 58.18m 0.97h 0.04d
# Submission to last job: 7541s 125.68m 2.09h 0.09d
# Running the single failed job by hand (on kkr7u00) with a smaller window:
ssh kkr7u00.kilokluster.ucsc.edu
/cluster/bin/x86_64/gsBig /cluster/data/hg18/5/NT_006576/NT_006576.fa.masked \
gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \
-subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000
# to something lower. In build33, this was 22/NT_011519
# In build34 there were NO failures !
# Convert these to chromosome level files as so:
ssh kkstore02
cd /cluster/data/hg18/bed/genscan
$HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf
$HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \
warn subopt/N*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/hg18/bed/genscan
ldHgGene hg18 genscan genscan.gtf
# Reading genscan.gtf
# Read 43122 transcripts in 329799 lines in 1 files
# 43122 groups 49 seqs 1 sources 1 feature types
# 43122 gene predictions
hgPepPred hg18 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg18 genscanSubopt genscanSubopt.bed
# Reading genscanSubopt.bed
# Loaded 514065 elements of size 6
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg18
# featureBits hg18 genscan
# 56039161 bases of 2881515245 (1.945%) in intersection
# featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
# featureBits hg16 genscan
# 55333689 bases of 2865248791 (1.931%) in intersection
# featureBits hg18 genscanSubopt
# 55685959 bases of 2881515245 (1.933%) in intersection
# featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# featureBits hg16 genscanSubopt
# 56082952 bases of 2865248791 (1.957%) in intersection
# Should be zero intersection with rmsk
# featureBits -chrom=chr1 hg18 genscan rmsk
# Reload genscanPep table - requested by a user. It has been dropped
# from hgwdev.
# (hartera, 2006-07-11)
ssh hgwdev
cd /cluster/data/hg18/bed/genscan
hgPepPred hg18 generic genscanPep genscan.pep
############################################################################
# CREATE 2 BIT FILE (DONE 12/20/05, Fan)
ssh kkstore02
cd /cluster/data/hg18
faToTwoBit */chr*.fa hg18.2bit
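# (Optional verification, not in the original log: the 2bit sequence
# sizes should match chrom.sizes exactly.)
twoBitInfo hg18.2bit hg18.2bit.sizes
sort hg18.2bit.sizes > s0
sort chrom.sizes > s1
diff s0 s1
# no output expected
rm s0 s1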
# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# ZEBRAFISH (danRer3) (DONE, 2005-12-23, hartera)
ssh pk
# Blastz uses lineage-specific repeats. There are none for mouse
# and fish so use all repeats for each species as lineage-specific.
mkdir -p /san/sanvol1/scratch/hg18/linSpecRep.notInOthers
foreach f (/cluster/bluearc/hg18/linSpecRep/notInOthers/chr*.out.spec)
cp -p $f /san/sanvol1/scratch/hg18/linSpecRep.notInOthers/
end
# get only lineage specific repeats for chr1-25 and chrM
mkdir -p /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
foreach f (/cluster/data/danRer3/*/chr[0-9M]*.fa.out)
cp -p $f \
/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/$f:t:r:r.out.spec
end
# make a nib dir without random chroms
mkdir -p /san/sanvol1/scratch/hg18/chromNib
cp -p /cluster/data/hg18/nib/chr*.nib \
/san/sanvol1/scratch/hg18/chromNib
rm /san/sanvol1/scratch/hg18/chromNib/chr*_random.nib
# make a nib dir that is also just chr1-25 and chrM
mkdir -p /san/sanvol1/scratch/danRer3/chromNib
cp /cluster/data/danRer3/nib/chr[0-9M]*.nib \
/san/sanvol1/scratch/danRer3/chromNib
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.danRer3.2005-12-17
cd /cluster/data/hg18/bed
ln -s blastz.danRer3.2005-12-17 blastz.danRer3
# Three separate runs done to create chains. Runs 1 and 3 could be
# combined into one.
# RUN 1: hg18 chroms (no randoms) vs danRer3 chr1-25 and chrM using
# lineage-specific repeats.
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.danRer3
# make run dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
# make out dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut
cd chromsRun
# use parameters as for hg17 vs danRer2 - see makeHg17.doc
cat << '_EOF_' > DEF
# human (hg18) vs zebrafish (danRer3)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human (hg18)
SEQ1_DIR=/san/sanvol1/scratch/hg18/chromNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# just chroms 1-25 and chrM
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LIMIT=30
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1chroms.len
SEQ2_LEN=$BASE/S2chroms.len
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
grep -v random /cluster/data/hg18/chrom.sizes > S1chroms.len
grep -v chrUn /cluster/data/danRer3/chrom.sizes \
| grep -v chrNA > S2chroms.len
# do blastz and create chains for danRer3 chr1-25 and chrM using
# all repeats as lineage-specific repeats.
# chickenHumanTuned.gap scoring matrix is now used by axtChain if the
# linearGap parameter is set to "loose".
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-stop chainRun `pwd`/DEF >& doChains.log &
# Took 2 hours 45 minutes to run.
# Then run the human hg18 chroms and randoms vs danRer3 chrUn and chrNA
ssh hgwdev
# get a file of contig sequences for the hg18 randoms. Use the Table Browser
# to select sequence from the whole genome for the ctgPos table of contigs,
# restricting to chrom like "%_random" in the Free-form query box of
# the filter; save the output as hg18RandomContigs.fa
cd /cluster/data/hg18/bed/blastz.danRer3
# get the position and contig name from the ctgPos table
hgsql -N -e 'select chrom, chromStart, chromEnd, contig from ctgPos \
where chrom like "%_random";' hg18 > contigPosAndNames.txt
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.danRer3
# change header to just the position
perl -pi.bak -e 's/>.+range=(chr[0-9XY]+_random:[0-9]+\-[0-9]+).+/>$1/' \
hg18RandomContigs.fa
awk '{print "perl -pi.bak -e s/"$1":"$2+1"-"$3"/"$4"/ hg18RandomContigs.fa"}' \
contigPosAndNames.txt > addContigNames.csh
chmod +x addContigNames.csh
# run script
addContigNames.csh
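# (Quick check, a sketch: every fasta header should now be a contig
# name -- both counts below should be 88 and the grep for leftover
# chr*_random headers should find nothing.)
grep -c "^>" hg18RandomContigs.fa
wc -l contigPosAndNames.txt
grep "^>chr" hg18RandomContigs.fa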
ssh hgwdev
# make a 2 bit file of the chroms and random scaffolds
cd /cluster/data/hg18
set dir=/san/sanvol1/scratch/hg18
faToTwoBit [1-9]/chr[1-9].fa [12][0-9]/chr[12][0-9].fa M/chrM.fa \
X/chrX.fa Y/chrY.fa *hap[12]/chr*.fa \
/cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
$dir/chromsAndRandoms.2bit
twoBitInfo $dir/chromsAndRandoms.2bit $dir/chromsAndRandoms.len
# make a 2 bit file for just the random scaffolds
faToTwoBit /cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
$dir/randoms.2bit
twoBitInfo $dir/randoms.2bit $dir/randoms.len
# make sure all the random chroms contigs are included - should be 88.
# make a 2 bit file for all the chroms and random chroms, make sure to
# get the haplotype chrom sequences.
faToTwoBit [1-9MXY]/chr*.fa [12][0-9]/chr*.fa *hap[12]/chr*.fa \
$dir/hg18.2bit
twoBitInfo $dir/hg18.2bit $dir/hg18Chroms.len
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3.2bit \
/san/sanvol1/scratch/danRer3/danRer3Chroms.len
# make file of scaffolds lengths for NA and Un scaffolds
twoBitInfo \
/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit \
/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len
cd /cluster/data/hg18/bed/blastz.danRer3
# make a lift file for the hg18 randoms contigs
cat /cluster/data/hg18/*/lift/random.lft >> $dir/randomContigs.lft
# RUN 2: hg18 chroms and random chroms contigs vs danRer3 chrNA and
# chrUn scaffolds with no lineage-specific repeats as there are too
# many scaffolds in chrNA and chrUn. Use the dynamic masking function
# of Blastz instead.
# make run dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
# make out dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
cd chromsAndRandomsRun
# use parameters similar to hg17 vs danRer2 - see makeHg17.doc
# As lineage-specific repeats can not be used with chrUn and chrNA
# scaffolds, then use dynamic masking, M=50.
cat << '_EOF_' > DEF
# human (hg18) vs zebrafish (danRer3)
# human chroms and random chrom contigs vs zebrafish chrNA and chrUn scaffolds
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# Reuse some parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/chromsAndRandoms.2bit
SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 500 kb target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY: Zebrafish (danRer3)
# just scaffolds for chrUn and chrNA
SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit
SEQ2_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len
SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/chromsAndRandoms.len
SEQ2_LEN=/san/sanvol1/scratch/danRer3/danRer3Chroms.len
SEQ2_CTGLEN=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
# do blastz and create chains for human chroms and random chroms in contigs
# vs zebrafish danRer3 chrNA and chrUn in scaffolds without
# lineage-specific repeats but using blastz's dynamic masking.
# chickenHumanTuned.gap scoring matrix is now used by axtChain if the
# linearGap parameter is set to "loose".
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-stop chainRun `pwd`/DEF >& doChains.log &
# Took about 15 hours to finish.
ssh hgwdev
# Try running hg18 random chroms in contigs vs danRer3 chroms 1-25 and chrM
# with lineage-specific repeats.
# make directory of human contigs repeats to serve as lineage-specific
# repeats for the random chroms contigs.
mkdir -p /san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers
cd /cluster/data/hg18/bed/blastz.danRer3
awk '{print $4}' contigPosAndNames.txt > contigNames.txt
foreach c (`cat contigNames.txt`)
foreach f (/cluster/data/hg18/*/${c}/${c}.fa.out)
cp -p $f \
/san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers/$f:t:r:r.out.spec
end
end
# RUN 3: hg18 random chroms contigs vs danRer3 chr1-25 and chrM using
# lineage-specific repeats.
# make run dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun
# make out dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut
set dir=/san/sanvol1/scratch
cp $dir/hg18/blastzDanRer3/chromsRun/S2chroms.len \
$dir/danRer3/chr1to25andM.len
# make nib dir for random contigs for hg18
mkdir -p $dir/hg18/randomContigsNib
foreach c (`cat contigNames.txt`)
foreach f (/cluster/data/hg18/*/${c}/${c}.fa)
faToNib -softMask $f $dir/hg18/randomContigsNib/$f:t:r.nib
end
end
cd randomsRun
cat << '_EOF_' > DEF
# human (hg18) vs zebrafish (danRer3)
# human random chrom contigs vs zebrafish chr1-25 and chrM
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human (hg18)
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/randomContigsNib
SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# just chr1-25 and chrM
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_LIMIT=30
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len
SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/randoms.len
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chr1to25andM.len
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
# do blastz and create chains for human random chroms in contigs
# vs zebrafish danRer3 chroms 1 to 25 and chrM using all repeats
# as lineage-specific repeats.
# chickenHumanTuned.gap scoring matrix is now used by axtChain if the
# linearGap parameter is set to "loose".
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-stop chainRun `pwd`/DEF >& doChains.log &
# Took 15 minutes.
# chains are sorted by score so move into one directory and use
# chainMergeSort
ssh kolossus
set blastzDir=/cluster/data/hg18/bed/blastz.danRer3
cd $blastzDir/chromsRun/axtChain
mkdir -p chainsNotMerged
foreach r (chromsRun chromsAndRandomsRun randomsRun)
nice cp -p ${blastzDir}/${r}/axtChain/run/chain/*.chain \
${blastzDir}/chromsRun/axtChain/chainsNotMerged/
end
nice chainMergeSort ./chainsNotMerged/*.chain | nice gzip -c \
> hg18.danRer3.all.chain.gz
# split into chains by chrom
nice zcat hg18.danRer3.all.chain.gz | chainSplit chain stdin
# check chains: there are 48 but there should be 49. Chains for chr11_random
# are missing. These sequences have a lot of repeats in the regions that
# hit danRer3 with BLAT.
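# (A sketch of how the missing chrom can be identified -- not from the
# original log, using only standard tools.)
awk '{print $1}' /cluster/data/hg18/chrom.sizes | sort > allChroms.lst
ls chain | sed -e 's/\.chain$//' | sort > chainChroms.lst
comm -23 allChroms.lst chainChroms.lst
# expect chr11_random only
rm allChroms.lst chainChroms.lst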
# carry on with doBlastzChainNet.pl starting from net step
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun
mv DEF DEF.chroms
# edit DEF to give hg18.2bit as the SEQ1_DIR and danRer3.2bit as SEQ2_DIR
# and remove lineage-specific repeats.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-continue net `pwd`/DEF >& doNetAndDownloads.log &
# Took about 25 minutes.
# crashed on the ssh -X to sanhead1 during the cleanup step, so re-run the script:
cleanUp.csh
# copy chainDanRer3.html and netDanRer3.html to
# kent/src/hg/makeDb/trackDb/human/hg18/ and edit to describe method used.
# Add tracks to trackDb.ra there. Edit README.txt in the downloads
# directory to describe method used for alignments.
# featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 1.378%, chainDanRer3Link 2.601%, both 0.927%, cover 67.26%,
# enrich 25.86x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.386%, chainDanRer2Link 2.742%, both 0.909%, cover 65.58%,
# enrich 23.91x
# So similar coverage and enrichment to hg17 vs danRer2 chains.
#########################################################################
# BLASTZ MOUSE Mm7 second time (DONE - 2005-12-24 - 2005-12-25 Fan)
# After fixing a bug in the lineage specific repeat snip business
# in blastz-run-ucsc script
ssh pk
mkdir /cluster/data/hg18/bed/blastzMm7.2005-12-24
cd /cluster/data/hg18/bed
rm blastz.mm7
ln -s blastzMm7.2005-12-24 blastz.mm7
cd blastzMm7.2005-12-24
cat << '_EOF_' > DEF
# human vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
# QUERY: Mouse Mm7 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/mm7/nib
SEQ2_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow
SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMm7.2005-12-24
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > to-load.out 2>&1 &
# Started 2005-12-24 06:15
mv to-load.out to-load.out.1
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainMerge -stop=load \
`pwd`/DEF > to-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# PLEASE NOTE THAT SOME .OUT FILES MIGHT HAVE BEEN OVERWRITTEN
# DUE TO RETRIES AND/OR NEXT STEP COMMAND NOT FULLY EDITED CORRECTLY.
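# (A sketch only, not done here: time-stamping each log name would avoid the
#  overwriting noted above when a step is retried.)
LOG=doBlastz.`date +%Y%m%d.%H%M%S`.out
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=download \
    `pwd`/DEF > $LOG 2>&1 &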
# Measurements:
ssh hgwdev
featureBits mm7 chainHg18Link
# 990285408 bases of 2583394090 (38.333%) in intersection
featureBits hg18 chainMm7Link
# 991769039 bases of 2881515245 (34.418%) in intersection
# each of the above took about half an hour.
#########################################################################
# BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.galGal2
ln -s blastzGalGal2.2005-12-28 blastz.galGal2
cd blastzGalGal2.2005-12-28
cat << '_EOF_' > DEF
# human vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken GalGal2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 10:35
# Two jobs stuck in the same node. Did manual para stop and para push.
# Both finished within a few minutes.
# Done! On Wed Dec 28 15:32:45 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the net step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
# the gzip job on kolossus seems not moving at all.
# killed it manually. Try again.
# Seemed not moving, kill it again. Now use pk instead of kolossus.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Wed Dec 28 20:39:44 PST 2005
# Measurements:
ssh hgwdev
nice featureBits galGal2 chainHg18Link
# 91564024 bases of 1054197620 (8.686%) in intersection
nice featureBits hg18 chainGalGal2Link
# 102417858 bases of 2881515245 (3.554%) in intersection
nice featureBits galGal2 chainHg17Link
# 93277286 bases of 1054197620 (8.848%) in intersection
nice featureBits hg17 chainGalGal2Link
# 103882699 bases of 2866216770 (3.624%) in intersection
#########################################################################
# BLASTZ DOG CanFam2 second time (DONE - 2005-12-28 - 2005-12-29 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.canFam2
ln -s blastzCanFam2.2005-12-28 blastz.canFam2
cd blastzCanFam2.2005-12-28
cat << '_EOF_' > DEF
# human vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for dog (per Webb email to Brian Raney)
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 21:33
# Two jobs stuck in the same node. Did manual para stop and para push.
# Both finished within a few minutes.
# Done! On Thu Dec 29 05:27:31 PST 2005.
# system seems to hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving)
# manually killed the jobs.
# now use pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=chainMerge \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Done! Thu Dec 29 09:10:02 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the load step,
# mySQL error 2013: Lost connection to MySQL server during query,
# probably due to sys admin working on network connections,
# continue at the load step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -continue=load -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Dec 29 13:21
# Measurements:
ssh hgwdev
nice featureBits canFam2 chainHg18Link
# 1477551526 bases of 2384996543 (61.952%) in intersection
nice featureBits hg18 chainCanFam2Link
# 1524764349 bases of 2881515245 (52.915%) in intersection
nice featureBits canFam2 chainHg17Link
# 1487483112 bases of 2384996543 (62.368%) in intersection
nice featureBits hg17 chainCanFam2Link
# 1530197469 bases of 2866216770 (53.387%) in intersection
# ENABLE GENBANK UPDATE (1/3/06 Fan)
# add hg18 to the following two files and check them in.
src/hg/makeDb/genbank/etc/align.dbs
src/hg/makeDb/genbank/etc/hgwdev.dbs
# then go to /cluster/data/genbank/etc and do cvs update on these two files.
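# (A hedged sketch of those two steps; the exact cvs invocations may have
#  differed, and the kent tree path and commit message are just illustrative.)
cd $HOME/kent/src/hg/makeDb/genbank/etc
#   add an "hg18" entry to align.dbs and hgwdev.dbs, then:
cvs commit -m "add hg18" align.dbs hgwdev.dbs
ssh hgwdev
cd /cluster/data/genbank/etc
cvs update align.dbs hgwdev.dbs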
#########################################################################
# BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22
cd /cluster/data/hg18/bed
rm blastz.rn3
ln -s blastzRn3.2005-12-22 blastz.rn3
cd blastzRn3.2005-12-22
cat << '_EOF_' > DEF
# human vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > to-load.out 2>&1 &
# start processing again on 12/31/05.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-stop=load \
`pwd`/DEF > swap.out 2>&1 &
# Either UCSC RR and hgwdev systems or network went down around 11 AM 12/31/05.
# After holidays, start again on 1/3/06 and again on 1/5/06.
ssh pk
cd /cluster/data/hg18/bed
cd blastzRn3.2005-12-22
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-continue=net \
-stop=load \
`pwd`/DEF > swap6.out 2>&1 &
# DONE! Jan 5 13:39
# Measurements:
nice featureBits rn3 chainHg18Link
# 962630574 bases of 2571104688 (37.440%) in intersection
nice featureBits hg18 chainRn3Link
# 964251210 bases of 2881515245 (33.463%) in intersection
#########################################################################
# BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20
cd /cluster/data/hg18/bed
ln -s blastzFr1.2005-12-20 blastz.fr1
cd blastzFr1.2005-12-20
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20
'_EOF_'
# << happy emacs
# establish a screen to control this job
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=download \
`pwd`/DEF > download.clean.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -swap \
`pwd`/DEF > swap.out 2>&1 &
# Finish the remaining step, 1/4/06.
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap.out 2>&1 &
# First try found the DEF had somehow been altered for rn3.
# Re-generated DEF and try again.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap2.out 2>&1 &
# Done. Jan 4 09:48.
# measurements
nice featureBits hg18 chainFr1Link
# 51795958 bases of 2881515245 (1.798%) in intersection
nice featureBits hg17 chainFr1Link
#50831650 bases of 2866216770 (1.773%) in intersection
nice featureBits hg18 netFr1
# 691148929 bases of 2881515245 (23.986%) in intersection
nice featureBits hg17 netFr1
# 714234935 bases of 2866216770 (24.919%) in intersection
nice featureBits fr1 chainHg18Link
# 43267869 bases of 315518167 (13.713%) in intersection
nice featureBits fr1 chainHg17Link
# 0 bases of 315518167 (0.000%) in intersection
nice featureBits fr1 netHg18
# 140843080 bases of 315518167 (44.639%) in intersection
nice featureBits fr1 netHg17
# 0 bases of 315518167 (0.000%) in intersection
#########################################################################
# BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.tetNig1
ln -s blastzTetNig1.2006-01-07 blastz.tetNig1
cd blastzTetNig1.2006-01-07
cat << '_EOF_' > DEF
# human vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 05:40:51 PST 2006
# Encountered an error:
# startStep: 0, at step 5 net to stopStep 6
# netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]).
# Try it with pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Load done. Sat Jan 7 07:34:56 PST 2006
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Sat Jan 7 08:02:14 PST 2006
# The download and swap-download took less than 10 seconds each. ???
# Measurements:
ssh hgwdev
nice featureBits tetNig1 chainHg18Link
# 50026847 bases of 342403326 (14.611%) in intersection
nice featureBits hg18 chainTetNig1Link
# 57654754 bases of 2881515245 (2.001%) in intersection
nice featureBits tetNig1 chainHg17Link
# 34379509 bases of 342403326 (10.041%) in intersection
nice featureBits hg17 chainTetNig1Link
# 35910128 bases of 2866216770 (1.253%) in intersection
#########################################################################
# BLASTZ FROG XenTro1 second time (STARTED 2006-01-06, DONE 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06
cd /cluster/data/hg18/bed
rm blastz.xenTro1
ln -s blastzXenTro1.2006-01-06 blastz.xenTro1
cd blastzXenTro1.2006-01-06
cat << '_EOF_' > DEF
# human vs frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Frog XenTro1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Fri Jan 6 20:19:30 PST 2006
# Blastz run done. Jan 7 02:07 load.out
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# got the following error:
# startStep: 4, at step 5 net to stopStep 6
# netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]).
# Try it with pk instead of kolossus:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
# It worked, swap-load done. Jan 7 06:05
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 7 06:18
# Measurements:
ssh hgwdev
nice featureBits xenTro1 chainHg18Link
# 61197900 bases of 1381238994 (4.431%) in intersection
nice featureBits hg18 chainXenTro1Link
# 67810866 bases of 2881515245 (2.353%) in intersection
nice featureBits xenTro1 chainHg17Link
# 81777842 bases of 1381238994 (5.921%) in intersection
nice featureBits hg17 chainXenTro1Link
# 85701475 bases of 2866216770 (2.990%) in intersection
############################################################################
# BLASTZ COW BosTau2 second time (STARTED - 2006-01-07, DONE 2006-01-08 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.bosTau2
ln -s blastzBosTau2.2006-01-07 blastz.bosTau2
cd blastzBosTau2.2006-01-07
cat << '_EOF_' > DEF
# human vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow BosTau2 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes
SEQ2_CHUNK=3200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
-workhorse=pk \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 07:57:22 PST 2006
# blastz run (and load) done Jan 8 00:13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# took a long time to finish.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 8 21:10
# Measurements:
ssh hgwdev
nice featureBits bosTau2 chainHg18Link
# 1357027317 bases of 2812203870 (48.255%) in intersection
nice featureBits hg18 chainBosTau2Link
# 1357291762 bases of 2881515245 (47.103%) in intersection
nice featureBits bosTau2 chainHg17Link
# 0 bases of 2812203870 (0.000%) in intersection
nice featureBits hg17 chainBosTau2Link
# 1350076765 bases of 2866216770 (47.103%) in intersection
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2006-01-11 - Fan)
ssh kkstore02
cd /cluster/data/hg18
blat hg18.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30378 overused 11-mers to 11.ooc
# Copy over to the bluearc
cp -p 11.ooc /cluster/bluearc/hg18
#######################################################################
# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
# (DONE - 2006-01-12 - 2006-04-04 - Hiram)
# (RE-DONE 2006-10-31 - Hiram - see section:)
# REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
ssh kkstore02
mkdir /cluster/data/hg18/bed/coverage
cd /cluster/data/hg18/bed/coverage
# find all the clones that were used in the assembly
sed -e "/^#.*/d" ../../ncbi_build36.agp | \
awk '{if (!match($5,"N")) {print $6}}' | \
sort -u > placed_in_assembly.list
wc -l placed_in_assembly.list
# 27093 placed_in_assembly.list
# And all possible clones considered for assembly.
# The AADB clones are the Celera assembly, don't want them.
sed -e "/^#.*/d" /cluster/store11/gs.19/ncbi/sequence.inf | \
grep for_assembly | grep -v AADB | awk '{print $1}' | sort -u \
> allButOneClonesConsidered.list
(grep AADB01066164.1 \
/cluster/store11/gs.19/ncbi/sequence.inf | awk '{print $1}'; \
cat allButOneClonesConsidered.list) | sort -u \
> allClonesConsidered.list
# The grep for AADB eliminates a single clone: AADB01066164.1
# Which actually should be in the list since it is in the
# ncbi_build36.agp file. Back in Hg17, this was the only AADB
# clone in the sequence.inf file, now there are 400,673 of them in
# this Hg18 sequence.inf file marked "for_assembly"
# Later after a lot of this was done, it was discovered that some
# of the clones on this allConsidered list are actually obsolete
# and have newer versions in use. They were identified by the
# following perl script:
cat << '_EOF_' > ckMultipleVersions.pl
#!/usr/bin/env perl
use warnings;
use strict;
sub usage() {
print "usage: ./ckMultipleVersions.pl allClonesConsidered.list\n";
exit 255;
}
my $argc = scalar(@ARGV);
if ($argc != 1) { usage; }
my $fileName = shift;
open (FH,"<$fileName") or die "Can not open $fileName";
my %cloneAcc; # key is clone accession major number, value is version
while (my $clone = <FH>) {
chomp $clone;
my ($major, $version) = split('\.', $clone);
if (exists($cloneAcc{$major})) {
my $previousVersion = $cloneAcc{$major};
if ($previousVersion >= $version) {
printf STDERR "$major.$version - obsolete\n";
} else {
printf STDERR "$major.$previousVersion - obsolete\n";
$cloneAcc{$major} = $version;
}
} else {
$cloneAcc{$major} = $version;
}
}
close (FH);
foreach my $major (sort keys %cloneAcc) {
printf "$major.$cloneAcc{$major}\n";
}
'_EOF_'
# happy emacs
chmod +x ckMultipleVersions.pl
./ckMultipleVersions.pl allClonesConsidered.list \
2> obsoleteClone.list > allClones.notObsolete.list
# After this obsolete list was made, those clone results were
# removed from the kluster run hierarchies of results.
# And when we finally got to loading up the coverage track
# 2006-04-04, a few additional ones had crept into the mix.
# These were added to this list at that loading time.
comm -12 allClonesConsidered.list \
/cluster/data/hg17/bed/contig_overlaps/sequence.list \
> allClones.InHg17AndHg18.list
comm -23 allClonesConsidered.list \
/cluster/data/hg17/bed/contig_overlaps/sequence.list \
> allClones.InHg18NotHg17.list
comm -13 allClonesConsidered.list \
/cluster/data/hg17/bed/contig_overlaps/sequence.list \
> allClones.InHg17NotHg18.list
# how many are the same as previous build:
comm -12 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \
placed_in_assembly.list > sameAsHg17.list
wc sameAsHg17.list
# 26775 26775 300641 sameAsHg17.list
# There is one clone: AADB01066164.1
# Which is listed in allClones.InHg17NotHg18.list
# But it is on the Hg18 placed_in_assembly.list
# And it is on the Hg17 placed_in_assembly.list but it isn't
# actually found in Hg17? Perhaps it didn't align well enough.
comm -23 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \
placed_in_assembly.list > uniqueToHg17.list
wc uniqueToHg17.list
# 97 97 1080 uniqueToHg17.list
# and unique to hg18, not in hg17:
comm -13 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \
placed_in_assembly.list > newToHg18.list
wc newToHg18.list
# 318 318 3547 newToHg18.list
# make a list of these new contigs:
# using the previous perl scripts:
cp -p /cluster/data/hg17/bed/contig_overlaps/*.pl .
# Now, we need to distribute the clone sequence files in a
# directory hierarchy by chrom name. Using the contigAcc.pl file
# from the previous release:
cp /cluster/data/hg17/bed/contig_overlaps/contigAcc.pl .
# This newer version is generalized a bit better to take command
# line arguments for the two files it is to read instead of having
# them explicitly in the code, then:
./contigAcc.pl /cluster/data/hg18/ncbi_build36.agp \
/cluster/data/hg18/seq_contig.md > cloneToChrom.list 2>&1
# And now, since most of the clone sequence already exists in the
# Hg17 work directory, we only need to make symlinks to the
# existing ones, and move only the new ones. The following script
# will find an existing copy and symlink it correctly.
cat << '_EOF_' > createPlacedHierarchy.sh
#!/bin/sh
mkdir -p placedClones
sed -e "/^#.*/d" cloneToChrom.list | while read L
do
CHROM=`echo "${L}" | awk '{print $1}'`
CLONE=`echo "${L}" | awk '{print $2}'`
if [ ! -d "placedClones/${CHROM}" ]; then
mkdir placedClones/${CHROM}
fi
HG17_version="/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}"
HG18_version_0="/cluster/data/hg18/bed/coverage/newToHg18/${CLONE}"
HG18_version_1="/cluster/data/hg18/bed/coverage/allClones.newToHg18/${CLONE}"
if [ -f "${HG17_version}" ]; then
if [ -f "${HG18_version_0}" -o -f "${HG18_version_1}" ]; then
echo "ERROR: Why is there both an Hg17 and Hg18 version for ${CLONE}"
exit 255
fi
ln -s "/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}" \
"./placedClones/${CHROM}/${CLONE}"
else
if [ -f "${HG18_version_0}" -a -f "${HG18_version_1}" ]; then
echo "ERROR: Why are there two Hg18 copies for ${CLONE}"
exit 255
fi
if [ -f "${HG18_version_0}" ]; then
ln -s "${HG18_version_0}" "./placedClones/${CHROM}/${CLONE}"
else
if [ -f "${HG18_version_1}" ]; then
ln -s "${HG18_version_1}" "./placedClones/${CHROM}/${CLONE}"
else
# must be on a different chrom in hg17
HG17_chrom=`grep -v "^#" \
/cluster/data/hg17/bed/contig_overlaps/disburseEm.list \
| grep "^${L}$" | awk '{print $1}'`
HG17_version="/cluster/data/hg17/bed/contig_overlaps/${HG17_chrom}/${CLONE}"
if [ ! -f "${HG17_version}" ]; then
echo "ERROR: Why is there no version for ${CLONE}"
exit 255
fi
ln -s "${HG17_version}" "./placedClones/${CHROM}/${CLONE}"
fi
fi
fi
done
'_EOF_'
# happy emacs
chmod +x createPlacedHierarchy.sh
./createPlacedHierarchy.sh
# There should be no errors
# We need masked contigs for the psLayout alignments
ssh hgwdev
mkdir /cluster/data/hg18/bed/coverage/maskedContigs
cd /cluster/data/hg18/bed/coverage/maskedContigs
hgsql -N \
-e "select chrom,chromStart,chromEnd,contig,size from ctgPos;" hg18 \
> ctgPos.txt
ssh kkstore02
cd /cluster/data/hg18/bed/coverage/maskedContigs
# verify each contig only listed once:
awk '{print $4}' ctgPos.txt | sort | uniq -c | sort -n | less
# should all have a count of one
# verify all chrom sizes match the contig sizes:
awk '{print $3-$2}' ctgPos.txt > chrSize.list
awk '{print $5}' ctgPos.txt > ctgSize.list
diff ctgSize.list chrSize.list
# should be no difference
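# (Sketch, not part of the original checks: the same two verifications done
#  non-interactively; any output at all indicates a problem.)
awk '{print $4}' ctgPos.txt | sort | uniq -c | awk '$1 > 1 {print "dup contig:", $2}'
awk '{if ($3-$2 != $5) print "size mismatch:", $4}' ctgPos.txt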
# OK, now fetch the contigs from the twoBit file:
cat << '_EOF_' > 2bitToFa.pl
#!/usr/bin/env perl
use warnings;
use strict;
while (my $line=<>) {
chomp $line;
my ($chrom, $start, $end, $contig, $size) = split('\s',$line);
$chrom =~ s/chr//;
printf "echo -n 'working $contig ...'; mkdir -p $chrom; twoBitToFa /cluster/data/hg18/hg18.2bit:chr$chrom:$start-$end stdout | sed -e 's/^>.*/>$contig/' > $chrom/$contig.fa; gzip $chrom/$contig.fa; echo 'done'\n";
}
'_EOF_'
# happy emacs
chmod +x 2bitToFa.pl
cat ctgPos.txt | ./2bitToFa.pl > 2bitToFa.sh
chmod +x 2bitToFa.sh
time ./2bitToFa.sh
# and create a lift file for these contigs
cat << '_EOF_' > mkCtgLift.pl
#!/usr/bin/env perl
use warnings;
use strict;
while (my $line=<>)
{
chomp $line;
my ($start, $chrCtg, $size, $chrom, $chrLen) = split('\s',$line);
$chrCtg =~ s#.*/##;
printf "%s\t%s\t%s\t%s\t%s\n", $start, $chrCtg, $size, $chrom, $chrLen;
}
'_EOF_'
# happy emacs
chmod +x mkCtgLift.pl
cat /cluster/data/hg18/jkStuff/liftAll.lft \
| ./mkCtgLift.pl > liftContigs.lft
# Create individual ooc files for each contig
mkdir ooc
for C in `ls */*.fa.gz | sed -e "s/.fa.gz//"`
do
CONTIG=`basename ${C}`
CHR=`dirname ${C}`
mkdir -p ooc/${CHR}
zcat ${C}.fa.gz | blat -repMatch=256 \
-makeOoc=ooc/${CHR}/${CONTIG}.10.ooc -tileSize=10 \
stdin /dev/null /dev/null
echo "done: ${CONTIG}"
done
# Copy everything to san filesystem for kluster run:
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage
cd /san/sanvol1/scratch/hg18/coverage
rsync -a --progress --copy-links \
/cluster/data/hg18/bed/coverage/placedClones/ ./placedClones/
rsync -a --progress --copy-links \
/cluster/data/hg18/bed/coverage/maskedContigs/ ./maskedContigs/
mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced
cd /san/sanvol1/scratch/hg18/coverage/runPlaced
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /san/sanvol1/scratch/hg18/coverage/placedClones/<chrom>/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /san/sanvol1/scratch/hg18/coverage/maskedContigs/<chrom>/<contig>.fa.gz
#
HERE=`pwd`
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz
CLONESRC=/san/sanvol1/scratch/hg18/coverage/placedClones/$CHROM/$CLONE.fa.gz
OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc
RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl"
mkdir -p psl/${CHROM}/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}"
mkdir -p "${WRKDIR}"
cd ${WRKDIR}
zcat ${CLONESRC} > ${CLONE}.fa
zcat ${TARGET} > ${CONTIG}.fa
cp -p ${OOC} ./10.ooc
/cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT}
RET=$?
cd ${HERE}
rm -fr ${WRKDIR}
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}"
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}"
exit ${RET}
'_EOF_'
# happy emacs
chmod +x runPsLayout.sh
# create jobList from cloneToChrom.list:
grep -v "^#" /cluster/data/hg18/bed/coverage/cloneToChrom.list \
| sed -e "s/.fa.gz//" \
| awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n",
$1, $2, $3, $1, $3, $2
}' > masterJobList
# To do a quick test, run just chrM:
grep " M " masterJobList > jobList
para create jobList
para try ... check ... etc ...
# Then, the whole run:
rm -fr psl err
para create masterJobList
para try ... check ... push ... etc ...
# running 2006-01-17 16:41
# We need the phase information from the sequence.inf file:
ssh hgwdev
cd /cluster/data/hg18/bed/coverage
cp /cluster/data/hg17/phase.pl .
# this script was fixed up for hg18 to take an argument to the
# sequence.inf file:
./phase.pl /cluster/data/hg18/ncbi/sequence.inf > phase.txt
# what kind of phases do we have:
awk '{print $2}' phase.txt | sort | uniq -c
# 1134 D
# 562513 F
# 17270 P
# Compared to hg17 we had:
awk '{print $2}' /cluster/data/hg17/phase.txt | sort | uniq -c
# 1088 D
# 146900 F
# 17300 P
# Back in the kluster runPlaced directory, we put together the
# kluster run results with:
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted
cd /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted
cat << '_EOF_' > filterLift.sh
#!/bin/sh
for C in 22
do
echo -n "chr${C} working ... "
mkdir -p ${C}
OUT="${C}/filterLift.out"
pslSort dirs ${C}/raw.psl tmp ../psl/${C}/N* > ${OUT} 2>&1
pslReps -singleHit -nearTop=0.001 ${C}/raw.psl ${C}/repsSingle.psl \
/dev/null >> ${OUT} 2>&1
liftUp ${C}/chr${C}.psl ../../maskedContigs/liftContigs.lft warn \
${C}/repsSingle.psl >> ${OUT} 2>&1
clusterClone -agp -minCover=80 -maxGap=60000 ${C}/repsSingle.psl \
> ${C}/single.agp 2>> ${OUT}
liftUp ${C}/rawLifted.psl ../../maskedContigs/liftContigs.lft warn \
${C}/raw.psl >> ${OUT} 2>&1
clusterClone -agp -minCover=80 -maxGap=60000 ${C}/chr${C}.psl \
> ${C}/chr${C}.bed 2>> ${OUT}
echo "done"
done
'_EOF_'
# happy emacs
chmod +x filterLift.sh
time ./filterLift.sh
cp /cluster/data/hg17/fixPhase.pl .
# fixed up the script to take an argument pointing to the phase.txt file
ssh kkstore02
cd /cluster/data/hg18
grep "for_assembly" ncbi/sequence.inf \
| sed -e "s/\tW\t/\t3\t/;" > sequence.inf
cd /cluster/store11/gs.19/ffa
ln -s ../build36/sequence.inf .
ssh hgwdev
cd /cluster/data/hg18
# currently working only on chr22
echo "22" > clonePos.list
# need to reload gold gap *and* gl at this time. gl wasn't loaded
# before this. It is required for the clonePos track.
hgGoldGapGl -chrom=chr22 hg18 /cluster/store11/gs.19 build36
hgClonePos -maxErr=3 -maxWarn=2000 -chromLst=clonePos.list \
hg18 /cluster/data/hg18 ./sequence.inf /cluster/store11/gs.19 \
2> clone.pos.errors
# OK, now for the hard part. The unplaced clones.
# First we will make an attempt to determine which clones they
# belong to by using information from the previous build, the
# sequence.inf file, the seq_contig.md file, and the
# ncbi_build36.agp file.
ssh kkstore02
cd /cluster/data/hg18/bed/coverage
comm -13 placed_in_assembly.list allClonesConsidered.list \
> unplaced.clone.list
comm -12 unplaced.clone.list allClones.InHg17AndHg18.list \
> common.to.hg17.unplaced.list
comm -23 unplaced.clone.list allClones.InHg17AndHg18.list \
> unique.to.hg18.unplaced.list
awk '{print $1,$6}' /cluster/data/hg17/contig_overlaps.agp \
| sed -e "s/_[0-9]*$//" | sort -u > hg17.contig.clone.list
awk '{print $1,$6}' ../../sequence.inf | sed -e "s/(//; s/)//" \
> cloneToChrom.from.seq.inf.txt
# using the contig to clone information from Hg17, attempt to
# locate the common.to.hg17.unplaced.list in terms of chrom and
# contig. Along with the ncbi_build36.agp, seq_contig.md and
# cloneToChrom.from.seq.inf.txt information, we can attempt to
# place clones that have perhaps moved, or don't have entries in
# one file or another. The relationships obtained from the
# various files:
# ncbi_build36.agp - gives clone to contig name and clone to chr name
# but for placed clones only, not useful here
# unless they moved from hg17 (try this with the
# placed list)
# seq_contig.md - gives contig to chrom relationship
./chrCloneContig.pl /cluster/data/hg18/ncbi_build36.agp \
hg17.contig.clone.list /cluster/data/hg18/seq_contig.md \
common.to.hg17.unplaced.list cloneToChrom.from.seq.inf.txt \
> chrCloneContigCommonToHg17.list \
2> common.to.hg17.unplaced.stderr
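# (chrCloneContig.pl itself is not reproduced in this doc. A hypothetical
#  illustration of the core join it performs - contig->clone from
#  hg17.contig.clone.list with contig->chrom from seq_contig.md - follows;
#  seq_contig.md column positions and the output file names are assumptions.)
sort -k1,1 hg17.contig.clone.list > contigClone.sorted
grep -v "^#" /cluster/data/hg18/seq_contig.md \
    | awk '{print $6, $2}' | sort -k1,1 -u > contigChrom.sorted
join contigClone.sorted contigChrom.sorted \
    | awk '{print $3, $2, $1}' > chrCloneContig.sketch.list  # chrom clone contig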
# With this chrCloneContigCommonToHg17.list list in hand, can now
# create a hierarchy of ./unPlacedClones/
./createUnplacedHierarchy.sh
# Then, copy them to the san for kluster run
ssh pk
cd /san/sanvol1/scratch/hg18/coverage
rsync -a --progress --copy-links \
/cluster/data/hg18/bed/coverage/unPlacedClones/ ./unPlacedClones/
mkdir runUnPlaced
cd runUnPlaced
# create jobList from the chrCloneContigCommonToHg17.list
egrep -v "^#|XX_000" \
/cluster/data/hg18/bed/coverage/chrCloneContigCommonToHg17.list \
| sed -e "s/.fa.gz//" \
| awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n",
$1, $2, $3, $1, $3, $2
}' > masterJobList
# Test a subset:
grep " Y " masterJobList > jobListY
para create jobListY
para try ... check ... etc ...
# ... some time later ... 2006-04-04
# All the clones were eventually run through the placement kluster
# runs. Ending up with five different directory results:
# [hiram@hgwdev64 /san/sanvol1/scratch/hg18/coverage]
# -rw-rw-r-- 1 3144245541 Mar 15 09:24 runFishClones/raw.psl
# -rw-rw-r-- 1 91182723 Mar 15 10:44 runUnPlaced/raw.psl
# -rw-rw-r-- 1 102642706 Mar 15 10:49 runPlaced/raw.psl
# -rw-rw-r-- 1 15839733941 Mar 15 14:56 runLastRecover/raw.psl
# -rw-rw-r-- 1 14338192704 Mar 15 18:25 runLastOnes/raw.psl
# Combining those results together required a large memory
# machine and a couple of days processing time:
ssh hgwdev64
cd /san/sanvol1/scratch/hg18/coverage
pslSort dirs raw.psl tmp runPlaced runUnPlaced runFishClones \
runLastRecover runLastOnes > raw.psl.out 2>&1
# resulting in a 33 Gb result file:
# -rw-rw-r-- 1 33515995907 Apr 2 10:54 raw.psl
# trimming that down with pslReps:
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
# real 14m58.371s
# -rw-rw-r-- 1 42333543 Apr 4 10:22 repsSingle.psl
# wc -l repsSingle.psl
# 48005 repsSingle.psl
# Now, clustering those alignments together:
clusterClone -allowDuplicates -agp -minCover=80 -maxGap=60000 \
repsSingle.psl > single.agp 2> single.out
wc -l single.agp
# 45714 single.agp
# Sort them, and set their phase correctly:
sort -k1,1 -k2,2n single.agp \
| ./fixPhase.pl /cluster/data/hg18/bed/coverage/phase.txt \
> contig_overlaps.agp
# some of them are not in the phase.txt file, these are
# set to draft status:
# WARN: can not find contig AC024654.2 in phase.txt
# WARN: can not find contig AL133291.12 in phase.txt
# WARN: can not find contig AC055712.12 in phase.txt
# WARN: can not find contig AC024480.2 in phase.txt
# WARN: can not find contig AC068738.2 in phase.txt
# WARN: can not find contig AL354703.14 in phase.txt
# WARN: can not find contig AL354756.17 in phase.txt
# WARN: can not find contig AL157825.11 in phase.txt
# WARN: can not find contig AC073306.1 in phase.txt
# WARN: can not find contig AL138892.13 in phase.txt
# WARN: can not find contig AL590104.7 in phase.txt
# WARN: can not find contig AC079146.4 in phase.txt
# WARN: can not find contig AC024497.3 in phase.txt
# WARN: can not find contig AC021295.3 in phase.txt
# WARN: can not find contig AC040906.3 in phase.txt
# WARN: can not find contig AC008372.5 in phase.txt
# WARN: can not find contig AC026054.3 in phase.txt
# WARN: can not find contig AC053504.4 in phase.txt
# create the gl files from that overlaps.agp file:
ssh hgwdev
cd /cluster/data/hg18
cp -p /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp .
# after going through this sequence and loading everything,
# a few clones were discovered to have crept into the list that
# were obsolete. So, add them to the list used by the
# removeObsoleteClones.sh script:
awk '{print $6}' contig_overlaps.agp > clone.coverage.list
bed/coverage/ckMultipleVersions.pl clone.coverage.list \
> /dev/null 2> /tmp/clone.transitions
awk '{if (! match($1,$3)){ print }}' /tmp/clone.transitions \
>> bed/coverage/obsoleteClone.list
time ./removeObsoleteClones.sh
wc -l /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp \
./contig_overlaps.agp
# 45714 /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp
# 45597 ./contig_overlaps.agp
# after adding ten new ones the second time around:
# 45587 ./contig_overlaps.agp
time agpToGl contig_overlaps.agp . -md=seq_contig.md
# this liftGl.csh finds all the contig.gl files under each
# contig directory and creates chromosome coordinate chr*.gl
# files in each chrom directory
jkStuff/liftGl.csh contig.gl
# Then hgGoldGapGl uses those chrom level chr*.gl files to add
# the gl tables (as well as gold and gap)
hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36
# strip some business from the sequence.inf file that is not needed
# The sed here has to be done in a shell script, those tabs are
# actual tabs and not the explicit ^I
mkdir -p /scratch/tmp
grep -v AADB /cluster/store11/gs.19/ncbi/sequence.inf \
> /scratch/tmp/seq0.inf
(cat /scratch/tmp/seq0.inf; \
grep AADB01066164.1 /cluster/store11/gs.19/ncbi/sequence.inf) \
| grep "for_assembly" \
| sed -e "s/^IW^I/^I3^I/" > cleanedSequence.inf
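# (Alternative sketch, untested here: under bash, ANSI-C quoting lets the
#  shell expand \t itself, so no literal tab characters are needed.)
(cat /scratch/tmp/seq0.inf; \
    grep AADB01066164.1 /cluster/store11/gs.19/ncbi/sequence.inf) \
    | grep "for_assembly" \
    | sed -e $'s/\tW\t/\t3\t/' > cleanedSequence.inf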
# Then hgClonePos uses those tables to create the Coverage track
hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=clonePos.list \
hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \
> clone.pos.errors 2>&1
###########################################################################
# RECOMBINATION RATES (DONE 2006-02-15 Fan)
# The STS Markers track must be completed prior to creating this track
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir -p recombRate
cd recombRate
# Copy other necessary files here (in future, can take from previous version)
# NOTE: these are stable, and could be saved in a permanent spot
cp -p /projects/hg2/booch/psl/info/decode_all .
cp -p /projects/hg2/booch/psl/info/marshfield_all .
cp -p /projects/hg2/booch/psl/info/genethon_all .
# Compared these 3 files with the 3 files of hg17, they are identical.
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.10/stsAlias.bed \
/cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.10/stsAlias.bed \
/cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.10/stsAlias.bed \
/cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
/cluster/data/hg18/chrom.sizes 1000000 1000000 \
> decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
/cluster/data/hg18/chrom.sizes 1000000 1000000 \
> genethon_1mb_slide_1mb
# got 338 "... DISCARDING" messages.
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
/cluster/data/hg18/chrom.sizes 1000000 1000000 \
> marshfield_1mb_slide_1mb
# Got 424 "... DISCARDING" messages.
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
/cluster/data/hg18/inserts \
/cluster/data/hg18 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
/cluster/data/hg18/inserts \
/cluster/data/hg18 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
/cluster/data/hg18/inserts \
/cluster/data/hg18 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
> recombRate.bed
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/recombRate
hgLoadBed -noBin -tab \
-sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
hg18 recombRate recombRate.bed
###########################################################################
# FISH CLONES (DONE - 2006-01-13 - 2006-02-07 - Hiram)
# **** RE-LOAD fishClones after bacEnds update - see below 2007-09-04 ****
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to
# creating this track (and why is this ?)
ssh kkstore01
mkdir /cluster/data/ncbi/fishClones/fishClones.2006-01/
cd /cluster/data/ncbi/fishClones/fishClones.2006-01/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time. The columns have been rearranged, and one important
# column is missing, the contig information. So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
/cluster/data/hg18/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
2> dbg
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
# Get current clone/accession information
wget --timestamping http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
ssh kkstore02
mkdir /cluster/data/hg18/bed/fishClones
cd /cluster/data/hg18/bed/fishClones
# Copy previous sts info from fhcrc (take from previous build in future)
cp -p /cluster/data/ncbi/fishClones/fishClones.2004-07/fhcrc.sts .
# This fhcrc.sts listing doesn't change. It is merely a listing
# of aliases that remain in effect.
# Create cl_acc_gi_len file from cloneend information:
grep -v "^#" /cluster/data/hg18/bed/cloneend/all.txt \
| awk '{gsub("\.[0-9]*$", "", $2);
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
ssh hgwdev
# have to be on hgwdev for this since it is going to read from the
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \
/cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \
./cl_acc_gi_len \
/cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
ssh hgwdev
cd /cluster/data/hg18/bed/fishClones
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg18 fishClones fishClones.bed
# Loaded 9461 elements of size 16
###########################################################################
# CHROMOSOME BANDS TRACK (DONE - 2006-01-20 - 2006-02-07 - Hiram)
# This must wait until the Fish Clones track is done
# This was loaded in place of the previously loaded ideoband data
# created from NCBI information, see below for "ideogram"
ssh hgwdev
mkdir /cluster/data/hg18/bed/cytoband
cd /cluster/data/hg18/bed/cytoband
# Copy in some necessary files (usually from previous version)
cp -p /cluster/data/hg17/bed/cytoband/pctSetBands.txt .
cp -p /cluster/data/hg17/bed/cytoband/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
/cluster/data/hg18/inserts /cluster/data/hg18 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt \
/cluster/data/hg18 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed \
> cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
/cluster/data/hg18/bed/fishClones/fishClones.bed > fishClones.txt
/cluster/bin/scripts/runBander fishClones.txt \
ISCN800.txt setBands.txt /cluster/data/hg18
# Should be 862 bands
wc -l cytobands.bed
# 862 cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
hg18 cytoBand cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
hg18 cytoBandIdeo cytobands.bed
###########################################################################
# BLASTZ SELF (DONE - 2006-01-17 - 2006-01-20 - Hiram)
ssh pk
mkdir /cluster/data/hg18/bed/blastzSelf.2006-01-17
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17
cat << '_EOF_' > DEF
# human vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=400
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg18
SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/cluster/data/hg18/bed/blastzSelf.2006-01-17
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
`pwd`/DEF > blastz.out 2>&1 &
# real 640m37.637s
ssh kolossus
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17
time HGDB_CONF=~/.hg.conf.read-only featureBits \
-noRandom -noHap hg18 chainSelfLink > fb.chainSelfLink 2>&1 &
# real 21m52.697s
# 324067552 bases of 2858034764 (11.339%) in intersection
# compared to Hg17:
cd /cluster/data/hg17/bed/blastzSelf.2004-07-01
time HGDB_CONF=~/.hg.conf.read-only featureBits \
-noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 &
# real 56m34.802s
# 240976607 bases of 2851352871 (8.451%) in intersection
# reloaded these chains to add normalized score column
ssh hgwdev
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain
chainSplit chain hg18.hg18.all.chain.gz
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain/chain
foreach f (*.chain)
set c = $f:r
hgLoadChain -normScore hg18 ${c}_chainSelf $f
end
cd ..
rm -fr chain
##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2006-01-11 - Fan)
ssh kkstore02
cd /cluster/data/hg18
# check disk space: 73Gb free
df -h .
# Filesystem Size Used Avail Use% Mounted on
# /export/cluster/store11
# 1.8T 1.4T 323G 82% /cluster/store11
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/*
# Somehow the wget did not work. Did it by hand.
cd /cluster/data/hg18/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
bash
for F in ncbi/*.mfa.gz
do
zcat ${F}
done | gzip > all.mfa.gz
exit
# Convert the title line of the all.mfa file
cat << '_EOF_' > convert.pl
#!/usr/bin/env perl
use strict;
use warnings;
while (my $line = <>) {
if ($line !~ m/^>/) {
print $line
} else {
my @fields = split('\|', $line);
my $fieldCount = scalar(@fields);
my $printed = 0;
for (my $i = 0; $i < $fieldCount; $i++) {
if ($fields[$i] eq "gb" || $fields[$i] eq "dbj" || $fields[$i] eq "emb") {
(my $name, my $vers) = split(/\./,$fields[$i+1]);
print ">$name\n";
$i= $fieldCount;
$printed = 1;
}
}
if (!$printed) {
die("Failed for $line\n");
}
}
}
'_EOF_'
# < happy emacs
chmod +x convert.pl
zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files
faSize cloneEnds.fa.gz
# 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files
# identical numbers
# concatenate the text files, too
bash
for F in ncbi/*.txt.gz
do
zcat ${F}
done | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
cp -p /cluster/data/mm6/bed/cloneend/ncbi/convertTxt.pl .
zcat all.txt.gz >all.txt
./convertTxt.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 249619 pairs and 318500 singles
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files
# same numbers as before
# Copy to san for cluster runs
ssh pk
cd /cluster/data/hg18/bed/cloneend/splitdir
mkdir /san/sanvol1/scratch/hg18/cloneEnds
cp -p *.fa /san/sanvol1/scratch/hg18/cloneEnds
rm *
cd ..
rmdir splitdir
# load sequences
ssh hgwdev
mkdir /gbdb/hg18/cloneend
cd /gbdb/hg18/cloneend
ln -s /cluster/data/hg18/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq hg18 /gbdb/hg18/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg18/cloneend/cloneEnds.fa
# 832860 sequences
# Updating seq table
# Advisory lock has been released
# All done
############################################################################
# BACEND SEQUENCE ALIGNMENTS (STARTED - 2006-01-11, DONE 2006-01-18 - Fan)
# REDONE 2006-02-02 - Hiram
ssh pk
# The ooc file was created earlier into /cluster/bluearc/hg18/11.ooc
cp -p /cluster/bluearc/hg18/11.ooc /san/sanvol1/scratch/hg18/11.ooc
mkdir /san/sanvol1/scratch/hg18/bacends
cd /san/sanvol1/scratch/hg18/bacends
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/cloneEnds/cloneEnds???.fa > bacends.lst
# 378 contigs vs 98 bacends files -> 37,044 jobs
mkdir out
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 contigs.lst bacends.lst template jobList
foreach f (`cat bacends.lst`)
set d = $f:r:t
echo $d
mkdir out/$d
end
para create jobList
# 37044 jobs in batch
para try, check, push, etc ...
# lift alignments
ssh pk
cd /san/sanvol1/scratch/hg18/bacends
pslSort dirs raw.psl temp out/cloneEnds*
# 37044 files in 98 dirs
# Got 37044 files 192 files per mid file
# real 32m24.804s
# -rw-rw-r-- 1 6487445210 Feb 2 21:08 raw.psl
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real 6m33.218s
# Processed 51898639 alignments
mkdir lifted
time liftUp lifted/bacEnds.lifted.psl ./liftContigs.lft warn bacEnds.psl
# real 0m30.067s
pslSort dirs bacEnds.sorted.psl temp lifted
# cleanup
rmdir temp
rm -fr out /cluster/store7/kate/hg17/bacends
wc -l *.sorted.psl
# 2490892 bacEnds.sorted.psl
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.sorted.psl \
/cluster/data/hg18/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real 0m11.221s
# this creates the files:
# -rw-rw-r-- 1 16224182 Feb 2 21:36 bacEnds.pairs
# -rw-rw-r-- 1 4655633 Feb 2 21:36 bacEnds.orphan
# -rw-rw-r-- 1 399525 Feb 2 21:36 bacEnds.slop
# -rw-rw-r-- 1 106252 Feb 2 21:36 bacEnds.mismatch
# -rw-rw-r-- 1 634909 Feb 2 21:36 bacEnds.short
# -rw-rw-r-- 1 4023 Feb 2 21:36 bacEnds.long
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo -e \
'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del \
> bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
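# (A sketch of the awk/sort replacement suggested by the TODO above; per the
#  header written above, column 5 of the tab-separated pair files is the
#  score. Untested here; illustrative output names to avoid clobbering.)
awk -F'\t' '$5 >= 300' bacEnds.pairs \
    | sort -k1,1 -k2,2n > bacEndPairs.sketch.bed
cat bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
    | awk -F'\t' '$5 >= 300' | sort -k1,1 -k2,2n > bacEndPairsBad.sketch.bed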
# Move the previous build out of the way and copy these
# results over to the primary hg18 bed location:
mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-01-18
mkdir /cluster/data/hg18/bed/bacends
cp -p bacEnd* /cluster/data/hg18/bed/bacends
cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends
# load them into the database
ssh hgwdev
cd /cluster/data/hg18/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
# 195 300
# 316 375
# 297 500
# 1476 750
# edit the file and fix it if it has a bad name.
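# (Sketch: if the check above shows anything besides numeric scores, this
#  lists the offending rows so they can be fixed.)
awk -F'\t' '$5 !~ /^[0-9]+$/' bacEndPairs.bed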
hgLoadBed -notItemRgb hg18 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 159268
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 69788
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl
# no complaints! Usually there are; this loaded:
hgsql -N -e "select count(*) from all_bacends;" hg18
# 1249956
nice featureBits hg18 all_bacends
# 191078854 bases of 2881515245 (6.631%) in intersection
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg18 bacEndPairs
# 2842800422 bases of 2881515245 (98.656%) in intersection
nice featureBits hg17 bacEndPairs
# 2846568377 bases of 2866216770 (99.314%) in intersection
nice featureBits hg18 bacEndPairsBad
# 729313572 bases of 2881515245 (25.310%) in intersection
nice featureBits hg17 bacEndPairsBad
# 797412909 bases of 2866216770 (27.821%) in intersection
############################################################################
# BACEND PAIRS TRACK (OBSOLETE - DONE ABOVE) (DONE - 2006-01-18 - Fan)
ssh kolossus
cd /cluster/data/hg18/bacends
bash
time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
# create header required by "rdb" tools
echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header
cat header bacEnds.pairs | \
/cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed >j1.out
cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out
cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
rm j1.out j2.out
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
# 195 300
# 316 375
# 297 500
# 1476 750
# edit the file and fix it if it has a bad name.
# load into database
ssh hgwdev
cd /cluster/data/hg18/bacends
hgLoadBed -strict -notItemRgb hg18 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 146284 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -strict -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 75995 elements of size 11
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl
nice featureBits hg18 all_bacends
# 162081172 bases of 2881515245 (5.625%) in intersection
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg18 bacEndPairs
# 2835522069 bases of 2881515245 (98.404%) in intersection
nice featureBits hg17 bacEndPairs
# 2846568377 bases of 2866216770 (99.314%) in intersection
nice featureBits hg18 bacEndPairsBad
# 781697678 bases of 2881515245 (27.128%) in intersection
nice featureBits hg17 bacEndPairsBad
# 797412909 bases of 2866216770 (27.821%) in intersection
##########################################################################
# BLASTZ OPOSSUM monDom4 second time (DONE - 2006-02-13 - Hiram)
# (the working directory kept the older blastzMonDom2 name, but this run
#  aligns hg18 against monDom4)
ssh kk
mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
cd /cluster/data/hg18/bed
ln -s blastzMonDom2.2006-02-13 blastz.monDom4
cd blastzMonDom2.2006-02-13
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit
SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
ssh kolossus
cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
time nice -n +19 featureBits hg18 chainMonDom4Link \
> fb.hg18.chainMonDom4Link 2>&1 &
cat fb.hg18.chainMonDom4Link
# 356865888 bases of 2881515245 (12.385%) in intersection
# for the swap, see makeMonDom4.doc 2006-04-28
# Creating download directory (DONE - 2006-07-18 - Hiram)
ssh hgwdev
cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download -stop=download `pwd`/DEF > download.out 2>&1
##########################################################################
# BLASTZ OPOSSUM monDom2 first time (EXPERIMENT - 2006-01-23 - Hiram)
ssh pk
mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom2
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom2.2006-01-23
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# real 912m22.818s
# This failed during the load of the chains due to the size of
# chr19.chain. So, go to kolossus:
ssh kolossus
# There isn't any hg18 db here yet, get it established with a
# chromInfo and a 2bit sequence:
hgsql -e "create database hg18;" mysql
cd /cluster/data/hg18
twoBitInfo hg18.2bit stdout |
awk '{printf "%s\t%s\t/gbdb/hg18/hg18.2bit\n", $1,$2}' \
> chromInfo.kolossus.tab
hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql hg18 \
-e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;'
mkdir /gbdb/hg18
ln -s /cluster/data/hg18/hg18.2bit /gbdb/hg18/hg18.2bit
# now, loading only chr19:
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain
hgLoadChain hg18 chr19_chainMonDom2 chain/chr19.chain
# while that is running, back on hgwdev, get the other chains loaded
ssh hgwdev
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain
cp loadUp.csh loadUp.noChr19.csh
# change the foreach line to eliminate the chr19.chain:
diff loadUp.csh loadUp.noChr19.csh
< foreach f (*.chain)
---
> foreach f (`ls *.chain | grep -v chr19.chain`)
# And then run that script
time ./loadUp.noChr19.csh > load.noChr19.out 2>&1
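# For next time, that hand edit could be scripted instead (untested sketch):
#   sed 's@foreach f (\*.chain)@foreach f (`ls *.chain | grep -v chr19.chain`)@' \
#       loadUp.csh > loadUp.noChr19.csh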
# When the kolossus load finishes, email to push-request and ask
# for the two tables to be pushed from kolossus to hgwdev:
# chr19_chainMonDom2
# chr19_chainMonDom2Link
# then, continuing:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=download -bigClusterHub=pk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > download.out 2>&1 &
# real 2m42.505s
ssh kolossus
cd /cluster/data/hg18/bed/blastz.monDom2
time HGDB_CONF=~/.hg.conf.read-only featureBits \
hg18 chainMonDom2Link > fb.hg18.chainMonDom2Link 2>&1
# real 124m34.435s
cat fb.hg18.chainMonDom2Link
# 357258631 bases of 2881515245 (12.398%) in intersection
# then, to swap
ssh pk
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > swap.out 2>&1 &
# running 2006-01-25 17:28
# real 51m27.447s
# this swap failed at:
# startStep: 4, at step 5 net to stopStep 9
# netChains: looks like previous stage was not successful
# (can't find [monDom2.hg18.]all.chain[.gz]).
# This failure does not make any sense. The end of swapChains
# does an nfsNoodge on this file to verify it exists.
# I don't understand why it wouldn't be in existence
# as netChains starts up.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF > net-swap.out 2>&1 &
# running 2006-01-26 09:28
# real 27m57.077s
# This swap failed at the load chain:
# startStep: 5, at step 6 load to stopStep 9
# # chmod a+x
# # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh
# # ssh -x hgwdev nice
# # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh
# cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain
# hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz
# Out of memory needMem - request size 56 bytes
# So, over to kolossus to give it a try:
# There isn't any monDom2 db here yet, get it established with a
# chromInfo and a 2bit sequence:
hgsql -e "create database monDom2;" mysql
cd /cluster/data/monDom2
hgsql monDom2 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql monDom2 \
-e 'load data local infile "chromInfo.tab" into table chromInfo;'
mkdir /gbdb/monDom2
ln -s /cluster/data/monDom2/monDom2.2bit /gbdb/monDom2/monDom2.2bit
# now, loading into monDom2
cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain
time hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz \
> kolossus.load
# running - 2006-01-26
##########################################################################
# test BLASTZ Opossum MonDom1 (DONE - 2006-01-30 - Hiram)
# to see what happened with the blow up of data in monDom2
#
ssh kk
mkdir /cluster/data/hg18/bed/blastzMonDom1.2006-01-30
cd /cluster/data/hg18/bed/blastzMonDom1.2006-01-30
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom1
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_LEN=/iscratch/i/monDom1/chrom.sizes
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom1.2006-01-30
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# started 2006-01-30 - 15:40
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -stop=load `pwd`/DEF > cat_load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
############################################################################
############################################################################
# STS MARKERS (STARTED 2006-01-27 Fan - DONE 2006-02-06 - Hiram)
# FOR NEXT TIME - a lot of the perl scripts used in this process
# need to be placed into the source tree and cleaned up to modern
# perl warnings and strict standards. In particular, one script
# was placed into the source tree this time: src/utils/findAccession.pl
# update from NCBI
ssh kkstore02
# use store11 for space
mkdir -p /cluster/store11/sts.2006-01
ln -s /cluster/store11/sts.2006-01 /cluster/data/ncbi
ln -s /cluster/data/ncbi/sts.2006-01 sts.10
cd /cluster/data/ncbi/sts.2006-01
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# old
# wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
# wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# these items are copied in from the previous builds
cp -p /cluster/data/ncbi/sts.9/all.STS.fa ./all.STS.fa.prev
cp -p /cluster/data/ncbi/sts.9/stsInfo2.bed ./stsInfo2.bed.prev
# Convert dbSTS.fa file to easier reading format, and get accessions
/cluster/bin/scripts/convertGbFaFile dbSTS.fa > UniSTS.convert.fa
grep ">" UniSTS.convert.fa | cut -f 2 -d ">" > UniSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
#### XXX - FOR NEXT TIME: need to fix something here for the
#### XXX - broken symbol AFM067XA9 which has over 6,000 aliases.
#### XXX - This isn't right
#### hand-editted the record for AFM067XA9. KUHN/ARCHANA 10-08-2007
#### preserving the list of otherNames that showed up stsInfo2.otherNames for
#### trueName=AFM067XA9
#### cp hg18.AFM067XA9.otherNames /cluster/data/hg18/bed/sts
#### preserving the list of stsMarkers that showed up in stsAlias.alias
#### in excess of those in the above file (10 k total)
#### cp hg18.AFM067XA9.dropped.aliases /cluster/data/hg18/bed/sts
updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
# 5610 MFD330 1000006 (0) not in dbSTS anymore
# 5667 D3S4560 1000008 (0) not in dbSTS anymore
# 5686 ATA92F01 1000007 (0) not in dbSTS anymore
# 5945 MFD206 1000009 (0) not in dbSTS anymore
# 6591 MFD311 1000011 (0) not in dbSTS anymore
# 6841 MFD306 1000013 (0) not in dbSTS anymore
# 6842 MFD310 1000012 (0) not in dbSTS anymore
# 6844 MFD349 1000026 (0) not in dbSTS anymore
# 7024 D12S2343 1000015 (0) not in dbSTS anymore
# 7042 ATA73C05 1000014 (0) not in dbSTS anymore
# 7226 MFD341 1000016 (0) not in dbSTS anymore
# 7500 D17S2200 1000018 (0) not in dbSTS anymore
# 7628 ATA92E03 1000020 (0) not in dbSTS anymore
# 7642 GATA178F11 1000019 (0) not in dbSTS anymore
# 7910 MFD338 1000022 (0) not in dbSTS anymore
# 97723 GATA172D05 1000023 (0) not in dbSTS anymore
# 205088 CPLA3610 1000000 (0) not in dbSTS anymore
# 205089 COX_1935 1000001 (0) not in dbSTS anymore
# 205090 24534CA2 1000002 (0) not in dbSTS anymore
# 205091 D5S811 1000003 (0) not in dbSTS anymore
# 205092 AC016604-5 1000004 (0) not in dbSTS anymore
# 205093 CA-JAP-180 1000005 (0) not in dbSTS anymore
# 205094 D10S1120 1000025 (0) not in dbSTS anymore
# 205095 D21S2039 1000024 (0) not in dbSTS anymore
# 205102 D12S1013 1000028 (0) not in dbSTS anymore
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
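# Given the AFM067XA9 alias blow-up noted above, a cheap sanity check on the
# new alias file may be worth running next time (sketch; assumes the marker
# name is the last column of stsAlias.bed -- verify against stsAlias.sql):
#   awk -F'\t' '{print $NF}' stsAlias.bed | sort | uniq -c | sort -rn | head
# Anything with thousands of aliases deserves a closer look before loading.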
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 93698 total sequences
/cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa
# Copy stsInfo2.bed and stsAlias.bed to data directory because
# these will be loaded into the database later
mkdir -p /cluster/data/hg18/bed/sts
cp -p stsInfo2.bed /cluster/data/hg18/bed/sts/
cp -p stsAlias.bed /cluster/data/hg18/bed/sts/
# Create sts sequence alignments
mkdir /san/sanvol1/scratch/hg18/sts
mkdir /san/sanvol1/scratch/hg18/sts/split
faSplit sequence all.STS.fa 200 /san/sanvol1/scratch/hg18/sts/split/sts
cp -p all.STS.fa /san/sanvol1/scratch/hg18/sts
ssh pk
cd /cluster/data/hg18/bed/sts
mkdir run
cd run
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/sts/split/sts*.fa > sts.lst
mkdir /san/sanvol1/scratch/hg18/sts/out
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /san/sanvol1/scratch/hg18/sts/out/$d
end
# create alignments
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# happy emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 70686 jobs
para try ... check ... push ... etc
# Completed: 70686 of 70686 jobs
# CPU time in finished jobs: 117490s 1958.16m 32.64h 1.36d 0.004 y
# IO & Wait Time: 195274s 3254.57m 54.24h 2.26d 0.006 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest finished job: 97s 1.62m 0.03h 0.00d
# Submission to last job: 8085s 134.75m 2.25h 0.09d
# Compile sts sequence results
ssh pk
cd /san/sanvol1/scratch/hg18/sts
time pslSort dirs raw.psl temp out/sts*
# real 8m50.714s
# -rw-rw-r-- 1 810548945 Feb 3 14:19 raw.psl
# 70686 files in 187 dirs
# Got 70686 files 266 files per mid file
rm -rf temp
time pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
stsMarkers.psl /dev/null
# Processed 7252745 alignments
# real 0m28.102s
# -rw-rw-r-- 1 10981952 Feb 3 14:26 stsMarkers.psl
cp -p stsMarkers.psl /cluster/data/hg18/bed/sts/run
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.lifted.psl \
/cluster/data/hg18/jkStuff/liftContigs.lft \
warn stsMarkers.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates stsMarkers.lifted.psl.initial
wc stsMarkers.lifted.psl.initial
# 93236 559416 4111801 stsMarkers.lifted.psl.initial
$HOME/kent/src/utils/findAccession.pl -agp stsMarkers.lifted.psl.initial \
/cluster/data/hg18
wc stsMarkers.lifted.psl.initial.acc
# 93236 652652 4947261 stsMarkers.lifted.psl.initial.acc
sort -k4,4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 90676 stsMarkers.found
# out of 93698 total sequences
# (from wc /cluster/data/ncbi/sts.2006-01/all.STS.id)
# extract sequences for markers not yet found, and
# blat w/o ooc to try to place more
comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.2006-01/all.STS.id \
> stsMarkers.notFound
wc -l stsMarkers.notFound
# 3022 stsMarkers.notFound
faSomeRecords /san/sanvol1/scratch/hg18/sts/all.STS.fa stsMarkers.notFound \
notFound.STS.fa
mkdir /san/sanvol1/scratch/hg18/sts/splitNotFound
faSplit sequence notFound.STS.fa 20 \
/san/sanvol1/scratch/hg18/sts/splitNotFound/sts
# blat with 11.ooc misses alignments, so reblat w/o the
# sequences that aren't found
# NOTE: this filtering yields only 101 additional markers placed (out of
# 3022 attempted); not enough to justify this step next time
ssh pk
mkdir /cluster/data/hg18/bed/sts/run.noOoc
cd /cluster/data/hg18/bed/sts/run.noOoc
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/sts/splitNotFound/sts*.fa > sts.lst
mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc/$d
end
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out.noOoc/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# happy emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 7182 jobs written to batch
para try
para check
# process this set of alignments
cd /san/sanvol1/scratch/hg18/sts
pslSort dirs raw.noOoc.psl temp out.noOoc/*
# -rw-rw-r-- 1 459858612 Feb 3 15:56 raw.noOoc.psl
# Wow, that is almost half the size of the original raw.psl with
# everything in it.
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \
raw.noOoc.psl stsMarkers.noOoc.psl /dev/null
# Processed 4027664 alignments
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.noOoc.lifted.psl \
/cluster/data/hg18/jkStuff/liftContigs.lft \
warn stsMarkers.noOoc.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl
# creates <file>.initial
$HOME/kent/src/utils/findAccession.pl -agp \
stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg18
#rm stsMarkers.lifted.psl.initial
mv stsMarkers.final stsMarkers.ooc.final
sort -k4,4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
sort -k4,4n stsMarkers.lifted.psl.initial.acc \
stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.more.found
wc -l stsMarkers.more.found
# 90777 stsMarkers.more.found
cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found
wc -l stsMarkers.extra.found
# 101 out of 3022 attempted
# out of 93698 total sequences
cp -p stsMarkers.final stsMarkers.lifted.psl \
stsMarkers.*lifted.psl.initial* stsMarkers.found \
/cluster/data/hg18/bed/sts
# Alignments from noOoc set were not added to all_sts_seq but info for the
# markers is in stsMap and stsInfo2. Some of the alignments are bad so
# filter by removing all alignments from noOoc psl file where
# tBaseInsert >=1000. Add the remaining alignments to the set of final
# alignments for stsMarkers. The information for the removed markers
# from the filtered set was also removed from stsMap and stsInfo2.
ssh pk
mkdir /cluster/data/hg18/bed/sts/fix
cd /cluster/data/hg18/bed/sts/fix
cp /san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl .
awk '{if ($8 < 1000) print}' stsMarkers.noOoc.lifted.psl \
> stsMarkers.noOoc.lifted.filt1000.psl
wc -l *.filt*.psl
# 23 483 4206 stsMarkers.noOoc.lifted.filt1000.psl
sort -k4,4n \
/san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl.initial.acc \
> stsMarkers.extra
awk '{print $4}' stsMarkers.extra | sort -n | uniq > extra.ids
# in psl file, the ids are the 10th field
awk '{print $10}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
> noOoc.ids
diff extra.ids noOoc.ids
# there is no difference as expected
# get list of IDs from filtered file, filter < 1000
awk '{print $10}' stsMarkers.noOoc.lifted.filt1000.psl \
| sort -n | uniq > filt1000.ids
for i in `cat filt1000.ids`
do
awk 'BEGIN {OFS="\t"} \
{if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
stsMarkers.extra >> stsMarkers.extra.filt1000
done
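# The loop above rescans stsMarkers.extra once per id; an equivalent
# single-pass version for next time might be (untested sketch):
#   awk -v OFS='\t' 'NR==FNR {ids[$1]; next} ($4 in ids) {print $1,$2,$3,$4,$5,$6,$7}' \
#       filt1000.ids stsMarkers.extra > stsMarkers.extra.filt1000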
cp -p ../stsMarkers.final stsMarkers.final
# need to filter stsMarkers.final, not just cat this onto the end
# get list of alignments with tBaseInsert >= 1000 and remove these
cd /cluster/data/hg18/bed/sts/fix
awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl \
> stsMarkers.noOoc.lifted.filtToRemove.psl
wc -l *.filt*.psl
# 23 stsMarkers.noOoc.lifted.filt1000.psl
# 175 stsMarkers.noOoc.lifted.filtToRemove.psl
# get list of IDs that need to be removed
awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
| uniq > noOoc.IdsToRemove.txt
# get chrom and co-ordinates for IDs to be removed
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
> sts.noOoc.filtToRemove.coords
# checked that the stsMarkers.final contain the noOoc alignments
# use this perl script to remove lines with these IDs from stsMarkers.final
cat << '_EOF_' > removeIds.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";
my %idsHash;
while (<IDS>) {
chomp;
my @a = split(/\t/);
my $chr = $a[0];
my $st = $a[1];
my $end = $a[2];
my $id = $a[3];
my $key = $id."_".$chr . "_" . $st . "_" . $end;
$idsHash{$key}->{chrom} = $chr;
$idsHash{$key}->{start} = $st;
$idsHash{$key}->{end} = $end;
}
close IDS;
while (<FILE>) {
chomp;
my $l = $_;
my $found = "FALSE";
my @f = split(/\t/, $l);
foreach my $k (keys(%idsHash)) {
# if the id is contained in the key
if ($k =~ /^$f[3]/) {
my $c = $idsHash{$k}->{chrom};
my $s = $idsHash{$k}->{start};
my $e = $idsHash{$k}->{end};
if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
print OUT "$c\t$s\t$e\t$f[3]\n";
$found = "TRUE";
}
}
}
if ($found eq "FALSE") {
print "$l\n";
}
}
'_EOF_'
chmod +x removeIds.pl
./removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
> stsMarkers.final.new
wc -l stsMarkers.final*
wc stsMarkers.final*
# 93434 654038 4957784 stsMarkers.final
# 93259 652813 4948484 stsMarkers.final.new
# There are 175 ids and sets of co-ordinates in list of Ids to remove
# 175 stsMarkers.noOoc.lifted.filtToRemove.psl
# check that stsMarkers.final.new contains all the alignments that
# are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
> sts.noOoc.filt1000.coords
awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
stsMarkers.final.new | sort | uniq \
> sts.finalnew.coords
diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
grep '>' finalnewvsfilt1000
# there is nothing in sts.noOoc.filt1000.coords not found in the
# sts.finalnew.coords file therefore this contains all the alignments
# from the filtered noOoc file.
cp ../primers/primers.final .
awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids
# primers
ssh eieio
cd /cluster/data/ncbi/sts.10
# strip out N's and wobbles (KS) from primers, as isPcr
# can't currently handle them
# strip out primers of length 10 or less, as isPcr can't handle them
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir -p /san/sanvol1/scratch/hg18/sts.10/primers
cd /san/sanvol1/scratch/hg18/sts.10/primers
split -l 4000 /cluster/data/ncbi/sts.10/all.primers.ispcr primers_
ssh pk
mkdir /cluster/data/hg18/bed/sts/primers
cd /cluster/data/hg18/bed/sts/primers
mkdir run
cd run
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/sts.10/primers/primers_* > primers.lst
mkdir /san/sanvol1/scratch/hg18/sts.10/primers/out
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/hg18/10ooc/$(root1).10.ooc -stepSize=5 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# happy emacs
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 29106 jobs
para try ... check ... push ... etc ...
# Completed: 29106 of 29106 jobs
# CPU time in finished jobs: 658245s 10970.76m 182.85h 7.62d 0.021 y
# IO & Wait Time: 82764s 1379.39m 22.99h 0.96d 0.003 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest finished job: 534s 8.90m 0.15h 0.01d
# Submission to last job: 2282s 38.03m 0.63h 0.03d
# Filter output file quickly based on simple parameters
ssh pk
cd /san/sanvol1/scratch/hg18/sts.10/primers
mkdir filter
pslQuickFilter -minMatch=26 -maxMismatch=5 \
-maxTinsert=5000 -verbose out/ filter/
# Note: there will be many messages saying files are empty - this is OK
time pslSort dirs ../primers.psl.unlifted temp filter
# Got 29106 files 171 files per mid file
# real 3m31.401s
# filter primer alignments and create not found primer file for ePCR run
cd /san/sanvol1/scratch/hg18/sts.10
pslFilterPrimers primers.psl.unlifted \
/cluster/data/ncbi/sts.10/all.primers primers.filter.unlifted.psl
# creates primers.filter.unlifted.psl.notfound.primers
wc -l primers.filter.unlifted.psl.notfound.primers
# 22943 primers.filter.unlifted.psl.notfound.primers
# use Greg Schuler's ePCR to attempt alignment of primers missed
# by isPcr
ssh pk
mkdir /san/sanvol1/scratch/hg18/sts.10/epcr
mkdir /san/sanvol1/scratch/hg18/sts.10/epcr/out
cd /san/sanvol1/scratch/hg18/sts.10/epcr
split -l 3000 ../primers.filter.unlifted.psl.notfound.primers primers_
mkdir /cluster/data/hg18/bed/sts/primers/run.epcr
cd /cluster/data/hg18/bed/sts/primers/run.epcr
ls -1S /san/sanvol1/scratch/hg18/sts.10/epcr/primers_* > primers.lst
# These jobs are going to go quickly, make sure all I/O comes and
# goes from something that can handle it.
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contig.lst
# This runEpcr64 script was made from the existing runEpcr script
# and from the looks of it, I doubt the original script works in
# the way this was set up here. It appears to be reading the
# second argument $(path2) line by line and sending that as
# arguments to e-PCR. That wouldn't be right here.
cat > template << '_EOF_'
#LOOP
/cluster/bin/scripts/runEpcr64 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/epcr/out/$(root1).$(root2).epcr}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 primers.lst contig.lst template jobList
para create jobList
# 3420 jobs
para try ... check ... push ... etc ...
# Completed: 3024 of 3024 jobs
# CPU time in finished jobs: 31802s 530.04m 8.83h 0.37d 0.001 y
# IO & Wait Time: 12804s 213.40m 3.56h 0.15d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest finished job: 193s 3.22m 0.05h 0.00d
# Submission to last job: 372s 6.20m 0.10h 0.00d
# merge output
ssh pk
# cd /cluster/bluearc/hg17/sts/primers/epcr  # (leftover path from the hg17 doc)
cd /san/sanvol1/scratch/hg18/sts.10/epcr
cat out/*.epcr > all.epcr
wc -l all.epcr
# 3792
# should be on the fileserver (kkstore02) for the following heavy
# I/O operations. Didn't do that here, was on pk instead.
# use all.epcr file to re-filter alignemnts and determine which
# ePCR records to keep
cp all.epcr /cluster/data/hg18/bed/sts/primers
cd /cluster/data/hg18/bed/sts/primers
pslFilterPrimers -epcr=all.epcr -verbose=1 \
/san/sanvol1/scratch/hg18/sts.10/primers.psl.unlifted \
/cluster/data/ncbi/sts.10/all.primers primers.unlifted.epcr.psl
# creates three files:
# -rw-rw-r- 1 148528 Feb 6 10:39 epcr.not.found
# -rw-rw-r- 1 51632003 Feb 6 10:39 primers.unlifted.epcr.psl
# -rw-rw-r- 1 1189756 Feb 6 10:39 primers.unlifted.epcr.psl.notfound.primers
# convert to PSL and combine with other psl file
time /cluster/bin/scripts/epcrToHgPsl epcr.not.found \
/cluster/data/ncbi/sts.10/all.primers /cluster/data/hg18
# real 81m24.041s (on pk, may have been better on kkstore02
# where all of the data is)
cat primers.unlifted.epcr.psl epcr.not.found.psl \
| sort -k 10n > primers.final.unlifted.psl
wc -l primers.final.unlifted.psl
# 454869 primers.final.unlifted.psl
# should have been on kkstore02 already
ssh kkstore02
cd /cluster/data/hg18/bed/sts/primers
# Fix the query gap lengths so that they match the all.primers.fa
# file lengths
time /cluster/bin/scripts/fixPrimersQueryGaps \
/cluster/data/ncbi/sts.10/all.primers primers.final.unlifted.psl \
> primers.final.unlifted.fix.psl
# real 0m19.814s
wc -l primers.final.unlifted.fix.psl
# 454869 primers.final.unlifted.fix.psl
# lift results from contigs to chrom coordinates, and create final file
time liftUp -nohead primers.psl \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
primers.final.unlifted.fix.psl
# real 0m2.897s
wc -l primers.psl
# 454869 primers.psl
# Extract relevant info, make alignments unique, and create final file to
# be merged with full sequence alignments
time /cluster/bin/scripts/extractPslInfo primers.psl
# real 0m15.303s
wc -l primers.psl.initial
# 451023 primers.psl.initial
$HOME/kent/src/utils/findAccession.pl -agp primers.psl.initial \
/cluster/data/hg18
wc -l primers.psl.initial.acc
# 451023 primers.psl.initial.acc
/cluster/bin/scripts/getStsId /cluster/data/hg18/bed/sts/stsInfo2.bed \
primers.psl.initial.acc \
| sort -k 4n > primers.final
#rm primers.psl.initial.acc
wc -l primers.final
# 451023 primers.final
# There doesn't appear to be any use for this primers.ids list
# except for curiosity. Check the head and tail of this list to
# verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids
wc -l primers.ids
# 287465 primers.ids
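# The head/tail inspection mentioned above, plus a catch-all for any
# non-numeric ids (sketch):
#   head primers.ids ; tail primers.ids
#   awk '$1 !~ /^[0-9]+$/' primers.ids | head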
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
ssh kkstore02
cd /cluster/data/hg18/bed/sts
time /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final \
primers/primers.final
# real 55m33.254s
wc -l stsMarkers_pos.rdb
# 307082 stsMarkers_pos.rdb
time /cluster/bin/scripts/createSTSbed \
/cluster/data/ncbi/sts.10/stsInfo2.bed stsMarkers_pos.rdb > stsMap.bed
# real 0m13.351s
wc -l stsMap.bed
# 300492 stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir /gbdb/hg18/sts.10/
ln -s /cluster/data/ncbi/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.STS.fa
ln -s /cluster/data/ncbi/sts.10/all.primers.fa \
/gbdb/hg18/sts.10/all.primers.fa
# Load all files
cd /cluster/data/hg18/bed/sts
hgLoadSeq hg18 /gbdb/hg18/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.primers.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg18/sts.10/all.STS.fa
# 93698 sequences
# Adding /gbdb/hg18/sts.10/all.primers.fa
# 306885 sequences
# Updating seq table
# Advisory lock has been released
# All done
# real 1m25.459s
hgsql hg18 < $HOME/kent/src/hg/lib/stsInfo2.sql
hgsql hg18 < $HOME/kent/src/hg/lib/stsAlias.sql
# these two files are already here from previous operations above
# cp /cluster/data/ncbi/sts.10/{stsInfo2.bed,stsAlias.bed} .
hgsql hg18 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg18 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# a couple minutes for each load above
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql \
hg18 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_primer hg18 primers/primers.psl
# load of all_sts_primer did not go as planned: 454869 record(s),
# 0 row(s) skipped, 10 warning(s) loading primers/primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg18 stsMarkers.lifted.psl
# PRUNE stsMap RECORDS (DONE 3/3/06)
hgsql hg18 -e 'delete from stsMap where chromEnd-chromStart > 5000'
###########################################################################
# CREATE HAPLOTYPEPOS TRACK (DONE 1/31/06, Fan)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir haplotypePos
cd haplotypePos
cp /cluster/data/hg18/*hap*/*.fa . -p
ls *.fa|sed -e 's/chr/split1 chr/' |sed -e 's/.fa//' >splitAll
cat << '_EOF_' > split1
echo processing $1
faSplit2 -lift=$1.lft -overlap=500 size $1.fa 3500 split/$1
'_EOF_'
chmod +x split*
mkdir split
mkdir result
./splitAll
ls ./split/*.fa > split.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/store11/gs.19/build36/bed/haplotypePos/hblat1 $(file1) {check out line+ /cluster/store11/gs.19/build36/bed/haplotypePos/result/$(root1).psl}
#ENDLOOP
'_EOF_'
gensub2 split.lst single gsub jobList
ssh pk
cd /cluster/data/hg18/bed/haplotypePos
mkdir result
para create jobList
para try, push, check ...
# Completed: 3091 of 3092 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 33164s 552.73m 9.21h 0.38d 0.001 y
# IO & Wait Time: 172783s 2879.72m 48.00h 2.00d 0.005 y
# Average job time: 67s 1.11m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 300s 5.00m 0.08h 0.00d
# Submission to last job: 743s 12.38m 0.21h 0.01d
# The single job that crashed was due to chr5_h2_hap1368.fa, which
# does not have a decent alignment on chr5.
# collect BLAT results
cat result/*.psl >all.psl
# keep the main alignments
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 all.psl all_filtered.psl all.psr
cat chr*.lft > hap.lft
liftUp lifted.psl hap.lft warn all_filtered.psl -pslQ
mkdir tNibs qNibs
cp -p /cluster/data/hg18/nib/*hap*.nib qNibs
cp -p /cluster/data/hg18/nib/chr5.nib tNibs
cp -p /cluster/data/hg18/nib/chr6.nib tNibs
cp -p /cluster/data/hg18/nib/chr22.nib tNibs
axtChain -psl -linearGap=medium lifted.psl tNibs qNibs out.chain
chainAntiRepeat tNibs qNibs out.chain final.chain
cat << '_EOF_' > hap.chrom.lis
/cluster/data/hg18/nib/chr5.nib
/cluster/data/hg18/nib/chr6.nib
/cluster/data/hg18/nib/chr22.nib
'_EOF_'
ls *.fa >q.lis
chainToPsl final.chain /cluster/data/hg18/chrom.sizes \
/cluster/data/hg18/chrom.sizes hap.chrom.lis q.lis haplotypePos.psl
# took about 20 minutes
hgLoadPsl hg18 haplotypePos.psl
# add haplotypePos entry in trackDb.ra
###########################################################################
# LOAD AFFYRATIO (DONE - 2006-02-01 - Fan)
# Copied from Hg17 doc
# NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop
# -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the
# higher minIdentity is causing alignments to be dropped that should not be.
# e.g.
# /cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
# pslReps can be used to handle filtering at a later step. Blat's minIdentity
# seems to be more severe than that for pslReps as it takes insertions and
# deletions into account.
#
# NOTE FROM QA (brooke, 8/28/07): In the future, run hgLoadBed without the
# -sqlTable=$HOME/src/hg/lib/affyRatio.sql option, so that tableDescriptions
# will be built properly. affyRatio.sql was needed before Jim added bed15
# capability to hgLoadBed (in Oct. 2003), but now bed15 tables can use the
# default bedExp.as and bedExp.sql files.
#
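# If that advice is followed on the next build, the load would presumably be
# just the plain invocation, relying on hgLoadBed's bed15 support (sketch,
# not run here; check current hgLoadBed options before using):
#   hgLoadBed hg18 affyRatio affyRatio.bed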
# Set up cluster job to align consensus/exemplars to hg18
ssh kkstore02
mkdir /cluster/bluearc/hg18/affyGnf
cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa \
/cluster/bluearc/hg18/affyGnf
ssh kkr1u00
mkdir -p /iscratch/i/affyGnf
cp -p /cluster/bluearc/hg18/affyGnf/* /iscratch/i/affyGnf
/cluster/bin/iSync
ssh kki
mkdir /cluster/data/hg18/bed/affyGnf.2004-06-09
cd /cluster/data/hg18/bed/affyGnf.2004-06-09
ls -1 /iscratch/i/affyGnf/* > affy.lst
ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 allctg.lst affy.lst template.sub jobList
mkdir psl
para create jobList
para try, push, check
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 3055s 50.91m 0.85h 0.04d 0.000 y
# IO & Wait Time: 1267s 21.12m 0.35h 0.01d 0.000 y
# Average job time: 11s 0.19m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 78s 1.30m 0.02h 0.00d
# Submission to last job: 367s 6.12m 0.10h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
ssh kkstore02
cd /cluster/data/hg18/bed/affyGnf.2004-06-09
pslSort dirs raw.psl tmp psl
# Change filter parameters for these sequences: only use alignments that
# cover 30% of the sequence and have at least 95% identity in the aligned
# region. (minAli=0.97 was too high; minCover is kept low because these
# sequences contain a lot of N's.)
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Eliminate the long names
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg18/bed/affyGnf.2004-06-09
bash
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \
affyU95shortQname.psl \
/projects/compbio/data/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \
affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg18 \
affyRatio affyRatio.bed
# Loaded 13043 elements of size 15
mkdir affyU95
hgLoadPsl hg18 -table=affyU95 affyU95shortQname.psl
# sequences loaded 2006-02-01
hgLoadSeq -abbr=U95Av2: hg18 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# 12386 sequences
# Updating seq table
# Advisory lock has been released
# All done
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set. (DONE - 2006-02-01 - Fan)
ssh hgwdev
mkdir /cluster/data/hg18/bed/affyUclaNorm
cd /cluster/data/hg18/bed/affyUclaNorm
cp -p /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
ssh pk
cd /cluster/data/hg18/bed/affyUclaNorm
ls -1 /scratch/hg/gs.19/build36/maskedContigs/* > contig.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
mkdir psl
ls HG-U133AB_all.fa > affy.lst
gensub2 contig.lst affy.lst gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 6766s 112.77m 1.88h 0.08d 0.000 y
# IO & Wait Time: 1541s 25.68m 0.43h 0.02d 0.000 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 202s 3.37m 0.06h 0.00d
# Submission to last job: 302s 5.03m 0.08h 0.00d
ssh kkstore02
cd /cluster/data/hg18/bed/affyUclaNorm
pslSort dirs hg18.affyU133AB_all.psl tmp psl
wc hg18.affyU133AB_all.psl
# 62043 1302842 13163424 hg18.affyU133AB_all.psl
liftUp hg18.affyU133AB_all.lifted.psl \
/cluster/data/hg18/jkStuff/liftAll.lft warn hg18.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \
-nearTop=0.005 hg18.affyU133AB_all.lifted.psl \
hg18.affyU133AB_all.lifted.pslReps.psl out.psr
# Processed 62038 alignments
~/kent/src/hg/affyGnf/affyUclaMergePslData \
-pslFile=hg18.affyU133AB_all.lifted.pslReps.psl \
-affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \
-bedOut=hg18.affyUcla.bed \
-expRecordOut=hg18.affyUcla.expRecords \
-expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg18.affyUcla.expRecords \
/projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg18.affyUcla.annotations.expRecords
# Load the databases
ssh hgwdev
cd /cluster/data/hg18/bed/affyUclaNorm
sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql > affyUclaNorm.sql
hgLoadBed hg18 affyUclaNorm hg18.affyUcla.bed -sqlTable=affyUclaNorm.sql
############################################################################
# MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2006-02-01 - Fan)
# Someday the names can be fixed.
ssh hgwdev
mkdir /cluster/data/hg18/bed/affyU133
cd /cluster/data/hg18/bed/affyU133
ln -s ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl affyU133.psl
hgLoadPsl hg18 affyU133.psl
hgsql -e "select count(*) from affyU133;" hg18
# row count in hg17: 44620, in hg18: 45559
hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# 44792 sequences
# GNF ATLAS 2 (DONE - 2006-02-01 - Fan)
# Align probes from GNF1H chip.
ssh pk
cd /cluster/data/hg18/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
# This bluearc/geneAtlas2 directory already exists
# mkdir -p /cluster/bluearc/geneAtlas2
# cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.19/build36/maskedContigs > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.19/build36/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 4038s 67.29m 1.12h 0.05d 0.000 y
# IO & Wait Time: 2182s 36.37m 0.61h 0.03d 0.000 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 250s 4.17m 0.07h 0.00d
# Submission to last job: 322s 5.37m 0.09h 0.00d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# Processed 79733 alignments
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg18/bed/geneAtlas2
# Already symlinked
# ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \
# /gbdb/hgFixed/affyProbes
hgLoadPsl hg18 affyGnf1h.psl
hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl \
| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl /cluster/data/hg18/bed/geneAtlas2/affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 32926, multiply-mapped 2000, missed 48, unmapped 11770
hgLoadBed hg18 gnfAtlas2 gnfAtlas2.bed
# Loaded 34926 elements of size 15
########################################################################
# Creating the ideoband data track (DONE - 2006-02-02 - Hiram)
# This was reloaded upon completion of the cytoband sequence
# mentioned above.
# Received the following files in email from Wonhee Jang from NCBI:
# -rw-rw-r-- 1 1917 Feb 2 14:01 setBands.txt
# -rw-rw-r-- 1 39058 Feb 2 14:01 human_ideogram.dat
# -rw-rw-r-- 1 673148 Feb 2 14:01 fish.markers.bed
# placed them into /cluster/data/hg18/bed/ideogram
ssh hgwdev
mkdir /cluster/data/hg18/bed/ideogram
cd /cluster/data/hg18/bed/ideogram
cat << '_EOF_' > mkBands.sh
#!/bin/sh
T=/cluster/data/hg18/bed/ideogram
HI=${T}/human_ideogram.dat
FM=${T}/fish.markers.bed
SB=${T}/setBands.txt
bander chr1 ${HI} ${FM} ${SB} 1 247199719 100 2.0 2
bander chr2 ${HI} ${FM} ${SB} 2 242751149 100 2.0 2
bander chr3 ${HI} ${FM} ${SB} 3 199446827 100 2.0 2
bander chr4 ${HI} ${FM} ${SB} 4 191263063 100 2.0 2
bander chr5 ${HI} ${FM} ${SB} 5 180837866 100 2.0 2
bander chr6 ${HI} ${FM} ${SB} 6 170896992 100 2.0 2
bander chr7 ${HI} ${FM} ${SB} 7 158821424 100 2.0 2
bander chr8 ${HI} ${FM} ${SB} 8 146274826 100 2.0 2
bander chr9 ${HI} ${FM} ${SB} 9 140273252 100 2.0 2
bander chr10 ${HI} ${FM} ${SB} 10 135374737 100 2.0 2
bander chr11 ${HI} ${FM} ${SB} 11 134452384 100 2.0 2
bander chr12 ${HI} ${FM} ${SB} 12 132289534 100 2.0 2
bander chr13 ${HI} ${FM} ${SB} 13 114127980 100 2.0 2
bander chr14 ${HI} ${FM} ${SB} 14 106360585 100 2.0 2
bander chr15 ${HI} ${FM} ${SB} 15 100338915 100 2.0 2
bander chr16 ${HI} ${FM} ${SB} 16 88822254 100 2.0 2
bander chr17 ${HI} ${FM} ${SB} 17 78654742 100 2.0 2
bander chr18 ${HI} ${FM} ${SB} 18 76117153 100 2.0 2
bander chr19 ${HI} ${FM} ${SB} 19 63806651 100 2.0 2
bander chr20 ${HI} ${FM} ${SB} 20 62435964 100 2.0 2
bander chr21 ${HI} ${FM} ${SB} 21 46944323 100 2.0 2
bander chr22 ${HI} ${FM} ${SB} 22 49591432 100 2.0 2
bander chrX ${HI} ${FM} ${SB} X 154913754 100 2.0 2
bander chrY ${HI} ${FM} ${SB} Y 57443437 100 2.0 2
for I in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
cat chr${I}.bed
done > cytobands.bed
'_EOF_'
# happy emacs
chmod +x mkBands.sh
./mkBands.sh
# should be 862
wc cytobands.bed
# 862 4310 29911 cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
hg18 cytoBand cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
hg18 cytoBandIdeo cytobands.bed
############################################################################
# H-INVITATIONAL GENE ANNOTATION DATABASE (DONE 2006-02-02, Fan)
# http://www.jbirc.aist.go.jp/hinv/top.html
# Create knownGene table to reference HINV gene ID's
# for link on knownGenes details page
# Also, create an HINV gene track
# download CDNA file release 2.2 (Jan 20, 2006) -- got release # from downloads page.
ssh kkstore03
cd /cluster/data/hinv
mkdir 2005-02-02
cd 2005-02-02
wget --timestamp http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.2.2
# set up assembly work area
ssh kkstore02
cd /cluster/data/hg18
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > ids.txt
paste accessions.txt ids.txt > queries.txt
wc -l ids.txt
# 56419 ids.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
ssh hgwdev
cd /cluster/data/hg18/bed/hinv
hgsql hg18 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
ssh kkstore02
cd /cluster/data/hg18/bed/hinv
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
# NEXT TIME, LOAD HInvGeneMrna TABLE AFTER HInv TABLE IS LOADED TO AVOID
# joinerCheck TO COMPLAIN.
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg18/bed/hinv
hgLoadPsl hg18 -table=HInvGeneMrna hinv_mrna.psl
hgsql hg18 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg18.mrna
hgsql hg17 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
wc -l hg*.mrna
# 41023 hg17.mrna
# 54974 hg18.mrna
comm -1 -3 *.mrna > hg18.aligned
wc -l hg18.aligned
# 14758 (transcripts newly aligned in hg18)
comm -2 -3 *.mrna > hg17.aligned
wc -l hg17.aligned
# 807 (transcripts no longer aligned in hg18)
comm -2 -3 ids.txt hg18.mrna > hg18.notaligned
wc -l hg18.notaligned
# 1445 (transcripts not aligned in hg18 -- checking on why...)
# also make a table with various useful items for each transcript
ssh hgwdev
hgsql hg18 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg18/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg18
hgsql hg17 -s -e "select count(*) from HInv"
# 41118
hgsql hg18 -s -e "select count(*) from HInv"
# 56419
# !!! DO THIS AFTER KG IS BUILT !!!
# DONE (4/13/06 Fan).
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg18/bed/hinv
hgMapToGene hg18 HInvGeneMrna knownGene knownToHInv
# QA NOTE (3-6-2006): did a mytouch to update the time for the HInvGeneMrna table
# (because joinerCheck was complaining during -times check):
# sudo mytouch hg18 HInvGeneMrna 200602031600.00
# touch -t 200602031600.00 /var/lib/mysql/hg18/HInvGeneMrna.MYD
# PRODUCE FUGU BLAT ALIGNMENT (DONE - 2006-02-02 - Fan)
ssh kk
mkdir /cluster/data/hg18/bed/blatFr1
cd /cluster/data/hg18/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f ( `cat /cluster/data/hg18/contig.lst` )
set c=$f:t:r
echo $c
mkdir psl/$c
end
# create cluster job
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.19/build36/maskedContigs/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg18/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << keep emacs happy
gensub2 human.lst fugu.lst gsub jobList
para create jobList
# 218484 jobs written to batch
para try
para check
para push -maxQueue=300000 -maxPush=220000
para check
# Completed: 218484 of 218484 jobs
# CPU time in finished jobs: 5073329s 84555.48m 1409.26h 58.72d 0.161 y
# IO & Wait Time: 692572s 11542.87m 192.38h 8.02d 0.022 y
# Average job time: 26s 0.44m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 910s 15.17m 0.25h 0.01d
# Submission to last job: 14753s 245.88m 4.10h 0.17d
# cd ../psl
# count files with aligments
# find . -not -size 427c | wc -l
# 44458
# count files with no aligments
# find . -size 427c | wc -l
# 174405
# When cluster run is done, sort alignments
# into chrom directory
ssh kkstore02
cd /cluster/data/hg18/bed/blatFr1
pslCat -dir psl/N?_?????? | \
liftUp -type=.psl stdout \
/cluster/data/hg18/jkStuff/liftAll.lft warn stdin | \
pslSortAcc nohead chrom temp stdin
# Processed 218887 lines into 1 temp files
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg18/bed/blatFr1/chrom
foreach i (chr*.psl)
set r = $i:r
echo mv $i ${r}_blatFr1.psl
mv $i ${r}_blatFr1.psl
end
# lift fugu scaffolds to Fugu browser chrUn,
# so you can link to other browser. And don't need to load sequence
cd /cluster/data/hg18/bed/blatFr1
liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
hgLoadPsl -table=blatFr1 hg18 all.psl
nice featureBits hg18 blatFr1 refGene:CDS
# 14636876 bases of 2881515245 (0.508%) in intersection
nice featureBits hg17 blatFr1 refGene:CDS
# 14488047 bases of 2866216770 (0.505%) in intersection
#######################################################################
# OPOSSUM BLASTZ - (DONE - 2006-02-10 - Hiram)
ssh kk
# this was done again after this, see 2006-02-13
mkdir /cluster/data/hg18/bed/blastzMonDom4.2006-02-10
cd /cluster/data/hg18/bed/blastzMonDom4.2006-02-10
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit
SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-10
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
# running 2006-02-10
# Completed: 43469 of 43470 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 25745592s 429093.20m 7151.55h 297.98d 0.816 y
# IO & Wait Time: 8466642s 141110.70m 2351.85h 97.99d 0.268 y
# Average job time: 787s 13.12m 0.22h 0.01d
# Longest finished job: 51561s 859.35m 14.32h 0.60d
# Submission to last job: 103470s 1724.50m 28.74h 1.20d
# There wasn't actually an outstanding job; it had already completed.
# Completed: 345 of 345 jobs
# CPU time in finished jobs: 620s 10.33m 0.17h 0.01d 0.000 y
# IO & Wait Time: 1631s 27.19m 0.45h 0.02d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 255s 4.25m 0.07h 0.00d
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 224697s 3744.94m 62.42h 2.60d 0.007 y
# IO & Wait Time: 4790s 79.84m 1.33h 0.06d 0.000 y
# Average job time: 4683s 78.06m 1.30h 0.05d
# Longest finished job: 115041s 1917.35m 31.96h 1.33d
# Submission to last job: 115147s 1919.12m 31.99h 1.33d
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -stop=net `pwd`/DEF > cat-net.out 2>&1 &
# running 2006-02-11
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=load -stop=load `pwd`/DEF > load.out 2>&1 &
ssh kolossus
cd /cluster/data/hg18/bed/blastz.monDom4
time nice -n +19 featureBits hg18 chainMonDom4Link \
> fb.hg18.chainMonDom4Link 2>&1 &
cat fb.hg18.chainMonDom4Link
# 356865888 bases of 2881515245 (12.385%) in intersection
####################################################################################
# BUILD KNOWN GENES TABLES (STARTED 2/1/06, DONE 2/13/06 Fan)
# First build protein databases, sp060115 and proteins060115
# See makeProteins060115.doc for details.
# Create working subdirectories and temporary databases (kgHg18A)
ssh hgwdev
cd /cluster/store11/kg
mkdir kgHg18A
ln -s /cluster/store11/kg/kgHg18A /cluster/store6/kgDB/bed/kgHg18A
ln -s /cluster/store11/kg/kgHg18A /cluster/data/hg18/bed/kgHg18A
hgsql hg18 -e "create database kgHg18A"
hgsql hg18 -e "create database kgHg18ATemp"
mkdir /cluster/bluearc/kgDB/kgHg18A
mkdir /cluster/bluearc/kgDB/kgHg18A/protBlat
ln -s /cluster/bluearc/kgDB/kgHg18A/protBlat /cluster/store11/kg/kgHg18A/protBlat
cd /cluster/store11/kg/kgHg18A/protBlat
# Get all human protein sequences
hgsql -N sp060115 -e \
'select p.acc, p.val from protein p, accToTaxon x where x.taxon=9606 and p.acc=x.acc'\
|awk '{print ">" $1;print $2}' >humanProt.fa
hgsql -N sp060115 -e \
'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=9606 and v.parAcc=x.acc'\
|awk '{print ">" $1;print $2}' \
>humanVarProt.fa
# append var proteins to humanProt.fa
cat humanVarProt.fa >>humanProt.fa
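# (Optional sanity check, not part of the original build: count the protein
# records in the combined fasta before splitting it for the cluster run.)
grep -c '^>' humanProt.fa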
# Prepare and perform cluster run for protein/genome alignment
ssh pk
cd /cluster/data/hg18/bed/kgHg18A/protBlat
mkdir prot
faSplit sequence humanProt.fa 2000 prot/prot
ls /cluster/bluearc/kgDB/kgHg18A/protBlat/prot/* > prot.lis
ssh hgwdev
cd /cluster/data/hg18/bed/kgHg18A/protBlat
hgsql hg18 -N -e 'select chrom from chromInfo' > chrom.lis
exit
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/hg18/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg18A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para try
para check
para push
para check ...
# Completed: 97020 of 97020 jobs
# CPU time in finished jobs: 16070335s 267838.92m 4463.98h 186.00d 0.510 y
# IO & Wait Time: 279789s 4663.15m 77.72h 3.24d 0.009 y
# Average job time: 169s 2.81m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 152051s 2534.18m 42.24h 1.76d
# Submission to last job: 152235s 2537.25m 42.29h 1.76d
# This cluster run took a little less than 2 days.
# collect BLAT results
pslSort -nohead dirs raw.psl temp result
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null
ssh hgwdev
cd /cluster/bluearc/kgDB/kgHg18A/protBlat
hgLoadPsl hg18 protBlat.psl
# create all_mrna.psl and tight_mrna.psl
hgsql hg18 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
all_mrna.psl tight_mrna.psl /dev/null
# Save a copy of the following hg18 tables:
all_mrna
gbCdnaInfo
gbExtFile
gbLoaded
gbSeq
gbStatus
genbank.lis
refFlat
refGene
refLink
refSeqAli
refSeqStatus
refSeqSummary
xenoMrna
xenoRefFlat
xenoRefGene
xenoRefSeqAli
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis
mv protein.lis ..
# Load spMrna.tab into spMrna table in temp DB.
hgsql kgHg18ATemp < ~/src/hg/lib/spMrna.sql
hgsql kgHg18ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgHg18ATemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/hg18/bed/kgHg18A
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg18 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgHg18ATemp DB.
faToTab mrna.fa mrnaSeq.tab
hgsql kgHg18ATemp -e 'drop table mrnaSeq'
hgsql kgHg18ATemp <~/src/hg/lib/mrnaSeq.sql
hgsql kgHg18ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
# Prepare files for cluster run
cd /cluster/bluearc/kgDB/kgHg18A
~/src/hg/protein/KG2B.sh kgHg18A hg18 060115
# Perform cluster run of protein/mRNA alignment
~/src/hg/protein/KG3.sh kgHg18A hg18 060115
# Collect cluster run results
cd kgBestMrna
ls out | sed -e 's/prot/do1 prot/g' >doall
# create do1 with the following 2 lines:
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
wc protMrna.lis
# Load BLAT results into temp DB.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A/kgBestMrna
hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaBlat.sql
hgsql kgHg18ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
hgsql kgHg18ATemp -e 'create index tName on protMrnaBlat(tName)'
# Create CDS files from protein/mRNA alignment results.
hgsql kgHg18ATemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
|sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds
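# (Illustrative, not part of the original build: after the sed edits, each
# line of protMrna.cds should look roughly like
#   P12345_NM_012345<tab>101..456
# i.e. a protein_mRNA composite ID plus a GenBank-style CDS span, which is
# the form mrnaToGene -cdsFile expects below. The accessions shown are made up.)
head -3 protMrna.cds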
# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp
paste j1.tmp j3.tmp j2.tmp >protMrna.psl
rm j1.tmp j2.tmp j3.tmp
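# (Optional check, not part of the original build: a PSL row has 21 columns
# (9 + 1 + 11 from the three pasted pieces), so verify the paste produced
# well-formed rows before running mrnaToGene below.)
awk -F'\t' 'NF != 21 {bad++} END {print (bad + 0) " malformed rows"}' protMrna.psl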
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
exit
# move kgBestMrna to /san/sanvol1 to save space on store11
mv /cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna
ln -s /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna/clusterRun \
/cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun
# Prepare refGene and all_mrna gp files.
cd ..
cp -p base/refGene.tab ref.gp
# hgsql hg18 -N -e 'select * from refGene' >ref.gp
hgsql hg18 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat/protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat/protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
-selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgsql kgHg18ATemp -e 'drop table spRef'
hgsql kgHg18ATemp <~/src/hg/lib/spRef.sql
hgsql kgHg18ATemp -e 'load data local infile "spRef.tab" into table spRef'
# Prepare and perform cluster runs for protein/RefSeq alignments
~/src/hg/protein/KGRef2.sh kgHg18A hg18 060115
# Took 7 hours. This step should be investigated and improved.
~/src/hg/protein/KGRef3.sh kgHg18A hg18 060115
cd kgBestRef
ls out | sed -e 's/prot/do1 prot/g' >doall
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
wc protRef.lis
hgsql kgHg18ATemp -e 'drop table protRefBlat'
hgsql kgHg18ATemp < ~/src/hg/lib/protRefBlat.sql
hgsql kgHg18ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
hgsql kgHg18ATemp -e 'create index tName on protRefBlat(tName)'
# Run gene-check to filter out invalid gp entries
cd /cluster/data/hg18/bed/kgHg18A
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg18/nib kgCandidate0.gp kgCandidate0.check
hgsql kgHg18ATemp -e 'drop table kgCandidate0'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg18ATemp -e 'drop table geneCheck'
hgsql kgHg18ATemp < ~/src/hg/lib/geneCheck.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg18ATemp hg18 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg18ATemp -e 'drop table kgCandidate'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg18ATemp -e 'create index alignID on kgCandidate(alignID)'
# ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST
# FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC.
# #######
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
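# (Illustrative, not part of the original build: the cut/paste above simply
# moves column 11 of kgCandidate.tab (alignID) in front of columns 2-10, so
# the genePred name field of kgCandidateX carries the alignID. Spot-check:)
cut -f 1 kgCandidateX.tab | head -3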
hgsql kgHg18ATemp -e 'drop table kgCandidateX'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
ln -s protBlat/protein.lis protein.lis
kgResultBestMrna2 060115 kgHg18ATemp hg18 protMrnaBlat|sort -u >protMrnaBlatScore.tab
kgResultBestRef2 060115 kgHg18ATemp hg18 protRefBlat|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg18ATemp -e 'drop table protMrnaScore'
hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg18ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg18ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg18ATemp 060115 kgCandidateX jY.tmp
cat jY.tmp |sort -u >kgCandidateY.tab
rm jY.tmp
hgsql kgHg18ATemp -e 'drop table kgCandidateY'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg18ATemp kgCandidateZ.tab
hgsql kgHg18ATemp -e 'drop table kgCandidateZ'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg18ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure.
kgPick kgHg18ATemp hg18 sp060115 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab
# Create put back list
# gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
gbGetSeqs2 -gbRoot=/cluster/data/genbank -db=hg18 -get=ra RefSeq mrna ref.ra
cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab
hgsql hg18 -e 'drop table refRa'
hgsql hg18 < ~/src/hg/lib/refRa.sql
hgsql hg18 -e 'load data local infile "refRa.tab" into table refRa ignore 1 lines'
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
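# (Optional, not part of the original build: count the distinct RefSeq
# accessions on the put-back list before loading it; column 1 is r.acc.)
cut -f 1 kgPutBack2.tab | sort -u | wc -l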
hgsql kgHg18ATemp -e 'drop table kgPutBack2'
hgsql kgHg18ATemp < ~/src/hg/lib/kgPutBack2.sql
hgsql kgHg18ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2'
kgPutBack kgHg18ATemp hg18 sp060115 kgPutBack2 kgPutBack2.gp
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.
# No matching protein found for NM_052987.
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.
# Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab
hgsql kgHg18ATemp -e 'drop table knownGene'
hgsql kgHg18ATemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg18ATemp -e 'load data local infile "knownGene.tab" into table knownGene'
# Load data into hg18 knownGene table.
hgsql hg18 -e 'drop table knownGene'
hgsql hg18 < ~/src/hg/lib/knownGene.sql
hgsql hg18 -e 'load data local infile "knownGene.tab" into table knownGene'
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.
hgsql hg18 -e 'drop table dupSpMrna'
hgsql hg18 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg18 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'
# Perform analysis on KG
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg18ATemp hg18 060115
hgsql hg18 -e 'drop table knownGeneMrna'
hgsql hg18 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg18 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg18 -e 'drop table knownGenePep'
hgsql hg18 < ~/src/hg/lib/knownGenePep.sql
hgsql hg18 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build kgXref table
kgXref2 kgHg18ATemp 060115 hg18
hgsql hg18 -e 'drop table kgXref'
hgsql hg18 < ~/src/hg/lib/kgXref.sql
hgsql hg18 -e 'load data local infile "kgXref.tab" into table kgXref'
# Build spMrna table
hgsql hg18 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab
hgsql hg18 -e 'drop table spMrna'
hgsql hg18 <~/src/hg/lib/spMrna.sql
hgsql hg18 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg18A hg18 060115
# Found that the kgProtMap table had fewer than 20,000 rows,
# indicating that many entries were missing. The problem was
# that tight_mrna.psl was now located in ~/hg18Kg/protBlat.
# Manually ran the following to correct the problem:
cd ~/hg18Kg/kgProtMap/psl.tmp
cat ~/hg18Kg/protBlat/tight_mrna.psl refSeqAli.psl > both.psl
pslMap kgProtMrna.psl both.psl stdout | sort -u| \
sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl
hgsql hg18 -e "drop table kgProtMap;"
hgLoadPsl -tNameIx hg18 kgProtMap.psl
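# (Optional, not part of the original build: confirm the reloaded kgProtMap
# table is now well above the ~20,000 rows seen before the fix.)
hgsql hg18 -N -e 'select count(*) from kgProtMap'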
#####################################
# Build alias tables.
kgAliasM hg18 proteins060115
# kgAliasKgXref reads from hg18.knownGene.proteinID,
# hg18.knownGene.name, hg18.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg18
# kgAliasRefseq reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg18
hgsql sp060115 -N -e 'select name,gene.val from hg18.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql hg18 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg18 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg18
hgsql hg18 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg18 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.knownGene.alignID,
# proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
# to create kgProtAlias.tab
kgProtAlias hg18 060115
hgsql hg18 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg18 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg18 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg18 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg18 -e "drop table kgProtAlias;"
hgsql hg18 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg18 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
hgsql hg18 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg18 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg18.kgSpAlias.tab
rm j.tmp
hgsql hg18 -e 'drop table kgSpAlias';
hgsql hg18 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg18 -e 'load data local infile "hg18.kgSpAlias.tab" into table kgSpAlias'
# QA NOTE (3-6-2006): did a mytouch to update the time for the knownGene table
# (because joinerCheck was complaining during -times check):
# [hgwdev:~/joiner> sudo mytouch hg18 knownGene 200602061707
# touch -t 200602061707 /var/lib/mysql/hg18/knownGene.MYD
# MAKE FOLDUTR TABLES (DONE 2006-02-09, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir rnaStruct.2006-02-09
rm rnaStruct
ln -s rnaStruct.2006-02-09 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg18 knownGene utr3 utr3/utr.fa
utrFa hg18 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh pk
cd /cluster/data/hg18/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 36097 of 36097 jobs
# CPU time in finished jobs: 335580s 5593.00m 93.22h 3.88d 0.011 y
# IO & Wait Time: 653230s 10887.16m 181.45h 7.56d 0.021 y
# Average job time: 27s 0.46m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1730s 28.83m 0.48h 0.02d
# Submission to last job: 6007s 100.12m 1.67h 0.07d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 34011 of 34011 jobs
# CPU time in finished jobs: 78543s 1309.05m 21.82h 0.91d 0.002 y
# IO & Wait Time: 938250s 15637.50m 260.62h 10.86d 0.030 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5873s 97.88m 1.63h 0.07d
# Submission to last job: 6139s 102.32m 1.71h 0.07d
# Load database
ssh hgwdev
cd /cluster/data/hg18/bed/rnaStruct/utr5
hgLoadRnaFold hg18 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg18 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build KEGG pathway tables. DONE 5/19/05. Fan.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg18A hg18 060115
hgsql hg18 -e "drop table keggMapDesc"
hgsql hg18 -e "drop table keggPathway"
hgsql hg18 <~/src/hg/lib/keggMapDesc.sql
hgsql hg18 <~/src/hg/lib/keggPathway.sql
hgsql hg18 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg18 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
# RELOAD cgapAlias TABLE AFTER REMOVING DUPLICATE ROWS (hartera, 2005-07-26)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
cd ..
mkdir cgap
cd cgap
~/src/hg/protein/KGcgap.sh kgHg18A hg18 060115
cat cgapBIOCARTAdesc.tab |sort -u > cgapBIOCARTAdescSorted.tab
hgsql hg18 -e "drop table cgapAlias"
hgsql hg18 -e "drop table cgapBiocDesc"
hgsql hg18 -e "drop table cgapBiocPathway"
hgsql hg18 <~/src/hg/lib/cgapAlias.sql
hgsql hg18 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg18 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg18 -e 'load data local infile "cgapAlias.tab" \
into table cgapAlias'
hgsql hg18 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg18 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
# Build hg18 PROTEOME BROWSER TABLES
# These are instructions for building tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This build is based on proteins DBs dated 060115.
# Create the working directory
ssh hgwdev
mkdir /cluster/store11/kg/kgHg18A/pb-2006-02-10
cd /cluster/data/hg18/bed
rm pb
ln -s /cluster/store11/kg/kgHg18A/pb-2006-02-10 pb
cd pb
# Define pep* tables in hg18 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg18 < pepAll.sql
# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg18 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg18 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql hg18 -e 'delete from pepPi'
hgsql hg18 -e 'load data local infile "pepPi.tab" into table hg18.pepPi'
# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 9606 hg18 >pbCalDist.out
wc pbCalDist.out
hgsql hg18
load data local infile "pepExonCntDist.tab" into table hg18.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg18.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg18.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg18.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg18.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg18.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg18.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp060115 9606 hg18
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg18 -e "drop table pbAnomLimit"
hgsql hg18 -e "drop table pbResAvgStd"
hgsql hg18 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg18 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg18 -e 'load data local infile "pbResAvgStd.tab" into table hg18.pbResAvgStd;'
hgsql hg18 -e 'load data local infile "pbAnomLimit.tab" into table hg18.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg17 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'
# Turn on Proteome Browser for hg18.
hgsql -e 'delete from dbDb where name="hg18"' \
-h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
"chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \
"/gbdb/hg18/html/description.html", 0, 1, "NCBI Build 36.1");' \
-h genome-testdb hgcentraltest
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke the Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'
# Perform preliminary review of Proteome Browser for hg18, then
# notify QA for formal review.
# First build entrez DB tables.
cd /cluster/store10/entrez
mkdir 060208
ln -s /cluster/store10/entrez/060208 /cluster/data/entrez/060208
cd /cluster/data/entrez/060208
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
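# (Note, not part of the original build: in gene2accession/gene2refseq,
# column 2 is the Entrez GeneID, column 4 the RNA accession.version and
# column 6 the protein accession.version; the sed splits the accession from
# its version number at the dot. Spot-check the resulting tab files:)
head -2 entrezMrna.tab entrezRefseq.tab entrezRefProt.tab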
hgsql entrez -e 'drop table entrezRefseq'
hgsql entrez -e 'drop table entrezMrna'
hgsql entrez -e 'drop table entrezRefProt'
hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
hgsql entrez < ~/src/hg/lib/entrezMrna.sql
hgsql entrez < ~/src/hg/lib/entrezRefProt.sql
hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'
cd /cluster/store11/kg/kgHg18A
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgsql hg18 -e 'drop table mrnaRefseq'
hgsql hg18 < ~/src/hg/lib/mrnaRefseq.sql
hgsql hg18 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 2/16/06 Fan)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProtAlias tables. The hgKgGetText step takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A
mkdir index
cd index
hgKgGetText hg18 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
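# (Optional, not part of the original build: confirm the text dump and the
# index files are non-trivial in size before linking them into /gbdb.)
wc -l knownGene.text
ls -lh knownGene.ix knownGene.ixx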
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ix /gbdb/hg18/knownGene.ix
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ixx /gbdb/hg18/knownGene.ixx
# BUILD KNOWN GENE LIST FOR GOOGLE. (REDONE 8/12/08 JK)
# make knownGeneLists.html hg18GeneList.html mm5GeneList.html rm3GeneList.html
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18
##################################################################################
# Create description.html for hg18
mkdir -p ~/kent/src/hg/makeDb/trackDb/human/hg18
cd ~/kent/src/hg/makeDb/trackDb/human/hg18
cp ../hg17/description.html .
vi description.html
# Change release date and build number and change hg17 to hg18
# Check it into CVS
mkdir -p /cluster/data/hg18/html
cp -p description.html /cluster/data/hg18/html
ln -s /cluster/data/hg18/html/description.html /gbdb/hg18/html/description.html
# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-02-11, DONE 2006-02-14 - Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg18/bed/geneSorter.2006-02-11
# remove old symbolic link
rm /cluster/data/hg18/bed/geneSorter
ln -s /cluster/data/hg18/bed/geneSorter.2006-02-11 /cluster/data/hg18/bed/geneSorter
cd /cluster/data/hg18/bed/geneSorter
hgClusterGenes hg18 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg18/bed/geneSorter/blastp
cd /cluster/data/hg18/bed/geneSorter/blastp
pepPredToFa hg18 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg18/blastp
mkdir -p /cluster/bluearc/hg18/blastp
cp -p /cluster/data/hg18/bed/geneSorter/blastp/known.* /cluster/bluearc/hg18/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg18/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh pk
mkdir /cluster/data/hg18/bed/geneSorter/blastp/self
cd /cluster/data/hg18/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg18/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7733 of 7733 jobs
# CPU time in finished jobs: 56608s 943.47m 15.72h 0.66d 0.002 y
# IO & Wait Time: 467120s 7785.33m 129.76h 5.41d 0.015 y
# Average job time: 68s 1.13m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 119s 1.98m 0.03h 0.00d
# Submission to last job: 1433s 23.88m 0.40h 0.02d
# Load into database. This takes about 20 minutes
ssh hgwdev
cd /cluster/data/hg18/bed/geneSorter/blastp/self/run/out
bash
time hgLoadBlastTab hg18 knownBlastTab *.tab
# Scanning through 7733 files
# Loading database with 9647176 rows
# real 21m51.039s
cd /cluster/data/hg18/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg18 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg18
# row count changed to 34267
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg18 > refToLl.txt
hgMapToGene hg18 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg18
# row count changed to 34267
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg18 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg18
# row count changed to 34177
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg18 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg18
# row count changed to 32015
# Create expression distance table - takes about an hour
hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32015 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg18
# row count changed to 32015000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg18 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg18
# row count changed to 32632
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg18 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32965 unique elements in affyUclaNorm
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg18 affyU95 knownGene knownToU95
# row count changed to 17401
# the hgFixed.gnfHumanU95Exps argument is unused, so that table does not need to exist
hgExpDistance hg18 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16378 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16378000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg18 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2006-02-11, Fan)
# Loaded the HG-U133 Plus 2 sequences for hg18 (DONE, 2006-03-29, hartera)
# The below was already done.
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
# Go to http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# and download the consensus and exemplar sequences to this directory
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
unzip HG-U133_Plus_2_consensus.zip
unzip HG-U133_Plus_2_exemplar.zip
cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa
perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \
U133Plus2_all.fa
# remove ";" from probe set names
perl -pi.bak -e "s/;//" U133Plus2_all.fa
# clean up
rm *.zip *.bak
# Set up cluster job to align consensus/exemplars to hg18
ssh kkr1u00
mkdir -p /iscratch/i/affy
mv /cluster/data/hg18/bed/affyU133Plus2.2006-02-11/U133Plus2_all.fa \
/iscratch/i/affy
iSync
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The above is already done by Rachel during hg17 build.
ssh hgwdev
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
mkdir -p /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
cp -p U133Plus2_all.fa /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
ssh kk
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push ...
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 24764s 412.74m 6.88h 0.29d 0.001 y
# IO & Wait Time: 13823s 230.38m 3.84h 0.16d 0.000 y
# Average job time: 102s 1.70m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 782s 13.03m 0.22h 0.01d
# Submission to last job: 827s 13.78m 0.23h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
# Use filter parameters suited to these sequences: only keep alignments that
# cover 30% of the sequence and have at least 95% identity in the aligned region.
# minAli=0.97 proved too high; minCover is kept low because these sequences contain many N's.
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
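# (Optional, not part of the original build: rough count of distinct probe
# set names that survived best-in-genome filtering; PSL column 10 is the
# query name. A few header lines from pslReps, if present, inflate the
# count slightly.)
cut -f 10 affyU133Plus2.psl | sort -u | wc -l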
# load into the database
ssh hgwdev
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
hgLoadPsl hg18 affyU133Plus2.psl
# The below was already done.
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The above is already done by Rachel during hg17 build.
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
# the sequences need to be loaded for the hg18 database
# (2006-03-29, hartera)
hgLoadSeq -abbr=U133+2: hg18 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab
# Added knownToU133Plus2 track
cd /cluster/data/hg18/bed/geneSorter
hgMapToGene hg18 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 34745
# Make knownToCdsSnp table (DONE Sept 12, 2007, jk)
ssh hgwdev
hgMapToGene hg18 snp126 knownGene knownToCdsSnp -all -cds
# approx. 5 minutes running time
# UPDATE GO DATABASE
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20060211
cd /cluster/store1/geneOntology/20060211
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200601-assocdb-data.gz
hgsql mysql <<end
create database go060211;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go060211 <j.tmp
rm j.tmp
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
# The format of gene_association.goa_uniprot.gz changed; there are now 6 comment lines at the head.
# Updated hgGoAssociation.c to skip the first 6 lines.
zcat gene_association.goa_uniprot.gz | /cluster/home/fanhsu/bin/i386/hgGoAssociation go060211 goaPart stdin
# Passed 6832447 of 7933823 of 7933823, 86.12%
# Ask sys-admin to switch the database pointer go to point to go060211.
# HGNEAR PROTEIN BLAST TABLES (DONE 2/12/06 Fan)
ssh hgwdev
mkdir /cluster/data/hg18/bed/hgNearBlastp
cd /cluster/data/hg18/bed/hgNearBlastp
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix human
targetDb hg18
queryDbs mm7 rn3 danRer3 ce2 sacCer1 dm2
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa
buildDir /cluster/data/hg18/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/hg18HgNearBlastp
_EOF_
# doHgNearBlastp.pl config.ra >& do.log &
doHgNearBlastp.pl config.ra >do3.log
# tail -f do.log
# 0657.tab dm2_0658.tab dm2_0659.tab dm2_0660.tab dm2_0661.tab dm2_0662.tab dm2_0663.tab dm2_0664.tab dm2_0665.tab dm2_0666.tab dm2_0667.tab dm2_0668.tab dm2_0669.tab dm2_0670.tab
# Scanning through 671 files
# Loading database with 14488 rows
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.formatdb
# ssh -x pk rmdir /san/sanvol1/scratch/hg18HgNearBlastp
# *** All done!
# *** Check these tables in hg18:
# *** humanBlastTab mmBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab
# *** and hgBlastTab in these databases:
# *** mm7 rn3 danRer3 ce2 sacCer1 dm2
# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES
cd ~/kent/src/hg/near/hgNear/hgNearData
# any updates necessary?
# ENABLE HGNEAR FOR HG18 IN HGCENTRALTEST
echo "update dbDb set hgNearOk = 1 where name = 'hg18';" \
| hgsql -h genome-testdb hgcentraltest
# END OF HGNEAR STUFF
#############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 2/16/06 Fan)
# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download
wget --timestamping http://bioinformatics.ai.sri.com/ecocyc/dist/pdff-XXXXXX/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
vi genes.tab
vi pathways.tab
hgsql hg18 -e 'create database bioCyc060216'
hgsql bioCyc060216 < ~/src/hg/lib/bioCycGenes.sql
hgsql bioCyc060216 -e 'load data local infile "genes.tab" into table genes'
hgsql bioCyc060216 < ~/src/hg/lib/bioCycPathways.sql
hgsql bioCyc060216 -e 'load data local infile "pathways.tab" into table pathways'
# Create bioCycMapDesc.tab
hgsql bioCyc060216 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab
# Create bioCycPathway.tab
kgBioCyc0 bioCyc060216 hg18 hg17
hgsql hg18 -e 'delete from bioCycPathway'
hgsql hg18 -e 'delete from bioCycMapDesc'
hgsql hg18 < ~/src/hg/lib/bioCycPathway.sql
hgsql hg18 < ~/src/hg/lib/bioCycMapDesc.sql
# Load results into hg18.
hgsql hg18 -e 'LOAD DATA local INFILE "bioCycMapDesc.tab" into table bioCycMapDesc'
hgsql hg18 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway'
#############################################################################
# BLASTZ/CHAIN/NET RN4 (DONE 2/17/06 Fan)
ssh kkstore02
cd /cluster/store11/gs.19/build36
cp -Rp linSpecRep /san/sanvol1/scratch/hg18
cp -Rp nib /san/sanvol1/scratch/hg18
mkdir /cluster/data/hg18/bed/blastz.rn4.2006-02-17
cd /cluster/data/hg18/bed/blastz.rn4.2006-02-17
cat << '_EOF_' > DEF
# human vs. rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg18/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep/notInRat
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat
SEQ2_DIR=/san/sanvol1/scratch/rn4/nib
SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.rn4.2006-02-17
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -chainLinearGap medium \
-bigClusterHub pk -smallClusterHub pk -workhorse pk \
-blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log &
tail -f do.log
rm -f /cluster/data/hg18/bed/blastz.rn4
ln -s blastz.rn4.2006-02-17 /cluster/data/hg18/bed/blastz.rn4
#############################################################################
# BUILD WGRNA TRACK (DONE, 2006-02-22, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2006-02-22
cd wgRna-2006-02-22
# Received the data file, wg_hg18_track.txt, from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2006-02-22.
cp -p wg_hg18_track.txt wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2008-05-29, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2008-05-28
cd wgRna-2008-05-28
# Received the data file, wgtrack_may2008.doc, from Michel Weber's
# email (Michel.Weber at ibcg.biotoul.fr).
# Saved it as a .txt file, changed all blanks into tabs,
# and placed it under /cluster/data/hg18/bed/wgRna-2008-05-28.
cp -p wgtrack_may2008.txt wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# 17-WAY MULTIZ ALIGNMENTS (DONE - 2006-02-22 Fan)
# copy net mafs to cluster-friendly storage for multiz run
ssh kkstore02
ln -s /cluster/data/hg18/bed/blastzMonDom4.2006-02-13 /cluster/data/hg18/bed/blastz.monDom4
cd /cluster/data/hg18/bed/blastz.monDom4
cd /cluster/data/hg18/bed
mkdir -p multiz17way.2006-02-18
ln -s multiz17way.2006-02-18 multiz17way
cd multiz17way
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
# canFam2, fr1, galGal2, panTro1, rn4
mkdir -p /san/sanvol1/scratch/hg18/mafNet
cd /san/sanvol1/scratch/hg18/mafNet
ln -s /cluster/bluearc/hg18/mafNet/{*} .
# copy others
foreach s (bosTau2 canFam2 danRer3 dasNov1 echTel1 fr1 galGal2 loxAfr1 \
mm8 monDom4 oryCun1 panTro1 rn4 tetNig1 xenTro1 rheMac2)
echo $s
cp -Rp /cluster/data/hg18/bed/blastz.$s/mafNet $s
end
# danRer3 directory structure is different. It is under /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun/mafNet /san/sanvol1/scratch/hg18/mafNet/danRer3
# thanks for the tree, Hiram! Taken from mm7 17way...
cd /cluster/data/hg18/bed/multiz17way
cat << '_EOF_' > 17way.nh
(((((((((
(human_hg18:0.006690,chimp_panTro1:0.007571):0.024272,
macaque_rheMac2:0.0592):0.023960,
((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273,
rabbit_oryCun1:0.206767):0.1065):0.023026,
(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
armadillo_dasNov1:0.149862):0.015994,
(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
monodelphis_monDom4:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro1:0.782453):0.156067,
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
zebrafish_danRer3:0.782561):0.156067);
'_EOF_'
/cluster/bin/draw_tree 17way.nh > 17way.ps
/cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
grep hg18 17way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
# edit distances.txt to include featureBits, and chain parameters
# from blastz run.
cat distances.txt
# 0.0143 chimp_panTro1
# 0.0902 macaque_rheMac2
# 0.2563 armadillo_dasNov1
# 0.2651 dog_canFam2
# 0.2677 elephant_loxAfr1
# 0.2766 cow_bosTau2
# 0.3682 rabbit_oryCun1
# 0.4226 tenrec_echTel1
# 0.4677 mouse_mm8
# 0.4724 rat_rn4
# use loose chain params and score from here, down (5000)
# 0.7119 monodelphis_monDom4
# 0.9847 chicken_galGal2
# 1.4357 xenopus_xenTro1
# 1.6577 tetraodon_tetNig1
# 1.6983 fugu_fr1
# 1.7480 zebrafish_danRer3
# the order in the browser display will be by tree topology,
# not by distance, so it will be:
# >> # 0.0143 chimp_panTro1
# >> # 0.0902 macaque_rheMac2
# >> # 0.4677 mouse_mm8
# >> # 0.4724 rat_rn4
# >> # 0.3682 rabbit_oryCun1
# >> # 0.2651 dog_canFam2
# >> # 0.2766 cow_bosTau2
# >> # 0.2563 armadillo_dasNov1
# >> # 0.2677 elephant_loxAfr1
# >> # 0.4226 tenrec_echTel1
# >> # 0.7119 monodelphis_monDom4
# >> # 0.9847 chicken_galGal2
# >> # 1.4357 xenopus_xenTro1
# >> # 1.6577 tetraodon_tetNig1
# >> # 1.6983 fugu_fr1
# >> # 1.7480 zebrafish_danRer3
# make output dir and run dir
ssh pk
cd /cluster/data/hg18/bed/multiz17way.2006-02-18
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
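# (Illustrative, not part of the original run: species.lst should now be a
# single line of bare database names in tree order, roughly
#   hg18 panTro1 rheMac2 rn4 mm8 oryCun1 bosTau2 canFam2 dasNov1 loxAfr1
#   echTel1 monDom4 galGal2 xenTro1 tetNig1 fr1 danRer3
# and tree.nh the same names with parentheses kept but distances and commas
# stripped.)
cat species.lst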
mkdir -p maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = hg18
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree/tree.nh ../species.lst $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == hg18) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz17way.2006-02-18/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
# 49 files
para try
para check
para push
# NOTE: much faster than V10 (40 hrs for hg17 V10, 14.53 hrs for hg17 V11)
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 341776s 5696.26m 94.94h 3.96d 0.011 y
# IO & Wait Time: 122801s 2046.69m 34.11h 1.42d 0.004 y
# Average job time: 9481s 158.02m 2.63h 0.11d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 81334s 1355.57m 22.59h 0.94d
# Submission to last job: 81334s 1355.57m 22.59h 0.94d
# Load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/maf
mkdir -p /gbdb/hg18/multiz17way/maf
ln -s /cluster/data/hg18/bed/multiz17way/maf/*.maf \
/gbdb/hg18/multiz17way/maf
cat > loadMaf.csh << 'EOF'
time hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/maf hg18 multiz17way
cat *.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz17waySummary stdin
'EOF'
# 3213116
#<< happy emacs
# expect lengthy load time for this -- a few hours ?
# csh loadMaf.csh >&! loadMaf.log &
script loadMaf.log
csh loadMaf.csh
exit
###############################################################
# PHASTCONS CONSERVATION (DONE, 2ND TIME, 2006-03-28 Fan)
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
# NOTE: reusing cluster-friendly chrom fasta files created earlier
ssh kkstore02
mkdir /cluster/bluearc/hg18/chrom
cd /cluster/data/hg18
foreach f (`cat chrom.lst`)
echo $f
cp $f/*.fa /cluster/bluearc/hg18/chrom
end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes compared to previous runs -- this
# reduces run-time on the split and slows down the actual phastCons jobs
# enough that they don't crash (they are still very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/hg18/bed/multiz17way.2006-02-18/cons
cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons
cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod .
# edit, change to hg18, monDom4, mm8, and rn4.
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
# unfortunately this exhausts 2G mem limit currently on pk
# next time, run on mini-cluster
set MAFS = /cluster/data/hg18/bed/multiz17way.2006-02-18/maf
set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/hg18/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 49 jobs
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 9254s 154.24m 2.57h 0.11d 0.000 y
# IO & Wait Time: 15027s 250.44m 4.17h 0.17d 0.000 y
# Average job time: 496s 8.26m 0.14h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1916s 31.93m 0.53h 0.02d
# Submission to last job: 1921s 32.02m 0.53h 0.02d
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
# he ok'ed the results -- not necessary for next human run
ssh kolossus
cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss/chr7/chr7.110000001-120000000.ss \
-o phyloFit.tree
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# cd ..
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
# mkdir /cluster/data/hg18/bed/multiz17way/cons/run.cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg18/bed/multiz17way/cons/run.cons/in.list
ssh pk
cd /cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/cons/run.cons
gensub2 in.list single template jobList
para create jobList
# 337 jobs
para try
para check
para push
# NOTE: some jobs crashed because they could not stat some /san/... files, but succeeded when pushed again
# Completed: 337 of 337 jobs
# CPU time in finished jobs: 16000s 266.66m 4.44h 0.19d 0.001 y
# IO & Wait Time: 13307s 221.79m 3.70h 0.15d 0.000 y
# Average job time: 87s 1.45m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 173s 2.88m 0.05h 0.00d
# Submission to last job: 225s 3.75m 0.06h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz17way/cons
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
hgLoadBed -strict hg18 phastConsElements17way mostConserved.bed
# Loaded 2037557 elements
# compare with previous tracks
hgsql hg18 -e "select count(*) from phastConsElements17way"
# 2260575
# hgsql hg18 -e "select count(*) from phastConsElements"
# hg18 does not have phastConsElements table
# 1601903
# Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x
# compare with previous tracks
featureBits hg18 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg18 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x
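# How to read the -enrichment lines: using the first hg18 line above, the
# numbers work out as
#   cover  = both / refGene:cds             = 0.759 / 1.072 ~ 70.8%
#   enrich = cover / phastConsElements17way = 70.83 / 5.510 ~ 12.86x
# i.e. ~71% of CDS bases are in conserved elements, and a CDS base is ~13x
# more likely than an average genomic base to fall in an element.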
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
phastCons17way phastCons17way.wig
# ~ 3 minute load
# Downloads (2006-02-22 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
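# Note on the awk in mafFrags.csh above (a sketch of what it does): it trims
# the long feature name in column 4 down to its first nine characters (the
# RefSeq accession, e.g. NM_004332), inserts a 0 score, and keeps column 5
# (presumably the strand) as the last field, so mafFrags gets a clean
# six-column bed.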
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh > downloads.log
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
##############################################################################
# SET DEFAULT POSITION TO chrX:151,073,054-151,383,976, TO SHOW GENE GABRA3
hgsql -e 'delete from dbDb where name="hg18"' \
-h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
"chrX:151,073,054-151,383,976", 1, 9, "Human", "Homo sapiens", \
"/gbdb/hg18/html/description.html", 1, 1, "NCBI Build 36.1");' \
-h genome-testdb hgcentraltest
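# (Optional, not part of the original steps) sanity-check the inserted row:
hgsql -h genome-testdb hgcentraltest \
    -e 'select name, description, defaultPos, sourceName from dbDb where name="hg18"'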
############################################################################
# HG16/HG17 -> HG18 LIFTOVER CHAINS (DONE 2/24/06 Fan)
# These chains hopefully don't suck.
# Sorry I only used the makeLoChain-align script from the set of scripts
# already created for this task. I wanted more control. I should mention
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg18. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg18 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# Split up hg18
ssh pk
cd /san/sanVol1/scratch/hg18
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg18/?{,?,*hap*}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
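# The .lft files written by faSplit -lift record where each 10kb piece lies
# on its chromosome; they are presumably what the *SplitLift.sh step below
# uses to lift the split-coordinate psl back to chromosome coordinates
# before chaining.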
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg{15,16,17}
# Copy 11.ooc files to each of hg15, hg16, hg17 dirs.
cp -p /cluster/store5/gs.16/build33/11.ooc hg15
cp -p /cluster/store4/gs.17/build34/11.ooc hg16
cp -p /cluster/store5/gs.18/build35/11.ooc hg17
## First, copy over Andy's scripts.
mkdir -p /san/sanVol1/scratch/fan
cp -p /san/sanVol1/scratch/andy/*.sh /san/sanVol1/scratch/fan
cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATING
# HG16
ssh pk
cd /cluster/data/hg16
makeLoChain-align hg16 /scratch/hg/hg16/bothMaskedNibs hg18 \
/san/sanVol1/scratch/hg18/biggerSplits/split
cd bed/
mv blat.hg18.2006-02-24 /san/sanVol1/scratch/hg16
cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg16ToHg18"}' > newspec
para create newspec
para try
para push
# Completed: 2394 of 2394 jobs
# CPU time in finished jobs: 623927s 10398.79m 173.31h 7.22d 0.020 y
# IO & Wait Time: 13255s 220.91m 3.68h 0.15d 0.000 y
# Average job time: 266s 4.44m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3613s 60.22m 1.00h 0.04d
# Submission to last job: 4112s 68.53m 1.14h 0.05d
# HG17
ssh pk
cd /cluster/data/hg17
makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg18 /san/sanVol1/scratch/hg18/biggerSplits/split
cd bed/
mv blat.hg18.2006-02-24/ /san/sanVol1/scratch/hg17
cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg18"}' > newspec
para create newspec
para try
para push
# Completed: 2622 of 2622 jobs
# CPU time in finished jobs: 618557s 10309.28m 171.82h 7.16d 0.020 y
# IO & Wait Time: 13735s 228.92m 3.82h 0.16d 0.000 y
# Average job time: 241s 4.02m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3655s 60.92m 1.02h 0.04d
# Submission to last job: 4228s 70.47m 1.17h 0.05d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg18SplitLift.sh
# change andy to fan, mm7 to hg18, and chrX to chr2, and remove chrUn_random
vi hg18SplitLift.sh
cat << 'EOF' > hg18ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg18Lifts
pushd /scratch/fan/hg18Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg18ChainMergeSplit.sh
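# Note on the script above: pushd leaves the starting directory on the
# directory stack, so `dirs +1` refers back to it -- the merged and split
# chain/ directory is copied back next to the original chainRaw/ before the
# scratch copies are removed.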
# HG16
cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/raw
/san/sanVol1/scratch/fan/hg18SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg16/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 3599s 59.98m 1.00h 0.04d 0.000 y
# IO & Wait Time: 1040s 17.34m 0.29h 0.01d 0.000 y
# Average job time: 95s 1.58m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 303s 5.05m 0.08h 0.00d
# Submission to last job: 303s 5.05m 0.08h 0.00d
# HG17
cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/raw
/san/sanVol1/scratch/fan/hg18SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 3671s 61.19m 1.02h 0.04d 0.000 y
# IO & Wait Time: 1186s 19.76m 0.33h 0.01d 0.000 y
# Average job time: 99s 1.65m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 282s 4.70m 0.08h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg18Lifts
cd /scratch/fan/hg18Lifts
cp -rp /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/chainRaw/ .
mkdir chain
time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
cp -rp chain /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/
mv chain chain.17
# remove it later
rm -rf chain.17
cp -r /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/
rm -rf chain*
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG18=/cluster/data/hg18/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG18 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
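# Notes on netOver.sh above: chainNet takes the chain file, then target sizes
# (the old assembly) and query sizes (hg18), and writes the target-side net;
# the query-side net is discarded to /dev/null.  netChainSubset then extracts
# only the chains actually used by that net, which is the over.chain input
# liftOver expects.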
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg16/chrom.sizes"}' >> spec
find /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' >> spec
para create spec
para push
para time
# Completed: 88 of 88 jobs
# CPU time in finished jobs: 881s 14.68m 0.24h 0.01d 0.000 y
# IO & Wait Time: 284s 4.74m 0.08h 0.00d 0.000 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 33s 0.55m 0.01h 0.00d
# Submission to last job: 73s 1.22m 0.02h 0.00d
# seems much faster than mm7.
########## FINISHING
ssh hgwdev
# HG16
cd /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/over
cat * >> ../hg16ToHg18.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg18.2006-02-24/ /cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blat.hg18.2006-02-24 blat.hg18
ln -s `pwd`/blat.hg18/hg16ToHg18.over.chain liftOver/hg16ToHg18.over.chain
ln -s `pwd`/liftOver/hg16ToHg18.over.chain /gbdb/hg16/liftOver/hg16ToHg18.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver
cp /gbdb/hg16/liftOver/hg16ToHg18.over.chain .
gzip hg16ToHg18.over.chain
hgAddLiftOverChain hg16 hg18 /gbdb/hg16/liftOver/hg16ToHg18.over.chain
# HG17
cd /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/over
cat * >> ../hg17ToHg18.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -r blat.hg18.2006-02-24/ /cluster/data/hg17/bed
cd /cluster/data/hg17/bed
ln -s blat.hg18.2006-02-24 blat.hg18
ln -s `pwd`/blat.hg18/hg17ToHg18.over.chain liftOver/hg17ToHg18.over.chain
ln -s `pwd`/liftOver/hg17ToHg18.over.chain /gbdb/hg17/liftOver/hg17ToHg18.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /gbdb/hg17/liftOver/hg17ToHg18.over.chain .
gzip hg17ToHg18.over.chain
hgAddLiftOverChain hg17 hg18 /gbdb/hg17/liftOver/hg17ToHg18.over.chain
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzHg18.2006-02-16
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits hg18 chainMm8Link
# 994530182 bases of 2881515245 (34.514%) in intersection
#########################################################################
# GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2006-03-03, Fan)
# GENOSCOPE TETRAODON (tetNig1) ECORES (REBUILT, 2006-04-04, Fan)
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/ecoresTetNig1
cd /cluster/data/hg18/bed/ecoresTetNig1
wget --timestamp \
http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_HS_WITH_TN.gff
wget --timestamp \
http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_TN_WITH_HS.gff
# this is in gff format
# remove "Ecotig" from name field
sed -e 's/Ecotig EG/EG/g' EXOFISH_HS_WITH_TN.gff |sed -e 's/CHR//' > ExofishHs36Tnig1.gff
# sed -e 's/Ecotig EG/EG/g' ExofishHs36Tnig1 > ExofishHs36Tnig1.gff
# need to have tabs between fields not a space to load file into table
sed -e 's/ /\t/g' ExofishHs36Tnig1.gff > Hs36Tnig1format.gff
# if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
# correctly into the table.
sed -e 's/ecore/CDS/' Hs36Tnig1format.gff | sed -e 's/ecotig/transcript/' \
| cut -f 1-8,11 > Hg18vstetNig1.gff
# add "chr" in front of the chromsome name in first field (2005-02-08)
perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg18vstetNig1.gff
rm *.bak
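# For example, a line whose first field was "12" (or "X", "Y", "M") now
# begins with "chr12" (or "chrX", ...); only the chromosome field is touched.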
# need to reload table
ssh hgwdev
cd /cluster/data/hg18/bed/ecoresTetNig1
echo 'drop table ecoresTetNig1;' | hgsql hg18
nice ldHgGene hg18 ecoresTetNig1 Hg18vstetNig1.gff
#########################################################################
# BUILD MAF ANNOTATION FOR MULTIZ17WAY (DONE 2006-03-07, Fan)
ssh kkstore01
cd /cluster/data/monDom4
twoBitInfo -nBed monDom4.2bit monDom4.N.bed
cd /cluster/data/rn4
twoBitInfo -nBed rn4.2bit rn4.N.bed
cd /cluster/data/mm8
twoBitInfo -nBed mm8.2bit mm8.N.bed
ssh kolossus
cd /cluster/data/hg18/bed/multiz17way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
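# (The per-species N beds and chrom.sizes collected above give mafAddIRows
#  what it needs to write the "i" annotation rows, so the browser can
#  distinguish truly missing sequence from merely unaligned sequence in
#  each species.)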
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh > jobs.log
# This took 10 hours. Hg17 took 1.5 hrs.
ssh kolossus
# loading here because summary table load crashed on hgwdev
cd /cluster/data/hg18/bed/multiz17way/anno/maf
mkdir -p /gbdb/hg18/multiz17way/anno/maf
ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \
/gbdb/hg18/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \
hg18 multiz17way
date
cat *.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz17waySummary stdin
date
'EOF'
csh loadMaf.csh > loadMaf.log
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
#edit Makefile to correct species names
cat > copy.csh << 'EOF'
set dir = /cluster/bluearc/hg18/multiz17way/frames/maf
mkdir -p $dir
foreach i (../maf/*.maf)
echo $i
cp -p $i $dir
end
'EOF'
csh copy.csh > copy.log
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/frames
time make getGenes > getGenes.log
# 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w
time make getFrames > getFrames.log
# Batch failed after 4 tries on ../mkMafFrames bosTau2 hg18 /san/sanvol1/scratch/hg18/multiz17way/frames/genes/bosTau2.gp.gz /cluster/data/hg18/bed/multiz17way/maf/chr1.maf /san/sanvol1/scratch/hg18/multiz17way/frames/mafFrames/bosTau2/chr1.mafFrames
#make[1]: *** [mafFrames/bosTau2.cluster.done] Error 255
# copy Makefile to Makefile.try2 and remove bosTau2
time make -f Makefile.try2 getFrames > getFrames.try2.log
# copy Makefile to Makefile.try3 with only bosTau2 remaining
time make -f Makefile.try3 getGenes > getGenes.try3.log
time make -f Makefile.try3 getFrames > getFrames.try3.log
time make -f Makefile.try3 getFrames > getFrames.try5.log
time make -f Makefile.try3 getFrames > getFrames.try6.log
# Finally after Mark fixed the bug and recompiled, it worked.
time make -f Makefile.try3 getFrames > getFrames.try7.log
time make loadDb > loadDb.log
#########################################################################
# Build maf annotation for multiz17way (STARTED 2006-02-28, DONE 2006-03-09, Fan)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh kkstore01
cd /cluster/data/monDom4
twoBitInfo -nBed monDom4.2bit monDom4.N.bed
cd /cluster/data/rn4
twoBitInfo -nBed rn4.2bit rn4.N.bed
cd /cluster/data/mm8
twoBitInfo -nBed mm8.2bit mm8.N.bed
ssh kolossus
cd /cluster/data/hg18/bed/multiz17way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh > jobs.log
# This took 10 hours. Hg17 took 1.5 hrs.
ssh hgwdev
# loading here because summary table load crashed on hgwdev
cd /cluster/data/hg18/bed/multiz17way/anno/maf
mkdir -p /gbdb/hg18/multiz17way/anno/maf
ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \
/gbdb/hg18/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \
hg18 multiz17way
date
cat *.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz17waySummary stdin
date
'EOF'
csh loadMaf.csh > loadMaf.log
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql hg18 -e "alter table multiz17waySummary drop index chrom_2"
hgsql hg18 -e "alter table multiz17waySummary drop index chrom_3"
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
# !!! NEXT TIME, COPY ALL maf FILES OVER TO san TO AVOID kkstore02 OVERLOAD.
# edit Makefile to correct species names
cat > copy.csh << 'EOF'
set dir = /cluster/bluearc/hg18/multiz17way/frames/maf
mkdir -p $dir
foreach i (../maf/*.maf)
echo $i
cp -p $i $dir
end
'EOF'
csh copy.csh > copy.log
#for i in ../../maf/*.maf; do echo $i; cp $i /san/sanvol1/scratch/hg18/multiz17wayFrames/maf/$i; done
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/frames
time make getGenes > getGenes.log
# 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w
time make getFrames > getFrames.log
# ~2 hours
time make loadDb > loadDb.log
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/frames
mv mafFrames/ mafFrames.old2
nice tcsh # easy way to get process niced
(cat ../maf/*.maf | time genePredToMafFrames hg18 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz17way.mafFrames.gz)>&frames.log&
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/frames
hgLoadMafFrames hg18 multiz17wayFrames multiz17way.mafFrames.gz >&log&
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 03/11/06 Fan)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg18
zcat /gbdb/mm7/liftOver/mm7ToHg18.over.chain.gz \
| pslMap -chainMapFile -swapMap \
/cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \
| sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg18/nib \
/cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl
# Load the database
hgsql hg18 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg18 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg18 allenBrainAli.psl
mkdir /gbdb/hg18/allenBrain
ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg18/allenBrain/allProbes.fa
hgLoadSeq hg18 /gbdb/hg18/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg18 allenBrainAli -type=psl knownGene knownToAllenBrain
##########################################################################
#### Blat knownGene proteins to determine exons
# (DONE - 2006-03-15 - 2006-03-24 - hiramc)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir blat.hg18KG.2006-03-15
rm blat.hg18KG
ln -s blat.hg18KG.2006-03-15 blat.hg18KG
cd blat.hg18KG
pepPredToFa hg18 knownGenePep known.fa
# The kluster run
ssh pk
cd /cluster/data/hg18/bed/blat.hg18KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
blat -t=dnax -q=prot -out=pslx /scratch/hg/gs.19/build36/bothMaskedNibs/$1.nib \
kgfa/$2.fa $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.19/build36/bothMaskedNibs > human.lst
mkdir kgfa
cd kgfa
# This split should be done on the file server, not over NFS
faSplit sequence ../known.fa 3000 kg
ls -1S *.fa > ../kg.lst
cd ..
cat << '_EOF_' > template
#LOOP
blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 human.lst kg.lst template jobList
mkdir psl
cd psl
sed -e "s/.nib//" ../human.lst | xargs mkdir
cd ..
para create jobList
para try ... check ... push ... etc
# Completed: 142100 of 142100 jobs
# CPU time in finished jobs: 7520598s 125343.30m 2089.06h 87.04d 0.238 y
# IO & Wait Time: 415523s 6925.38m 115.42h 4.81d 0.013 y
# Average job time: 56s 0.93m 0.02h 0.00d
# Longest finished job: 5737s 95.62m 1.59h 0.07d
# Submission to last job: 72538s 1208.97m 20.15h 0.84d
ssh kkstore02
cd /cluster/data/hg18/bed/blat.hg18KG.2006-03-15
pslSort dirs raw.psl /tmp psl/*
# -rw-rw-r-- 1 568238823 Mar 20 13:30 raw.psl
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
# -rw-rw-r-- 1 43446007 Mar 24 11:13 cooked.psl
pslUniq cooked.psl hg18KG.psl
# -rw-rw-r-- 1 41321225 Mar 24 11:14 hg18KG.psl
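# Filter notes: pslReps -minCover=0.9 -minAli=0.9 keeps alignments that cover
# at least 90% of the protein query with at least a 90% alignment ratio;
# pslUniq then keeps a single alignment per query, which is why hg18KG.psl
# and kgName.lst both end up with 36727 entries (see the counts below).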
cut -f 10 hg18KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg18KG.fa
faSize hg18KG.fa
# 16419953 bases (12961273 N's 3458680 real 3458680 upper 0 lower)
# in 36727 sequences in 1 files
faSize known.fa
# 16430067 bases (12969298 N's 3460769 real 3460769 upper 0 lower)
# in 36798 sequences in 1 files
# You may need to build this pslxToFa - it is not in the standard build
pslxToFa hg18KG.psl hg18KG_ex.fa -liftTarget=genome.lft \
-liftQuery=protein.lft
# -rw-rw-r-- 1 11294262 Mar 24 11:31 protein.lft
# -rw-rw-r-- 1 21428637 Mar 24 11:31 hg18KG_ex.fa
# -rw-rw-r-- 1 14324928 Mar 24 11:31 genome.lft
wc -l *.psl *.lft *.fa kgName.lst
# 39908 cooked.psl
# 36727 hg18KG.psl
# 1521400 raw.psl
# 303516 genome.lft
# 303516 protein.lft
# 383037 hg18KG.fa
# 607032 hg18KG_ex.fa
# 383348 known.fa
# 36727 kgName.lst
# 3615211 total
# back on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/blat.hg18KG
kgName hg18 hg18KG.psl blastKGRef04
# After about an hour, it exited with this message:
# sqlFreeConnection called on cache (hg18) that doesn't contain
# the given connection
# This may be a lurking error in this program, but the
# resulting file seems to have the correct number of lines:
hgsql hg18 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef04" | hgsql hg18
echo "load data local infile 'blastKGRef04' into table blastKGRef04" | hgsql hg18
wc -l kgName.lst blastKGRef04 hg18KG.psl
# 36727 kgName.lst
# 36727 blastKGRef04
# 36727 hg18KG.psl
# 110181 total
hgPepPred hg18 generic blastKGPep04 hg18KG.fa
# end blat proteins
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 2006-03-14 galt following Jim's hg17 example)
# Make directory on san for cluster job and copy in sequence
ssh pk
mkdir /san/sanvol1/scratch/hg18/nibbPics
cd /san/sanvol1/scratch/hg18/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > mrna.lst
# Create parasol gensub file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 49 of 49 jobs
#CPU time in finished jobs: 12585s 209.74m 3.50h 0.15d 0.000 y
#IO & Wait Time: 411s 6.86m 0.11h 0.00d 0.000 y
#Average job time: 265s 4.42m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1145s 19.08m 0.32h 0.01d
#Submission to last job: 1195s 19.92m 0.33h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's#/san/sanvol1/scratch/hg18/nib/chr#chr#' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg18/nibbPics/nibbImageProbes.psl .
# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg18/nibbImageProbes.fa
hgLoadSeq hg18 /gbdb/hg18/nibbImageProbes.fa
hgLoadPsl hg18 nibbImageProbes.psl
##########################################################################
# UPDATED hg18.knownToVisiGene (2006-03-15 galt)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
##########################################################################
# GENERATE SUMMARY STATISTICS (DONE, Fan 3/18/06)
ssh hgwdev
cd /cluster/data/hg18
mkdir stat
cd stat
stats.pl ~/hg18 >hg18.pl.out
hgCalStat hg18.pl.out hg18 hg18.out
cp hg18.out hg18.out.sorted
# Edit hg18.out.sorted to order by chromosomes and
# replace the "?" in the Y chrom line with 6265435 and align its position.
vi hg18.out.sorted
# Add the hg18 stats to goldenPath/stats.html
cd ~/browser/goldenPath
# insert hg18.out.sorted into stats.html and add necessary
# surrounding HTML lines for the hg18 section.
vi stats.html
cvs update stats.html
cvs commit stats.html
# Change description of hg18, per suggestion by Kim at NCBI (3/20/06, Fan).
ssh hgwdev
echo "update dbDb set description='Mar. 2006' where name = 'hg18';" \
| hgsql -h genome-testdb hgcentraltest
############################################################################
# hg18 -> hg17 LIFTOVER CHAINS (DONE 3/20/06 Fan)
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg17. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg17 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# The following paragraph was already done during hg15 to hg17 liftover built
# Split up hg17
ssh pk
cd /san/sanVol1/scratch/hg17
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg18
# Copy 11.ooc files to hg18 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg18
## First, copy over scripts. (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATING
# HG18
ssh pk
cd /cluster/data/hg18
makeLoChain-align hg18 /scratch/hg/hg18/nib hg17 /san/sanVol1/scratch/hg17/biggerSplits/split
cd bed
mv blat.hg17.2006-03-20 /san/sanVol1/scratch/hg18
cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg18ToHg17"}' > newspec
para create newspec
para try
para push
# Completed: 2646 of 2646 jobs
# CPU time in finished jobs: 633021s 10550.35m 175.84h 7.33d 0.020 y
# IO & Wait Time: 14063s 234.39m 3.91h 0.16d 0.000 y
# Average job time: 245s 4.08m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3645s 60.75m 1.01h 0.04d
# Submission to last job: 6153s 102.55m 1.71h 0.07d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg17SplitLift.sh
# change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random
vi hg17SplitLift.sh
cat << 'EOF' > hg17ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg17Lifts
pushd /scratch/fan/hg17Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg17ChainMergeSplit.sh
# HG18
cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/raw
/san/sanVol1/scratch/fan/hg17SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg18/nib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 3713s 61.88m 1.03h 0.04d 0.000 y
# IO & Wait Time: 1284s 21.41m 0.36h 0.01d 0.000 y
# Average job time: 109s 1.81m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 310s 5.17m 0.09h 0.00d
# Submission to last job: 310s 5.17m 0.09h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg17Lifts
cd /scratch/fan/hg17Lifts
cp -r /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/
rm -rf chain
rm -rf chainRaw
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg18/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 431s 7.18m 0.12h 0.00d 0.000 y
# IO & Wait Time: 151s 2.52m 0.04h 0.00d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30s 0.50m 0.01h 0.00d
# Submission to last job: 43s 0.72m 0.01h 0.00d
########## FINISHING
ssh hgwdev
# HG18
cd /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/over
cat * >> ../hg18ToHg17.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg17.2006-03-20/ /cluster/data/hg18/bed
cd /cluster/data/hg18/bed
ln -s blat.hg17.2006-03-20 blat.hg17
ln -s `pwd`/blat.hg17/hg18ToHg17.over.chain liftOver/hg18ToHg17.over.chain
ln -s `pwd`/liftOver/hg18ToHg17.over.chain /gbdb/hg18/liftOver/hg18ToHg17.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cp /gbdb/hg18/liftOver/hg18ToHg17.over.chain .
gzip hg18ToHg17.over.chain
hgAddLiftOverChain hg18 hg17 /gbdb/hg18/liftOver/hg18ToHg17.over.chain
##########################################################################
# NSCAN track - ( markd)
# hg17 had both NSCAN and NSCAN-EST tracks, in a composite track.
# currently have only NSCAN for hg18
cd /cluster/data/hg18/bed/nscan/
# obtained NSCAN predictions from Michael Brent's group
# at WUSTL
wget -nv http://genes.cse.wustl.edu/jeltje/hg18/hg18.nscan.gtf
wget -r -np -nv http://genes.cse.wustl.edu/jeltje/hg18/chr_ptx/
mv genes.cse.wustl.edu/jeltje/hg18/chr_ptx .
rm -rf genes.cse.wustl.edu chr_ptx/index.html*
gzip -9 hg18.nscan.gtf chr_ptx/*.fa
chmod a-w hg18.nscan.gtf.gz chr_ptx/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt hg18 nscanGene hg18.nscan.gtf.gz
# add .a suffix to match transcript id
hgPepPred -suffix=.a hg18 generic nscanPep chr_ptx/*.fa.gz
rm -f *.tab
# update trackDb; need a hg18-specific page to describe informants
human/hg18/nscanGene.html
human/hg18/trackDb.ra
# QA NOTE [ASZ 9-11-2006]: mytouch nscanPep 200603271900.00
##########################################################################
# UPDATED hg18.knownToVisiGene (2006-04-05 galt)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
##############################################################################
# BLASTZ CHIMP PanTro1 second time (STARTED - 2006-01-05, DONE 2006-01-13 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzPanTro1.2006-01-05
cd /cluster/data/hg18/bed
rm blastz.panTro1
ln -s blastzPanTro1.2006-01-05 blastz.panTro1
cd blastzPanTro1.2006-01-05
cat << '_EOF_' > DEF
# human vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/hg18.2bit
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
# QUERY: Chimp PanTro1 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/panTro1/panTro1.2bit
SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPanTro1.2006-01-05
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Thu Jan 5 11:26:45 PST 2006
# Encountered an error at the net step:
# startStep: 0, at step 5 net to stopStep 6
# chmod a+x /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh
# ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh
# cd /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain
# chainPreNet hg18.panTro1.all.chain.gz /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout
# chainNet stdin -minSpace=1 /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout /dev/null
# netSyntenic stdin noClass.net
# Got 49 chroms in /scratch/hg/hg18/chrom.sizes, 52 in /scratch/hg/panTro1/chrom.sizes
# Finishing nets
# writing stdout
# writing /dev/null
# memory usage 363347968, utime 1042 s/100, stime 56
# netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout
# chainSort stdin stdout
# gzip -c
# Out of memory needMem - request size 6 bytes
# gzip: stdout: Broken pipe
# Command failed:
# ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh
# 1/9/06, Retry again
ssh pk
cd /cluster/data/hg18/bed
cd blastzPanTro1.2006-01-05
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Same error.
# Try with kolossus
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=net \
-stop=load \
`pwd`/DEF > load3.out 2>&1 &
# Still have problems, which seem to be related to the
# wrong $MACHTYPE and $PATH on kolossus. Updated my .cshrc
# Did the following manually on kolossus:
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
/cluster/bin/x86_64/netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout | chainSort stdin stdout | gzip -c > hg18.panTro1.over.chain.gz
mkdir -p /cluster/data/hg18/bed/liftOver
cp -p hg18.panTro1.over.chain.gz /cluster/data/hg18/bed/liftOver/hg18ToPanTro1.over.chain.gz
# Make axtNet for download: one .axt per hg18 seq.
netSplit noClass.net net
cd ..
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/scratch/hg/hg18/hg18.2bit /san/sanvol1/scratch/panTro1/panTro1.2bit stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.hg18.panTro1.net.axt.gz
end
# Make mafNet for multiz: one .maf per hg18 seq.
mkdir mafNet
foreach f (axtNet/*.hg18.panTro1.net.axt.gz)
axtToMaf -tPrefix=hg18. -qPrefix=panTro1. $f \
/scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
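# (csh modifiers used above: :t is the basename, each :r strips one extension;
#  e.g. chr1.hg18.panTro1.net.axt.gz with :t:r:r:r:r:r reduces to chr1.)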
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ above by hand.
ssh pk
cd /cluster/data/hg18/bed
cd blastzPanTro1.2006-01-05
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=load \
-stop=load \
`pwd`/DEF > load4.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Measurements:
# Go to kolossus to run featureBits to avoid out of memory problem.
ssh kolossus
bash
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg18Link
# 2641472125 bases of 2733948177 (96.617%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg18 chainPanTro1Link
# 2681146909 bases of 2881515245 (93.046%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg17Link
# 0 bases of 2733948177 (0.000%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection
#########################################################################
# BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22
cd /cluster/data/hg18/bed
rm blastz.rn3
ln -s blastzRn3.2005-12-22 blastz.rn3
cd blastzRn3.2005-12-22
cat << '_EOF_' > DEF
# human vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > to-load.out 2>&1 &
# start processing again on 12/31/05.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-stop=load \
`pwd`/DEF > swap.out 2>&1 &
# Either UCSC RR and hgwdev systems or network went down around 11 AM 12/31/05.
# After holidays, start again on 1/3/06 and again on 1/5/06.
ssh pk
cd /cluster/data/hg18/bed
cd blastzRn3.2005-12-22
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-continue=net \
-stop=load \
`pwd`/DEF > swap6.out 2>&1 &
# DONE! Jan 5 13:39
# Measurements:
nice featureBits rn3 chainHg18Link
# 962630574 bases of 2571104688 (37.440%) in intersection
nice featureBits hg18 chainRn3Link
# 964251210 bases of 2881515245 (33.463%) in intersection
#########################################################################
# BLASTZ ARMADILLO DasNov1 (STARTED - 2006-01-06 - 2006-01-09 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzDasNov1.2006-01-06
cd /cluster/data/hg18/bed
rm blastz.dasNov1
ln -s blastzDasNov1.2006-01-06 blastz.dasNov1
cd blastzDasNov1.2006-01-06
cat << '_EOF_' > DEF
# human vs armadillo
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for armadillo (per Webb email to Brian Raney)
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Armadillo DasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzDasNov1.2006-01-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Fri Jan 6 06:20:12 PST 2006
# 1:20 PM, 1/7/06
# The blastz cluster run seemed to finish OK, but make jobList somehow
# did not end, even after creating the run.time file manually. Killed it manually.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=cat \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Done, Jan 8 21:40.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 9 06:11
# Reciprocal best net mafs for multiz (kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.dasNov1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 dasNov1 >&! rbest.log &
# Load nets (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.dasNov1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netDasNov1 stdin
netFilter -minGap=10 hg18.dasNov1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestDasNov1 stdin
#########################################################################
# BLASTZ DOG CanFam2 second time (DONE - 2005-12-28 - 2005-12-29 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.canFam2
ln -s blastzCanFam2.2005-12-28 blastz.canFam2
cd blastzCanFam2.2005-12-28
cat << '_EOF_' > DEF
# human vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for dog (per Webb email to Brian Raney)
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 21:33
# Two jobs stuck in the same node. Did manual para stop and para push.
# Both finished within a few minutes.
# Done! On Thu Dec 29 05:27:31 PST 2005.
# system seems to hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving)
# manually killed the jobs.
# now use pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=chainMerge \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Done! Thu Dec 29 09:10:02 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the load step,
# mySQL error 2013: Lost connection to MySQL server during query,
# probably due to sys admin working on network connections,
# continue at the load step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -continue=load -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Dec 29 13:21
# Measurements:
ssh hgwdev
nice featureBits canFam2 chainHg18Link
# 1477551526 bases of 2384996543 (61.952%) in intersection
nice featureBits hg18 chainCanFam2Link
# 1524764349 bases of 2881515245 (52.915%) in intersection
nice featureBits canFam2 chainHg17Link
# 1487483112 bases of 2384996543 (62.368%) in intersection
nice featureBits hg17 chainCanFam2Link
# 1530197469 bases of 2866216770 (53.387%) in intersection
#########################################################################
# BLASTZ ELEPHANT LoxAfr1 second time (STARTED - 2006-01-03 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03
cd /cluster/data/hg18/bed
rm blastz.loxAfr1
ln -s blastzLoxAfr1.2006-01-03 blastz.loxAfr1
cd blastzLoxAfr1.2006-01-03
cat << '_EOF_' > DEF
# human vs elephant
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant LoxAfr1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
SEQ2_LIMIT=300
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# failed at step 2 because the kki cluster was not started.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-smallClusterHub=pk \
-continue=cat \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load3.out 2>&1 &
# Same broken pipe error.
# netChainSubset -verbose=0 noClass.net hg18.loxAfr1.all.chain.gz stdout
# chainSort stdin stdout
# gzip -c
# Out of memory needMem - request size 28 bytes
# gzip: stdout: Broken pipe
# Command failed:
# ssh -x kolossus nice /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain/netChains.csh
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=net \
-stop=load \
`pwd`/DEF > load4.out 2>&1 &
# Finally, a success!
tail load4.out
#...
# cd /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain
#netClass -verbose=0 -noAr noClass.net hg18 loxAfr1 hg18.loxAfr1.net
#netFilter -minGap=10 hg18.loxAfr1.net
#hgLoadNet -verbose=0 hg18 netLoxAfr1 stdin
#startStep: 5, at step 7 download to stopStep 6
# *** All done!
# *** Add {chain,net}LoxAfr1 tracks to trackDb.ra if necessary.
# The swap-load was not successful, after several tries.
# The last one seems to have been due to an out-of-memory problem.
# Per Hiram, we no longer do swap for 2X genomes, unless specifically requested.
# Mark made an inquiry, but said he can get by with hg18->loxAfr1 nets.
# reciprocal best net mafs for multiz (2007-03-09 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.loxAfr1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 loxAfr1 >&! rbest.log &
# load net and reciprocal best net for comparison
# not sure why these tables and cleanup aren't done -- ask Fan
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.loxAfr1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netLoxAfr1 stdin
netFilter -minGap=10 hg18.loxAfr1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestLoxAfr1 stdin
#########################################################################
# BLASTZ COW BosTau2 second time (STARTED - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.bosTau2
ln -s blastzBosTau2.2006-01-07 blastz.bosTau2
cd blastzBosTau2.2006-01-07
cat << '_EOF_' > DEF
# human vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow BosTau2 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes
SEQ2_CHUNK=3200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
-workhorse=pk \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 07:57:22 PST 2006
# blastz run (and load) done Jan 8 00:13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# took a long time to finish.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 8 21:10
# Measurements:
ssh hgwdev
nice featureBits bosTau2 chainHg18Link
# 1357027317 bases of 2812203870 (48.255%) in intersection
nice featureBits hg18 chainBosTau2Link
# 1357291762 bases of 2881515245 (47.103%) in intersection
nice featureBits bosTau2 chainHg17Link
# 0 bases of 2812203870 (0.000%) in intersection
# nice featureBits hg17 chainBosTau2Link
# 1350076765 bases of 2866216770 (47.103%) in intersection
#########################################################################
# BLASTZ TENREC EchTel1 second time (STARTED - 2006-01-09 DONE 2006-01-12 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzEchTel1.2006-01-09
cd /cluster/data/hg18/bed
rm blastz.echTel1
ln -s blastzEchTel1.2006-01-09 blastz.echTel1
cd blastzEchTel1.2006-01-09
cat << '_EOF_' > DEF
# human vs tenrec
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Tenrec EchTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzEchTel1.2006-01-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Mon Jan 9 08:09:03 PST 2006
# Found over a thousand jobs failed, all with the following 7 hosts.
fgrep host j1.err | sort -u
# host: kkr10u06.kilokluster.ucsc.edu
# host: kkr10u58.kilokluster.ucsc.edu
# host: kkr10u62.kilokluster.ucsc.edu
# host: kkr11u34.kilokluster.ucsc.edu
# host: kkr11u39.kilokluster.ucsc.edu
# host: kkr12u18.kilokluster.ucsc.edu
# host: kkr12u29.kilokluster.ucsc.edu
# manually created /scratch/tmp on above machines (except one).
# 2 jobs still running for more than 5 hours each.
para stop
para recover jobList newJobList
# newJobList contains only 2 jobs. Checked the .psl files under psl, confirming only two files were missing.
para create newJobList
para push
# These 2 jobs finished within a couple of minutes!
para time >run.time
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net \
-swap \
-stop=load \
`pwd`/DEF > swap-load3.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! On Jan 12 09:18
# reciprocal best net mafs for multiz (2007-03-09 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.echTel1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 echTel1 >&! rbest.log &
# reloading chains which disappeared (2007-04-17 kate)
cd /cluster/data/hg18/bed/blastz.echTel1/axtChain
# edit loadUp.csh --> create loadUp2.csh and loadUp3.csh
# run loadUp2.csh (does chainSplit) on kkstore02
# run loadUp3.csh (does hgLoadChain) on hgwdev
#########################################################################
# BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.galGal2
ln -s blastzGalGal2.2005-12-28 blastz.galGal2
cd blastzGalGal2.2005-12-28
cat << '_EOF_' > DEF
# human vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken GalGal2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 10:35
# Two jobs were stuck on the same node. Did a manual para stop and para push.
# Both finished within a few minutes.
# Done! On Wed Dec 28 15:32:45 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the net step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
# The gzip job on kolossus did not seem to be making any progress.
# Killed it manually and tried again.
# Still no progress, so killed it again. Now use pk instead of kolossus.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Wed Dec 28 20:39:44 PST 2005
# Measurements:
ssh hgwdev
nice featureBits galGal2 chainHg18Link
# 91564024 bases of 1054197620 (8.686%) in intersection
nice featureBits hg18 chainGalGal2Link
# 102417858 bases of 2881515245 (3.554%) in intersection
nice featureBits galGal2 chainHg17Link
# 93277286 bases of 1054197620 (8.848%) in intersection
nice featureBits hg17 chainGalGal2Link
# 103882699 bases of 2866216770 (3.624%) in intersection
# BLASTZ FROG XenTro1 second time (DONE - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06
cd /cluster/data/hg18/bed
rm blastz.xenTro1
ln -s blastzXenTro1.2006-01-06 blastz.xenTro1
cd blastzXenTro1.2006-01-06
cat << '_EOF_' > DEF
# human vs frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Frog XenTro1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Fri Jan 6 20:19:30 PST 2006
# Blastz run done. Jan 7 02:07 load.out
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# got the following error:
startStep: 4, at step 5 net to stopStep 6
netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]).
# Try it with pk instead of kolossus:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
# It worked, swap-load done. Jan 7 06:05
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 7 06:18
# Measurements:
ssh hgwdev
nice featureBits xenTro1 chainHg18Link
# 61197900 bases of 1381238994 (4.431%) in intersection
nice featureBits hg18 chainXenTro1Link
# 67810866 bases of 2881515245 (2.353%) in intersection
nice featureBits xenTro1 chainHg17Link
# 81777842 bases of 1381238994 (5.921%) in intersection
nice featureBits hg17 chainXenTro1Link
# 85701475 bases of 2866216770 (2.990%) in intersection
# BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.tetNig1
ln -s blastzTetNig1.2006-01-07 blastz.tetNig1
cd blastzTetNig1.2006-01-07
cat << '_EOF_' > DEF
# human vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 05:40:51 PST 2006
# Encountered an error:
startStep: 0, at step 5 net to stopStep 6
netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]).
# Try it with pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Load done. Sat Jan 7 07:34:56 PST 2006
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Sat Jan 7 08:02:14 PST 2006
# The download and swap-download took less than 10 seconds each. ???
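# (Optional sanity check, run on hgwdev: confirm the download step actually
# populated the goldenPath directory. The vsTetNig1 directory name assumes
# the usual download-directory convention; adjust if this run used another.)
# ls -l /usr/local/apache/htdocs/goldenPath/hg18/vsTetNig1/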
# Measurements:
ssh hgwdev
nice featureBits tetNig1 chainHg18Link
# 50026847 bases of 342403326 (14.611%) in intersection
nice featureBits hg18 chainTetNig1Link
# 57654754 bases of 2881515245 (2.001%) in intersection
nice featureBits tetNig1 chainHg17Link
# 34379509 bases of 342403326 (10.041%) in intersection
nice featureBits hg17 chainTetNig1Link
# 35910128 bases of 2866216770 (1.253%) in intersection
#########################################################################
# BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-04 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20
cd /cluster/data/hg18/bed
ln -s blastzFr1.2005-12-20 blastz.fr1
cd blastzFr1.2005-12-20
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20
'_EOF_'
# << happy emacs
# establish a screen to control this job
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=download \
`pwd`/DEF > download.clean.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -swap \
`pwd`/DEF > swap.out 2>&1 &
# Finish the remaining step, 1/4/06.
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap.out 2>&1 &
# The first try found that the DEF had somehow been altered for rn3.
# Re-generated DEF and tried again.
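# (Quick hedged check that the regenerated DEF points at this fr1 run before
# relaunching.)
egrep '^(BASE|SEQ2_DIR)=' DEF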
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap2.out 2>&1 &
# Done. Jan 4 09:48.
# measurements
nice featureBits hg18 chainFr1Link
# 51795958 bases of 2881515245 (1.798%) in intersection
nice featureBits hg17 chainFr1Link
#50831650 bases of 2866216770 (1.773%) in intersection
nice featureBits hg18 netFr1
# 691148929 bases of 2881515245 (23.986%) in intersection
nice featureBits hg17 netFr1
# 714234935 bases of 2866216770 (24.919%) in intersection
nice featureBits fr1 chainHg18Link
# 43267869 bases of 315518167 (13.713%) in intersection
nice featureBits fr1 chainHg17Link
# 0 bases of 315518167 (0.000%) in intersection
nice featureBits fr1 netHg18
# 140843080 bases of 315518167 (44.639%) in intersection
nice featureBits fr1 netHg17
# 0 bases of 315518167 (0.000%) in intersection
##################################################
# Blastz runs between hg18 and other organisms are documented in
# makeMm8.doc, makeRn4.doc, makeRheMac2.doc, and makeDanRer3.doc.
# PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-04-06 Fan)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/pp
foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg18/bed/multiz17way/phastConsDownloads/$chr.gz
end
date
'EOF'
csh downloads.csh >&! downloads.log &
# ~20 minutes
# << happy emacs
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/phastConsDownloads
md5sum *.gz > md5sum.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way/README.txt $dir
# edit this file to reflect the latest releases used.
vi $dir/README.txt
##########################################################################
# RE-BUILT GO DATABASE (DONE 4/12/06, Fan)
# GO changed the content of gene_association.goa_uniprot.gz.
# The original file we used no longer has human, mouse, etc. in it;
# those are now placed in separate files.
# Per GO's suggestion, we now get the file from the submission sub-directory.
# This seems to cover more than concatenating the individual goa... files.
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20060330
cd /cluster/store1/geneOntology/20060330
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200603-assocdb-data.gz
hgsql mysql <<end
create database go060330;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go060330 <j.tmp
rm j.tmp
wget --timestamping \
"ftp://ftp.geneontology.org/pub/go/gene-associations/submission/gene_association.goa_uniprot.gz"
# Updated hgGoAssociation.c so that it does not skip any lines at the beginning.
zcat gene_association.goa_uniprot.gz|\
/cluster/home/fanhsu/bin/i386/hgGoAssociation go060330 goaPart stdin
# Ask sys-admin to switch the database pointer go to point to go060330.
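# (Optional hedged sanity check before the pointer is switched: make sure the
# new database actually has rows in the table loaded above.)
echo 'select count(*) from goaPart' | hgsql go060330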
##########################################################################
# GENEID GENE PREDICTIONS (DONE - 2006-04-21 FIXED: 2006-05-09 - Hiram)
# RELOADED PEPTIDE TABLE, GENEIDPEP (DONE, 2006-07-11, hartera)
ssh hgwdev
mkdir /cluster/data/hg18/bed/geneid
cd /cluster/data/hg18/bed/geneid
for C in `awk '{print $1;}' ../../chrom.sizes`
do
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/$C.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/$C.prot
done
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
for F in chr*.prot
do
perl -wpe 's/^(>chr\S+)/$1.1/' $F
done >> geneid.fa
# one of the files in this delivery, chr1.prot, did *not* have a
# terminal <CR> character and it caused the next protein in the
# next file processed, chr10.prot, to be a continuation of the
# last protein in chr1.prot. To check for this:
grep ">" geneid.fa | grep -v "^>"
# shows a line:
# AVSET>chr10_1.1
# This turns out to have been the result of a truncated file.
# Fetch that file again:
mv chr1.prot chr1.prot.orig
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/chr1.prot
# That's better:
wc -l chr1.prot chr1.prot.orig
# 24494 chr1.prot
# 4524 chr1.prot.orig
rm chr1.prot.orig
# remove the bad geneid.fa, then run the above loop again to regenerate it:
rm geneid.fa
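# For the record, the regeneration is the same loop as above:
for F in chr*.prot
do
    perl -wpe 's/^(>chr\S+)/$1.1/' $F
done >> geneid.fa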
ldHgGene -gtf -genePredExt hg18 geneid *.gtf
# Read 33410 transcripts in 275347 lines in 49 files
# 33410 groups 49 seqs 1 sources 3 feature types
# 33410 gene predictions
hgPepPred hg18 generic geneidPep geneid.fa
# verify same names in both tables:
awk '{print $1}' geneidPep.tab | sort > pep.names
awk '{print $1}' genePred.tab | sort > id.names
wc -l pep.names id.names
# 33410 pep.names
# 33410 id.names
comm -12 pep.names id.names | wc -l
# 33410
# QA NOTE (ASZ 5-11-2006) I dropped the geneidPep table and the reference
# to it from the trackDb.ra file. This functionality is now done on the
# fly and this table is no longer needed.
# Added back the geneidPep table as requested by a user
# (hartera, 2006-07-11)
ssh hgwdev
cd /cluster/data/hg18/bed/geneid
hgPepPred hg18 generic geneidPep geneid.fa
# The trackDb.ra file in kent/src/makeDb seems to have a reference
# to the geneidPep table already.
##########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/20/06 angie)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20
cd /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20
cat << '_EOF_' > DEF
# human vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/hg18/bed/blastz.xenTro2.2006-04-20
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/hg18XenTro2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-20 /cluster/data/hg18/bed/blastz.xenTro2
###########################################################################
# BLASTZ CHAIN SWAP FOR ZEBRAFISH (danRer4) (DONE, 2006-04-25, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# See also makeDanRer4.doc
# alignments are in: /cluster/data/hg18/bed/blastz.danRer4.swap
# Blastz parameters used were:
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# There are no lineage-specific repeats defined for this species pair so
# all repeats were used as lineage-specific.
ssh pk
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 15 minutes.
# check with featureBits and compare to danRer3 chains:
featureBits hg18 chainDanRer4Link
# 57415379 bases of 2881515245 (1.993%) in intersection
featureBits hg18 chainDanRer3Link
# 64801985 bases of 2881515245 (2.249%) in intersection
featureBits -chrom=chr1 hg18 refGene:cds chainDanRer4Link -enrichment
# refGene:cds 1.389%, chainDanRer4Link 2.337%, both 0.937%, cover 67.47%,
# enrich 28.87x
featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 1.389%, chainDanRer3Link 2.601%, both 0.931%, cover 67.01%,
# enrich 25.76x
featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.395%, chainDanRer2Link 2.742%, both 0.911%, cover 65.31%,
# enrich 23.82x
# similar coverage and enrichment for danRer4 and danRer3 chains
# which is good.
featureBits -chrom=chr1 hg18 refGene:cds netDanRer4 -enrichment
# refGene:cds 1.389%, netDanRer4 31.001%, both 1.096%, cover 78.91%,
# enrich 2.55x
featureBits -chrom=chr1 hg18 refGene:cds netDanRer3 -enrichment
# refGene:cds 1.389%, netDanRer3 29.929%, both 1.080%, cover 77.72%,
# enrich 2.60x
# Similar coverage and enrichment for danRer4 net on hg18 as for danRer3.
# LOAD FIRSTEF TRACK (DONE 2006-04-25 Fan)
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/firstEF
cd /cluster/data/hg18/bed/firstEF
# receive the file firstEFMar05New.bed.gz from email (ramana.davuluri at osumc.edu) into this subdirectory
cat << '_EOF_' > sedScript
s/chr23/chrX/g
s/chr24/chrY/g
/^>/d
/^$/d
/^No/d
'_EOF_'
# << this line keeps emacs coloring happy
bash
zcat firstEFMar05New.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed
exit
hgLoadBed hg18 firstEF firstEF.bed
rm firstEF.bed bed.tab
#done firstEF
###########################################################################
# ALTGRAPHX TRACK (sugnet) Wed Apr 26 13:46:46 PDT 2006
cd /cluster/store1/sugnet/altSplice/
mkdir hg18-2006.04.13
cd hg18-2006.04.13
mkdir rnaCluster
cd rnaCluster
# Don't use RAGE libraries for clone bounds.
~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
# Make spec file to run.
foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but the database can't be reached
# from the cluster accounts, so ran it from here on hgwdev.
chmod 755 clusterRna.spec
mkdir chrom
./clusterRna.spec >& clusterRna.log
cd ..
# Make a script to set up the parasol job file for raw altGraphX files on human
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib"
done
'_EOF_'
# << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
cat agxs/*.agx > hg18.agx
mkdir hg18
mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18
cd ..
mkdir mm7
cd mm7
# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Don't use RAGE libraries for clone bounds.
~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm7 rage.libs
foreach c (`echo 'select chrom from chromInfo' | hgsql mm7 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm7.rage.libs mm7 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried to run on kki, but the database can no longer be accessed from the minicluster.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &
cd ..
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql mm7 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm7 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm7.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm7/nib/$chrom.nib"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
mkdir agxs
./toRun.sh >& toRun.log &
cat agxs/*.agx > mm7.agx
cd ..
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed
liftOver hg17.exoniphy.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.exoniphy.bed hg17.exoniphy.unmapped.bed
mkdir orthoSplice
cd orthoSplice
ln -s ../orthoSpliceExoniphy/hg18.exoniphy.bed .
echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab
cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.all.chain.gz .
chainSplit chains hg18.mm7.all.chain
cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.net.gz .
netSplit hg18.mm7.net.gz nets
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../mm7/mm7.agx -db=hg18 -orthoDb=mm7 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm7.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << this line keeps emacs coloring happy
# clean up disk space we're not using
rm hg18.mm7.all.chain hg18.mm7.net.gz nets/* chains/*
chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec
ssh kki
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice
para create orthoSplice.para.spec
para push
cat agx/*.agx > hg18.mm7.t3.exoniphy.agx
cp ~/latestJk/kent/src/hg/lib/altGraphX.sql .
hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX hg18.mm7.t3.exoniphy.agx
# end AltGraphX track.
####################################################################
# EXONWALK TRACK (sugnet) Wed Apr 26 13:51:14 PDT 2006
# first make altGraphX track (see above)
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice
mkdir exonWalk
mkdir beds
cd exonWalk
mkdir beds
foreach file (`ls ../agx/*.agx`)
set base=`basename $file .agx`
echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
splitFile ../../hg18.mm7.cons.t3.exoniphy.bed 500 exonWalk.
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa "
sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
makeFa.sh beds/*
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
chmod 755 makeGenePred.sh
makeGenePred.sh beds/*
cat beds/* > hg18.mm7.exonWalk.bed
cat genePred/*.gp > hg18.mm7.exonWalk.gp
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.gp
cat << '_EOF_' > makeNoNmdGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp
done
'_EOF_'
mkdir bedOrfNoNmd genePredNoNmd
chmod 755 ./makeNoNmdGenePred.sh
wc beds/*
# 275987 3311844 57319256 total
wc genePredNoNmd/*.gp
# 169203 1692030 59907679 total
wc genePred/*.gp
# 225252 2252520 83619240 total
cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp
cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp
cat beds/* > hg18.mm7.exonWalk.all.bed
# Plain "exonWalk" track is the only one used on regular genome browser.
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp
hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed
ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp
cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 7.000000
# average 10.670556
# min 1.000000
# max 3844.000000
# count 15857
# total 169203.000000
# standard deviation 63.330761
cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 8.000000
# average 14.037891
# min 1.000000
# max 7278.000000
# count 16046
# total 225252.000000
# standard deviation 99.406890
trackGenome hg18 all refGene:cds trackGenome.spec
# Track Specification track overlap track cov track new cum
#                     size  size    geno  track cov   cov cov
# -----------------------------------------------------------------------------
# exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24%
# end ExonWalk track.
###########################################################################
# ALTGRAPHX2 TRACK (kent) in progress Fri Jan 19 11:27:45 PST 2007
# The exoniphy and human/mouse blastz/chain/nets need to be done before
# this.
ssh hgwdev
cd /cluster/store1/sugnet/altSplice/
mkdir hg18-2007.01.19
cd hg18-2007.01.19
mkdir rnaCluster
cd rnaCluster
# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
# Make spec file to run.
echo "#!/bin/tcsh -ef@ > clusterRna.spec
foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Run the file. Needs to be done on machine with database access.
# Takes an hour or so.
chmod 755 clusterRna.spec
mkdir chrom
./clusterRna.spec >& clusterRna.log
cd ..
# Make a script to set up the job file for raw altGraphX files on human
# If we had a cluster with database access this could be run there.
# As it is, run it on hgwdev. This took 45 minutes.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
echo "#!/bin/tcsh -ef"
for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do
echo "echo 'Doing $chrom'"
echo "altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib"
done
'_EOF_'
# << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
cat agxs/*.agx > hg18.agx
mkdir hg18
mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18
cd ..
mkdir mm8
cd mm8
# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm8 rage.libs
echo "#!/bin/tcsh -ef" > clusterRna.spec
foreach c (`echo 'select chrom from chromInfo' | hgsql mm8 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm8.rage.libs mm8 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Could make this a cluster run if we had a cluster with database access.
# As is, it took about 15 minutes on hgwdev. (Faster than human since there are fewer ESTs.)
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &
cd ..
# Make a batch file to run the altSplice program.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
echo "#!/bin/tcsh -ef"
for chrom in `echo "select chrom from chromInfo" | hgsql mm8 | grep -v chrom`; do
echo "echo 'Doing $chrom'"
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm8 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm8.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm8/nib/$chrom.nib"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
# Run altSplice. This takes about 12 minutes.
mkdir agxs
./toRun.sh >& toRun.log &
cat agxs/*.agx > mm8.agx
cd ..
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg18 | grep -v txStart > hg18.exoniphy.bed
mkdir orthoSplice
cd orthoSplice
echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.all.chain.gz | chainSplit chains stdin
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.net.gz | netSplit stdin nets
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "orthoSplice -chromSize=$w[1] -exonFile=../hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../../mm8/mm8.agx -db=hg18 -orthoDb=mm8 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm8.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec
# do a little cluster run
ssh kki
cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice
para create orthoSplice.para.spec
para push
# Do para check, etc until done. Here's the para time results.
#
# 49 jobs in batch
# 147 jobs (including everybody's) in Parasol queue.
# Checking finished jobs
# Completed: 47 of 49 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 7002s 116.70m 1.94h 0.08d 0.000 y
# IO & Wait Time: 196s 3.27m 0.05h 0.00d 0.000 y
# Average job time: 153s 2.55m 0.04h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1283s 21.38m 0.36h 0.01d
# Submission to last job: 1283s 21.38m 0.36h 0.01d
#
# The two jobs that crashed are OK; they simply had no input on some of the
# small random chroms. It'd be good to take such jobs out earlier somehow.
# Probably Angie could figure out a way to add a file existence test in a
# line of the perl script above; the altInFile is what is missing in these
# cases.
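# (A hedged sketch of one way to drop those no-input jobs up front: keep only
# the spec lines whose -altInFile= target exists and is non-empty. The field
# name comes from the makeRun.sh output above.)
while read line
do
    agx=`echo "$line" | sed -e 's/.*-altInFile=//' -e 's/ .*//'`
    [ -s "$agx" ] && echo "$line"
done < orthoSplice.para.spec > orthoSplice.para.spec.filtered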
# Concatenate cluster output and load it into the database.
ssh hgwdev
cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice
cat agx/*.agx > hg18.mm8.t3.exoniphy.agx
cp ~/kent/src/hg/lib/altGraphX.sql .
hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX2 hg18.mm8.t3.exoniphy.agx
# clean up disk space we're not using (only the split nets/chains exist
# locally here; the merged chain and net files were read via zcat above)
rm nets/* chains/*
# end AltGraphX2 track.
####################################################################
# EXONWALK2 TRACK (kent) Tue Jan 24 2007
# first make altGraphX2 track (see above)
ssh hgwdev
cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy
mkdir exonWalk
mkdir beds
cd exonWalk
mkdir beds
foreach file (`ls ../orthoSplice/agx/*.agx`)
set base=`basename $file .agx`
echo "exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
# Execute the para spec as a batch file since it needs database access.
# takes about 2.5 hours
#para create exonWalk.para.spec
#para push
#cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed
time tcsh -efx exonWalk.para.spec
#8256.940u 21.747s 2:18:07.32 99.8% 0+0k 0+0io 0pf+0w
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf genePred
cd beds
# cp /cluster/store1/sugnet/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
cat ../../beds/*.bed | splitFile stdin 500 exonWalk.
cd ..
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa "
sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
makeFa.sh beds/*
cat << '_EOF_' > makeBorf.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "borfBig $file borf/$base.borf "
borfBig $file borf/$base.borf
done
'_EOF_'
chmod 755 makeBorf.sh
makeBorf.sh fa/*.fa
# Alternatively do this on the cluster. It takes a little doing to
# get a version of bestorf set up to be cluster accessible. I
# just copied it in from /projects/compbio/bin/borf, including
# copying in some binary files that the script referenced.
# As a parasol job on kk, here's what para time said:
# CPU time in finished jobs: 51577s 859.61m 14.33h 0.60d 0.002 y
# IO & Wait Time: 25442s 424.04m 7.07h 0.29d 0.001 y
# Average job time: 132s 2.19m 0.04h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 179s 2.98m 0.05h 0.00d
# Submission to last job: 307s 5.12m 0.09h 0.00d
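# (A minimal sketch of setting those jobs up as a parasol batch instead,
# assuming borfBig and the bestorf binaries it calls were copied somewhere
# cluster-visible as described above.)
# for f in fa/*.fa
# do
#     base=`basename $f`
#     echo "borfBig $f borf/$base.borf"
# done > borf.para.spec
# para create borf.para.spec ; para push ; para time > run.time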
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
chmod 755 makeGenePred.sh
makeGenePred.sh beds/*
cat beds/* > hg18.mm7.exonWalk.bed
cat genePred/*.gp | ldHgGene -predTab hg18 exonWalk2 stdin
cat << '_EOF_' > makeNoNmdGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp
done
'_EOF_'
mkdir bedOrfNoNmd genePredNoNmd
chmod 755 ./makeNoNmdGenePred.sh
wc beds/*
# 275987 3311844 57319256 total
wc genePredNoNmd/*.gp
# 169203 1692030 59907679 total
wc genePred/*.gp
# 225252 2252520 83619240 total
cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp
cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp
cat beds/* > hg18.mm7.exonWalk.all.bed
# Plain "exonWalk" track is the only one used on regular genome browser.
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp
hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed
ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp
cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 7.000000
# average 10.670556
# min 1.000000
# max 3844.000000
# count 15857
# total 169203.000000
# standard deviation 63.330761
cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 8.000000
# average 14.037891
# min 1.000000
# max 7278.000000
# count 16046
# total 225252.000000
# standard deviation 99.406890
trackGenome hg18 all refGene:cds trackGenome.spec
# Track Specification track overlap track cov track new cum
#                     size  size    geno  track cov   cov cov
# -----------------------------------------------------------------------------
# exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24%
# end ExonWalk track.
####################################################################
# LOAD ENSEMBL GENES (DONE, 2006-05-02, Fan)
# ADDED STABLE URL TO TRACKDB (DONE, 2006-05-29, hartera)
# ADDED RELEASE ALPHA AND RELEASE BETA VERSIONS OF TRACK ENTRY IN
# trackDb.ra SO THAT CORRECT ENSEMBL BUILD VERSION DISPLAYED AND LINKED TO
# AS DIFFERENT ENSEMBL BUILDS ON RR AND HGWDEV (DONE, 2007-09-25, hartera)
mkdir /cluster/data/hg18/bed/ensembl
cd /cluster/data/hg18/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than we do, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
|sed -e 's/c22_H2/22_h2_hap1/'\
|sed -e 's/c5_H2/5_h2_hap1/'\
|sed -e 's/c6_COX/6_cox_hap1/'\
|sed -e 's/c6_QBL/6_qbl_hap2/'\
| perl -wpe 's/^([0-9]|X|Y|Un|MT|5_h2_hap1|22_h2_hap1|6_cox_hap1|6_qbl_hap2)/chr$1/ || die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
| sed -e 's/chrMT/chrM/' \
> ensGene.gtf
ssh hgwdev
cd /cluster/data/hg18/bed/ensembl
# Remove hap chrom entries because Ensembl uses different genomic coordinates for them.
fgrep -v hap ensGene.gtf > ensGeneNew.gtf
/cluster/bin/i386/ldHgGene hg18 ensGene ensGeneNew.gtf
# Read 58424 transcripts in 1014240 lines in 1 files
# 58424 groups 25 seqs 1 sources 4 feature types
# 58424 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg18 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.txt
echo "load data local infile 'ensGtp.txt' into table ensGtp ignore 1 lines" | hgsql -N hg18
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 2) Choose protein_coding for gene type
# Page 3) Choose the "Sequences" box.
# Page 4) Check Ensembl Gene ID, Transcript ID, and Peptide ID; uncheck chrom, Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg18 ensembl ensemblPep.fa
# Added stable archive URL for Ensembl v38 to human/hg18/trackDb.ra
# (2006-05-29, hartera)
# Changed url line for ensGene entry to:
# url http://apr2006.archive.ensembl.org/perl/transview?transcript=$$
# (2007-09-25, hartera)
# Created a release beta version of this track in human/hg18/trackDb.ra
# with the ensArchive setting set to apr2006 to create the correct URL
# as above and add the correct version (version 38) in the label:
track ensGene
release beta
shortLabel Ensembl Genes
longLabel Ensembl (Build 38) Gene Predictions
group genes
priority 40
visibility hide
color 150,0,0
type genePred ensPep
ensArchive apr2006
# A separate trackDb entry (release alpha) was made for the updated
# track on hgwdev which is Build 46 (aug2007). This means that the
# correct version will be displayed and the correct links made on both
# the RR and hgwdev.
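# For reference, the release alpha entry presumably mirrors the beta stanza
# above with the newer build and archive date, roughly:
track ensGene
release alpha
shortLabel Ensembl Genes
longLabel Ensembl (Build 46) Gene Predictions
group genes
priority 40
visibility hide
color 150,0,0
type genePred ensPep
ensArchive aug2007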
# Create knownToEnsembl column (updated 2007-11-15 - Jim Kent)
hgMapToGene hg18 ensGene knownGene knownToEnsembl
# QA NOTE [ASZ: 9-11-2006]: mytouch on ensGtp and ensPep. This is because
# ensGene was updated later than they were. Ensembl treats hap chroms
# differently than we do. So the ensGene table was reloaded.
# sudo mytouch hg18 ensGtp 200605241000.00
# sudo mytouch hg18 ensPep 200605241000.00
# SGP GENES (DONE 5/3/06 Fan)
# See below for: SGP GENES Update (DONE - 2007-10-02 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg18/bed/sgp
cd /cluster/data/hg18/bed/sgp
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.prot
end
ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf
# VEGA LIFT FROM HG17 (DONE 5/22/06 acs)
# This can be replaced when the new version comes out (Tim Hubbard says soon)
ssh hgwdev
cd /cluster/store8/ensembl/vega33_35f
# there's a bad record at the top of both of these files
awk 'NF == 15 ' vegaGene.gp > tmp.gp
awk 'NF == 15 ' vegaPseudo.gp > tmp2.gp
zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp.gp stdin vegaGeneHg18.gp unMapped.gp -genePred
# only 6 dropped
zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp2.gp stdin vegaPseudoGeneHg18.gp unMappedPseudo.gp -genePred
# only 11 dropped
ldHgGene hg18 vegaGene -predTab vegaGeneHg18.gp -genePredExt
ldHgGene hg18 vegaPseudoGene -predTab vegaPseudoGeneHg18.gp -genePredExt
hgsql hg18 -N -B < /cluster/home/acs/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg18 -N -B
# SYNTENIC NETS FOR PANTRO2, RHEMAC2, MM8, RN4, AND CANFAM2 AS COMPOSITE TRACK (DONE 5/22/06 acs)
# (for use in defining orthologs for macaque paper)
ssh hgwdev
# load syntenic nets created previously by Robert
hgLoadNet hg18 netSyntenyPanTro2 /cluster/data/hg18/bed/blastz.panTro2/axtChain/hg18.panTro2.syn.net
zcat /cluster/data/hg18/bed/blastz.rheMac2/axtChain/hg18.rheMac2.syn.net.gz | hgLoadNet hg18 netSyntenyRheMac2 stdin
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.syn.net.gz | hgLoadNet hg18 netSyntenyMm8 stdin
zcat /cluster/data/hg18/bed/blastz.rn4/axtChain/hg18.rn4.syn.net.gz | hgLoadNet hg18 netSyntenyRn4 stdin
zcat /cluster/data/hg18/bed/blastz.canFam2/axtChain/hg18.canFam2.syn.net.gz | hgLoadNet hg18 netSyntenyCanFam2 stdin
# add more distant vertebrates to track so we can evaluate
# syntenic netting for multiple alignment (2007-03-10 kate)
cd /cluster/data/hg18/bed
netFilter -syn blastz.danRer4/axtChain/hg18.danRer4.net.gz | \
hgLoadNet hg18 netSyntenyDanRer4 stdin
netFilter -syn blastz.galGal3/axtChain/hg18.galGal3.net.gz | \
hgLoadNet hg18 netSyntenyGalGal3 stdin
netFilter -syn blastz.monDom4/axtChain/hg18.monDom4.net.gz | \
hgLoadNet -warn hg18 netSyntenyMonDom4 stdin
netFilter -syn blastz.ornAna1/axtChain/hg18.ornAna1.net.gz | \
hgLoadNet hg18 netSyntenyOrnAna1 stdin
netFilter -syn blastz.anoCar1/axtChain/hg18.anoCar1.net.gz | \
hgLoadNet hg18 netSyntenyAnoCar1 stdin
netFilter -syn blastz.xenTro2/axtChain/hg18.xenTro2.net.gz | \
hgLoadNet hg18 netSyntenyXenTro2 stdin
netFilter -syn blastz.fr2/axtChain/hg18.fr2.net.gz | \
hgLoadNet hg18 netSyntenyFr2 stdin
netFilter -syn blastz.equCab1/axtChain/hg18.equCab1.net.gz | \
hgLoadNet hg18 netSyntenyEquCab1 stdin
netFilter -syn blastz.bosTau3/axtChain/hg18.bosTau3.net.gz | \
hgLoadNet -warn hg18 netSyntenyBosTau3 stdin
netFilter -syn blastz.oryLat1/axtChain/hg18.oryLat1.net.gz | \
hgLoadNet hg18 netSyntenyOryLat1 stdin
cat > netCov.csh << 'EOF'
#!/bin/csh -ef
foreach db (PanTro2 RheMac2 Mm8 Rn4 CanFam2 EquCab1 BosTau3 MonDom4 OrnAna1 GalGal3 AnoCar1 XenTro2 DanRer4 Fr2 OryLat1)
echo -n " "
featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment
featureBits -countGaps -chrom=chr1 hg18 refGene:cds netSynteny$db -enrichment
echo ""
end
'EOF'
csh netCov.csh >&! netCov.log &
cat netCov.log
#refGene:cds 1.282%, netPanTro2 99.979%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyPanTro2 99.978%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netRheMac2 99.970%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyRheMac2 99.961%, both 1.282%, cover 99.97%, enrich 1.00x
#refGene:cds 1.282%, netMm8 98.650%, both 1.278%, cover 99.69%, enrich 1.01x
#refGene:cds 1.282%, netSyntenyMm8 98.352%, both 1.255%, cover 97.89%, enrich 1.00x
#refGene:cds 1.282%, netRn4 98.404%, both 1.281%, cover 99.89%, enrich 1.02x
#refGene:cds 1.282%, netSyntenyRn4 98.074%, both 1.258%, cover 98.10%, enrich 1.00x
#refGene:cds 1.282%, netCanFam2 99.527%, both 1.281%, cover 99.91%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyCanFam2 99.274%, both 1.272%, cover 99.16%, enrich 1.00x
#refGene:cds 1.282%, netEquCab1 99.457%, both 1.281%, cover 99.87%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyEquCab1 99.020%, both 1.270%, cover 99.06%, enrich 1.00x
#refGene:cds 1.282%, netBosTau3 99.641%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyBosTau3 99.493%, both 1.280%, cover 99.81%, enrich 1.00x
#refGene:cds 1.282%, netMonDom4 98.718%, both 1.279%, cover 99.72%, enrich 1.01x
#refGene:cds 1.282%, netSyntenyMonDom4 98.029%, both 1.260%, cover 98.26%, enrich 1.00x
#refGene:cds 1.282%, netOrnAna1 68.119%, both 1.168%, cover 91.06%, enrich 1.34x
#refGene:cds 1.282%, netSyntenyOrnAna1 56.729%, both 0.714%, cover 55.67%, enrich 0.98x
#refGene:cds 1.282%, netGalGal3 82.246%, both 1.189%, cover 92.68%, enrich 1.13x
#refGene:cds 1.282%, netSyntenyGalGal3 80.379%, both 1.101%, cover 85.86%, enrich 1.07x
#refGene:cds 1.282%, netAnoCar1 63.263%, both 1.128%, cover 87.97%, enrich 1.39x
#refGene:cds 1.282%, netSyntenyAnoCar1 54.068%, both 0.816%, cover 63.65%, enrich 1.18x
#refGene:cds 1.282%, netXenTro2 45.072%, both 1.057%, cover 82.44%, enrich 1.83x
#refGene:cds 1.282%, netSyntenyXenTro2 31.985%, both 0.596%, cover 46.44%, enrich 1.45x
#refGene:cds 1.282%, netDanRer4 28.211%, both 1.012%, cover 78.87%, enrich 2.80x
#refGene:cds 1.282%, netSyntenyDanRer4 7.631%, both 0.177%, cover 13.83%, enrich 1.81x
#refGene:cds 1.282%, netFr2 26.938%, both 0.975%, cover 76.03%, enrich 2.82x
#refGene:cds 1.282%, netSyntenyFr2 7.991%, both 0.200%, cover 15.62%, enrich 1.95x
# Conclusion: CDS coverage loss is small in all placentals and opossum, so
# use syntenic net mafs for these in multiz.
# Ask about chicken -- it's marginal
# Robert prepped synMafNet's for some species, but the files lack
# soft-masked sequence, so redo if time.
# (set up trackDb.ra entry for composite track)
# SYNTENIC NET MAFS FOR MULTIZ (2007-03-09 kate)
# Compare with Robert's
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.rheMac2
mv mafSynNet mafSynNet.robert
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.panTro2
# need DEF file for syntenic net, but this was
# a swapped run, so we will simulate
cp /cluster/data/panTro2/bed/blastz.hg18/DEF .
# edit to reverse target and query, and change BASE dir
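# (A hedged sketch of that edit: swap the SEQ1_*/SEQ2_* stanzas and repoint
# BASE at this directory. Comment lines such as "# TARGET:" and the
# chunk/overlap settings may still need adjusting by hand.)
sed -e 's/^SEQ1_/SWAP_/' -e 's/^SEQ2_/SEQ1_/' -e 's/^SWAP_/SEQ2_/' \
    -e 's#^BASE=.*#BASE=/cluster/data/hg18/bed/blastz.panTro2#' \
    /cluster/data/panTro2/bed/blastz.hg18/DEF > DEF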
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
rm DEF
# edit DEF file to reference kolossus-accessible sequence and chrom.sizes
cd /cluster/data/hg18/bed/blastz.monDom4
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
cd /cluster/data/hg18/bed/blastz.equCab1
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
cd /cluster/data/hg18/bed/blastz.bosTau3
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
cd /cluster/data/hg18/bed/blastz.mm8
cp /cluster/data/mm8/bed/blastz.hg18/DEF .
# edit to reverse target & query, change BASE
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log
rm -f DEF
cd /cluster/data/hg18/bed/blastz.rn4
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log
cd /cluster/data/hg18/bed/blastz.canFam2
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
# use syntenic net on opossum too
cd /cluster/data/hg18/bed/blastz.monDom4
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
# NET AND RECIPROCAL BEST TABLES FOR 2X MAMMALS
# load net and reciprocal best net for comparison
# rabbit
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.oryCun1/axtChain
netFilter -minGap=10 hg18.oryCun1.net | hgLoadNet -warn hg18 netOryCun1 stdin
netFilter -minGap=10 hg18.oryCun1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestOryCun1 stdin
# tenrec
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.echTel1/axtChain
netFilter -minGap=10 hg18.echTel1.net.gz | hgLoadNet -warn hg18 netEchTel1 stdin
netFilter -minGap=10 hg18.echTel1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestEchTel1 stdin
# net coverage
ssh hgwdev
cd /cluster/data/hg18/bed
cat > netRBestCov.csh << 'EOF'
#!/bin/csh -ef
foreach db (OtoGar1 OryCun1 CavPor2 LoxAfr1 EchTel1 DasNov1)
echo -n " "
featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment
featureBits -countGaps -chrom=chr1 hg18 refGene:cds netRBest$db -enrichment
echo ""
end
'EOF'
# << emacs
csh netRBestCov.csh >&! netRBestCov.log &
##########################################################################
# EVOFOLD (Done, 05/12/06) Jakob Skou Pedersen
# RNA secondary structure predictions lifted from hg17 and filtered
ssh -C hgwdev
mkdir -p /cluster/data/hg18/bed/evofold
cd /cluster/data/hg18/bed/evofold
echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz tmp.bed unmapped.bed
# remove elements which are wrong size after lifting
awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsHg18.bed
# structure filters
# first, remove pairs that can't form in human
cut -f 1-6 rawFoldsHg18.bed > tmp.bed
# sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/
nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=hg18 -bedIn=tmp.bed -fastaOut=tmp.fa
cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \
| sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsHg18Seq.tab
join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsHg18.bed foldsHg18Seq.tab | sed -e 's/ */\t/g' | sort -k1,1 \
| /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab
join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsHg18.bed cleanFolds.tab | sed -e 's/ */\t/g' > tmp1.bed
# second, remove poor predictions
# scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules
cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \
| /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsHg18.bed
# clean up
rm tmp.bed tmp1.bed foldsHg17.bed foldsHg18Seq.tab rawFoldsHg18.bed tmp.fa cleanFolds.tab
# upload
hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql hg18 evofold foldsHg18.bed
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/23/06 angie)
ssh pk
mkdir /cluster/data/hg18/bed/blastz.galGal3.2006-05-22
cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22
cat << '_EOF_' > DEF
# human vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.galGal3.2006-05-22
'_EOF_'
# << emacs
~/kent/src/utils/doBlastzChainNet.pl DEF \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-22 /cluster/data/hg18/bed/blastz.galGal3
# running syntenicNet 2008-10-30
# had to update the DEF file to correspond to new hive layout
cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22
mv DEF DEF.0
cat << '_EOF_' > DEF
# human vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_SMSK=/scratch/data/hg18/linSpecRep/notInMouseRat
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/galGal3/nib
SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastz.galGal3.2006-05-22
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-bigClusterHub=swarm -smallClusterHub=memk \
-continue=syntenicNet -syntenicNet \
-chainMinScore=5000 -chainLinearGap=loose > synNet.log 2>&1
# worked OK in about 3 minutes
#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-09 - Hiram)
# download data from "James Taylor" <james at bx.psu.edu>
ssh kkstore02
mkdir /cluster/data/hg18/bed/regPotential7X
cd /cluster/data/hg18/bed/regPotential7X
# This is a lot of data
for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2"
done
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/trackDb.html" -O description.html
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
bzcat chr${C}.scores.truncated.bz2
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 23m27.454s
# user 22m41.058s
# sys 0m41.850s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/regPotential7X
ln -s /cluster/data/hg18/bed/regPotential7X/regPotential7X.wib \
/gbdb/hg18/wib/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time hgLoadWiggle -tmpDir=/scratch/tmp \
hg18 regPotential7X regPotential7X.wig
# How about a histogram of the data.
# find min and max for everything to verify it is 0 to 1
ssh kkstore02
cd /cluster/data/hg18/bed/regPotential7X
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
echo " ============ ${C} ======================="
bzcat chr${C}.scores.truncated.bz2 | ave -col=2 stdin
done > stats.all 2>&1
grep "^min" stats.all | sort -u
# min 0.000000
grep "^max" stats.all | sort -u
# max 1.000000
ssh kolossus
cd /cluster/data/hg18/bed/regPotential7X
time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
-hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1
# real 2m42.311s
# 73 % of the data values are zero
# create download gzip files from the bz2 files:
ssh kkstore02
cd /cluster/data/hg18/bed/regPotential7X
for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg18.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz
echo
done
#########################################################################
# create md5sum.txt under bigZips (DONE, 6/7/06, Fan)
cd /cluster/store11/gs.19/build36/downloads/bigZips
md5sum *.zip *.2bit README.txt > md5sum.txt
#########################################################################
# UPDATE BACENDS track (DONE - 2006-06-16 - Hiram)
# An attempt to recover some of the missing clones from the
# bacEnds track. It turns out the perl processing script wasn't
# properly categorizing all the clone ends, thus a lot of them
# were being left out of the final track
ssh hgwdev
mkdir /cluster/data/hg18/bed/updateCloneEnds
cd /cluster/data/hg18/bed/updateCloneEnds
ln -s ../cloneend/all.txt.gz .
# Checked this script into the source tree and fixed it up to
# recognize more of the categories of clone ends
zcat all.txt.gz | $HOME/kent/src/hg/utils/cloneEndParse.pl /dev/stdin
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 301377 pairs and 204698 singles
# Note that there are none marked as "unclassified" - this script
# will print out that message to stderr if it doesn't recognize
# any marker classifications. This produces the files:
# -rw-rw-r-- 1 9645568 Jun 16 14:09 cloneEndPairs.txt
# -rw-rw-r-- 1 4906468 Jun 16 14:09 cloneEndSingles.txt
wc -l clone*.txt
# 301377 cloneEndPairs.txt
# 204698 cloneEndSingles.txt
# This is a lot better than previous:
wc -l ../cloneend/cloneEnd*.txt
# 249619 ../cloneend/cloneEndPairs.txt
# 318500 ../cloneend/cloneEndSingles.txt
mkdir /san/sanvol1/scratch/hg18/updateBacEnds
cd /san/sanvol1/scratch/hg18/updateBacEnds
ln -s ../bacends/bacEnds.sorted.psl .
ln -s ../bacends/lifted .
pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.sorted.psl \
/cluster/data/hg18/bed/updateCloneEnds/cloneEndPairs.txt \
all_bacends bacEnds
echo -e \
'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del \
> bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# looks like we are getting a lot more now in every category:
wc -l bacEnds.* bacEndPairs* | sort -n
49 bacEnds.long
1399 bacEnds.mismatch
4516 bacEnds.slop
7202 bacEnds.short
66861 bacEnds.orphan
78900 bacEndPairsBad.bed
205443 bacEndPairs.bed
207997 bacEnds.pairs
1727387 bacEnds.load.psl
# Previously:
wc -l ../bacends/bacEnds.* ../bacends/bacEndPairs* | sort -n
40 ../bacends/bacEnds.long
1061 ../bacends/bacEnds.mismatch
3954 ../bacends/bacEnds.slop
6279 ../bacends/bacEnds.short
59245 ../bacends/bacEnds.orphan
69788 ../bacends/bacEndPairsBad.bed
159268 ../bacends/bacEndPairs.bed
161251 ../bacends/bacEnds.pairs
1249956 ../bacends/bacEnds.load.psl
# Move the previous build out of the way and copy these
# results over to the primary hg18 bed location:
mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-02-02
mkdir /cluster/data/hg18/bed/bacends
cp -p bacEnd* /cluster/data/hg18/bed/bacends
cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends
# load them into the database
ssh hgwdev
cd /cluster/data/hg18/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 202488 1000
# 255 300
# 416 375
# 384 500
# 1900 750
# edit the file and fix it if it has a bad name.
sed -e "s/bacEndPairs /bacEndPairsUpdate /" \
$HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsUpdate.sql
hgLoadBed -notItemRgb hg18 bacEndPairsUpdate bacEndPairs.bed \
-sqlTable=bacEndPairsUpdate.sql
# Loaded 205443 elements of size 11
# Previously was:
# Loaded 159268
# note - this track isn't pushed to RR, just used for assembly QA
sed -e "s/bacEndPairsBad /bacEndPairsBadUpdate /" \
$HOME/kent/src/hg/lib/bacEndPairsBad.sql > bacEndPairsBadUpdate.sql
hgLoadBed -notItemRgb hg18 bacEndPairsBadUpdate bacEndPairsBad.bed \
-sqlTable=bacEndPairsBadUpdate.sql
# Loaded 78900 elements of size 11
# Previously was:
# Loaded 69788
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg18 -table=all_bacendsUpdate bacEnds.load.psl
# no complaints ! Usually there are, this loaded:
hgsql -N -e "select count(*) from all_bacendsUpdate;" hg18
# 1727387
# Previously this was:
# 1249956
nice featureBits hg18 all_bacendsUpdate
# 227770876 bases of 2881515245 (7.905%) in intersection
nice featureBits hg18 all_bacends
# 191078854 bases of 2881515245 (6.631%) in intersection
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg18 bacEndPairsUpdate
# 162690030 bases of 2881515245 (5.646%) in intersection
nice featureBits hg18 bacEndPairs
# 130270940 bases of 2881515245 (4.521%) in intersection
nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection
nice featureBits hg18 bacEndPairsBadUpdate
# 37326990 bases of 2881515245 (1.295%) in intersection
nice featureBits hg18 bacEndPairsBad
# 33650226 bases of 2881515245 (1.168%) in intersection
nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection
# Renamed the new BAC End Pairs tables (7-27-2006 Brooke)
mysql> alter table all_bacends rename all_bacendsOld;
Query OK, 0 rows affected (0.01 sec)
mysql> alter table bacEndPairs rename bacEndPairsOld;
Query OK, 0 rows affected (0.00 sec)
mysql> alter table all_bacendsUpdate rename all_bacends;
Query OK, 0 rows affected (0.00 sec)
mysql> alter table bacEndPairsUpdate rename bacEndPairs;
Query OK, 0 rows affected (0.00 sec)
#########################################################################
# dbSNP BUILD 126 (Heather, June 2006)
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP
mkdir 126
cd 126
mkdir human
cd human
mkdir data
mkdir schema
mkdir rs_fasta
# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/126/human/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b126_SNPContigLoc_36_1.bcp.gz
# ContigLocusId has function
get b126_SNPContigLocusId_36_1.bcp.gz
get b126_ContigInfo_36_1.bcp.gz
# MapInfo has alignment weights
get b126_SNPMapInfo_36_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz
# Get schema from NCBI
cd /cluster/data/dbSNP/126/human/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_schema
get human_9606_table.sql.gz
# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/126/human/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/rs_fasta
mget *.gz
# Simplify names of data files
cd /cluster/data/dbSNP/126/human/data
mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz
mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz
mv b126_ContigInfo_36_1.bcp.gz ContigInfo.gz
mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist
# edit table descriptions
cd /cluster/data/dbSNP/126/human/schema
# get CREATE statements from human_9606_table.sql for our 5 tables
# store in table.tmp
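# (the extraction itself was not recorded; one hedged way to do it, assuming the
#  MSSQL-style dump keeps one CREATE TABLE statement per table, each ended by "GO":)
# zcat human_9606_table.sql.gz \
#   | sed -n '/CREATE TABLE.*SNPContigLoc_36_1/,/^GO/p' > table.tmp
# ... repeat (appending with >>) for SNPContigLocusId_36_1, ContigInfo_36_1,
#     SNPMapInfo_36_1 and SNP ...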
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# Get updated UniVariation table
cd /cluster/data/dbSNP/126/shared
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/shared_data
get UniVariation.bcp.gz
cd ../shared_schema
get dbSNP_main_table.sql.gz
# get UniVariation CREATE statement from dbSNP_main_table.sql
# use mssqlToMysql.sed to convert
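# (hedged sketch of that step, under the same "GO"-delimited dump assumption:)
# zcat dbSNP_main_table.sql.gz | sed -n '/CREATE TABLE.*UniVariation/,/^GO/p' \
#   | sed -f ../human/schema/mssqlToMysql.sed > UniVariation.sql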
# get header lines from rs_fasta
cd /cluster/data/dbSNP/126/human/rs_fasta
/bin/csh gnl.csh
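# (gnl.csh itself is not included here; presumably it just pulls the
#  ">gnl|dbSNP|..." header lines out of each per-chrom fasta into rs_chN.gnl, e.g.:)
# foreach file (rs_ch*.fas.gz)
#   set base = `basename $file .fas.gz`
#   zcat $file | grep "^>gnl" > $base.gnl
# end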
# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
# also for human, don't include chrUn
cp rs_ch*.fas.gz /cluster/store12/snp/126/human/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/126/human/rs_fasta
mkdir unarchive
mv rs_chUn.fas.gz unarchive
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas.gz)
echo $file
zcat $file >> rsAll.fas
end
'_EOF_'
# << emacs
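# run it to build rsAll.fas (the invocation was not captured above; presumably just:)
# /bin/csh concat.csh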
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/hg18/snp
ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg18/snp/snp.fa
cd /cluster/store12/snp/126/human/rs_fasta
hgLoadSeq hg18 /gbdb/hg18/snp/snp.fa
# look up id in extFile
# move into separate table
hgsql hg18 < snpSeq.sql
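# (snpSeq.sql is not reproduced here; from the insert statement below it creates
#  a small two-column table, roughly -- column types are an assumption:)
# CREATE TABLE snpSeq (
#   acc varchar(255) not null,
#   file_offset bigint not null
# );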
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 15200238' hg18
hgsql -e 'delete from seq where extFile = 15200238' hg18
hgsql -e 'alter table snpSeq add index acc (acc)' hg18
# clean up after hgLoadSeq
rm seq.tab
# load on kkr5u00
ssh kkr5u00
    hgsql -e 'create database hg18snp126' mysql
cd /cluster/data/dbSNP/126/human/schema
hgsql hg18snp126 < table.sql
cd ../data
/bin/csh load.csh
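# (load.csh contents not recorded; presumably one "load data" per file, along
#  the lines of the UniVariation load below:)
# foreach t (ContigLoc ContigLocusId ContigInfo MapInfo SNP)
#   zcat $t.gz | hgsql -e "load data local infile '/dev/stdin' into table $t" hg18snp126
# end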
# note rowcount
# ContigLoc 27007176
# SNP 11961761
# MapInfo 11712346
# ContigLocusId 11854143
cd /cluster/data/dbSNP/126/shared
hgsql hg18snp126 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' hg18snp126
# create working /scratch dir
cd /scratch/snp
mkdir 126
cd 126
mkdir human
cd human
# get hg18 ctgPos, load into dbSnpHumanBuild126, compare contig list between ctgPos and ContigInfo
# Note: missing chrY PAR regions
# get gnl files
cp /cluster/data/dbSNP/126/human/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly"
cd /scratch/snp/126/human
# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | start | int(11) | NO | | | |
# | end | int(11) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter hg18snp126 ref_assembly reference
# note rowcount
# ContigLocFilter 12368145
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 10622168
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# | 205359 | 1 |
# | 10678378 | 2 |
# | 1464642 | 3 |
# | 9025 | 4 |
# | 1117 | 5 |
# | 9624 | 6 |
# +----------+----------+
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter hg18snp126 ref_assembly
# note rowcount
# ContigLocusIdFilter 5812538
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126
# note rowcount; expect about 50% for human
# ContigLocusIdCondense 3975405 (note this is smaller than hg17/snp125)
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# 266,366 duplicates detected in snpMoltype.errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta hg18snp126
# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)
# split ContigLocFilter by chrom
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom hg18snp126 ref_assembly
# adjust coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType
# Between with end != start + 1
# Between with allele != '-'
# Exact with end != start
# Range with end < start
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# I do note that out of 25K rows where loc_type == 6, 12259 have asn_from == asn_to
# All of loc_type == 1, 4, 5 have zero rows where asn_from == asn_to
# This was also true in build125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype hg18snp126 ref_assembly
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# 8092 alleles expanded
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele hg18snp126 ref_assembly
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort hg18snp126 ref_assembly
# rename MT --> M (pipeline.csh takes care of this)
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" hg18snp126
# get hg18 nib files
    # get hg18 chromInfo, load into hg18snp126 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run we got 30678 lines in snpRefUCSC.error
# 12178 from chr14 (reported to dbSNP)
# also 18423 from chr1_random and 77 from chr6_random
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC hg18snp126
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 119366 RefAlleleNotRevComp
# Note this is double from build125
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles hg18snp126
# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 39059 univar_id = 0 in SNP
# 879 no row in SNP for snp_id in chrN_snpTmp (all chr6)
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved hg18snp126
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedWrongFormat
# ObservedWrongSize (twice as many as hg17/snp125)
# ObservedMismatch (nearly 3x as many as hg17/snp125)
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved hg18snp126
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction hg18snp126
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# this run we got 8 missing
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP hg18snp126
# add molType
# errors detected: missing or duplicate molType
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype hg18snp126
# generate chrN_snp126 and snp126Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable hg18snp126 126
# handle chrY PAR SNPs (still missing from dbSNP)
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR hg18snp126
hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp126Exceptions' hg18snp126
# concat into snp126.tab
# cat chr*_snp126.tab >> snp126.tab
# note chr18_random_snp126.tab is empty (just 2 rows in hg17/snp125)
/bin/sh concat.sh
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple hg18snp126
mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions;
# run and review snpCompareLoctype
# load snp125subset
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype hg18snp126 snp125subset snp126
# cat snpCompareLoctypeCounts.out
# note: rangeToExact is 2x 124/125 conversion rate
# exactToExact = 8747888
# exactToBetween = 1071
# exactToRange = 6673
# betweenToBetween = 321371
# betweenToExact 1323
# betweenToRange 514
# rangeToRange = 95562
# rangeToBetween = 1794
# rangeToExact = 15148
# oldToNew = 10649
# run and review snpCompareWeight
# load into database snp125snp126
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareWeight snp125snp126 weight125 weight126
# cat snpCompareWeightCounts.out
# oneToOne = 9161896
# oneToTwo = 0 <-- good
# oneToThree = 531 <--- interesting but minor
# twoToTwo = 38 <-- okay
# twoToOne = 1896 <--- improvement
# twoToThree = 0 <-- good
# threeToThree = 494 <-- okay
# threeToOne = 37571 <-- improvement
# threeToTwo = 12 <-- improvement
# load on hgwdev
cp snp126.tab /cluster/home/heather/transfer/snp
hgsql hg18snp126 -e 'select * from snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp126.tab' into table snp126;
mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions;
# create indexes
mysql> alter table snp126 add index name (name);
mysql> alter table snp126 add index chrom (chrom, bin);
mysql> alter table snp126Exceptions add index name(name);
# create snp126ExceptionDesc table
cd /cluster/data/dbSNP
hgsql hg18 < snp126ExceptionDesc.sql
# add counts to exception.human.126, can start with exception.template
hgsql -e 'select count(*), exception from snp126Exceptions group by exception' hg18
mysql> load data local infile 'exception.human.126' into table snp126ExceptionDesc;
################################################################
# SNP126 edit: condense UTR/intron func into just intron at Jim's request
ssh kkr5u00
cd /scratch/snp/126/human
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126
/bin/csh pipeline.csh
ssh hgwdev
cd /cluster/home/heather/transfer/snp
hgsql hg18 -e 'drop table snp126'
hgsql hg18 < /cluster/home/heather/kent/src/hg/lib/snp126.sql
hgsql hg18 -e 'load data local infile "snp126.tab" into table snp126'
hgsql hg18 -e 'alter table snp126 add index name (name)'
hgsql hg18 -e 'alter table snp126 add index chrom (chrom, bin)'
################################################################
# SNP126 edit: detect clustering errors (Heather, Sept. 2006)
# for locType = 'between' (class = 'insertion')
# 1,393,040 candidates
# exceptions:
# DuplicatedObserved (3020 of these)
# MixedObserved (1312 of these)
# create and populate a simple table snp126insertions
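# (the table creation itself was not recorded; from the insert below it needs
#  these seven columns -- the types here are an assumption:)
# CREATE TABLE snp126insertions (
#   chrom      varchar(31)       not null,
#   chromStart int unsigned      not null,
#   chromEnd   int unsigned      not null,
#   name       varchar(15)       not null,
#   score      smallint unsigned not null,
#   strand     char(1)           not null,
#   observed   blob
# );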
mysql> insert into snp126insertions
select chrom, chromStart, chromEnd, name, score, strand, observed from snp126
where locType = 'between' and class = 'insertion';
# generate and load data
cd /cluster/home/heather/kent/src/hg/snp/snpLoad
./snpCheckCluster hg18 snp126insertions
mysql> load data local infile 'snpCheckCluster.tab' into table snp126Exceptions;
# update snp126ExceptionDesc
################################################################
# generate snpMasked sequence for snp126 (Heather, Sept. 2006)
# snpMaskChrom was run too, not documented here.
# OBSOLETED by snp128Mask, see below.
# 3 steps: simple filtering, advanced filtering, generate sequence
# simple filtering: create and populate tables
# insertions: 1,393,040
# deletions: 783,454
ssh hgwdev
mysql> insert into snp126insertions select * from snp126
where locType = 'between' and class = 'insertion';
mysql> insert into snp126deletions select * from snp126
where class = 'deletion';
# advanced filtering -- insertions
cd /cluster/home/heather/kent/src/hg/snp/snpLoad
# this removes SNPs with weight != 1
# this removes SNPs that align to more than one position
# this removes SNPs that cluster together with conflicting observations
# (these should be class = 'mixed')
# this removes SNPs with invalid observed string
# this asserts end == start
# final count 1,352,380
# written to insertions.tab
./snpGetInsertions hg18 snp126insertions snp126Exceptions
# advanced filtering -- deletions
cd /cluster/home/heather/kent/src/hg/snp/snpLoad
# this removes SNPs with weight != 1
# this removes SNPs that align to more than one position
# this removes SNPs with invalid observed string
# this removes SNPs with exception ObservedWrongSize
# this asserts end > start
# final count 621,024
# written to deletions.tab
./snpGetDeletions hg18 snp126deletions snp126Exceptions
# Note: the advanced filtering pretty much removes all SNPs from chrN_random
# generate sequence -- insertions
# use kent/src/hg/snp/snpMask/seqWithInsertions.c
# this asserts that position doesn't exceed chromSize
# this will reverse complement observed if strand is negative
# if no SNPs found, output sequence == input sequence
# write to chrN.fat
ssh kkr5u00
mysql> load data local infile
"/cluster/home/heather/kent/src/hg/snp/snpLoad/insertions.tab" into table
snp126insertionsClean;
cd /scratch/snp126/human/fat
/bin/sh fat.sh
cp *.fat /cluster/data/hg18/snpMask/insertions
ssh kkstore02
cd /cluster/data/hg18/snpMask/insertions
nice gzip *.fat
# generate sequence -- deletions
# use kent/src/hg/snp/snpMask/seqWithoutDeletions.c
# this asserts that position doesn't exceed chromSize
# if no SNPs found, output sequence == input sequence
# write to chrN.skinny
ssh kkr5u00
mysql> load data local infile
"/cluster/home/heather/kent/src/hg/snp/snpLoad/deletions.tab" into table
snp126deletionsClean;
cd /scratch/snp126/human/skinny
/bin/sh skinny.sh
cp *.skinny /cluster/data/hg18/snpMask/deletions
ssh kkstore02
cd /cluster/data/hg18/snpMask/deletions
nice gzip *.skinny
# create links on hgwdev
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/insertions
/bin/sh link.sh
cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/deletions
/bin/sh link.sh
############################################################################
# Lift simple bi-allelic SNPs to rheMac2 and panTro2 (Heather, August 2006)
# OBSOLETED by snp128Ortho, see below.
ssh hgwdev
cd /cluster/data/dbSNP/ortho/hg18/snpDump
# dump raw data -- this creates snpGetSimple.chr*
# exceptions table is used to skip SNPs that align in multiple places
# We also skip SNPs on chrN_random
# We also skip triallelic and quadallelic
# We don't filter on weight
# This yields 9,092,533 SNPs
# This data is also stored into hg18.snp126simple for later use
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpGetSimple hg18 snp126 snp126Exceptions
# split up into just under 200 files to make for an efficient pk run
# using file size of 60K lines
# this creates /cluster/data/dbSNP/ortho/hg18/split/chr1-01, chr1-02, chr1-03, etc.
# 165 files created
# 140 files have 60k lines
/bin/csh split.csh
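# (split.csh not shown; a sketch of the idea with the standard split utility,
#  60000 lines per piece -- the exact output naming is assumed:)
# foreach f (snpGetSimple.chr*)
#   set c = `echo $f | sed 's/snpGetSimple\.//'`
#   split -d -l 60000 $f /cluster/data/dbSNP/ortho/hg18/split/$c-
# end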
# prepare cluster runs
# I didn't use -bedPlus=6, didn't seem to need it
cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/rheMac2/input
cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/panTro2/input
cd /san/sanvol1/snp/liftOver/hg18/rheMac2
/bin/csh makeJobList.csh
rm -f jobList
foreach fileName (`ls input/chr*`)
set baseName = $fileName:t
echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList
end
cd /san/sanvol1/snp/liftOver/hg18/panTro2
/bin/csh makeJobList.csh
rm -f jobList
foreach fileName (`ls input/chr*`)
set baseName = $fileName:t
echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList
end
# do cluster runs
# this only took a few minutes
# got 7321537 lifts for rheMac2
# got 8517465 lifts for panTro2
ssh pk
cd /san/sanvol1/snp/liftOver/hg18/rheMac2
para create jobList
para try; para check; para push; para check; etc.
cd /san/sanvol1/snp/liftOver/hg18/panTro2
para create jobList
para try; para check; para push; para check; etc.
# concatenate output files into all.out
cd /san/sanvol1/snp/liftOver/hg18/rheMac2/output
/bin/csh concat.csh
cd /san/sanvol1/snp/liftOver/hg18/panTro2/output
/bin/csh concat.csh
# load into panTro2 and rheMac2
# Doing the load and split so I can easily load sequence for a full chrom
ssh hgwdev
cp /san/sanvol1/snp/liftOver/hg18/rheMac2/output/all.out /cluster/data/dbSNP/ortho/hg18/rheMac2Lift
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Lift
hgsql rheMac2 < snp126hg18ortho.sql
hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' rheMac2
cp /san/sanvol1/snp/liftOver/hg18/panTro2/output/all.out /cluster/data/dbSNP/ortho/hg18/panTro2Lift
cd /cluster/data/dbSNP/ortho/hg18/panTro2Lift
hgsql panTro2 < snp126hg18ortho.sql
hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' panTro2
# split by chrom
# this creates tables chrN_snp126hg18ortho and can be run from anywhere
# it will create chrN_snp126hg18ortho.tab files which can be deleted
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 rheMac2 snp126hg18ortho
rm chr*.tab
# rm snp126ortho.tab
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 panTro2 snp126hg18ortho
rm chr*.tab
# rm snp126ortho.tab
# get sequence
# this creates chrN_snp126hg18orthoPrelim.tab files
# random chroms are okay here
# note we are including Ns
# This will log to fetchSeq.errors any examples where chromEnd != chromStart + 1
# It will also check for coordinates past the end of the chrom.
# No errors for rheMac2 or panTro2.
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq rheMac2 /cluster/data/rheMac2/rheMac2.2bit
# ssh kkstore02
# cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
/bin/csh concat.csh
# cleanup; remove split tables from rheMac2, keep snp126hg18orthoPrelim
hgsql rheMac2 < drop.sql
rm chr*.tab
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq panTro2 /cluster/data/panTro2/panTro2.2bit
# ssh kkstore02
# cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
/bin/sh concat.sh
# cleanup; remove split tables from panTro2, keep snp126hg18orthoPrelim
hgsql panTro2 < drop.sql
rm chr*.tab
# do a preliminary load -- combine chimp and macaque
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
hgsql hg18 < snp126orthoPrelim.sql
hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18
# add human chrom, chromStart, chromEnd, allele, variant
# liftOver loses the chrom, chromStart and chromEnd
# liftOver does retain the allele and variant
cd /cluster/data/dbSNP/ortho/hg18/integrate
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoLookup hg18 snp126simple snp126orthoPrelim
# load final table with separate rows for chimp and macaque
# drop snp126orthoPrelim because it has non-human coords
# rm tab file because it is huge
hgsql hg18 < snp126ortho.sql
load data local infile "snpOrthoLookup.tab" into table snp126ortho
drop table snp126orthoPrelim
rm snpOrthoLookup.tab
# create indices
mysql> alter table snp126ortho add index name (name);
mysql> alter table snp126ortho add index chrom (chrom, bin);
# manually validate a few examples on various chroms, various strands
# I used rheMac2:
# rs533274, hg18 chr1 +, rheMac2 chr18 -
# rs1690550, hg18 chr1 -, rheMac2 chr19 +
# rs3121568, hg18 chr1 -, rheMac2 chr19 -
# rs28709562, hg18 chr1 +, rheMac2 chr19 +
# rs34675838, also hg18 chr1 +, rheMac2 chr19 +
# create alternate format with both alleles in same row
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoJoin hg18 snp126simple snp126ortho
# 8517465 rows in hash for panTro2
# 7321537 rows in hash for rheMac2
# humanCount = 9092533
# chimpOnlyCount = 1418324
# macaqueOnlyCount = 222396
# missingCount = 352672
# bothCount = 7098141
# confirm that chimpOnly + macaqueOnly + missing + both = human
hgsql hg18 < snp126orthoPanTro2RheMac2.sql
hgsql -e "load data local infile 'snpOrthoJoin.tab' into table snp126orthoPanTro2RheMac2" hg18
mysql> alter table snp126orthoPanTro2RheMac2 add index name (name);
mysql> alter table snp126orthoPanTro2RheMac2 add index chrom (chrom, bin);
################################################################
### CREATE chimpHiQualDiff -- panTro2 (Daryl; May 1, 2006)
# Make file/table of high quality single base pair differences
# between hg18 and panTro2
set bedDir = /cluster/data/hg18/bed/chimpHiQualDiffs
mkdir -p $bedDir
cd $bedDir
sed 's/simpleNucDiff/chimpHiQualDiffs/' ~/kent/src/hg/lib/simpleNucDiff.sql >! chimpHiQualDiffs.sql
set axtDir = /cluster/data/hg18/bed/blastz.panTro2/axtRBestNet
mkdir -p chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.hg18.panTro2.net.axt.gz//' | grep -v random | grep -v "_" | xargs mkdir
set workDir = /scratch/chqd
mkdir -p $workDir
touch $workDir/chqd.log
    # time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed >>& $workDir/chqd.log
foreach f (chr*)
echo -n $f " "
mkdir -p $workDir/$f/
cp $axtDir/$f.*.axt.gz $workDir/$f/
gunzip $workDir/$f/$f.*.axt.gz
	time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed
rm -f $workDir/$f/$f.*axt
rmdir $workDir/$f/
end
mv $workDir/chqd.log .
cat chr*bed >! ../chimpHiQualDiffs.bed
## The load (sort) ran out of memory on hgwdev, so sort the
## file first on kolossus and then load it on hgwdev
ssh kolossus
time hgLoadBed -strict -sqlTable=chimpHiQualDiffs.sql -noLoad hg18 chimpHiQualDiffs chimpHiQualDiffs.bed
# 110.214u 10.836s 2:24.42 83.8% 0+0k 0+0io 1pf+0w
exit
## hgwdev
time hgLoadBed -hasBin -noSort -sqlTable=chimpHiQualDiffs.sql hg18 chimpHiQualDiffs bed.tab
# 328.890u 113.230s 42:26.00 17.3% 0+0k 0+0io 197676pf+0w
## TODO: need to filter out polymorphic sites (SNPs)
#################################################################
###### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-20 - Fan)
# Build Superfamily track and create sf tables needed for PB
ssh hgwdev
hgsql hg18 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/060619
hgsql hg18 -e 'load data local infile "ass_18-Jun-2006.tab" into table hg18.sfAssign;'
# If hg18.sfDes already exists, drop it.
mkdir /cluster/data/hg18/bed/sf
cd /cluster/data/hg18/bed/sf
hgsql superfam060619 -N -e "select * from des" >sfDes.tab
hgsql hg18 < ~/src/hg/lib/sfDes.sql
hgsql hg18 -e 'load data local infile "sfDes.tab" into table sfDes'
# Build ensemblXref3
# Get the ensembl gene/protein cross-reference data from Ensembl BioMart
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select Ensembl39 and Homo sapiens. Hit next.
# Page 2) Do not select anything. Hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID,
#         UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensemblXref3.gz
ssh hgwdev
cd /cluster/data/hg18/bed/ensembl
    gzip -d ensemblXref3.gz
hgsql hg18 < ~/src/hg/lib/ensemblXref3Temp.sql
hgsql hg18 -e \
'load data local infile "ensemblXref3" into table ensemblXref3Temp ignore 1 lines'
hgsql hg18 -N -e \
'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
> ensemblXref3.tab
hgsql hg18 -e 'drop table ensemblXref3'
hgsql hg18 <~/src/hg/lib/ensemblXref3.sql
hgsql hg18 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
# If hg18.superfamily already exists, drop it.
cd /cluster/data/hg18/bed/sf
hgSuperfam hg18 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg18.sfDescription exists, drop it.
hgsql hg18 < ~/src/hg/lib/sfDescription.sql
hgsql hg18 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg18.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg18 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper hg18 hs stdin
# created 27,511 rows in knownToSuper
############################################################################
# SEGMENTAL DUPLICATIONS (DONE 7/14/06 angie)
# File emailed from Xinwei She <xws at u.washington.edu>
mkdir /cluster/data/hg18/bed/genomicSuperDups
cd /cluster/data/hg18/bed/genomicSuperDups
# The sed command is necessary to fix "_" used as strand.
# The awk command was necessary for some recent other species
# genomicSuperDups that had some too-short regions. It does not seem
# to be necessary here, but doesn't hurt and may be useful in
# future builds.
sed -e 's/\t_\t/\t-\t/' hg18genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed hg18 genomicSuperDups stdin \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
############################################################################
# GENE BOUNDS (RNACLUSTER) (DONE 08-09-2006 Fan)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd /cluster/data/hg18/bed
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
foreach f (/cluster/data/hg18/nib/chr*.nib)
set c = $f:t:r
set out = chrom/$c.bed
    # Exclude accessions in the RAGE file
echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
end
hgLoadBed hg18 rnaCluster chrom/*.bed
############################################################################
############################################################################
# POLYA_DB TRACK (DONE 08-28-2006 Andy)
mkdir /cluster/data/hg18/bed/polyaDB
cd /cluster/data/hg18/bed/polyaDB
wget http://polya.umdnj.edu/download/polyAsite.gz
gunzip polyAsite.gz
find /cluster/data/hg16/ -name 'ordered.lft' | xargs cat > hg16.lft
sed 's/\(\s\).*\//\1/; s/chr/hg16.chr/' hg16.lft > tmp
mv tmp hg16.lft
cut -f2 hg16.lft > hg16.lft.names
grep -F -f hg16.lft.names polyAsite > hg16.polyAsite
awk '{printf("%s\t%d\t%d\t%s\n", $3, ($5-1), $5, $1);}' hg16.polyAsite > hg16.polyAsite.bed
liftUp lifted.bed hg16.lft warn hg16.polyAsite.bed
sed 's/hg16\.//' lifted.bed > final.bed
liftOver final.bed /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz hg18.bed unmapped
hgLoadBed hg18 polyaDB hg18.bed
# trackDb entry/html in human/hg18
############################################################################
# Translate SNP Array data from hg17 (Heather August 2006)
# Affy500
cd /cluster/data/hg18/bed/snp/affy
# get rsId/affy name pairs from hg17 where rsId != 'unknown'
# 257954 candidates from Nsp (4311 with unknown rsId)
# 234765 candidates from Sty (3540 with unknown rsId)
hgsql hg17 < getHg17-Nsp.sql > nsp.hg17
hgsql hg17 < getHg17-Sty.sql > sty.hg17
# get name, chrom, chromStart, chromEnd, strand, observed from snp126simple
# snp126simple contains only class = "simple", locType = "exact",
# chromEnd = chromStart + 1, biallelic, singly-aligning
hgsql hg18 < getHg18.sql > snp126simple.hg18
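    # (getHg18.sql is not shown; from the description above it is essentially:)
    # SELECT name, chrom, chromStart, chromEnd, strand, observed FROM snp126simple;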
# sort and join
    # 257213 in nsp.join
# 233941 in sty.join
# 741 in nsp.missing
# 824 in sty.missing
sort nsp.hg17 > nsp.hg17.sort
sort sty.hg17 > sty.hg17.sort
sort snp126simple.hg18 > snp126simple.hg18.sort
join nsp.hg17.sort snp126simple.hg18.sort > nsp.join
join sty.hg17.sort snp126simple.hg18.sort > sty.join
join -v 1 nsp.hg17.sort snp126simple.hg18.sort > nsp.missing
join -v 1 sty.hg17.sort snp126simple.hg18.sort > sty.missing
# fix column order
awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' nsp.join > nsp.bed
awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' sty.join > sty.bed
# load
hgLoadBed hg18 snpArrayAffy250Nsp nsp.bed -sqlTable=snpArrayAffy250Nsp.sql
hgLoadBed hg18 snpArrayAffy250Sty sty.bed -sqlTable=snpArrayAffy250Sty.sql
# cleanup
rm nsp.hg17 nsp.hg17.sort nsp.join
rm sty.hg17 sty.hg17.sort sty.join
rm snp126simple.hg18 bed.tab
mv snp126simple.hg18.sort ../illumina
gzip nsp.bed sty.bed
# Illumina300
cd /cluster/data/hg18/bed/snp/illumina
# 317,100 candidates from hg17
hgsql -e 'select name from snpArrayIllumina300' hg17 > hg17.data
# sort and join
# 314,093 in join.out
# 3,007 in join.missing
sort hg17.data > hg17.data.sort
join hg17.data.sort hg18.data.sort > join.out
join -v 1 hg17.data.sort hg18.data.sort > join.missing
# fix column order
awk '{print $2, $3, $4, $1}' join.out > illumina.bed
# load
hgsql hg18 < snpArrayIllumina300.sql
hgLoadBed hg18 snpArrayIllumina300 illumina.bed -sqlTable=snpArrayIllumina300.sql
# cleanup
rm hg17.data hg17.data.sort hg18.data.sort bed.tab join.out
gzip illumina.bed
##########################################################################
# New SNP Array data (Heather April 2007)
# Affymetrix introduced a new genotyping array in February
# I got the data from Venu in April
# It is based on dbSNP build 126
# Venu reviewed the load
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy
# There were 60 lines with no chrom, chromEnd or strand
grep -v NULL GenomeWideSNP_5_ucsc.tsv > genomewide.in
# little Perl script to add chromEnd & score for bed format
    genomewide.pl < genomewide.in > genomewide.bed
# preliminary load
hgLoadBed hg18 snpArrayAffyGenomeWidePrelim genomewide.bed -tab -sqlTable=snpArrayAffyGenomeWidePrelim.sql
# based on position, lookup rsId
# 2 runs
# first run: don't include dbSNP if class != single or locType != exact or
# chromEnd != chromStart + 1
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5279
# multiple count = 44
# second run: use all of snp126
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5210
# multiple count = 724
# Use the first run (better to avoid nearly 700 multiples at the cost of
# 69 more unknown)
hgLoadBed hg18 snpArrayAffy5 affyLookup.out -tab -sqlTable=snpArrayAffy5.sql
##########################################################################
# More new SNP Array data from Affymetrix (Heather May 2007)
# Source: Venu_Valmeekam at affymetrix.com
# This is the 6.0 array, announced mid-May
# It contains 2 components: single-base substitutions and copy-number probes
# Single-base substitutions are based on snp127
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy/6.0/single
unzip GenomeWideSNP_6_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
    hgLoadBed hg18 snpArrayAffy6Prelim all.bed -tab -sqlTable=snpArrayAffy6Prelim.sql
mysql> update snpArrayAffy6Prelim set chrom = "chrM" where chrom = "chrMT";
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffy6Prelim snp127
# missing count = 1149
# multiple count = 2396
# used the strict version of affyLookup (class="single", locType="exact", size=1)
hgLoadBed hg18 snpArrayAffy6 affyLookup.out -tab -sqlTable=snpArrayAffy6.sql
mysql> alter table snpArrayAffy6 add index name(name);
mysql> alter table snpArrayAffy6 add index chrom(chrom, bin);
cd /cluster/data/hg18/bed/snp/affy/6.0/sv
unzip GenomeWideSNP_6_CN_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_CN_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_CN_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_CN_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
hgLoadBed hg18 snpArrayAffy6SV all.bed -tab
mysql> delete from snpArrayAffy6SV where chrom = "chr0";
mysql> update snpArrayAffy6SV set chromStart = chromStart - 1;
##########################################################################
# Venu from Affy requested to remove about 25,000 items from
# snpArrayAffy6 track.
#
# Imported the list into the table, snpArrayAffy6Remove, in hg18.
#
# Issued a simple MySQL command to delete records in snpArrayAffy6
# that have ids in snpArrayAffy6Remove (sorry, the exact statement was not
# written down).
#
# This was done 10/8/07. Fan.
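#
# (A minimal sketch of what that delete probably looked like, assuming the
#  removal list was loaded with the affy ids in a column named "name":)
# hgsql hg18 -e 'delete snpArrayAffy6 from snpArrayAffy6, snpArrayAffy6Remove where snpArrayAffy6.name = snpArrayAffy6Remove.name'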
##########################################################################
# New Illumina Array data (Heather April 2007)
# HumanHap300v3, HumanHap550v3, HumanHap650v3
# Data from Luana Galver (lgalver at illumina.com)
# Based on dbSNP build 126
ssh hgwdev
cd /cluster/data/hg18/bed/snp/illumina
# split off chrM from zips
bed.pl < 300.in > 300.bed
bed.pl < 550.in > 550.bed
bed.pl < 650.in > 650.bed
chrM.pl < 550.in.M > 550.bed.M
chrM.pl < 650.in.M > 650.bed.M
hgLoadBed hg18 snpArrayIllumina300 300.bed -sqlTable=snpArrayIllumina300.sql -tab
hgLoadBed hg18 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql -tab
hgLoadBed hg18 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql -tab
hgLoadBed hg18 snpArrayIllumina550 550.bed.M -tab -oldTable
hgLoadBed hg18 snpArrayIllumina650 650.bed.M -tab -oldTable
# add indices
mysql> alter table snpArrayIllumina300 add index name (name);
mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
# fix strand convention
mysql> update snpArrayIllumina300 set strand = "+" where strand = "F";
mysql> update snpArrayIllumina300 set strand = "-" where strand = "R";
mysql> update snpArrayIllumina550 set strand = "+" where strand = "F";
mysql> update snpArrayIllumina550 set strand = "-" where strand = "R";
mysql> update snpArrayIllumina650 set strand = "+" where strand = "F";
mysql> update snpArrayIllumina650 set strand = "-" where strand = "R";
# Note no A/T or C/G!!
mysql> select distinct(observed) from snpArrayIllumina300;
# +----------+
# | observed |
# +----------+
# | [A/G] |
# | [T/C] |
# | [A/C] |
# | [T/G] |
# +----------+
# fix observed
mysql> update snpArrayIllumina300 set observed = "A/C" where observed = "[A/C]";
mysql> update snpArrayIllumina550 set observed = "A/C" where observed = "[A/C]";
mysql> update snpArrayIllumina650 set observed = "A/C" where observed = "[A/C]";
mysql> update snpArrayIllumina300 set observed = "A/G" where observed = "[A/G]";
mysql> update snpArrayIllumina550 set observed = "A/G" where observed = "[A/G]";
mysql> update snpArrayIllumina650 set observed = "A/G" where observed = "[A/G]";
mysql> update snpArrayIllumina300 set observed = "C/T" where observed = "[T/C]";
mysql> update snpArrayIllumina550 set observed = "C/T" where observed = "[T/C]";
mysql> update snpArrayIllumina650 set observed = "C/T" where observed = "[T/C]";
mysql> update snpArrayIllumina300 set observed = "G/T" where observed = "[T/G]";
mysql> update snpArrayIllumina550 set observed = "G/T" where observed = "[T/G]";
mysql> update snpArrayIllumina650 set observed = "G/T" where observed = "[T/G]";
# Note 2 rows in 300 and 15 rows in 550 and 650 where chrom = "chrXY"
# validation
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina300 snp126 snp126Exceptions illuminaLookup.hg18.300
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina550 snp126 snp126Exceptions illuminaLookup.hg18.550
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina650 snp126 snp126Exceptions illuminaLookup.hg18.650
# Not found: 2 in 300, 15 in 550 and 650
# These are in snp127
# Mixed: 55 in 300, 74 in 550, 81 in 650
# Found 2 strange things here:
# First of all, for snps that are illumina forward strand, dbSNP reverse strand:
# in all cases, the observed polymorphism is identical.
# Counts:
# 36k on the HumanHap300v3
# 52k on the HumanHap550v3
# 59k on the HumanHap650v3
# This surprises me, because the dbSNP observation is intended to be reverse-complemented.
# Examples from HumanHap300v3 include rs1000007, rs1000031, rs1000041, rs1000071, rs1000078.
# Secondly, for snps that are illumina reverse strand:
# in all cases the observed polymorphism is the reverse complement of the dbSNP polymorphism.
# this could only make sense for the dbSNP forward strand OR the dbSNP reverse strand, although I don't think it matters which one.
# examples:
# rs3934834: illumina A/G (-), dbSNP C/T (+)
# rs6687776: illumina A/G (-), dbSNP C/T (+)
# rs2298217: illumina A/G (-), dbSNP C/T (+)
# rs9442380: illumina A/G (-), dbSNP C/T (+)
# rs3737728: illumina A/G (-), dbSNP C/T (-)
# rs3813199: illumina A/G (-), dbSNP C/T (-)
# rs880051: illumina A/G (-), dbSNP C/T (-)
# rs12562034: illumina C/T (-), dbSNP A/G (+)
# rs9442372: illumina C/T (-), dbSNP A/G (+)
# rs11260588: illumina C/T (-), dbSNP A/G (+)
# rs12726255: illumina C/T (-), dbSNP A/G (+)
# rs2887286: illumina C/T (-), dbSNP A/G (-)
# rs2649588: illumina C/T (-), dbSNP A/G (-)
# rs2296716: illumina C/T (-), dbSNP A/G (-)
# rs2474460: illumina C/T (-), dbSNP A/G (-)
# redo this, just using name/chrom/pos from illumina
bed2.pl < 300.in > 300.bed.2
hgLoadBed hg18 snpArrayIllumina300Prelim 300.bed.2 -tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina300Prelim snp126 snp126Exceptions
mv illuminaLookup.out lookup.300
mv illuminaLookup.err lookup.300.err
hgLoadBed hg18 snpArrayIllumina300 lookup.300 -tab -sqlTable=snpArrayIllumina300.sql
hgsql -N -e 'drop table snpArrayIllumina300Prelim' hg18
bed2.pl < 550.in > 550.bed.2
hgLoadBed hg18 snpArrayIllumina550Prelim 550.bed.2 -tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina550Prelim snp126 snp126Exceptions
mv illuminaLookup.err lookup.550.err
mv illuminaLookup.out lookup.550
hgLoadBed hg18 snpArrayIllumina550 lookup.550 -tab -sqlTable=snpArrayIllumina550.sql
hgsql -N -e 'drop table snpArrayIllumina550Prelim' hg18
bed2.pl < 650.in > 650.bed.2
hgLoadBed hg18 snpArrayIllumina650Prelim 650.bed.2 -tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina650Prelim snp126 snp126Exceptions
mv illuminaLookup.out lookup.650
mv illuminaLookup.err lookup.650.err
hgLoadBed hg18 snpArrayIllumina650 lookup.650 -tab -sqlTable=snpArrayIllumina650.sql
hgsql -N -e 'drop table snpArrayIllumina650Prelim' hg18
# add indices
mysql> alter table snpArrayIllumina300 add index name (name);
mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
##########################################################################
# Added gvPos table for Locus Variants (Belinda Giardine Sept 2006)
# This uses the gv* tables in hgFixed for the related data. The track has
# been on hg17, just added to hg18. Most variants were mapped directly to
# hg18; only the LSDB BGMUT was lifted using liftOver.
# Update, reloaded table Dec 2006 Belinda Giardine
# new entries for previous sources and more IDbases
# Update, reloaded table January 2007 Belinda Giardine
# new source (first set of LOVD) and some fixes to IDbases and HbVar
# Update most LSDBs, add more genes for LMDp(LOVD) Jan 11, 2008
# loaded and tested first at PSU
#update old dbs and add dbPEX March 22-23, 2007
#need to truncate and reload all tables (new entries in old)
#prepare positions for loading
cd gvNov2006
cat gvPosARdb.hg17.txt gvPosSrd5a2.hg17.txt gvPosPah.hg17.txt > ../gvMar2007/gvPosNov2006.hg17.txt
cd ../gvMar2007
cat ../gvJan2007/gvPosLOVD.hg17.txt *.hg17.txt > gvPos.Hg17.txt
grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed
cd gvNov2006
cat gvPosARdb.hg18.txt gvPosSrd5a2.hg18.txt gvPosPah.hg18.txt > ../gvMar2007/gvPosNov2006.hg18.txt
cd ../gvMar2007
cat ../gvJan2007/gvPosLOVD.hg18.txt *.hg18.txt > gvPos.Hg18.txt
grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed
#run checks
~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/
~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt
~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt
#start reload
hgsql hgFixed < emptyTables.sql
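#(emptyTables.sql is not included here; presumably just along these lines:)
# TRUNCATE TABLE gv; TRUNCATE TABLE gvAttr; TRUNCATE TABLE gvLink;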
#copy and paste from reloadHgFixed.txt
#load new dbs
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvRettBASE.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrRettBASE.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkRettBASE.txt
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvdbPEX.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrdbPEX.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkdbPEX.txt
#load position tables
hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab
hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab
#run remaining checks
select distinct attrType from gvAttr;
select distinct attrType from gvLink;
#and compare against gvAttrTypeKey in hg/lib/gvUi.c
~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2
#for gv, gvPos, gvSrc, gvAttr, and gvLink
#script to check for non unique rows in database
~/gv/uniqueCheck.pl gvAttr > gvAttrNonunique.txt
~/gv/uniqueCheck.pl gvLink > gvLinkNonunique.txt
#add IPNMDB and reload LOVD with more genes April 12, 2007
cat *.hg17.txt > gvPos.Hg17.txt
grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed
cat *.hg18.txt > gvPos.Hg18.txt
grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed
#run checks
~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/
~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt
~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt
#remove old LOVD entries
hgsql hgFixed
delete from gvLink where id like 'FKRP%';
delete from gvAttr where id like 'FKRP%';
delete from gv where id like 'FKRP%';
insert into gvSrc values ('IPNMDB', 'LSDB', 'Mutation Database of Inherited Peripheral Neuropathies');
#load new dbs
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvLOVD.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrLOVD.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkLOVD.txt
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvIPNMDB.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDB.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkIPNMDB.txt
hgsql hg18
truncate table gvPos;
hgsql hg17
truncate table gvPos;
#load position tables
hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab
hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab
#run remaining checks
select distinct attrType from gvAttr;
select distinct attrType from gvLink;
#and compare against gvAttrTypeKey in hg/lib/gvUi.c
~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2
#for gv, gvPos, gvSrc, gvAttr, and gvLink
#script to check for non unique rows in database
~/gv/uniqueCheck.pl gvAttr
~/gv/uniqueCheck.pl gvLink
#found missing common names
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDBcommonName.txt
##########################################################################
# hars 1 to 202 Sol 09/10/2006
set bedDir = /gbdb/hg18/haseq/bed
mkdir -p $bedDir/hars
pushd /projects/hg/wet/Sol/hars1to49
cp -p hars_1to202.hg18.bed $bedDir/hars/hars_1to202.bed
hgLoadBed hg18 hars $bedDir/hars/hars_1to202.bed
rm -f $bedDir/hars/hars_1to202.bed
popd
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06)
# Download HPRD_XML_060106.tar.gz from www.hprd.org
gzip -d HPRD_XML_060106.tar.gz
tar -xvf HPRD_XML_060106.tar
# This will create 18838 xxxx.xml files under HPRD_XML_060106
# Create hprdToCdna table
echo 'grep -H entry_cdna HPRD_XML_060106/$1.xml' >do1Cdna
ls HPRD_XML_060106 >j
cat j |sed -e 's/.xml/\tdo1Cdna/g' >jj
cut -f 1 jj >j.2
cut -f 2 jj >j.1
paste j.1 j.2 >doAllCdna
chmod +x do*
./doAllCdna >j.cdna
cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\
sed -e 's/<entry_cdna>/\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\
grep -v None >hprdToCdna.tab
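# (hedged sketch, not what was run) the same mapping could be built in a single
# pass; only the <entry_cdna> tag and the column layout (hprd id, then cDNA
# accession with its version suffix dropped) come from above, the loop and the
# output name hprdToCdna.alt.tab are illustrative:
for f in HPRD_XML_060106/*.xml
do
    id=`basename $f .xml`
    acc=`grep entry_cdna $f | head -1 | sed -e 's/.*<entry_cdna>//; s/<.*//; s/\..*//'`
    if [ -n "$acc" -a "$acc" != "None" ]; then
        printf "%s\t%s\n" "$id" "$acc"
    fi
done > hprdToCdna.alt.tab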
hgsql hg18 -e 'drop table hprdToCdna'
hgsql hg18 <~/src/hg/lib/hprdToCdna.sql
hgsql hg18 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna'
# Create hprdToUniProt table
echo 'fgrep -H Swiss HPRD_XML_060106/$1.xml' >do1
ls HPRD_XML_060106 >j
cat j |sed -e 's/.xml/\tdo1/g' >jj
cut -f 1 jj >j.2
cut -f 2 jj >j.1
paste j.1 j.2 >doall
chmod +x do*
./doall >j.out
cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \
sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hprdToUniProt.tab
hgsql hg18 -e 'drop table hprdToUniProt'
hgsql hg18 <~/src/hg/lib/hprdToUniProt.sql
hgsql hg18 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt'
# build knownToHprd table
hgsql hg18 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1
hgsql hg18 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
cat j.kg1 j.kg2 |sort -u >knownToHprd.tab
wc knownToHprd.tab
hgsql hg18 -e 'drop table knownToHprd'
hgsql hg18 <~/src/hg/lib/knownToHprd.sql
hgsql hg18 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg18 -e 'select count(*) from knownToHprd'
# 19,646 records created.
# remove temporary files.
rm j*
# Do the same for hg17. See hg17.txt for details.
##########################################################################
# ORegAnno: oreganno, oregannoAttr, oregannoLink
# Belinda Giardine August 3, 2007
# updated Oct 26, 2007
# updated July 7, 2008
# This has regulatory annotations from ORegAnno.
# Get updated file from ORegAnno wiki page
# http://www.bcgsc.ca/wiki/display/oreganno/DataFiles
# Parse flat file into 3 tables, truncate tables, load.
# Has other species, but only Human, Fly, and sacCer1 have enough entries for now.
cd /cluster/store6/giardine/oreganno/20071026/
~giardine/oreganno/parseOra hg18 < oreganno_UCSC_25Oct07.txt
hgsql hg18
truncate table oreganno;
truncate table oregannoAttr;
truncate table oregannoLink;
quit;
grep "^chr" oreganno.hg18.txt | sort -k1,1 -k2,2n > oreganno.bed
hgLoadBed hg18 oreganno oreganno.bed -noSort -oldTable -tab
hgLoadSqlTab -oldTable hg18 oregannoAttr \
    ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoAttr.hg18.txt
hgLoadSqlTab -oldTable hg18 oregannoLink \
    ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoLink.hg18.txt
##########################################################################
# LIFT ACEMBLY FROM HG17 TO HG18 (DONE, Fan, 9/28/06)
# OBSOLETED BY LOAD OF NEW DATA, SEE BELOW 8/28/07 angie
# get acembly data from hg17
hgsql hg17 -N -e 'select * from acembly' >hg17Acembly.gp
# lift to hg18
zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | \
liftOver hg17Acembly.gp stdin acembly.gp unMapped.gp -genePred
# load the genePred table
ldHgGene hg18 acembly -predTab acembly.gp
# get acemblyPep and acemblyClass table from hg17 and load them into hg18.
hgsql hg17 -N -e 'select * from acemblyPep' >acemblyPep.tab
hgsql hg18 -e 'drop table acemblyPep'
hgsql hg18 < ~/src/hg/lib/acemblyPep.sql
hgsql hg18 -e 'load data local infile "acemblyPep.tab" into table acemblyPep'
hgsql hg17 -N -e 'select * from acemblyClass' >acemblyClass.tab
hgsql hg18 -e 'drop table acemblyClass'
hgsql hg18 < ~/src/hg/lib/acemblyClass.sql
hgsql hg18 -e 'load data local infile "acemblyClass.tab" into table acemblyClass'
##########################################################################
# LIFT RNAGENE FROM HG17 TO HG18 (DONE, Robert, 10/3/06)
mkdir /cluster/data/hg18/bed/rnaGene
cd /cluster/data/hg18/bed/rnaGene
hgsql hg18 < rnaGene.sql
liftOver ~/hg17/rnaGene/rnaGenes.tab /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz rnaGenes.bed unmapped -bedPlus=10 -tab
hgLoadBed hg18 rnaGene rnaGenes.bed -oldTable -tab -noBin
##########################################################################
# SWAP/CHAIN/NET GASACU1 (DONE 10/17/06 angie)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.gasAcu1.swap
cd /cluster/data/hg18/bed/blastz.gasAcu1.swap
doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.hg18/DEF \
-chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log
ln -s blastz.gasAcu1.swap /cluster/data/hg18/bed/blastz.gasAcu1
nice featureBits hg18 chainGasAcu1Link
#55424609 bases of 2881515245 (1.923%) in intersection
##########################################################################
# YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED #
# USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-12 - 2006-10-13, hartera)
# Data is from the paper: Bertone et al. Science 24 December 2004:
# Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale.
# Contact at Yale: Joel S. Rozowsky, joel.rozowsky at yale.edu
# The data consist of Transcriptionally Active Regions (TARs or TransFrags)
# found using Affymetrix genome tiling arrays.
ssh kkstore02
mkdir /cluster/data/hg18/bed/yaleBertoneTars/
cd /cluster/data/hg18/bed/yaleBertoneTars/
# download Bertone et al. data from this URL:
#http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt
# and put it in this directory.
# The sequences used to design the microarrays were from
# UCSC hg13/NCBI Build 31 so the sequences
# should be aligned again using Blat since this is probably better
# than using liftOver across so many assemblies.
# Get sequences from TARs file and put in FASTA format:
# Remove characters from Windows:
dos2unix TAR_data_NCBI31.txt
# The TARs are in order of IDs in the file so the first TAR has ID 1, the
# second is 2 up to the last which is 17517. These IDs are used to link
# to the DART database of TARs at Yale so use these IDs in the FASTA
# header lines. Need to add "TAR" as prefix to ID so that it is unique
# in the seq table.
awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \
TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa
ssh pk
mkdir -p /san/sanvol1/scratch/hg18/TARs/
cp /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/san/sanvol1/scratch/hg18/TARs/
# Set up to Blat the TAR sequences against hg18
cd /cluster/data/hg18/bed/yaleBertoneTars
ls -1 /san/sanvol1/scratch/hg18/TARs/yaleBertoneTARSeqs.fa > tars.lst
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
# output dir
mkdir psl
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 genome.lst tars.lst template.sub para.spec
para create para.spec
para try, para check, para push ...
para time
# Completed: 49 of 49 jobs
#CPU time in finished jobs: 396s 6.61m 0.11h 0.00d 0.000y
#IO & Wait Time: 198s 3.29m 0.05h 0.00d 0.000 y
#Average job time: 12s 0.20m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 39s 0.65m 0.01h 0.00d
#Submission to last job: 253s 4.22m 0.07h 0.00d
# sort and then filter
pslSort dirs raw.psl tmp psl
# use these parameters as for Genbank alignments of native mRNAs
# for finished assemblies.
pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \
-minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \
raw.psl yaleBertoneTars.psl
# seqs aligns
# total: 17512 38243
# drop minNonRepSize: 159 403
# drop minIdent: 3822 14798
# drop minCover: 563 895
# weird over: 242 832
# kept weird: 204 210
# drop localBest: 2410 4018
# kept: 17469 18129
# 99.75% were kept.
# check how many aligned
grep '>' yaleBertoneTARSeqs.fa | wc -l
# 17517
# 99.7% of the original set of sequences are in this filtered PSL file.
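# (hedged convenience) the kept fraction can be recomputed straight from the
# filtered PSL by counting distinct query names (field 10 of a 21-field psl line):
awk 'NF==21 {print $10}' yaleBertoneTars.psl | sort -u | wc -l
# then divide by the 17517 input sequences counted above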
pslCheck yaleBertoneTars.psl
# psl is ok
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/yaleBertoneTars
hgLoadPsl hg18 yaleBertoneTars.psl
# Add sequences to /gbdb/hg18 and to seq and extFile tables.
mkdir -p /gbdb/hg18/yaleTARs/
ln -s /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/gbdb/hg18/yaleTARs/
hgLoadSeq hg18 /gbdb/hg18/yaleTARs/yaleBertoneTARSeqs.fa
# Add trackDb.ra entry to trackDb/human/trackDb.ra and create
# a description page.
##############################################################################
# Update upstream maf files, fixing a problem of RefSeq ID being truncated. (2006-10-20 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2
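# the sed above splits the "_up_${i}_" suffix out of the item name so that the
# full RefSeq accession (previously truncated) survives as the name in up.bed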
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed
rm up.bad up.bad2
nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
-orgs=/cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/species.lst
rm up.bed
end
date
'EOF'
# << happy emacs
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf
md5sum up*.gz >> md5sum.txt
#########################################################################
# BLASTZ/CHAIN/NET FELCAT3 (Done Nov 09 2006 heather)
# working in /cluster/data/felCat3 because /cluster/data/hg18 is 96% full
# make this a link in /cluster/data/hg18
mkdir /cluster/data/felCat3/bed/blastz.hg18.2006-11-09
ln -s /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 /cluster/data/hg18/bed/blastz.felCat3
cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-09
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Human Hg18
# Can we use 2bit here?
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/felCat3/bed/blastz.hg18.2006-11-09
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/felCat3/blastz.hg18 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 hg18 chainFelCat3Link
# 86932463 bases of 224999719 (38.637%) in intersection
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 felCat3 >&! rbest.log &
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU3 (Done Feb 2007 heather)
mkdir /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23
ln -s /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 /cluster/data/hg18/bed/blastz.bosTau3
cd /cluster/data/hg18/bed/blastz.bosTau3
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.bosTau3.2007-02-23
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/bosTau3/blastz.hg18 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 hg18 chainBosTau3Link
# 114562908 bases of 224999719 (50.917%) in intersection
##############################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE weirauch 11/19/06)
# Questions? weirauch at soe.ucsc.edu or braney at soe.ucsc.edu
ssh hgwdev
mkdir /cluster/data/hg18/bed/tfbsCons
cd /cluster/data/hg18/bed/tfbsCons
# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz (Perl scripts) from Matt Weirauch, weirauch at soe.ucsc.edu
set tarfile=/cluster/data/hg18/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &
ssh kk
mkdir /cluster/bluearc/braney/tfloc
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push
# When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.
ssh kksilo   # (or hgwdev, or whatever)
nice ./getBedFile.pl &
hgLoadBed -noSort hg18 tfbsConsSites -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed -tab
hgLoadBed -noSort hg18 tfbsConsFactors -sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql tfbsConsFactors.bed -tab
# Feel free to delete or gzip anything in ./tmp (particularly the huge .maf and .bed files) after the final two bed files are successfully loaded
# fixed up the tfbsConsSites.bed file to remove extra indexes, then:
hgsql -e "drop index chrom_2 on tfbsConsSites;" hg18
hgsql -e "drop index chrom_3 on tfbsConsSites;" hg18
# the tfbsConsFactors table had extra names, they were removed:
for N in `cat extra.tfbsConsFactors.name`
do
echo "delete from tfbsConsFactors where name=\"${N}\";" hg18
hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18
done
# the extra names were:
# B$CRP_C F$DDE1_B F$STRE_01 P$GBP_Q6 V$ACAAT_B V$APOLYA_B V$ATATA_B
# V$BARBIE_01 V$BEL1_B V$CAAT_01 V$CAAT_C V$CAP_01 V$DTYPEPA_B V$E2F_Q2
# V$ETF_Q6 V$ETS_Q6 V$GC_01 V$GEN_INI2_B V$GEN_INI3_B V$GEN_INI_B V$HFH8_01
# V$HOGNESS_B V$LBP1_Q6 V$LDSPOLYA_B V$LEF1_Q2 V$LPOLYA_B V$MEF3_B V$MINI19_B
# V$MINI20_B V$MTATA_B V$MUSCLE_INI_B V$PADS_C V$PEA3_Q6 V$POLY_C V$SRY_01
# V$STAT4_01 V$STAT5A_03 V$STAT5A_04 V$STAT6_02 V$TAACC_B V$TANTIGEN_B
# V$TEF1_Q6 V$USF2_Q6
# And re-load once again since the above data was based on transfac data that
# is too new (2006-11-03 - Hiram)
cd /cluster/data/hg18/bed/tfbsCons
hgLoadBed -tab -strict hg18 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed
# And this leads once again to a bunch of extra names in Factors
hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u > names.new
hgsql -N -e "select name from tfbsConsFactors;" hg18 \
| sort -u > names.factors
comm -13 names.new names.factors > names.extra.factors
for N in `cat names.extra.factors`
do
echo "delete from tfbsConsFactors where name=\"${N}\";" hg18
hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18
done
# Reload tfbsCons to correct errors (2007-07-17 - Hiram)
cd /cluster/data/hg18/bed/tfbsCons
hgLoadBed -tab hg18 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed
hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u \
> names.new.2007-07-17
# showing zero difference still, nothing more to be done
comm -13 names.new.2007-07-17 names.factors
##############################################################################
# REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
# (WORKING - 2006-10-23 - Hiram)
# five different cluster runs are described here for different classes
# of clones
# runPlacedNotSplit - all placed clones split or not split with blat
# runFish - 392 fish clones against all 378 contigs, with blat
# runUnPlaced - 14,569 clones on known contigs - with psLayout
# runUnPlacedChr - 297 clones on known chroms - with psLayout
# runLastOnes - 1,877 clones against 378 contigs - with blat
# The original run of this forgot to split up the BAC clones that were just
# a fasta file full of unordered pieces. They need to be split up
# to work properly.
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
# Going to copy over the BAC clones from the previous runs and split
# them up if they have too many N's (>100) (indicating pieces)
# This may actually split up a couple of BACs that are not actually
# pieces, but in the cases I could find, and they were rare, the big
# BACs appear to break into only two pieces.
# The first set to do are the clones that were used in the assembly
# Since they were placed, we know where they all belong. Only 50 of
# them end up being split, and then usually only in 2 pieces.
# We could tediously go through each of these 50 and determine if they
# are actually unordered pieces. Although this raises the question,
# how could unordered pieces be used in the assembly ? Doesn't make any
# sense.
cat << '_EOF_' > placedClones.sh
#!/bin/sh
D0=placedNotSplit
D1=placedSplit
export D0 D1
find ../coverage/placedClones -type f | grep -v faCount.all.txt | while read F
do
BN=`basename "${F}"`
DN=`dirname "${F}"`
CHROM=`basename "${DN}"`
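# pull the N count out of faSize's summary line, which looks like:
# "<size> bases (<nCount> N's <real> real ...)"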
Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"`
if [ "${Ncount}" -gt 99 ]; then
out="${D1}/${CHROM}/${BN}"
mkdir -p ${D1}/${CHROM}
echo "gapSplit -minGap=100 ${F} ${out}"
gapSplit -minGap=100 ${F} stdout | gzip > ${out}
faSize "${F}"
faSize "${out}"
else
out="${D0}/${CHROM}/${BN}"
mkdir -p ${D0}/${CHROM}
echo "cp -p ${F} ${out}"
cp -p ${F} ${out}
fi
done
'_EOF_'
# << happy emacs
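# (assumed run step, not recorded above) the split script would have been run
# along these lines before the cluster run below:
# chmod +x placedClones.sh
# ./placedClones.sh > placedClones.log 2>&1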
# Going to use blat this time instead of psLayout
# It is faster and appears to do just about the same exact job
mkdir runPlacedNotSplit
cd runPlacedNotSplit
# Re-use the previous jobList
sed -e "s/runPsLayout.sh/runBlat.csh/" \
../../coverage/runPlaced/masterJobList > jobList
cat << '_EOF_' > runBlat.csh
#!/bin/csh -fe
set chrom = $1
set clone = $2
set contig = $3
set result = $4
set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$chrom/$contig.fa.gz
set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedNotSplit/$chrom/$clone.fa.gz
if ( ! -f $query ) then
set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedSplit/$chrom/$clone.fa.gz
endif
set scrTmp = "/scratch/tmp/$contig/$clone"
set ooc = /san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc
mkdir -p "$scrTmp"
zcat $target > $scrTmp/$contig.fa
zcat $query > $scrTmp/$clone.fa
cp -p $ooc $scrTmp/10.ooc
pushd $scrTmp
pwd
ls -l
blat -minIdentity=98 -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $clone.fa $clone.psl
popd
mkdir -p psl/$chrom/$contig
cp -p $scrTmp/$clone.psl $result
rm $scrTmp/*
rmdir $scrTmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig
'_EOF_'
# << happy emacs
para create jobList
para try; para check; etc ...
para time
# Completed: 27093 of 27093 jobs
# CPU time in finished jobs: 435042s 7250.69m 120.84h 5.04d 0.014 y
# IO & Wait Time: 74031s 1233.86m 20.56h 0.86d 0.002 y
# Average job time: 19s 0.31m 0.01h 0.00d
# Longest finished job: 463s 7.72m 0.13h 0.01d
# Submission to last job: 3079s 51.32m 0.86h 0.04d
# combine the results into one large raw.psl file
time pslSort dirs raw.psl tmp psl/*/*
ls -og raw.psl
# -rw-rw-r-- 1 52067774 Oct 31 12:06 raw.psl
# This raw.psl file will be included in the overall results, but as a
# check, it is possible to turn just these results into a .bed file for
# uploading as a custom track to take a look at them.
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
clusterClone -allowDuplicates -agp -minCover=80 \
-maxGap=60000 repsSingle.psl > single.agp 2> single.out
sort -k1,1 -k2,2n single.agp | ../../coverage/fixPhase.pl \
/cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \
2> singleToOverlaps.out
awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \
contig_overlaps.agp > cOverlaps.bed
liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \
warn cOverlaps.bed
# Load up that chrOverlaps.bed as a custom track to see these results
##################################################################
# The next big group are the FISH clones
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
# Split or not split depending on gap count >= 100
cat << '_EOF_' > splitFishClones.sh
#!/bin/sh
D0=fishSplit
export D0
find ../coverage/fishClones/sequence -type f | while read F
do
BN=`basename "${F}"`
Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"`
if [ "${Ncount}" -gt 99 ]; then
out="${D0}/fishPieces/${BN}"
echo "gapSplit -minGap=100 ${F} ${out}"
gapSplit -minGap=100 ${F} stdout | gzip > ${out}
faSize "${F}"
faSize "${out}"
else
out="${D0}/noPieces/${BN}"
echo "cp -p ${F} ${out}"
cp -p "${F}" "${out}"
fi
done
'_EOF_'
# << happy emacs
mkdir fishSplit
chmod +x splitFishClones.sh
time ./splitFishClones.sh
# combine them all into large fasta files to lower the file count
cd fishSplit
for F in fishPieces/* noPieces/*
do
zcat "${F}"
done | gzip > all.fa.gz
faSplit about all.fa.gz 500000 split/f_
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish
ls ../fishSplit/split | sed -e "s/.fa.gz//" > fish.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
> contig.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
>> contig.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?_*/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
>> contig.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??_*/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
>> contig.list
cat << '_EOF_' > template
#LOOP
./runBlat.csh $(path1) $(path2) {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runBlat.csh
#!/bin/csh -fe
set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$1.fa.gz
set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/fishSplit/split/$2.fa.gz
set contig = $target:t:r:r
set fishPiece = $query:t:r:r
set result = psl/$contig/$fishPiece.psl
set scrTmp = "/scratch/tmp/$contig/$fishPiece"
set ooc = /san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc
mkdir -p "$scrTmp"
zcat $target > $scrTmp/$contig.fa
zcat $query > $scrTmp/$fishPiece.fa
cp -p $ooc $scrTmp/10.ooc
pushd $scrTmp
pwd
ls -l
blat -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $fishPiece.fa $fishPiece.psl
popd
mkdir -p psl/$contig
cp -p $scrTmp/$fishPiece.psl $result
rm $scrTmp/*
rmdir $scrTmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig
'_EOF_'
# << happy emacs
chmod +x runBlat.csh
gensub2 contig.list fish.list template jobList
para create jobList
para try; para check; etc ...
para time
# Completed: 148176 of 148176 jobs
# CPU time in finished jobs: 2884533s 48075.56m 801.26h 33.39d 0.091 y
# IO & Wait Time: 385142s 6419.03m 106.98h 4.46d 0.012 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest finished job: 270s 4.50m 0.07h 0.00d
# Submission to last job: 9510s 158.50m 2.64h 0.11d
# put all the results together into a single file
pslSort dirs raw.psl tmp psl/*
# this is a big result
ls -og raw.psl
# -rw-rw-r-- 1 6972351482 Oct 25 16:25 raw.psl
# can do the same thing as above to look at these results individually
# not listed here
##################################################################
# The next big group are the unplaced clones. In the original run, the
# contig location of these items were inferred from Hg17 results, and
# thus many of them can be aligned against their respective contig. For
# some cases, the contig isn't known, but the chrom is, thus they can be
# aligned to all the contigs for a chrom. And finally, those completely
# unknown have to be aligned to all contigs.
# There are two sections here, those for which contig details are
# unknown, and those for which contigs are known. First, those for
# which details are unknown:
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
cat << '_EOF_' > splitUnplacedClones.sh
#!/bin/sh
find ../coverage/unPlacedClones -type f | while read F
do
BN=`basename "${F}"`
DN=`dirname "${F}"`
CONTIG=`basename "${DN}"`
DN=`dirname "${DN}"`
CHROM=`basename "${DN}"`
out="unPlacedSplit/${CHROM}/${CONTIG}/${BN}"
# echo "${CHROM}/${CONTIG}/${BN}"
mkdir -p unPlacedSplit/${CHROM}/${CONTIG}
Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"`
if [ "${Ncount}" -gt 99 ]; then
echo "gapSplit -minGap=100 ${F} ${out}"
gapSplit -minGap=100 ${F} stdout | gzip > ${out}
faSize "${F}"
faSize "${out}"
fi
done
'_EOF_'
# << happy emacs
chmod +x splitUnplacedClones.sh
mkdir unPlacedSplit
time ./splitUnplacedClones.sh > unPlaced.out 2>&1
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr
ls ../unPlacedSplit/*/XX*/*.fa.gz > bac.list
cat << '_EOF_' > mkJobList.sh
#!/bin/sh
cat bac.list | while read F
do
CHR=`echo "${F}" | sed -e "s#.*unPlacedSplit/##; s#/.*##"`
CLONE=`basename ${F} | sed -e "s/.fa.gz//"`
case $CHR in
U|Un)
for C in /san/sanvol1/scratch/hg18/coverage/maskedContigs/? \
/san/sanvol1/scratch/hg18/coverage/maskedContigs/?? \
/san/sanvol1/scratch/hg18/coverage/maskedContigs/?_* \
/san/sanvol1/scratch/hg18/coverage/maskedContigs/??_*
do
CH=`basename ${C}`
for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CH}/*
do
CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"`
echo "./runPsLayout.sh $CH $CLONE $CONTIG {check out line+ psl/$
CH/$CONTIG/$CLONE.psl}"
done
done
;;
*)
for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CHR}/*
do
CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"`
echo "./runPsLayout.sh $CHR $CLONE $CONTIG {check out line+ psl/$CHR
/$CONTIG/$CLONE.psl}"
done
;;
esac
done
'_EOF_'
# << happy emacs
chmod +x mkJobList.sh
./mkJobList.sh > jobList
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/<chrom>/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /san/sanvol1/scratch/hg18/coverage/maskedContigs/<chrom>/<contig>.fa.gz
#
HERE=`pwd`
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/XX_000000/$CLONE.fa.gz
OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc
RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl"
export CHROM CLONE CONTIG TARGET CLONESRC RESULT
mkdir -p psl/${CHROM}/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/U/XX_000000/$CLONE.fa.gz
if [ ! -s ${CLONESRC} ]; then
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/Un/XX_000000/$CLONE.fa.gz
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
fi
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}"
mkdir -p "${WRKDIR}"
cd ${WRKDIR}
zcat ${CLONESRC} > ${CLONE}.fa
zcat ${TARGET} > ${CONTIG}.fa
cp -p ${OOC} ./10.ooc
/cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT}
RET=$?
cd ${HERE}
rm -fr ${WRKDIR}
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}"
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}"
exit ${RET}
'_EOF_'
# << happy emacs
chmod +x ./runPsLayout.sh
mkdir psl
para create jobList
para try; para check; ... etc ...
para time
# Completed: 40509 of 40509 jobs
# CPU time in finished jobs: 5354801s 89246.69m 1487.44h 61.98d 0.170 y
# IO & Wait Time: 115279s 1921.31m 32.02h 1.33d 0.004 y
# Average job time: 135s 2.25m 0.04h 0.00d
# Longest finished job: 164276s 2737.93m 45.63h 1.90d
# Submission to last job: 187712s 3128.53m 52.14h 2.17d
# combine into one result file
pslSort dirs raw.psl tmp psl/*/*
##################################################################
# Now, for those unplaced clones for which contig details are known
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced
cat << '_EOF_' > mkJobList.sh
#!/bin/sh
find ../unPlacedSplit -type f | grep -v XX_ | while read F
do
BN=`basename ${F} | sed -e "s/.fa.gz//"`
DN=`dirname ${F}`
CONTIG=`basename ${DN}`
DN=`dirname ${DN}`
CHROM=`basename ${DN}`
echo "./runPsLayout.sh ${CHROM} ${BN} ${CONTIG} {check out line+ psl/${CHROM
}/${CONTIG}/${BN}.psl}"
done
'_EOF_'
# << happy emacs
chmod +x mkJobList.sh
./mkJobList.sh > jobList
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/<chrom>/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /san/sanvol1/scratch/hg18/coverage/maskedContigs/<chrom>/<contig>.fa.gz
#
HERE=`pwd`
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/$CONTIG/$CLONE.fa.gz
OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc
RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl"
mkdir -p psl/${CHROM}/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}"
mkdir -p "${WRKDIR}"
cd ${WRKDIR}
zcat ${CLONESRC} > ${CLONE}.fa
zcat ${TARGET} > ${CONTIG}.fa
cp -p ${OOC} ./10.ooc
/cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT}
RET=$?
cd ${HERE}
rm -fr ${WRKDIR}
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}"
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}"
exit ${RET}
'_EOF_'
# << happy emacs
chmod +x runPsLayout.sh
para create jobList
para try; para check; ... etc ...
para time
# Completed: 14569 of 14569 jobs
# CPU time in finished jobs: 4863551s 81059.19m 1350.99h 56.29d 0.154 y
# IO & Wait Time: 64196s 1069.93m 17.83h 0.74d 0.002 y
# Average job time: 338s 5.64m 0.09h 0.00d
# Longest finished job: 36681s 611.35m 10.19h 0.42d
# Submission to last job: 68213s 1136.88m 18.95h 0.79d
# combine into a single result
pslSort dirs raw.psl tmp psl/*/*
# combine into a single result
time pslSort dirs raw.psl tmp psl/*
# real 550m57.744s
# user 324m56.251s
# sys 10m15.358s
ls -og raw.psl
# -rw-rw-r-- 1 39273644954 Nov 2 20:23 raw.psl
# Wow ...
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
# real 15m14.462s
# user 13m6.580s
# sys 1m50.304s
ls -og repsSingle.psl
# -rw-rw-r-- 1 73403317 Nov 3 09:44 repsSingle.psl
###########################################################
# And now, combining all results together
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl
ln -s ../runLastOnes/repsSingle.psl lastOnes.psl
ln -s ../runFish/raw.psl fish.psl
ln -s ../runUnPlaced/raw.psl unPlaced.psl
ln -s ../runUnPlacedChr/raw.psl unPlacedChr.psl
ln -s ../runPlacedNotSplit/raw.psl placed.psl
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
time pslSort dirs raw.psl tmp finalPsl
# real 18m53.770s
# user 12m19.002s
# sys 1m17.504s
ls -og raw.psl
# -rw-rw-r-- 1 7742802124 Nov 3 10:10 raw.psl
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
clusterClone -allowDuplicates -agp -minCover=80 \
-maxGap=60000 repsSingle.psl > single.agp 2> single.out
sort -k1,1 -k2,2n single.agp | ../coverage/fixPhase.pl \
/cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \
2> singleToOverlaps.out
awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \
contig_overlaps.agp > cOverlaps.bed
liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \
warn cOverlaps.bed
# Load up that chrOverlaps.bed as a custom track to see these results
# And back to the original business of eliminating obsolete clones
awk '{print $6}' contig_overlaps.agp | sort -u > clone.coverage.list
time $HOME/kent/src/hg/makeDb/hgClonePos/ckMultipleVersions.pl \
clone.coverage.list > /dev/null 2> obsolete.clones
time $HOME/kent/src/hg/makeDb/hgClonePos/removeObsoleteClones.sh \
contig_overlaps.agp obsolete.clones > clean_overlaps.agp
# looks like it removes 295 lines
wc -l contig_overlaps.agp clean_overlaps.agp
# 613577 contig_overlaps.agp
# 613507 clean_overlaps.agp
mv contig_overlaps.agp contig_overlapsWithObsoletes.agp
mv clean_overlaps.agp contig_overlaps.agp
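# (hedged sketch of the idea, not the actual ckMultipleVersions.pl logic)
# obsolete clones are accessions present with more than one version; those
# accessions can be listed with something like:
sed -e 's/\.[0-9]*$//' clone.coverage.list | sort | uniq -d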
cd /cluster/data/hg18
# save all existing .gl files before we overwrite them all
tar cvzf ./save.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \
./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl
time agpToGl contig_overlaps.agp . -md=seq_contig.md
# real 1m4.253s
time ./jkStuff/liftGl.csh contig.gl
# saw some errors such as: NT_113974/contig.gl doesn't exist, skipping
# I'm guessing they were contigs with no alignment results
# capture these new .gl files for future reference
tar cvzf ./new.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \
./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl
# now reload all the _gold, _gap and _gl tables
# Tested this load on a dummy database and found that the contents of
# the gold and gap tables do not change
hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36
# Then hgClonePos uses those tables to create the Coverage track
# table: clonePos
hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=chrom.lst \
hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \
> updated.clone.pos.errors 2>&1
# Now let's check for clones that are excessively wrong
cd /tmp
hgsql -N -e \
"select chrom,chromStart,chromEnd,name,chromEnd-chromStart,seqSize from clonePos;" \
hg18 > clonePos.hg18.lengths
awk '{if ($6 > 0) { printf "%.2f\t%s\n", 100.0*$5/$6,$0}}' \
clonePos.hg18.lengths | sort -n > clonePos.hg18.deviations
# Looking at that list of deviations, there are still a number of them
# that are extreme deviants, but there are far fewer than there were
# before. Previously:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 109.172500
# average 350.043843
# min 80.000000
# max 23574.310000
# count 44978
# total 15744271.980000
# standard deviation 851.762186
# Over 3,500 of them were more than 10 times too large:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 3881 27167 223039
# This new lot:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 100.360000
# average 140.353820
# min 0.250000
# max 40838.840000
# count 43734
# total 6138233.960000
# standard deviation 381.871589
# Only 277 are more than 10 times too large:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 277 1939 15747
# QA NOTE: ran mytouch on the *gold and *gap tables because the values were
# unchanged, but they got a new date/time in the above process (ASZ
# 11-14-2006):
# sudo mytouch hg18 'chr*_gold' 200604060800.00
# sudo mytouch hg18 'chr*_gap' 200604060800.00
##############################################################################
# LongSAGE (2006-10-20 markd)
# Load LongSAGE composite track with genome mappings of tag clusters
# obtained from "Martin Hirst" <mhirst at bcgsc.ca>
ftp ftp2.bcgsc.ca
user: ucsc
<password from martin >
download SHE*_u.map
chmod a-w *.map
~/compbio/kent/src/hg/makeDb/outside/bcgscSage/bcgscSageLoad hg18 *_u.map
####################################################################
# MAKE UNIGENE/SAGE TRACK (DONE - 2006-11-20 Fan)
# Create the uniGene alignments
# /cluster/data/hg18/uniGene/hg18.uniGene.lifted.pslReps.psl
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
mkdir uniGene
cd uniGene
set Version = 196
zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
ssh pk
set Version = 196
mkdir -p /san/sanvol1/scratch/hg18/uniGene/
cd /san/sanvol1/scratch/hg18/uniGene/
cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa .
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
ls -1S \
/cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 46855s 780.92m 13.02h 0.54d 0.001 y
# IO & Wait Time: 240s 3.99m 0.07h 0.00d 0.000 y
# Average job time: 961s 16.02m 0.27h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3629s 60.48m 1.01h 0.04d
# Submission to last job: 4337s 72.28m 1.20h 0.05d
pslSort dirs raw.psl tmp psl >& pslSort.log
cat raw.psl|\
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg18.uniGene.pslReps.psl /dev/null
# Processed 553470 alignments
gzip raw.psl
gzip Hs.seq.uniq.simpleHeader.fa
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/uniGene
cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl .
hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl
####################################################################
# EXONIPHY (2006-12-05 acs)
# predictions provided by Brona Brejova in Siepel Lab (bb248 at cornell.edu).
# stored in /cluster/data/hg18/bed/exoniphy/exoniphy.gff
ldHgGene -genePredExt -gtf hg18 exoniphy exoniphy.gff
####################################################################
# HapMap CNVRs (copy number variable regions) from Matt Hurles (Heather Dec. 2006)
# Change bed3 to bed6 to match hg17
cd /cluster/data/hg18/bed/sv
redon.pl < cnpRedon.hg18 > redon.bed
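# (hedged sketch of what redon.pl does; the name/score/strand values here are
# only illustrative) a bed3 -> bed6 conversion is just adding three columns:
# awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"cnp"NR,1000,"+"}' cnpRedon.hg18 > redon.bed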
hgLoadBed hg18 cnpRedon redon.bed
#########################################################
# Structural Variation from Lars Feuk (Heather Jan - April 2007)
# These tables are all tiny so I'm not using indices
# I kept the bin column in all but Sebat, but I could have done without that too
ssh hgwdev
cd /cluster/data/hg18/bed/sv
# 8 *txt files from Lars
# Sharp (format different from hg17)
cp Sharp*txt sharp.in
# use editor to remove header from sharp.in
# grab the data we need
sharp.pl < sharp.in > sharp.prelim
# adjust
sharp2.pl < sharp.prelim > sharp.bed
hgLoadBed hg18 cnpSharp2 sharp.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSharp2.sql
# Iafrate (format different from hg17)
cp Iafrate*txt iafrate.in
# use editor to change TABTAB to TAB0TAB and get rid of header
iafrate.pl < iafrate.in > iafrate.bed
hgLoadBed hg18 cnpIafrate2 iafrate.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpIafrate2.sql
# Sebat (format different from hg17)
cp Sebat*txt sebat.in
# use editor to get rid of header
sebat.pl < sebat.in > sebat.bed
hgLoadBed hg18 cnpSebat2 sebat.bed -noBin -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSebat2.sql
# Tuzun (I called this cnpFosmid in hg17)
# simple bed 4 .
cp Tuzun*txt tuzun.in
# use editor to get rid of header
tuzun.pl < tuzun.in > tuzun.bed
hgLoadBed hg18 cnpTuzun tuzun.bed -tab
# McCarroll (same format as hg17, simple bed 4 .)
# need to sort and assign ids
cp McCarroll*txt mccarroll.in
# use editor to get rid of header
mccarroll.pl < mccarroll.in > mccarroll.prelim
sort -g mccarroll.prelim > mccarroll.sort
# sort isn't perfect, use editor to finish
mccarroll2.pl < mccarroll.sort > mccarroll.bed
hgLoadBed hg18 delMccarroll mccarroll.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delMccarroll.sql
# Conrad (different format from hg17)
cp Conrad*txt conrad.prelim
# use editor to shorten "Study" column
conrad.pl < conrad.prelim > conrad.prelim2
cp conrad.prelim2 conrad.prelim3
# use editor to sort conrad.prelim3 (lame)
# assign Ids
conradId.pl < conrad.prelim3 > conrad.bed
hgLoadBed hg18 delConrad2 conrad.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delConrad2.sql
# Hinds (different format from hg17)
cp Hinds*txt hinds.in
# use editor to remove header
hinds.pl < hinds.in > hinds.prelim
sort -g hinds.prelim > hinds.sort
# sort isn't perfect, use editor to finish
hinds2.pl < hinds.sort > hinds.bed
hgLoadBed hg18 delHinds2 hinds.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delHinds2.sql
# Locke (new data)
cp Locke*txt locke.in
locke.pl < locke.in > locke.prelim
sort -g locke.prelim > locke.??
locke2.pl
#########################################################
# BUILD GAD TRACK (Done, 12/12/06, Fan)
mkdir /cluster/store12/gad061211
rm /cluster/data/gad
ln -s /cluster/store12/gad061211 /cluster/data/gad
# Receive "GAD-Hg18DATA.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn at grc.nia.nih.gov
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "GAD-Hg18DATA.txt" into table gadAll ignore 1 lines'
hgsql hg18 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg18 -N -e \
'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0 and chromEnd <>0 and chromosome<>""'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gadHg18.bed
hgLoadBed hg18 gad gadHg18.bed
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (ABANDONED - 2006-12-14 - Hiram)
# third time with randoms and chrUn in scaffolds on both sequences
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14
cd /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14
cat << '_EOF_' > DEF
# Human vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18, randoms in contigs, lifted to their chr*_random
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.sizes
SEQ1_LIFT=/san/sanvol1/scratch/hg18/hg18.randomContigs.lift
SEQ1_CHUNK=10000000
SEQ1_LIMIT=1
SEQ1_LAP=10000
# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.oryLat1.2006-12-14
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 &
### this did not work, abandoned
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-02-24 - Hiram)
# fourth time with randoms and chrUn in scaffolds for only Medaka
# All chroms and randoms as they are complete on Human
ssh kkstore04
mkdir /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
cat << '_EOF_' > DEF
# Human vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18, randoms complete, as they are, no contig confusion
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 &
# real 318m45.339s
# typical failure:
# HgStepManager: executing step 'net'.
# netChains: looks like previous stage was not successful
# (can't find [hg18.oryLat1.]all.chain[.gz]).
# continuing net:
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18OryLat1 > net.log 2>&1 &
# real 39m25.853s
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
nice -n +19 featureBits hg18 chainOryLat1Link \
> fb.hg18.chainOryLat1Link.txt 2>&1 &
# 57393910 bases of 2881515245 (1.992%) in intersection
ssh kkstore04
mkdir /cluster/data/oryLat1/bed/blastz.hg18.swap
cd /cluster/data/oryLat1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
/cluster/data/hg18/bed/blastz.oryLat1.2007-02-24/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 -swap > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/oryLat1/bed/blastz.hg18.swap
nice -n +19 featureBits oryLat1 chainHg18Link \
> fb.oryLat1.chainHg18Link.txt 2>&1 &
# 48002423 bases of 700386597 (6.854%) in intersection
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg18/bed/affyHumanExon
liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.bed \
/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz affyHuEx1.bed affyHuEx1.unmapped
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.bed | sort -k2,2nr | head
#2440970 81664
#3016074 9552
#3641787 8061
#2321649 8054
# It seems the liftOver problem still happens for that segmental dupe.
# So the start is correct and the end is correct. Just make two entries, both
# with size == 305.
grep -v "\b2440970\b" affyHuEx1.bed > tmp.bed
grep "\b2440970\b" affyHuEx1.bed > bad.bed
awk 'BEGIN{OFS="\t"}{print $1,$2,$2+305,$4,$5,$6}' bad.bed > good.bed
awk 'BEGIN{OFS="\t"}{print $1,$3-305,$3,$4,$5,$6}' bad.bed >> good.bed
cat tmp.bed good.bed > affyHuEx1.bed
bedSort affyHuEx1.bed tmp.bed
mv tmp.bed affyHuEx1.bed
rm good.bed bad.bed
hgLoadBed hg18 affyHuEx1 affyHuEx1.bed
##########################################################################
# CGAP SAGE (In progress Andy 2007-01-09)
# This is the BED part.
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir /san/sanVol1/scratch/andy/cgapSage
ln -s /san/sanVol1/scratch/andy/cgapSage cgapSage
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_forward_v36.1.tar.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_reverse_v36.1.tar.gz
tar xfz SAGE_hs_long_forward_v36.1.tar.gz
tar xfz SAGE_hs_long_reverse_v36.1.tar.gz
cd hs_forward/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed
cd ../hs_reverse/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed
cd ../
rm -rf hs*
liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed
#Got 378 lifts in /cluster/data/hg18/jkStuff/liftAll.lft
#Lifting unlifted.bed
#Expecting number field 3 line 13868252 of unlifted.bed, got CCATCGGATGCCCACCT
# Looks like there was a funny line in unlifted.bed:
grep CCATCGGATGCCCACCT unlifted.bed
#NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 -
#NT_004321 7900 7884 CCATCGGATGCCCACCT 1000 -
# Ok so there's one record for CCATCGGATGCCCACCT in addition... and for
# AATAAGCCAGAGTCTAT?
grep AATAAGCCAGAGTCTAT unlifted.bed
#NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 -
#NT_011362 24364534 24364518 AATAAGCCAGAGTCTAT 1000 -
# Looks like that one's got a record too. So just get rid of the stupid
# line:
grep -v 24364534NT_004321 unlifted.bed > tmp
mv tmp unlifted.bed
liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed
rm unlifted.bed
head lifted.bed
#chr1 649 665 TGTCTGCGCCTGCGCCG 1000 -
#chr1 670 686 CTAGCGCGTCGGGGTGG 1000 +
nibFrag /cluster/data/hg18/nib/chr1.nib 669 686 "+" /dev/stdout
#>/cluster/data/hg18/nib/chr1.nib:669-686
#ctagcgcgtcggggtgg
nibFrag /cluster/data/hg18/nib/chr1.nib 649 665 m /dev/stdout
#>/cluster/data/hg18/nib/chr1.nib:649-665
#tgtctgcgcctgcgcc
# It looks like there are off-by-one errors, so fix em:
awk 'BEGIN{OFS="\t"}{start=$2; end=$3;if ($6 == "-") { end = end+1; } else { start = start-1 } print $1, start, end, $4, $5, $6}' \
< lifted.bed > mapping.bed6
rm lifted.bed
# Add thickStart/thickEnd fields
awk 'BEGIN{OFS="\t"}{thickStart=$2; thickEnd=$3; if ($6=="-") {thickStart = thickStart+13; } else { thickEnd = thickEnd-13; } print $0, thickStart, thickEnd}' \
< mapping.bed6 > mapping.bed
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-10)
#
# Background: The xxBlastTab tables are made with a simple blastall
# (blastp with -b 1) which chooses the best match. Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage.
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking hg18.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly. Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
#
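# (hedged sketch, not the actual synBlastp.csh contents) the join/swap step
# amounts to something like the following, where knownToRn4Syn stands in for
# the hgMapToGene output table and blastTab's query column is assumed to hold
# the hg18 kg id:
#  hgsql hg18 -e 'create table rnBlastTabSyn select b.* from rnBlastTab b, knownToRn4Syn m where b.query = m.name'
#  hgsql hg18 -e 'drop table rnBlastTab'
#  hgsql hg18 -e 'rename table rnBlastTabSyn to rnBlastTab'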
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# I created a new utility script called synBlastp.csh since I have to do this
# several times.
#
# we want to update hg18 for rat and mouse,
# so check ./hgGeneData/Human/hg18/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh hg18 rn4
#hg18.rnBlastTab results:
#new number of unique query values:
# 13120
#new number of unique target values
# 6431
#old number of unique query values:
# 26982
#old number of unique target values
# 6732
synBlastp.csh hg18 mm8
#hg18.mmBlastTab results:
#new number of unique query values:
# 28733
#new number of unique target values
# 15366
#old number of unique query values:
# 33016
#old number of unique target values
# 15918
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg18
#################################################
# BUILD ncRna TRACK (DONE, 1/12/07, Fan)
# Download the terms and make the database.
ssh hgwdev
cd /cluster/store11/gs.19/build36
cd bed
mkdir ncRna
# copy Perl file at:
# http://cvs.sanger.ac.uk/cgi-bin/viewcvs.cgi/biomart-perl/scripts/webExample.pl?view=markup
# into getBiomart.pl
# create the following query xml file, ncRna.xml:
cat << '_EOF_' >ncRna.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" Header="1" count="" softwareVersion="0.5">
<Dataset name="hsapiens_gene_ensembl" interface="default" Formatter="GTF" >
<Attribute name="biotype" />
<Attribute name="str_chrom_name" />
<Attribute name="gene_stable_id" />
<Attribute name="exon_chrom_start" />
<Attribute name="exon_chrom_end" />
<Attribute name="transcript_chrom_strand" />
<Attribute name="external_gene_id" />
</Dataset>
</Query>
'_EOF_'
# get Ensembl gene data from BioMart and filter out protein-coding genes
perl getBiomart.pl ncRna.xml | grep -v protein_coding >ncRna0.tab
# cut and paste different cols to form ncRna.tab
cat ncRna0.tab | sed -e 's/ENSG/chr\tENSG/'>j1
cut -f 2 j1 >j.chr0
cut -f 1 j1 >j.chr
cat j.chr0|sed -e 's/chr/0/' >j.0
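# (note) j.chr0 is a column of the literal "chr" tokens spliced in above, so
# j.0 becomes a column of zeroes used as filler/score columns in the paste below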
cut -f 6 j1 >j.strand
cut -f 4,5 j1 >j.startEnd
cut -f 3 j1 >j.name
cut -f 7 j1 >j.type
cut -f 8 j1 >j.extGeneId
paste j.chr0 j.chr j.startEnd j.name j.0 j.strand j.0 j.0 j.type j.extGeneId >j.all
cat j.all|grep -v c6_COX|grep -v c6_QBL|grep -v c5_H2\
|sed -e 's/chr\t/chr/'\
|grep -v NT_\
|sed -e 's/\t-1\t/\t-\t/' |sed -e 's/\t1\t/\t+\t/' \
|sed -e 's/chrMT/chrM/'\
|sort -k1,1 -k2,2n -k3,3n >ncRna.tab
hgLoadBed -strict -tab -sqlTable=/cluster/home/fanhsu/src/hg/lib/ncRna.sql hg18 ncRna ncRna.tab
rm j.*
rm j1
###########################################################
# MAKE Drosophila Proteins track (DONE 2007-02-06 braney)
ssh kkstore02
sandir=/san/sanvol1/scratch/hg18
mkdir $sandir
cd /cluster/data/hg18
cat noUn/chr*fa > temp.fa
faSplit gap temp.fa 1000000 $sandir/blastDb/x -lift=$sandir/blastDb.lft
cat randomContigs/*.fa > temp.fa
faSplit sequence temp.fa 150 $sandir/blastDb/y
rm temp.fa
cd $sandir/blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /cluster/data/hg18/bed/tblastn.dm2FB
cd /cluster/data/hg18/bed/tblastn.dm2FB
echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3066 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 18929/(150000/3066) = 386.908760
mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
split -l 387 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa/kg
ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
cd fbfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut
ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/hg18/bed/tblastn.dm2FB
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
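# retry with progressively stricter e-values: if blastall fails on a chunk
# (presumably because the output gets too large), try the next cutoff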
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
exit
chmod +x blastSome
gensub2 query.lst fb.lst blastGsub blastSpec
ssh pk
cd /cluster/data/hg18/bed/tblastn.dm2FB
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 150234 of 150234 jobs
# CPU time in finished jobs: 8313632s 138560.53m 2309.34h 96.22d 0.264 y
# IO & Wait Time: 882301s 14705.02m 245.08h 10.21d 0.028 y
# Average job time: 61s 1.02m 0.02h 0.00d
# Longest finished job: 545s 9.08m 0.15h 0.01d
# Submission to last job: 40693s 678.22m 11.30h 0.47d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.dm2FB
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/c.`basename $1`.psl)
'_EOF_'
exit
chmod +x chainOne
ls -1dS /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/hg18/bed/tblastn.dm2FB/chainRun
para create chainSpec
para maxNode 30
# para try, check, push, check etc.
# Completed: 48 of 49 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 209872s 3497.86m 58.30h 2.43d 0.007 y
# IO & Wait Time: 48501s 808.35m 13.47h 0.56d 0.002 y
# Average job time: 5383s 89.71m 1.50h 0.06d
# Longest finished job: 19336s 322.27m 5.37h 0.22d
# Submission to last job: 19336s 322.27m 5.37h 0.22d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.dm2FB/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.dm2FB/unliftBlastDm2FB.psl
cd ..
pslCheck unliftBlastDm2FB.psl
sed "s/[0-9XY]*\///" unliftBlastDm2FB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastDm2FB.psl
# load table
ssh hgwdev
cd /cluster/data/hg18/bed/tblastn.dm2FB
hgLoadPsl hg18 blastDm2FB.psl
# check coverage
featureBits hg18 blastDm2FB
# 5976178 bases of 2881515245 (0.207%) in intersection
featureBits hg18 knownGene:cds blastDm2FB -enrichment
# knownGene:cds 1.111%, blastDm2FB 0.207%, both 0.130%, cover 11.71%, enrich 56.45x
ssh kkstore04
rm -rf /cluster/data/hg18/bed/tblastn.dm2FB/blastOut
rm -rf /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut
#end tblastn
##########################################################################
#########################################################################
# BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-26 - Hiram)
## Align to fr2 scaffolds,
## results lifted to fr2 chrUn coordinates
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.fr2.2007-01-24
cd /cluster/data/hg18/bed/blastz.fr2.2007-01-24
cat << '_EOF_' > DEF
# Human vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=1
SEQ1_LAP=10000
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.fr2.2007-01-24
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18Fr2 > do.log 2>&1 &
# real 414m47.505s
## Swap back to fr2 (duplicated in fr2.txt also)
mkdir /cluster/data/fr2/bed/blastz.hg18.swap
cd /cluster/data/fr2/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -swap > swap.log 2>&1 &
# real 47m14.554s
ssh hgwdev
cd /cluster/data/fr2/bed/blastz.hg18.swap
time nice -n +19 featureBits fr2 chainHg18Link \
> fb.fr2.chainHg18Link.txt 2>&1 &
# 42875664 bases of 393312790 (10.901%) in intersection
############################################################################
## BLASTZ mm8 test with WindowMasker sequence (DONE - 2007-01-30 - Hiram)
ssh kkstore04
mkdir /cluster/data/hg18/bed/blastz.mm8.2007-01-30
cd /cluster/data/hg18/bed/blastz.mm8.2007-01-30
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Mouse Mm8 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/san/sanvol1/scratch/mm8/sdTrf/mm8.noUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/mm8/sdTrf/noUn.sdTrf.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.mm8.2007-01-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18Mm8 \
-chainMinScore=3000 -chainLinearGap=medium > do.out 2>&1 &
time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18Mm8 \
-continue=cat -stop=net \
-chainMinScore=3000 -chainLinearGap=medium > cat.out 2>&1 &
# real 635m55.126s
nice -n +19 featureBits -noRandom hg18 chainMm8Link \
> fb.noRandom.hg18.chainMm8Link.txt 2>&1
# 991429484 bases of 2868834265 (34.559%) in intersection
nice -n +19 featureBits -noRandom hg18 chainMm8WMLink \
> fb.noRandom.hg18.chainMm8WMLink.txt 2>&1
# 1071083201 bases of 2868834265 (37.335%) in intersection
## swap to mm8
mkdir /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01
cd /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01
time doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
/cluster/data/hg18/bed/blastz.mm8.2007-01-30/DEF \
-swap -stop=net -chainMinScore=3000 \
-chainLinearGap=medium > swap.out 2>&1 &
# this created the directory /cluster/data/mm8/bed/blastz.hg18.swap
# after it was done, move to here blastz.hg18.swap.2007-02-01 since
# it is on a filesystem with some free space
nice -n +19 featureBits -noRandom mm8 chainHg18Link \
> fb.noRandom.mm8.chainHg18Link.txt 2>&1
# 983004750 bases of 2550172871 (38.547%) in intersection
nice -n +19 featureBits -noRandom mm8 chainHg18WMLink \
> fb.noRandom.mm8.chainHg18WMLink.txt 2>&1
# 976774811 bases of 2550172871 (38.302%) in intersection
###########################################################
# MAKE C. elegans proteins track
ssh kkstore02
sandir=/san/sanvol1/scratch/hg18
mkdir -p /cluster/data/hg18/bed/tblastn.ce3WB
cd /cluster/data/hg18/bed/tblastn.ce3WB
echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3066 query.lst
# we want around 200000 jobs
calc `wc /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 22395/(200000/3066) = 343.315350
mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa
split -l 343 /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa/wb
ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa
cd wbfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S wbfa/*.fa > wb.lst
mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut
ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut
for i in `cat wb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/hg18/bed/tblastn.ce3WB
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce3/bed/blat.ce3WB/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
exit
chmod +x blastSome
gensub2 query.lst wb.lst blastGsub blastSpec
ssh pk
cd /cluster/data/hg18/bed/tblastn.ce3WB
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 195603 of 195603 jobs
# CPU time in finished jobs: 12047221s 200787.01m 3346.45h 139.44d 0.382 y
# IO & Wait Time: 9089287s 151488.12m 2524.80h 105.20d 0.288 y
# Average job time: 108s 1.80m 0.03h 0.00d
# Longest finished job: 1002s 16.70m 0.28h 0.01d
# Submission to last job: 192221s 3203.68m 53.39h 2.22d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.ce3WB
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/c.`basename $1`.psl)
'_EOF_'
exit
chmod +x chainOne
ls -1dS /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/wb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/hg18/bed/tblastn.ce3WB/chainRun
para create chainSpec
para maxNode 30
# para try, check, push, check etc.
# Completed: 66 of 66 jobs
# CPU time in finished jobs: 161714s 2695.23m 44.92h 1.87d 0.005 y
# IO & Wait Time: 40315s 671.92m 11.20h 0.47d 0.001 y
# Average job time: 3061s 51.02m 0.85h 0.04d
# Longest finished job: 9372s 156.20m 2.60h 0.11d
# Submission to last job: 11599s 193.32m 3.22h 0.13d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.ce3WB/blastOut
for i in wb??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.ce3WB/unliftBlastCe3WB.psl
cd ..
pslCheck unliftBlastCe3WB.psl
sed "s/[0-9XY]*\///" unliftBlastCe3WB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastCe3WB.psl
# load table
ssh hgwdev
cd /cluster/data/hg18/bed/tblastn.ce3WB
hgLoadPsl hg18 blastCe3WB.psl
# check coverage
featureBits hg18 blastCe3WB
# 4326489 bases of 2881515245 (0.150%) in intersection
featureBits hg18 knownGene:cds blastCe3WB -enrichment
# knownGene:cds 1.111%, blastCe3WB 0.150%, both 0.086%, cover 7.76%, enrich 51.67x
ssh kkstore04
rm -rf /cluster/data/hg18/bed/tblastn.ce3WB/blastOut
rm -rf /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut
#end tblastn
##########################################################################
#############################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-02-07
cd wgRna-2007-02-07
# Received the data file, wg_feb2007.txt (saved from wg_feb2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2007-02-07.
# The record of hsa-mir-770 was missing the strand info, so "+" was
# added manually to wg_feb2007.txt for that record.
cat wg_feb2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-02-12
cd wgRna-2007-02-12
# Received the data file, wg_feb2007_corrected.txt (saved from wg_feb2007_corrected.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2007-02-12.
# The record of hsa-mir-770 was missing the strand info, so "+" was
# added manually to wg_feb2007_corrected.txt for that record.
cat wg_feb2007_corrected.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#########################################################################
## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-17 - 2007-02-18 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18AnoCar1 > do.log 2>&1 &
# real 684m40.568s
# there was a pause in there as the pk kluster was corrected during the
# first kluster run to get it to finish.
# appears to have successfully finished
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
time nice -n +19 featureBits hg18 chainAnoCar1Link \
> fb.hg18.chainAnoCar1Link.txt 2>&1
# real 2m28.318s
# 137554843 bases of 2881515245 (4.774%) in intersection
# running the swap to anoCar1 - instructions in anoCar1.txt
cd /cluster/data/anoCar1/bed/blastz.hg18.swap
time nice -n +19 featureBits anoCar1 chainHg18Link \
> fb.anoCar1.chainHg18Link.txt 2>&1
# real 3m16.810s
# 112434396 bases of 1741478929 (6.456%) in intersection
# reciprocal best net mafs for multiz 2008-10-30 - Hiram
time nice -n +19 doRecipBest.pl hg18 anoCar1 > rbest.log 2>&1 &
# this failed immediately:
# cd /cluster/data/hg18/bed/blastz.anoCar1/axtChain
# chainStitchId hg18.anoCar1.over.chain.gz stdout
# chainSwap stdin stdout
# chainSort stdin anoCar1.hg18.tBest.chain
# t end mismatch -526389042 vs 10481870 line 1920305 of stdin
# Command failed:
# ssh -x kkr14u04 nice /cluster/data/hg18/bed/blastz.anoCar1/axtChain/doRecipBest.csh
# but, then, when run locally on hgwdev, it proceeded just fine:
time nice -n +19 ./doRecipBest.csh > doRecipBest.log 2>&1 &
# real 175m54.202s
doRecipBest.pl -continue=download hg18 anoCar1
##########################################################################
# UPDATED hg18.knownToVisiGene (DONE galt 2007-02-15)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
#########################################################################
## BLASTZ OTOGAR1 - Bushbaby - (2007-02-26 kate)
#
# NOTE: using masked sequence (unlike Brian Raney's alignments)
cd /cluster/data/otoGar1
ln -s otoGar1.rmsk.2bit otoGar1.2bit
mkdir -p /san/sanvol1/scratch/otoGar1
cp -p otoGar1.2bit chrom.sizes /san/sanvol1/scratch/otoGar1
ssh pk
mkdir /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26
cd /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26
cat << '_EOF_' > DEF
# human vs bushbaby
# params from Hiram & Brian
BLASTZ=blastz.v7.x86_64
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit
SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.otoGar1.2007-02-26
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
>& do.log & tail -f do.log
# problems on cluster -- stale NFS mounts and a routing problem
# so batch failed with 4 retries. I restarted cluster run
# with retries=8, and all finished except 38. These failed due
# to output files existing; as the results look OK, I'm proceeding.
para time > run.time
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-continue=cat -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
>&! do2.log &
tail -f do2.log
# failed due to pre-existing liftOver chain from Brian's run
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-continue=net -bigClusterHub=pk \
>&! do3.log &
tail -f do3.log
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 otoGar1 >&! rbest.log &
# Load net (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.otoGar1/axtChain
netFilter -minGap=10 hg18.otoGar1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestOtoGar1 stdin
#########################################################################
# BLASTZ/CHAIN/NET CAVPOR2 (IN PROGRESS 2007-03-06 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
cat << '_EOF_' > DEF
# human vs. guinea pig
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Guinea pig cavPor2
# using cat-like params, as this has similar #scaffolds
SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit
SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes
# Maximum number of scaffolds that can be lumped together
# this makes ~200K jobs
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# cluster brought down by site work
# restart on 3/7
ssh pk
cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06/run.blastz
para recover jobList jobList2
para make jobList2 >&! do2.log &
para time > run.time
# entire run took probably 36 hours cluster time
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-bigClusterHub pk -continue=cat -stop cleanup \
-chainMinScore=3000 -chainLinearGap=medium >& do3.log &
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 cavPor2 >&! rbest.log &
# load nets manually -- automated loading fails as classification info
# not available (no database)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.cavPor2/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netCavPor2 stdin
netFilter -minGap=10 hg18.cavPor2.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestCavPor2 stdin
#########################################################################
# BLASTZ/CHAIN/NET ERIEUR1 (IN PROGRESS 2007-03-08 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08
cd /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08
cat << '_EOF_' > DEF
# human vs. hedgehog
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: hedgehog eriEur1
# using cat-like params, as this has similar #scaffolds
SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit
SEQ2_LEN=/san/sanvol1/scratch/eriEur1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
# this makes ~200K jobs
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.eriEur1.2007-03-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# Reciprocal best net mafs for multiz (kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.eriEur1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 eriEur1 >&! rbest.log &
#GOT HERE
# Load nets (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.eriEur1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netEriEur1 stdin
netFilter -minGap=10 hg18.eriEur1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestEriEur1 stdin
#########################################################################
# BLASTZ/CHAIN/NET SORARA1 (IN PROGRESS 2007-03-08 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08
cd /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08
cat << '_EOF_' > DEF
# human vs. shrew
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: shrew sorAra1
# using cat-like params, as this has similar #scaffolds
SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit
SEQ2_LEN=/san/sanvol1/scratch/sorAra1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
# this makes ~200K jobs
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.sorAra1.2007-03-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# stopped during load step due to missing database for classifying net
# Reciprocal best net mafs for multiz (2007-03-12 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.sorAra1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 sorAra1 >&! rbest.log &
# GOT HERE
# Load nets (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.sorAra1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netSorAra1 stdin
netFilter -minGap=10 hg18.sorAra1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestSorAra1 stdin
#########################################################################
# BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-02-23, REDONE 2007-04-04 angie)
# The first time around, the copy of ornAna1.2bit still had the pre-release
# sequence -- doh! Differences are minuscule (a couple of contigs changed
# orientation), but redone just to get it 100% right.
# In the re-run, I changed SEQ2_LIMIT which made the cluster run more
# efficient but had side-effects on the results because blastz's dynamic
# masking was applied differently (different groupings of sequences) --
# in retrospect, would have been better to use the suboptimal SEQ2_LIMIT
# and have fewer differences to slog through.
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cat << '_EOF_' > DEF
# human vs. platypus
# Use same params as used for hg18-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: hg18
SEQ1_DIR=/scratch/hg/hg18/hg18.2bit
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/hg18.ornAna1 \
>& do.log & tail -f do.log
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
time nice -n +19 doRecipBest.pl hg18 ornAna1 > rbest.log 2>&1 &
# real 238m22.247s
# worked OK
########################################################################
# 28-WAY VERTEBRATE MULTIZ (2007-03-20 kate)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir multiz28way.2007-03-20
ln -s multiz28way.2007-03-20 multiz28way
cd multiz28way
# start with 17way tree; update assemblies and add new species
mkdir tree
cd tree
cp /cluster/data/hg18/bed/multiz17way/tree.nh tree.asm.nh
# edit and create tree.28.nh, with Webb's assistance
echo `sed 's/[a-zA-Z0-9]*_//g' tree.asm.nh` > tree.28.nh
#
# create version for download that includes common names and assemblies
cp tree.asm.nh ../28way.nh
# edit
# create version for phyloGif program (replace spaces with commas)
cp 28way.gif /usr/local/apache/htdocs/images/phylo/hg18_28way.gif
# create species list
cd ..
sed -e 's/[()]//g' -e 's/ /\n/g' tree/tree.28.nh | \
sed -e '/^$/d'| sort > species.28.lst
wc -l species.28.lst
ln -s species.28.lst species.lst
# Organisms:
(N)ew, (U)pdated, (S)ame species since 17way:
U chimp (panTro2)
S rhesus (rheMac2)
-N bushbaby (otoGar1) "Otolemur garnettii" (galago) 2X
N tree_shrew (tupBel1) "Tupaia belangeri"
S rat (rn4)
S mouse (mm8)
-N guinea_pig (cavPor2) "Cavia porcellus" 2X
S rabbit (oryCun1) 2X
-N shrew (sorAra1) "Sorex araneus" 2X
-N hedgehog (eriEur1) "Erinaceus europaeus" 2X
S dog (canFam2)
N cat (felCat3) "Felis catus" 2X
-N horse (equCab1) "Equus caballus"
U cow (bosTau3)
S armadillo (dasNov1) "Dasypus novemcinctus" 2X
S elephant (loxAfr1) 2X
S tenrec (echTel1) 2X
S opossum (monDom4)
N platypus (ornAna1) "Ornithorhynchus anatinus"
U chicken (galGal3)
N lizard (anoCar1) "Anolis carolinensis" (Green Anole), Iguana family
U frog (xenTro2)
U fugu (fr2)
S tetraodon (tetNig1)
N stickleback (gasAcu1) "Gasterosteus aculeatus"
N medaka (oryLat1) "Oryzias latipes"
U zebrafish (danRer4)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
# verify all blastz's exists
cat > listMafs.csh << 'EOF'
foreach db (`cat species.lst`)
set bdir = /cluster/data/hg18/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'EOF'
# gather chain stats
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
cat > getChainStats.csh << 'EOF'
set species = $1
foreach db (`cat $species`)
echo -n "${db} "
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set fb = /cluster/data/hg18/bed/blastz.$db/fb.hg18.chain${Db}Link.txt
if (! -e $fb || -z $fb ) then
nice featureBits hg18 chain${Db}Link >& $fb
endif
sed 's/.*(\(.*\)).*/\1/' $fb
end
'EOF'
# << happy emacs
csh getChainStats.csh species.lst >&! species.chainStats
# Maf types:
# 2X mammals -> reciprocal best net
# high cov placental mammals and opossum -> syntenic net
# other -> standard net
csh listMafs.csh > listMafs.log &
cat listMafs.log
# add links of the form blastz.<db> to blastz.<db>.<date> dirs:
cd /cluster/data/hg18/bed
ln -s blastz.fr2.2007-01-24 blastz.fr2
ln -s blastz.ornAna1.2007-02-21 blastz.ornAna1
ln -s blastz.oryLat1.swap blastz.oryLat1
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
mkdir run.split
cd run.split
mafSplitPos hg18 50 mafSplit.bed
ssh kki
cd /cluster/data/hg18/bed/multiz28way
cd run.split
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set db = $1
set sdir = /san/sanvol1/scratch/hg18/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/hg18/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
grep -v hg18 ../species.28.lst > split.lst
cat > spec << 'EOF'
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'EOF'
gensub2 split.lst single spec jobList
para create jobList
# 24 jobs
para try
para check
para push
# till complete
para time >&! run.time
# 30 minutes
# run multiz
ssh pk
cd /cluster/data/hg18/bed/multiz28way
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
set PENN_BIN = /cluster/bin/penn/multiz.v11.2007-03-19
cp -p $PENN_BIN/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit the chunk number in the
# filename, so gensub2's $(root) keeps the whole chunk name
# (e.g. chr1__0 stays chr1__0, while chr1.0 would be truncated to chr1)
set mdir = /san/sanvol1/scratch/hg18/splitStrictMafNet
ls $mdir/fr2 | sed -e 's/.maf.gz//' -e 's/\./__/' > chromChunks.lst
wc -l chromChunks.lst
# 93
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = hg18
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree/tree.28.nh ../species.28.lst $tmp
pushd $tmp
foreach s (`cat species.28.lst`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == hg18) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.28.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
gensub2 chromChunks.lst single spec jobList
para create jobList
# 93 jobs
para try
para check
para push
para time > run.time
# 4 hours! (~9 min/species)
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg18/multiz28way/maf
ln -s /cluster/data/hg18/bed/multiz28way/maf/*.maf \
/gbdb/hg18/multiz28way/maf
cd /cluster/data/hg18/bed/multiz28way
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/maf hg18 multiz28way
# load summary table
cat maf/*.maf | nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz28waySummary stdin
'EOF'
csh loadMaf.csh >&! loadMaf.log &
# look at coverage
ssh kkstore02
cd /cluster/data/hg18/bed/multiz25wayStrict
mkdir mafCov
cd mafCov
cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 \
-otherDb=canFam2 stdout | bedSort stdin chr7.canFam2.bed
echo canFam2 > species.lst
cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst stdout | \
mafToAxt stdin hg18 canFam2 stdout | \
axtToPsl stdin /cluster/data/hg18/chrom.sizes \
/cluster/data/canFam2/chrom.sizes chr7.canFam2.psl
cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=oryCun1 chr7.oryCun1.bed
cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=tetNig1 chr7.tetNig1.bed
ssh hgwdev
cd /cluster/data/hg18/bed/multiz25wayStrict/mafCov
# canFam2 syntenic net vs standard net
nice featureBits hg18 -chrom=chr7 chr7.canFam2.bed
# 82967535 bases of 154952424 (53.544%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.canFam2.bed
# 86391682 bases of 154952424 (55.754%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz17way/mafCov/chr7.canFam2.bed
# 86248995 bases of 154952424 (55.662%) in intersection
# compare using another method
cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst chr7.canFam2.maf
mafToAxt chr7.canFam2.maf hg18 canFam2 chr7.canFam2.axt
axtToPsl chr7.canFam2.axt /cluster/data/hg18/chrom.sizes \
/cluster/data/canFam2/chrom.sizes chr7.canFam2.psl
nice featureBits hg18 -chrom=chr7 chr7.canFam2.psl
# 75497734 bases of 154952424 (48.723%) in intersection
# oryCun1 reciprocal best net vs standard net
nice featureBits hg18 -chrom=chr7 chr7.oryCun1.bed
# 53157578 bases of 154952424 (34.306%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.oryCun1.bed
# 56858022 bases of 154952424 (36.694%) in intersection
# tetNig1 both used standard net
nice featureBits hg18 -chrom=chr7 chr7.tetNig1.bed
# 2905058 bases of 154952424 (1.875%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.tetNig1.bed
# 2901708 bases of 154952424 (1.873%) in intersection
# NOTE: Next time concatenate split mafs before proceeding further
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
mkdir anno
cd anno
mkdir maf run
cd run
cat > doNBed.csh << 'EOF'
foreach db (`cat ../../species.lst`)
echo -n "$db "
set cdir = /cluster/data/$db
if (! -e $cdir/$db.N.bed) then
echo "creating N.bed"
twoBitInfo -nBed $cdir/$db.2bit $cdir/$db.N.bed
else
echo ""
endif
end
'EOF'
csh doNBed.csh >&! doNBed.log &
rm -f nBeds sizes
foreach db (`grep -v hg18 ../../species.lst`)
echo "$db "
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
ln -s /cluster/data/$db/chrom.sizes $db.len
echo $db.len >> sizes
end
ssh kki
cd /cluster/data/hg18/bed/multiz28way/anno/run
cat > doAnno.csh << 'EOF'
#!/bin/csh -ef
set dir = /cluster/data/hg18/bed/multiz28way
set c = $1
cat $dir/maf/${c}__*.maf | \
nice mafAddIRows -nBeds=nBeds -sizes=sizes stdin \
/cluster/data/hg18/hg18.2bit $2
'EOF'
#<< happy emacs
chmod +x doAnno.csh
cat > spec << 'EOF'
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/anno/maf/$(root1).maf}
#ENDLOOP
'EOF'
#<< happy emacs
awk '{print $1}' /cluster/data/hg18/chrom.sizes > chroms.lst
gensub2 chroms.lst single spec jobList
para create jobList
para try
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/anno
mkdir -p /gbdb/hg18/multiz28way/anno/maf
ln -s /cluster/data/hg18/bed/multiz28way/anno/maf/*.maf \
/gbdb/hg18/multiz28way/anno/maf
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/anno/maf \
hg18 multiz28wayAnno
date
cat maf/*.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz28wayAnnoSummary stdin
date
'EOF'
csh loadMaf.csh >& loadMaf.log &
# NOTE: rebuilt hgLoadMafSummary to exclude chroms<1MB (2007-06-21 kate)
########################################################################
# ANNOTATE 28-WAY ALIGNMENT WITH QUALITY DATA (2007-06-11 rico at bx.psu.edu)
#
# The basic idea here is to create a qac file which has quality data for each
# (chromosome/scaffold/etc) and then index the qac file. Once this is done,
# mafAddQRows can be used to add the quality data to a given maf. The agp
# files are used so that gaps can be represented in the qac files as a special
# value.
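# In outline the same two-step pattern is used for every species below
# (a sketch; "someDb" and the input file names are illustrative only):
#   # if a .qac already exists: mark the agp gaps and index it
#   qacAddGapIdx in.agp in.qac someDb.qac someDb.qdx
#   # if only .qual files exist: build the indexed qac from agp + qual directly
#   qaAgpToQacIdx assembly.agp assembly.qual.gz someDb.qac someDb.qdx
# and then, once species.lst lists each db with the directory holding its
# .qac/.qdx (see below), per-base qualities are folded into the mafs with:
#   mafAddQRows species.lst in.maf out.maf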
## create .qac and .qdx files for each species in the 28-way alignment
o human (hg18)
Unable to find quality data.
o chimp (panTro2)
/cluster/data/panTro2/bed/quality/qac/*.qac
/cluster/data/panTro2/wustl/*.agp
qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx
o rhesus (rheMac2)
/cluster/data/rheMac2/qual/foo.qv
/cluster/data/rheMac2/downloads/foo.agp
qacAddGapIdx in.agp in.qac rheMac2.qac rheMac2.qdx
o bushbaby (otoGar1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz otoGar1.qac otoGar1.qdx
o treeshrew (tupBel1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz tupBel1.qac tupBel1.qdx
o rat (rn4)
/cluster/data/rn4/downloads/foo.qual
/cluster/data/rn4/CHROM/foo.agp
qacAddGapIdx in.agp in.qac rn4.qac rn4.qdx
o mouse (mm8)
Unable to find quality data.
o guinea pig (cavPor2)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/guineaPig/cavPor2
assembly.agp
Draft_v2.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v2.agp.chromosome.qual.gz cavPor2.qac cavPor2.qdx
o rabbit (oryCun1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz oryCun1.qac oryCun1.qdx
o shrew (sorAra1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/commonShrew/sorAra1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz sorAra1.qac sorAra1.qdx
o hedgehog (eriEur1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/hedgehog/eriEur1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz eriEur1.qac eriEur1.qdx
o dog (canFam2)
/cluster/data/canFam2/bed/quality/chrom.qac
/cluster/data/canFam2/broad/foo.agp
qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx
o cat (felCat3)
/cluster/data/felCat3/downloads/assembly.agp
/cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v3.agp.chromosome.qual.gz felCat3.qac felCat3.qdx
o horse (equCab1)
/cluster/data/equCab1/downloads/assembly.agp
/cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz equCab1.qac equCab1.qdx
o cow (bosTau3)
/cluster/data/bosTau3/baylor/chroms/foo.qual
/cluster/data/bosTau3/baylor/foo.agp
qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx
o armadillo (dasNov1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/armadillo/dasNov1
assembly.agp
assembly.quals.gz
combineQuals assembly.agp assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp combined.quals.gz dasNov1.qac dasNov1.qdx
o elephant (loxAfr1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/elephant/loxAfr1
assembly.agp
assembly.quals.gz
combineQuals assembly.agp assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp combined.quals.gz loxAfr1.qac loxAfr1.qdx
o tenrec (echTel1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/echTel1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz echTel1.qac echTel1.qdx
o opossum (monDom4)
/cluster/data/monDom4/broad.mit.edu/foo.qac
/cluster/data/monDom4/broad.mit.edu/foo.agp
qacAddGapIdx in.agp in.qac monDom4.qac monDom4.qdx
o platypus (ornAna1)
/cluster/data/ornAna1
agp files are present, but there are no quality files
o chicken (galGal3)
Unable to find quality data.
o lizard (anoCar1)
/cluster/data/anoCar1/downloads/assembly.agp
/cluster/data/anoCar1/downloads/scaffold.lifted.qac
qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx
o frog (xenTro2)
Unable to find quality data.
o tetraodon (tetNig1)
Unable to find quality data.
o fugu (fr2)
Unable to find quality data.
o stickleback (gasAcu1)
/cluster/data/gasAcu1/downloads/foo.agp
/cluster/data/gasAcu1/downloads/foo.qual
qacAddGapIdx in.agp in.qac gasAcu1.qac gasAcu1.qdx
o medaka (oryLat1)
/cluster/data/oryLat1/bed/qual/foo.qual
/cluster/data/oryLat1/downloads/foo.agp
qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx
o zebrafish (danRer4)
Unable to find quality data.
## NOTE
quality data for chrM needed: dog, guineapig, horse, hedgehog, stickleback, medaka, rat
quality data for chrUn needed: medaka
## copy all .qac and .qdx files to the san
cp *.qac *.qdx /san/sanvol1/rico/quality
## create species list (species.lst) containing the following
anoCar1 /san/sanvol1/rico/quality
bosTau3 /san/sanvol1/rico/quality
canFam2 /san/sanvol1/rico/quality
cavPor2 /san/sanvol1/rico/quality
dasNov1 /san/sanvol1/rico/quality
echTel1 /san/sanvol1/rico/quality
equCab1 /san/sanvol1/rico/quality
eriEur1 /san/sanvol1/rico/quality
felCat3 /san/sanvol1/rico/quality
gasAcu1 /san/sanvol1/rico/quality
loxAfr1 /san/sanvol1/rico/quality
monDom4 /san/sanvol1/rico/quality
oryCun1 /san/sanvol1/rico/quality
oryLat1 /san/sanvol1/rico/quality
otoGar1 /san/sanvol1/rico/quality
panTro2 /san/sanvol1/rico/quality
rheMac2 /san/sanvol1/rico/quality
rn4 /san/sanvol1/rico/quality
sorAra1 /san/sanvol1/rico/quality
tupBel1 /san/sanvol1/rico/quality
## the following script will add quality data to each of the mafs
cat > addQData << 'EOF'
#!/bin/sh
INPUT_DIR=/cluster/data/hg18/bed/multiz28way/anno/maf
OUTPUT_DIR=/cluster/store12/rico/hg18/bed/multiz28way/qual/maf
for maf in `ls -1Sr ${INPUT_DIR}/*.maf`
do
file=`basename $maf`
mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file
done
'EOF'
# Gene frames
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
mkdir frames
cd frames
cat > showGenes.csh << 'EOF'
foreach db (`grep -v hg18 ../species.lst`)
echo " $db"
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
echo -n "${table}: "
hgsql $db -N -e "select count(*) from $table"
endif
end
echo -n "Mrnas: "
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql hg18 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "0"
else
hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"
endif
end
'EOF'
# based on output, pick gene tables, according to the following criteria:
# KG if present, else refGene if >10000 entries, else ensGene (unless dog),
# else mgcGenes, else mrnas if > 10000 else none. In all cases
# except none, add in refGene.
hg18: knownGene
bosTau3: mrna
canFam2: mrna
cavPor2: mrna
danRer4: refGene (13K) or ensGene (36K ?)
equCab1: mrna
fr2: ensGene
galGal3: mrna
gasAcu1: ensGene
mm8: knownGene
monDom4: ensGene
oryCun1: mrna
panTro2: refGene
rheMac2: ensGene
rn4: knownGene ? (8K) or refGene (10K) or ensGene(34K) ?
tetNig1: mrna
xenTro2: mrna
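# (For clarity only, a rough sh restatement of the rule above -- this script
# was not part of the build; the picks above were made by eye from the
# showGenes.csh output. The db name and the canFam2 "unless dog" exception
# are just illustrating the criteria.)
cat > pickGeneTable.sh << 'EOF'
#!/bin/sh
# illustrative sketch: print the gene table the criteria above would pick for $1
db=$1
cnt() { hgsql $db -N -e "select count(*) from $1" 2>/dev/null; }
kg=`cnt knownGene`; ref=`cnt refGene`; ens=`cnt ensGene`; mgc=`cnt mgcGenes`
if [ -n "$kg" ]; then echo "$db: knownGene"
elif [ "${ref:-0}" -gt 10000 ]; then echo "$db: refGene"
elif [ -n "$ens" ] && [ "$db" != canFam2 ]; then echo "$db: ensGene"
elif [ -n "$mgc" ]; then echo "$db: mgcGenes"
else echo "$db: mrna (if >10000 mRNAs) else none"
fi
'EOF'
# usage (illustrative): sh pickGeneTable.sh bosTau3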
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: bosTau3, canFam2, cavPor2, equCab1, galGal3, oryCun1, tetNig1, xenTro2
cat > getGenes.csh << 'EOF'
rm -fr genes
mkdir -p genes
#set mrnaDbs = "bosTau3 canFam2 cavPor2 equCab1 galGal3 oryCun1 tetNig1 xenTro2"
# use only those with databases for now
set mrnaDbs = "bosTau3 canFam2 equCab1 galGal3 oryCun1 tetNig1 xenTro2"
foreach queryDb ($mrnaDbs)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$queryDb > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for rn4 mm8 hg18
# using refGene for panTro2
# using ensGene for danRer4, fr2, gasAcu1, monDom4, rheMac2
# genePreds; (must keep only the first 10 columns for knownGene)
set geneDbs = "hg18 mm8 rn4 danRer4 panTro2 fr2 gasAcu1 monDom4 rheMac2"
foreach queryDb ($geneDbs)
if ($queryDb == "danRer4" || $queryDb == "fr2" || $queryDb == "gasAcu1" || \
$queryDb == "monDom4" || $queryDb == "rheMac2") then
set geneTbl = ensGene
else if ($queryDb == "panTro2") then
set geneTbl = refGene
else if ($queryDb == "hg18" || $queryDb == "mm8" || $queryDb == "rn4") then
set geneTbl = knownGene
endif
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
end
'EOF'
csh getGenes.csh >&! getGenes.log &
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/frames
# leaving out cavPor2 (no db) and tetNig1 (too few gene preds)
(cat ../maf/*.maf | nice genePredToMafFrames hg18 stdin stdout bosTau3 genes/bosTau3.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz monDom4 genes/monDom4.gp.gz equCab1 genes/equCab1.gp.gz | gzip > multiz28way.mafFrames.gz) >& frames.log &
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/frames
nice hgLoadMafFrames hg18 multiz28wayFrames multiz28way.mafFrames.gz >& loadFrames.log &
# from 17way:
hg18 = knownGene
rn4 = knownGene
mm8 = knownGene
panTro1 = ensGene
rheMac2 = mrna
oryCun1 = mrna
#dasNov1 =
canFam2 = mrna
#loxAfr1 =
bosTau2 = mrna
#echTel1 =
#monDom4 =
galGal2 = refGene
xenTro1 = mgcGenes
#tetNig1 =
fr1 = ensGene
danRer3 = mrna
############################################################################
# PHASTCONS FOR 28WAY (2007-04-04 kate)
# generate tree model with branch lengths using phyloFit from Adam
# Siepel's phastCons package. Input is 28way alignments of
# 4-fold degenerate sites (4d sites) determined from a
# nonredundant (non-overlapping) gene set. Elliott Margulies
# has a perl script (extract_coding_alignments.pl) that he used
# with the ENCODE alignments.
# Adam uses his msa_view tool with the --4d option.
# For first try, use Gencode Oct '05 reference set filtered
# to longest transcript, then lifted to hg18
# Compare results with hgClusterGenes and /cluster/bin/phast/refeature,
# and genePredSingleCover
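# The 4d/phyloFit pipeline used below, in outline (a sketch only; file names
# are illustrative -- the real commands further down also strip the db suffix
# from maf sequence names and restrict to ENCODE regions first):
#   # extract 4d sites using a single-coverage gene set, as 1-column alignments
#   msa_view --4d --features genes.gp -i MAF region.maf -o SS | \
#       msa_view -i SS --tuple-size 1 - > region.4d.mfa
#   # aggregate the regions and fit a REV model on the fixed species tree
#   msa_view --aggregate <species,list> mfa4d/*.4d.mfa > all-4d_align.mfa
#   phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV \
#       --tree tree.commas.nh all-4d_align.mfa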
hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05' > gencodeKnown.gp
wc -l gencodeKnown.gp
# 2608 gencodeKnown.gp
hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStart <> 0 and cdsEnd <> 0"
# 1097
hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStartStat='cmpl' and cdsEndStat='cmpl'"
# 752
# Jim's gene uniquifier
hgClusterGenes -noProt hg17 encodeGencodeGeneKnownOct05 \
encodeGencodeGeneKnownOct05Clusters encodeGencodeGeneKnownOct05Canonical
# Got 457 clusters, from 2608 genes in 46 chromosomes
hgsql hg17 -N -e "select transcript from encodeGencodeGeneKnownOct05Canonical order by transcript" > genes.jim
# Adam's feature uniquifier
# requires cdsStart and cdsEnd in gene pred
hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05 where cdsStart<>0 and cdsEnd <> 0' > gencodeKnownCds.gp
wc -l gencodeKnownCds.gp
# 1097 gencodeKnownCds.gp
/cluster/bin/phast/refeature --unique gencodeKnownCds.gp > \
gencodeKnownCdsNR.gff
awk '{print $10}' gencodeKnownCdsNR.gff | sort | uniq | wc -l
# 333
/cluster/bin/phast/refeature -o genepred --unique \
gencodeKnownCds.gp | sort > gencodeKnownCdsNR.gp
wc -l gencodeKnownCdsNR.gp
# 333
awk '{print $1}' gencodeKnownCdsNR.gp | sort > genes.adam
# get intersection
comm -1 -2 genes.jim genes.adam > genes.both
wc -l genes.both
# 235
# genePredSingleCover filters but leaves extended gene pred
genePredSingleCover gencodeKnownCds.gp stdout | sort > gencodeKnownCdsNR2.gp
wc -l gencodeKnownCdsNR2.gp
# 423
awk '{print $1}' gencodeKnownCdsNR2.gp | sort > genes.scov
comm -1 -2 genes.scov genes.both > genes.all
wc -l genes.all
# 224 -- all 3 methods picked these
liftOver -genePred gencodeKnownCdsNR2.gp \
/cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \
gencodeKnown.hg18.gp unmapped.gp
genePredCheck gencodeKnown.hg18.gp
# checked: 423 failed: 0
# all genes mapped
# consider using only intersection of above 3 methods
grep chr22 gencodeKnown.hg18.gp > gencodeKnown.hg18.chr22.gp
/cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.chr22.gp \
-i MAF ../maf/chr22__0.maf > chr22.mfa
# extract ENCODE regions from MAF's
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/4d
hgsql hg18 -N -e \
"select chrom, chromStart, chromEnd, name from encodeRegions" \
> encodeRegions.bed
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/4d
cat > encodeMafs.csh << 'EOF'
mkdir -p encodeMafs
set chroms = `awk '{print $1}' encodeRegions.bed | sort | uniq`
foreach c ($chroms)
echo $c
# needed till mafsInRegion is fixed to handle split maf files
cat ../maf/${c}__?.maf > $c.maf
awk -v CHR=$c '$1 == CHR {print}' encodeRegions.bed > regions.bed
mafsInRegion regions.bed -outDir encodeMafs/ $c.maf
end
'EOF'
csh encodeMafs.csh >&! encodeMafs.log &
# try it out on a few regions
set r = "ENm001"
set r = "ENr231"
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' encodeMafs/$r.maf > $r.clean.maf
# generate ss file
/cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \
-i MAF $r.clean.maf -o SS > $r.4d.3.ss
/cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.3.ss > $r.4d.3.mfa
/cluster/bin/phast/msa_view -i SS --tuple-size 1 $r.4d.3.ss -o SS > $r.4d.1.ss
/cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.1.ss > $r.4d.1.mfa
# now on all regions
cat > encode4d.csh << 'EOF'
mkdir mfa4d
foreach f (encodeMafs/*.maf)
set r = $f:t:r
echo $r
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $f > clean.maf
/cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \
-i MAF clean.maf -o SS | \
/cluster/bin/phast/msa_view -i SS --tuple-size 1 - > mfa4d/$r.4d.mfa
# remove empties to satisfy msa_view --aggregate
if (-z mfa4d/$r.4d.mfa) then
rm mfa4d/$r.4d.mfa
endif
end
'EOF'
csh encode4d.csh >&! encode4d.log &
set species1 = `sed 's/$/,/g' ../species.lst`
set species = `echo $species1 | sed -e 's/ //g' -e 's/,$//'`
# From Elliott's script:
#/cluster/bin/phast/msa_view --aggregate $species EN*.mfa | \
#sed s/"> "/">"/ > some-4d_align.mfa
/cluster/bin/phast/msa_view --aggregate $species mfa4d/EN*.4d.mfa | \
sed s/"> "/">"/ > all-4d_align.mfa
# tweak input tree -- remove common names, include commas
sed 's/[a-z][a-z]*_//g' ../tree/tree.web.commas.nh > tree.commas.nh
# From Elliott's script with Adam's mods (use --EM, MED)
/cluster/bin/phast/phyloFit --EM --precision MED \
--msa-format FASTA --subst-mod REV \
--tree tree.commas.nh all-4d_align.mfa
grep TREE phyloFit.mod | sed 's/TREE\:\ //' > tree_4d.28way.nh
/cluster/bin/phast/tree_doctor --dissect tree_4d.28way.nh | \
awk '$1 == "dparent" {x += $3} END {print x}'
# 9.0516
# extract species distances
/cluster/bin/phast/all_dists tree_4d.28way.nh > 28way.distances.txt
grep hg18 28way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
# get chain stats ordered by distance
awk '{print $2}' distances.txt > species.byDistance
csh ../getChainStats.csh species.byDistance >&! species.chainStats
# spruce up names for tree drawing
/cluster/bin/phast/tree_doctor \
--rename="hg18 -> human ; panTro2 -> chimp ; rheMac2 -> macaque ; otoGar1 -> bushbaby ; tupBel1 -> tree_shrew ; rn4 -> rat ; mm8 -> mouse ; cavPor2 -> guinea_pig ; oryCun1 -> rabbit ; sorAra1 -> shrew ; eriEur1 -> hedgehog ; canFam2 -> dog ; felCat3 -> cat ; equCab1 -> horse ; bosTau3 -> cow ; dasNov1 -> armadillo ; loxAfr1 -> elephant ; echTel1 -> tenrec ; monDom4 -> opossum ; ornAna1 -> platypus ; galGal3 -> chicken ; anoCar1 -> lizard ; xenTro2 -> frog ; tetNig1 -> tetraodon ; fr2 -> fugu ; gasAcu1 -> stickleback ; oryLat1 -> medaka ; danRer4 -> zebrafish" \
tree_4d.28way.nh > tree_4d.28way.common.nh
# compare to Elliott's latest ENCODE tree, pruned to match
/cluster/bin/phast/tree_doctor \
--prune-all-but=human,chimp,macaque,galago,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,monodelphis,platypus,chicken,xenopus \
--rename="xenopus -> frog ; galago -> bushbaby; monodelphis -> opossum"\
encode2007.nh > encode2007.pruned.nh
# my 4d tree with only species in the pruned ENCODE tree
/cluster/bin/phast/tree_doctor \
--prune-all-but=human,chimp,macaque,bushbaby,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,opossum,platypus,chicken,frog \
tree_4d.28way.common.nh > tree_4d.20way.common.nh
# Create chrom mafs from split mafs (do this earlier next time)
ssh kki
cd /cluster/data/hg18/bed/multiz28way
mkdir chromMaf
mkdir run.merge
cd run.merge
cat > doMerge.csh << 'EOF'
#!/bin/csh -ef
set c = $1
set cmaf = ../chromMaf/${c}.maf
# NOTE: need to change mafFilter to retain (and uniquify) comments
# begin with ##maf header
head -1 ../maf/${c}__0.maf > $cmaf
grep -h '# ' ../maf/${c}__?.maf | sed 's/\/scratch\/tmp.* //' | sort | uniq \
>> $cmaf
# don't filter out blocks with alignment this time -- might be needed
# for symmetry with irows version, or for analysis. Check on this.
mafFilter -minRow=1 ../maf/${c}__?.maf >> $cmaf
'EOF'
# << happy emacs
chmod a+x doMerge.csh
cat > spec << 'EOF'
#LOOP
./doMerge.csh $(root1) {check out line+ ../chromMaf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
# 49 jobs
para try
para check
para push
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# large mem jobs so use mini-cluster
ssh kki
cd /cluster/data/hg18/bed/multiz28way
mkdir cons
cd cons
# Create tree model for phastCons
# Adjust model file base composition background and rate matrix to be
# representative of whole-genome (.41 -- as was done for ENCODE)
# using utility, 'modFreqs' from Adam (5/07)
# NOTE: updated all phast source and rebuilt to phast.2007-05-04
set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | \
awk '{printf "%0.3f\n", $3 + $4;}'`
echo $gc
# .41
/cluster/bin/phast.2007-05-04/modFreqs ../4d/phyloFit.mod $gc > 28way.mod
# split 28way mafs into 10M chunks and generate sufficient statistics
# files for phastCons
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/hg18/bed/multiz28way/chromMaf
set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set TMP = /scratch/tmp/$c.clean.maf.$$
#perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \
-M /cluster/bluearc/hg18/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $TMP
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../chromMaf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 49 jobs
para try
para check
para push
# completed shorter jobs in a few hours, but others failed on memory.
# redo on kolossus -- 14 hours!
# NOTE: next time try harder working with split mafs!
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
cd /cluster/data/hg18/bed/multiz28way/cons
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg18/multiz28way/cons
cp -p $grp/$grp.mod $grp/$grp.non-inf .
cp -p $san/ss/$c/$f.ss ../../$grp/$grp.mod ../../$grp/$grp.non-inf $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# << happy emacs
chmod a+x doPhast.csh
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg18/multiz28way/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg18/bed/multiz28way/cons/run.cons/in.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
cp ../28way.mod all.mod
# non-informative option for closest relatives (exclude regions with only these aligning),
# and till Adam fixes the problem, also exclude all species removed from tree (below)
echo "panTro2,rheMac2" > all.non-inf
cd ../run.cons
# Create template file
# root1 == chrom name, file1 == ss file name without .ss suffix;
# the three numbers are expected-length, target-coverage and rho for phastCons
cat > template << 'EOF'
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31
#ENDLOOP
'EOF'
# << happy emacs
cd all
gensub2 ../in.list single ../template jobList
para create jobList
# 337 jobs
para try
para check
para push
# NOTE: These jobs regularly crash (too quick ?), and have to be repushed.
# Also, a few hang, and need to be stopped and restarted.
# The whole batch runs so fast, this isn't a problem
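# Recovery for such a batch is just a repush (hung jobs need a stop first);
# a minimal sketch:
para stop
para push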
# CPU time in finished jobs: 34253s 570.89m 9.51h 0.40d 0.001 y
# IO & Wait Time: 61148s 1019.13m 16.99h 0.71d 0.002 y
# Average job time: 283s 4.72m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 496s 8.27m 0.14h 0.01d
# Submission to last job: 995s 16.58m 0.28h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz28way/cons/all
cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/all
hgLoadBed hg18 phastConsElements28way mostConserved.bed
# Loaded 2183600 elements
# compare with previous tracks
hgsql hg18 -s -N -e "select count(*) from phastConsElements17way"
# 2229902
hgsql hg18 -s -N -e "select count(*) from phastConsElements17way where chrom='chr7'"
# 114703
# Try for 5% overall cov, and 70% CDS cov
featureBits hg18 -enrichment refGene:cds phastConsElements28way >& fb.out &
# Compare to chr7 for 17way -- chr7 is .7% lower than whole genome,
# so aim for 4.3% on chr7
featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements28way
# USED FOR 17WAY
# too little coverage
# 14 .008 .28
# refGene:cds 0.911%, phastConsElements28way 3.551%, both 0.653%, cover 71.74%, enrich 20.20x
# 14 .1 .28
# refGene:cds 0.911%, phastConsElements28way 3.954%, both 0.648%, cover 71.12%, enrich 17.98x
# 12 .1 .28
# refGene:cds 0.911%, phastConsElements28way 3.914%, both 0.644%, cover 70.74%, enrich 18.08x
# 14 .2 .3
# 234653 elements
# refGene:cds 0.911%, phastConsElements28way 4.423%, both 0.659%, cover 72.34%, enrich 16.36x
# 13 .2 .28
# refGene:cds 0.911%, phastConsElements28way 4.266%, both 0.644%, cover 70.73%, enrich 16.58x
# USE THIS ONE
# minimum change to params to achieve coverage
# 14 .2 .28
# 249585 elements
# refGene:cds 0.911%, phastConsElements28way 4.269%, both 0.646%, cover 70.92%, enrich 16.61x
# 15 .2 .28
# refGene:cds 0.911%, phastConsElements28way 4.271%, both 0.647%, cover 71.08%, enrich 16.64x
# 14 .3 .28
# refGene:cds 0.911%, phastConsElements28way 4.644%, both 0.645%, cover 70.89%, enrich 15.27x
# 14 .35 .28
# refGene:cds 0.911%, phastConsElements28way 4.879%, both 0.646%, cover 70.90%, enrich 14.53x
# 14 .15 .3
# 207188 elements
# refGene:cds 0.912%, phastConsElements28way 4.260%, both 0.660%, cover 72.34%, enrich 16.98x
# 16 .15 .3
# 193531 elements
# refGene:cds 0.912%, phastConsElements28way 4.289%, both 0.663%, cover 72.66%, enrich 16.94x
# 20 .15 .3
# 173668 elements
# refGene:cds 0.912%, phastConsElements28way 4.321%, both 0.667%, cover 73.11%, enrich 16.92x
# 24 .15 .3
# 159646 elements
# refGene:cds 0.912%, phastConsElements28way 4.338%, both 0.670%, cover 73.40%, enrich 16.92x
# 30 .15 .3
# 144399 elements
# refGene:cds 0.912%, phastConsElements28way 4.349%, both 0.673%, cover 73.72%, enrich 16.95x
# 40 .15 .3
# 128087 elements
# refGene:cds 0.912%, phastConsElements28way 4.353%, both 0.676%, cover 74.09%, enrich 17.02x
# 50 .15 .3
# 117338 elements
# refGene:cds 0.912%, phastConsElements28way 4.352%, both 0.678%, cover 74.32%, enrich 17.08x
# 50 .1 .3
# 116930 elements
# refGene:cds 0.912%, phastConsElements28way 4.347%, both 0.678%, cover 74.32%, enrich 17.10x
# 50 .05 .3
# 93391 elements
# refGene:cds 0.912%, phastConsElements28way 4.193%, both 0.680%, cover 74.57%, enrich 17.78x
# 50 .07 .3
# 99358
# refGene:cds 0.912%, phastConsElements28way 4.231%, both 0.680%, cover 74.51%, enrich 17.61x
# 45 .07 .3
# 102864 elements
# refGene:cds 0.912%, phastConsElements28way 4.227%, both 0.679%, cover 74.41%, enrich 17.60x
# USE THIS ONE
# matches element count for 17way
# 45 .1 .3
# 110836 elements
# refGene:cds 0.912%, phastConsElements28way 4.277%, both 0.678%, cover 74.33%, enrich 17.38x
# 75 .1 .3
# Try for really long elements
# 93524 elements
# refGene:cds 0.912%, phastConsElements28way 4.279%, both 0.682%, cover 74.73%, enrich 17.47x
# 100 .1 .3
# 85757 elements
# refGene:cds 0.912%, phastConsElements28way 4.270%, both 0.683%, cover 74.90%, enrich 17.54
# 71218 elements
# 200 .1 .3
# refGene:cds 0.912%, phastConsElements28way 4.225%, both 0.686%, cover 75.16%, enrich 17.79x
# 200 .12 .3
# refGene:cds 0.912%, phastConsElements28way 4.241%, both 0.686%, cover 75.13%, enrich 17.72x
# USE THIS ONE
# for really long elements
# 200 .15 .3
# 75659
# refGene:cds 0.912%, phastConsElements28way 4.261%, both 0.685%, cover 75.11%, enrich 17.63x
featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements17way
# refGene:cds 0.911%, phastConsElements17way 4.838%, both 0.639%, cover 70.22%, enrich 14.51x
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
# compare element sizes to other runs:
# e.g. select min(chromEnd-chromStart) from encodeTbaPhastConsEl
# hg17 ENCODE TBA phastCons: min=1, max=1961
# hg17 ENCODE TBA gerp: min=3, max=1426
# hg18 17way: min=1, max=12590 #el on chr7: 114703
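# The same size comparison can be made for the new table; a minimal sketch
# (hgsql usage as above; output columns are min, max, and element count):
hgsql hg18 -s -N \
    -e "select min(chromEnd-chromStart), max(chromEnd-chromStart), count(*) from phastConsElements28way where chrom='chr7'"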
# 45 .3 .31
# featureBits hg18 -enrichment refGene:cds phastConsElements28way
# refGene:cds 1.095%, phastConsElements28way 4.920%, both 0.827%, cover 75.48%, enrich 15.34x
# 2906254 elements
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz28way/cons/all
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cat > listPp.csh << 'EOF'
foreach d (pp/chr*/)
ls $d/*.pp | sort -n -t\. -k2
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin phastCons28way.wig phastCons28way.wib
# about 23 minutes for above
cp -p phastCons28way.wi? /cluster/data/hg18/bed/multiz28way/cons/all
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/all
ln -s /cluster/data/hg18/bed/multiz28way/cons/all/phastCons28way.wib \
/gbdb/hg18/multiz28way
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phastCons28way phastCons28way.wig
# ~ 3 minute load
## Run phastCons on subgroup (placentals)
ssh pk
cd /cluster/data/hg18/bed/multiz28way/cons
# create pruned tree
set species = `cat ../species.lst`
echo $species | sed 's/ /,/g'
#anoCar1,bosTau3,canFam2,cavPor2,danRer4,dasNov1,echTel1,equCab1,eriEur1,felCat3,fr2,galGal3,gasAcu1,hg18,loxAfr1,mm8,monDom4,ornAna1,oryCun1,oryLat1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tetNig1,tupBel1,xenTro2
# setup placental-only run
mkdir placental run.cons/placental
cd placental
# placental-only: exclude from phastCons: 10 non-placentals
# (platypus, opossum, 5 fish, chicken, lizard, frog)
/cluster/bin/phast.new/tree_doctor ../28way.mod \
--prune-all-but=bosTau3,canFam2,cavPor2,dasNov1,echTel1,equCab1,eriEur1,felCat3,hg18,loxAfr1,mm8,oryCun1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tupBel1 \
> placental.mod
echo "panTro2,rheMac2,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \
> placental.non-inf
cd ../run.cons/placental
gensub2 ../in.list single ../template jobList
para create jobList
para try
para check
para push
# ~30 minutes on pk
# NOTE: sometimes jobs crash or hang due to access problems on SAN
# para stop then push to recover
cd ../../
mkdir hqAll run.cons/hqAll
cd hqAll
# high-qual only: exclude 10 low-qual mammals
/cluster/bin/phast.new/tree_doctor 28way.mod \
--prune-all-but=anoCar1,bosTau3,canFam2,danRer4,equCab1,fr2,galGal3,gasAcu1,hg18,mm8,monDom4,ornAna1,oryLat1,panTro2,rheMac2,rn4,tetNig1,xenTro2 \
> hqAll.mod
echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1" \
> hqAll.non-inf
cd ../run.cons/hqAll
gensub2 ../in.list single ../template jobList
para create jobList
para try
para check
para push
cd ../../
mkdir hqPlacental run.cons/hqPlacental
cd hqPlacental
# high-qual placental only: exclude 10 non-placentals and 10 low-qual mammals,
/cluster/bin/phast.new/tree_doctor ../28way.mod \
--prune-all-but=bosTau3,canFam2,equCab1,hg18,mm8,panTro2,rheMac2,rn4 \
> hqPlacental.mod
echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \
> hqPlacental.non-inf
cd ../run.cons/hqPlacental
gensub2 ../in.list single ../template jobList
para create jobList
para try
para check
para push
# add placental elements to Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental
cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/placental
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/placental
hgLoadBed hg18 phastConsElements28wayPlacMammal mostConserved.bed
featureBits hg18 -enrichment refGene:cds phastConsElements28wayPlacMammal >&! ../run.cons/placental/fb.out
# experiments
# USING THIS ONE: min change from 17way to achieve coverage
# 14 .2 .28
# 169516 elements
# 169518
# refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x
# refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x
# USING THIS ONE: vertebrate elements have similar count to 17way ("medium")
# 45 .1 .3
# 76715 elements
# 76718 elements
# refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x
#refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cat > listPp.csh << 'EOF'
foreach d (pp/chr*/)
ls $d/*.pp | sort -n -t\. -k2
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin \
phastCons28wayPlacMammal.wig phastCons28wayPlacMammal.wib
# about 23 minutes for above
cp -p phastCons28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/placental
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/placental
ln -s \
/cluster/data/hg18/bed/multiz28way/cons/placental/phastCons28wayPlacMammal.wib \
/gbdb/hg18/multiz28way
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phastCons28wayPlacMammal phastCons28wayPlacMammal.wig
# WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s)
# NOTE: weird msa_split on this chrom -- sent inquiry to Adam about this
# ~ 3 minute load
########################################################################
# phyloP conservation
# split SS files into 100K chunks (5 min./job)
ssh kki
cd /cluster/data/hg18/bed/multiz28way/cons/
mkdir run.phyloP.split
cd run.phyloP.split
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set c = $1
set san = /san/sanvol1/scratch/hg18/multiz28way
set in = $san/cons/ss
set out = $san/phyloP/ss
set PHASTBIN = /cluster/bin/phast.2007-05-04
@ i=0
foreach f (`ls $in/$c/*.ss | sort -n -t\. -k2`)
@ i++
mkdir -p $out/$c/$i
$PHASTBIN/msa_split $f -i SS -o SS \
-r $out/$c/$i/$c.$i -w 100000,0 -I 1000 -B 5000
end
echo "Done" >> $out/$c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
set san = /san/sanvol1/scratch/hg18/multiz28way
set JOBS = /cluster/data/hg18/bed/multiz28way/cons/run.phyloP.split/jobList
rm -f $JOBS
foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo "doSplit.csh $c {check out line+ $san/phyloP/ss/$c.done}" >> $JOBS
end
para create jobList
# 49 jobs
para try
para check
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 8827s 147.12m 2.45h 0.10d 0.000 y
# IO & Wait Time: 6837s 113.95m 1.90h 0.08d 0.000 y
# Average job time: 320s 5.33m 0.09h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1343s 22.38m 0.37h 0.02d
# Submission to last job: 1528s 25.47m 0.42h 0.02d
########################################################################
# phyloP scoring method experiments on chr7 (2008-11-11 kate)
ssh pk
cd /cluster/data/hg18/bed/multiz28way/cons
mkdir -p run.phyloPMethod
cd run.phyloPMethod
cat > doPhyloP.csh << 'EOF'
set method = $1
set f = $2
set out = $3
set c = $f:r:r
set n = $f:r:e
set tmp = /scratch/tmp/$f
mkdir -p $tmp
cp -p /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss/$c/$n/$f.ss ../tree.mod $tmp
pushd $tmp > /dev/null
# Built phast from CornellCVS on 11/11/08 in /cluster/bin/phast.build.
# Symlinked the bin to /cluster/bin/phast.2008
set PHASTBIN = /cluster/bin/phast.2008-11-13
# PHAST version is 0.9.9.8b
$PHASTBIN/phyloP --method $method --mode CONACC --wig-scores --chrom $c \
-i SS tree.mod $f.ss > $f.wig
popd > /dev/null
mkdir -p $out:h
mv $tmp/$f.wig $out
rm -fr $tmp
'EOF'
# Create list of chunks (just chr7 for now)
pushd /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss
ls chr7/*/chr7.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
/cluster/data/hg18/bed/multiz28way/cons/run.phyloPMethod/in.list
popd
# setup run
mkdir -p all
cd all
cp ../../28way.mod tree.mod
mkdir -p SCORE
cd SCORE
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../../doPhyloP.csh SCORE $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/SCORE/$(path1).wig}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 ../../in.list single template jobList
para create jobList
para try
para check
para push
# Completed: 1552 of 1552 jobs
# CPU time in finished jobs: 15411s 256.84m 4.28h 0.18d 0.000 y
# IO & Wait Time: 7678s 127.97m 2.13h 0.09d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest finished job: 29s 0.48m 0.01h 0.00d
# Submission to last job: 236s 3.93m 0.07h 0.00d
# Estimated complete: 0s 0.00m 0.00h 0.00d
cd ..
mkdir -p LRT
cd LRT
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../../doPhyloP.csh LRT $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/LRT/$(path1).wig}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 ../../in.list single template jobList
para create jobList
para try
para check
para push
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/all
cat > listPp.csh << 'EOF'
foreach c (`ls -d chr*`)
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin phyloP28way.wig phyloP28way.wib
mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/all
cp -p phyloP28way.wi? /cluster/data/hg18/bed/multiz28way/cons/phyloP/all
# setup placental run
mkdir -p placental
cd placental
cp ../../placental.mod tree.mod
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../doPhyloP.csh $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental/$(path1).wig}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 ../in.list single template jobList
para create jobList
para try
para check
para push
#CPU time in finished jobs: 1934553s 32242.55m 537.38h 22.39d 0.061 y
#IO & Wait Time: 82007s 1366.78m 22.78h 0.95d 0.003 y
#Average job time: 70s 1.16m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 37642s 627.37m 10.46h 0.44d
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
ssh pk
cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental
# check for clean dir here -- chr* will match garbage if it's there
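# Quick sanity check before assembling the file list (sketch): anything listed
# here that is not a chr* directory would get swept into the wigEncode input.
ls | grep -v '^chr'
# (should print nothing)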
cat > listPp.csh << 'EOF'
foreach c (`ls -d chr*`)
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin phyloP28wayPlacMammal.wig phyloP28wayPlacMammal.wib
mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental
cp -p phyloP28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/phyloP/all
ln -s \
/cluster/data/hg18/bed/multiz28way/cons/phyloP/all/phyloP28way.wib \
/gbdb/hg18/multiz28way/phyloP28way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phyloP28way phyloP28way.wig
# WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s)
cd ../placental
ln -s \
/cluster/data/hg18/bed/multiz28way/cons/phyloP/placental/phyloP28wayPlacMammal.wib \
/gbdb/hg18/multiz28way/phyloP28wayPlacMammal.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phyloP28wayPlacMammal phyloP28wayPlacMammal.wig
hgWiggle phyloP28wayChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 stdin
# 0.000000 **************** 26649187
# 0.200000 ************************************************************ 101774235
# 0.400000 ********** 16325655
# 0.600000 *** 4331032
# 0.800000 * 1029490
# 1.000000 0
# 1.200000 456666
# 1.400000 0
# 1.600000 240876
# 1.800000 0
# 2.000000 246969
# 2.200000 0
# 2.400000 0
# 2.600000 0
# 2.800000 134764
cd ../placental
hgWiggle phyloP28wayPlacMammalChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 stdin
cd ../../all
hgWiggle phastCons28wayChr7Short | textHistogram -col=2 -real -skip=7 -binSize=.1 stdin
# 0.000000 ************************************************************ 128445730
# 0.100000 **** 7648620
# 0.200000 ** 3473415
# 0.300000 * 1986801
# 0.400000 * 1399849
# 0.500000 * 1096292
# 0.600000 912539
# 0.700000 893991
# 0.800000 1008630
# 0.900000 * 2940535
# 1.000000 * 1383115
############################################################################
# PhyloP experiments with new scoring methods: LRT and SCORE, implemented in 2008
# Using new PHAST package (rebuilt from cornellCVS)
# chr7-only
# 2008-11-11 kate
############################################################################
# DOWNLOADS FOR 28WAY (2007-05-30 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
cat > downloads.csh << 'EOF'
date
set dir = /cluster/data/hg18/bed/multiz28way
mkdir -p mafDownloads
cd $dir/mafDownloads
foreach f (../maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
cd $dir
mkdir -p phastConsDownloads/vertebrate phastConsDownloads/placental
cd /san/sanvol1/scratch/hg18/multiz28way/cons
foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo $chr
cat `ls -1 all/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> $dir/phastConsDownloads/vertebrate/$chr.pp.gz
cat `ls -1 placental/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> $dir/phastConsDownloads/placental/$chr.pp.gz
end
cd /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate
md5sum *.gz > md5sum.txt
cd ../placental
md5sum *.gz > md5sum.txt
date
'EOF'
csh downloads.csh >&! downloads.log &
# << happy emacs
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons28way
mkdir -p $dir/vertebrate $dir/placental
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate/{*.gz,md5sum.txt} $dir/vertebrate
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/placental/{*.gz,md5sum.txt} $dir/placental
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way/README.txt $dir
# edit this file to reflect the latest releases used.
vi $dir/README.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz28way/maf
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz28way/mafDownloads/{*.gz,md5sum.txt} $dir
# upstream mafs (mafFrags takes a while)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed
rm up.bad up.bad2
nice mafFrags hg18 multiz28way up.bed upstream$i.maf \
-orgs=/cluster/data/hg18/bed/multiz28way/species.lst
rm up.bed
end
date
'EOF'
# << happy emacs
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
csh mafFrags.csh > mafFrags.log &
nice gzip up*.maf
md5sum up*.gz >> md5sum.txt
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
# link filtered nets and chains to downloads area (doRecipBest.pl could
# be changed for this)
# Species where syntenic net was used
foreach db (panTro2 rheMac2 equCab1 canFam2 bosTau3 mm8 rn4 monDom4)
echo $db
set cd = /cluster/data/hg18/bed/blastz.$db/axtChain
cd $cd
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set f = hg18.$db.syn.net.gz
if (! -e $f) then
netFilter -syn hg18.$db.net.gz > $f
endif
set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db
ln -s $cd/$f $d
nice md5sum $f >> $d/md5sum.txt
end
# Create downloads dir for new species without genome databases
#foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
# NOTE: Keeping these only on genome-test for now.
foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
echo $db
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set d = /usr/local/apache/htdocs/goldenPath/hg18
mkdir -p $d/vs$Db
cp $d/vsOryCun1/README.txt $d/vs$Db
set bd = /cluster/data/hg18/bed/blastz.$db
cd $bd/axtChain
set f = hg18.$db.net.gz
if (! -e $f) then
cat net/*.net | gzip -c > $f
endif
nice md5sum hg18.$db.{all.chain,net}.gz > md5sum.txt
cd ..
nice md5sum axtNet/*.gz >> axtChain/md5sum.txt
ln -s $bd/axtChain/hg18.$db.{all.chain,net}.gz $d/vs$Db
ln -s $bd/axtChain/md5sum.txt $d/vs$Db
ln -s $bd/axtNet $d/vs$Db
end
# EDIT README's for the species
# Post reciprocal best nets
# NOTE: Keeping these only on genome-test for now.
cat > downloads4.csh << 'EOF'
foreach db (felCat3 otoGar1 loxAfr1 oryCun1 echTel1 dasNov1 \
tupBel1 cavPor2 eriEur1 sorAra1)
echo $db
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db
set cd = /cluster/data/hg18/bed/blastz.$db/axtChain
ln -s $cd/hg18.$db.rbest.{chain,net}.gz $d
cd $d
md5sum hg18.$db.rbest.{chain,net}.gz >> md5sum.txt
end
'EOF'
# EDIT README's to include reciprocal best chains & nets
############################################################################
# 28-way PhyloP downloads
# 2008-10-21 kate
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz28way/phyloP
cat > merge.csh << 'EOF'
set out = $1
rm -f *.lst
foreach c (`ls -d chr*`)
echo $c
touch $c.lst
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3 >> $c.lst
end
# concatenate the chunks once the whole per-chrom file list is assembled
xargs < $c.lst cat > $out/$c.wigFix
end
'EOF'
# all species
cd all
csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/all > merge.log
cd ../placental
csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental > merge.log
cd /cluster/data/hg18/bed/multiz28way/cons/phyloP
# post to downloads
cd /usr/local/apache/htdocs/goldenPath/hg18
mkdir phyloP28way
cd phyloP28way
ln -s /cluster/data/hg18/bed/multiz28way/cons/phyloP/{all,placental} .
cd all
nice gzip *.wigFix
cd ../placental
nice gzip *.wigFix
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
# see hg17.txt for build temporary ccds database for CCDS.20070228
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg18 -verbose=2 ccdsGene
# update all.joiner to include hg18 in ccdsDb
joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# check in browser
# request push of
# ccdsGene
# ccdsInfo
# ccdsKgMap
# ccdsMgcMap
# << emacs
#########################################################################
# RECIPROCAL BEST CHIMP PANTRO2 (2007-03-02 kate)
# Requested by Daryl
cd /cluster/data/hg18/bed/blastz.panTro2
doRecipBest.pl hg18 panTro2 >&! rbest.log &
#########################################################################
# EPONINE-TSS (TRANSCRIPTION START SITE) PREDICTION
# (DONE, 2007-03-08, hartera)
# The Eponine software is version 2 and has not changed in several years
# (contact: Thomas Down at Sanger, td2 at sanger.ac.uk). The version downloaded
# for hg16 should be the same as the current version but download again just
# to check. The application includes the TSS model file: eponine-tss2.xml
ssh kkstore02
# Eponine runs fine on 2.5Mb contig, but barfs on much larger contig;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
mkdir /san/sanvol1/scratch/hg18/chunks
cd /cluster/data/hg18
foreach f (?{,?}/NT_*/NT_??????.fa)
set ctg = $f:t:r
/cluster/bin/x86_64/faSplit -minGapSize=10 \
-lift=/san/sanvol1/scratch/hg18/chunks/${ctg}.lft \
gap $f 2500000 /san/sanvol1/scratch/hg18/chunks/${ctg}.chunk
end
# seems to ignore the chunk part of the file name
mkdir /cluster/data/hg18/bed/eponine
cd /cluster/data/hg18/bed/eponine
wget --timestamping \
http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
# file has the same date and same size as the one downloaded for hg16
# the script requires all of the path settings found in my .tcshrc file.
# Using only set path = (/usr/java/jre1.5.0_06/bin $path)
# as in the doEpo file for hg16 does not work.
cat << '_EOF_' > doEpo
#!/bin/csh -ef
set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \
/usr/local/bin . /cluster/home/hartera/bin/x86_64 \
/cluster/bin/x86_64 /projects/compbio/bin/x86_64 \
/projects/compbio/bin /projects/compbio/bin/x86_64-linux \
/cluster/bin/scripts)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
chmod a+x doEpo
cp /dev/null jobList
foreach f (/san/sanvol1/scratch/hg18/chunks/NT*.fa)
echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
>> jobList
end
mkdir out
ssh pk
cd /cluster/data/hg18/bed/eponine
/parasol/bin/para create jobList
/parasol/bin/para try, check, push, check etc.....
/parasol/bin/para time
# Completed: 1408 of 1408 jobs
# CPU time in finished jobs: 105248s 1754.13m 29.24h 1.22d 0.003 y
# IO & Wait Time: 4369s 72.82m 1.21h 0.05d 0.000 y
# Average job time: 78s 1.30m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 104s 1.73m 0.03h 0.00d
# Submission to last job: 1295s 21.58m 0.36h 0.01d
# lift chunks -> contigs
mkdir contigs/
foreach l (/san/sanvol1/scratch/hg18/chunks/*.lft)
set ctg = $l:t:r
liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff /cluster/data/hg18/jkStuff/liftAll.lft \
warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
{printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
i = i + 1;}' \
eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg18/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
$HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg18 eponine eponine.bed -tab -sqlTable=eponine.sql
# Loaded 61359 elements of size 6
# trackDb.ra entry and eponine.html already exist in trackDb directory.
###########################################################################
# ACEScan Track (DONE 2007-03-15 Andy)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir acescan
cd acescan/
cp /cluster/data/hg17/bed/acescan/acescan.hg17.gp .
liftOver -genePred acescan.hg17.gp /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
acescan.hg18.gp unmapped
ldHgGene -predTab hg18 acescan acescan.hg18.gp
##############################################################################
# Update central DB gdbPdb table in preparation for KG III (DONE 3/22/07, Fan)
mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest
update gdbPdb set proteomeDb = "proteins070202" where genomeDb = "hg18";
quit
##############################################################################
# UPDATE CGAP TABLES (DONE, 3/26/07, Fan)
cd /cluster/data/hg18/bed/ucsc.10
mkdir cgap
cd cgap
wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat"
hgCGAP Hs_GeneData.dat
cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab
hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab
hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab
cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab
hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab
##############################################################################
# UPDATE CGAP TABLES (DONE, 8/05/08, JK)
cd /cluster/data/hg18/bed/ucsc.11
mkdir cgap
cd cgap
wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat"
hgCGAP Hs_GeneData.dat
cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab
hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab
hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab
cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab
hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab
##############################################################################
## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
cat << '_EOF_' > DEF
# human vs lancelet
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
# Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ2_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit
SEQ2_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18BraFlo1 > do.log 2>&1 &
# real 458m43.961s
cat fb.hg18.chainBraFlo1Link.txt
# 26455595 bases of 2881515245 (0.918%) in intersection
# test reciprocal best chains/nets for 5-way maf alignments
# (this did not work correctly when run on braFlo1 itself)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 braFlo1 \
> rbest.log 2>&1 &
# real 105m14.176s
# and now the swap, also documented in braFlo1.txt
mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
cd /cluster/data/braFlo1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-swap > swap.log 2>&1 &
# real 83m46.258s
cat fb.braFlo1.chainHg18Link.txt
# 30912893 bases of 923355587 (3.348%) in intersection
##############################################################################
# RE-BUILD knownGeneList, (DONE, 3/29/07, Fan)
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18
##############################################################################
# Update entrez DB tables.
cd /cluster/store10/entrez
mkdir 070329
ln -s /cluster/store10/entrez/070329 /cluster/data/entrez/070329
cd /cluster/data/entrez/070329
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g'|cut -f 1,2,4 > entrezRefProt.tab
hgLoadSqlTab entrez entrezRefseq ~/src/hg/lib/entrezRefseq.sql ./entrezRefseq.tab
hgLoadSqlTab entrez entrezMrna ~/src/hg/lib/entrezMrna.sql ./entrezMrna.tab
hgLoadSqlTab entrez entrezRefProt ~/src/hg/lib/entrezRefProt.sql ./entrezRefProt.tab
cd /cluster/data/hg18/bed/ucsc.10
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgLoadSqlTab hg18 mrnaRefseq ~/src/hg/lib/mrnaRefseq.sql ./mrnaRefseq.tab
##############################################################################
# RE-BUILD KEGG RELATED TABLES FOR KG III. (DONE, 3/29/07, Fan)
wget --timestamping -O hsa.html \
"http://www.genome.ad.jp/dbget-bin/www_bfind_sub?dbkey=pathway&keywords=hsa&mode=bfind&max_hit=1000&.cgifields=max_hit"
grep href hsa.html | perl -wpe "s/<[^>]+>//g" > hsa.lis
# edit hsa.lis to remove the first (blank) line and the last line, which is unrelated.
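# A non-interactive equivalent of that edit (sketch, assuming GNU sed; the
# manual edit above is what was actually done):
sed -i -e '1{/^$/d;}' -e '$d' hsa.lis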
~/kent/src/hg/protein/getKeggList2.pl hsa > keggList.tab
hgLoadSqlTab hg18 keggList ~/src/hg/lib/keggList.sql ./keggList.tab
# Before running hgKegg3, make sure entrez DB is updated.
hgKegg3 hg18 hg18
# Load resulting data
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
hgLoadSqlTab hg18 keggMapDesc ~/src/hg/lib/keggMapDesc.sql ./keggMapDesc.tab
##############################################################################
# REATTACH KEGG TO KNOWN GENES. (DONE, 8/12/08, JK)
mkdir -p /cluster/data/hg18/bed/ucsc.11/kegg
cd /cluster/data/hg18/bed/ucsc.11/kegg
kgAttachKegg hg18 ../../ucsc.10/kegg/keggList.tab keggPathway.tab
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
##############################################################################
# REATTACH SPMRNA TABLE TO KNOWN GENES. (DONE, 8/12/08, JK)
hgsql hg18 -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab;
hgLoadSqlTab hg18 spMrna ~/kent/src/hg/lib/spMrna.sql spMrna.tab
##############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 3/27/07 Fan)
# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download
wget --timestamping \
http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
vi genes.tab
vi pathways.tab
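# A non-interactive sketch of the same cleanup, assuming the header lines are
# the '#'-prefixed lines at the top of the .col files (as in the 8/05/08
# update below):
grep -v '^#' genes.col > genes.tab
grep -v '^#' pathways.col > pathways.tab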
hgsql hg18 -e 'create database bioCyc070327'
hgLoadSqlTab bioCyc070327 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
hgLoadSqlTab bioCyc070327 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
# Create bioCycMapDesc.tab
hgsql bioCyc070327 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab
# Create bioCycPathway.tab
kgBioCyc0 bioCyc070327 hg18 hg18
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
##########################################################################
# PARTIAL UPDATE OF BIOCYCTABLES NEEDED BY hgGene (DONE 8/05/08 JK)
# Note: ideally we would get new data from BioCyc, but they never sent the
# download URL even though I filled out their web form a week ago, so we are
# reusing the 3/27/07 pathways.col and genes.col files. I did write a new
# kgBioCyc1 to do the actual load, and it is used for the new UCSC Genes.
# It looks to be a slight improvement: about 10% more genes in pathways.
mkdir /cluster/data/hg18/bed/ucsc.11/bioCyc
cd /cluster/data/hg18/bed/ucsc.11/bioCyc
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/pathways.col > pathways.tab
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/genes.col > genes.tab
kgBioCyc1 genes.tab pathways.tab hg18 bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg18/bed/switchDbTss
cd /cluster/data/hg18/bed/switchDbTss
ln -s /cluster/data/hg17/bed/switchDbTss/switchDbTss.bed hg17.bed
liftOver -bedPlus=5 hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed unMapped
wc -l unMapped
#12 unMapped (12 are "deleted in new")
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg18 switchDbTss hg18.bed
###########################################################################
# ADD KG TO TREEFAM LINKS (DONE, 2007-04-13 Fan)
# Generate ucscToEnsembl.txt and send it to TreeFam
# zhongzhongchen [chenzhzh at genomics.org.cn]
hgsql hg18 -N -e 'select * from knownToEnsembl' >ucscToEnsembl.txt
ssh hgwdev
cd /cluster/store12
mkdir treeFam070413
ln -s /cluster/store12/treeFam070413 /cluster/data/treeFam
cd /cluster/data/treeFam
# Receive the following files from TreeFam
ucscToEnsemblToTreefamToRefToUniprot.txt
ucscToEnsemblToTreefamToRef.txt
ucscToEnsemblTotreefam.txt
# Use ucscToEnsemblTotreefam.txt to construct knownToTreefam table.
cut -f 1,3 ucscToEnsemblTotreefam.txt >knownToTreefam.tab
hgLoadSqlTab hg18 knownToTreefam \
~/src/hg/lib/knownToTreefam.sql ./knownToTreefam.tab
# Add the following section into kent/src/hg/hgGene/hgGeneData/links.ra
name treeFam
shortLabel Treefam
tables knownToTreefam
idSql select value from knownToTreefam where name = '%s';
url http://www.treefam.org/cgi-bin/TFinfo.pl?ac=%s
priority 10
###########################################################################
# BLASTZ/CHAIN/NET HORSE (equCab1) (STARTED 2/16/07, DONE 2/21/07, Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
# NOTE: THE TARGET WAS ORIGINALLY INTENDED TO BE HORSE, BUT I DID NOT
# DISCOVER THIS UNTIL THE TASK WAS DONE.
cat << '_EOF_' > DEF
# Horse vs. Human
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse equCab1
SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ2_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-15
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab1/blastz.hg18 >& do.log &
tail -f do.log
ln -s /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 /cluster/data/hg18/bed/blastz.equCab1
nice featureBits hg18 -chrom=chr1 chainEquCab1Link
# 132947074 bases of 224999719 (59.088%) in intersection
ssh hgwdev
cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
bash
time nice -n 19 featureBits hg18 chainEquCab1Link \
> fb.hg18.chainEquCab1Link.txt 2>&1 &
# 1643928877 bases of 2881515245 (57.051%) in intersection
#########################################################################
# enable ORFeome track build. (markd 2007-05-02)
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add
hg18.orfeomeTables.hgwdev = yes
hg18.orfeomeTables.hgwbeta = yes
# will need to enable for rr later. In the future, this can just be enabled
# as part of the normal genbank build. Change above to:
hg18.orfeomeTables.default = yes
#########################################################################
# exaptedRepeats track (4/30/07, Craig)
# for full methods and analysis see: Lowe, Bejerano, Haussler.
# Thousands of human mobile element fragments undergo
# strong purifying selection near developmental genes.
# PNAS. (in press). Epub 2007 Apr 26.
#
# Code to re-make this track is in:
# build36/bed/exapted/create.csh
#
# To re-make the track all you have to do is run that c-shell
# while you are in its directory.
# It is easiest if you are on hgwdev since it uses featureBits a few times
# and gets some info from the sql database. I would say it takes
# about two hours to run.
#
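# A minimal sketch of that re-run (the absolute path for build36/bed/exapted
# is assumed here):
ssh hgwdev
cd /cluster/data/hg18/bed/exapted
csh create.csh >& create.log &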
#######################################################################
# UCSC GENES (DONE 2007-03-xx kent)
see file: ucscGenes10.txt
#######################################################################
# CGAP SAGE (DONE 2007-04-17 Andy)
ssh hgwdev
bash
mkdir /san/sanVol1/scratch/andy/cgapSage
cd /san/sanVol1/scratch/andy/cgapSage
echo "select * from cgapSageLib" | hgsql hg18 | tail +2 > libs.txt
echo "select * from snp127 where class='single' and locType='exact'" | hgsql hg18 | tail +2 | cut -f2- > allSnpss.txt
echo "select name from snp127Exceptions where exception='ObservedWrongSize' or exception='SingleClassBetweenLocType' or exception='SingleClassRangeLocType' or exception='MultipleAlignment'" | hgsql hg18 | tail +2 > exceptions
tabGrep -v exceptions 4 allSnps.txt > snps.txt
rm allSnps.txt exceptions
echo select chrom,chromStart,chromEnd,name from simpleRepeat | hgsql hg18 | tail +2 > trf.bed
cut -f1-4 snps.txt > snps.bed
overlapSelect -selectFmt=bed -inFmt=bed -nonOverlapping trf.bed snps.bed snps.noTrf.bed
cut -f4 snps.noTrf.bed > snps.noTrf
tabGrep snps.noTrf 4 snps.txt > snps.noTrf.txt
mv snps.noTrf.txt snps.txt
grep -v random /cluster/data/hg18/chrom.sizes | grep -v hap > chrom.sizes
mkdir chromSnps
for c in `cat chrom.sizes | cut -f1`; do
awk "{if (\$1==\"$c\") print;}" snps.txt > chromSnps/$c.snps.txt;
echo $c;
done
rm snps.txt
wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs.libraries.gz
gunzip Hs.libraries.gz
cat << "EOF" > cleanLibs.awk
BEGIN{FS="\t"}
{
for (i = 1; i <= 12; i++)
{
printf("%s\t", $i);
}
sex = "";
if ($13=="male")
{
sex = "male,";
}
else if ($13=="female")
{
sex = "female,";
}
else if ($13=="male and female")
{
sex = "male,female,";
}
else if ($13=="unknown")
{
sex = "";
}
printf("%s\t", sex);
for (i = 14; i <= 20; i++)
{
printf("%s\t", $i);
}
printf("%s\n", $21);
}
EOF
tail +2 Hs.libraries | awk -f cleanLibs.awk > libs.txt
ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql
hgLoadSqlTab hg18 cgapSageLib cgapSageLib.sql libs.txt
partitionSequence.pl -lstDir small 5000000 30 hg18.2bit chrom.sizes 0 > sequence.lst
grep -v small sequence.lst > seq.lst
cat small/* >> seq.lst
mv seq.lst sequence.lst
rm -rf small/
wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs_long.frequencies.gz
gunzip Hs_long.frequencies.gz
cat << "EOF" > doJobList.sh
#!/bin/bash
# basic vars
part=$1;
range=${part#*2bit:};
chrom=${range%:*};
nums=${range#*:}
firstnum=${nums:0:1}
outDir=output/${chrom}/${firstnum}
mkdir -p $outDir
echo ./doFind.sh $1 {check out exists `pwd`/${outDir}/${range}.bed}
EOF
chmod +x doJobList.sh
for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done
cat << "EOF" > doFind.sh
#!/bin/bash
# basic vars
part=$1;
range=${part#*2bit:};
chrom=${range%:*};
nums=${range#*:}
firstnum=${nums:0:1}
# dirs/files
startDir=`pwd`
scratch=/scratch/tmp/$part
output=$2
# begin
mkdir -p $scratch
pushd $scratch
twoBitToFa -noMask $startDir/"$part" part.fa
cgapSageFind part.fa $startDir/Hs_long.frequencies $startDir/libs.txt \
$startDir/chromSnps/${chrom}.snps.txt output.bed
cp output.bed $output
popd
rm -rf $scratch
EOF
chmod +x doFind.sh
ssh pk
cd /san/sanVol1/scratch/andy/cgapSage
para create jobList
para try
para push
# takes like 5-10 min
exit
# back to hgwdev
find output/ -name '*.bed' -exec cat '{}' >> output.bed \;
cgapSageDupeRemove output.bed tmp.bed
cgapSageDupeRemove -unique tmp.bed final.bed
ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql
hgLoadBed -sqlTable=cgapSage.sql -tab hg18 cgapSage final.bed
#########################################################################
# HapMap SNPs (DONE 2007-05-23 Andy)
# rel22
# OBSOLETED by Phase II+III SNPs 3/09 angie (see HAPMAP REL27 GENOTYPES)
# Tables renamed to [originalName]PhaseII 3/9/09
ssh hgwdev
bash
cd /cluster/data/hg18/bed
mkdir -p hapmap/zips
cd hapmap/zips
# archived to http://www.hapmap.org/genotypes/2007-03
wget -nd -r -N -A html http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/
grep gz index.html | sed 's/^.*href=\"\(geno.*\.txt\.gz\)\".*$/\1/' > files.txt
wget -N -i files.txt --base=http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/
rm index.html robots.txt files.txt
cd ../
mkdir samples
cd samples/
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CEU.txt.gz
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CHB.txt.gz
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_JPT.txt.gz
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_YRI.txt.gz
cp /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/*.pl .
ln -s ../zips
./filterPedigree.pl < pedinfo2sample_CEU.txt > filtered.CEU
./filterPedigree.pl < pedinfo2sample_YRI.txt > filtered.YRI
zcat zips/*chr22_CEU* | head -1 | tr ' ' '\n' > header.CEU
zcat zips/*chr22_YRI* | head -1 | tr ' ' '\n' > header.YRI
grep -n -f filtered.CEU header.CEU | cut -f1 -d':' > offsets.CEU
grep -n -f filtered.YRI header.YRI | cut -f1 -d':' > offsets.YRI
for pop in CEU YRI CHB JPT; do
for f in zips/genotypes_chr*_${pop}_r22_nr.b36.txt.gz; do
zcat $f | ./filter${pop}.pl >> ../${pop}.merge
echo Done with $f
done
done
cd ../
for pop in CEU YRI CHB JPT; do
~/kent/src/hg/snp/snpLoad/hapmap1 ${pop}.merge ${pop}.condense
mv hapmap1.log ${pop}.hapmap1.log
done
wc -l *.log
#0 CEU.hapmap1.log
#0 CHB.hapmap1.log
#0 JPT.hapmap1.log
#0 YRI.hapmap1.log
#0 total
rm *.log
cp ~/kent/src/hg/lib/hapmapSnps.sql .
for pop in CEU CHB JPT YRI; do
sed "s/hapmapSnps/hapmapSnps$pop/" hapmapSnps.sql > hapmapSnps${pop}.sql
hgLoadBed -sqlTable=hapmapSnps${pop}.sql hg18 hapmapSnps$pop ${pop}.condense
done
# Don't worry if you see:
#load of hapmapSnpsCEU did not go as planned... etc.
# unless it says rows skipped.
~/kent/src/hg/snp/snpLoad/hapmap2 hg18
#building CEU hash...
#Can't start query:
#select * from hapmapAllelesCEU
#
#mySQL error 1146: Table 'hg18.hapmapAllelesCEU' doesn't exist
# But this works:
~heather/kent/src/hg/snp/snpLoad/hapmap2 hg18
# (gotta bug Heather about that one)
ln -s ~/kent/src/hg/lib/hapmapSnpsCombined.sql
hgLoadBed -sqlTable=hapmapSnpsCombined.sql hg18 hapmapSnpsCombined hapmapSnpsCombined.tab
# Checks:
~heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg18 hapmapSnpsCombined
#match count = 0
### clean up
rm *.sql hapmapSnpsCombined.tab bed.tab
tar cfvz merge.tar.gz *.merge
tar cfvz condense.tar.gz *.condense
rm *.condense *.merge
mkdir logs
mv *.errors *.log *.out logs
mkdir orthos
cd orthos/
# hgWiggle output has the chromosome in a comment, followed by the values
# This script prints that chromosome on every line
cat << "EOF" > joinify.awk
{
if ($1 == "variableStep")
{
sub("chrom=", "", $2);
chrom = $2;
}
else if ($1 != "#")
{
printf("%s,%s\t%s\n", chrom, $1, $2);
}
}
EOF
cat << "EOF" > join.sh
#!/bin/bash
sed 's/\(^chr\w\+\)\t/\1,/' $1 > bed
sort -k1,1 bed > tmp; mv tmp bed
awk -f joinify.awk $2 > scores
sort -k1,1 scores > tmp; mv tmp scores
join -1 1 -2 1 bed scores | tr ',' ' ' |
awk '{printf("%s\t%s\t%s\t%s\t%d\t%s\t%s\n", $1, $2, $3, $4, $8, $6, $7);}' > qual.tab
rm scores bed
EOF
chmod +x join.sh
# chimp alleles
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed
cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/panTro2.bed.new
cd /cluster/data/hg18/bed/hapmap/orthos
hgWiggle -db=panTro2 -bedFile=panTro2.bed quality > panTro2.scores
# create qual.tab; combine panTro2 sequence with panTro2 quality score
./join.sh panTro2.bed.new panTro2.scores
grep chr21 panTro2.bed.new >> qual.tab
grep chrY panTro2.bed.new >> qual.tab
# create snpOrtho.tab; a table in human coords that has associated ortho alleles
~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab
sed 's/snpOrtho/snp126OrthoPanTro2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoPanTro2.sql
hgLoadBed -tab -sqlTable=snpOrthoPanTro2.sql hg18 snp126OrthoPanTro2 snpOrtho.tab
mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21";
mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY";
mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random";
# get the HapMap subset
sed 's/hapmapAllelesOrtho/hapmapAllelesChimp/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesChimp.sql
~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoPanTro2
hgLoadBed -tab -sqlTable=hapmapAllelesChimp.sql hg18 hapmapAllelesChimp hapmapOrtho.tab
# sanity check
mysql> select count(*) from hapmapAllelesChimp where chrom = orthoChrom;
# 3,492,708
mysql> select count(*) from hapmapAllelesChimp where chrom != orthoChrom;
# 374,010
# macaque alleles
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed
cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/rheMac2.bed.new
cd /cluster/data/hg18/bed/hapmap/orthos
hgWiggle -db=rheMac2 -bedFile=rheMac2.bed quality > rheMac2.scores
# create qual.tab: combine rheMac2 sequence with rheMac2 quality score
./join.sh rheMac2.bed.new rheMac2.scores
# create snpOrtho.tab; a table in human coords that has associated ortho alleles
~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab
sed 's/snpOrtho/snp126OrthoRheMac2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoRheMac2.sql
hgLoadBed -tab -sqlTable=snpOrthoRheMac2.sql hg18 snp126OrthoRheMac2 snpOrtho.tab
# get the HapMap subset
sed 's/hapmapAllelesOrtho/hapmapAllelesMacaque/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesMacaque.sql
~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoRheMac2
hgLoadBed -tab -sqlTable=hapmapAllelesMacaque.sql hg18 hapmapAllelesMacaque hapmapOrtho.tab
# create summary table
~heather/kent/src/hg/snp/snpLoad/hapmapSummary hg18 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque
ln -s ~/kent/src/hg/lib/hapmapAllelesSummary.sql
hgLoadBed -tab -sqlTable=hapmapAllelesSummary.sql hg18 hapmapAllelesSummary hapmapSummary.tab
#############################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-05-31
cd wgRna-2007-05-31
# Received the data file, wg_may2007.txt (saved from wg_may2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg18/bed/wgRna-2007-05-31.
cat wg_may2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# N-SCAN GENES track (2007-06-21 markd)
# create a composite track with existing ab-initio and new PASA N-SCAN predictions
# download pasa predictions
cd /cluster/data/hg18/bed/nscan/pasa
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa
bzip2 hg18.*
chmod a-w hg18.*
ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2
hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2
rm *.tab
# update trackDb; need a hg18-specific page to describe informants and PASA
human/hg18/nscan.html
human/hg18/trackDb.ra
# remove old human/hg18/nscanGene.html
###########################################################################
# AUGUSTUS track (DONE 2007-7-3 Mario)
#
# augustusHints subtrack
mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final
cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final
wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.gff
wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa
ldHgGene -bin hg18 augustusHints augustus.hg18.Trefseq.hmRNA.hsEST.R.X.gff
hgPepPred hg18 generic augustusHintsPep augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa
# augustus de novo subtrack
mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it
cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it
wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.gff
wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.aa
ldHgGene -bin hg18 augustusXRA augustus.hg18.Xp.RA.it.gff
hgPepPred hg18 generic augustusXRAPep augustus.hg18.Xp.RA.it.pep.aa
# augustus ab initio subtrack
mkdir -p /cluster/data/hg18/bed/augustus/abinitio
cd /cluster/data/hg18/bed/augustus/abinitio
wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.gff
wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.aa
ldHgGene -bin hg18 augustusAbinitio augustus.gff
hgPepPred hg18 generic augustusAbinitioPep augustus.pep.aa
#############################################################################
# Stanford NRSF ChIP-seq (DONE, Heather, July 2007)
# Add color-by-strand and overlap table (2008-05-27 kate)
# BED file of sites provided May 2008 by Tim Reddy (treddy@gmail.com)
ssh kkstore03
cd /cluster/data/encode/stanford/2007-03-14
# lift to hg18
liftOver fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed core.unmapped
liftOver control_fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.control.bed control.unmapped
# add color by strand (red for +, blue for minus)
awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.bed > hg18.fixc.bed
awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.control.bed > hg18.control_fixc.bed
# load into database
ssh hgwdev
cd /cluster/data/encode/stanford/2007-03-14
hgLoadBed hg18 stanfordNRSFEnriched hg18.fixc.bed -tab
hgLoadBed hg18 stanfordNRSFControl hg18.control_fixc.bed -tab
# overlap tables
set prefix = /gbdb/hg18/wib
set table = stanfordNRSFEnrichedOverlaps
sort -k1,1 -k2,2n hg18.bed | bedItemOverlapCount hg18 stdin | \
wigEncode stdin ${table}.wig ${table}.wib
ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix
hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig
set table = stanfordNRSFControlOverlaps
sort -k1,1 -k2,2n hg18.control.bed | bedItemOverlapCount hg18 stdin | \
wigEncode stdin ${table}.wig ${table}.wib
ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix
hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig
# peaks (provided May 2008)
sort -k1,1 -k2,2n lab/NRSF_Peak_Calls.bed | \
awk '{print $1, $2, $3}' > peaks.bed
wc -l peaks.bed
# 2116
hgLoadBed -noBin hg18 stanfordNRSFSites peaks.bed
#########################################################################
# REGULATORY POTENTIAL UPDATE (DONE - 2007-08-01 - Hiram)
# download data from "James Taylor" <james at bx.psu.edu>
ssh kkstore02
mkdir /cluster/data/hg18/bed/regPotential7X.update
cd /cluster/data/hg18/bed/regPotential7X.update
# This is a lot of data
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2"
echo "DONE - chr${C}.scores.truncated.bz2"
done
# create download gzip files from the bz2 files:
time for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg18.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz
touch -r "${F}" "${C}.regPotential7X.hg18.gz"
echo "done"
done
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
zcat chr${C}.regPotential7X.hg18.gz
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 16m40.347s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/regPotential7X.update
mkdir /gbdb/hg18/wib/070118
ln -s /cluster/data/hg18/bed/regPotential7X.update/regPotential7X.wib \
/gbdb/hg18/wib/070118/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
-pathPrefix=/gbdb/hg18/wib/070118 hg18 regPotential7X regPotential7X.wig
# real 0m38.247s
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg18/bed/regPotential7X.update
time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \
-hBinCount=100 -hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1
# real 3m15.934s
# 73 % of the data values are zero
# create download gzip files from the bz2 files:
ssh kkstore02
cd /cluster/data/hg18/bed/regPotential7X
for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg18.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz
echo
done
# renaming file directory -- kuhn 08-17-2007
cd /gbdb/hg18/wib
mv 070118 regPot070118
hgsql hg18 -e \
    'update regPotential7X set file="/gbdb/hg18/wib/regPot070118/regPotential7X.wib"'
# Query OK, 2341572 rows affected (31.59 sec)
# Rows matched: 2341572  Changed: 2341572  Warnings: 0
#############################################################################
# SIB Transcriptome (DONE Aug 29, 2007 - JK)
# Create working directory and download data from where Christian Iseli
# (Christian.Iseli at licr.org) put it, and unpack. The download takes about
# ten minutes (161M file).
cd /cluster/data/hg18/bed
mkdir sibTranscriptome
cd sibTranscriptome
wget ftp://ftp.licr.org/pub/databases/trome/human/txg.tar.gz
wget ftp://ftp.licr.org/pub/databases/trome/human/HTR.gtf.gz
tar -zxvf txg.tar.gz
# Load up sibGene table
zcat HTR.gtf.gz | ldHgGene hg18 sibGene stdin
# Do a little data cleanup and transformation and load splice graphs into database.
sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql
sed 's/chrMt/chrM/' txg/chromMt.txg > txg/chromM.txg
rm txg/chromMt.txg
cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql hg18 sibTxGraph stdin
# Create sibAltEvents track for analysed alt-splices.
cat txg/*.txg | txgAnalyze stdin /cluster/data/hg18/hg18.2bit sibAltEvents.bed
awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
hgLoadBed hg18 sibAltEvents foo.bed
#########################################################################
# BLASTZ MOUSE Mm9 (DONE - 2007-08-20 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastzMm9.2007-08-09
cd /cluster/data/hg18/bed/blastzMm9.2007-08-09
# Started this before the rsync to /scratch/data/mm9/ had completed,
# hence the /cluster/bluearc/scratch/data/mm9/ location is used
# here. (hg18 was also in transition to a new location)
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_SMSK=/cluster/bluearc/scratch/data/hg18/linSpecRep/notInMouseRat
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Mouse Mm9
SEQ2_DIR=/cluster/bluearc/scratch/data/mm9/nib
SEQ2_SMSK=/cluster/bluearc/scratch/data/mm9/notInOthers
SEQ2_LEN=/cluster/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/hg18/bed/blastzMm9.2007-08-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# real 1480m54.483s
# failed due to pk node difficulties, finish the run.blastz
# manually
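    # (A sketch of finishing the batch by hand; run.blastz is the standard
    #  doBlastzChainNet.pl batch directory, and para push/para time are the
    #  usual parasol commands -- the exact steps were not recorded here.)
    # ssh pk
    # cd /cluster/data/hg18/bed/blastzMm9.2007-08-09/run.blastz
    # para push
    # para time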
# Completed: 102120 of 102120 jobs
# CPU time in finished jobs: 6908585s 115143.08m 1919.05h 79.96d 0.219 y
# IO & Wait Time: 50958894s 849314.90m 14155.25h 589.80d 1.616 y
# Average job time: 567s 9.44m 0.16h 0.01d
# Longest finished job: 3000s 50.00m 0.83h 0.03d
# Submission to last job: 446177s 7436.28m 123.94h 5.16d
# continuing
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium -continue=cat `pwd`/DEF > cat.out 2>&1 &
# real 111m59.041s
cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt
# 1014323175 bases of 2881515245 (35.201%) in intersection
cat /cluster/data/hg18/bed/blastz.mm8/fb.hg18.chainMm8Link.txt
# 994530182 bases of 2881515245 (34.514%) in intersection
cd /cluster/data/hg18/bed
ln -s blastzMm9.2007-08-09 blastz.mm9
# Then to swap over to Mm9 (also in mm9.txt)
mkdir /cluster/data/mm9/bed/blastz.hg18.swap
cd /cluster/data/mm9/bed/blastz.hg18.swap
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium \
/cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 &
# real 67m21.146s
cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt
# 1008812599 bases of 2620346127 (38.499%) in intersection
cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link
# 984380268 bases of 2567283971 (38.343%) in intersection
cd /cluster/data/mm9/bed
ln -s blastz.hg18.swap blastz.hg18
## make syntenic net (DONE - 2007-08-20 - Hiram)
cd /cluster/data/hg18/bed/blastzMm9.2007-08-09
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
-syntenicNet -chainLinearGap=medium -continue=syntenicNet \
`pwd`/DEF > syntenic.out 2>&1 &
## real 25m47.767s
#########################################################################
# LOAD ACEMBLY (DONE 8/28/07 angie)
ssh kkstore02
cd /cluster/data/hg18/bed/acembly
# Move aside liftOver run results
mkdir liftOver
mv a* g* h* j* u* liftOver
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.genes_gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.good_proteins_fasta.tar.gz
tar xvzf AceView.ncbi_36.genes_gff.tar.gz
tar xvzf AceView.ncbi_36.good_proteins_fasta.tar.gz
cd AceView.ncbi_36.genes_gff
# If the result of this command is > 0, then some lines have end < start
# and need to be fixed:
awk '$5 < $4 {print;}' *.gff | wc -l
#0
# Filter out empty lines, lines where the product_id has a stray
# newline before it, and $chr|Hs# IDs that don't appear liftable.
egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \
| sed -e 's/^/chr/;' \
> acembly.gff
# Extract annotation classes from original gff:
egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \
| perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
s/Main$/main/ || s/Putative$/putative/ || \
die "Unrecognized class/Gene_type:\n$_\n";' \
| sort -u \
> acemblyClass.tab
# Some gff transcript_id's end in -unspliced (no intron), but the
# corresponding protein fasta IDs do not have that suffix.  We need
# them to match, so add where necessary.
# Use perl to make a perl script to add -unspliced to protein IDs
# where necessary:
grep unspliced acemblyClass.tab | wc -l
#70156
egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \
| perl -wpe 's@^.*transcript_id (\S+)-unspliced;.*$@\$unsp{"$1"} = 1;@ \
|| s/^.*\n$//;' \
| sort -u \
> ../addUnspliced.pl
wc -l ../addUnspliced.pl
#70156 ../addUnspliced.pl
cat >> ../addUnspliced.pl <<'_EOF_'
while (<>) {
if (/^>(\S+)$/) {
if ($unsp{$1}) {
s/^>(\S+)/>$1-unspliced/;
}
}
print;
}
'_EOF_'
# << emacs
# Add -unspliced suffix to protein IDs where necessary, and pare down
# proteins to just the ones that we have transcripts for:
cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta
awk '{print $1;}' ../AceView.ncbi_36.genes_gff/acemblyClass.tab \
> transcriptNames.txt
perl ../addUnspliced.pl *.fasta \
| faSomeRecords stdin transcriptNames.txt acemblyPep.fa
grep unspliced acemblyPep.fa | wc -l
#55931
# Danielle Thierry-Mieg explained that noncoding genes are included so
# the number of proteins can be smaller than the number of transcripts.
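    # A quick way to see that difference is to compare record counts
    # (sketch only):
    # grep -c '^>' acemblyPep.fa
    # wc -l transcriptNames.txt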
# Load tables
ssh hgwdev
cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.genes_gff
ldHgGene -gtf hg18 acembly acembly.gff
#Read 258618 transcripts in 3451107 lines in 1 files
# 258618 groups 24 seqs 1 sources 5 feature types
#258618 gene predictions
hgLoadSqlTab hg18 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
acemblyClass.tab
cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta
hgPepPred hg18 generic acemblyPep acemblyPep.fa
rm acemblyPep.tab
runJoiner.csh hg18 acembly
# hg18.acemblyPep.name - hits 210003 of 210003 ok
# hg18.acemblyClass.name - hits 258618 of 258618 ok
###########################################################################
## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram)
ssh kkstore02
cd /cluster/data/hg18/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 \
hg18 /cluster/data/hg18/hg18.2bit 2> /dev/null \
| gzip > hg18.gc5Base.txt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg18/gc5Base
cd /usr/local/apache/htdocs/goldenPath/hg18/gc5Base
ln -s /cluster/data/hg18/bed/gc5Base/hg18.gc5Base.txt.gz .
###########################################################################
# GENE BOUNDS (RNACLUSTER) (REBUILT 08-30-2007 Fan)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd /cluster/data/hg18/bed
mv rnaCluster rnaCluster.old
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
foreach f (/cluster/data/hg18/nib/chr*.nib)
set c = $f:t:r
set out = chrom/$c.bed
	# Exclude accessions in the RAGE file
echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
end
hgLoadBed hg18 rnaCluster chrom/*.bed
###########################################################################
# RE-LOAD FISH CLONES after bacEnds update (DONE - 2007-09-04 - Hiram)
# The bacEnds processing results are used here
ssh hgwdev
mkdir /cluster/data/hg18/bed/fishClones.2007-08-29
cd /cluster/data/hg18/bed/fishClones.2007-08-29
ln -s ../fishClones/cl_acc_gi_len .
ln -s ../fishClones/fhcrc.sts .
# have to be on hgwdev for this since it is going to read from the db
time nice -n +19 fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \
/cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \
./cl_acc_gi_len \
/cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \
fishClones > fishClones.out 2>&1
# real 0m53.783s
# Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 177, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 178, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg18 fishClones fishClones.bed
# Loaded 9788 elements of size 16
############################################################################
# INDEL-BASED CONSERVATION TRACK (DONE, 2007-09-03 - 2007-09-17, hartera)
# Data from the Gerton Lunter (gerton.lunter at anat.ox.ac.uk), MRC
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
    # Lunter G, Ponting CP and Hein J. Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/consIndels/data
cd /cluster/data/hg18/bed/consIndels
# Add a README.indels with the e-mail from Gerton Lunter
# get the data
wget --timestamping \
http://wwwfgu.anat.ox.ac.uk/~gerton/igs-hg18mm8cf2.zip
# 38 Mb zip file in GFF format. This contains data for hg18
# comparing it to mm8 and cf2 (canFam2).
unzip igs-hg18mm8cf2.zip
mv *.gff ./data/
foreach f (./data/*.gff)
set r = $f:r
echo $r
grep -v "track" $f > ${r}NoHeader.gff
end
# strip off the end of the name e.g. IGS0001.1:p=.74; FDR 0.27
# so that the name displayed is short - IGS0001.1. The score field
# is used to determine colouring and this is calculated from FDR
ssh kkstore02
cd /cluster/data/hg18/bed/consIndels
perl -pi.bak -e \
's/(IGS[0-9a-z]+\.[0-9XY]+):p=?<?\.[0-9]+;\sFDR\s[0-9]+\.[0-9]+/$1/' \
./data/igs*NoHeader.gff
# check this looks ok then clean up
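    # e.g. spot-check one chromosome's edited names (sketch):
    # cut -f9 ./data/igs.chr21NoHeader.gff | head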
rm *.bak
# makes sense to store this as a BED5 table in order to use the score
# for display.
foreach f (./data/*NoHeader.gff)
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$4,$5,$9,$6}' $f \
>> consIndelsHg18Mm8CanFam2.bed
end
# load data
ssh hgwdev
cd /cluster/data/hg18/bed/consIndels
hgsql -e 'drop table consIndelsHg18Mm8CanFam2;' hg18
hgLoadBed hg18 consIndelsHg18Mm8CanFam2 consIndelsHg18Mm8CanFam2.bed
# Loaded 2603017 elements of size 5
# Get the IDs, posterior probabilities (p) for the segment being neutral,
# and the FDR from the original GFFs for a separate table. Some items
    # have p<.001. We cannot do Table Browser queries restricting
# p to <, =, or > a specified value unless all values are floats.
# Contacted the data contributor, Gerton Lunter, and he said it would be
# ok to change all p<.001 to p=0.0005
ssh kkstore02
cd /cluster/data/hg18/bed/consIndels/
foreach c (`cat /cluster/data/hg18/chrom.lst`)
echo $c
foreach f (./data/igs.chr${c}.gff)
echo $f
awk 'BEGIN {FS="\t"} {if ($9 ~ /IGS/) print $9;}' $f \
| sed -e 's/:/\t/' \
| sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
| sed -e 's/;\sFDR/\t/' >> consIndelsHg18Mm8CanFam2Conf.txt
end
end
# there are no GFF files for the haplotype chroms
# Create a table definition for the table of identifier, posterior
# probability and false discovery rate (FDR).
cat << 'EOF' > $HOME/kent/src/hg/lib/itemConf.as
table itemConf
"Probability and false discovery rate (FDR) for an element in a track."
(
string id; "Identifier of element"
float probability; "Probability associated with element"
float fdr; "False Discovery Rate (FDR) associated with element"
)
'EOF'
# << emacs
cd $HOME/kent/src/hg/lib
autoSql itemConf.as itemConf
mv itemConf.h ../inc/
# commit ../inc/itemConf.h, itemConf.c, itemConf.as and
# itemConf.sql to CVS. Add itemConf.o to src/hg/lib/makefile
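    # (Sketch of the CVS steps; the commit message is illustrative.)
    # cd $HOME/kent/src/hg/lib
    # cvs add itemConf.c itemConf.as itemConf.sql ../inc/itemConf.h
    # cvs commit -m "Add itemConf type for consIndels confidence table" \
    #     itemConf.c itemConf.as itemConf.sql ../inc/itemConf.h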
ssh hgwdev
cd /cluster/data/hg18/bed/consIndels
hgLoadSqlTab hg18 consIndelsHg18Mm8CanFam2Conf \
$HOME/kent/src/hg/lib/itemConf.sql \
consIndelsHg18Mm8CanFam2Conf.txt
    # check that all items are in this table.
hgsql -N -e 'select distinct(name) from consIndelsHg18Mm8CanFam2;' hg18 \
| sort > consIndels.names.sort
hgsql -N -e 'select distinct(id) from consIndelsHg18Mm8CanFam2Conf;' hg18 \
| sort > consIndels.idsfromConf.sort
wc -l *.sort
# 2603017 consIndels.idsfromConf.sort
# 2603017 consIndels.names.sort
comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
# 2603017
# so all element IDs are in both tables.
# cleanup
rm ./data/*.bak *.sort
# add trackDb/human/hg18/trackDb.ra entry and add description that
# was written by the data contributor. Add code to hgc.c to display
# the posterior probability and the FDR on the details page for
# track elements. Gerton Lunter provided a description for the data
# on 2007-09-12.
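    # A sketch of the sort of trackDb.ra stanza added (labels are
    # illustrative; the real entry is in
    # kent/src/hg/makeDb/trackDb/human/hg18/trackDb.ra):
    #   track consIndelsHg18Mm8CanFam2
    #   shortLabel Indel Cons
    #   longLabel Indel-based Conservation (human/mouse/dog)
    #   group compGeno
    #   type bed 5 .
    #   useScore 1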
############################################################################
# Promote UCSD genome-wide ENCODE Chip tracks:
# UCSD TAF1 IMR90 Chip/chip to Regulation group
# (2007-09-14 kate)
hgsql hg18 -e "alter table encodeUcsdNgChipSignal rename to wgEncodeUcsdNgTaf1Signal"
hgsql hg18 -e "update wgEncodeUcsdNgTaf1Signal set file='/gbdb/hg18/encode/wib/wgEncodeUcsdNgTaf1Signal.wib'"
hgsql hg18 -e "alter table encodeUcsdNgChipKnownSites rename to wgEncodeUcsdNgTaf1KnownSites"
hgsql hg18 -e "alter table encodeUcsdNgChipNovelSites rename to wgEncodeUcsdNgTaf1NovelSites"
hgsql hg18 -e "alter table encodeUcsdNgValChipH3K4me rename to wgEncodeUcsdNgTaf1ValidH3K4me"
hgsql hg18 -e "alter table encodeUcsdNgValChipH3ac rename to wgEncodeUcsdNgTaf1ValidH3ac"
hgsql hg18 -e "alter table encodeUcsdNgValChipRnap rename to wgEncodeUcsdNgTaf1ValidRnap"
hgsql hg18 -e "alter table encodeUcsdNgValChipTaf rename to wgEncodeUcsdNgTaf1ValidTaf"
############################################################################
# NESTED REPEATS (DONE 9/20/07 angie)
# This track is now generated by doRepeatMasker.pl; added to this older
# assembly for interest.
ssh kkstore02
# First, re-liftUp the .out -- liftUp has been enhanced to uniquify the
# RepeatMasker IDs.
cd /cluster/data/hg18
foreach c ( `cat chrom.lst` )
echo lifting chr$c chunks to contigs
foreach d ( ${c}/N{C,G,T}_* )
cd $d
set contig = $d:t
liftUp $contig.IDs.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out \
> /dev/null
cd ../..
end
echo lifting contigs to chr$c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.IDs.fa.out lift/ordered.lft warn \
`sed -e 's/.fa.out$/.IDs.fa.out/' lift/oOut.lst` \
> /dev/null
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${c}_random.IDs.fa.out lift/random.lft warn \
`sed -e 's/.fa.out$/.IDs.fa.out/' lift/rOut.lst` \
> /dev/null
endif
cd ..
end
# Now join fragments using shared IDs:
ssh kolossus
mkdir /cluster/data/hg18/bed/nestedRepeats
cd /cluster/data/hg18/bed/nestedRepeats
extractNestedRepeats.pl ../../?{,?}/chr*.IDs.fa.out \
> hg18.nestedRepeats.bed
# Load table:
ssh hgwdev
cd /cluster/data/hg18/bed/nestedRepeats
hgLoadBed hg18 nestedRepeats hg18.nestedRepeats.bed \
-sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql
############################################################################
# Promote GIS genome-wide ENCODE tracks:
# GIS PET RNA and GIS ChIP-PET to Regulation group
# (2007-09-20 kate)
hgsql hg18 -e "alter table encodeGisChipPet rename to wgEncodeGisChipPet"
hgsql hg18 -e "alter table encodeGisChipPetHes3H3K27me3 rename to wgEncodeGisChipPetHes3H3K27me3"
hgsql hg18 -e "alter table encodeGisChipPetHes3H3K4me3 rename to wgEncodeGisChipPetHes3H3K4me3"
hgsql hg18 -e "alter table encodeGisChipPetMycP493 rename to wgEncodeGisChipPetMycP493"
hgsql hg18 -e "alter table encodeGisChipPetStat1Gif rename to wgEncodeGisChipPetStat1Gif"
hgsql hg18 -e "alter table encodeGisChipPetStat1NoGif rename to wgEncodeGisChipPetStat1NoGif"
hgsql hg18 -e "alter table encodeGisRnaPetHCT116 rename to wgEncodeGisRnaPetHCT116"
hgsql hg18 -e "alter table encodeGisRnaPetHes3 rename to wgEncodeGisRnaPetHes3"
hgsql hg18 -e "alter table encodeGisRnaPetMCF7 rename to wgEncodeGisRnaPetMCF7"
hgsql hg18 -e "alter table encodeGisRnaPetMCF7Estr rename to wgEncodeGisRnaPetMCF7Estr"
##########################################################
# Case Control Consortium (DONE 2007-09-20, Andy)
ssh hgwdev
bash
mkdir /cluster/data/hg17/bed/caseControl
cd /cluster/data/hg17/bed/caseControl
wget ftp://ftp.sanger.ac.uk/pub/WTCCC/summary_stats/summary_stats_auto_all.zip
unzip summary_stats_auto_all.zip
mkdir chromGraphs
cd basic/
for disease in BD CAD CD HT RA T1D T2D; do
echo $disease
jkDisease=${disease:0:1}`echo ${disease:1} | tr [[:upper:]] [[:lower:]]`
for f in *${disease}*.txt; do
tail +2 $f | awk '{if ($21 == "1") print;}' | \
cut -f1,15 >> ../chromGraphs/cccTrendPval${jkDisease}.cg
done
done
cd ../chromGraphs/
mkdir hg17 hg18
for f in *.cg; do
table=${f%.cg};
echo $table
hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 $table $f 2> ${table}.hg17.errors
mv ${table}.cgb hg17/
hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 $table $f 2> ${table}.hg18.errors
mv ${table}.cgb hg18/
done
pushd /gbdb/hg18/chromGraph
ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg18/*.cgb .
popd
pushd /gbdb/hg17/chromGraph
ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg17/*.cgb .
popd
# Add the hack row into metaChromGraph for the composite tracks.
hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")'
hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")'
#############################################################################
# RGD HUMAN QTL (DONE 9/24/07 angie)
ssh hgwdev
mkdir /cluster/data/hg18/bed/rgdQtl
cd /cluster/data/hg18/bed/rgdQtl
wget ftp://rgd.mcw.edu/pub/data_release/QTLS
# Pick out the human QTLs and liftOver hg17 --> hg18.
# Make bed4 and rgdQtlLink:
perl -we 'open(BED, ">rgdQtl.bed") || die; \
open(LINK, ">rgdQtlLink.txt") || die; \
while (<>) { \
chomp; my @w = split("\t"); \
next unless ($w[1] eq "human" && $w[15]); \
$w[5] =~ s/^/chr/; \
$w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \
$w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \
if ($w[15] > $w[16]) { \
$tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \
} \
$w[15]--; \
$w[15] = 0 if ($w[15] < 0); \
print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \
print LINK "$w[0]\t$w[2]\t$w[3]\n"; \
} \
close(BED); close(LINK);' \
QTLS
mv rgdQtl.bed hg17.rgdQtl.bed
# Using a fairly loose minMatch -- the regions covered are huge.
liftOver -minMatch=0.5 hg17.rgdQtl.bed \
/cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \
hg18.rgdQtl.{bed,unmapped}
wc -l hg18*
# 254 hg18.rgdQtl.bed
# 2 hg18.rgdQtl.unmapped
ssh hgwdev
cd /cluster/data/hg18/bed/rgdQtl
hgLoadBed hg18 rgdQtl hg18.rgdQtl.bed
hgLoadSqlTab hg18 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 rgdQtl
#############################################################################
# RGD RAT QTL MAPPED TO HUMAN (DONE 9/26/07 angie)
#====== Begin work that was discarded because its output was too voluminous
# to be very useful IMHO. Keeping it in the doc as a lesson learned.
# See below for what I ended up loading.
ssh hgwdev
cd /cluster/data/hg18/bed/rgdQtl
genePredToPsl -bedFormat rn4 /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \
rn4.rgdQtl.psl
time ssh -x kolossus pslMap `pwd`/rn4.rgdQtl.psl \
-chainMapFile /cluster/data/hg18/bed/liftOver/hg18ToRn4.over.chain.gz \
`pwd`/hg18.rgdRatQtl.psl
#0.011u 0.006s 10:58.56 0.0% 0+0k 0+0io 0pf+0w
# That created an 11G monstrosity of a file that dwarfs the original
# input. Linecount increased 3 orders of magnitude, filesize increased
# 5 orders of magnitude.
wc -l rn4.rgdQtl.psl
#1067 rn4.rgdQtl.psl
ssh -x kkstore02 wc -l `pwd`/hg18.rgdRatQtl.psl
#1228306 /cluster/store11/gs.19/build36/bed/rgdQtl/hg18.rgdRatQtl.psl
# Let's see what liftOver does...
time ssh -x kolossus \
liftOver -minMatch=0.5 -multiple \
/cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \
/cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \
`pwd`/hg18.rgdRatQtl.lo.{bed,unmapped}
#0.014u 0.004s 0:59.27 0.0% 0+0k 0+0io 0pf+0w
wc -l hg18.rgdRatQtl.lo.{bed,unmapped}
# 1214366 hg18.rgdRatQtl.lo.bed
# 14 hg18.rgdRatQtl.lo.unmapped
# Still got 1M lines... ugh. Mapped all over the place, of course.
#====== end discarded work.
# Use a stringently filtered version of over.chain to do the mapping,
# so we only pick up large chunks (targeting >10,000bases) of these
# enormous regions (up to 235M in rn4).
ssh kolossus
cd /cluster/data/hg18/bed/rgdQtl
# rn4ToHg18 was built before doBlastz included chainStitchId in the
# pipe to create over.chain. Run it here, to repair any chain breaks:
chainStitchId /cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \
rn4ToHg18Stitch.over.chain
# I looked at the summed scores from chainStitchId vs. the length
# spanned by the stitched chains, and arbitrarily picked what I
# think is a sweet spot for mapping very large ranges: at scores
# near 500000, chains seem to span 40-60k bases. Pretty much all
# of the rat and human chromosomes (except human randoms) have at
# least some chains with scores >= 500000. So I'll filter the
# stitched chains to keep those with score >= 500000.
# NOTE FOR NEXT TIME: consider filtering by length (see jaxQtl below).
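    # e.g. something along these lines (sketch; the output name here is
    # hypothetical -- see the jaxQtl chainFilter call below for the size
    # flags actually used there):
    # chainFilter rn4ToHg18Stitch.over.chain \
    #     -tMinSize=20000 -qMinSize=20000 -minScore=10000 \
    #     > rn4ToHg18CoarseBySize.over.chain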
chainFilter rn4ToHg18Stitch.over.chain -minScore=500000 \
> rn4ToHg18Coarse.over.chain
# I tried liftOver with -minMatch=0.5, 0.33, 0.25 and 0.2. These are the
# wc -l stats for each run -- not surprisingly, many more matches with
# lower minMatch:
#0.5:
# 1256 hg18.rgdRatQtl.coarse.lo.bed
# 998 hg18.rgdRatQtl.coarse.lo.unmapped
#0.33:
# 6748 hg18.rgdRatQtl.coarse.lo.bed
# 92 hg18.rgdRatQtl.coarse.lo.unmapped
#0.25:
# 9609 hg18.rgdRatQtl.coarse.lo.bed
# 36 hg18.rgdRatQtl.coarse.lo.unmapped
#0.2:
# 10529 hg18.rgdRatQtl.coarse.lo.bed
# 30 hg18.rgdRatQtl.coarse.lo.unmapped
# I spot-checked by viewing a rat QTL and hg18 chains in rn4, and
# eyeballing whether the net track looked like there were solid
# matches for large regions. With minMatch=0.25, most mappings
# and unmapped looked pretty reasonable, but I still saw a few
# (like Alc4) where a nice long chain was not being used, so I
# kicked it down to 0.2 and checked again -- looks good.
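    # For the record, the sweep itself was just the command below repeated
    # for each -minMatch value (sketch; each run overwrites the previous
    # .bed/.unmapped before the wc -l):
    # foreach minMatch (0.5 0.33 0.25 0.2)
    #   liftOver -minMatch=$minMatch -multiple \
    #     /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed rn4ToHg18Coarse.over.chain \
    #     hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
    #   wc -l hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
    # end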
time liftOver -minMatch=0.2 -multiple \
/cluster/data/rn4/bed/rgdQtl/rgdQtl.bed rn4ToHg18Coarse.over.chain \
hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
#100.476u 10.925s 1:52.31 99.1% 0+0k 0+0io 0pf+0w
wc -l hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
# see above.
# Many of the records are completely contained within other records
# for the same QTL (inversions I suppose) -- they don't really tell
# us anything new about the murky QTL region, so merge them in.
# NOTE FOR NEXT TIME: instead of the perl+sort, use something like this:
# liftOverMerge -mergeGap=10000 hg18.rgdRatQtl.coarse.lo.bed stdout \
# | mergeOverlapBed4.pl - > hg18.rgdRatQtl.coarse.lo.pruned.bed
# liftOverMerge joins items separated by small (a relative term) gaps.
perl -we \
'while (<>) { \
chomp; ($chrom, $start, $end, $name) = split; \
push @{$item2coords{"$chrom.$name"}}, [$start, $end]; \
} \
foreach $item (keys %item2coords) { \
@sortedCoords = sort { $a->[0] <=> $b->[0] } @{$item2coords{$item}}; \
($chrom, $name) = split(/\./, $item); \
($mergeStart, $mergeEnd) = @{shift @sortedCoords}; \
foreach $rangeRef (@sortedCoords) { \
($rangeStart, $rangeEnd) = @{$rangeRef}; \
next if ($rangeEnd <= $mergeEnd); \
if ($rangeStart > $mergeEnd) { \
print "$chrom\t$mergeStart\t$mergeEnd\t$name\n"; \
($mergeStart, $mergeEnd) = ($rangeStart, $rangeEnd); \
} else { \
$mergeEnd = $rangeEnd; \
} \
} \
print "$chrom\t$mergeStart\t$mergeEnd\t$name\n" if ($mergeEnd); \
} \
' hg18.rgdRatQtl.coarse.lo.bed \
| sort -k1,1 -k2n,2n -k4,4r \
> hg18.rgdRatQtl.coarse.lo.pruned.bed
ssh hgwdev
cd /cluster/data/hg18/bed/rgdQtl
hgLoadBed hg18 rgdRatQtl hg18.rgdRatQtl.coarse.lo.pruned.bed
# Just use rn4's non-positional associated info:
sed -e 's/rgdQtlLink/rgdRatQtlLink/' ~/kent/src/hg/lib/rgdQtlLink.sql \
> rgdRatQtlLink.sql
hgLoadSqlTab hg18 rgdRatQtlLink rgdRatQtlLink.sql \
/cluster/data/rn4/bed/rgdQtl/rgdQtlLink.txt
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 rgdRatQtl
runJoiner.csh hg18 rgdRatQtl
#====== more discarded work 10/2/07:
ssh kolossus
cd /cluster/data/hg18/bed/rgdQtl
# Try pslMap with the same filtered chains:
time pslMap -swapMap rn4.rgdQtl.psl \
-chainMapFile rn4ToHg18Coarse.over.chain \
hg18.rgdRatQtl.coarse.pm.psl
#444.915u 29.914s 11:20.08 69.8% 0+0k 0+0io 0pf+0w
wc -l hg18.rgdRatQtl.coarse.pm.psl
#10755 hg18.rgdRatQtl.coarse.pm.psl
# Again, linecount is comparable to liftOver, but the block-by-block
# detail from pslMap creates an enormous file (10GB) even with the
# filtered chains.
# Recover 21G of disk space:
rm hg18.rgdRatQtl.psl hg18.rgdRatQtl.coarse.pm.psl
#====== end discarded work.
#############################################################################
# N-SCAN GENES partial reload (2007-09-26 markd)
# reload nscanPasaGene to get fixed names and to fix search criteria
# download pasa predictions
cd /cluster/data/hg18/bed/nscan/pasa2
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa
bzip2 hg18.*
chmod a-w hg18.*
ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2
hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2
rm *.tab
    # update trackDb to add correct termRegex entries in human/hg18/trackDb.ra
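    # (Sketch of the searchSpec settings involved; the regex shown is
    #  illustrative only -- the real one matches the fixed gene names.)
    #   searchTable nscanPasaGene
    #   searchType genePred
    #   termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9a-z]+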
# push nscanPasaGene nscanPasaPep and trackDb
#############################################################################
# Blastz hg18 to J. Craig Venter chrom attempt (DONE - 2007-09-27 - Hiram)
ssh kkstore06
screen # use a screen to control this job
mkdir /cluster/data/hg18/bed/blastzVenter1.2007-09-27
cd /cluster/data/hg18/bed/blastzVenter1.2007-09-27
cat << '_EOF_' > DEF
# human reference vs J. Craig Venter
# using -chainMinScore=10000 and -chainLinearGap=medium
# during doBlastzChainNet.pl run
# parameters on advice from Webb for K and Q
# M as in hg18 self, O and E from Q
# Y and T as in hg18-panTro2 and mm9-rn4
BLASTZ_K=10000
BLASTZ_M=400
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Venter1
SEQ2_DIR=/iscratch/i/venter1/venter1.unmasked.2bit
SEQ2_LEN=/cluster/data/venter1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzVenter1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -chainMinScore=10000 -chainLinearGap=medium \
-bigClusterHub=kk -noDbNameCheck DEF > do.log 2>&1 &
# real 163m10.634s
# this doesn't work, it failed due to mistakenly thinking it was a self
# alignment. Plus, we need to do the raw scaffolds, not these fake
# chroms.
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross at stanford.edu>
cd /cluster/data/hg18/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.hg18.bed
# this is a custom track, not a pure BED
tail +2 contrast.hg18.bed | hgLoadBed -tab hg18 contrastGene stdin
# verify
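    # e.g. (sketch of a quick check):
    # featureBits hg18 contrastGene
    # hgsql hg18 -N -e 'select count(*) from contrastGene'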
    # load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
# SGP GENES Update (DONE - 2007-10-02 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/sgp.2007-10-02
cd /cluster/data/hg18/bed/sgp.2007-10-02
SITE="genome.imim.es/genepredictions/H.sapiens/golden_path_200603_x_mm9"
for C in `cut -f1 ../../chrom.sizes`
do
wget --timestamping "http://${SITE}/SGP/${C}.gtf" -O ${C}.gtf
wget --timestamping "http://${SITE}/SGP/${C}.prot" -O ${C}.prot
done
# before reloading the table, measure the previous set:
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.272%, both 0.964%, cover 85.83%, enrich 67.47x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.272%, both 0.989%, cover 83.43%, enrich 65.58x
# now reload the table
ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf
# Read 34023 transcripts in 288520 lines in 49 files
# 34023 groups 46 seqs 1 sources 3 feature types
# 34023 gene predictions
# and now measure this new set
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.270%, both 0.964%, cover 85.84%, enrich 67.59x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.270%, both 0.988%, cover 83.41%, enrich 65.68x
###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-10-02 - 2007-10-05 - Hiram)
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cat << '_EOF_' > DEF
# Human vs orangutan
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk > blastz.log 2>&1 &
# real 388m20.443s
# Completed: 126960 of 126960 jobs
# CPU time in finished jobs: 7068824s 117813.73m 1963.56h 81.82d 0.224 y
# IO & Wait Time: 517624s 8627.07m 143.78h 5.99d 0.016 y
# Average job time: 60s 1.00m 0.02h 0.00d
# Longest finished job: 4940s 82.33m 1.37h 0.06d
# Submission to last job: 62056s 1034.27m 17.24h 0.72d
# some jobs failed (because they were done but parasol didn't realize that)
# after recovery, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -bigClusterHub=pk > cat.log 2>&1 &
# real 390m56.934s
cat fb.hg18.chainPonAbe2Link.txt
# 2676696124 bases of 2881515245 (92.892%) in intersection
# And the swap
mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 123m9.197s
cat fb.ponAbe2.chainHg18Link.txt
# 2824501297 bases of 3093572278 (91.302%) in intersection
##############################################################
# NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt)
ssh hgwdev
    mkdir /cluster/data/hg17/bed/nimhBipolar
    cd /cluster/data/hg17/bed/nimhBipolar
    # I registered and downloaded:
wget http://mapgenetics.nimh.nih.gov/BP_POOLING/german_data_share.csv.zip \
--user=galt --password=mypassword
wget http://mapgenetics.nimh.nih.gov/BP_POOLING/nimh_data_share.csv.zip \
--user=galt --password=mypassword
unzip german_data_share.csv.zip
unzip nimh_data_share.csv.zip
mkdir chromGraphs
tail +2 nimh_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \
> chromGraphs/nimhBipolarUs.cgt
tail +2 german_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \
> chromGraphs/nimhBipolarDe.cgt
cd chromGraphs/
mkdir hg17 hg18
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarUs nimhBipolarUs.cgt \
        >& nimhBipolarUs.hg17.errors
    mv nimhBipolarUs.cgb hg17/
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarDe nimhBipolarDe.cgt \
        >& nimhBipolarDe.hg17.errors
    mv nimhBipolarDe.cgb hg17/
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarUs nimhBipolarUs.cgt \
        >& nimhBipolarUs.hg18.errors
    mv nimhBipolarUs.cgb hg18/
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarDe nimhBipolarDe.cgt \
        >& nimhBipolarDe.hg18.errors
    mv nimhBipolarDe.cgb hg18/
mv nimhBipolarDe.cgb hg18/
pushd /gbdb/hg17/chromGraph
ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg17/*.cgb .
popd
pushd /gbdb/hg18/chromGraph
ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg18/*.cgb .
popd
# Add the hack row into metaChromGraph for the composite tracks.
hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile)
values ("bipolar", 0, 0, "composite")'
hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile)
values ("bipolar", 0, 0, "composite")'
#Add composite track info to src/hg/makeDb/trackDb/human/trackDb.ra:
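# (Sketch of the kind of composite stanza added; the labels and the
#  chromGraph type line are illustrative, not copied from the real entry.)
#   track bipolar
#   compositeTrack on
#   shortLabel NIMH Bipolar
#   longLabel NIMH Bipolar Genome Graphs
#   group phenDis
#   type chromGraph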
############################################################################
# MGI MOUSE QTL MAPPED TO HUMAN (DONE 10/10/07 angie)
# Use a stringently filtered version of over.chain to do the mapping,
# so we only pick up large chunks (targeting >10,000bases) of the
# large fuzzy QTL regions.
# Of the MGI QTLs, some are large as expected, but most are tiny --
# they have only the peak STS marker coords, no indication of the
# range. Jim suggested padding those out to 100k. So I will process
# these in two batches, and make subtracks -- one for original, one
# for our modified set.
### NOTE FOR NEXT TIME ###
### Use jaxQtl instead of jaxQTL throughout.
ssh kolossus
mkdir /cluster/data/hg18/bed/jaxQTL
cd /cluster/data/hg18/bed/jaxQTL
# mm8ToHg18 was built before doBlastz included chainStitchId in the
# pipe to create over.chain. Run it here, to repair any chain breaks:
chainStitchId /cluster/data/mm8/bed/liftOver/mm8ToHg18.over.chain.gz \
/scratch/tmp/mm8ToHg18Stitch.over.chain
# For rn4->hg18 (rgdRatQtl above), I eyeballed scores vs. spans of
# stitched chains, to try to find a score threshold over which almost
# all spans were at least 10 or 20k, most >50k. For mm8->hg18, the
# correspondence is not quite so smooth, and in order to keep all spans
# >= 100k, the score threshold would have to be 170k (compared to
# 500k for rn4-hg18) and would pick up a lot of short chains.
# So this time I'll try filtering directly by span instead of score
# (but add a reasonable minScore to kick out some outliers).
chainFilter /scratch/tmp/mm8ToHg18Stitch.over.chain \
-tMinSize=20000 -qMinSize=20000 -minScore=10000\
> mm8ToHg18Coarse.over.chain
# Separate the mm8 jaxQtl's by size and reduce to bed4:
awk 'BEGIN{OFS="\t";} \
($3-$2) < 1000 {s = $2 > 50000 ? $2-50000 : 0; \
print $1, s, $3+50000, $4;}' \
/cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \
> mm8.jaxQtl.padded.bed
cp /dev/null tmp.bed
foreach chr (`awk '{print $1;}' /cluster/data/mm8/chrom.sizes`)
set size = `awk '$1 == "'$chr'" {print $2;}' /cluster/data/mm8/chrom.sizes`
awk 'BEGIN{OFS="\t";} \
$1 == "'$chr'" && $3 > '$size' {$3 = '$size';} \
$1 == "'$chr'" && $3 > $2 {print;}' \
mm8.jaxQtl.padded.bed >> tmp.bed
end
mv tmp.bed mm8.jaxQtl.padded.bed
awk 'BEGIN{OFS="\t";} ($3-$2) > 100000 {print $1, $2, $3, $4;}' \
/cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \
> mm8.jaxQtl.asIs.bed
# Make sure we didn't miss any between those two size ranges (except for
# the 4 markers whose coords are completely off the end of mm8 chroms):
wc -l mm8.*.bed
# 73 mm8.jaxQtl.asIs.bed
# 1468 mm8.jaxQtl.padded.bed
# 1541 total
wc -l /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed
#1545 /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed
# Try liftOver with various -minMatch settings. Compare the number
# mapped and unmapped; eyeball some of the unmapped in mm8, see if
# the hg18 Nets are truly weak there.
foreach minMatch (0.1 0.2 0.25 0.33)
time liftOver -minMatch=$minMatch -multiple \
mm8.jaxQtl.asIs.bed mm8ToHg18Coarse.over.chain \
hg18.jaxQTL.asIs.$minMatch.{bed,unmapped}
time liftOver -minMatch=$minMatch -multiple \
mm8.jaxQtl.padded.bed mm8ToHg18Coarse.over.chain \
hg18.jaxQTL.padded.$minMatch.{bed,unmapped}
wc -l hg18.jaxQTL.*.$minMatch.{bed,unmapped}
echo ""
end
#typical time: 23s for asIs, 45s for padded
# 757 hg18.jaxQTL.asIs.0.1.bed
# 1471 hg18.jaxQTL.padded.0.1.bed
# 0 hg18.jaxQTL.asIs.0.1.unmapped
# 54 hg18.jaxQTL.padded.0.1.unmapped
# 634 hg18.jaxQTL.asIs.0.2.bed
# 1429 hg18.jaxQTL.padded.0.2.bed
# 0 hg18.jaxQTL.asIs.0.2.unmapped
# 128 hg18.jaxQTL.padded.0.2.unmapped
# 532 hg18.jaxQTL.asIs.0.25.bed
# 1345 hg18.jaxQTL.padded.0.25.bed
# 2 hg18.jaxQTL.asIs.0.25.unmapped
# 282 hg18.jaxQTL.padded.0.25.unmapped
# 362 hg18.jaxQTL.asIs.0.33.bed
# 1146 hg18.jaxQTL.padded.0.33.bed
# 8 hg18.jaxQTL.asIs.0.33.unmapped
# 670 hg18.jaxQTL.padded.0.33.unmapped
# I eyeballed the 0.1 .bed and .unmapped files, and they look
# pretty good, esp. for mapped... we could probably get away with
# 0.2 for the asIs but 0.1 looks OK.
# Many of the records are completely contained within other records
# for the same QTL (inversions I suppose) -- they don't really tell
# us anything new about the murky QTL region, so merge them in.
# NOTE FOR NEXT TIME: try this:
# liftOverMerge -mergeGap=10000 hg18.jaxQTL.asIs.0.1.bed stdout \
# | mergeOverlapBed4.pl - > hg18.jaxQTL.asIs.0.1.pruned.bed
# liftOverMerge joins items separated by small (a relative term) gaps.
mergeOverlapBed4.pl hg18.jaxQTL.asIs.0.1.bed \
> hg18.jaxQTL.asIs.0.1.pruned.bed
mergeOverlapBed4.pl hg18.jaxQTL.padded.0.1.bed \
> hg18.jaxQTL.padded.0.1.pruned.bed
wc -l hg18.jaxQTL.*.pruned.bed
# 398 hg18.jaxQTL.asIs.0.1.pruned.bed
# 1463 hg18.jaxQTL.padded.0.1.pruned.bed
ssh hgwdev
cd /cluster/data/hg18/bed/jaxQTL
### NOTE FOR NEXT TIME ###
### Call the tables jaxQtl* instead of jaxQTL* -- QA doesn't like jaxQTL.
hgLoadBed hg18 jaxQTLAsIs hg18.jaxQTL.asIs.0.1.pruned.bed
hgLoadBed hg18 jaxQTLPadded hg18.jaxQTL.padded.0.1.pruned.bed
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 jaxQTLAsIs
checkTableCoords -verbose=2 hg18 jaxQTLPadded
runJoiner.csh hg18 jaxQTLAsIs
runJoiner.csh hg18 jaxQTLPadded
# Tables renamed kuhn 10-12-2007
# jaxQTLAsIs to jaxQtlAsIs
# jaxQTLPadded to jaxQtlPadded
###########################################################################
# Build targetScanS track - (DONE - 2007-10-05 - 2007-10-31 - Hiram)
# requested by: George Bell gbell at wi.mit.edu
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/targetScanS
cd /cluster/data/hg18/bed/targetScanS
wget --timestamping \
http://jura.wi.mit.edu/targetscan/vert_40/ucsc/hg18/hg18ConsChrALL.bed
hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp hg18ConsChrALL.bed
# Loaded 50764 elements of size 6
featureBits hg18 targetScanS
# 313293 bases of 2881515245 (0.011%) in intersection
################################
# previous attempts listed below
    # they don't supply them all, but we don't know which ones they
# don't. So, ask for them all, and remove the files that are empty.
for C in `cut -f1 ../../chrom.sizes | sed -e "s/chr//"`
do
wget --timestamping \
"http://jura.wi.mit.edu/targetscan/vert_40/ucsc/NR/hg18ConsChr${C}.bed" \
-O hg18ConsChr${C}.bed
if [ ! -s "hg18ConsChr${C}.bed" ]; then
rm -f "hg18ConsChr${C}.bed"
fi
done
# Remove the browser/track lines from these custom track files
# and load into the hg18.targetScanS table
egrep -h -v "^browser|^track" hg*.bed | \
hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp stdin
# Loaded 50802 elements of size 6
featureBits hg18 targetScanS
# 312951 bases of 2881515245 (0.011%) in intersection
# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg18
###########################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-10-05
cd wgRna-2007-10-05
# Received the data file, wgtrack_oct2007.txt (saved from wgtrack_oct2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg18/bed/wgRna-2007-10-05.
cat wg_track_oct2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# BLASTZ calJac1 - Marmoset (2007-10-09 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cd /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cat << '_EOF_' > DEF
# human vs. marmoset
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/san/sanvol1/scratch/calJac1/calJac1.2bit
SEQ2_LEN=/san/sanvol1/scratch/calJac1/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.calJac1.2007-10-07
'_EOF_'
# << happy emacs
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
    # failed at download step due to a pre-existing file of Brian's
doBlastzChainNet.pl DEF \
-bigClusterHub pk -continue=download \
-chainMinScore=3000 -chainLinearGap=medium >& do2.log &
tail -f do2.log
#########################################################
# RE-BUILD GAD TRACK (Done, 10/17/07, Fan)
mkdir /cluster/store12/gad071011
rm /cluster/data/gad
ln -s /cluster/store12/gad071011 /cluster/data/gad
cd /cluster/data/gad
# Receive "all.txt" from GAD
# contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov]
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines'
# create gad table
gadPos hg18 j18.tmp
cat j18.tmp |sort -u >hg18.gad.tab
hgLoadBed hg18 gad hg18.gad.tab
rm j18.tmp
#########################################################################
# HAPMAP LD (DONE 10/26/07 angie -- phased REDONE 1/30/08)
# Based on Daryl's hg17 work. Data version here is release #22,
# March 2007 (2007-03).
# 1/30/08: HapMap re-released the phased genotypes 1/22/08 -- re-run,
# but without the removal of question marks that we had to do the
# first time around.
# hapmap.org offers ld_data downloads that look like the output of
# makeDcc -- but only for older versions. To get LD for the latest
# release (and for hg18 coords), compute LD from genotype as Daryl did.
############################# unphased ##############################
#*** NOTE FOR NEXT TIME: don't bother with individual CHB and JPT subsets,
#*** {CEU, CHB+JPT, YRI} is what we display.
#*** Actually, if there is a next time, we'll probably just start with
#*** phased and ignore unphased.
ssh kolossus
mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03/run.Haploview
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03
# wget all genotype data:
wget ftp://ftp.hapmap.org/pub/hapmap/public/00README.releasenotes_rel22
wget ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2007-03/fwd_strand/non-redundant/genotypes_chr\*.txt.gz
# Use latest Haploview to compute LD scores:
wget http://www.broad.mit.edu/mpg/haploview/downloads/Haploview.jar
# Haploview cluster run on whole-chrom genotype files was a bust.
# Even on kki nodes, with java memory maxed out, 47 of 120 jobs crashed
# and one was still running after 5.5 days so I killed it.
# Meanwhile, Daryl suggested using the phased data instead. It is
# not yet available for all chrom/pops, but start with what's there
# to iron out the flow.
# New approach to unphased -> LD -- split, run Haploview, merge.
ssh pk
# Note: although the genotypes_ files are *mostly* sorted by position,
# they're not completely sorted! That can cause splitGenotype.pl to
# screw up (as well as other downstream stuff), so sort them on the way
# into splitGenotype.
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased
cat > runSplit.csh <<'_EOF_'
#!/bin/csh -ef
set f = $1
set base = $f:t:r:r
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set tmpDir = `mktemp -d -p /scratch/tmp runSplit.XXXXXX`
zcat $f \
| sort -k4n,4n \
| $scriptBin/splitGenotype.pl -suffix .txt.gz \
10000000 250000 $tmpDir/$base
mv $tmpDir/$base.* ../splitUnphased/$base/
rmdir $tmpDir
'_EOF_'
# << emacs
chmod a+x runSplit.csh
cp /dev/null jobList
foreach f (../genotypes_2007-03/genotypes_chr*.txt.gz)
mkdir -p ../splitUnphased/$f:t:r:r
echo ./runSplit.csh $f >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 826s 13.77m 0.23h 0.01d 0.000 y
#IO & Wait Time: 457s 7.61m 0.13h 0.01d 0.000 y
#Average job time: 11s 0.18m 0.00h 0.00d
#Longest finished job: 22s 0.37m 0.01h 0.00d
#Submission to last job: 29s 0.48m 0.01h 0.00d
# Run Haploview on split files.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar
# Latest installed java on the cluster nodes (not on the para hub machine):
set javaPath = /usr/java/jre1.5.0_12/bin/java
set javaMemSize = 1500M
find /san/sanvol1/scratch/hg18/bed/hapmapLd/splitUnphased \
-name \*.txt.gz -ls \
| awk '{print $7, $11;}' | sort -nr > filesBySize
cp /dev/null jobList
foreach f (`awk '{print $2;}' filesBySize`)
echo $scriptBin/runHaploview.csh $f $javaPath $hvPath $javaMemSize \
>> jobList
end
para make jobList
para time
#Completed: 1493 of 1493 jobs
#CPU time in finished jobs: 582015s 9700.25m 161.67h 6.74d 0.018 y
#IO & Wait Time: 6558s 109.30m 1.82h 0.08d 0.000 y
#Average job time: 394s 6.57m 0.11h 0.00d
#Longest finished job: 1711s 28.52m 0.48h 0.02d
#Submission to last job: 1740s 29.00m 0.48h 0.02d
# Merge Haploview results.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD
cat > runMerge.csh <<'_EOF_'
#!/bin/csh -ef
set mapFile = $1
set outFile = $2
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set tmpOut = `mktemp -p /scratch/tmp runMerge.XXXXXX`
$scriptBin/mergeHaploviewLD.pl $mapFile $tmpOut
mv $tmpOut $outFile
'_EOF_'
# << emacs
chmod a+x runMerge.csh
mkdir ../mergedUnphasedLD
cp /dev/null jobList
foreach f (`ls -1S ../splitUnphased/genotypes_chr*/genotypes_chr*.map`)
set base = $f:t:r
echo ./runMerge.csh $f ../mergedUnphasedLD/$base.txt.LD.gz >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 16035s 267.25m 4.45h 0.19d 0.001 y
#IO & Wait Time: 17282s 288.03m 4.80h 0.20d 0.001 y
#Average job time: 278s 4.63m 0.08h 0.00d
#Longest finished job: 737s 12.28m 0.20h 0.01d
#Submission to last job: 738s 12.30m 0.20h 0.01d
# Compare results of unsplit run with split/merge:
ssh kolossus
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
# Compare SNP pairs:
zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \
| awk '{print $1, $2;}' > /tmp/1
zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \
| awk '{print $1, $2;}' > /tmp/2
wc -l /tmp/1 /tmp/2
# 32514982 /tmp/1
# 32514982 /tmp/2
cmp /tmp/1 /tmp/2
# Compare entire files:
zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/1
zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/2
head /tmp/1 /tmp/2
cmp /tmp/1 /tmp/2
# Woohoo!
############################# phased ##############################
# For this build, Daryl suggested using the phased data (output of
# Jonathan Marchini's PHASE program) instead of raw genotype data
ssh kolossus
mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22
# 1/30/08: re-run from this point on, to pick up re-release (same URL)
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2007-08_rel22/phased/\*.gz
# Downstream stuff depends on the inputs being sorted by position -- check:
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
foreach f (phased_2007-08_rel22/*_legend.txt.gz)
echo $f
zcat $f | tail +2 | awk '{print $2;}' > /tmp/1
sort -n /tmp/1 > /tmp/2
cmp /tmp/1 /tmp/2
end
rm -f /tmp/1 /tmp/2
# kki cluster run -- need lots of memory! more than pk's 2G hard limit.
# (would use memk but it doesn't have java and kki is sufficient)
ssh kki
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased
set scriptBin = $HOME/kent/src/hg/snp/hapmapLd
set hv = $scriptBin/runHaploviewPhased.csh
set phaseDir = /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22
set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar
# Latest installed java on the cluster nodes (not on the para hub machine):
set javaPath = /usr/java/jre1.5.0_12/bin/java
set javaMemSize = 4G
# Sort by size (descending) to kick off the biggest jobs first:
cp /dev/null jobList
foreach f (`ls -1S $phaseDir/genotypes_chr*.phase.gz`)
echo $hv $f:r:r $javaPath $hvPath $javaMemSize >> jobList
end
para make jobList
para time
#Completed: 66 of 66 jobs
#CPU time in finished jobs: 406845s 6780.76m 113.01h 4.71d 0.013 y
#IO & Wait Time: 1517s 25.28m 0.42h 0.02d 0.000 y
#Average job time: 6187s 103.12m 1.72h 0.07d
#Longest finished job: 15667s 261.12m 4.35h 0.18d
#Submission to last job: 29868s 497.80m 8.30h 0.35d
# Our software assumes that LD scores are given for consecutive SNPs
# without gaps in between, so scores in the encoded lists can be
# associated with other SNPs just by their position in the list.
# Make sure that's the case! I suspect this also depends on the
# inputs to Haploview being sorted by position -- checked those above.
ssh kolossus
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
    set scriptBin = ~/kent/src/hg/snp/hapmapLd
    cp /dev/null checkLD.log
foreach f ( mergedUnphasedLD/*.LD.gz phased_2007-08_rel22/*.LD.gz )
echo $f >> checkLD.log
$scriptBin/checkLDSnpOrder.pl $f >>& checkLD.log
echo "" >> checkLD.log
date
end
    # Takes a long time (~4 minutes each for 184 files -> 11-12 hours) --
# left to run overnight.
# Cluster run to translate Haploview .LD output into the DCC's
# ld_data downloads format, and in turn into our bed4+ format.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/{dcc,bed}{Phased,Unphased}
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased
cat > runFormatsUnphased.csh <<'_EOF_'
#!/bin/csh -ef
set base = $1
set db = hg18
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd
set unphDir = $hapDir/genotypes_2007-03
set unphLDDir = $hapDir/mergedUnphasedLD
set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'`
set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'`
set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'`
set bedOut = $db.${pop}_$chr.bed.gz
$scriptBin/makeDccAndLdBed.pl \
$unphDir/$base.txt.gz $unphLDDir/$base.txt.LD.gz \
$hapDir/dccUnphased/$dccOut $hapDir/bedUnphased/$bedOut
'_EOF_'
# << emacs
chmod a+x runFormatsUnphased.csh
cp /dev/null jobList
foreach f (`ls -1S ../mergedUnphasedLD/genotypes_chr*.txt.LD.gz`)
echo ./runFormatsUnphased.csh $f:t:r:r:r >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 101968s 1699.46m 28.32h 1.18d 0.003 y
#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
#Average job time: 847s 14.11m 0.24h 0.01d
#Longest finished job: 2276s 37.93m 0.63h 0.03d
#Submission to last job: 2276s 37.93m 0.63h 0.03d
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased
cat > runFormatsPhased.csh <<'_EOF_'
#!/bin/csh -ef
set basePath = $1
set base = $basePath:t
set db = hg18
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd
set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'`
set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'`
set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'`
set bedOut = $db.${pop}_$chr.bed.gz
$scriptBin/makeDccAndLdBed.pl ${basePath}_legend.txt.gz $basePath.LD.gz \
$hapDir/dccPhased/$dccOut $hapDir/bedPhased/$bedOut
'_EOF_'
# << emacs
chmod a+x runFormatsPhased.csh
cp /dev/null jobList
foreach f (`ls -1S ../phased_2007-08_rel22/genotypes_chr*.LD.gz`)
echo ./runFormatsPhased.csh $f:r:r >> jobList
end
para make jobList
para time
#Completed: 66 of 66 jobs
#CPU time in finished jobs: 66155s 1102.58m 18.38h 0.77d 0.002 y
#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
#Average job time: 972s 16.20m 0.27h 0.01d
#Longest finished job: 2292s 38.20m 0.64h 0.03d
#Submission to last job: 2292s 38.20m 0.64h 0.03d
# Create empty tables, then load one pop_chr at a time in order
# to avoid thrashing.
# hg17 took about half an hour to an hour per population on hgwdev.
# Load on kolossus, then ask cluster-admin to rsync to hgwdev.
ssh kolossus
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
cat > loadOne.csh <<'_EOF_'
#!/bin/csh -ef
set tableBase = $1
set Pop = $2
set bedDir = $3
set table = $tableBase$Pop
hgsql hg18 -e "drop table if exists $table;"
sed "s/ld2/$table/" $HOME/kent/src/hg/lib/ld2.sql \
| hgsql hg18
set pop = `echo $Pop | perl -wpe 's/ChbJpt/JPT+CHB/; tr/a-z/A-Z/;'`
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
set bed = $bedDir/hg18.${pop}_chr$c.bed.gz
if (-e $bed) then
echo $bed
hgLoadBed -noSort -oldTable hg18 $table $bed
else
echo "\n$bed does not exist\n"
endif
echo ""
end
echo -n "\nDone with $table. "; date
'_EOF_'
# << emacs
chmod a+x loadOne.csh
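# For example, "./loadOne.csh hapmapLdPh Ceu bedPhased" creates table
# hapmapLdPhCeu and loads bedPhased/hg18.CEU_chr*.bed.gz into it
# (the Pop name ChbJpt maps to the JPT+CHB file names).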
# phased:
cp /dev/null loadPhased.log
foreach Pop (Ceu ChbJpt Yri)
./loadOne.csh hapmapLdPh $Pop bedPhased >>& loadPhased.log
end
# ~16 minutes for all phased on kolossus
# 1/30/08: ~11 minutes for all phased on hgwdev! bg load ~1.25
# unphased:
cp /dev/null loadUnphased.log
foreach Pop (Ceu Chb ChbJpt Jpt Yri)
./loadOne.csh hapmapLd $Pop bedUnphased >>& loadUnphased.log
end
# ~21 minutes -- got segfaults for empty gzipped chrY files, debug later.
rm -f bed.tab
# Repeat hg17 sanity checks on the unphased results.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist
# Find the largest distance between any paired SNPs in DCC ld_* files.
# Should be 249999 or less. Also count the number of unique starting
# coords. We can compare those to the SNP counts in checkLD.log.
cat > runMaxDist.csh <<'_EOF_'
#!/bin/csh -ef
set dccIn = $1
set out = $dccIn:r:r.check
echo -n "$dccIn:t " > $out
zcat $dccIn \
| awk '{if ($2-$1>max) max=$2-$1} \
{if (prevStart && $1 != prevStart) count++; prevStart = $1;} \
END {print max "\t" count; \
if (max > 249999) print "ERROR: maxDistance too large!";}' \
>> $out
'_EOF_'
# << emacs
chmod a+x runMaxDist.csh
cp /dev/null jobList
foreach f (../dccUnphased/ld_*.txt.gz)
echo ./runMaxDist.csh $f >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 12274s 204.56m 3.41h 0.14d 0.000 y
#IO & Wait Time: 4137s 68.96m 1.15h 0.05d 0.000 y
#Average job time: 137s 2.28m 0.04h 0.00d
#Longest finished job: 365s 6.08m 0.10h 0.00d
#Submission to last job: 365s 6.08m 0.10h 0.00d
cd ..
cat dccUnphased/*.check > maxDist.txt
grep -B1 ERROR maxDist.txt
# Other cleanup:
rm -r splitUnphased
#########################################################################
# University of Uppsala, Sweden Chip-chip (2007-10-18 kate)
# 3 datasets (Usf1, Usf2, H3ac) -- wiggle and bed for each, in hg16 coords
# Submitted by Adam Ameur
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir uppsalaChip
cd uppsalaChip
foreach f (H3ac Usf1 Usf2)
#wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.wig.gz
wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.bed
end
wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/UCSCdescription.html
# lift to hg18
foreach f (lab/*hg16.bed)
set b = `echo $f:t | sed 's/_.*//'`
echo $b
tail +2 $f | \
liftOver stdin \
/cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \
$b.bed $b.bed.unmapped
end
ssh kolossus
cd /cluster/data/hg18/bed
cd uppsalaChip
# remove duplicate regions resulting from liftOver
cat > trimDups.awk << 'EOF'
BEGIN {chr=""; start="";}
{
if (!(($1 == chr) && ($2 == start)))
print;
chr = $1;
start = $2;
}
'EOF'
# process in 2 unix pipelines, so as not to overload machine
cat > load.csh << 'EOF'
foreach f (lab/*hg16.wig.gz)
set b = `echo $f:t | sed 's/_.*//'`
echo $b
date
nice zcat $f | tail +2 | \
nice varStepToBedGraph.pl stdin | \
nice liftOver stdin \
/cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \
$b.wigBed $b.wigBed.unmapped
nice bedSort $b.wigBed stdout | \
nice awk -f trimDups.awk | \
nice wigEncode stdin $b.wig $b.wib
date
end
'EOF'
csh load.csh >&! load.log &
# approx. 50 minutes to process the 3 datasets
# load bed and wiggles into database
ssh hgwdev
cd /cluster/data/hg18/bed/uppsalaChip
cat > load2.csh << 'EOF'
foreach f (*.wig)
set b = $f:r
echo $b
date
set table = uppsalaChip${b}Sites
hgLoadBed hg18 $table $b.bed
set table = uppsalaChip${b}Signal
ln -s /cluster/data/hg18/bed/uppsalaChip/$b.wib /gbdb/hg18/wib/uppsalaChip${b}Signal.wib
hgLoadWiggle hg18 $table $f
date
end
'EOF'
csh load2.csh >&! load2.log &
# just a few minutes runtime
# somehow 2 beds were left out above (lifted files were missing)
cat > loadBed.csh << 'EOF'
foreach f (*.bed)
set b = $f:r
echo $b
hgLoadBed hg18 uppsalaChip${b}Sites $f
end
'EOF'
# << emacs
csh loadBed.csh >& loadBed.log &
# data distribution
textHistogram H3ac.wigBed -minVal=-2 -real -col=4 -binSize=.5
-2.000000 611
-1.500000 5711
-1.000000 * 391229
-0.500000 ************************************************************ 21240336
0.000000 ******************************************************* 19325712
0.500000 ** 689267
1.000000 99083
1.500000 24453
2.000000 4635
2.500000 635
3.000000 49
3.500000 3
<minVal or >=4.000000 562
#########################################################################
# BLASTZ Zebrafish danRer5 (DONE - 2007-10-18 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastzDanRer5.2007-10-17
cd /cluster/data/hg18/bed/blastzDanRer5.2007-10-17
cat << '_EOF_' > DEF
# Human (hg18) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzDanRer5.2007-10-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 369m20.490s
cat fb.hg18.chainDanRer5Link.txt
# 73923439 bases of 2881515245 (2.565%) in intersection
mkdir /cluster/data/danRer5/bed/blastz.hg18.swap
cd /cluster/data/danRer5/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl \
-chainMinScore=5000 \
/cluster/data/hg18/bed/blastzDanRer5.2007-10-17/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 11m35.536s
cat fb.danRer5.chainHg18Link.txt
# 74166352 bases of 1435609608 (5.166%) in intersection
#########################################################################
# Vista Enhancers (2007-10-18, conodera)
# see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile
#
# download data file from the vista browser (coordinates are for hg17)
# http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# save as enhancerbrowser.datadownload.txt
cd /projects/compbiousr/wet/browser/vista_enhancer/
# liftOver hg17 file
liftOver vista_enhancer.hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz vista_enhancer.hg18.bed vista_enhancer.hg17ToHg18.unMapped
hgLoadBed hg18 vistaEnhancers vista_enhancer.hg18.bed
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-10-30 markd)
cd /cluster/data/genbank/data/ccds/
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20071030.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/CCDS.20071030.tar.gz
# import ccds database tables
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg18 -verbose=2 ccdsGene
# update all.jointer to include hg18 in ccdsDb
joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
# << emacs
#########################################################################
# Load ENSEMBL ver 45 (2007-09-5 markd)
mkdir /cluster/data/hg18/bed/ensembl45
cd /cluster/data/hg18/bed/ensembl45
##
# need to find bounds of haplotype chromosomes
##
# get unmasked haplotype pseudochroms from Ensembl (dna, NOT dna_rm)
wget ftp://ftp.ensembl.org/pub/current_homo_sapiens/data/fasta/dna/
Homo_sapiens.NCBI36.46.dna.chromosome.c22_H2.fa.gz
Homo_sapiens.NCBI36.46.dna.chromosome.c5_H2.fa.gz
Homo_sapiens.NCBI36.46.dna.chromosome.c6_COX.fa.gz
Homo_sapiens.NCBI36.46.dna.chromosome.c6_QBL.fa.gz
# get gap locations and create hap.lift
foreach f ( *.fa.gz )
faGapLocs $f $f:r:r.lift
end
# build lift file for randoms and haps
(mkRandomNTLift hg18 && cat hap.lift) > randHap.lift
# load ensembl genes
hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log
# got 1 gene with CDS exons with no frame:
ENST00000374459
# add this to problem ids and rerun
hgLoadEnsembl -l randHap.lift -f problem.ids homo_sapiens core_45_36g hg18>&log
# load pseudogenes
hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log
# got 3 pseudogenes with CDS bounds outside of exons
ENST00000342841
ENST00000361218
ENST00000388856
# add these to problem ids and rerun
hgLoadEnsembl -l randHap.lift -f problem.ids -p homo_sapiens core_45_36g hg18>&log
# Vega code is not working in Robert's scripts.
# Done to support CCDS; push not requested, awaiting resolution of the
# Vega issue.
#########################################################################
# AFFY TRANSCRIPTOME PHASE 3 (2007-11-06, Andy)
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy/transcriptome
mkdir splits
cd originalWigs/
for f in *.wigVar; do
table=${f%.wigVar};
mkdir ../splits/$table
grep -v "^track" $f | splitWig stdin 1000000 ../splits/${table}/split
echo Done with $table
done
# Done with cluster run
ssh kolossus
cd /san/sanVol1/scratch/andy/transcriptome/lift/bed
for tab in *; do
for split in ${tab}/*; do
cat $split >> ${tab}.bed
done
echo done catting $tab
done
# Split into chrom beds (with a cluster run)
for f in `ls -1 hg18.bed`; do
tab=${f%.bed};
for c in `cut -f1 chrom.sizes`; do
cfile=hg18.bed.chromSplit/${tab}.${c}.bed;
outFile=hg18.wigVar/${tab}.wigVar;
if [ -e $cfile ]; then
echo variableStep chrom=${c} span=1 >> $outFile;
bedSort $cfile stdout | awk 'BEGIN{FS="\t"}{print $2+1, $4;}' | awk -f noDupe.awk >> $outFile;
echo Added $cfile to $outFile >> the.log;
fi;
done;
echo DONE with $tab >> the.log;
wigEncode $outFile hg18.wigVar/${tab}.wig hg18.wigVar/${tab}.wib >> the.log;
gzip $outFile
done
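# (The awk above shifts the 0-based bedGraph-style starts to the 1-based
# positions that variableStep expects; e.g. an illustrative input line
# "chr1 999 1000 0.5" becomes "1000 0.5".)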
cd hg18.wigVar/
mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/wib
for f in *.wib; do
echo copying $f...;
cp $f /cluster/data/hg18/bed/affyTxnPhase3/wib/;
done
pushd /gbdb/hg18/wib
ln -s /cluster/data/hg18/bed/affyTxnPhase3/wib/* .
popd
mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/downloads
cp *.wigVar.gz /cluster/data/hg18/bed/affyTxnPhase3/downloads
mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3
pushd /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3
ln -s /cluster/data/hg18/bed/affyTxnPhase3/downloads/* .
for f in *Strand*; do mv $f sRNA.$f; done
for f in affyTxnPhase3*; do mv $f lRNA.$f; done
#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-09 - Hiram)
## this is not necessary - already done by Kate in October
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzCalJac1.2007-11-09
cd /cluster/data/hg18/bed/blastzCalJac1.2007-11-09
cat << '_EOF_' > DEF
# Human vs marmoset
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCalJac1.2007-11-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk > blastz.log 2>&1 &
# real 542m2.359s
# Completed: 230805 of 230805 jobs
# CPU time in finished jobs: 7279638s 121327.30m 2022.12h 84.26d 0.231 y
# IO & Wait Time: 831303s 13855.05m 230.92h 9.62d 0.026 y
# Average job time: 35s 0.59m 0.01h 0.00d
# Longest finished job: 972s 16.20m 0.27h 0.01d
# Submission to last job: 20572s 342.87m 5.71h 0.24d
cat fb.hg18.chainCalJac1Link.txt
# 2236493373 bases of 2881515245 (77.615%) in intersection
###########################################################################
# LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie)
# Lifting of .align files is now automated by doRepeatMasker.pl, but we
# got a user request for .align files from this pre-automation db.
ssh kkstore02
cd /cluster/data/hg18
mkdir downloads/RMalign
foreach c (?{,?} ?{,?}_*hap?)
echo linking/lifting to contigs of $c:t
foreach ctgdir ($c/N[TC]_??????)
set nt = $ctgdir:t
if (! -f $ctgdir/$nt.fa.align) then
pushd $ctgdir
liftRMAlign.pl $nt.lft > $nt.fa.align
popd
endif
ln -s $nt/$nt.fa.align $c/
end
set chr = chr$c:t
if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then
echo lifting contigs to chr$c
liftRMAlign.pl $c/lift/ordered.lft \
| gzip -c > downloads/RMalign/$chr.fa.align.gz
endif
if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then
echo lifting contigs to chr${c}_random
liftRMAlign.pl $c/lift/random.lft \
| gzip -c > downloads/RMalign/${chr}_random.fa.align.gz
endif
end
md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt
ssh hgwdev ln -s /cluster/data/hg18/downloads/RMalign \
/usr/local/apache/htdocs/goldenPath/hg18/
#########################################################################
# ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan)
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
mkdir geneTests
cd geneTests
# paste the 3 cols gene list from GeneTest web site into file geneTests.lis
cut -f 1 geneTests.lis >j1
cut -f 2 geneTests.lis >j2
cut -f 3 geneTests.lis >j3
cat j1 j2 j3 |sort -u >geneTests.tab
rm j1 j2 j3
hgsql hg18 -e 'drop table geneTests'
hgsql hg18 < ~/src/hg/lib/geneTests.sql
hgsql hg18 -e 'load data local infile "geneTests.tab" into table geneTests
ignore 1 lines'
# the list is independent of hg18, so load it into hg17 too.
hgsql hg17 -e 'drop table geneTests'
hgsql hg17 < ~/src/hg/lib/geneTests.sql
hgsql hg17 -e 'load data local infile "geneTests.tab" into table geneTests
ignore 1 lines'
###########################################################################
# ADD SeattleSNPs PGA GENES ON hgGene DETAILS PAGE. (DONE, Fan, 12/13/07).
cd /cluster/store12/snp
mkdir pga
cd pga
# download data from SeattleSNPs
wget --timestamping http://pga.gs.washington.edu/data.tar.gz
gzip -d *.gz
tar -xvf *.tar
# create SeattleSNPs PGA gene list
cut -f 1 FinishedGenes.txt >j1
cut -f 2 FinishedGenes.txt >j2
cat j1 j2 |sort -u >pga.tab
rm j1 j2
# load the data into the pga table.
hgsql hg18 -e 'drop table pga'
hgsql hg18 < ~/src/hg/lib/pga.sql
hgsql hg18 -e 'load data local infile "pga.tab" into table pga'
###########################################################################
# Reload CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=hg18
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.jointer to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
############################################################################
# dbSNP BUILD 128 (DONE 1/22/08 angie)
# updated snp128ExceptionDesc (tweaked wording) 3/7/08
# 8/7/08: Regenerated snp128.sql with only those enum/set values that are
# actually used (except always keep unknown, the default) and reloaded snp128.
# No data change -- just the sql field definitions for enums and sets.
# QA NOTE: used sudo mytouch on the snp128 table to reset the timestamp to
# .2008-01-22 00:00:00 (was .2008-08-07 16:08:27 after Angie's re-load) in
# order to keep joinerCheck happy and avoid confusion. (8/8/08 brooke)
# Set up build directory
ssh kkstore06
mkdir -p /cluster/store3/dbSNP128/{human,shared}
ln -s /cluster/store3/dbSNP128 /cluster/data/dbSNP/128
# Get field encodings -- if there are changes or additions to the
# encoding of the corresponding fields, you might need to update
# snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
# hg/lib/snp125Ui.c).
cd /cluster/data/dbSNP/128/shared
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
alias wg wget --timestamping
wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
# Here is another source -- it is not as up-to-date as the above, but
# our encodings (enums and sets in snp128.sql) are named more similar
# to those in the 2005 ASN:
# ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
########################## DOWNLOAD #############################
cd /cluster/data/dbSNP/128/human
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /cluster/data/dbSNP/128/human/data
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b128_SNPContigLoc_36_2.bcp.gz
wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_36_2.bcp.gz
wg $ftpSnpDb/organism_data/b128_ContigInfo_36_2.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b128_SNPMapInfo_36_2.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /cluster/data/dbSNP/128/human/schema
wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /cluster/data/dbSNP/128/human/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /cluster/data/dbSNP/128/human/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b128_SNP//; s/^b128_//; s/_36_2//; s/.bcp//;'`
mv $f $new
echo $new
end
# Extract just the tables that we need from the NCBI msSQL table
# creation file, and get CREATE statements from
# human_9606_table.sql for our 5 tables
cd /cluster/data/dbSNP/128/human/schema
zcat human_9606_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_2)?\]/; \
s/b128_(SNP)?//; s/_36_2//; \
s/[\[\]]//g; s/GO\n\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on kolossus or a small cluster machine (mysql5 is OK for this).
ssh kolossus
hgsql '' -e 'create database hg18snp128'
cd /cluster/data/dbSNP/128/human/schema
hgsql hg18snp128 < table.sql
cd ../data
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
zcat $t.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp128 $t placeholder stdin
end
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. I eyeballed a few examples, seemed OK.
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg18snp128 -e 'select count(*) from '$t
end
#ContigInfo: 7067
#ContigLoc: 24685256
#ContigLocusId: 13129868
#MapInfo: 24132236
#SNP: 11833664
# these counts (except for MapInfo which has ~doubled) are
# slightly down from 126. MapInfo has a lot of alternate assembly
# mappings, esp. the celera assembly; maybe that's new?
# load hg18.ctgPos into dbSnpHumanBuild128, compare contig list between
# ctgPos and ContigInfo
# NOTE FOR NEXT TIME: instead of going through mysql, just make a
# tab-sep dump file of ctgPos.
ssh hgwdev hgsql hg18 -N -B -e '"select * from ctgPos;"' \
| hgLoadSqlTab hg18snp128 ctgPos ~/kent/src/hg/lib/ctgPos.sql stdin
hgsql hg18snp128 -N -B -e 'select contig from ctgPos;' | sort > /tmp/1
# Note: we used to look for group_term = "ref_assembly", but that leaves
# behind some contigs that we include. So use a list of group_label:
hgsql hg18snp128 -NBe 'select distinct(group_label) from ContigInfo'
# --> ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")
hgsql hg18snp128 -N -B -e 'select contig_acc from ContigInfo \
where group_label in \
("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff.
#################### EXTRACT INFO FROM NCBI TABLES ####################
mkdir -p /scratch/snp/128/human
cd /scratch/snp/128/human
# Fields of the SNP table and their NCBI source table/file:
# chrom ContigLoc / contigInfo / liftUp
# chromStart ContigLoc / liftUp; check vs phys_pos_from
# chromEnd ContigLoc / liftUp
# name rs + numeric ID that joins all the other sources
# score 0
# strand ContigLoc.orientation
# refNCBI ContigLoc.allele
# refUCSC ContigLoc.allele if insertion, othw. from genomic
# observed fasta headers
# molType fasta headers
# class fasta headers
# valid SNP
# avHet SNP
# avHetSE SNP
# func ContigLocusId
# locType ContigLoc
# weight MapInfo
time hgsql hg18snp128 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id);'
#kolossus load was already 1.0.
#0.001u 0.002s 4:04.73 0.0% 0+0k 0+0io 0pf+0w
time hgsql hg18snp128 -e \
'alter table ContigInfo add index (group_label(9));'
#0.001u 0.001s 0:00.07 0.0% 0+0k 0+0io 0pf+0w
# Make sure there are no orient != 0 contigs among those selected.
hgsql hg18snp128 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
#0
# For joining files by shared column, we need a unique identifier in
# that shared column. snp_id is not unique -- the same rsID can appear
# in both the reference assembly and on one of the others e.g. c6_COX.
# So concatenate the assembly identifier and snp_id to get hopefully
# unique label.
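# The resulting keys look like "<group_label>.<snp_id>",
# e.g. "c5_H2.10035195" (seen below).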
time hgsql hg18snp128 -NBe \
'select concat(ContigInfo.group_label, ".", snp_id), \
ContigInfo.contig_acc, asn_from, asn_to, \
loc_type, orientation, allele, phys_pos_from \
from ContigLoc, ContigInfo \
where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> ucscContigLoc.txt
# no time output because of the pipe... took 4 minutes (load was 3 or 4).
# Make sure these IDs are unique.
wc -l ucscContigLoc.txt
#12275300 ucscContigLoc.txt
awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
#11863799
# Doh! Find non-unique IDs:
awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head
grep ^c5_H2.10035195 ucscContigLoc.txt
#c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321
#c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774
#c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147
# OK, they can be duplicated within the same contig. See if we can
# get by with anchoring everything to ucscContigLoc.txt. But everybody
# else better have unique IDs!
# SNP -> valid, avHet, avHetSE
# SNP has only snp_id as identifier, nothing relating to assembly.
hgsql hg18snp128 -NBe \
'select snp_id, validation_status, avg_heterozygosity, het_se \
from SNP;' \
| sort \
> ucscSNP.txt
# Check ID uniqueness:
wc -l ucscSNP.txt
#11833664 ucscSNP.txt
awk '{print $1;}' ucscSNP.txt | uniq | wc -l
#11833664
# ContigLocusId -> func
# ContigLocusId has only snp_id as an identifier (it gives one
# example contig if the SNP is on multiple contigs).
# The sort options and awk are to convert multiple entries with different
# function classes for the same SNP into one entry per SNP with a list
# of function classes.
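# For example (hypothetical snp_id and codes), input rows "123 3" and
# "123 8" collapse into the single output line "123<TAB>3,8,".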
hgsql hg18snp128 -NBe \
'select snp_id, fxn_class from ContigLocusId;' \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
# Check ID uniqueness:
wc -l ucscFunc.txt
#4676589 ucscFunc.txt
awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
#4676589
# MapInfo -> weight
# MapInfo needs assembly+snp_ids in order to have unique IDs.
time hgsql hg18snp128 -e \
'alter table MapInfo add index (assembly(9));'
#0.000u 0.004s 2:22.64 0.0% 0+0k 0+0io 0pf+0w
hgsql hg18snp128 -NBe \
'select concat(assembly, ".", snp_id), weight \
from MapInfo where assembly \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> weight.txt
# ~1 minute
# Check ID uniqueness:
wc -l weight.txt
#11863799 weight.txt
awk '{print $1;}' weight.txt | uniq | wc -l
#11863799
awk '{print $2;}' weight.txt | sort -n | uniq -c
# 47454 0
#11621954 1
# 91766 2
# 100142 3
# 2483 10
# SNPs w/weight 0 and 10 will be discarded later.
# fasta headers -> observed, molType, class
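# Header lines look roughly like this (illustrative, fields abridged):
# >gnl|dbSNP|rs12345 ... |mol="genomic"|class=1|alleles="A/C"|build...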
zcat /cluster/data/dbSNP/128/human/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort \
> ucscGnl.txt
# ~4 minutes
wc -l ucscGnl.txt
#11833664 ucscGnl.txt
awk '{print $1;}' ucscGnl.txt | uniq | wc -l
#11833664
############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
# Join files by ID. Start with ContigLoc and MapInfo because they
# share the concatenated assembly+snp_id IDs.
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \
> ucscCL+w.txt
#25.408u 3.551s 0:29.26 98.9% 0+0k 0+0io 0pf+0w
wc -l ucscCL+w.txt
#12275300 ucscCL+w.txt
# Same as ucscContigLoc.txt above, good.
# Any missing weights?
grep MISSING ucscCL+w.txt | head
# No output, good.
# Join the files with SNP-only IDs.
time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \
> ucscG+S.txt
#16.805u 1.996s 0:19.04 98.6% 0+0k 0+0io 0pf+0w
wc -l ucscG+S.txt
#11833664 ucscG+S.txt
# Same as ucscSNP.txt and ucscGnl.txt above.
grep MISSING ucscG+S.txt | wc -l
#0
time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
-t ' ' ucscG+S.txt ucscFunc.txt \
> ucscG+S+F.txt
#17.656u 2.318s 0:20.10 99.3% 0+0k 0+0io 0pf+0w
wc -l ucscG+S+F.txt
#11833664 ucscG+S+F.txt
grep MISSING ucscG+S+F.txt | wc -l
#7157075
# Not surprising -- ucscFunc.txt has only 4676589 lines.
expr 11833664 - 4676589
#7157075
# Convert assembly+snp_id's to just snp_id (sorted) for final join.
perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \
| sort > ucscCL+w.snp_id.txt
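# (e.g. a line that began "c5_H2.10035195 NT_113801 ..." now begins
# "10035195 NT_113801 ...")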
awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l
#11727742
# Interesting... which snp_ids are missing from ContigLoc?
awk '{print $1;}' ucscCL+w.snp_id.txt | uniq > /tmp/1
awk '{print $1;}' ucscGnl.txt | uniq > /tmp/2
comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt
comm -23 /tmp/1 /tmp/2 > notInSNP.txt
wc -l notIn*.txt
#105994 notInContigLoc.txt
# 72 notInSNP.txt
expr 11833664 + 72 - 105994
#11727742
# Final join -- treat ContigLoc as authoritative (since it has coords).
# Arrange columns in same order as in the SNP table, with extras for
# checking at the end (phys_pos_from).
# chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
time join -a 1 -e MISSING -t ' ' \
-o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
ucscCL+w.snp_id.txt ucscG+S+F.txt \
> ucscNcbiSnp.ctg.txt
#38.497u 5.536s 2:08.18 34.3% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.txt
#12275300 ucscNcbiSnp.ctg.txt
grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
#7058898
# a bit less than the 7157075 missing FUNC's above -- some overlap with
# notInContigLoc would explain.
# Lift the map contig coordinates to chrom coordinates (~2m);
time liftUp ucscNcbiSnp.bed \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.txt
#98.038u 5.974s 1:45.65 98.4% 0+0k 0+0io 5pf+0w
wc -l ucscNcbiSnp.bed
#12275300 ucscNcbiSnp.bed
# At this point, move back from /scratch to /cluster/data.
nice gzip ucscNcbiSnp.bed
cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/human/
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /cluster/data/dbSNP/128/human/
gunzip ucscNcbiSnp.bed.gz
# Re-ran this command 8/7/08 to get new snp128.sql that includes
# only those enum/set values that are actually used. No other output
# files changed.
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
snp128
#spaces stripped from observed:
#chr12 5963395 5963395 rs41402545
#count of snps with weight 0 = 59123
#count of snps with weight 1 = 11654498
#count of snps with weight 2 = 191647
#count of snps with weight 3 = 335214
#count of snps with weight 10 = 34818
#Skipped 167 snp mappings due to errors -- see snp128Errors.bed
#176.712u 17.466s 3:34.82 90.3% 0+0k 0+0io 0pf+0w
# The 167 errors are all for SNPs for which we don't have fasta,
# so we also don't have observed, class, or molType. I spot-checked
# a few, and they have been deleted from dbSNP. Nothing to show,
# so we skip those 167 -- nothing catastrophic. Watch out for new
# types of errors reported, though:
awk -F"\t" '{print $5;}' snp128Errors.bed | sort -u | wc -l
#1
wc -l snp*
# 12181192 snp128.bed
# 22 snp128.sql
# 167 snp128Errors.bed
# 18 snp128ExceptionDesc.tab
# 1013020 snp128Exceptions.bed
# Make one big fasta file. (note: snp126 skipped chrUn... but it's small
# compared to chr1, chr2 etc.)
# It's a monster: 14G! Can we split by hashing rsId?
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp128.fa
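# (Sketch only, not run for this build: if splitting ever becomes
# necessary, a modulo hash on the rsId number could route records into,
# say, 16 smaller files; the output file names here are hypothetical.)
#  zcat rs_fasta/rs_ch*.fas.gz \
#  | perl -wne 'if (/^>gnl\|dbSNP\|rs(\d+)/) { \
#        open(OUT, ">>snp128split." . ($1 % 16) . ".fa") || die; } \
#      print OUT $_;'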
# Check for duplicates.
grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#11833664 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#11833664
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp128.fa
#107.137u 37.140s 2:39.16 90.6% 0+0k 0+0io 0pf+0w
cut -f 2,6 seq.tab > snp128Seq.tab
rm seq.tab
ssh hgwdev
# Load up main track tables.
cd /cluster/data/dbSNP/128/human
# Re-ran this command 8/7/08 to get new snp128.sql that includes
# only those enum/set values that are actually used. No data values
# changed. Removed -noSort because Brooke had spotted some entries
# sorted by chromEnd instead of chromStart.
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp128 -sqlTable=snp128.sql snp128.bed
#78.060u 13.298s 7:32.71 20.1% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \
> snp128Exceptions.sql
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp128Exceptions -sqlTable=snp128Exceptions.sql \
snp128Exceptions.bed
#5.915u 0.492s 0:28.69 22.3% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp128ExceptionDesc.sql
# 3/7/08: reloaded snp128ExceptionDesc (tweaked wording)
hgLoadSqlTab hg18 snp128ExceptionDesc snp128ExceptionDesc.sql \
snp128ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp128Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /cluster/data/dbSNP/128/human/snp128.fa /gbdb/hg18/snp/snp128.fa
time nice hgLoadSqlTab hg18 snp128Seq snp128Seq.sql snp128Seq.tab
#0.001u 0.000s 2:31.19 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /cluster/data/dbSNP/128/human /cluster/data/hg18/bed/snp128
#######################################################################
# SNPMASKED SEQUENCE FOR SNP128 (DONE 2/1/08 angie)
ssh kolossus
mkdir /cluster/data/hg18/snp128Mask
cd /cluster/data/hg18/snp128Mask
# Identify rsIds with various problems -- we will exclude those.
# MultipleAlignments is kinda broad because anything that maps on
# both chrN and chrN_foo_hap1 will be excluded... similarly, extra
# matches on chrN_random might disqualify good matches on chrN.
# Well, erring on the side of caution is good.
awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
/cluster/data/dbSNP/128/human/snp128Exceptions.bed \
| sort -u \
> snp128ExcludeRsIds.txt
time grep -vFwf snp128ExcludeRsIds.txt \
/cluster/data/dbSNP/128/human/snp128.bed \
> snp128Cleaned.bed
#100.027u 11.779s 2:09.61 86.2% 0+0k 0+0io 0pf+0w
# Substitutions:
mkdir substitutions
snpMaskSingle snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin substitutions/
#-- 79 warnings about differing observed at same base positions
#-- (66 distinct positions) -- send to NCBI. snp-admin@ncbi.nlm.nih.gov
# Also this warning about total size -- just means that some chroms
# didn't have any SNPS that survived the stringent filtering.
#Masked 9146694 snps in 9146642 out of 3091528550 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723)
# Make sure that sizes are identical, first diffs are normal -> IUPAC,
# and first diffs' case is preserved:
foreach f (substitutions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ"
end
#(output OK)
foreach f (substitutions/chr*.fa)
echo $f:t:r
mv $f $f:r.subst.fa
gzip $f:r.subst.fa
end
# Insertions:
mkdir insertions
snpMaskAddInsertions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin insertions/
#Added 1332737 snps totaling 2372942 bases to 3085151178 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085151178 (difference is 22526095)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have increased relative to original:
foreach f (insertions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
else {die "ERROR: ins size $1 <= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (insertions/chr*.fa)
mv $f $f:r.ins.fa
gzip $f:r.ins.fa
end
# Deletions:
mkdir deletions
snpMaskCutDeletions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin deletions/
#Cut 661637 snps totaling 1248873 bases from 3085167749 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have decreased relative to original:
foreach f (deletions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 < $2) {print "OK: del size $1 < $2\n";} \
else {die "ERROR: del size $1 >= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (deletions/chr*.fa)
mv $f $f:r.del.fa
gzip $f:r.del.fa
end
# Clean up and prepare for download:
gzip snp128Cleaned.bed
foreach d (substitutions insertions deletions)
pushd $d
md5sum *.gz > md5sum.txt
popd
end
# Make a README.txt in each subdir.
# Create download links on hgwdev.
# NOTE: I am going to start by offering only the substitutions.
# If we get any user requests, then maybe we can put the insertions
# and deletions out there.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask
ln -s /cluster/data/hg18/snp128Mask/substitutions/* \
/usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp128Mask and do this:
## foreach type (substitutions insertions deletions)
## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type
## ln -s /cluster/data/hg18/snp128Mask/$type/* \
## /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type/
## end
#######################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP128 (DONE 2/8/08 angie)
# REDONE 2/29/08 (upcase ortho alleles)
ssh kolossus
mkdir /cluster/data/hg18/bed/snp128Ortho
cd /cluster/data/hg18/bed/snp128Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/cluster/data/dbSNP/128/human/snp128Exceptions.bed \
| sort -u \
> snp128ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/cluster/data/dbSNP/128/human/snp128.bed \
| grep -vFwf snp128ExcludeIds.txt \
> snp128Simple.bed
# took ~3 minutes
wc -l snp128Simple.bed
#9133704 snp128Simple.bed
# This is the analog of db table snp126simple.
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp128Simple.bed > snp128ForLiftOver.bed
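# The resulting name field looks like this (illustrative values):
# rs12345|chr1|10000|10001|A/C|A|+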
# 2/29/08 -- re-ran from this point on to regenerate cleaned up
# cluster run results (oops) and then force ortho alleles to upper
# case, for consistency with dbSNP formatting.
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp128ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /cluster/data/hg18/bed/snp128Ortho/run.liftOChimp
para make jobList
#Completed: 366 of 366 jobs
#CPU time in finished jobs: 71660s 1194.33m 19.91h 0.83d 0.002 y
#IO & Wait Time: 5377s 89.62m 1.49h 0.06d 0.000 y
#Average job time: 210s 3.51m 0.06h 0.00d
#Longest finished job: 518s 8.63m 0.14h 0.01d
#Submission to last job: 518s 8.63m 0.14h 0.01d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 366 of 366 jobs
#CPU time in finished jobs: 5663s 94.38m 1.57h 0.07d 0.000 y
#IO & Wait Time: 12066s 201.10m 3.35h 0.14d 0.000 y
#Average job time: 48s 0.81m 0.01h 0.00d
#Longest finished job: 102s 1.70m 0.03h 0.00d
#Submission to last job: 102s 1.70m 0.03h 0.00d
# Average job time was 54s with 50000 chunks, but those made chimp
# jobs run too long.
ssh kolossus
cd /cluster/data/hg18/bed/snp128Ortho
# Here is a script that looks up the base value in the ortho species
# and swizzles columns to prepare for the joining and re-swizzling
# of both ortho species' columns into the final product. If it is
# used more than once, should be checked in, perhaps in hg/snp/snpLoad.
cat > getOrthoSeq.pl <<'_EOF_'
#!/usr/bin/env perl
# Dig up orthologous alleles and swizzle columns so the glommed name that
# includes human position info etc. is first. It will be used as a key for
# joining up multiple other-species' ortho data. Also swizzle columns so
# that the remaining columns are in order of appearance in the final result,
# snp128OrthoPanTro2RheMac2. Upcase ortho alleles for consistency w/dbSNP.
use warnings;
use strict;
my $twoBitFName = shift @ARGV
|| die "usage: getOrthoSeq.pl orthoDb.2bit [file(s)]\n";
sub getOChrSeq($$) {
# Slurp in fasta sequence using twoBitToFa.
my ($twoBitFName, $oChr) = @_;
open(P, "twoBitToFa -noMask $twoBitFName -seq=$oChr stdout |")
|| die "Can't open pipe from twoBitToFa $twoBitFName -seq=$oChr: $!\n";
<P> =~ /^>\w+/
|| die "Doesn't look like we got fasta -- first line is this:\n$_";
# From man perlfaq5: trick to slurp entire contents:
my $c = 0;
my $seq = do { local $/; my $data = <P>; $c = ($data =~ s/\n//g); $data; };
close(P);
return $seq;
}
my %rc = ( "a" => "t", "c" => "g", "g" => "c", "t" => "a",
"A" => "T", "C" => "G", "G" => "C", "T" => "A", );
sub revComp($) {
# Reverse-complement fasta input. (Pass through non-agtc chars.)
my ($seq) = @_;
my $rcSeq = reverse $seq;
for (my $i = 0; $i < length($rcSeq); $i++) {
my $base = substr($rcSeq, $i, 1);
my $cBase = $rc{$base} || $base;
substr($rcSeq, $i, 1, $cBase);
}
return $rcSeq;
}
my $prevOChr;
my ($oChrSeq, $oChrSize);
while (<>) {
chomp;
my ($oChr, $oStart, $oEnd, $nameGlom, undef, $oStrand) = split;
if (! defined $prevOChr || $oChr ne $prevOChr) {
$oChrSeq = &getOChrSeq($twoBitFName, $oChr);
$oChrSize = length($oChrSeq);
}
die "Coords out of range, input line $.: $oEnd > $oChr size $oChrSize\n\t"
if ($oEnd > $oChrSize);
my $oAllele = substr($oChrSeq, $oStart, $oEnd - $oStart);
$oAllele = &revComp($oAllele) if ($oStrand eq "-");
print join("\t", $nameGlom, $oChr, $oStart, $oEnd, $oAllele, $oStrand) .
"\n";
$prevOChr = $oChr;
}
'_EOF_'
# << emacs
chmod a+x getOrthoSeq.pl
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in ./getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ./getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
# ditto for macaque:
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ./getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
# The whole pipeline takes ~4-6 minutes each.
wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt
# 8549323 panTro2.orthoGlom.txt
# 7324851 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp and macaque
# allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom files from each file, which are in the same order
# as the chimp and macaque columns of snp128OrthoPanTro2RheMac2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e 0 \
panTro2.orthoGlom.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom1, $glom2, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) = split; \
$glomKey = ($glom1 ne "0") ? $glom1 : $glom2; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Chr =~ s/^0$/?/; $o2Chr =~ s/^0$/?/; \
$o1Al =~ s/^0$/?/; $o2Al =~ s/^0$/?/; \
$o1Strand =~ s/^0$/?/; $o2Strand =~ s/^0$/?/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp128OrthoPanTro2RheMac2.bed
# took ~5 minutes.
wc -l snp128OrthoPanTro2RheMac2.bed
#8770301 snp128OrthoPanTro2RheMac2.bed
ssh hgwdev
cd /cluster/data/hg18/bed/snp128Ortho
sed -e 's/snpOrthoPanTroRheMac/snp128OrthoPanTro2RheMac2/' \
~/kent/src/hg/lib/snpOrthoPanTroRheMac.sql \
> snp128OrthoPanTro2RheMac2.sql
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp128OrthoPanTro2RheMac2 -sqlTable=snp128OrthoPanTro2RheMac2.sql \
snp128OrthoPanTro2RheMac2.bed
#Loaded 8770301 elements of size 17
#52.659u 8.528s 5:18.68 19.1% 0+0k 0+0io 0pf+0w
# Cleanup on fileserver:
cd /cluster/data/hg18/bed/snp128Ortho
nice gzip snp128Simple.bed snp128ExcludeIds.txt snp128ForLiftOver.bed
rm -r run*/split *.orthoGlom.txt
#######################################################################
# COMPARE SNP128 TO SNP126 (DONE 2/7/08 angie)
# First, do a featureBits venn, on some machine other than hgwdev.
# I can't find the file from which snp126 was loaded... but kkr5u00
# has an hg18snp126 database with a snp126 that is a few hours newer,
# but apparently the same as, hgwdev's hg18.snp126... so use that
# (had to add gap tables too):
ssh kkr5u00
time featureBits hg18snp126 snp126
#12451939 bases of 2881515245 (0.432%) in intersection
#57.274u 15.283s 1:20.56 90.0% 0+0k 0+0io 0pf+0w
# Now make sure we have a file copy of snp126 in case we need it in
# the future:
hgsql hg18snp126 -NBe 'select * from snp126' \
| cut -f 2-18 \
> /cluster/data/dbSNP/126/human/snp126.bed
rsync /cluster/data/dbSNP/128/human/snp128.bed /scratch/tmp/
time featureBits hg18 /scratch/tmp/snp128.bed
#12387071 bases of 2881515245 (0.430%) in intersection
#636.834u 47.039s 11:24.02 99.9% 0+0k 0+0io 0pf+0w
# OK, db is a lot faster!
# I am not worried about the drop -- spot-checking, I have seen some
# dropped rsIds and some that used to have multiple mappings but now
# have only one mapping -- an improvement.
pushd /cluster/data/dbSNP/128/human
hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \
hg18snp126 snp128 -sqlTable=snp128.sql snp128.bed
popd
# How many covered bases in common?
time featureBits hg18snp126 snp126 snp128
#11576806 bases of 2881515245 (0.402%) in intersection
#114.365u 26.671s 3:15.55 72.1% 0+0k 0+0io 0pf+0w
# Base coverage Venn counts:
# snp126 snp128 !snp126 !snp128
# snp126 12451939 11576806 0 875133
# snp128 11576806 12387071 810265 0
# Do the same for SNPs (rs* records as opposed to bases):
hgsql hg18snp126 -NBe 'select name from snp126' \
| sort -u > /scratch/tmp/1
hgsql hg18snp126 -NBe 'select name from snp128' \
| sort -u > /scratch/tmp/2
wc -l /scratch/tmp/[12]
# 11647909 /scratch/tmp/1
# 11677826 /scratch/tmp/2
comm -12 /scratch/tmp/[12] | wc -l
#11531282
cd /cluster/data/dbSNP/128/human
comm -23 /scratch/tmp/[12] \
> /cluster/data/dbSNP/128/human/ids.inSnp126Not128.txt
comm -13 /scratch/tmp/[12] \
> /cluster/data/dbSNP/128/human/ids.inSnp128Not126.txt
# rsId Venn counts:
# snp126 snp128 !snp126 !snp128
# snp126 11647909 11531282 0 116627
# snp128 11531282 11677826 146544 0
# Interesting that snp128 has more new rsIds but fewer new bases.
# It has been 2 versions since 126... also, when spot-checking
# exceptions I noticed that a lot of deletion SNPs used to be
# mapped to the appropriate span in 126, but in 128 were mapped to
# a single base and had some kind of range*tion locType... not an
# improvement. But that kind of observation best falls out of an
# examination of exception cases... and that is what will be
# useful for us to report to NCBI.
############################################################################
# BLASTZ SELF chain minScore=2000 (DONE - 2007-12-19 - Hiram)
ssh kkstore02
screen # use screen to manage this job
mkdir /cluster/data/hg18/bed/blastzSelf.2007-12-17
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17
cat << '_EOF_' > DEF
# human vs human
BLASTZ_M=400
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg18
SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/cluster/data/hg18/bed/blastzSelf.2007-12-17
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \
-stop=net -smallClusterHub=memk -bigClusterHub=pk > do.log 2>&1 &
# real 640m37.637s
## crafted a special loadUp.csh to avoid haplotypes and randoms,
# and load with normScore
ssh hgwdev
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17/axtChain
time nice -n +19 ./loadUp.csh >loadUp.out 2>&1
# real 24m51.669s
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17
time nice -n +19 featureBits hg18 chainSelf2KLink \
-noRandom -noHap > fb.hg18.chainSelf2KLink.txt 2>&1 &
# real 11m30.010s
cat fb.hg18.chainSelf2KLink.txt
# 346885376 bases of 2858034764 (12.137%) in intersection
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \
-continue=download \
-stop=download -smallClusterHub=memk -bigClusterHub=pk \
> download.log 2>&1 &
############################################################################
# RE-BUILD GAD TRACK (Done, 1/16/08, Fan)
# During previous build, all.txt was corrupted during receiving file from
# email.
mkdir /cluster/store12/gad080116
rm /cluster/data/gad
ln -s /cluster/store12/gad080116 /cluster/data/gad
cd /cluster/data/gad
# Receive "all.txt" from GAD
# contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov]
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines'
# create gad table
gadPos hg18 j18.tmp
cat j18.tmp |sort -u >hg18.gad.tab
# removed 1 record from hg18.gad.tab that has multiple words in geneSymbol
# field.
# use -nobin option to ensure display order is according to genomic position
hgLoadBed -nobin hg18 gad hg18.gad.tab
rm j18.tmp
#######################################################################
# BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-01-29 - Hiram)
# with contigs for Lamprey
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
cat << '_EOF_' > DEF
# Human vs. Lamprey
# using the "close" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human - WindowMasker sequence
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lamprey petMar1
SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPetMar1.2008-01-29
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk > do.log 2>&1 &
# real 414m33.533s
cat fb.hg18.chainPetMar1Link.txt
# 36042598 bases of 2881515245 (1.251%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/petMar1/bed/blastz.hg18.swap
cd /cluster/data/petMar1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk > swap.log 2>&1 &
# real 60m1.928s
cat fb.petMar1.chainHg18Link.txt
# 26751073 bases of 831696438 (3.216%) in intersection
#######################################################################
###################
# Build recip-best alignments with calJac1 (DONE 2008-01-25 braney)
cd /cluster/data/hg18/bed
ln -s blastz.calJac1.2007-10-07 blastz.calJac1
cd blastz.calJac1
screen
/cluster/bin/scripts/doRecipBest.pl hg18 calJac1
###################
# Build syntenic net for orang (DONE 2008-01-25 braney)
cd /cluster/data/hg18/bed/blastz.ponAbe2
screen
/cluster/bin/scripts/doBlastzChainNet.pl -syntenicNet -continue syntenicNet -stop syntenicNet `pwd`/DEF 2>&1 | tee syntenic.out
#########################################################################
## Primate Multiz (Working)
##
ssh hgwdev
mkdir /cluster/data/hg18/bed/multizPrimate
cd /cluster/data/hg18/bed/multizPrimate
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get hg18 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
/cluster/bin/phast/tree_doctor --prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Bushbaby_otoGar1,TreeShrew_tupBel1,Rat_rn4,Dog_canFam2 /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh > primate.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544,((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185):0.015682,TreeShrew_tupBel1:0.162844):0.006272):0.019763,Dog_canFam2:0.187963);
# rearrange to get human at the top:
# this leaves us with:
cat << _EOF_ > hg18.primate.nh
((((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185):0.015682,TreeShrew_tupBel1:0.162844):0.006272,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544):0.019763,Dog_canFam2:0.187963);
_EOF_
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' hg18.primate.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' hg18.primate.nh \
| sed -e "s/ / /g"` > tree.primate.nh
# that looks like, as a single line:
# ((((((((hg18 panTro2) ponAbe2) rheMac2) calJac1) otoGar1) tupBel1) (mm9 rn4)) canFam2)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/hg18/bed/multizPrimate
foreach db (`cat species.list`)
set bdir = /cluster/data/hg18/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, the "mafs not found" should only show up on hg18
./listMafs.csh
# calJac1 mafRBestNet
# canFam2 mafSynNet
# hg18 mafNet
# mm9 mafSynNet
# otoGar1 mafRBestNet
# panTro2 mafSynNet
# ponAbe2 mafSynNet
# rheMac2 mafSynNet
# rn4 mafSynNet
# tupBel1 mafRBestNet
/cluster/bin/phast/all_dists hg18.primate.nh > Primate.distances.txt
grep -i hg18 Primate.distances.txt | sort -k3,3n
# Human_hg18 Chimp_panTro2 0.013541
# Human_hg18 Orangutan_ponAbe2 0.038910
# Human_hg18 Rhesus_rheMac2 0.063920
# Human_hg18 Marmoset_calJac1 0.138447
# Human_hg18 Bushbaby_otoGar1 0.256132
# Human_hg18 TreeShrew_tupBel1 0.283473
# Human_hg18 Dog_canFam2 0.334627
# Human_hg18 Mouse_mm9 0.452719
# Human_hg18 Rat_rn4 0.460828
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
mkdir /cluster/data/hg18/bed/multizPrimate/run.split
cd /cluster/data/hg18/bed/multizPrimate/run.split
# this works by examining the rmsk table for likely repeat areas
# that won't be used in blastz
mafSplitPos hg18 50 mafSplit.bed
ssh kki
cd /cluster/data/hg18/bed/multizPrimate/run.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set targDb = "hg18"
set db = $1
set sdir = /san/sanvol1/scratch/$targDb/BRsplitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/$targDb/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
grep -v hg18 ../species.list > split.list
cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
gensub2 split.list single template jobList
para create jobList
# start these gently, this is a good load on the san filesystem
para -maxPush=3 push
# wait a while, verify these are running OK
para push
# let that run to a couple completions, a few minutes, then again:
para try
# etc ...
# Completed: 9 of 9 jobs
# CPU time in finished jobs: 9090s 151.50m 2.52h 0.11d 0.000 y
# IO & Wait Time: 3093s 51.55m 0.86h 0.04d 0.000 y
# Average job time: 1354s 22.56m 0.38h 0.02d
# Longest finished job: 2134s 35.57m 0.59h 0.02d
# Submission to last job: 2153s 35.88m 0.60h 0.02d
# ready for the multiz run
ssh pk
cd /cluster/data/hg18/bed/multizPrimate
# actually, the result directory here should be maf.split instead of maf
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit chunk in filename
# so we can use $(root) to get basename
find /san/sanvol1/scratch/hg18/BRsplitStrictMafNet -type f \
| while read F; do basename $F; done \
| sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
wc -l chromChunks.list
# 93 chromChunks.list
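# note (illustrative, not from the original log): a split file such as
# chr1.00.maf.gz shows up in chromChunks.list as chr1__00; the sed in
# autoMultiz.csh below turns the __ back into a . to locate each pairwise maf
head -3 chromChunks.list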
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = hg18
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/BRsplitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.primate.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == hg18) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.primate.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 chromChunks.list single template jobList
para create jobList
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 302126s 5035.43m 83.92h 3.50d 0.010 y
# IO & Wait Time: 3499s 58.32m 0.97h 0.04d 0.000 y
# Average job time: 3286s 54.77m 0.91h 0.04d
# Longest finished job: 6972s 116.20m 1.94h 0.08d
# Submission to last job: 7052s 117.53m 1.96h 0.08d
# put the split maf results back together into single chroms
ssh kkstore02
cd /cluster/data/hg18/bed/multizPrimate
# here is where the result directory maf should have already been maf.split
mv maf maf.split
mkdir maf
# going to sort out the redundant header garbage to leave a cleaner maf
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
echo ${C}
head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done
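# optional verification (not in the original log): each rebuilt chrom maf
# should now carry exactly one "##maf version" header line
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
  echo -n "${C} "
  grep -c "^##maf version" maf/${C}.maf
done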
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg18/multizPrimate/maf
ln -s /cluster/data/hg18/bed/multizPrimate/maf/*.maf \
/gbdb/hg18/multizPrimate/maf
# this generates a large 1 Gb multizPrimate.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg18/multizPrimate/maf hg18 multizPrimate
# Loaded 12531777 mafs in 49 files from /gbdb/hg18/multizPrimate/maf
# real 8m44.516s
# load summary table
time nice -n +19 cat /gbdb/hg18/multizPrimate/maf/*.maf \
| hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multizPrimateSummary stdin
# Created 1417364 summary blocks from 29928557 components
# and 6981421 mafs from stdin
# real 21m35.057s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
mkdir /cluster/data/hg18/bed/multizPrimate/anno
cd /cluster/data/hg18/bed/multizPrimate/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v hg18 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
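# sanity check (illustrative, not from the original log): nBeds and sizes
# should each list one entry per non-hg18 species, i.e. 9 lines apiece here
wc -l nBeds sizes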
ssh kki
cd /cluster/data/hg18/bed/multizPrimate/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/hg18/bed/multizPrimate
set c = $1
cat $dir/maf/${c}.maf | \
nice mafAddIRows -nBeds=nBeds stdin /cluster/data/hg18/hg18.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/hg18/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 10782s 179.71m 3.00h 0.12d 0.000 y
# IO & Wait Time: 3380s 56.33m 0.94h 0.04d 0.000 y
# Average job time: 289s 4.82m 0.08h 0.00d
# Longest finished job: 751s 12.52m 0.21h 0.01d
# Submission to last job: 1479s 24.65m 0.41h 0.02d
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/anno
mkdir -p /gbdb/hg18/multizPrimate/anno/maf
ln -s /cluster/data/hg18/bed/multizPrimate/anno/maf/*.maf \
/gbdb/hg18/multizPrimate/anno/maf
# by loading this into the table multizPrimate, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg18/multizPrimate/anno/maf \
hg18 multizPrimate
# Loaded 7331265 mafs in 55 files from /gbdb/hg18/multizPrimate/anno/maf
# real 8m31.092s
cat /cluster/data/hg18/chrom.sizes | \
awk '{if ($2 > 1000000) { print $1 }}' |
while read C
do
echo /gbdb/hg18/multizPrimate/anno/maf/$C.maf
done | xargs cat | \
hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multizPrimateSummary stdin
# Created 1621960 summary blocks from 75794119 components and 12601786
# mafs from stdin
# remove the multizPrimate*.tab files in this /scratch/tmp directory
rm multizPrimate*
#######
################################################################################
# RE-SEQUENCING TRACE DOWNLOAD (DONE 2008-01-25, Andy)
ssh kolossus
bash
cd /san/sanVol1/scratch/andy
mkdir traces
cd traces/
cat << "EOF" > getOldTraces.sh
#!/bin/bash
echo Retrieving sequences before Jan 2008
echo Starting at `date`
# Query the database and figure out the total number of pages needed
count=`./query_tracedb "query count species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'"`
pages=$(( (count/40000) + ((count % 40000) > 0) ))
echo
echo Total of $count sequences and $pages pages to retrieve
echo
for ((page=0; page < pages; page++)); do
pagenum=`printf "%03d" $((page+1))`
./query_tracedb "query page_size 40000 page_number $page binary species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'" > page.bin
echo -n "Retrieving page $((page+1)) of $pages compressed fasta... "
(echo -n "retrieve_gz fasta 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.fa.gz
echo "done at `date +%T`"
echo -n "Retrieving page $((page+1)) of $pages compressed quality file... "
(echo -n "retrieve_gz quality 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.qa.gz
echo "done at `date +%T`"
echo -n "Retrieving page $((page+1)) of $pages xml file... "
(echo -n "retrieve xml_info 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.xml
gzip page-${pagenum}.xml
echo "done at `date +%T`"
rm page.bin
done
echo
echo All done at `date`!
EOF
chmod +x getOldTraces.sh
screen
./getOldTraces.sh > download.log
# detach screen
# tail -f download.log
#Retrieving sequences before Jan 2008
#Starting at Wed Jan 23 11:47:04 PST 2008
#
#Total of 13978657 sequences and 350 pages to retrieve
#
#Retrieving page 1 of 350 compressed fasta... done at 11:48:40
#Retrieving page 1 of 350 compressed quality file... done at 11:49:10
#Retrieving page 1 of 350 xml file... done at 11:51:05
#Retrieving page 2 of 350 compressed fasta... done at 11:52:40
#Retrieving page 2 of 350 compressed quality file... done at 11:53:10
# ...
#Retrieving page 350 of 350 compressed quality file... done at 07:07:08
#Retrieving page 350 of 350 xml file... done at 07:08:16
#
#All done at Fri Jan 25 07:08:16 PST 2008!
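# (worked example, not in the original log) the page count above is a ceiling
# division over the 40000-sequence page size:
echo $(( (13978657/40000) + ((13978657 % 40000) > 0) ))
# 350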
################################################################################
# RE-SEQUENCING TRACE ALIGNMENT TO HG18 (DONE 2008-01-31, Andy)
ssh kkr12u22
cd /san/sanVol1/scratch/andy/traces
mkdir run
cd run/
ls -1 /scratch/hg/hg18/nib/* | grep -v hap > nib.lst
ls -1 /san/sanVol1/scratch/andy/traces/page-*.fa.gz > traces.lst
cat << "EOF" > gsub
#LOOP
./doBlat.sh {check in exists $(path1)} $(path2) {check out line+ $(root2)/$(root1).$(root2).maf}
#ENDLOOP
EOF
cat << "EOF" > doBlat.sh
#!/bin/bash
thisDir=`pwd -P`
fa=`basename $1`
nib=$2
f=${fa%.fa.gz}
n=`basename $2`
n=${n%.nib}
name=${f}.${n}
out=${name}.maf
mkdir -p /scratch/tmp/andy/$name
mkdir -p $n
pushd /scratch/tmp/andy/$name
cp $1 .
blat -minMatch=12 -ooc=/scratch/hg/hg18/11.ooc -out=maf $nib $fa $out
cp $out ${thisDir}/$n
popd
rm -rf /scratch/tmp/andy/$name
EOF
chmod +x doBlat.sh
ssh pk
cd /san/sanVol1/scratch/andy/traces/run
gensub2 traces.lst nib.lst gsub spec
sed 's/\.fa\.c/.c/' spec > tmp; mv tmp spec
para create spec
para try, push, check
para time
#15750 jobs in batch
#100 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 15750 of 15750 jobs
#CPU time in finished jobs: 385991s 6433.19m 107.22h 4.47d 0.012 y
#IO & Wait Time: 47866s 797.76m 13.30h 0.55d 0.002 y
#Average job time: 28s 0.46m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 186s 3.10m 0.05h 0.00d
#Submission to last job: 1551s 25.85m 0.43h 0.02d
# Cat all the alignments
ssh hgwdev
cd /san/sanVol1/scratch/andy/traces/run
head -n1 chrY/page-112.chrY.maf > maf.header
mkdir -p cat   # assumed: output dir for the concatenated per-page mafs
for ((i=0; i < 350; i++)); do
echo page $((i+1))
pagenum=`printf "%03d" $((i+1))`
prefix=page-$pagenum
newfile=cat/${prefix}.maf
cp maf.header $newfile
for f in `find . -name "${prefix}*"`; do
tail +2 $f | sed 's/gnl|ti|//' >> $newfile
done
done
############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=hg18
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
#############################################################################
# phastCons multizPrimate
## (DONE - 2008-02-11 braney )
# split mafs into 10M chunks and generate sufficient statistics
# files for phastCons
ssh kki
mkdir /cluster/data/hg18/bed/multizPrimate/msa.split
mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/ss
cd /cluster/data/hg18/bed/multizPrimate
# just use primates
cat << '_EOF_' > primates.list
hg18
panTro2
ponAbe2
rheMac2
calJac1
otoGar1
'_EOF_'
cd /cluster/data/hg18/bed/multizPrimate/msa.split
zcat /san/sanvol1/braney/multizPrimate/chr1.maf.gz | \
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' | \
mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list chr1.maf
twoBitToFa -seq=chr1 /scratch/data/hg18/hg18.2bit chr1.fa
/cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \
-o SS -r chr1 -w 300000000,0 -I 1000 -B 5000
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
chr1.1-247249719.ss --tree \
"(((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1)" \
--out-root starting-tree
rm chr1.maf chr1.fa chr1.1-247249719.ss
mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate
cp msa.split/starting-tree.mod /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /san/sanvol1/braney/multizPrimate
set WINDOWS = /san/sanvol1/scratch/hg18/multizPrimate/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/hg18/hg18.2bit /scratch/tmp/hg18.$c.fa
set TMP = /scratch/BR.$c.maf
zcat $MAFS/$c.maf.gz | perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' | \
mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP \
-i MAF \
-M /scratch/tmp/hg18.$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $TMP /scratch/tmp/hg18.$c.fa
popd
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../anno/maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 3520s 58.66m 0.98h 0.04d 0.000 y
# IO & Wait Time: 1200s 20.00m 0.33h 0.01d 0.000 y
# Average job time: 96s 1.61m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 464s 7.73m 0.13h 0.01d
# Submission to last job: 723s 12.05m 0.20h 0.01d
# XXXX Estimates were attempted but turned out not to be very useful;
# instead, as seen below, simply take the cons and noncons trees from the
# mouse 30-way
# Estimate phastCons parameters
# see also:
# http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html
# Create a list of .ss files over 3,000,000 bytes in size (the $5 file-size
# column of ls -l below); this is almost everything
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/ss
ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \
awk '$5 > 3000000 {print $9;}' > ../tuningRun.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /cluster/data/hg18/bed/multizPrimate/treeRun2
cd /cluster/data/hg18/bed/multizPrimate/treeRun2
mkdir tree log most
# Tuning this loop should come back to here to recalculate
# Create script that calls phastCons with right arguments
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set SAN="/san/sanvol1/scratch/hg18/multizPrimate/cons"
set SS=$1
set C=$1:h
set F=$1:t
set tmpDir="/scratch/tmp/pA2_$2"
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p log/${C} tree/${C} most/${C}
cp -p $SAN/ss/$1 $tmpDir/$F
cp -p $SAN/estimate/starting-tree.mod $tmpDir
pushd $tmpDir
/cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \
--gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 45 --target-coverage 0.3 --most-conserved $F.most \
--quiet --log $F.log --estimate-trees $F.tree
popd
cp -p $tmpDir/$F.log log/$C
cp -p $tmpDir/$F.most most/$C
cp -p $tmpDir/$F.tree.*cons.mod tree/$C
rm -fr $tmpDir
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(num1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
scp -p braney@pk:/san/sanvol1/scratch/hg18/multizPrimate/cons/tuningRun.list .
gensub2 tuningRun.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 310 of 310 jobs
# CPU time in finished jobs: 226767s 3779.45m 62.99h 2.62d 0.007 y
# IO & Wait Time: 1224s 20.40m 0.34h 0.01d 0.000 y
# Average job time: 735s 12.26m 0.20h 0.01d
# Longest finished job: 908s 15.13m 0.25h 0.01d
# Submission to last job: 4948s 82.47m 1.37h 0.06d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ave.noncons.mod > noncons_summary.txt
sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed
wc -l mostConserved.bed
# 1192414 mostConserved.bed
# measuring entropy
# consEntropy <target coverage> <expected length>
# ave.cons.mod ave.noncons.mod --NH 9.78
/cluster/bin/phast/$MACHTYPE/consEntropy .3 45 \
ave.cons.mod ave.noncons.mod
# Transition parameters: gamma=0.300000, omega=45.000000, mu=0.022222,
# nu=0.009524
# Relative entropy: H=0.141789 bits/site
# Expected min. length: L_min=98.721504 sites
# Expected max. length: L_max=62.917932 sites
# Phylogenetic information threshold: PIT=L_min*H=13.997639 bits
ssh hgwdev featureBits -noRandom -noHap hg18 `pwd`/mostConserved.bed
# 372348946 bases of 2858034764 (13.028%) in intersection
ssh hgwdev featureBits -noRandom -noHap -enrichment hg18 genscan:cds \
`pwd`/mostConserved.bed
# genscan:cds 1.927%,
# mostConserved.bed 13.028%,
# both 0.300%, cover 15.57%, enrich 1.20x
# Estimates could be made, but more correctly, take the 30-way
# .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate
# cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# add up the C and G:
grep BACKGROUND treeRun2/ave.noncons.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.355
# This 0.355 is used in the --gc argument below
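# for reference (illustrative values, not from the original log): the
# BACKGROUND line of a phast .mod file lists the A C G T equilibrium
# frequencies, e.g.
#   BACKGROUND: 0.322500 0.177500 0.177500 0.322500
# so the awk's $3 + $4 sums the C and G columns to give the GC fraction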
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
mkdir -p /cluster/data/hg18/bed/multizPrimate/cons/run.cons
cd /cluster/data/hg18/bed/multizPrimate/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all glires placentals
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/hg18/bed/multizPrimate/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg18/multizPrimate/cons
cp -p $cons/$grp/*.mod .
cp -p $san/ss/$c/$f.ss $cons/$grp/*.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $f.ss ave.cons.mod,ave.noncons.mod \
--expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
# $PHASTBIN/phastCons $f.ss $grp.mod \
# --rho $rho --expected-length $len --target-coverage $cov --quiet \
# --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg18/multizPrimate/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg18/bed/multizPrimate/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
# /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
# --prune-all-but=hg18,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \
# > all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# crashed jobs are OK methinks since we're checking output in
# bed file instead of pp file
# Completed: 332 of 337 jobs
# Crashed: 5 jobs
# CPU time in finished jobs: 11572s 192.86m 3.21h 0.13d 0.000 y
# IO & Wait Time: 3189s 53.15m 0.89h 0.04d 0.000 y
# Average job time: 44s 0.74m 0.01h 0.00d
# Longest finished job: 60s 1.00m 0.02h 0.00d
# Submission to last job: 564s 9.40m 0.16h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all
time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multizPrimate/cons/all
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/cons/all
time nice -n +19 hgLoadBed hg18 phastConsElementsPrimate mostConserved.bed
# Loaded 1431934 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
featureBits hg18 phastConsElementsPrimate
# 460640890 bases of 2881515245 (15.986%) in intersection
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastConsPrimateScores
for D in pp/chr*
do
C=${D/pp\/}
out=phastConsPrimateScores/${C}.data.gz
echo "${D} > ${C}.data.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
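# note on the script above (not in the original log): ${D/pp\/} strips the
# leading "pp/" from each directory name, e.g. pp/chr1 -> chr1, so scores for
# chr1 end up in phastConsPrimateScores/chr1.data.gz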
time nice -n +19 ./gzipAscii.sh
# real 47m46.099s
# copy the phastConsPrimateScores to:
# /cluster/data/hg18/bed/multizPrimate/downloads/phastConsPrimate/phastConsScores
# for hgdownload downloads
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all
time nice -n +19 ls phastConsPrimateScores/*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin phastConsPrimate.wig phastConsPrimate.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 30m18.821s
time nice -n +19 cp -p *.wi? /cluster/data/hg18/bed/multizPrimate/cons/all
# real 1m26.426s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/cons/all
ln -s `pwd`/phastConsPrimate.wib /gbdb/hg18/multizPrimate/phastConsPrimate.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multizPrimate hg18 \
phastConsPrimate phastConsPrimate.wig
# real 0m53.686s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastConsPrimate > histogram.data 2>&1
# real 5m10.426s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Histogram phastConsPrimate track"
set xlabel " phastConsPrimate score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#############################################################################
## Annotate multizPrimate multiple alignment with gene annotations
## (DONE - 2008-02-11 braney )
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /cluster/data/hg18/bed/multizPrimate/frames
cd /cluster/data/hg18/bed/multizPrimate/frames
# dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql hg18 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# given this output, manually sorted for this display:
# calJac1: Tables: Mrnas: 3558
# canFam2: Tables: ensGene: 25568, refGene: 864, Mrnas: 367629
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 28497, refGene:
# 26066, Mrnas: 8354195
# mm9: Tables: ensGene: 43795, knownGene: 49409, mgcGenes: 22368, refGene:
# 21395, Mrnas: 5093221
# otoGar1: Tables: Mrnas: 0
# panTro2: Tables: ensGene: 32852, mgcGenes: 4, refGene: 26344, Mrnas: 6346
# ponAbe2: Tables: Mrnas: 0
# rheMac2: Tables: ensGene: 38561, refGene: 445, Mrnas: 61770
# rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5704, refGene: 14498,
# Mrnas: 872209
# tupBel1: Tables: Mrnas: 2364
# use knownGene for hg18, mm9
# use ensGene for rn4, canFam2, panTro2, rheMac2
# use Mrnas for calJac1, ponAbe2
# no annotations for
# tupBel1, otoGar1
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in rn4 canFam2 panTro2 rheMac2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# and finally, using the mrna tables
for DB in calJac1 ponAbe2
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
ssh kkstore06
cd /cluster/data/hg18/bed/multizPrimate/frames
time (cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout rn4 genes/rn4.gp.gz mm9 genes/mm9.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz canFam2 genes/canFam2.gp.gz calJac1 genes/calJac1.gp.gz | gzip > multizPrimate.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multizPrimate.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 2732 calJac1
# 190927 hg18
# 195671 panTro2
# 208637 rheMac2
# 230764 mm9
# 231026 rn4
# 248086 canFam2
# load the resulting file
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/frames
time nice -n +19 hgLoadMafFrames hg18 multizPrimateFrames \
multizPrimate.mafFrames.gz
# real 1m1.893s
# enable the trackDb entries:
# frames multizPrimateFrames
# irows on
#############################################################################
## Add CTD data (DONE - 2008-02-22, updated 2008-03-07, Fan )
mkdir /cluster/store11/gs.19/build36/bed/ctd021508
cd /cluster/store11/gs.19/build36/bed/ctd021508
# Download chem_gene_ixns.tsv from CTD site, http://ctd.mdibl.org/downloads/.
hgsql hg18 -e 'create database ctd'
hgsql ctd < ~/kent/src/hg/lib/chem_gene_ixns.sql
hgsql ctd -e 'load data local infile "chem_gene_ixns.tsv" into table chem_gene_ixns'
# create sorted data
hgsql hg18 -N -e \
'select x.geneSymbol, ChemicalId, count(distinct Interaction), ChemicalName from kgXref x, ctd.chem_gene_ixns c where x.geneSymbol=c.GeneSymbol group by x.geneSymbol, ChemicalId'|\
sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab
hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql
hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted'
#############################################################################
# CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan)
# Get HuGEgeneList.txt (list of HuGE genes from HuGE collaborator).
mkdir /cluster/store11/gs.19/build36/bed/HuGE
cd /cluster/store11/gs.19/build36/bed/HuGE
# put the file there.
cp HuGEgeneList.txt huge.tab
# get rid of header lines and blank lines at the end.
vi huge.tab
hgsql hg17 < ~/kent/src/hg/lib/huge.sql
hgsql hg18 < ~/kent/src/hg/lib/huge.sql
hgsql hg17 -e 'load data local infile "huge.tab" into table huge'
hgsql hg18 -e 'load data local infile "huge.tab" into table huge'
#############################################################################
#############################################################################
# ULTRACONSERVED TRACKS (LIFT FROM HG17) (DONE 2008-03-10, Andy)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir ultras
cd ultras/
echo "select chrom,chromStart,chromEnd,name from uc16" \
| hgsql hg17 | tail +2 > uc16Hg17.bed
echo "select chrom,chromStart,chromEnd,name from ux16" \
| hgsql hg17 | tail +2 > ux16Hg17.bed
liftOver uc16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
uc16Hg18.bed uc16Hg18.unmapped
liftOver ux16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
ux16Hg18.bed ux16Hg18.unmapped
hgLoadBed hg18 uc16 uc16Hg18.bed
hgLoadBed hg18 ux16 ux16Hg18.bed
#############################################################################
# TAJIMA'S D (LIFTOVER FROM HG17) (DONE 3/17/08 angie)
ssh hgwdev
mkdir /cluster/data/hg18/bed/tajdLiftOver
cd /cluster/data/hg18/bed/tajdLiftOver
# The submitted hg17 bedGraph custom tracks had 1-based start coords,
# so correct; also, the tajdSnp* tables used a sql command to set
# the rs names, so get the data from SQL not file:
set loChain = /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz
foreach pop (Ad Ed Xd)
zcat /cluster/data/hg17/bed/tajdpoly/20050603/hg17.tajd$pop.bedGraph.gz \
| awk '{print $1 "\t" $2-1 "\t" $3 "\t" $4}' \
| liftOver stdin -minMatch=0.5 \
$loChain hg18.tajd$pop.bedGraph hg17.tajd$pop.unmapped
hgsql hg17 -NBe "select chrom,chromStart,chromEnd,name from tajdSnp$pop" \
| liftOver stdin \
$loChain hg18.tajdSnp$pop.bed hg17.tajdSnp$pop.unmapped
end
foreach pop (Ad Ed Xd)
hgLoadBed hg18 tajdSnp$pop hg18.tajdSnp$pop.bed
hgLoadBed -bedGraph=4 hg18 tajd$pop hg18.tajd$pop.bedGraph
end
# The hg17 build had some fancy sql to find items overlapping with gaps,
# awk'd to make sql to delete those items. Use featureBits to find:
foreach pop (Ad Ed Xd)
featureBits hg18 -countGaps tajdSnp$pop gap -bed=tajdSnp$pop.gap.bed
featureBits hg18 -countGaps tajd$pop gap -bed=tajd$pop.gap.bed
end
wc -l *.gap.bed
# 8 tajdAd.gap.bed
# 8 tajdEd.gap.bed
# 0 tajdSnpAd.gap.bed
# 0 tajdSnpEd.gap.bed
# 0 tajdSnpXd.gap.bed
# 8 tajdXd.gap.bed
diff tajdAd.gap.bed tajdEd.gap.bed
diff tajdAd.gap.bed tajdXd.gap.bed
# No output from either diff -- same ranges.
awk '{print $3 - $2;}' tajdAd.gap.bed
#2605
#5000
#5000
#1000
#1199
#1359
#5000
#4100
# Actually, I disagree with removing the items that overlap those.
# As the description page says, each 10kb region is really the center
# of a 100kb window. Those windows will overlap gaps -- and if the
# center 10k of a window happens to overlap a gap, the whole window is
# no worse than a window that overlaps a gap 1/3 of the way in instead
# of 1/2.
#############################################################################
# ADD ALLEN BRAIN CORTEX LINK (DONE, 2/12/08, Fan)
mkdir -p /cluster/store11/gs.19/build36/bed/allenBrain
cd /cluster/store11/gs.19/build36/bed/allenBrain
# save list of genes from Allen Brain into file allenBrainGene.tab
hgsql hg18 < ~/src/hg/lib/allenBrainGene.sql
hgsql hg18 -e \
'load data local infile "allenBrainGene.tab" into table allenBrainGene'
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-10 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.equCab2.2008-04-10
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
# failed so had to rerun stuff manually then, continue thus:
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
# 0.157u 0.084s 1:21:15.25 0.0% 0+0k 0+0io 0pf+0w
ln -s blastz.equCab2.2008-04-10 /cluster/data/hg18/bed/blastz.equCab2
featureBits hg18 -chrom=chr1 chainEquCab2Link
# 133103986 bases of 224999719 (59.157%) in intersection
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
# re-running with fixed UnScaffolds business with fixed chr27:
mkdir /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cd /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ=blastz
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
# broken chain step for chr19, ran manually all day long on swarm, then
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-continue=chainMerge -verbose=2 -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1
XXX - running Tue Dec 2 15:42:18 PST 2008
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-continue=syntenicNet -syntenicNet -verbose=2 -workhorse=hgwdev \
-stop=syntenicNet -smallClusterHub=pk -bigClusterHub=pk \
-debug -chainMinScore=3000 -chainLinearGap=medium > syntenicNet.log 2>&1
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 4/18/08 angie - UPDATED 11/4/08)
ssh hgwdev
mkdir /cluster/data/hg18/bed/mrnaPcr
cd /cluster/data/hg18/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
# Initially I tried to use files from /cluster/data/hg18/bed/ucsc.10/:
# subColumn 10 /cluster/data/hg18/bed/ucsc.10/rnaToGenome.psl
# /cluster/data/hg18/bed/ucsc.10/txToAcc.tab ucscGenes.hg18.psl
# /cluster/data/hg18/bed/ucsc.10/ucscGenes.fa
# But the psl was not from exactly the same seq's as in the fa.
# Jim's suggestion: use sequenceForBed to get genomic-translated
# sequences, and then genePredToFakePsl. sequenceToBed must be
# run on hgwdev.
genePredToBed /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp > ucscGenes.bed
hgsql hg18 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
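# idSub.txt maps each kgId to kgId__geneSymbol, one tab-separated pair per
# line, e.g. (hypothetical identifiers):
#   uc001abc.1	uc001abc.1__MYGENE
head -3 idSub.txt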
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=hg18 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp \
| genePredToFakePsl hg18 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb::
cd /cluster/data/hg18/bed/mrnaPcr
hgLoadPsl hg18 kgTargetAli.psl
mkdir /gbdb/hg18/targetDb
ln -s /cluster/data/hg18/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg18/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg18/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg18KgNov08", "blat13", 17799, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg18KgNov08", "UCSC Genes", \
"hg18", "kgTargetAli", "", "", \
"/gbdb/hg18/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# MAKE PCR TARGET FOR SNAPSHOT OF ALL_MRNA (DONE 4/18/08 angie)
ssh hgwdev
# Load up native mRNA target tables:
hgsql hg18 -NBe 'select qName from all_mrna' \
| sort -u > mrnaAccs.txt
$HOME/kent/src/hg/makeDb/genbank/bin/$MACHTYPE/gbGetSeqs \
-gbRoot=/gbdb/genbank -accFile=mrnaAccs.txt \
-db=hg18 -native genbank mrna mrnaTargetSeq.fa
faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit
ln -s /cluster/data/hg18/bed/mrnaPcr/mrnaTargetSeq.2bit \
/gbdb/hg18/targetDb/
hgsql hg18 -e ' \
create table mrnaTargetAli select * from all_mrna; \
alter table mrnaTargetAli add index (tName,bin); \
alter table mrnaTargetAli add index (qName);'
rm *.tab
ssh kolossus
# Start up gfServer for mrnaTargetSeq:
cd /cluster/data/hg18/bed/mrnaPcr
faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit
gfServer -stepSize=5 -canStop start localhost 17991 mrnaTargetSeq.2bit &
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg18MrnaApr08", "kolossus", 17991, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg18MrnaApr08", "Human mRNAs", \
"hg18", "mrnaTargetAli", "", "", \
"/gbdb/hg18/targetDb/mrnaTargetSeq.2bit", 2, now(), "");'
#############################################################################
# Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
# import ccds database as described in ccds.txt
set db=hg18
set ncbiBld=36.3
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
############################################################################
# update vega genes to version 31 (v49 of Ensembl genes)
# (DONE - 2008-05-15 - Hiram)
mkdir /cluster/data/hg18/bed/vega31_49
cd /cluster/data/hg18/bed/vega31_49
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/gtf_file.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/CHANGELOG.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/catalog.txt"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/pep/Homo_sapiens.VEGA.apr.pep.tot.fa.gz"
# processing similar to the same processing for Ensembl genes,
# from /cluster/data/hg18/bed/ensGene.49/process/doProcess.csh
zcat gtf_file.gz \
| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| liftUp -type=.gtf stdout \
/cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry stdin \
| gzip > allGenes.gtf.gz
gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
| gzip > hg18.allGenes.gp.gz
/cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \
infoOut.txt > ensGtp.tab
genePredCheck -db=hg18 hg18.allGenes.gp.gz
# checked: 62418 failed: 0
zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=hg18 pseudo.gp
# checked: 5747 failed: 0
genePredCheck -db=hg18 not.pseudo.gp
# checked: 56671 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
############################################################################
# DGV V8 (DATABASE OF GENOMIC VARIANTS) (DONE 8/12/09 angie)
# DGV V7 done 3/11/09
# DGV V6 thin regions dropped 2/23/09
# DGV V6 with useless thin regions done 11/12/08
# DGV V5 done 7/16/08
# DGV V4 done 5/9/08
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/dgv.v8
cd /hive/data/genomes/hg18/bed/dgv.v8
wget --timestamping \
http://projects.tcag.ca/variation/downloads/variation.hg18.v8.aug.2009.txt
wget --timestamping \
http://projects.tcag.ca/variation/downloads/indel.hg18.v8.aug.2009.txt
# Save previous version for comparison:
hgsql hg18 -e 'rename table dgv to dgvV7'
# shuffle fields into bed8+
foreach f (*.v8.*.txt)
tail -n +2 $f \
| perl -wpe 'chomp; \
($id, $landmark, $chr, $start, $end, $varType, \
undef, undef, undef, $ref, $pmid, $method, \
$gain, $loss, undef, undef, $sample) = split("\t"); \
$id =~ s/^Variation_//; \
$start-- unless ($start == 0); \
$landmark = "" if ($landmark =~ /^chr.*\d\.\.\d/); \
$rgb = ($varType =~ /^Inv/) ? "100,0,100" : "0,200,0"; \
if ($gain ne "" || $loss ne "") { \
$gain =~ s/^(NA)? ?$/0/; $loss =~ s/^(NA)? ?$/0/; \
$rgb = "200,0,0" if ($gain > 0 && $loss == 0); \
$rgb = "0,0,200" if ($loss > 0 && $gain == 0); \
} \
$_ = join("\t", $chr, $start, $end, $id, 0, "+", \
$start, $start, $rgb, $landmark, $varType, \
$ref, $pmid, $method, $sample) . "\n";' \
> $f:r.bed
end
hgLoadBed hg18 dgv *.bed \
-sqlTable=$HOME/kent/src/hg/lib/dgv.sql -tab
#Loaded 49988 elements of size 15
hgsql hg18 -NBe 'select count(distinct(pubMedId)) from dgv;'
#35
############################################################################
# AGILENT CGH PROBES (AND MM8, RN4) (Done 2008-05-13, Andy)
ssh hgwdev
bash
cd /cluster/data/hg18/bed
mkdir agilentProbes
cd agilentProbes/
cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Human_CGH.zip .
# (agilent-provided zips)
# what a pain... this zipfile isn't unzippable using linux unzip.
# Bob's windows machine didn't do it either. Finally got it using the
# mac in Erich and Victoria's office. Extracting creates a directory
# called "Agilent_Human_CGH Folder"
cp Agilent_Human_CGH\ Folder/* .
rmdir Agilent_Human_CGH\ Folder/
tail +3 014693_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent244a.bed
tail +3 014698_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent105a.bed
tail +3 014950_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent44k.bed
for bed in *.bed; do hgLoadBed hg18 ${bed%.bed}{,.bed}; done
cd /cluster/data/mm8/bed
mkdir agilentCgh
cd agilentCgh/
cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Mouse_CGH.zip .
# (same crap as before with the zip file)
cp Agilent_Mouse_CGH\ Folder/* .
rmdir Agilent_Mouse_CGH\ Folder/
tail +3 014695_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed
tail +3 014699_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed
tail +3 015028_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh44k.bed
for bed in *.bed; do hgLoadBed mm8 ${bed%.bed}{,.bed}; done
cd /cluster/data/rn4/bed
mkdir agilentCgh
cd agilentCgh/
cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Rat_CGH.zip .
# (yep, again)
cp Agilent_Rat_CGH\ Folder/* .
rmdir Agilent_Rat_CGH\ Folder/
tail +3 015223_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed
tail +3 015235_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed
for bed in *.bed; do hgLoadBed rn4 ${bed%.bed}{,.bed}; done
############################################################################
# AGILENT HUMAN SUREPRINT G3 ARRAY PROBESETS (DONE 2008-12-09, Andy)
ssh hgwdev
cd /hive/data/hg18/bed/agilentProbes
wget --timestamping --user=microarray --password=<get-it-from-agilent> \
"ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*"
zcat 021365_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCnv2x400k stdin
zcat 021529_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh1x1m stdin
zcat 021850_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh2x400k stdin
zcat 021924_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh8x60k stdin
zcat 022060_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh4x180k stdin
############################################################################
# TWO MORE AGILENT HUMAN ARRAYS (DONE, 2009-07-28 Andy)
ssh hgwdev
cd /hive/data/hg18/bed/agilentProbes
wget --timestamping --user=microarray --password=<get-it-from-agilent> \
"ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*"
tail -n +3 022837_D_UCSCTrack_20090331.txt | hgLoadBed hg18 agilentCnv2x105k stdin
tail -n +3 023642_D_BED_20090528.bed | \
awk 'BEGIN{FS="\t";OFS="\t"}{print $0, "1000", "+";}' | \
hgLoadBed hg18 agilentHdd1x1m stdin
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# ILLUMINA WG-6 PROBES (2008-06-13 Andy)
# Download the Platform file from GEO here:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6884
# Click on "Download full table"
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy
mkdir illumina
cd illumina/
cp ~/GPL6884-5803.txt .
# Collect GIs for all the RNAs
# First download/install Biopython
wget http://biopython.org/DIST/biopython-1.45.tar.gz
tar xfz biopython-1.45.tar.gz
mkdir biopythonLibs
cd biopython-1.45/
python setup.py install --home=/san/sanVol1/scratch/andy/illumina/biopythonLibs
export PYTHONPATH=/san/sanVol1/scratch/andy/illumina/biopythonLibs
# Now get the RNAs
mkdir getRna grabbed
cd getRna/
tail +31 ../GPL6884-5803.txt | cut -f11 | sort | uniq > gis.txt
wc -l gis.txt
# 43338 gis.txt
split -d -l 100 -a 3 gis.txt gis-
rm gis.txt
cat << "EOF" > getSeqs.py
import Bio
from Bio import EUtils
from Bio.EUtils import HistoryClient
gis = open('gis.txt', 'r').readlines()
for i in range(len(gis)):
gis[i] = gis[i].rstrip('\n')
ids = EUtils.DBIds('nucleotide', gis)
client = HistoryClient.HistoryClient()
result = client.post(ids)
print result.efetch(retmode="text", rettype="fasta").read()
EOF
# << emacs
cat << "EOF" > getSeqs.sh
#!/bin/bash
for gi in gis-*; do
numGot="0";
attempt="1";
while [ $numGot -lt 100 ]; do
echo Getting $gi attempt $attempt;
cp $gi gis.txt;
fa=${gi}.fa
python getSeqs.py > $fa
numGot=`grep '>' $fa | wc -l`;
if [ $numGot = 100 ]; then
echo Got all for $gi
mv $fa ../grabbed/;
rm $gi
else
rm $fa;
sleep 10;
fi
attempt=$((attempt+1));
done
sleep 5;
done
EOF
# << emacs
chmod +x getSeqs.sh
./getSeqs.sh
# the script retries each download repeatedly until a full page arrives, but
# the last page has fewer than 100 GIs, so I ran the python program on that
# one by itself.
cat ../grabbed/* > probeRna.fa
rm -rf ../grabbed/
cd ../
# Now blat RNA to genome
mkdir -p blatRna/{splits,out}
cd blatRna/
faSplit sequence ../getRna/probeRna.fa 400 splits/rna-
ls -1 splits/* > splits.lst
cat << "EOF" > runBlat.sh
#!/bin/bash
cd -P .
fa=`basename $1`
chr=`basename $2 .nib`
split=`basename $1 .fa`
out=${split}.${chr}.psl
nibDir=/scratch/hg/hg18/bothMaskedNibs
tmpDir=/scratch/tmp/$out
mkdir $tmpDir
pushd $tmpDir
oldDir=`dirs +1`
cp ${oldDir}/$1 .
blat -noHead -ooc=/scratch/hg/hg18/11.ooc -out=psl ${nibDir}/$2 $fa $out
mkdir -p ${oldDir}/out/${chr}
cp $out ${oldDir}/out/${chr}/
popd
rm -rf $tmpDir
EOF
# << emacs
chmod +x runBlat.sh
cat << "EOF" > gsub
#LOOP
./runBlat.sh {check in line+ $(path1)} $(path2) {check out exists out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
EOF
# << emacs
ls -1 /cluster/data/hg18/nib > nib.lst
ssh pk
cd /san/sanVol1/scratch/andy/illumina/blatRna
gensub2 splits.lst nib.lst gsub spec
para create spec
para try
para push
para time
#17820 jobs in batch
#34457 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 17820 of 17820 jobs
#CPU time in finished jobs: 84196s 1403.26m 23.39h 0.97d 0.003 y
#IO & Wait Time: 48448s 807.47m 13.46h 0.56d 0.002 y
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 270s 4.50m 0.07h 0.00d
#Submission to last job: 1515s 25.25m 0.42h 0.02d
exit; # back to hgwdev
mkdir /tmp/andy
pslSort -nohead dirs allSorted.psl /tmp/andy out/*
rmdir /tmp/andy
pslReps -singleHit allSorted.psl single.ps{l,r}
# Blat probes against the RNAs
cd ../
mkdir -p blatProbes/out
cd blatProbes/
ln -s ../blatRna/splits .
ln -s ../blatRna/splits.lst .
ln -s ../blatRna/single.psl .
tail +31 ../GPL6884-5803.txt | cut -f1,11,18 | \
awk '{printf("%s\tgi|%s\t%s\n", $1, $2, $3);}' > probes.tab
cat << "EOF" > probeBlat.sh
#!/bin/bash
faFile=`basename $1`;
pslFile=${faFile%.fa}.psl
probeFile=$2;
rnaOnGenomePsl=$3;
tmpDir=/scratch/andy/`date +"%T" | tr ':' '_'`.$$
mkdir -p $tmpDir
cp $1 $2 $3 $tmpDir
pushd $tmpDir
for id in `grep '>' $faFile | sed 's/^>//'`; do
# make probe fa
echo $id
awk '{if ($2 == "'"$id"'") printf(">%s\n%s\n", $1, $3);}' $probeFile \
> probe.fa
# extract single RNA fa
faOneRecord $faFile $id > rna.fa
blat -noHead rna.fa probe.fa probeOnRna.psl
awk 'BEGIN{FS="\t";OFS="\t";}{if ($10 == "'"$id"'") print;}' \
$rnaOnGenomePsl > rnaOnGenome.psl
if [ `find . -size '0b' -type f | wc -l` == 0 ]; then
pslMap probeOnRna.psl rnaOnGenome.psl probeOnGenome.psl
cat probeOnGenome.psl >> $pslFile
fi
done
popd
cp $tmpDir/$pslFile $4
rm -rf $tmpDir
EOF
# << emacs
chmod +x probeBlat.sh
cat << "EOF" > gsub
#LOOP
./probeBlat.sh {check in line+ $(path1)} probes.tab single.psl {check out exists out/$(root1).psl}
#ENDLOOP
EOF
# << emacs
ssh pk
cd /san/sanVol1/scratch/andy/illumina/blatProbes
gensub2 splits.lst single gsub spec
para create spec
para try
para push
para time
#396 jobs in batch
#41977 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 396 of 396 jobs
#CPU time in finished jobs: 11101s 185.02m 3.08h 0.13d 0.000 y
#IO & Wait Time: 1361s 22.68m 0.38h 0.02d 0.000 y
#Average job time: 31s 0.52m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 121s 2.02m 0.03h 0.00d
#Submission to last job: 271s 4.52m 0.08h 0.00d
exit # back to hgwdev
mkdir /tmp/andy
pslSort -nohead dirs sorted.psl /tmp/andy out
# Load stuff up
pslToBed sorted.psl sorted.bed
cd ../
mkdir tables
cd tables/
cp ../blatProbes/sorted.{psl,bed} .
hgLoadPsl -table=illuminaProbesAlign hg18 sorted.psl
hgLoadBed hg18 illuminaProbes sorted.bed
cat << "EOF" > illuminaProbesSeq.sql
CREATE TABLE illuminaProbesSeq (
id varchar(40) NOT NULL,
seq varchar(55) NOT NULL,
PRIMARY KEY (id)
) TYPE=MyISAM;
EOF
# << emacs
cut -f1,3 ../blatProbes/probes.tab > illuminaProbesSeq.tab
hgLoadSqlTab hg18 illuminaProbesSeq{,.sql,.tab}
############################################################################
# dbSNP BUILD 129 (DONE 6/24/08 angie)
# 8/6/08: Regenerated snp129.sql with only those enum/set values that are
# actually used (except always keep unknown, the default) and reloaded snp129.
# No data change -- just the sql field definitions for enums and sets.
# 8/7/08: Swapped molType values cDNA <--> genomic in snp129 because they
# were swapped in the fasta headers.
# QA NOTE: used sudo mytouch to change timestamps on all downstream snp129
# tables (snp129Exceptions, snp129ExceptionDesc, snp129OrthoPt2Pa2Rm2,
# snp129Seq) to .2008-08-08 00:00:00 to avoid unwarranted joinerCheck
# time discrepancy errors. (8/8/08, brooke)
# Set up build directory
ssh kkstore06
mkdir -p /cluster/store3/dbSNP129/{human,shared}
ln -s /cluster/store3/dbSNP129 /cluster/data/dbSNP/129
# Get field encodings -- if there are changes or additions to the
# encoding of the corresponding fields, you might need to update
# snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
# hg/lib/snp125Ui.c).
cd /cluster/data/dbSNP/129/shared
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
# Here is another source -- it is not as up-to-date as the above, but
# our encodings (enums and sets in snp129.sql) are named more similar
# to those in the 2005 ASN:
# ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
########################## DOWNLOAD #############################
cd /cluster/data/dbSNP/129/human
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /cluster/data/dbSNP/129/human/data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b129_SNPContigLoc_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b129_SNPContigLocusId_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b129_ContigInfo_36_3.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b129_SNPMapInfo_36_3.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /cluster/data/dbSNP/129/human/schema
wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /cluster/data/dbSNP/129/human/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /cluster/data/dbSNP/129/human/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b129_SNP//; s/^b129_//; s/_36_3//; s/.bcp//;'`
mv $f $new
echo $new
end
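# worked example of the renaming above (not from the original log):
#   b129_SNPContigLoc_36_3.bcp.gz  -> ContigLoc.gz
#   b129_ContigInfo_36_3.bcp.gz    -> ContigInfo.gz
#   SNP.bcp.gz                     -> SNP.gz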
# Extract just the tables that we need from the NCBI msSQL table
# creation file, and get CREATE statements from
# human_9606_table.sql for our 5 tables
cd /cluster/data/dbSNP/129/human/schema
zcat human_9606_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b129_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \
s/b129_(SNP)?//; s/_36_3//; \
s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on kolossus or a small cluster machine (mysql5 is OK for this;
# in fact it's better than 4 because it has 'show warnings').
ssh kkr3u00
hgsql '' -e 'create database hg18snp129'
cd /cluster/data/dbSNP/129/human/schema
hgsql hg18snp129 < table.sql
cd ../data
# Avoid wasting space by excluding mappings to non-reference contigs:
foreach t (ContigInfo MapInfo)
zcat $t.gz \
| egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin
end
# Compare contig list between our ctgPos and reference contigs in
# ContigInfo:
ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \
| sort > /tmp/1
hgsql hg18snp129 -NBe 'select distinct(group_label) from ContigInfo'
# --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53
# (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
hgsql hg18snp129 -N -B -e 'select contig_acc from ContigInfo \
where group_label in \
("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff.
# Make sure there are no orient != 0 contigs among those selected.
hgsql hg18snp129 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
#0
# ContigLoc is huge, and we want just the reference contig mappings.
# So, based on the reference & haplo ctg_id values in ContigInfo,
# filter to get just the mappings for those contigs:
zcat ContigLoc.gz \
| awk '$3 <= 377 || $3 == 7015' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp129 ContigLoc placeholder stdin
foreach t (ContigLocusId SNP)
zcat $t.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin
end
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. 'show warnings' after a manual 'load data'
# complains about missing values (OK when e.g. position is not known).
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg18snp129 -e 'select count(*) from '$t
end
#ContigInfo: 379
#ContigLoc: 15835019 (before filtering: 46913472)
#ContigLocusId: 25496815
#MapInfo: 14845535 (before filtering: 44627804)
#SNP: 14708770
#################### EXTRACT INFO FROM NCBI TABLES ####################
mkdir -p /scratch/snp/129/human
cd /scratch/snp/129/human
time hgsql hg18snp129 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id);'
#0.002u 0.002s 2:14.79 0.0% 0+0k 0+0io 1pf+0w
# was ~12m on a run without trimming ContigLoc!
time hgsql hg18snp129 -e \
'alter table ContigInfo add index (group_label(9));'
#0.005u 0.000s 0:00.16 0.0% 0+0k 0+0io 1pf+0w
# For joining files by shared column, we need a unique identifier in
# that shared column. snp_id is not unique -- the same rsID can appear
# in both the reference assembly and on one of the others e.g. c6_COX.
# So concatenate the assembly identifier and snp_id to get hopefully
# unique label.
time hgsql hg18snp129 -NBe \
'select concat(ContigInfo.group_label, ".", snp_id), \
ContigInfo.contig_acc, asn_from, asn_to, \
loc_type, orientation, allele, phys_pos_from \
from ContigLoc, ContigInfo \
where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> ucscContigLoc.txt
# no time output because of the pipe... took 5 minutes.
# Are these IDs unique?
wc -l ucscContigLoc.txt
#15835019 ucscContigLoc.txt
awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
#14791529
# Nope. Find non-unique IDs:
awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head
grep ^c5_H2.10035195 ucscContigLoc.txt
#c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321
#c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774
#c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147
#c5_H2.10035195 NT_113801 933061 933061 2 1 C 69898428
# OK, they can be duplicated within the same contig. See if we can
# get by with anchoring everything to ucscContigLoc.txt. But everybody
# else better have unique IDs!
# SNP -> valid, avHet, avHetSE
# SNP has only snp_id as identifier, nothing relating to assembly.
hgsql hg18snp129 -NBe \
'select snp_id, validation_status, avg_heterozygosity, het_se \
from SNP;' \
| sort \
> ucscSNP.txt
# Check ID uniqueness:
wc -l ucscSNP.txt
#14708770 ucscSNP.txt
awk '{print $1;}' ucscSNP.txt | uniq | wc -l
#14708770
# ContigLocusId -> func
# ContigLocusId has only snp_id as an identifier (it gives one
# example contig if the SNP is on multiple contigs).
# The sort options and awk are to convert multiple entries with different
# function classes for the same SNP into one entry per SNP with a list
# of function classes.
hgsql hg18snp129 -NBe \
'select snp_id, fxn_class from ContigLocusId;' \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
# Check ID uniqueness:
wc -l ucscFunc.txt
#6136008 ucscFunc.txt
awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
#6136008
# MapInfo -> weight
# MapInfo needs assembly+snp_ids in order to have unique IDs.
time hgsql hg18snp129 -e \
'alter table MapInfo add index (assembly(9));'
#0.003u 0.003s 3:40.29 0.0% 0+0k 0+0io 1pf+0w
hgsql hg18snp129 -NBe \
'select concat(assembly, ".", snp_id), weight \
from MapInfo where assembly \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> weight.txt
# ~1 minute
# Check ID uniqueness:
wc -l weight.txt
#14791529 weight.txt
awk '{print $1;}' weight.txt | uniq | wc -l
#14791529
awk '{print $2;}' weight.txt | sort -n | uniq -c
# 40910 0
#14326127 1
# 157402 2
# 256608 3
# 10482 10
# SNPs w/weight 0 and 10 will be discarded later.
# fasta headers -> observed, molType, class
zcat /cluster/data/dbSNP/129/human/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort \
> ucscGnl.txt
# ~5m
wc -l ucscGnl.txt
#14708630 ucscGnl.txt
awk '{print $1;}' ucscGnl.txt | uniq | wc -l
#14708630
############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
# Join files by ID. Start with ContigLoc and MapInfo because they
# share the concatenated assembly+snp_id IDs.
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \
> ucscCL+w.txt
#28.334u 4.730s 1:43.47 31.9% 0+0k 0+0io 0pf+0w
wc -l ucscCL+w.txt
#15835019 ucscCL+w.txt
# Same as ucscContigLoc.txt above, good.
# Any missing weights?
grep MISSING ucscCL+w.txt | head
# No output, good.
# Join the files with SNP-only IDs.
time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \
> ucscG+S.txt
#17.375u 2.127s 0:47.40 41.1% 0+0k 0+0io 0pf+0w
wc -l ucscG+S.txt
#14708630 ucscG+S.txt
# Same as ucscGnl.txt -- somewhat less than ucscSNP.txt (14708770)...
grep MISSING ucscG+S.txt | wc -l
#0
time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
-t ' ' ucscG+S.txt ucscFunc.txt \
> ucscG+S+F.txt
#18.612u 2.334s 0:50.30 41.6% 0+0k 0+0io 0pf+0w
wc -l ucscG+S+F.txt
#14708630 ucscG+S+F.txt
grep MISSING ucscG+S+F.txt | wc -l
#8572703
# Not surprising -- ucscFunc.txt has only 6136008 lines.
expr 14708630 - 6136008
#8572622
# Not an exact match like in 128, but not too far off.
# Convert assembly+snp_id's to just snp_id (sorted) for final join.
perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \
| sort > ucscCL+w.snp_id.txt
awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l
#14626025
# Interesting... which snp_ids are missing from ContigLoc?
# (note: don't use sort -n | comm, it needs alphabetical sort!)
awk '{print $1;}' ucscCL+w.snp_id.txt | sort -u > /tmp/1
awk '{print $1;}' ucscGnl.txt | sort -u > /tmp/2
comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt
comm -23 /tmp/1 /tmp/2 > notInSNP.txt
wc -l notIn*.txt
# 83043 notInContigLoc.txt
# 438 notInSNP.txt
# notInContigLoc could simply mean that they weren't mapped, which is OK.
# notInSNP is more concerning.
#Not deleted!: 52789237, 55664014, 61749732,
#Invalid (not retired): 63751714, 63751902
# -- sent email to snp-admin at ncbi.
# Final join -- treat ContigLoc as authoritative (since it has coords).
# Arrange columns in same order as in the SNP table, with extras for
# checking at the end (phys_pos_from).
# chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
time join -a 1 -e MISSING -t ' ' \
-o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
ucscCL+w.snp_id.txt ucscG+S+F.txt \
> ucscNcbiSnp.ctg.txt
#41.204u 6.274s 1:05.99 71.9% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.txt
#15835019 ucscNcbiSnp.ctg.txt
grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
#8495168
# Lift the map contig coordinates to chrom coordinates (~2m);
time liftUp ucscNcbiSnp.bed \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.txt
#123.952u 7.587s 2:22.24 92.4% 0+0k 0+0io 5pf+0w
wc -l ucscNcbiSnp.bed
#15835019 ucscNcbiSnp.bed
# At this point, move back from /scratch to /cluster/data.
nice gzip ucscNcbiSnp.bed
cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/129/human/
cp -p notIn* /cluster/data/dbSNP/129/human/
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /cluster/data/dbSNP/129/human/
gunzip ucscNcbiSnp.bed.gz
# Re-ran this command 8/6/08 to get new snp129.sql that includes
# only those enum/set values that are actually used. No other output
# files changed.
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
snp129
# 8/7/08: added the awk command to unswap the molType values that
# were swapped in dbSNP 129 fasta headers:
# DO NOT USE THIS COMMAND NEXT TIME UNLESS NECESSARY AGAIN:
awk 'BEGIN{OFS="\t";} \
{if ($8 == "genomic") {$8 = "cDNA";} \
else if ($8 == "cDNA") {$8 = "genomic";} \
print;}' ucscNcbiSnp.bed \
| snpNcbiToUcsc stdin /cluster/data/hg18/hg18.2bit snp129
#spaces stripped from observed:
#chr12 5963395 5963395 rs41402545
#count of snps with weight 0 = 63507
#count of snps with weight 1 = 14375595
#count of snps with weight 2 = 325745
#count of snps with weight 3 = 924499
#count of snps with weight 10 = 145673
#Skipped 493 snp mappings due to errors -- see snp129Errors.bed
#210.328u 10.793s 4:04.99 90.2% 0+0k 0+0io 0pf+0w
# More skipped snps than in 128, but same reason:
cut -f 5 snp129Errors.bed | sort | uniq -c
# 493 Missing observed value (deleted SNP?).
cut -f 4 snp129Errors.bed | sort -u | sed -e 's/^rs//' > errIds.txt
comm -13 notInSNP.txt errIds.txt | wc -l
#0
# So those are a subset of the notInSNP.txt ids, good.
wc -l snp*
# 15625346 snp129.bed
# 22 snp129.sql
# 493 snp129Errors.bed
# 18 snp129ExceptionDesc.tab
# 2673142 snp129Exceptions.bed
# Make one big fasta file.
# It's a monster: 16G! Can we split by hashing rsId?
# NOTE FOR NEXT TIME: do this on the fileserver!
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp129.fa
# Check for duplicates.
grep ^\>rs snp129.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#14708630 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#14708630
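# Re: the note above about splitting the monster fasta by hashing rsId --
# not done for this build, but a minimal sketch (assumptions: plain awk,
# the same header-stripping perl as above, and an arbitrary 16 buckets)
# might look like this:
## zcat rs_fasta/rs_ch*.fas.gz \
##   | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
##   | awk '/^>rs/ {bucket = substr($1,4) % 16;} \
##          {print > ("snp129." bucket ".fa");}'
# Each record would land in snp129.<rsId mod 16>.fa, so a given rsId could
# be looked up without scanning the whole 16G file.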
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp129.fa
#114.516u 37.585s 3:13.58 78.5% 0+0k 0+0io 6pf+0w
cut -f 2,6 seq.tab > snp129Seq.tab
rm seq.tab
ssh hgwdev
# Load up main track tables.
cd /cluster/data/dbSNP/129/human
# Re-ran this command 8/6/08 to get new snp129.sql that includes
# only those enum/set values that are actually used. No data values
# changed. Removed -noSort because Brooke had spotted some entries
# sorted by chromEnd instead of chromStart.
# Re-ran 8/7/08 to pick up corrected molType column in snp129.bed.
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp129 -sqlTable=snp129.sql snp129.bed
#100.406u 22.673s 9:44.17 21.0% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125Exceptions.sql \
> snp129Exceptions.sql
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp129Exceptions -sqlTable=snp129Exceptions.sql \
snp129Exceptions.bed
#13.125u 1.383s 1:15.39 19.2% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp129ExceptionDesc.sql
hgLoadSqlTab hg18 snp129ExceptionDesc snp129ExceptionDesc.sql \
snp129ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp129Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp129Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /cluster/data/dbSNP/129/human/snp129.fa /gbdb/hg18/snp/snp129.fa
time nice hgLoadSqlTab hg18 snp129Seq snp129Seq.sql snp129Seq.tab
#0.007u 0.006s 3:06.83 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /cluster/data/dbSNP/129/human /cluster/data/hg18/bed/snp129
# Look at the breakdown of exception categories:
ssh kkr3u00
cd /cluster/data/dbSNP/129/human
cut -f 5 snp129Exceptions.bed | sort | uniq -c | sort -nr
#1580567 MultipleAlignments
# 628933 ObservedMismatch
# 387233 SingleClassLongerSpan
# 31425 SingleClassTriAllelic
# 13247 ObservedTooLong
# 11095 FlankMismatchGenomeShorter
# 10365 SingleClassZeroSpan
# 3345 SingleClassQuadAllelic
# 3310 FlankMismatchGenomeLonger
# 1397 DuplicateObserved
# 1250 MixedObserved
# 547 NamedDeletionZeroSpan
# 296 FlankMismatchGenomeEqual
# 93 ObservedContainsIupac
# 35 NamedInsertionNonzeroSpan
# 3 RefAlleleMismatch
# 1 ObservedWrongFormat
#######################################################################
# SNPMASKED SEQUENCE FOR SNP129 (DONE 7/1/08 angie)
ssh kolossus
mkdir /cluster/data/hg18/snp129Mask
cd /cluster/data/hg18/snp129Mask
# Identify rsIds with various problems -- we will exclude those.
# MultipleAlignments is kinda broad because anything that maps on
# both chrN and chrN_foo_hap1 will be excluded... similarly, extra
# matches on chrN_random might disqualify good matches on chrN.
# Well, erring on the side of caution is good.
awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
/cluster/data/dbSNP/129/human/snp129Exceptions.bed \
| sort -u \
> snp129ExcludeRsIds.txt
time grep -vFwf snp129ExcludeRsIds.txt \
/cluster/data/dbSNP/129/human/snp129.bed \
> snp129Cleaned.bed
#154.384u 12.550s 3:09.01 88.3% 0+0k 0+0io 0pf+0w
# Substitutions:
mkdir substitutions
snpMaskSingle snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin substitutions/
# Also, the warning below about total size just means that some chroms
# didn't have any SNPs that survived the stringent filtering.
#-- 113 warnings about differing observed at same base positions
#-- (113 distinct positions). saved as diffObserved.txt.
#-- Spot-checking, I see a case (chr1|1476801|1476802) where two SNPs
#-- should have been merged -- their flanking sequences were just from
#-- diff. strands. In another case (chr9|10122961|10122962), one of
#-- the mappings looks like an insertion instead of a substitution but
#-- the SNP's class is single, and one genomic base is mapped.
#-- IMO not serious to bother dbSNP about, they want to get on w/130.
#Masked 10637395 snps in 10637306 out of 3091528550 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723)
# Make sure that sizes are identical, first diffs are normal -> IUPAC,
# and first diffs' case is preserved:
foreach f (substitutions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ"
end
#(output OK)
foreach f (substitutions/chr*.fa)
echo $f:t:r
mv $f $f:r.subst.fa
gzip $f:r.subst.fa
end
# Insertions:
mkdir insertions
snpMaskAddInsertions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin insertions/
#Added 1617522 snps totaling 3251578 bases to 3085167749 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have increased relative to original:
foreach f (insertions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
else {die "ERROR: ins size $1 <= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (insertions/chr*.fa)
mv $f $f:r.ins.fa
gzip $f:r.ins.fa
end
# Deletions:
mkdir deletions
snpMaskCutDeletions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin deletions/
#Cut 1046324 snps totaling 2173708 bases from 3085167749 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have decreased relative to original:
foreach f (deletions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 < $2) {print "OK: del size $1 < $2\n";} \
else {die "ERROR: del size $1 >= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (deletions/chr*.fa)
mv $f $f:r.del.fa
gzip $f:r.del.fa
end
# Clean up and prepare for download:
gzip snp129Cleaned.bed
foreach d (substitutions insertions deletions)
pushd $d
md5sum *.gz > md5sum.txt
popd
end
# Make a README.txt in each subdir.
# Create download links on hgwdev.
# NOTE: Currently we offer only the substitutions.
# If we get any user requests, then maybe we can put the insertions
# and deletions out there.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask
ln -s /cluster/data/hg18/snp129Mask/substitutions/* \
/usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp129Mask and do this:
## foreach type (substitutions insertions deletions)
## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type
## ln -s /cluster/data/hg18/snp129Mask/$type/* \
## /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type/
## end
#######################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP129 (DONE 7/2/08 angie)
ssh kolossus
mkdir /cluster/data/hg18/bed/snp129Ortho
cd /cluster/data/hg18/bed/snp129Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/cluster/data/dbSNP/129/human/snp129Exceptions.bed \
| sort -u \
> snp129ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/cluster/data/dbSNP/129/human/snp129.bed \
| grep -vFwf snp129ExcludeIds.txt \
> snp129Simple.bed
# took ~3 minutes
wc -l snp129Simple.bed
#10633840 snp129Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp129Simple.bed > snp129ForLiftOver.bed
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp129ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /cluster/data/hg18/bed/snp129Ortho/run.liftOChimp
para make jobList
#Completed: 426 of 426 jobs
#CPU time in finished jobs: 83616s 1393.60m 23.23h 0.97d 0.003 y
#IO & Wait Time: 1501s 25.02m 0.42h 0.02d 0.000 y
#Average job time: 200s 3.33m 0.06h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 574s 9.57m 0.16h 0.01d
#Submission to last job: 939s 15.65m 0.26h 0.01d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \
\{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 426 of 426 jobs
#CPU time in finished jobs: 171875s 2864.58m 47.74h 1.99d 0.005 y
#IO & Wait Time: 1767s 29.45m 0.49h 0.02d 0.000 y
#Average job time: 408s 6.79m 0.11h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1268s 21.13m 0.35h 0.01d
#Submission to last job: 1743s 29.05m 0.48h 0.02d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 426 of 426 jobs
#CPU time in finished jobs: 6356s 105.93m 1.77h 0.07d 0.000 y
#IO & Wait Time: 1812s 30.21m 0.50h 0.02d 0.000 y
#Average job time: 19s 0.32m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 51s 0.85m 0.01h 0.00d
#Submission to last job: 221s 3.68m 0.06h 0.00d
ssh kolossus
cd /cluster/data/hg18/bed/snp129Ortho
# Note: the formerly inlined script getOrthoSeq.pl has been checked in
# as kent/src/hg/snp/snpLoad/getOrthoSeq.pl.
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
# Ditto for macaque and orangutan.
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
| sort > ponAbe2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
# The whole pipeline takes ~5-7 minutes each.
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
# 9909458 panTro2.orthoGlom.txt
# 9597270 ponAbe2.orthoGlom.txt
# 8467866 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp, orangutan and
# macaque allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom fields from each file, which are in the same order
# as the chimp, orangutan and macaque columns of snp129OrthoPt2Pa2Rm2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
| awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
> tmp.txt
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
tmp.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
$glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \
$o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp129OrthoPt2Pa2Rm2.bed
# took ~6 minutes.
wc -l snp129OrthoPt2Pa2Rm2.bed
#10325827 snp129OrthoPt2Pa2Rm2.bed
ssh hgwdev
cd /cluster/data/hg18/bed/snp129Ortho
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg18 snp129OrthoPt2Pa2Rm2 snp129OrthoPt2Pa2Rm2.bed
#Loaded 10325827 elements of size 22
#73.396u 10.864s 10:14.76 13.7% 0+0k 0+0io 0pf+0w
# Cleanup on fileserver:
cd /cluster/data/hg18/bed/snp129Ortho
nice gzip snp129Simple.bed snp129ExcludeIds.txt snp129ForLiftOver.bed
rm -r run*/split tmp.txt *.orthoGlom.txt
############################################################################
# dbSNP BUILD 130 (UPDATED 8/18/09 angie)
# Originally done 5/22/09.
# Set up build directory
mkdir -p /hive/data/outside/dbSNP/130/{human,shared}
# Get field encodings -- if there are changes or additions to the
# encoding of the corresponding fields, you might need to update
# snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
# hg/lib/snp125Ui.c).
cd /hive/data/outside/dbSNP/130/shared
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
# Here is another source -- it is not as up-to-date as the above, but
# our encodings (enums and sets in snp130.sql) are named more similarly
# to those in the 2005 ASN:
# ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
########################## DOWNLOAD #############################
cd /hive/data/outside/dbSNP/130/human
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /hive/data/outside/dbSNP/130/human/data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b130_SNPContigLoc_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b130_SNPContigLocusId_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b130_ContigInfo_36_3.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b130_SNPMapInfo_36_3.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /hive/data/outside/dbSNP/130/human/schema
wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /hive/data/outside/dbSNP/130/human/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
# Get 1000 Genomes IDs (unfortunately not in validation field as Sol suggested)
cd /hive/data/outside/dbSNP/130/human/data
wg -O 1000Genomes_README ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/ReadMe.txt
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/B130_1000G_RsClusterReport.txt.gz
zcat B130_1000G_RsClusterReport.txt.gz | wc -l
#7512342
# Make a uniquified list of only the numeric portion of the assigned rs IDs:
zcat B130_1000G_RsClusterReport.txt.gz \
| cut -d, -f 3 | sed -e 's/^rs//' \
| sort -nu > 1000GenomesRsIds.txt
wc -l 1000GenomesRsIds.txt
#5611085 1000GenomesRsIds.txt
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /hive/data/outside/dbSNP/130/human/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b130_SNP//; s/^b130_//; s/_36_3//; s/.bcp//;'`
mv $f $new
echo $new
end
cd /hive/data/outside/dbSNP/130/human/schema
zcat human_9606_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b130_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \
s/b130_(SNP)?//; s/_36_3//; \
s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
hgsql '' -e 'create database hg18snp130'
cd /hive/data/outside/dbSNP/130/human/schema
hgsql hg18snp130 < table.sql
cd ../data
# Avoid wasting space by excluding mappings to non-reference contigs:
foreach t (ContigInfo MapInfo)
zcat $t.gz \
| egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 $t placeholder stdin
end
#load of ContigInfo did not go as planned: 379 record(s), 0 row(s) skipped, 88 warning(s) loading /dev/stdin
# Checked ContigInfo visually, looks OK.
# Compare contig list between our ctgPos and reference contigs in
# ContigInfo:
ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \
| sort > /tmp/1
hgsql hg18snp130 -NBe 'select distinct(group_label) from ContigInfo'
# --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53
# (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
hgsql hg18snp130 -N -B -e 'select contig_acc from ContigInfo \
where group_label in \
("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff.
# Make sure there are no orient != 0 contigs among those selected.
hgsql hg18snp130 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
#0
# ContigLoc is huge, and we want just the reference contig mappings.
# So, based on the reference & haplo ctg_id values in ContigInfo,
# filter to get just the mappings for those contigs:
zcat ContigLoc.gz \
| awk '$3 <= 377 || $3 == 7015' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 ContigLoc placeholder stdin
zcat SNP.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 SNP placeholder stdin
zcat ContigLocusId.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 ContigLocusId placeholder stdin
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. In mysql5, 'show warnings' after a manual 'load data'
# complains about missing values (OK when e.g. position is not known).
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg18snp130 -e 'select count(*) from '$t
end
#ContigInfo: 379
#ContigLoc: 19189750
#ContigLocusId: 11790054
#MapInfo: 17928700
#SNP: 17804034
#################### EXTRACT INFO FROM NCBI TABLES ####################
# Glom each SNP's function codes together and load up a new hg18Snp130 table.
# Also extract NCBI's annotations of coding SNPs' effects on translation.
# ContigLocusId includes contig_acc and asn_{from,to} but those are not comprehensive!
# If a SNP has been mapped to multiple contigs, one is randomly selected, and if
# it is not a reference contig, we miss out if we restrict by contig. We may end
# up getting a few spurious functional annotations from mappings to other assemblies
# but them's the breaks.
cd /hive/data/outside/dbSNP/130/human
hgsql hg18snp130 -NBe 'select snp_id, mrna_acc, fxn_class, \
reading_frame, allele, residue, codon from ContigLocusId' \
> ncbiFuncAnnotations.txt
cut -f 1,3 ncbiFuncAnnotations.txt \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
wc -l ucscFunc.txt
#7344853 ucscFunc.txt
cat > ucscFunc.sql <<EOF
CREATE TABLE ucscFunc (
snp_id int NOT NULL ,
fxn_class varchar(255) NOT NULL ,
INDEX snp_id (snp_id)
);
EOF
hgLoadSqlTab hg18snp130 ucscFunc{,.sql,.txt}
# Extract observed allele, molType and snp class from the fasta headers (">gnl" lines)
zcat /hive/data/outside/dbSNP/130/human/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort -n \
> ucscGnl.txt
#407.555u 57.499s 4:32.89 170.4% 0+0k 0+0io 0pf+0w
wc -l ucscGnl.txt
#17804034 ucscGnl.txt
cut -f 1 ucscGnl.txt | uniq | wc -l
#17804034
cat > ucscGnl.sql <<EOF
CREATE TABLE ucscGnl (
snp_id int NOT NULL ,
observed varchar(255) NOT NULL,
molType varchar(255) NOT NULL,
class varchar(255) NULL ,
INDEX snp_id (snp_id)
);
EOF
hgLoadSqlTab hg18snp130 ucscGnl{,.sql,.txt}
# Add indices to tables for a big join (5 or 6 minutes):
hgsql hg18snp130 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id); \
alter table ContigLoc add index (snp_id); \
alter table SNP add index (snp_id); \
alter table MapInfo add index (snp_id);'
# Big leftie join to bring together all of the columns that we want in snp130,
# using all of the available joining info:
hgsql hg18snp130 -NBe \
'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
ug.observed, ug.molType, ug.class, \
s.validation_status, s.avg_heterozygosity, s.het_se, \
uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
FROM \
((((ContigLoc as cl JOIN ContigInfo as ci \
ON cl.ctg_id = ci.ctg_id and \
ci.group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")) \
LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id;' \
> ucscNcbiSnp.ctg.bed
#on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0% 0+0k 0+0io 0pf+0w
#on a very busy hgwdev: 78.753u 41.304s 30:19.77 6.5% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.bed
#19189750 ucscNcbiSnp.ctg.bed
liftUp ucscNcbiSnp.bed \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.bed
#116.027u 7.078s 2:27.93 83.2% 0+0k 0+0io 0pf+0w
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /hive/data/outside/dbSNP/130/human/
snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
-1000GenomesRsIds=data/1000GenomesRsIds.txt snp130
#spaces stripped from observed:
#chr12 5963395 5963395 rs41402545
#Line 8106609 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
#count of snps with weight 0 = 74828
#count of snps with weight 1 = 17254041
#count of snps with weight 2 = 389501
#count of snps with weight 3 = 1189989
#count of snps with weight 10 = 281391
#Found no errors.
#143.111u 14.313s 3:15.18 80.6% 0+0k 0+0io 0pf+0w
wc -l snp*
# 18833531 snp130.bed
# 22 snp130.sql
# 0 snp130Errors.bed
# 18 snp130ExceptionDesc.tab
# 2631563 snp130Exceptions.bed
# More SNPs but 0 errors and a bit fewer exceptions than snp129, cool!
# Make one big fasta file.
# It's a monster: 18G! Can we split by hashing rsId?
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp130.fa
# Check for duplicates.
grep ^\>rs snp130.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#17804034 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#17804034
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp130.fa
#107.748u 24.338s 6:58.50 31.5% 0+0k 0+0io 0pf+0w
cut -f 2,6 seq.tab > snp130Seq.tab
rm seq.tab
# Load up main track tables.
cd /hive/data/outside/dbSNP/130/human
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg18 snp130 -sqlTable=snp130.sql snp130.bed
#Loaded 18833531 elements of size 17
#107.246u 11.546s 10:49.23 18.2% 0+0k 0+0io 0pf+0w
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
snp130Exceptions.bed
#15.255u 1.257s 1:11.11 23.2% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp130/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp130ExceptionDesc.sql
hgLoadSqlTab hg18 snp130ExceptionDesc snp130ExceptionDesc.sql \
snp130ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp130Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp130Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa
time nice hgLoadSqlTab hg18 snp130Seq snp130Seq.sql snp130Seq.tab
#0.001u 0.004s 3:41.13 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130
# Look at the breakdown of exception categories:
cd /hive/data/outside/dbSNP/130/human
cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
#1960737 MultipleAlignments
# 519222 ObservedMismatch
# 38444 ObservedTooLong
# 32069 SingleClassTriAllelic
# 26351 FlankMismatchGenomeShorter
# 19089 SingleClassLongerSpan
# 15441 SingleClassZeroSpan
# 6583 FlankMismatchGenomeLonger
# 4108 DuplicateObserved
# 3627 SingleClassQuadAllelic
# 3473 MixedObserved
# 1369 NamedDeletionZeroSpan
# 547 FlankMismatchGenomeEqual
# 355 NamedInsertionNonzeroSpan
# 136 ObservedContainsIupac
# 8 ObservedWrongFormat
# 4 RefAlleleMismatch
#TODO: go through those above and send some bug reports to dbSNP.
# 8/18/09: dbSNP announced a correction to some functional class
# annotations (- strand mRNA -> swapped near-gene-3 and near-gene-5).
cd /hive/data/outside/dbSNP/130/human
# This is a list of affected rs IDs, genes, old funcs and new funcs:
wget ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database/organism_data/b130_update/b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt
wc -l b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt
#163147 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt
# The first 19 lines are the header.
# Use the info in that file to make a series of sql update commands:
tail -n +20 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt \
| perl -we '$fns[6]="intron"; $fns[13]="near-gene-3"; $fns[15]="near-gene-5"; \
$fns[41]="nonsense"; $fns[42]="missense"; \
$fns[53]="untranslated-3"; $fns[55]="untranslated-5"; \
while (<>) { \
($rs,undef,undef,$old,undef,$new) = split(","); \
$oldF = $fns[$old]; $newF = $fns[$new]; die if (!(defined $oldF && defined $newF)); \
print "UPDATE snp130 set func=(REPLACE(func,\"$oldF\",\"$newF\")) where name=\"rs$rs\";\n"; \
}' \
> snp130.func_13_15_fix.sql
wc -l snp130.func_13_15_fix.sql
#163128 snp130.func_13_15_fix.sql
hgsql hg18 < snp130.func_13_15_fix.sql
# The number of rows changed has to be smaller because some of those replacements
# are for annotations relative to a different assembly; we have func=unknown for
# those. E.g. rs437678.
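# (Hypothetical spot check, for illustration only -- not part of the build:
#  confirm that such a SNP still reports func=unknown after the updates.)
## hgsql hg18 -NBe 'select name, func from snp130 where name = "rs437678";'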
#######################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/15/09 angie)
mkdir /hive/data/genomes/hg18/bed/snp130Ortho
cd /hive/data/genomes/hg18/bed/snp130Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \
| sort -u \
> snp130ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/hive/data/outside/dbSNP/130/human/snp130.bed \
| grep -vFwf snp130ExcludeIds.txt \
> snp130Simple.bed
#182.396u 12.388s 2:10.30 149.4% 0+0k 0+0io 0pf+0w
wc -l snp130Simple.bed
#12141377 snp130Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp130Simple.bed > snp130ForLiftOver.bed
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp130ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /hive/data/genomes/hg18/bed/snp130Ortho/run.liftOChimp
para make jobList
#Completed: 486 of 486 jobs
#CPU time in finished jobs: 76679s 1277.99m 21.30h 0.89d 0.002 y
#IO & Wait Time: 1828s 30.46m 0.51h 0.02d 0.000 y
#Average job time: 162s 2.69m 0.04h 0.00d
#Longest finished job: 486s 8.10m 0.14h 0.01d
#Submission to last job: 513s 8.55m 0.14h 0.01d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \
\{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 486 of 486 jobs
#CPU time in finished jobs: 165378s 2756.31m 45.94h 1.91d 0.005 y
#IO & Wait Time: 2614s 43.56m 0.73h 0.03d 0.000 y
#Average job time: 346s 5.76m 0.10h 0.00d
#Longest finished job: 1017s 16.95m 0.28h 0.01d
#Submission to last job: 1051s 17.52m 0.29h 0.01d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 486 of 486 jobs
#CPU time in finished jobs: 4068s 67.80m 1.13h 0.05d 0.000 y
#IO & Wait Time: 1944s 32.40m 0.54h 0.02d 0.000 y
#Average job time: 12s 0.21m 0.00h 0.00d
#Longest finished job: 38s 0.63m 0.01h 0.00d
#Submission to last job: 126s 2.10m 0.04h 0.00d
cd /hive/data/genomes/hg18/bed/snp130Ortho
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
# Ditto for macaque and orangutan. Each command pipe takes ~5 minutes:
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
| sort > ponAbe2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
# 11318466 panTro2.orthoGlom.txt
# 10976821 ponAbe2.orthoGlom.txt
# 9702063 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp, orangutan and
# macaque allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom fields from each file, which are in the same order
# as the chimp, orangutan and macaque columns of snp130OrthoPt2Pa2Rm2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
| awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
> tmp.txt
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
tmp.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
$glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \
$o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
#300.357u 31.419s 4:33.00 121.5% 0+0k 0+0io 0pf+0w
wc -l snp130OrthoPt2Pa2Rm2.bed
#11797184 snp130OrthoPt2Pa2Rm2.bed
cd /hive/data/genomes/hg18/bed/snp130Ortho
hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg18 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
#Loaded 11797184 elements of size 22
#83.624u 9.627s 10:19.26 15.0% 0+0k 0+0io 0pf+0w
# Cleanup fileserver:
cd /hive/data/genomes/hg18/bed/snp130Ortho
nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed
rm -r run*/split tmp.txt *.orthoGlom.txt
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
############################################################################
# Nuclear Lamina (2008-06-16 mikep)
# "Domain organization of human chromosomes revealed by mapping of nuclear lamina interactions"
# We received these files from authors of Guelen et al. Nature 2008
# doi:10.1038/nature06947
ssh hgwdev
mkdir /cluster/data/hg18/bed/nuclearLamina
cd /cluster/data/hg18/bed/nuclearLamina/
mv /var/ftp/encode/LADs_080513.bed.bz2 .
mv /var/ftp/encode/LaminB1_080513.wig.bz2 .
mv /var/ftp/encode/LaminB1_LAD.md5sum .
# to check the md5sum we need to unzip it to its original name, done on the NFS host for this directory
df -h .
# Filesystem Size Used Avail Use% Mounted on
# kkstore02-10:/export/cluster/store11
# 1.8T 1.7T 94G 95% /cluster/store11
ssh kkstore02-10
cd /cluster/data/hg18/bed/nuclearLamina/
# check they are not too big to unzip, look ok
ll -h L*bz2
# -rw-r--r-- 1 mikep protein 13K Jun 10 00:58 LADs_080513.bed.bz2
# -rw-r--r-- 1 mikep protein 16M Jun 10 01:02 LaminB1_080513.wig.bz2
bunzip2 -dk L*bz2
md5sum -c LaminB1_LAD.md5sum
# all ok
# LADs_080513.bed: OK
# LaminB1_080513.wig: OK
# Description files were received via email and copied directly to this dir.
# Needed to convert from mac to unix due to ^M chars:
mac2unix L*.html
# Checked files looked OK, needed to remove HTML tags such as: DOCTYPE <HTML> <BODY> </BODY> </HTML>
vi L*.html
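# (For reference, a hypothetical non-interactive equivalent of that manual
#  edit, stripping only the tags listed above -- not what was actually run:)
## sed -i -e '/<!DOCTYPE/d' \
##   -e 's#<HTML>##g; s#</HTML>##g; s#<BODY>##g; s#</BODY>##g' L*.html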
# Now find the min/max/avg range of values from the wiggle file
egrep "^[0-9]" LaminB1_080513.wig |ave -col=2 stdin
# Q1 -0.509000
# median -0.000000
# Q3 0.514000
# average -0.041192
# min -6.602000
# max 5.678000
# count 2909178
# total -119833.701411
# standard deviation 1.037038
# Now load the tracks on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/nuclearLamina/
# First two lines are custom track header
tail +3 LADs_080513.bed | hgLoadBed hg18 laminB1Lads stdin
# wigEncode the .wig and .wib files from the supplied wig ascii file, and symlink the .wib file from /gbdb
wigEncode LaminB1_080513.wig laminB1.wig laminB1.wib
ln -s /cluster/data/hg18/bed/nuclearLamina/laminB1.wib /gbdb/hg18/wib/
# Converted LaminB1_080513.wig, upper limit 5.68, lower limit -6.60
hgLoadWiggle hg18 laminB1 laminB1.wig
rm bed.tab wiggle.tab
## Create the track definitions in hg18, copy them over (these are my paths), and do make
## Make entries for: bed = "track laminB1Lads" wiggle = "track laminB1"
ssh hgwdev
# vi /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/trackDb.ra
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1Lads.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.gif /cluster/home/mikep/browser/images/
# cd /cluster/home/mikep/kent/src/hg/makeDb/trackDb
# make
# Add wig ascii track (+readme) to goldenPath so it can be downloaded
mkdir /data/apache/htdocs/goldenPath/hg18/nuclearLamina
cp /cluster/data/hg18/bed/nuclearLamina/LaminB1_080513.wig.bz2 /data/apache/htdocs/goldenPath/hg18/nuclearLamina/hg18.laminB1.txt.bz2
cp /cluster/data/hg18/bed/nuclearLamina/goldenPath.README.txt /data/apache/htdocs/goldenPath/hg18/nuclearLamina/README.txt
# Add both tracks to all.joiner under section: tablesIgnored $hg
############################################################################
##### Positively Selected Genes (Pos Sel Genes) (braney - DONE - 2008-07-07)
# get SQL data (mammalPsg.sql) from Adam Siepel
# and Tomas Vinar (acs4@cornell.edu)
hgsql hg18 < mammalPsg.sql
echo "alter table mammalPsg add index (chrom(7));" | hgsql hg18
####################################################################
# UPDATE UNIGENE/SAGE TRACK (DONE - 2008-08-09 Fan)
# Create the uniGene alignments
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
cd uniGene
mkdir old
mv * old
set Version = 214
zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
ssh pk
set Version = 214
mv /san/sanvol1/scratch/hg18/uniGene /san/sanvol1/scratch/hg18/uniGene.old
mkdir /san/sanvol1/scratch/hg18/uniGene/
cd /san/sanvol1/scratch/hg18/uniGene/
cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa .
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
ls -1S \
/cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 59778s 996.30m 16.60h 0.69d 0.002 y
# IO & Wait Time: 208s 3.47m 0.06h 0.00d 0.000 y
# Average job time: 1224s 20.40m 0.34h 0.01d
# Longest finished job: 4549s 75.82m 1.26h 0.05d
# Submission to last job: 4653s 77.55m 1.29h 0.05d
# Estimated complete: 0s 0.00m 0.00h 0.00d
pslSort dirs raw.psl tmp psl >& pslSort.log
cat raw.psl|\
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg18.uniGene.pslReps.psl /dev/null
# Processed 553470 alignments
gzip raw.psl
gzip Hs.seq.uniq.simpleHeader.fa
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/uniGene
cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl .
hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl
# load the sequence with -replace option
hgLoadSeq -replace hg18 /gbdb/hg18/uniGene/Hs.seq.uniq.simpleHeader.fa
#############################################################################
# BLASTZ/CHAIN/NET dipOrd1 (DONE - 2008-10-22 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Kangaroo rat
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Kangaroo rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 881m33.829s
cat fb.hg18.chainDipOrd1Link.txt
# 786126212 bases of 2881515245 (27.282%) in intersection
# slight difficulty with the makeMd5sum.csh script, fixed in the source
# and completed the copy of the liftOver file, then continuing,
# with -syntenicNet:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -continue=cleanup -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
# real 86m15.646s
cd /cluster/data/hg18/bed/blastzDipOrd1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 dipOrd1 > rbest.log 2>&1 &
# real 327m0.719s
#############################################################################
# BLASTZ/CHAIN/NET pteVam1 (DONE - 2008-10-21,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Megabat
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Megabat
SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit
SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 595m14.168s
# some crashed jobs, finish the batch on pk manually, then, continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 151m54.924s
cat fb.hg18.chainPteVam1Link.txt
# 1311133709 bases of 2881515245 (45.502%) in intersection
cd /cluster/data/hg18/bed/blastzPteVam1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 pteVam1 > rbest.log 2>&1 &
# finish manually due to problems:
# real 286m25.330s
doRecipBest.pl -continue=download hg18 pteVam1 > rbestDownload.log 2>&1
#############################################################################
# BLASTZ/CHAIN/NET turTru1 (DONE - 2008-10-22 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Dolphin
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Dolphin
SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit
SEQ2_LEN=/scratch/data/turTru1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 702m54.490s
cat fb.hg18.chainTurTru1Link.txt
# 1398587431 bases of 2881515245 (48.537%) in intersection
# slight difficulty with the makeMd5sum.csh script, fixed in the source
# and completed the copy of the liftOver file, then continuing,
# with -syntenicNet:
cd /cluster/data/hg18/bed/blastzTurTru1.2008-10-21
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -continue=cleanup -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
# real 74m4.276s
time nice -n +19 doRecipBest.pl hg18 turTru1 > rbest.log 2>&1 &
# real 275m19.714s
#############################################################################
# BLASTZ/CHAIN/NET tarSyr1 (DONE - 2008-10-21,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Tarsier
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Tarsier
SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1518m42.776s
# recovered the batch on pk, then continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 526m45.582s
cat fb.hg18.chainTarSyr1Link.txt
# 1383104827 bases of 2881515245 (47.999%) in intersection
cd /cluster/data/hg18/bed/blastzTarSyr1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 tarSyr1 > rbest.log 2>&1 &
# failed, finishing manually
# real 155m48.855s
doRecipBest.pl -continue=download hg18 tarSyr1 > rbest.log 2>&1
#############################################################################
# BLASTZ/CHAIN/NET proCap1 (DONE - 2008-10-22,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Rock Hyrax
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Rock Hyrax
SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1654m44.904s
# finish lastz batch manually after script difficulties, then continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 227m41.045s
cat fb.hg18.chainProCap1Link.txt
# 891406629 bases of 2881515245 (30.935%) in intersection
cd /cluster/data/hg18/bed/blastzProCap1.2008-10-22
time nice -n +19 doRecipBest.pl hg18 proCap1 > rbest.log 2>&1 &
# real 232m9.789s
# failed
# running the last couple of commands to finish this off
# real 561m51.171s
doRecipBest.pl -continue=download hg18 proCap1 > rbestDownload.log 2>&1
#############################################################################
# BLASTZ/CHAIN/NET choHof1 (DONE - 2008-10-22,28 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Sloth
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Sloth
SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit
SEQ2_LEN=/scratch/data/choHof1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1649m6.606s
# finish lastz batch manually after script difficulties, then continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 276m1.827s
cat fb.hg18.chainChoHof1Link.txt
# 993065598 bases of 2881515245 (34.463%) in intersection
    cd /cluster/data/hg18/bed/blastzChoHof1.2008-10-22
time nice -n +19 doRecipBest.pl hg18 choHof1 > rbest.log 2>&1 &
# real 900m50.222s
#############################################################################
# BLASTZ/CHAIN/NET dasNov2 (DONE - 2008-10-22,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Armadillo
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Armadillo
SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1664m4.331s
# finish this batch manually after some code troubles, then:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 230m4.513s
    # something broke during chainSplit, try that manually
nice -n +19 chainSplit \
/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/chain \
/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/hg18.dasNov2.all.chain.gz
# no problem with that, continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=net -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 &
# real 206m54.072s
cd /cluster/data/hg18/bed/blastzDasNov2.2008-10-22
time nice -n +19 doRecipBest.pl hg18 dasNov2 > rbest.log 2>&1 &
# failed, finishing manually:
# real 680m1.703s
# the following takes an instant:
doRecipBest.pl -continue=download hg18 dasNov2 \
> rbestDownload.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET loxAfr2 (DONE - 2008-10-22,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# 1580m26.439s
    # problems with the batch due to scripting errors, finished the batch
    # manually
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 264m46.272s
cat fb.hg18.chainLoxAfr2Link.txt
# 1014404239 bases of 2881515245 (35.204%) in intersection
    cd /cluster/data/hg18/bed/blastzLoxAfr2.2008-10-22
time nice -n +19 doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 &
# real 622m17.655s
#############################################################################
# BLASTZ/CHAIN/NET vicPac1 (DONE - 2008-10-28,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28
cd /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28
cat << '_EOF_' > DEF
# Human vs. Alpaca
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Alpaca
SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit
SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm -syntenicNet \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 488m36.288s
cat fb.hg18.chainVicPac1Link.txt
# 1139088501 bases of 2881515245 (39.531%) in intersection
cd /cluster/data/hg18/bed/blastzVicPac1.2008-10-28
time nice -n +19 doRecipBest.pl hg18 vicPac1 > rbest.log 2>&1 &
# real 380m17.963s
#############################################################################
# BLASTZ/CHAIN/NET Gorilla gorGor1 (DONE - 2008-11-04,05 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04
cd /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04
cat << '_EOF_' > DEF
# Human vs. Gorilla
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Gorilla
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=284
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm -syntenicNet \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 644m45.816s
cat fb.hg18.chainGorGor1Link.txt
# 1778801556 bases of 2881515245 (61.731%) in intersection
cd /cluster/data/hg18/bed/blastzGorGor1.2008-11-04
time nice -n +19 doRecipBest.pl hg18 gorGor1 > rbest.log 2>&1 &
# real 171m42.585s
# failed, need to finish manually
cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain
# alter the doRecipBest.csh script to finiRecipBest.csh and run:
time ./finiRecipBest.csh > finiRecipBest.log 2>&1
# real 1177m37.534s
# then, continuing:
doRecipBest.pl -continue=download hg18 gorGor1
#############################################################################
# BLASTZ/CHAIN/NET ochPri2 (DONE braney 2008-07-30)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
cat << _EOF_ > DEF
# Human vs. Pika
BLASTZ_M=50
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pika
SEQ2_DIR=/san/sanvol1/scratch/ochPri2/ochPri2.2bit
SEQ2_LEN=/san/sanvol1/scratch/ochPri2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do.log 2>&1 &
# Completed: 654120 of 654120 jobs
# CPU time in finished jobs: 14082913s 234715.22m 3911.92h 163.00d 0.447 y
# IO & Wait Time: 2257180s 37619.67m 626.99h 26.12d 0.072 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest finished job: 292s 4.87m 0.08h 0.00d
# Submission to last job: 59396s 989.93m 16.50h 0.69d
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do2.log 2>&1 &
# memk cluster couldn't find san for chainRun, ran on pk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=chainMerge \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do3.log 2>&1 &
ln -s `pwd`/blastz.ochPri2.2008-07-29 /cluster/data/hg18/bed/blastz.ochPri2
featureBits hg18 chainOchPri2Link
# 806073890 bases of 2881515245 (27.974%) in intersection
cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 ochPri2 > rbest.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET myoLuc1 (DONE braney 2008-08-02)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
cat << _EOF_ > DEF
# Human vs. Microbat
BLASTZ_M=50
BLASTZ_T=2
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Microbat
SEQ2_DIR=/san/sanvol1/scratch/myoLuc1/myoLuc1.2bit
SEQ2_LEN=/san/sanvol1/scratch/myoLuc1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do.log 2>&1 &
# Completed: 98879 of 99144 jobs
# Crashed: 56 jobs
# Other count: 209 jobs
# CPU time in finished jobs: 2327505s 38791.75m 646.53h 26.94d 0.074 y
# IO & Wait Time: 340164s 5669.40m 94.49h 3.94d 0.011 y
# Average job time: 27s 0.45m 0.01h 0.00d
# Longest finished job: 1034s 17.23m 0.29h 0.01d
# Submission to last job: 56968s 949.47m 15.82h 0.66d
# do remaining jobs on kolossus
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do2.log 2>&1 &
ln -s `pwd`/blastz.myoLuc1.2008-07-31 /cluster/data/hg18/bed/blastz.myoLuc1
featureBits hg18 chainMyoLuc1Link
# 952177725 bases of 2881515245 (33.044%) in intersection
cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 myoLuc1 > rbest.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET loxAfr2 (not done)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
cat << _EOF_ > DEF
# Human vs. Elephant
BLASTZ_M=50
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Elephant
SEQ2_DIR=/san/sanvol1/scratch/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do.log 2>&1 &
# had to run some jobs on memk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do2.log 2>&1 &
# netChainSubset barfed with memory error (skipped over chains)
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do3.log 2>&1 &
ln -s `pwd`/blastz.loxAfr2.2008-08-01 /cluster/data/hg18/bed/blastz.loxAfr2
featureBits hg18 chainLoxAfr2Link
# 1025499138 bases of 2881515245 (35.589%) in intersection
cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 &
#############################################################################
# BUILD snpArrayIllumina1M SUB-TRACK (DONE 8/4/08, Fan)
# Received raw data file Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv
# from Illumina, Luana Galver (lgalver at illumina.com).
mkdir -p /cluster/store11/gs.19/build36/bed/snp/illumina/1M
cd /cluster/store11/gs.19/build36/bed/snp/illumina/1M
cat Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv |\
sed -e 's/,/\t/g' >1M.tab
    hgsql hg18 < ~/src/hg/lib/snpArrayIllumina1MRaw.sql
hgsql hg18 -e 'load data local infile "1M.tab" into table snpArrayIllumina1MRaw'
~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIllumina1MRaw snp129
# The illuminaLookup1M program generates two files:
#
# illuminaLookup.out contains all Illumina 1M probes found in snp129
# illuminaLookup.err contains all Illumina 1M probes not found in snp129
mv illuminaLookup.out illuminaLookup1Ma.out
cut -f 1 illuminaLookup.err >j.1
cat j.1 |sed -e 's/chrMt/chrM/' |\
sed -e 's/XY/X/' >j.chr
cut -f 2-5 illuminaLookup.err >j.2
cut -f 6 illuminaLookup.err >j.3
cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand
cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed
paste j.chr j.2 j.strand j.observed >illuminaLookup1Mb.out
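    # Equivalently, the cut/sed/paste steps above amount to a single awk pass
    # (an illustrative sketch, assuming -- as the commands above imply -- that
    # column 1 is the chromosome with chrMt/XY spellings, columns 2-5 pass
    # through unchanged, column 6 is F/R strand, and column 7 holds the
    # bracketed observed alleles):
    #   awk 'BEGIN{FS=OFS="\t"} { chrom=$1; sub("chrMt","chrM",chrom);
    #        sub("XY","X",chrom); strand=($6=="F") ? "+" : "-";
    #        obs=$7; gsub(/[][]/,"",obs);
    #        print chrom, $2, $3, $4, $5, strand, obs }' \
    #       illuminaLookup.err > illuminaLookup1Mb.out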
# combine two parts
cat illuminaLookup1Ma.out illuminaLookup1Mb.out >snpArrayIllumina1M.tab
# load the table
hgLoadBed hg18 snpArrayIllumina1M snpArrayIllumina1M.tab -tab -sqlTable=snpArrayIllumina1M.sql
#############################################################################
# BLASTZ/CHAIN/NET micMur1 (DONE braney 2008-08-04 )
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.micMur1.2008-08-03
cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03
cat << _EOF_ > DEF
# Human vs. Mouse lemur
BLASTZ_M=50
BLASTZ_T=2
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Mouse lemur
SEQ2_DIR=/san/sanvol1/scratch/micMur1/micMur1.2bit
SEQ2_LEN=/san/sanvol1/scratch/micMur1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.micMur1.2008-08-03
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do.log 2>&1 &
# did remaining jobs on memk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do2.log 2>&1 &
ssh hgwdev
cd /cluster/data/hg18/bed
ln -s `pwd`/blastz.micMur1.2008-08-03 /cluster/data/hg18/bed/blastz.micMur1
featureBits hg18 chainMicMur1Link
# 1338330504 bases of 2881515245 (46.445%) in intersection
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 micMur1 > rbest.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET speTri1 (DONE braney 2008-08-05)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.speTri1.2008-08-04
cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04
cat << _EOF_ > DEF
# Human vs. Squirrel
BLASTZ_M=50
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Squirrel
SEQ2_DIR=/san/sanvol1/scratch/speTri1/speTri1.2bit
SEQ2_LEN=/san/sanvol1/scratch/speTri1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.speTri1.2008-08-04
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do.log 2>&1 &
# did crashed jobs on memk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do2.log 2>&1 &
# had to run netChains.csh by hand due to PATH problem
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do3.log 2>&1 &
ssh hgwdev
cd /cluster/data/hg18/bed
ln -s `pwd`/blastz.speTri1.2008-08-04 /cluster/data/hg18/bed/blastz.speTri1
featureBits hg18 chainSpeTri1Link
# 1032377454 bases of 2881515245 (35.828%) in intersection
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04
nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 speTri1 > rbest.log 2>&1 &
#######################################################
## 44-way multiz (braney working....
mkdir /cluster/data/hg18/bed/multiz44way
cd /cluster/data/hg18/bed/multiz44way
cp /cluster/data/mm9/bed/multiz30way/mm9.guess.30way.nh .
# get mammal tree from Michele Clamp (clamp.nh)
# that I re-rooted
#######################################################
# UW nucleosome occupancy predictions (2008-08-13 markd)
# update due to chr3 being truncated (2009-05-12 markd)
# contact William Stafford Noble <noble@gs.washington.edu>
# obtain data:
mkdir -p /cluster/data/hg18/bed/uwNucOcc
cd /cluster/data/hg18/bed/uwNucOcc
    wget http://USER:PASS@noble.gs.washington.edu/~noble/proj/dennis/results/2008-08-11/ucsc.tgz
mkdir wig
cd wig
tar -zxf ../ucsc.tgz
cd ..
rm ucsc.tgz
# encode and load wiggles
ssh kkstore02
    mkdir /cluster/data/hg18/bed/uwNucOcc/wib
    cd /cluster/data/hg18/bed/uwNucOcc/wib
zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
# Converted stdin, upper limit 9.88, lower limit -5.19
zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
# Converted stdin, upper limit 8.26, lower limit -9.68
zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
# Converted stdin, upper limit 5.05, lower limit -9.86
# link-n-load
ssh hgwdev
    cd /cluster/data/hg18/bed/uwNucOcc/wib
ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccA375.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccDennis.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccMec.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig
rm wiggle.tab
# noble lab supplied update due to chr3 being truncated (2009-05-12 markd)
cd /cluster/data/hg18/bed/uwNucOcc
mkdir bad
mv wig/*/*.chr3.hg18.wig.gz bad/
mv wib bad/
wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/a375/a375.chr3.hg18.wig.gz
wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/mec/mec.chr3.hg18.wig.gz
wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/dennis/dennis.chr3.hg18.wig.gz
mv dennis.chr3.hg18.wig.gz wig/dennis/
mv mec.chr3.hg18.wig.gz wig/mec/
mv a375.chr3.hg18.wig.gz wig/a375/
    # the old wib dir was moved to bad/ above, so recreate it
    mkdir /cluster/data/hg18/bed/uwNucOcc/wib
    cd /cluster/data/hg18/bed/uwNucOcc/wib
    zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
    zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
    zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig
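    # Illustrative sanity check (not part of the recorded load) that the
    # refreshed chr3 data is now in the table; assumes the standard wiggle
    # table schema produced by hgLoadWiggle, which includes a chrom column:
    #   hgsql hg18 -e 'select count(*) from uwNucOccA375 where chrom="chr3"'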
#########################################################################
# BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-19,25 - Hiram)
ssh kkstore02
screen # use a screen to manage this longish running job
mkdir /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
cd /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
cat << '_EOF_' > DEF
# Human vs. Medaka
BLASTZ=/cluster/bin/penn/x86_64/lastz
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg18, randoms complete, as they are, no contig confusion
SEQ1_DIR=/scratch/data/hg18/hg18.2bit
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzOryLat2.2008-08-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
cat fb.hg18.chainOryLat2Link.txt
# 52713428 bases of 2881515245 (1.829%) in intersection
cd /cluster/data/hg18/bed
ln -s blastzOryLat2.2008-08-19 blastz.oryLat2
# That is OK, now for the swap:
mkdir /cluster/data/oryLat2/bed/blastz.hg18.swap
cd /cluster/data/oryLat2/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/hg18/bed/blastzOryLat2.2008-08-19/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-bigClusterHub=pk > swap.log 2>&1 &
# real 17m9.675s
cat fb.oryLat2.chainHg18Link.txt
# 46961822 bases of 700386597 (6.705%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-10)
ssh swarm
screen
mkdir /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09
cd /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09
cat << _EOF_ > DEF
# human vs. zebra finch
BLASTZ_M=50
# Copied settings from human vs galGal3
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18
SEQ1_DIR=/scratch/data/hg18/hg18.2bit
# SEQ1_SMSK=/hive/data/genomes/hg18/linSpecRep/notInChicken
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
# one chrom at a time
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Zebra finch taeGut1
SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
# SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit
# SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg18/bed/blastz.taeGut1.2008-09-09
_EOF_
# << emacs
doBlastzChainNet.pl -syntenicNet \
-bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=loose \
-smallClusterHub=swarm DEF -workhorse=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1
# Completed: 14910 of 14910 jobs
# CPU time in finished jobs: 2744737s 45745.62m 762.43h 31.77d 0.087 y
# IO & Wait Time: 1493361s 24889.34m 414.82h 17.28d 0.047 y
# Average job time: 284s 4.74m 0.08h 0.00d
# Longest finished job: 3678s 61.30m 1.02h 0.04d
# Submission to last job: 6687s 111.45m 1.86h 0.08d
cd /cluster/data/hg18/bed
rm -f blastz.taeGut1
ln -s blastz.taeGut1.2008-09-09 /cluster/data/hg18/bed/blastz.taeGut1
################################################################
# HUMAN FETAL BRAIN EXON ARRAYS (YALE) (Andy)
ssh hgwdev
bash
mkdir /hive/data/genomes/hg18/bed/yaleMicroarrays
cd /hive/data/genomes/hg18/bed/yaleMicroarrays
cp /var/ftp/encode/Sestan_fetal_brain_exon_arrays.rar .
rar e Sestan_fetal_brain_exon_arrays.rar
tail +2 18_19_21_23_full_SLR_converted.txt | grep -v "\-\-\-" > sestanBrainAtlas.bed
hgLoadBed hg18 sestanBrainAtlas{,.bed}
# just a little array name organization
head -n1 18_19_21_23_full_SLR_converted.txt | \
sed 's/.*expNames=\"//;s/\"\ name=.*//;s/\.CEL//g' | \
tr ',' '\n' | sed '/^$/d' | grep -n '' | tr ':' '\t' | \
awk 'BEGIN{OFS="\t";}{$1=$1 - 1; print;}' \
> arrays.txt
awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \
arrays.txt > sestanBrainAtlasExps.tab
ln -s ~/kent/src/hg/lib/expRecord.sql sestanBrainAtlasExps.sql
hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab}
    # Removed some of the arrays manually, as follows.
    # The bed file had stray carriage returns, so normalize line endings first:
tr '\r' '\n' < sestanBrainAtlas.bed | sed '/^$/d; s/$/,/' > ses.bed
cut -f1-14 ses.bed | \
awk 'BEGIN{FS="\t";OFS="\t"}{$2 = $2 - 1; $13 = $13 - 8; print;}' | \
sed 's/95,96,97,98,99,100,101,102//' > ses14.bed
cut -f15 ses.bed | cut -d',' -f1-74,77-92,99- > ses15.bed
paste ses14.bed ses15.bed > newSestan.bed
hgLoadBed hg18 sestanBrainAtlas newSestan.bed
ln -s ~/kent/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra
grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^names" | sed 's/names //' | tr ',' '\n' | sed '/^$/d' > namesCol.txt
grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^expIds" | sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' >expCol.txt
paste expCol.txt namesCol.txt > arrays.txt
awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \
arrays.txt > sestanBrainAtlasExps.tab
hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab}
ssh kolossus
################################################################
# HUMAN TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)
#
# AffyExonTissue Step 1: download exon array coordinate data from Affy
# and extract coordinates Download HuEx-1_0-st-v2 Annotations, Full,
# Hg18/Build 36 gff. The file is available at
# http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st
# and the download requires logging in to NetAffx (free, registration
# required) Uncompress the GFF files. Parse out key fields with the
# script below, generating hg18.affy.exon.coords.tab
#
#---------
#!/usr/bin/env perl
=pod
=head1 NAME
parseGffArrayData.pl
=head1 SYNOPSIS
cat *gff |parseGffArrayData.pl > array.coords.tab
=head1 DESCRIPTION
Parses probeset coordinates out of the Affy design data
=cut
{
use strict;
use Getopt::Long;
use GFF;
use GFF::GeneFeature;
use FileHandle;
print "chr\tstart\tend\tID\tscore\tstrand\n";
while (my $line = <>) {
chomp $line;
my @tokens = split /\s/, $line;
if ($tokens[2] eq "probeset") {
my $gffFeature = new GFF::GeneFeature;
my $gffData = $gffFeature->new_from_line($line);
my $probesetId = $gffData->group_value('probeset_id');
my $probesetLevel = $gffData->group_value('level');
my $bounded = $gffData->group_value('bounded');
my $cds = $gffData->group_value('cds');
my $score;
if ($probesetLevel eq "core") {
$score = 900;
} elsif ($probesetLevel eq "extended") {
$score = 500;
} else {
$score = 200;
}
if ($bounded) { $score -= 200; }
if ($cds) { $score += 100; }
if ($score < 100) { $score = 100; }
print($gffData->seqname(), "\t", $gffData->start(), "\t",
$gffData->end(), "\t", $probesetId, "\t$score\t",
$gffData->strand(), "\n");
}
}
}
#-------
#
# AffyExonTissue Step 2: download tissue data from Affy, generate bed15 file
#
# Download Human Exon 1.0 ST APT results from
# http://www.affymetrix.com/support/technical/sample_data/exon_array_data.affx
# (requires free registration and login, as above)
# Uncompress, and get rid of the undesired tissue mixture columns.
cut -f 1-34 \
< apt-probeset-summarize-results-exon/quant-norm.pm-gcbg.plier.summary.txt \
> quant-norm.pm-gcbg.plier.nomix.summary.txt
#
# Generate a bed15 file using the command below, and script below that.
# For the purposes of generating a track, ignore the first line.
#
arrayToBed15.py \
--coordinates hg18.affy.exon.coords.tab \
--plier quant-norm.pm-gcbg.plier.nomix.summary.txt \
--name "humanExon" \
--groups "breast,breast,breast,cerebellum,cerebellum,cerebellum,heart,heart,heart,kidney,kidney,kidney,liver,liver,liver,muscle,muscle,muscle,pancreas,pancreas,pancreas,prostate,prostate,prostate,spleen,spleen,spleen,testes,testes,testes,thyroid,thyroid,thyroid" \
|tail -n +2 > human.exon.headless.bed15
#---
#!/usr/bin/python
from optparse import OptionParser
import math
import re
#
# get the genomic probeset coordinates
#
def parseProbesetCoordinates(coordinatesFilename):
"""Build a dictionary of coordinates from a tab-delmited file"""
coordinateData = {}
coordinatesFileHandle = open(coordinatesFilename)
coordinatesFileHandle.readline() # skip the header line
for line in coordinatesFileHandle:
line = line.rstrip();
tokens = line.split('\t')
id = tokens[3]
coordinateData[id] = tokens;
return(coordinateData)
def median(numbers):
"""Sort the input list and return the middle element."""
nn = len(numbers)
copy = numbers[:] # So that "numbers" keeps its original order
copy.sort()
if nn & 1: # There is an odd number of elements
return copy[nn // 2]
else:
return (copy[nn // 2] + copy[nn // 2 - 1]) / 2
def medianOfMedians(experimentNames, experimentValues):
"""Given replicated values, find the median of the replicate medians."""
# Create a dictionary to sort the values by experiment set
replicates = {}
#
    # Group the experiments into replicate sets by experiment names.
# This assumes that experiments in the same replicate set have the
# same name.
#
    for ii in range(0, len(experimentNames)):
if not replicates.has_key(experimentNames[ii]):
replicates[experimentNames[ii]] = [experimentValues[ii]]
else :
replicates[experimentNames[ii]].append(experimentValues[ii])
# Make a list containing the median value of each replicate set.
medians = list()
for replicateSet in replicates.keys() :
values = replicates[replicateSet]
thisMedian = median(values)
medians.append(thisMedian)
# Now get the median value of the median list
medianValue = median(medians)
return(medianValue)
def printHeaderData(experimentList, trackName):
"""Print a header line for a bed15 file"""
expNames = ",".join(experimentList)
print "track type=\"array\" expScale=3 expStep=0.5 ",
print " name=\"" + str(trackName) + "\"", \
" description=\"Microarray custom track\"",
print " expNames=" "\"" + expNames + "\""
def printPlierResults(resultsLine, experimentGroups, probesetCoordinates):
"""median-center a line of expression results, print in bed15 format"""
background = 10
    plierResultsLine = resultsLine.split('\t')
probesetId = plierResultsLine[0]
del plierResultsLine[0]
if probesetCoordinates.has_key(probesetId):
coordinates = probesetCoordinates[probesetId]
#
# Given coordinate data (chr start end ID score strand)
# and given experimental data (ID exp1 exp2 exp3 ... expN)
# Print as follows:
# 1. Basic bedfile stuff: chromosome, start, end, name, score,
# strand, thickStart (=start), thickEnd (=end), 0, blocks (=1),
# blocklengths (=end-start+1,), blockstarts (=0,)
#
start = int(coordinates[1]) - 1
end = int(coordinates[2])
length = end - start
print str(coordinates[0]) + "\t" + str(start) + "\t" \
+ str(end) + "\t" + str(probesetId) + "\t", \
coordinates[4], "\t", coordinates[5], "\t", start, "\t", \
end, "\t0\t1\t", \
str(length) + ",\t", "0,\t",
#
# Continue with microarray-specific stuff:
# - experiment count
# - comma-separated list of experiment IDs (0 .. experimentCount)
# - comma-separated list of experiment scores (log(result)-log(median))
#
experimentCount = len(plierResultsLine)
experimentValues = list()
for value in plierResultsLine:
experimentValues.append(float(value))
medianValue = medianOfMedians(experimentGroups, experimentValues)
logMedian = math.log(medianValue+background)
valuesStrings = list()
for thisValue in experimentValues:
thisScore = math.log(thisValue+background) - logMedian
valuesStrings.append(str(thisScore))
experimentScoreString = ",".join(valuesStrings)
ids = list()
for ii in range(0, experimentCount):
ids.append(str(ii))
experimentIdString = ",".join(ids)
print experimentCount, "\t", experimentIdString, "\t", \
experimentScoreString
return
parser = OptionParser()
parser.add_option("--coordinates", dest="coordinatesFile")
parser.add_option("--plier", dest="plierResultsFile")
parser.add_option("--name", dest="trackName")
parser.add_option("--groups", dest="experimentGroups")
(parameters, args) = parser.parse_args()
experimentGroups = parameters.experimentGroups.split(",")
probesetCoordinates = parseProbesetCoordinates(parameters.coordinatesFile)
plierResults = open(parameters.plierResultsFile)
for line in plierResults:
line = line.rstrip()
if (re.search("^#", line)) :
continue
elif (re.search("^probeset_id", line)) :
printHeaderData(experimentGroups, parameters.trackName)
else :
printPlierResults(line, experimentGroups, probesetCoordinates)
#---
#
# AffyExonTissue Step 3: set up a browser track from the bed15 file
# created offline: trackDb.affyExonTissues.ra,
# affyExonTissues.html,
# microarrayGroups.affyExonTissues.ra
#
cat $KENT/src/hg/makeDb/trackDb/human/trackDb.ra trackDb.affyExonTissues.ra \
> trackDb.new.ra
cp trackDb.new.ra $KENT/src/hg/makeDb/trackDb/human/trackDb.ra
cp affyExonTissues.html $KENT/src/hg/makeDb/trackDb/human
cat $KENT/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra \
microarrayGroups.affyExonTissues.ra > microarrayGroups.new.ra
hgLoadBed hg18 affyExonTissues human.exon.headless.bed15
cd $KENT/src/hg/makeDb/trackDb
make update DBS="hg17 hg18"
cd $KENT/src
make -j8 cgi >& ~/make.j8.cgi.errout
#
# AffyExonTissue Step 4: load the appropriate fields into hgFixed
#
grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^names" \
| sed 's/names //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > n.txt
grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^expIds" \
| sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > e.txt
paste e.txt n.txt > a.txt
awk 'BEGIN{OFS="\t";}
{print $1, $2, $2, "n/a", "n/a", "n/a", "33", "n/a,n/a,"$2",";}' a.txt \
> exps.tab
ln -s ../../../lib/expRecord.sql
hgLoadSqlTab hgFixed affyMouseExonTissuesAllExps expRecord.sql exps.tab
rm a.txt n.txt e.txt exps.tab
############
########################################################################
## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
## 1. Log into Affymetrix netaffx site.
## 2. Use Firefox add-on "Export Cookies" to save a file called cookies.txt
ssh hgwdev
grep affymetrix.com cookies.txt > affycookies.txt
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/HuEx-1_0-st-v2.na27.hg18.probeset.csv.zip
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/MoEx-1_0-st-v1.na27.mm9.probeset.csv.zip
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/RaEx-1_0-st-v1.na27.rn4.probeset.csv.zip
rm affycookies.txt
for z in *.zip; do unzip $z; done
rm *.zip
ln -s HuEx-1_0-st-v2.na27.hg18.probeset.csv hg18.csv
ln -s RaEx-1_0-st-v1.na27.rn4.probeset.csv rn4.csv
ln -s MoEx-1_0-st-v1.na27.mm9.probeset.csv mm9.csv
for csv in {hg18,mm9,rn4}.csv; do
bed=${csv%.csv}.bed
sed '1,20d' $csv | tr ',' '\t' | sed 's/\"//g' | cut -f1-5,16 \
| grep -v "\-\-\-" \
| awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \
| bedSort stdin $bed
done
for db in hg18 mm9 rn4; do hgLoadBed $db affyAllExonProbes $db.bed; done
rm hg18.csv mm9.csv rn4.csv
gzip *.bed *.csv
mkdir -p /hive/data/genomes/{hg18,mm9,rn4}/bed/affyAllExonProbes
mv HuEx-1_0-st-v2.na27.* hg18.bed.gz /hive/data/genomes/hg18/bed/affyAllExonProbes/
mv MoEx-1_0-st-v1.na27.* mm9.bed.gz /hive/data/genomes/mm9/bed/affyAllExonProbes/
mv * /hive/data/genomes/rn4/bed/affyAllExonProbes/
## forgot mm8 (see mm8.txt for that one)
################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg18
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg18
############################################################################
# KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 9/16/08 angie)
ssh hgwdev
mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant
cd /cluster/data/hg18/bed/kiddEichlerDiscordant
foreach i (ABC7 ABC8 ABC9 ABC10 ABC11 ABC12 ABC13 ABC14 G248)
wget --user=uuuu --password=ppppppp \
http://eichlerlab.gs.washington.edu/kiddj/hg18_fosmidmap/$i.bestdiscordant.sorted.gz
end
# Load the tracks (translate bacEndPairs-inspired format to bed12):
foreach f (*.gz)
set track = `echo $f:r:r:r \
| perl -wpe 's/^([AG])(\w+)$/kiddEichlerDisc$1\L$2/ || die;'`
if ($status != 0) break
echo $track
zcat $f \
| perl -wpe 'if (/^chrom\s+chromStart/) {s/^.*\n$//; next;} \
my ($c, $s, $e, $n, $sc, $st, $bSt, $bSz, undef, $t) = split; \
@bSts = split(",", $bSt); @bSzs = split(",", $bSz); \
if ($t =~ /^transchrm_/) { \
@bSts = (0); @bSzs = ($e - $s); \
} elsif ($t =~ /^OEA_/) { \
die "\nERROR: bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \
$bSzs[0]--; \
$bE = $bSts[0] + $bSzs[0]; \
die "bE $bE != e $e\n" if ($bE != $e); \
$bSts[0] -= $s; \
} elsif ($#bSts == 1) { \
if ($bSts[0] > $bSts[1]) { \
# warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
$tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \
$tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \
} \
if ($bSts[0] != $s) { \
die "\nERROR: n=$n,$t: bSts[0]=$bSts[0] but s=$s\n\t"; \
} \
$bSzs[0]--; $bSzs[1]--; \
$bE0 = $bSts[0] + $bSzs[0]; \
$bE1 = $bSts[1] + $bSzs[1]; \
$bE = $bE0 > $bE1 ? $bE0 : $bE1; \
if ($bE != $e) { \
warn "n=$n,$t: bE0=$bE0, bE1=$bE1, bE=$bE, e=$e\n"; \
if ($bE1 > $e) { \
warn "n=$n,$t: tweaking bSzs[1] (clip to chromEnd)\n"; \
$bSzs[1] = $e - $bSts[1]; \
} \
} \
$bSts[0] -= $s; $bSts[1] -= $s; \
} else { die "t is $t but \$#bSts is $#bSts"; } \
$bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \
$rgb = ($t =~ /^deletion/) ? "224,0,0" : \
($t =~ /^insertion/) ? "0,0,224" : \
($t =~ /^inversion/) ? "0,224,0" : \
($t =~ /^OEA/) ? "240,160,64" : "0,0,0"; \
$_ = join("\t", $c, $s, $e, "$n,$t", int($sc+0.5), $st, $s, $e, $rgb, \
scalar(@bSzs), $bSz, $bSt) . "\n";' \
| hgLoadBed -tab hg18 $track stdin
checkTableCoords hg18 $track
end
# Tons of overlapping block and blockEnd[n-1]!=end warnings from
# checkTableCoords -- but these are discordant mappings, so we
# expect those. Make sure there aren't any other types of errors:
foreach f (*.gz)
set track = `echo $f:r:r:r \
| perl -wpe 's/^([AG])(\w+)$/kiddEichlerDisc$1\L$2/ || die;'`
checkTableCoords hg18 $track |& egrep -v 'overlapping|!= end'
end
# No output, good.
# Get clone ID -> NCBI acc mapping (same as for hg17; redownloaded to
# make sure).
mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds
cd /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds
# Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions;
# get trace archive trace names for end reads:
foreach n (7 9 10 11 12 13 14)
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz
end
# ABC8 has _a and _b files:
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz
# That file is not available for G248.
gunzip *.gz
# Combine the relevant data from the .conversion files; keep only those
# IDs that are used in the tracks.
zcat ../[AG]*.gz \
| cut -f 4 \
| egrep -v '^(#chrom|track|name)' \
| sed -e 's/,.*//' \
| sort -u > discIds.txt
grep -h -v ^163722_163722- *.conversion \
| perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \
s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \
warn "Parse line $.:\n$_";' \
| sort > allEnds.tab
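    # To see what the name-normalizing substitution above does, here is a
    # made-up (hypothetical) end-read name pushed through the same regexp;
    # the clone name and trace name are invented purely for illustration:
    #   printf 'ABC7_1_2_0001234_A12.F.12345\tTRACE_NAME_1\n' \
    #     | perl -wpe 's/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/;'
    #   # yields:  ABC7_1234_A12 <tab> F <tab> TRACE_NAME_1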
grep -wFf discIds.txt allEnds.tab > discEnds.txt
wc -l discIds.txt allEnds.tab discEnds.txt
# 352330 discIds.txt
# 17490847 allEnds.tab
# 781513 discEnds.txt
# discEnds.txt has 2 lines (forward & reverse) for most of its ids...
# ideally we would see 2*(352330) lines in discEnds.txt.
# Get a list of which discordant clone IDs don't have ends in *.conv*:
cut -f 1 allEnds.tab | uniq > all.tmp
comm -23 discIds.txt all.tmp > discNotInConv.txt
wc -l discNotInConv.txt
#41853 discNotInConv.txt
cat > combine.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
my ($cloneFile, $endsFile) = @ARGV;
open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n";
my %idInfo;
while(<CLONES>) {
(s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) || \
m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_";
my ($id, $acc) = ($1, $2);
$idInfo{$id}->[0] = $acc;
}
close(CLONES);
open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n";
while (<ENDS>) {
chomp; my ($id, $dir, $traceName) = split("\t");
if ($dir =~ /^F/) {
$idInfo{$id}->[1] = $traceName;
} elsif ($dir =~ /^R/) {
$idInfo{$id}->[2] = $traceName;
} else { die "What is this \$dir: $dir ?\n"; }
}
close(ENDS);
foreach my $id (sort keys %idInfo) {
my $infoRef = $idInfo{$id};
$infoRef->[0] = '' if (! defined $infoRef->[0]);
$infoRef->[1] = 0 if (! defined $infoRef->[1]);
$infoRef->[2] = 0 if (! defined $infoRef->[2]);
print join("\t", $id, @{$infoRef}) . "\n";
}
'_EOF_'
# << emacs
chmod a+x combine.pl
combine.pl clones_used_3nov.txt.accessions discEnds.txt \
| sort > kiddEichlerToNcbi.txt
# Load table:
hgLoadSqlTab hg18 kiddEichlerToNcbi \
$HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt
# Add to makeDb/schema/all.joiner, then check:
runJoiner.csh hg18 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema
############################################################################
# hgPal downloads 28way refGene, knownGene, knownCanonical
ssh hgwdev
screen
bash
rm -rf /cluster/data/hg18/bed/multiz28way/pal
mkdir /cluster/data/hg18/bed/multiz28way/pal
cd /cluster/data/hg18/bed/multiz28way/pal
cat > order.lst <<EOF
hg18
panTro2
rheMac2
otoGar1
tupBel1
mm8
rn4
cavPor2
oryCun1
sorAra1
eriEur1
canFam2
felCat3
equCab1
bosTau3
dasNov1
loxAfr1
echTel1
monDom4
ornAna1
anoCar1
galGal3
xenTro2
danRer4
tetNig1
fr2
gasAcu1
oryLat1
EOF
mz=multiz28way
gp=refGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.jobs
time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
sleep 1
tail -f $gp.jobs.log
# real 232m24.611s
# user 13m59.669s
# sys 5m5.601s
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
mz=multiz28way
gp=refGene
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz28way
gp=knownGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 248m39.293s
# user 23m30.788s
# sys 8m2.714s
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
# now do the canonical set
cd /cluster/data/hg18/bed/multiz28way/pal
mz=multiz28way
gp=knownCanonical
db=hg18
for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes`
do
echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
done
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 216m41.700s
# user 10m22.016s
# sys 4m6.917s
rm *.known.bed
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
hg18.upstreamGeneTbl = refGene
hg18.upstreamMaf = multiz28way /hive/data/genomes/hg18/bed/multiz28way/species.lst
#########################################################################
# BarskiChIPseq tracks Begun: 2008-09-19 Finished: 2008-09-22 Tim
# Barski et al. 2007 paper: High-Resolution Mapping of Histone Modifications in the Human Genome
# Solexa high-throughput sequencing: ChIPseq data
# http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/hgtcell.html
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/Barski2007/lab
cd /hive/data/genomes/hg18/bed/Barski2007/lab
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.vstep.gz
gunzip *.gz
mv H3K4me1.vstep H3K4me1.wig
mv H3K4me2.vstep H3K4me2.wig
mv H3K4me3.vstep H3K4me3.wig
mv H3K9me1.vstep H3K9me1.wig
mv H3K9me2.vstep H3K9me2.wig
mv H3K9me3.vstep H3K9me3.wig
mv H3K27me1.vstep H3K27me1.wig
mv H3K27me2.vstep H3K27me2.wig
mv H3K27me3.vstep H3K27me3.wig
mv H3K36me1.vstep H3K36me1.wig
mv H3K36me3.vstep H3K36me3.wig
mv H3K79me1.vstep H3K79me1.wig
mv H3K79me2.vstep H3K79me2.wig
mv H3K79me3.vstep H3K79me3.wig
mv H3R2me1.vstep H3R2me1.wig
mv H3R2me2.vstep H3R2me2.wig
mv H4K20me1.vstep H4K20me1.wig
mv H4K20me3.vstep H4K20me3.wig
mv H4R3me2.vstep H4R3me2.wig
mv H2BK5me1.vstep H2BK5me1.wig
mv H2AZ.vstep H2AZ.wig
mv PolII.vstep PolII.wig
mv CTCF.vstep CTCF.wig
# build the per-mark wigVar files (the raw .vstep files were renamed to .wig above)
head -1 H3K4me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me1/g" > barskiChIPseqH3K4me1.wigVar
head -1 H3K4me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me2/g" > barskiChIPseqH3K4me2.wigVar
head -1 H3K4me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me3/g" > barskiChIPseqH3K4me3.wigVar
head -1 H3K9me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me1/g" > barskiChIPseqH3K9me1.wigVar
head -1 H3K9me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me2/g" > barskiChIPseqH3K9me2.wigVar
head -1 H3K9me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me3/g" > barskiChIPseqH3K9me3.wigVar
head -1 H3K27me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me1/g" > barskiChIPseqH3K27me1.wigVar
head -1 H3K27me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me2/g" > barskiChIPseqH3K27me2.wigVar
head -1 H3K27me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me3/g" > barskiChIPseqH3K27me3.wigVar
head -1 H3K36me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me1/g" > barskiChIPseqH3K36me1.wigVar
head -1 H3K36me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me3/g" > barskiChIPseqH3K36me3.wigVar
head -1 H3K79me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me1/g" > barskiChIPseqH3K79me1.wigVar
head -1 H3K79me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me2/g" > barskiChIPseqH3K79me2.wigVar
head -1 H3K79me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me3/g" > barskiChIPseqH3K79me3.wigVar
head -1 H3R2me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3R2me1/g" > barskiChIPseqH3R2me1.wigVar
head -1 H3R2me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3R2me2/g" > barskiChIPseqH3R2me2.wigVar
head -1 H4K20me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me1/g" > barskiChIPseqH4K20me1.wigVar
head -1 H4K20me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me3/g" > barskiChIPseqH4K20me3.wigVar
head -1 H4R3me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH4R3me2/g" > barskiChIPseqH4R3me2.wigVar
head -1 H2BK5me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH2BK5me1/g" > barskiChIPseqH2BK5me1.wigVar
head -1 H2AZ.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH2AZ/g" > barskiChIPseqH2AZ.wigVar
head -1 PolII.wig | sed -e "s/\"CTCF/\"BarskiChIPseqPolII/g" > barskiChIPseqPolII.wigVar
head -1 CTCF.wig | sed -e "s/\"CTCF/\"BarskiChIPseqCTCF/g" > barskiChIPseqCTCF.wigVar
tail --lines=+2 H3K4me1.wig >> barskiChIPseqH3K4me1.wigVar
tail --lines=+2 H3K4me2.wig >> barskiChIPseqH3K4me2.wigVar
tail --lines=+2 H3K4me3.wig >> barskiChIPseqH3K4me3.wigVar
tail --lines=+2 H3K9me1.wig >> barskiChIPseqH3K9me1.wigVar
tail --lines=+2 H3K9me2.wig >> barskiChIPseqH3K9me2.wigVar
tail --lines=+2 H3K9me3.wig >> barskiChIPseqH3K9me3.wigVar
tail --lines=+2 H3K27me1.wig >> barskiChIPseqH3K27me1.wigVar
tail --lines=+2 H3K27me2.wig >> barskiChIPseqH3K27me2.wigVar
tail --lines=+2 H3K27me3.wig >> barskiChIPseqH3K27me3.wigVar
tail --lines=+2 H3K36me1.wig >> barskiChIPseqH3K36me1.wigVar
tail --lines=+2 H3K36me3.wig >> barskiChIPseqH3K36me3.wigVar
tail --lines=+2 H3K79me1.wig >> barskiChIPseqH3K79me1.wigVar
tail --lines=+2 H3K79me2.wig >> barskiChIPseqH3K79me2.wigVar
tail --lines=+2 H3K79me3.wig >> barskiChIPseqH3K79me3.wigVar
tail --lines=+2 H3R2me1.wig >> barskiChIPseqH3R2me1.wigVar
tail --lines=+2 H3R2me2.wig >> barskiChIPseqH3R2me2.wigVar
tail --lines=+2 H4K20me1.wig >> barskiChIPseqH4K20me1.wigVar
tail --lines=+2 H4K20me3.wig >> barskiChIPseqH4K20me3.wigVar
tail --lines=+2 H4R3me2.wig >> barskiChIPseqH4R3me2.wigVar
tail --lines=+2 H2BK5me1.wig >> barskiChIPseqH2BK5me1.wigVar
tail --lines=+2 H2AZ.wig >> barskiChIPseqH2AZ.wigVar
tail --lines=+2 PolII.wig >> barskiChIPseqPolII.wigVar
tail --lines=+2 CTCF.wig >> barskiChIPseqCTCF.wigVar
mkdir ../signal
mv *.wigVar ../signal
gzip *
mkdir ../tags
mv H3K4me1.bed ../tags/barskiChIPseqH3K4me1.bed
mv H3K4me2.bed ../tags/barskiChIPseqH3K4me2.bed
mv H3K4me3.bed ../tags/barskiChIPseqH3K4me3.bed
mv H3K9me1.bed ../tags/barskiChIPseqH3K9me1.bed
mv H3K9me2.bed ../tags/barskiChIPseqH3K9me2.bed
mv H3K9me3.bed ../tags/barskiChIPseqH3K9me3.bed
mv H3K27me1.bed ../tags/barskiChIPseqH3K27me1.bed
mv H3K27me2.bed ../tags/barskiChIPseqH3K27me2.bed
mv H3K27me3.bed ../tags/barskiChIPseqH3K27me3.bed
mv H3K36me1.bed ../tags/barskiChIPseqH3K36me1.bed
mv H3K36me3.bed ../tags/barskiChIPseqH3K36me3.bed
mv H3K79me1.bed ../tags/barskiChIPseqH3K79me1.bed
mv H3K79me2.bed ../tags/barskiChIPseqH3K79me2.bed
mv H3K79me3.bed ../tags/barskiChIPseqH3K79me3.bed
mv H3R2me1.bed ../tags/barskiChIPseqH3R2me1.bed
mv H3R2me2.bed ../tags/barskiChIPseqH3R2me2.bed
mv H4K20me1.bed ../tags/barskiChIPseqH4K20me1.bed
mv H4K20me3.bed ../tags/barskiChIPseqH4K20me3.bed
mv H4R3me2.bed ../tags/barskiChIPseqH4R3me2.bed
mv H2BK5me1.bed ../tags/barskiChIPseqH2BK5me1.bed
mv H2AZ.bed ../tags/barskiChIPseqH2AZ.bed
mv PolII.bed ../tags/barskiChIPseqPolII.bed
mv CTCF.bed ../tags/barskiChIPseqCTCF.bed
cd ..
cd ../signal
cat > makeWig.sh << \_EOF_
#!/bin/bash
genDir=/gbdb/hg18/barskiChIPseq
mkdir -p ${genDir}
for file in *.wigVar
do
    base=${file%.wigVar}
    echo "Loading ${file} to ${base}..."
    time nice -n +19 wigEncode ${base}.wigVar ${base}.wig ${base}.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=${genDir} hg18 ${base} ${base}.wig
    ln -sf `pwd`/${base}.wib ${genDir}/${base}.wib
done
_EOF_
chmod 755 makeWig.sh
./makeWig.sh &
# ................ Got to here
# .............. I have not loaded the tags !!!
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me1 BarskiChIPseqH3K4me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me2 BarskiChIPseqH3K4me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me3 BarskiChIPseqH3K4me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me1 BarskiChIPseqH3K9me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me2 BarskiChIPseqH3K9me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me3 BarskiChIPseqH3K9me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me1 BarskiChIPseqH3K27me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me2 BarskiChIPseqH3K27me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me3 BarskiChIPseqH3K27me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me1 BarskiChIPseqH3K36me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me3 BarskiChIPseqH3K36me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me1 BarskiChIPseqH3K79me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me2 BarskiChIPseqH3K79me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me3 BarskiChIPseqH3K79me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me1 BarskiChIPseqH3R2me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me2 BarskiChIPseqH3R2me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me1 BarskiChIPseqH4K20me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me3 BarskiChIPseqH4K20me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4R3me2 BarskiChIPseqH4R3me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2BK5me1 BarskiChIPseqH2BK5me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2AZ BarskiChIPseqH2AZ.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqPolII BarskiChIPseqPolII.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqCTCF BarskiChIPseqCTCF.bed
# .............. I have not loaded the tags !!!
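# When the tags do get loaded, one loop covers all 23 marks.  This is
# only a sketch (not yet run): it assumes the BED files still sit in the
# ../tags directory under the lowercase barskiChIPseq*.bed names created
# by the mv commands above, and derives each table name from its file name:
#   cd ../tags
#   for B in barskiChIPseq*.bed
#   do
#       T=${B%.bed}
#       time nice -n +19 hgLoadBed hg18 ${T} ${B}
#   done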
#########################################################################
## 44-Way Multiz (DONE - 2008-11-10,15 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/multiz44way
cd /hive/data/genomes/hg18/bed/multiz44way
# starting with the 44way tree that Brian made earlier:
cp -p ../multiz44way.2008-08-06/44way.db.nh ./44way.nh
sed -e "s/oryLat1/hg18/; s/danRer4/danRer5/; s/oryLat1/oryLat2/" \
/cluster/data/oryLat1/bed/multiz44way/44way.nh > 44way.nh
# this file looks like:
cat << '_EOF_' > 44way.nh
(((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
(stickleback_gasAcu1:0.2,medaka_hg18:0.2):0.2):0.292961,
zebrafish_danRer5:0.782561);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg18_44way.gif
/cluster/bin/phast/all_dists 44way.nh > 44way.distances.txt
# Use this output to create the table below, with this perl script:
cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, "grep -y hg18 44way.distances.txt | sort -k3,3n|") or
die "can not read 44way.distances.txt";
my $count = 0;
while (my $line = <FH>) {
chomp $line;
my ($hg18, $D, $dist) = split('\s+', $line);
my $chain = "chain" . ucfirst($D);
my $B="/hive/data/genomes/hg18/bed/blastz.$D/fb.hg18." .
$chain . "Link.txt";
my $chainLinkMeasure =
`awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
chomp $chainLinkMeasure;
$chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
$chainLinkMeasure =~ s/\%//;
my $orgName=
`hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
chomp $orgName;
if (length($orgName) < 1) {
$orgName="N/A";
}
++$count;
printf "# %02d %.4f - %s %s\t(%% %.3f)\n", $count, $dist, $orgName, $D,
$chainLinkMeasure
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./sizeStats.pl
./sizeStats.pl
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# columns below: index, phylogenetic distance to hg18, organism, db,
# and (%) = featureBits chain<Db>Link coverage of hg18
# 01 0.0092 - Chimp panTro2 (% 94.888)
# 02 0.0267 - Gorilla gorGor1 (% 61.731)
# 03 0.0467 - Orangutan ponAbe2 (% 92.892)
# 04 0.0667 - Marmoset calJac1 (% 78.351)
# 05 0.0783 - Rhesus rheMac2 (% 85.552)
# 06 0.1767 - Tarsier tarSyr1 (% 47.999)
# 07 0.2448 - Mouse lemur micMur1 (% 46.445)
# 08 0.3061 - Bushbaby otoGar1 (% 44.638)
# 09 0.3367 - Rabbit oryCun1 (% 34.015)
# 10 0.3507 - TreeShrew tupBel1 (% 37.348)
# 11 0.3567 - Squirrel speTri1 (% 35.828)
# 12 0.4067 - Guinea Pig cavPor3 (% 43.971)
# 13 0.4067 - Alpaca vicPac1 (% 39.531)
# 14 0.4098 - Megabat pteVam1 (% 45.502)
# 15 0.4099 - Microbat myoLuc1 (% 33.044)
# 16 0.4154 - Cat felCat3 (% 35.888)
# 17 0.4293 - Elephant loxAfr2 (% 35.204)
# 18 0.4314 - Dog canFam2 (% 52.915)
# 19 0.4317 - Mouse mm9 (% 35.201)
# 20 0.4362 - Rat rn4 (% 32.893)
# 21 0.4367 - Pika ochPri2 (% 27.974)
# 22 0.4639 - Horse equCab2 (% 57.162)
# 23 0.4693 - Rock hyrax proCap1 (% 30.935)
# 24 0.4767 - Dolphin turTru1 (% 48.537)
# 25 0.5067 - Kangaroo rat dipOrd1 (% 27.282)
# 26 0.5187 - Armadillo dasNov2 (% 33.663)
# 27 0.5191 - Cow bosTau4 (% 46.689)
# 28 0.5298 - hedgehog eriEur1 (% 19.622)
# 29 0.5399 - Sloth choHof1 (% 34.463)
# 30 0.5605 - Shrew sorAra1 (% 20.056)
# 31 0.5815 - Tenrec echTel1 (% 23.645)
# 32 0.7309 - Opossum monDom4 (% 12.385)
# 33 0.9870 - Platypus ornAna1 (% 7.870)
# 34 1.0313 - Zebra finch taeGut1 (% 3.503)
# 35 1.0436 - Lamprey petMar1 (% 1.251)
# 36 1.1013 - Chicken galGal3 (% 3.589)
# 37 1.2253 - Lizard anoCar1 (% 4.774)
# 38 1.5473 - X. tropicalis xenTro2 (% 2.623)
# 39 1.8337 - Stickleback gasAcu1 (% 1.923)
# 40 1.8482 - Zebrafish danRer5 (% 2.565)
# 41 1.8721 - Tetraodon tetNig1 (% 2.001)
# 42 1.9077 - Fugu fr2 (% 1.766)
# 43 2.0215 - Medaka oryLat2 (% 1.829)
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
44way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.list
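    # To see what those edits produce, here is a tiny worked example on a
    # made-up three-species tree (toy.nh is hypothetical, not the real
    # 44way.nh); the same transforms strip the common-name prefixes and
    # branch lengths, then flatten the result:
    #   echo '((human_hg18:0.1,chimp_panTro2:0.1):0.2,mouse_mm9:0.3);' > toy.nh
    #   sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' toy.nh
    #   # ((hg18,panTro2),mm9)     <- contents of tmp.nh / tree-commas.nh
    #   # ((hg18 panTro2) mm9)     <- contents of tree.nh
    #   # hg18 panTro2 mm9         <- contents of species.list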
cd /hive/data/genomes/hg18/bed/multiz44way
# bash shell syntax here ...
export H=/hive/data/genomes/hg18/bed
mkdir mafLinks
for G in `sed -e "s/hg18 //" species.list`
do
mkdir mafLinks/$G
if [ -s ${H}/blastz.${G}/mafRBestNet/chr1.maf.gz ]; then
echo "$G - recipBest"
ln -s ${H}/blastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/blastz.${G}/mafSynNet/chr1.maf.gz ]; then
echo "$G - synNet"
ln -s ${H}/blastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/blastz.${G}/mafNet/chr1.maf.gz ]; then
echo "$G - mafNet"
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
else
echo "missing directory blastz.${G}/*Net"
fi
fi
fi
done
# need to split these things up into smaller pieces for
# efficient kluster run. Using the new hive architecture.
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way
mkdir mafSplit
# mafSplitPos splits on repeat areas that will not have any chains
mafSplitPos -minGap=50000 hg18 10 mafSplit.bed
for G in `sed -e "s/hg18 //" species.list`
do
echo -n "working ${G} ..."
rm -fr mafSplit/${G}
mkdir mafSplit/${G}
cd mafSplit/${G}
mafSplit ../../mafSplit.bed hg18_ ../../mafLinks/${G}/chr*.maf.gz \
-verbose=2
cd /hive/data/genomes/hg18/bed/multiz44way
echo " done"
done
# create a run-time list of files to operate on, not all file names
# exist for all assemblies
cd mafSplit
for D in *
do
cd "${D}"
find . -type f
cd ..
done | sort -u | sed -e "s#./##" > ../44-way.split.list
wc -l ../44-way.split.list
# 267 ../44-way.split.list
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg18/bed/multiz44way/
mkdir splitRun
cd splitRun
mkdir maf run
cd run
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = hg18
set c = $1
set result = $2
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg18/bed/multiz44way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp
foreach s (`sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
else if (-e $in) then
ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
/bin/rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg18/bed/multiz44way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../44-way.split.list single template jobList
para create jobList
# initial run experience suggests some of the big jobs reach 8 Gb
# of memory usage, so tell parasol to limit the number of jobs per
# node to avoid thrashing
para -ram=6g push
# 8 jobs were finished manually on hgwdev, kolossus and memk nodes
# XXXX - running 2008-11-12 - Wed Nov 12 15:29:39 PST 2008
# Completed: 792 of 792 jobs
# CPU time in finished jobs: 5423s 90.38m 1.51h 0.06d 0.000 y
# IO & Wait Time: 138287s 2304.79m 38.41h 1.60d 0.004 y
# Average job time: 181s 3.02m 0.05h 0.00d
# Longest finished job: 404s 6.73m 0.11h 0.00d
# Submission to last job: 436s 7.27m 0.12h 0.01d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/splitRun
mkdir ../maf
# the sed edits take out partitioning name information from the comments
# so the multiple parts will condense to smaller number of lines
# this takes almost 2 hours, resulting in a bit over 150 Gb, with
# almost all chrom files over 1 Gb and chr2 approaching 10 Gb.
# HOWEVER, it is not actually necessary to preserve these comments;
# they are lost during the mafAddIRows step anyway
ls maf | sed -e "s/hg18_//; s/\..*//" | sort -u | while read C
do
echo "========== $C =============="
rm -f ../maf/${C}.maf.gz
head -q -n 1 maf/hg18_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/hg18_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
| sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/hg18_${C}.*.maf | sort -t. -k2,2n` \
>> ../maf/${C}.maf
tail -q -n 1 maf/hg18_${C}.*.maf | sort -u >> ../maf/${C}.maf
done
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg18/multiz44way/maf
cd /hive/data/genomes/hg18/bed/multiz44way/maf
ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf
# this generates an immense multiz44way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way
# real 1m10.380s
# Loaded 1366931 mafs in 1 files from /gbdb/hg18/multiz44way/maf
# load summary table
time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \
| hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz44waySummary stdin
# real 2m39.822
# Created 353577 summary blocks from 2852890 components and 1197504 mafs
# from stdin
# Gap Annotation
# prepare bed files with gap info
mkdir /hive/data/genomes/hg18/bed/multiz44way/anno
cd /hive/data/genomes/hg18/bed/multiz44way/anno
mkdir maf run
# most of these will already exist from previous multiple alignments
# remove the echo from in front of the twoBitInfo command to get them
# to run if this loop appears to be correct
for DB in `cat ../species.list`
do
CDIR="/hive/data/genomes/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `sed -e "s/hg18 //" ../../species.list`
do
echo "${DB} "
ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
# the annotation step requires large memory, run on memk nodes
ssh memk
cd /hive/data/genomes/hg18/bed/multiz44way/anno/run
ls ../../maf | sed -e "s/.maf//" > chr.list
cat << '_EOF_' > template
#LOOP
./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > anno.csh
#!/bin/csh -fe
set inMaf = ../../maf/$1.maf
set outMaf = ../maf/$1.maf
rm -f $outMaf
mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg18/hg18.2bit $outMaf
'_EOF_'
# << happy emacs
chmod +x anno.csh
gensub2 chr.list single template jobList
para create jobList
# specify lots of ram to get one job per node
para -ram=30g push
ssh hgwdev
rm -fr /gbdb/hg18/multiz44way/maf
mkdir /gbdb/hg18/multiz44way/maf
cd /hive/data/genomes/hg18/bed/multiz44way/anno/maf
ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf/
# by loading this into the table multiz44way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way
# with final set of quality annotated files:
# Loaded 33320838 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 91m46.889s
# running on Irow annotated mafs Fri Nov 28 00:28:09 PST 2008
# Loaded 33320675 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 236m15.279s
# running on bare bones mafs Thu Nov 27 19:29:44 PST 2008
# Loaded 33273351 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 198m55.761s - while swarm busy with rebalancing
# from before the fixed multiz:
# Loaded 35154852 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 71m5.594s
time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \
| hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz44waySummary stdin
# with the quality annotated mafs, and mem interference on hgwdev:
# Created 8514381 summary blocks from 600504256 components \
# and 33320838 mafs from stdin
# real 169m56.936s
# with the Irow annotations after the multiz fix:
# Created 8514380 summary blocks from 600499937
# components and 33298894 mafs from stdin
# real 184m42.893s
# user 70m44.431s
# sys 8m7.970s
# Created 8514078 summary blocks from 604683213 components
# and 35125649 mafs from stdin
# real 130m55.115s
# user 71m37.409s
# sys 8m5.110s
# by loading this into the table multiz44waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz44way*.tab files in this /data/tmp directory
# -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz44way.tab
# -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz44waySummary.tab
wc -l multiz44way*.tab
# 33964377 multiz44way.tab
# 8514078 multiz44waySummary.tab
# 42478455 total
rm multiz44way*.tab
# create some downloads
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/download/maf
cd /hive/data/genomes/hg18/bed/multiz44way/download/maf
time cp -p ../../anno/maf/chr*.maf .
# real 72m46.514s
# user 0m1.293s
# sys 5m15.981s
time gzip --rsyncable *.maf
# real 185m37.884s
# user 179m51.161s
# sys 3m48.016s
time md5sum *.gz > md5sum.txt
# real 3m59.009s
# user 1m19.338s
# sys 0m18.976s
#############################################################################
## Annotate 44-way multiple alignment with gene annotations
## (DONE - 2008-12-08,23 - Hiram)
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/multiz44way/frames
cd /hive/data/genomes/hg18/bed/multiz44way/frames
#
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql hg18 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# rearrange that output to create four sections:
# 1. knownGenes for hg18, mm9
# 2. ensGene for almost everything else
# 3. Mrnas for taeGut1, anoCar1, petMar1, calJac1
# 4. nothing for loxAfr2, dasNov2, choHof1
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in bosTau4 canFam2 cavPor3 danRer5 dipOrd1 echTel1 equCab2 \
eriEur1 felCat3 fr2 galGal3 gasAcu1 gorGor1 micMur1 monDom4 myoLuc1 \
ochPri2 ornAna1 oryCun1 oryLat2 otoGar1 panTro2 ponAbe2 proCap1 \
pteVam1 rheMac2 rn4 sorAra1 speTri1 tarSyr1 tetNig1 tupBel1 \
turTru1 vicPac1 xenTro2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# use Mrnas for taeGut1 anoCar1 petMar1 calJac1
for DB in taeGut1 anoCar1 petMar1 calJac1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
# leaving out loxAfr2, dasNov2, choHof1 since no gene preds there
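# mkCmd.sh below expects two one-database-per-line files, ensGene.list and
# mrna.list, which are not written out explicitly above.  A minimal sketch
# of one way to produce them from the same groupings used in the loops
# above (the knownGene dbs hg18/mm9 and the no-gene dbs are excluded):
echo "taeGut1 anoCar1 petMar1 calJac1" | tr ' ' '\n' > mrna.list
ls genes | sed -e "s/.gp.gz//" \
    | egrep -v "hg18|mm9|taeGut1|anoCar1|petMar1|calJac1" > ensGene.list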
# Create this command with this script:
cat << '_EOF_' > mkCmd.sh
#!/bin/sh
echo "time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \\"
if [ ! -s genes/mm9.gp.gz ]; then
echo "missing genes/mm9.gp.gz"
exit 255
fi
echo "mm9 genes/mm9.gp.gz \\"
for D in `sort ensGene.list`
do
if [ ! -s genes/${D}.gp.gz ]; then
echo "missing genes/${D}.gp.gz"
exit 255
fi
echo -n "${D} genes/${D}.gp.gz "
done
echo "\\"
for D in `sort mrna.list`
do
if [ ! -s genes/${D}.gp.gz ]; then
echo "missing genes/${D}.gp.gz"
exit 255
fi
echo -n "${D} genes/${D}.gp.gz "
done
echo "\\"
echo " | gzip > multiz44way.mafFrames.gz) > frames.log 2>&1"
'_EOF_'
# << happy emacs
chmod +x ./mkCmd.sh
time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \
mm9 genes/mm9.gp.gz \
bosTau4 genes/bosTau4.gp.gz canFam2 genes/canFam2.gp.gz cavPor3 genes/cavPor3.gp.gz danRer5 genes/danRer5.gp.gz dipOrd1 genes/dipOrd1.gp.gz echTel1 genes/echTel1.gp.gz equCab2 genes/equCab2.gp.gz eriEur1 genes/eriEur1.gp.gz felCat3 genes/felCat3.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz gasAcu1 genes/gasAcu1.gp.gz micMur1 genes/micMur1.gp.gz monDom4 genes/monDom4.gp.gz myoLuc1 genes/myoLuc1.gp.gz ochPri2 genes/ochPri2.gp.gz ornAna1 genes/ornAna1.gp.gz oryCun1 genes/oryCun1.gp.gz oryLat2 genes/oryLat2.gp.gz otoGar1 genes/otoGar1.gp.gz panTro2 genes/panTro2.gp.gz ponAbe2 genes/ponAbe2.gp.gz proCap1 genes/proCap1.gp.gz pteVam1 genes/pteVam1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz sorAra1 genes/sorAra1.gp.gz speTri1 genes/speTri1.gp.gz tarSyr1 genes/tarSyr1.gp.gz tetNig1 genes/tetNig1.gp.gz tupBel1 genes/tupBel1.gp.gz turTru1 genes/turTru1.gp.gz vicPac1 genes/vicPac1.gp.gz xenTro2 genes/xenTro2.gp.gz \
anoCar1 genes/anoCar1.gp.gz calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz taeGut1 genes/taeGut1.gp.gz \
| gzip > multiz44way.mafFrames.gz) > frames.log 2>&1
# that doesn't work on any 32 Gb computer, requires much more memory
# turn it into a kluster job
ssh swarm
cd /hive/data/genomes/hg18/bed/multiz44way/frames
cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = $1
set G = $2
cat ../quals/maf/${C}.maf | genePredToMafFrames hg18 stdin stdout \
${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
'_EOF_'
# << happy emacs
chmod +x runOne
ls ../quals/maf | sed -e "s/.maf//" > chr.list
ls genes | sed -e "s/.gp.gz//" | grep -v hg18 > gene.list
cat << '_EOF_' > template
#LOOP
runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
'_EOF_'
# << happy emacs
mkdir parts
gensub2 chr.list gene.list template jobList
para -ram=8g create jobList
para try ... check ... push
# Completed: 1911 of 1911 jobs
# CPU time in finished jobs: 126751s 2112.52m 35.21h 1.47d 0.004 y
# IO & Wait Time: 2573543s 42892.38m 714.87h 29.79d 0.082 y
# Average job time: 1413s 23.55m 0.39h 0.02d
# Longest finished job: 6490s 108.17m 1.80h 0.08d
# Submission to last job: 11310s 188.50m 3.14h 0.13d
# see what it looks like in terms of number of annotations per DB:
find ./parts -type f | while read F
do
zcat ${F}
done | cut -f4 | sort | uniq -c | sort -n
#    165 anoCar1
#   2807 calJac1
#   3306 taeGut1
#   5416 petMar1
# 141256 tarSyr1
# 142346 vicPac1
# 163854 sorAra1
# 164475 galGal3
# 174150 felCat3
# 178531 oryCun1
# 178744 ornAna1
# 179511 turTru1
# 190622 eriEur1
# 191477 tupBel1
# 197338 panTro2
# 198063 speTri1
# 199541 micMur1
# 207391 ponAbe2
# 208629 rheMac2
# 208850 otoGar1
# 212751 myoLuc1
# 212857 dipOrd1
# 213343 proCap1
# 214972 echTel1
# 216367 monDom4
# 220724 ochPri2
# 223159 equCab2
# 227928 bosTau4
# 231351 cavPor3
# 231553 pteVam1
# 233980 mm9
# 234268 rn4
# 249016 canFam2
# 258191 xenTro2
# 315098 danRer5
# 365824 oryLat2
# 387739 fr2
# 423941 gasAcu1
# 549846 tetNig1
# load the resulting file
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/frames
find ./parts -type f | while read F
do
zcat ${F}
done | sort -k1,1 -k2,2n | hgLoadMafFrames hg18 multiz44wayFrames stdin
find ./parts -type f | while read F
do
zcat ${F}
done | sort -k1,1 -k2,2n > multiz44wayFrames.bed
featureBits -countGaps hg18 multiz44wayFrames.bed
# 62315198 bases of 3107677273 (2.005%) in intersection
featureBits -countGaps hg18 multiz28wayFrames
# 48236360 bases of 3107677273 (1.552%) in intersection
# enable the trackDb entries:
# frames multiz44wayFrames
# irows on
# appears to work OK
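# For reference, a minimal sketch of how those two settings sit in the
# track's trackDb.ra stanza; apart from "frames" and "irows", the fields
# shown here are only illustrative of a typical wigMaf conservation entry:
#   track multiz44way
#   shortLabel 44-Way Cons
#   longLabel Vertebrate Multiz Alignment & Conservation (44 Species)
#   type wigMaf 0.0 1.0
#   summary multiz44waySummary
#   frames multiz44wayFrames
#   irows on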
#########################################################################
# Phylogenetic tree from 44-way (2008-12-06 kate)
# Extract 4-fold degenerate sites from the coding regions of
# RefSeq Reviewed genes
ssh pk
cd /hive/data/genomes/hg18/bed/multiz44way
mkdir 4d
cd 4d
hgsql hg18 -Ne \
"select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 > refSeqReviewed.gp
wc -l refSeqReviewed.gp
#12684 refSeqReviewed.gp
genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
wc -l refSeqReviewedNR.gp
#7365 refSeqReviewedNR.gp
mkdir run
cd run
# chopped up mafs version
# run on swarm with -ram=8g
cat > 4d.csh << 'EOF'
set infile = $1
set outfile = $2
set c = `echo $1 | sed 's/^.*hg18_\(chr[^.][^.]*\).*.maf/\1/'`
echo $c
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-11-30
$PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
#rm -f $c.gp $c.maf $c.ss
'EOF'
# whole chrom mafs version, using a newer, memory-efficient version of
# phast from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
cat > 4d.csh << 'EOF'
set c = $1
set infile = $2
set outfile = $3
echo $c
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/msa_view --4d --features --do-cats 3 $c.gp -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
rm -f $c.gp $c.maf $c.ss
'EOF'
ls -1S /hive/data/genomes/hg18/bed/multiz44way/maf/*.maf | \
grep -v random | grep -v chrM | grep -v hap > in.lst
cat << 'EOF' > template
#LOOP
csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa/$(root1).mfa}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
cat << 'EOF' > template
#LOOP
csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa2/$(root1).mfa}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
gensub2 in.lst single template stdout | tac > jobList
rm -fr /cluster/data/hg18/bed/multiz44way/4d/mfa
mkdir /cluster/data/hg18/bed/multiz44way/4d/mfa
para create jobList
para try
para check
para push
# combine mfa files
cd ..
sed -e "s/ /,/g" ../species.list > species.lst
/cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.all.mfa
sed -e 's/,monDom4.*//' species.lst > placentals.lst
/cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.placentals.mfa
# use phyloFit to create tree model (output is phyloFit.mod)
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
# started at 5:50pm
# ended at 7:27 => ~90 min on swarm
mv phyloFit.mod phyloFit.all.mod
grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.44way.nh
$PHASTBIN/tree_doctor \
--prune=monDom4,ornAna1,taeGut1,petMar1,galGal3,anoCar1,xenTro2,gasAcu1,danRer5,tetNig1,fr2,oryLat2 \
tree_4d.44way.nh > tree_4d.44way.placental.nh
# chrX-only for placental subset (requested by 2X project)
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh --out-root 4d.chrX mfa/chrX.mfa
#############################################################################
# phastCons 44-way (DONE - 2008-12-23 - 2009-01-02 - Hiram)
# split 44way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split
mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/ss
cd /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg18/bed/multiz44way/maf/$c.maf
set WINDOWS = /hive/data/genomes/hg18/bed/multiz44way/cons/ss/$c
rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
twoBitToFa -seq=$c /hive/data/genomes/hg18/hg18.2bit hg18.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
-M hg18.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
rm -f hg18.$c.fa
popd > /dev/null
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para -ram=32g create jobList
para try ... check ... etc
# this takes a really long time. memk was down to 2 usable
# machines - got it finished manually on a combination of hgwdevnew CPUs
# and other machines
# Estimate phastCons parameters
# experimented with this as a parasol job on hgwdevnew to try a number
# of SS files. With a command of:
/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
--tree "(((((((((((((((((hg18,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
--out-root=$OUT/starting_tree
# running over the input files ../ss/*/*.ss results to
#.../genomes/hg18/bed/multiz44way/cons/startingTree/result/*/starting-tree.mod
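# A sketch of the kind of per-chunk wrapper such a parasol experiment
# would use (the script layout, result/ directory and the $SS/$OUT
# conventions are assumptions; the phyloFit call is the one shown above):
#   #!/bin/sh
#   SS=$1                             # one ss chunk, e.g. ../ss/chr1/chr1.1-10000000.ss
#   OUT=result/`basename ${SS} .ss`   # one result directory per chunk
#   TREE="..."                        # the full 44-species tree string from above
#   mkdir -p ${OUT}
#   /cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
#       --tree "${TREE}" --out-root=${OUT}/starting_tree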
# add up the C and G:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
done
# counting number of species seen in the maf file:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep TREE $F | sed -e \
"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l
done
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh swarm
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons
cd /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script.  They trigger off of the current working directory
# name, $cwd:t, which is the "grp" in this script.  It is one of:
# all primates euarchontoglires placental
# (e.g. when run from the .../cons/placental directory, the script picks up
# placental/placental.mod and placental/placental.non-inf)
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast/x86_64
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/hg18/bed/multiz44way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
if (-s $cons/$grp/$grp.non-inf) then
    ln -s $cons/$grp/$grp.mod $tmp
    ln -s $cons/$grp/$grp.non-inf $tmp
    ln -s $ssSrc/ss/$c/$f.ss $tmp
else
    ln -s $ssSrc/ss/$c/$f.ss $tmp
    ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# this template will serve for all runs
# root1 == chrom name, file1 == ss file name without .ss suffix
cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list
# run for all species
cd /hive/data/genomes/hg18/bed/multiz44way/cons
mkdir -p all
cd all
# Using Kate's .mod tree
cp -p ../../4d/44way.all.mod ./all.mod
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# XXX - running Tue Jan 13 22:19:21 PST 2009
# Completed: 322 of 322 jobs
# CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y
# IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y
# Average job time: 240s 4.00m 0.07h 0.00d
# Longest finished job: 354s 5.90m 0.10h 0.00d
# Submission to last job: 536s 8.93m 0.15h 0.01d
# create Most Conserved track
cd /hive/data/genomes/hg18/bed/multiz44way/cons
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
time nice -n +19 hgLoadBed hg18 phastConsElements44way mostConserved.bed
# Loaded 4878296 elements of size 5
# real 2m3.414s
# Try for 5% overall cov, and 70% CDS cov
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44way
# refGene:cds 1.144%, mostConserved.bed 4.973%,
# both 0.854%, cover 74.62%, enrich 15.01x
# --rho .31 --expected-length 45 --target-coverage .3
# refGene:cds 1.144%, phastConsElements44way 4.706%,
# both 0.824%, cover 72.07%, enrich 15.31x
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment knownGene:cds phastConsElements44way
# knownGene:cds 1.205%, mostConserved.bed 4.973%,
# both 0.874%, cover 72.55%, enrich 14.59x
# --rho .31 --expected-length 45 --target-coverage .3
# knownGene:cds 1.205%, phastConsElements44way 4.706%,
# both 0.844%, cover 70.05%, enrich 14.88x
featureBits hg18 -enrichment refGene:cds phastConsElements28way
# refGene:cds 1.144%, phastConsElements28way 4.920%,
# both 0.858%, cover 74.96%, enrich 15.24x
featureBits hg18 -enrichment knownGene:cds phastConsElements28way
# knownGene:cds 1.205%, phastConsElements28way 4.920%,
# both 0.878%, cover 72.88%, enrich 14.81x
# Create merged posterior probability file and wiggle track data files
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p downloads
for D in pp/chr*
do
C=${D/pp\/}
out=downloads/${C}.phastCons44way.wigFix.gz
echo "${D} > ${C}.phastCons44way.wigFix.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 30m7.228s
# encode those files into wiggle data
zcat downloads/*.wigFix.gz \
| wigEncode stdin phastCons44way.wig phastCons44way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 22m54.291s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
ln -s `pwd`/phastCons44way.wib /gbdb/hg18/multiz44way/phastCons44way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44way phastCons44way.wig
# real 1m13.681s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44way > histogram.data 2>&1
# real 8m6.841s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44way track"
set xlabel " phastCons44way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
########################################################################
### Create a phastCons data set for Primates
# setup primates-only run
ssh swarm
mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/primates
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
# primates-only: exclude all but these for phastCons tree:
/cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \
> primates.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
> primates.non-inf
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# bed/chr18_random/chr18_random.1-4262.bed is empty
# bed/chr19_random/chr19_random.1-301858.bed is empty
# bed/chr21/chr21.1-10000000.bed is empty
# bed/chrM/chrM.1-16571.bed is empty
# the jobs that fail have messages like this:
# bed/chrM/chrM.1-16571.bed is empty
# WARNING: No match for name "tupBel1" in alignment.
# WARNING: No match for name "sorAra1" in alignment.
# Completed: 318 of 322 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 20253s 337.54m 5.63h 0.23d 0.001 y
# IO & Wait Time: 33093s 551.56m 9.19h 0.38d 0.001 y
# Average job time: 168s 2.80m 0.05h 0.00d
# Longest finished job: 249s 4.15m 0.07h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# create Most Conserved track
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates \
mostConserved.bed
# Loaded 808218 elements of size 5
# real 0m16.817s
# verify coverage
featureBits hg18 phastConsElements44wayPrimates
# 113268574 bases of 2881515245 (3.931%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44wayPrimates
# refGene:cds 1.144%, phastConsElements44wayPrimates 4.222%,
# both 0.756%, cover 66.07%, enrich 15.65x
featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPrimates
# knownGene:cds 1.205%, phastConsElements44wayPrimates 4.222%,
# both 0.769%, cover 63.84%, enrich 15.12x
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
mkdir downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.primates.wigFix.gz
echo $D $C done
done
'_EOF_'
# << happy emacs
time nice -n +19 ./gzipAscii.sh
# real 36m13.492s
# Create merged posterior probability file and wiggle track data files
zcat downloads/chr*.wigFix.gz \
| wigEncode stdin phastCons44wayPrimates.wig phastCons44wayPrimates.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 24m15.688s
## load table with wiggle data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
ln -s `pwd`/phastCons44wayPrimates.wib \
/gbdb/hg18/multiz44way/phastCons44wayPrimates.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPrimates phastCons44wayPrimates.wig
# real 0m48.942s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPrimates > histogram.data 2>&1
# real 5m50.154s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Hg18 Histogram phastCons44wayPrimates track"
set xlabel " phastCons44wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
########################################################################
### Create a phastCons data set for Euarchontoglires
# setup euarchontoglires-only run
ssh swarm
cd /hive/data/genomes/hg18/bed/multiz44way/cons
mkdir euarchontoglires
cd euarchontoglires
# euarchontoglires-only: exclude all but these for phastCons tree:
/cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \
> euarchontoglires.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
> euarchontoglires.non-inf
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# Two of these jobs fail to produce any output in the bed file:
# I believe this is because there is a missing sequence in these files
# compared to the ones specified in euarchontoglires.mod:
# bed/chr18_random/chr18_random.1-4262.bed is empty
# bed/chr19_random/chr19_random.1-301858.bed is empty
# Completed: 320 of 322 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 25869s 431.14m 7.19h 0.30d 0.001 y
# IO & Wait Time: 34404s 573.41m 9.56h 0.40d 0.001 y
# Average job time: 188s 3.14m 0.05h 0.00d
# Longest finished job: 272s 4.53m 0.08h 0.00d
# Submission to last job: 309s 5.15m 0.09h 0.00d
# create Most Conserved track
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
time nice -n +19 hgLoadBed hg18 phastConsElements44wayEuarch \
mostConserved.bed
# Loaded 1623656 elements of size 5
# real 4m15.125s
# verify coverage
featureBits hg18 phastConsElements44wayEuarch
# 109221588 bases of 2881515245 (3.790%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44wayEuarch
# refGene:cds 1.144%, mostConserved.bed 3.696%,
# both 0.822%, cover 71.87%, enrich 19.45x
# --rho 0.31 --expected-length 45 --target-coverage 0.3
# refGene:cds 1.144%, phastConsElements44wayEuarch 3.790%,
# both 0.822%, cover 71.79%, enrich 18.94x
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment knownGene:cds phastConsElements44wayEuarch
# knownGene:cds 1.205%, mostConserved.bed 3.696%,
# both 0.839%, cover 69.59%, enrich 18.83x
# --rho 0.31 --expected-length 45 --target-coverage 0.3
# knownGene:cds 1.205%, phastConsElements44wayEuarch 3.790%,
# both 0.838%, cover 69.51%, enrich 18.34x
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
mkdir downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.euarchontoglires.wigFix.gz
echo $D $C done
done
'_EOF_'
# << happy emacs
time nice -n +19 ./gzipAscii.sh
# real 26m54.263s
# Create merged posterior probability file and wiggle track data files
zcat downloads/chr*.wigFix.gz \
| wigEncode stdin phastCons44wayEuarch.wig phastCons44wayEuarch.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 18m15.693s
## load table with wiggle data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
ln -s `pwd`/phastCons44wayEuarch.wib \
/gbdb/hg18/multiz44way/phastCons44wayEuarch.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayEuarch phastCons44wayEuarch.wig
# real 0m57.590s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayEuarch > histogram.data 2>&1
# real 6m37.512s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Hg18 Histogram phastCons44wayEuarch track"
set xlabel " phastCons44wayEuarch score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
########################################################################
### Create a phastCons data set for Placentals
# setup placental-only run
ssh swarm
mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/placental
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
# placental-only: exclude all but these for phastCons tree:
/cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
> placental.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
> placental.non-inf
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# Two of these jobs fail to produce any output:
# bed/chr18_random/chr18_random.1-4262.bed is empty
# bed/chr19_random/chr19_random.1-301858.bed is empty
# Completed: 320 of 322 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 38258s 637.63m 10.63h 0.44d 0.001 y
# IO & Wait Time: 34704s 578.40m 9.64h 0.40d 0.001 y
# Average job time: 228s 3.80m 0.06h 0.00d
# Longest finished job: 313s 5.22m 0.09h 0.00d
# Submission to last job: 1030s 17.17m 0.29h 0.01d
# create Most Conserved track
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental \
mostConserved.bed
# Loaded 3962527 elements of size 5
# real 3m28.564s
# verify coverage
featureBits hg18 phastConsElements44wayPlacental
# 119635433 bases of 2881515245 (4.152%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44wayPlacental
# refGene:cds 1.144%, phastConsElements44wayPlacental 4.329%,
# both 0.840%, cover 73.41%, enrich 16.96x
featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPlacental
# knownGene:cds 1.205%, phastConsElements44wayPlacental 4.329%,
# both 0.858%, cover 71.17%, enrich 16.44x
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
mkdir downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.placental.wigFix.gz
echo $D $C done
done
'_EOF_'
# << happy emacs
time nice -n +19 ./gzipAscii.sh
# real 22m12.762s
# Create merged posterior probability file and wiggle track data files
zcat downloads/chr*.wigFix.gz \
| wigEncode stdin phastCons44wayPlacental.wig \
phastCons44wayPlacental.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 37m20.176s
## load table with wiggle data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
ln -s `pwd`/phastCons44wayPlacental.wib \
/gbdb/hg18/multiz44way/phastCons44wayPlacental.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPlacental phastCons44wayPlacental.wig
# real 1m16.900s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPlacental > histogram.data 2>&1
# real 8m15.623s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPlacental track"
set xlabel " phastCons44wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#########################################################################
# Update phastCons44way tables from Adam (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons
cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons
mkdir primates
cd primates
wget --timestamping \
ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/primates/*
cd ..
mkdir placental
cd placental
wget --timestamping \
ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/placental/*
cd ..
mkdir all
cd all
wget --timestamping \
ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/all/*
cd ..
zcat all/*.wigFix.gz \
| wigEncode stdin phastCons44way_v2.wig phastCons44way_v2.wib
zcat primates/*.wigFix.gz \
| wigEncode stdin phastCons44wayPrimates_v2.wig phastCons44wayPrimates_v2.wib
zcat placental/*.wigFix.gz \
| wigEncode stdin phastCons44wayPlacental_v2.wig phastCons44wayPlacental_v2.wib
ln -s `pwd`/*.wib /gbdb/hg18/multiz44way
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44way_v2 phastCons44way_v2.wig
# real 0m43.022s
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPrimates_v2 phastCons44wayPrimates_v2.wig
# real 0m43.660s
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPlacental_v2 phastCons44wayPlacental_v2.wig
# real 0m44.607s
time nice -n +19 hgLoadBed hg18 phastConsElements44way_v2 \
all/mostConserved.bed
# Loaded 4779670 elements of size 5
# real 2m10.975s
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates_v2 \
primates/mostConserved.bed
# Loaded 785075 elements of size 5
# real 0m21.619s
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental_v2 \
placental/mostConserved.bed
# Loaded 3862854 elements of size 5
# real 1m41.223s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPlacental_v2 > placental.histogram.data 2>&1
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPrimates_v2 > primates.histogram.data 2>&1
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44way_v2 > vertebrate.histogram.data 2>&1
cat << '_EOF_' | gnuplot > placental.histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPlacental_v2 track"
set xlabel " phastCons44wayPlacental_v2 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "placental.histogram.data" using 2:5 title " RelFreq" with impulses, \
"placental.histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display placental.histo.png &
cat << '_EOF_' | gnuplot > primates.histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPrimates_v2 track"
set xlabel " phastCons44wayPrimates_v2 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "primates.histogram.data" using 2:5 title " RelFreq" with impulses, \
"primates.histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display primates.histo.png &
cat << '_EOF_' | gnuplot > vertebrate.histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44way_v2 track"
set xlabel " phastCons44way_v2 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "vertebrate.histogram.data" using 2:5 title " RelFreq" with impulses, \
"vertebrate.histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display vertebrate.histo.png &
#########################################################################
# phyloP conservation for 44-way (2009-01-05 kate)
#
# Vertebrate, Placental
# Also doing Euarchontoglires, since Hiram did a phastCons run for that subset
#
# Using the newer LRT scoring method (replaces SPH), based on the
# scoring-method experiments above (compared to the SCORE method).
# Using phast from Adam's student Melissa Hubisz, with fixes needed for LRT scoring
# Will replace with version from CVS if/when these fixes are integrated
# PHAST version is 0.9.9.9b
# split SS files into 1M chunks (tried 10M used for phastCons, and these
# took 5hrs/chunk w/ LRT scoring)
ssh swarm
cd /cluster/data/hg18/bed/multiz44way
mkdir consPhyloP
cd consPhyloP
mkdir ss run.split
cd run.split
cat << 'EOF' > doSplit.csh
set c = $1
set d = /cluster/data/hg18/bed/multiz44way
set in = $d/cons/ss
set out = $d/consPhyloP/ss
set PHASTBIN = /cluster/bin/phast.2008-12-18
@ i=0
foreach f (`ls $in/$c/*.ss | sort -n -t\. -k2`)
@ i++
mkdir -p $out/$c/$i
$PHASTBIN/msa_split $f -i SS -o SS \
-r $out/$c/$i/$c.$i -w 1000000,0 -I 1000 -B 5000
end
echo "Done" >> $out/$c.done
'EOF'
# << happy emacs
set d = /cluster/data/hg18/bed/multiz44way/consPhyloP
set JOBS = $d/run.split/jobList
rm -f $JOBS
touch $JOBS
foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo "csh doSplit.csh $c {check out line+ $d/ss/$c.done}" >> $JOBS
end
para create jobList
# 49 jobs
para try
para check
para push
para time
# run phyloP with score=LRT
ssh swarm
cd /cluster/data/hg18/bed/multiz44way/consPhyloP
mkdir run.phyloP
cd run.phyloP
# Adjust model file base composition background and rate matrix to be
# representative of whole-genome (.41 -- as was done for ENCODE)
# using utility, 'modFreqs' from PHAST package
set PHASTBIN = /cluster/bin/phast.2008-12-18
set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | awk '{printf "%0.3f\n", $3 + $4}'`
echo $gc
# .410
# NOTE: this corresponds well to Hiram's GC values from his phyloFit runs
# on the 44-way ss files
$PHASTBIN/modFreqs ../../4d/phyloFit.all.mod $gc > ../../4d/44way.all.mod
# repeat for chrX only tree
cd /cluster/data/hg18/bed/multiz44way/4d
$PHASTBIN/modFreqs 4d.chrX.mod $gc > 44way.chrX.mod
ln -s `pwd`/44way.chrX.mod /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way
cat > doPhyloP.csh << 'EOF'
set f = $1
set out = $2
set c = $f:r:r
set n = $f:r:e
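# $f is an ss chunk name like chr1.1.1-1000000 (from msa_split -r);
# $c = chromosome, $n = chunk directory number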
set tmp = /scratch/tmp/$f
rm -fr $tmp
mkdir -p $tmp
cp -p /cluster/data/hg18/bed/multiz44way/consPhyloP/ss/$c/$n/$f.ss $tmp
cp -p tree.mod $tmp
pushd $tmp > /dev/null
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $c \
-i SS tree.mod $f.ss > $f.wig
popd > /dev/null
mkdir -p $out:h
mv $tmp/$f.wig $out
rm -fr $tmp
'EOF'
# Create list of chunks
pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss
ls chr*/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.list
popd > /dev/null
# need to fill in chr8, neglected in main run
pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss
ls chr8/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.chr8.list
popd > /dev/null
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../doPhyloP.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig}
#ENDLOOP
'EOF'
# setup run for all species
mkdir all
cd all
cp ../../../4d/44way.all.mod tree.mod
rm -fr wig
mkdir wig
# << happy emacs
gensub2 ../in.list single ../template jobList
# 2823 jobs
para create jobList
para try
para check
para push
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 4691641s 78194.02m 1303.23h 54.30d 0.149 y
#IO & Wait Time: 171343s 2855.71m 47.60h 1.98d 0.005 y
#Average job time: 1723s 28.71m 0.48h 0.02d
#Longest finished job: 2451s 40.85m 0.68h 0.03d
#Submission to last job: 6055s 100.92m 1.68h 0.07d
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
# check for clean dir here -- chr* will match garbage if it's there
cat > listWig.csh << 'EOF'
foreach c (`ls -d chr*`)
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
end
'EOF'
cd all/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayAll.wig phyloP44wayAll.wib
# Reloaded to include chr8 (2009-01-15 kate)
#Converted stdin, upper limit 7.13, lower limit -15.41
# Load gbdb and database with wiggle.
ln -s \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/wig/phyloP44wayAll.wib \
/gbdb/hg18/multiz44way/phyloP44wayAll.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayAll phyloP44wayAll.wig
# placental-only: exclude all but these:
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,\
micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,\
vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,\
sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
> 44way.placental.mod
cd ../consPhyloP/run.phyloP
mkdir placental
cd placental
cp ../../../4d/44way.placental.mod tree.mod
mkdir wig
gensub2 ../in.list single ../template jobList
# 2823 jobs
para create jobList
para try
para check
para push
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 3358003s 55966.71m 932.78h 38.87d 0.106 y
#IO & Wait Time: 142664s 2377.74m 39.63h 1.65d 0.005 y
#Average job time: 1240s 20.67m 0.34h 0.01d
#Longest finished job: 1781s 29.68m 0.49h 0.02d
#Submission to last job: 4383s 73.05m 1.22h 0.05d
# load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPlacMammal.wig phyloP44wayPlacMammal.wib
#Converted stdin, upper limit 3.46, lower limit -14.42
# Load gbdb and database with wiggle.
ln -s \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig/phyloP44wayPlacMammal.wib \
/gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacMammal phyloP44wayPlacMammal.wig
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \
> 44way.euarchontoglires.mod
# euarchontoglires only: exclude all but these:
cd ../consPhyloP/run.phyloP
mkdir euarch
cd euarch
cp ../../../4d/44way.euarchontoglires.mod tree.mod
mkdir wig
gensub2 ../in.list single ../template jobList
# 2823 jobs
para create jobList
para try
para check
para push
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 1646910s 27448.49m 457.47h 19.06d 0.052 y
#IO & Wait Time: 94310s 1571.84m 26.20h 1.09d 0.003 y
#Average job time: 617s 10.28m 0.17h 0.01d
#Longest finished job: 901s 15.02m 0.25h 0.01d
#Submission to last job: 2127s 35.45m 0.59h 0.02d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayEuarch.wig phyloP44wayEuarch.wib
#Converted stdin, upper limit 2.03, lower limit -9.78
ln -s \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig/phyloP44wayEuarch.wib \
/gbdb/hg18/multiz44way/phyloP44wayEuarch.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayEuarch phyloP44wayEuarch.wig
# primates only: exclude all but these:
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \
> 44way.primate.mod
cd ../consPhyloP/run.phyloP
mkdir primate
cd primate
cp ../../../4d/44way.primate.mod tree.mod
mkdir wig
gensub2 ../in.list single ../template jobList
para create jobList
# 2823 jobs
para try
para check
para push
# quick!
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 895998s 14933.30m 248.89h 10.37d 0.028 y
#IO & Wait Time: 66654s 1110.90m 18.52h 0.77d 0.002 y
#Average job time: 341s 5.68m 0.09h 0.00d
#Longest finished job: 503s 8.38m 0.14h 0.01d
#Submission to last job: 1190s 19.83m 0.33h 0.01d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimate.wig phyloP44wayPrimate.wib
#Converted stdin, upper limit 0.99, lower limit -8.17
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig/phyloP44wayPrimate.wib /gbdb/hg18/multiz44way/phyloP44wayPrimate.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimate phyloP44wayPrimate.wig
# get stats
cd run.phyloP/all
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayAll > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayAll | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../placental
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPlacMammal > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPlacMammal | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../euarch
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayEuarch > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayEuarch | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../primate
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPrimate > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPrimate | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
# Downloads
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
cat > listWigsByChrom.csh << 'EOF'
set c = $1
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
'EOF'
cat > downloads.csh << 'EOF'
mkdir ../downloads
foreach c (`ls -d chr*`)
echo $c
csh ../../listWigsByChrom.csh $c > ../downloads/$c.lst
csh ../../listWigsByChrom.csh $c | xargs cat | gzip -c > ../downloads/$c.$1.wigFix.gz
end
cd ../downloads
md5sum *.wigFix.gz > md5sum.txt
'EOF'
cd all/wig
csh ../../downloads.csh phyloP44way >&! downloads.log &
cd ../../placental/wig
csh ../../downloads.csh phyloP44way.placental >&! downloads.log &
cd ../../primate/wig
csh ../../downloads.csh phyloP44way.primate >&! downloads.log &
# create web downloads dir and add symlinks to files
cd ../../
mkdir downloads
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt downloads
# edit
cd /usr/local/apache/htdocs/goldenPath/hg18/
mkdir phyloP44way
cd phyloP44way
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/downloads/README.txt .
mkdir vertebrate
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/downloads/{*.gz,md5sum.txt} vertebrate
mkdir placentalMammals
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/downloads/{*.gz,md5sum.txt} placentalMammals
mkdir primates
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/downloads/{*.gz,md5sum.txt} primates
# Lineage-specific runs
# uses --subtree option of phyloP
# name ancestor nodes
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod --name-ancestors >44way.all-ancestors.mod
cd ../consPhyloP/run.phyloP
# built new PHAST package with fix from Adam for --subtree problems
sed -e 's/phyloP/phyloP --subtree=$3/' -e 's/phast.2008-12-18/phast.2009-01-26/' doPhyloP.csh > doPhyloPSubtree.csh
# visually inspect shell script
cat > template.subtree << 'EOF'
#LOOP
csh ../doPhyloPSubtree.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig} SUBTREE
#ENDLOOP
'EOF'
# primate lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir primate-ls
cd primate-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/hg18-micMur1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 4949300s 82488.33m 1374.81h 57.28d 0.157 y
#IO & Wait Time: 143956s 2399.27m 39.99h 1.67d 0.005 y
#Average job time: 1805s 30.08m 0.50h 0.02d
#Longest finished job: 2780s 46.33m 0.77h 0.03d
#Submission to last job: 6447s 107.45m 1.79h 0.07d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimateLs.wig phyloP44wayPrimateLs.wib
#Converted stdin, upper limit 3.91, lower limit -9.28
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig/phyloP44wayPrimateLs.wib /gbdb/hg18/multiz44way/phyloP44wayPrimateLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimateLs phyloP44wayPrimateLs.wig
# glire lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir glire-ls
cd glire-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/mm9-oryCun1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 5173192s 86219.87m 1437.00h 59.87d 0.164 y
#IO & Wait Time: 145615s 2426.91m 40.45h 1.69d 0.005 y
#Average job time: 1884s 31.40m 0.52h 0.02d
#Longest finished job: 2721s 45.35m 0.76h 0.03d
#Submission to last job: 6883s 114.72m 1.91h 0.08d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayGlireLs.wig phyloP44wayGlireLs.wib
#Converted stdin, upper limit 5.95, lower limit -6.99
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig/phyloP44wayGlireLs.wib /gbdb/hg18/multiz44way/phyloP44wayGlireLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGlireLs phyloP44wayGlireLs.wig
#########################################################################
# Update phyloP44way tables from Adam Siepel, Melissa Hubisz at Cornell
# This version uses a different neutral tree model for chrX
# and will replace the original version as default view on the Conservation track
# ( 2009-06-30 kate)
mkdir /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
mkdir primates
cd primates
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates/\*
cd ..
mkdir placental
cd placental
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/placental/\*
cd ..
mkdir all
cd all
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/all/\*
cd ..
zcat all/*.wigFix.gz | wigEncode stdin phyloP44way_v2.wig phyloP44way_v2.wib
zcat primates/*.wigFix.gz | wigEncode stdin phyloP44wayPrimates_v2.wig phyloP44wayPrimates_v2.wib
zcat placental/*.wigFix.gz | wigEncode stdin phyloP44wayPlacental_v2.wig phyloP44wayPlacental_v2.wib
ln -s `pwd`/*.wib /gbdb/hg18/multiz44way
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44way_v2 phyloP44way_v2.wig
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimates_v2 phyloP44wayPrimates_v2.wig
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacental_v2 phyloP44wayPlacental_v2.wig
# Lineage specific phyloP
# These updated tables will appear in the Lineage Cons track
mkdir glires-ls
cd glires-ls
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/glires-ls/\*
cd ..
mkdir primates-ls
cd primates-ls
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates-ls/\*
cd ..
zcat glires-ls/*.wigFix.gz | wigEncode stdin phyloP44wayGliresLs_v2.wig phyloP44wayGliresLs_v2.wib
zcat primates-ls/*.wigFix.gz | wigEncode stdin phyloP44wayPrimatesLs_v2.wig phyloP44wayPrimatesLs_v2.wib
ln -s `pwd`/phyloP44wayGliresLs_v2.wib /gbdb/hg18/multiz44way
nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGliresLs_v2 phyloP44wayGliresLs_v2.wig
ln -s `pwd`/phyloP44wayPrimatesLs_v2.wib /gbdb/hg18/multiz44way
nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimatesLs_v2 phyloP44wayPrimatesLs_v2.wig
######################################################################
# downloads for 44-way (DONE - 2009-01-09 - Hiram)
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/downloads/maf
cd /hive/data/genomes/hg18/bed/multiz44way/downloads/maf
# bash script
#!/bin/sh
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits hg18 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| /cluster/bin/$MACHTYPE/mafFrags hg18 multiz44way \
stdin stdout \
-orgs=/hive/data/genomes/hg18/bed/multiz44way/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
cd /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf
ln -s /hive/data/genomes/hg18/bed/multiz44way/downloads/maf/up*.gz .
md5sum up*.gz >> md5sum.txt
mkdir /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way
cd /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way
mkdir placentalMammals primates vertebrate
cd vertebrate
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/downloads/* .
cd ../placentalMammals
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/downloads/* .
cd ../primates
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/downloads/* .
cd ..
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/all.mod \
vertebrate.mod
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/primates.mod .
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/placental.mod \
./placentalMammals.mod
ln -s \
/hive/data/genomes/hg18/bed/multiz44way/downloads/phastCons44way/README.txt .
# pushQ MySQL tables:
phastCons44way, phastCons44wayPlacental, phastCons44wayPrimates,
multiz44way, multiz44wayFrames, multiz44waySummary,
phastConsElements44way, phastConsElements44wayPlacental,
phastConsElements44wayPrimates, phyloP44wayAll, phyloP44wayPlacMammal,
phyloP44wayPrimate
# pushQ files:
/gbdb/hg18/multiz44way/maf/*
/gbdb/hg18/multiz44way/phastCons44way.wib
/gbdb/hg18/multiz44way/phastCons44wayPlacental.wib
/gbdb/hg18/multiz44way/phastCons44wayPrimates.wib
/gbdb/hg18/multiz44way/phyloP44wayAll.wib
/gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib
/gbdb/hg18/multiz44way/phyloP44wayPrimate.wib
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/vertebrate/*
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/primates/*
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/placentalMammals/*
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/*.mod
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf/*
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/alignments/
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/*.nh
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/README.txt
/usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/vertebrate/*
/usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/placentalMammals/*
/usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/primate/*
# MySQL tables: 5,624,932,756 = 5,364 Mb
# gbdb files: 271,318,361,985 = 258,749 Mb
# apache htdocs: 58,767,852,372 = 56,045 Mb
# Total 335,711,147,113 = 320,159 Mb
# An extra set of error-corrected MAFs from the Siepel lab:
mkdir /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs
cd /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs
wget --timestamping \
"ftp://siepellab:XXXXXX@ftp.biotech.cornell.edu/2x/maf-ec/*"
# not showing the password here on purpose
# verify md5sums:
md5sum *.maf.gz > md5sum.here
diff md5sum.txt md5sum.here
# no difference
rm md5sum.here
mkdir \
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs
cd \
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs
ln -s /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs/* .
#########################################################################
# Create Syntenic and Recip Best net files to load into tracks to view
# on the browser to see what was used during the multiple alignment
cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain
netClass -verbose=0 -noAr hg18.gorGor1.rbest.net.gz hg18 gorGor1 stdout \
| gzip -c > netRBestGorGor1.net.gz
hgLoadNet hg18 netRBestGorGor1 netRBestGorGor1.net.gz
cd /hive/data/genomes/hg18/bed/blastz.ponAbe2/axtChain
hgLoadNet hg18 netSyntenyPonAbe2 hg18.ponAbe2.syn.net.gz
cd /hive/data/genomes/hg18/bed/blastz.calJac1/axtChain
netClass -verbose=0 -noAr hg18.calJac1.rbest.net.gz hg18 calJac1 stdout \
| gzip -c > netRBestCalJac1.net.gz
hgLoadNet hg18 netRBestCalJac1 netRBestCalJac1.net.gz
cd /hive/data/genomes/hg18/bed/blastz.tarSyr1/axtChain
netClass -verbose=0 -noAr hg18.tarSyr1.rbest.net.gz hg18 tarSyr1 stdout \
| gzip -c > netRBestTarSyr1.net.gz
hgLoadNet hg18 netRBestTarSyr1 netRBestTarSyr1.net.gz
#########################################################################
# EIO/JCVI NAS TRACK (2008-11-25 Fan)
# Contact: Gaetano Gargiulo [gaetano.gargiulo@ifom-ieo-campus.it]
cd /hive/data/genomes/hg18/bed
mkdir eioJcviNAS
cd eioJcviNAS
# receive the doc and two bed files and put them there.
fgrep -v description HG18_NAS_CD34_neg.bed| \
cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASNeg stdin
checkTableCoords -table=eioJcviNASNeg hg18
fgrep -v description HG18_NAS_CD34_pos.bed| \
cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASPos stdin
checkTableCoords -table=eioJcviNASPos hg18
# Create the description file, eioJcviNAS.html, according to
# the latest doc file from Gaetano.
#
# Add the two composite sub-tracks to human/hg18/trackDb.ra.
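# For reference, a rough sketch of the composite with its two subtracks
# (labels and settings below are illustrative assumptions, not the exact
# trackDb.ra entry that was committed):
#   track eioJcviNAS
#   compositeTrack on
#   shortLabel EIO/JCVI NAS
#   longLabel EIO/JCVI NAS in CD34+ and CD34- cells
#   type bed 3
#       track eioJcviNASNeg
#       subTrack eioJcviNAS
#       shortLabel NAS CD34-
#       track eioJcviNASPos
#       subTrack eioJcviNAS
#       shortLabel NAS CD34+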
#########################################################################
# hgPal downloads (DONE braney 2008-12-07)
# FASTA from 44way for refGene, knownGene, knownCanonical
ssh hgwdev
screen
bash
rm -rf /cluster/data/hg18/bed/multiz44way/pal
mkdir /cluster/data/hg18/bed/multiz44way/pal
cd /cluster/data/hg18/bed/multiz44way/pal
echo hg18 | cat - /cluster/data/hg18/bed/multiz44way/ordered.list > order.lst
mz=multiz44way
gp=refGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.jobs
time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
sleep 1
tail -f $gp.jobs.log
# real 525m57.376s
# user 25m36.072s
# sys 7m41.565s
ssh kolossus
mz=multiz44way
gp=refGene
db=hg18
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
mz=multiz44way
gp=refGene
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
mkdir -p $pd
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz44way
gp=knownGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 442m46.735s
# user 43m3.060s
# sys 10m45.635s
mz=multiz44way
gp=knownGene
db=hg18
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
mz=multiz44way
gp=knownGene
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
mkdir -p $pd
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
# now do the canonical set
cd /cluster/data/hg18/bed/multiz44way/pal
mz=multiz44way
gp=knownCanonical
db=hg18
for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes`
do
echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
done
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 326m12.849s
# user 17m40.850s
# sys 3m59.648s
rm *.known.bed
mz=multiz44way
gp=knownCanonical
db=hg18
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
mz=multiz44way
gp=knownCanonical
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
mkdir -p $pd
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
# BUILD OMIM RELATED GENES TRACK (complete rebuild, 2/24/09 Fan)
ssh hgwdev
cd /hive/data/genomes/gs.19/build36/bed
mkdir omimGene
cd omimGene
# download the file morbidmap and genemap from OMIM
mkdir omim
cd omim
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap
cat genemap|sed -e 's/|/\t/g' > genemap.tab
autoSql ~/src/hg/lib/omimGeneMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql
hgLoadSqlTab -warn hg18 omimGeneMap omimGeneMap.sql genemap.tab
# got warning on 3 records, just ignore them
# Warning: load of omimGeneMap did not go as planned: 11750 record(s), 0 row(s) skipped, 3 warning(s) loading genemap.tab
rm x.c x.h
cd ..
cat omim/morbidmap|sed -e 's/|/\t/g' > morbidmap.tab
autoSql ~/src/hg/lib/omimMorbidMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql
hgLoadSqlTab -warn hg18 omimMorbidMap omimMorbidMap.sql morbidmap.tab
# get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene
# that has a non-empty OMIM ID according to the refLink table. And use OMIM ID as
# the gene name for this new table. Please note the alignId field still holds the KG ID.
hgsql hg18 -N -e \
'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \
|cut -f 1,3-13 >o1.tab
# collect more OMIM related genes via the MIM external DB links from UniProt
hgsql hg18 -N -e \
'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \
|cut -f 1,3-13 >o2.tab
# concatenate the above two gene sets and remove duplications.
cat o1.tab o2.tab |sort -u >o3.tab
# load the result into a temp table, fanO3
hgLoadSqlTab hg18 fanO3 ~/src/hg/lib/knownGene.sql o3.tab
# while holding onto the OMIM ID, get the canonical gene (via the knownGene,
# knownIsoforms, and knownCanonical tables) that represents the cluster
# containing the initial OMIM gene in the fanO3 table
hgsql hg18 -N -e \
'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\
> o4.tab
# first column is the OMIM ID
cut -f 1 o4.tab >j1.tmp
# col 3-13 is the gene structure of the canonical KG
cut -f 3-13 o4.tab >j2.tmp
# stitch them together and remove duplicates, load the result into fanO4 table
paste j1.tmp j2.tmp |sort -u >fanO4.tab
hgLoadSqlTab hg18 fanO4 ~/src/hg/lib/knownGene.sql fanO4.tab
# finally sort the table and create bed 4 file and load it as the omimGene table
hgsql hg18 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed
hgLoadBed hg18 omimGene omimGene.bed
# create and load the omimToKnownCanonical table.
hgsql hg18 -N -e 'select name, alignId from fanO4 order by name'\
> omimToKnownCanonical.tab
hgLoadSqlTab hg18 omimToKnownCanonical \
~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab
# The following clean up could be done.
# hgsql hg18 -e 'drop table fanO3'
# hgsql hg18 -e 'drop table fanO4'
# rm j*.tmp
# rm o1.tab o2.tab o3.tab o4.tab
# update one omimGene record to reflect a correction UniProt is
# going to make on their MIM external link (per 12/15/08 emails from Bob and
# Livia (apache@vital-it.ch) of ExPASy).
hgsql hg18 -e 'update omimGene set name="611016" where name="608636"'
hgsql hg18 -e 'update omimToKnownCanonical set omimId="611016" where omimId="608636"'
#############################################################################
# fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram)
mkdir /hive/data/genomes/hg18/bed/fox2ClipSeq
cd /hive/data/genomes/hg18/bed/fox2ClipSeq
# lift the hg17 data to here
liftOver -bedPlus=9 \
/hive/data/genomes/hg17/bed/fox2ClipSeq/forwardStrand.bed.gz \
/usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \
stdout forwardStrand.unMapped | gzip -c > forwardStrand.bed.gz
liftOver -bedPlus=9 \
/hive/data/genomes/hg17/bed/fox2ClipSeq/reverseStrand.bed.gz \
/usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \
stdout reverseStrand.unMapped | gzip -c > reverseStrand.bed.gz
# turn into wiggle density plot
zcat forwardStrand.bed.gz | bedItemOverlapCount hg18 stdin \
| wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \
fox2ClipSeqDensityForwardStrand.wib
# Converted stdin, upper limit 2401.00, lower limit 1.00
zcat reverseStrand.bed.gz | bedItemOverlapCount hg18 stdin \
| wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \
fox2ClipSeqDensityReverseStrand.wib
# Converted stdin, upper limit 1406.00, lower limit 1.00
# and load tables
zcat forwardStrand.bed.gz reverseStrand.bed.gz \
| hgLoadBed hg18 fox2ClipSeq stdin
# Loaded 4418298 elements of size 9
ln -s `pwd`/*.wib /gbdb/hg18/wib
hgLoadWiggle hg18 fox2ClipSeqDensityForwardStrand \
fox2ClipSeqDensityForwardStrand.wig
hgLoadWiggle hg18 fox2ClipSeqDensityReverseStrand \
fox2ClipSeqDensityReverseStrand.wig
# add composite track definitions to makeDb/trackDb/human/trackDb.ra
#############################################################################
# REPEATMASKER - LATEST VERSION, 3.2.7 (DONE 1/30/09 rhubley and angie)
# Robert Hubley ran the new and improved version (3.2.7) of RepeatMasker
# but politely deferred to staff to load the results:
mkdir /hive/data/genomes/hg18/bed/RMRunRMH
cd /hive/data/genomes/hg18/bed/RMRunRMH
doRepeatMasker.pl -stop mask -buildDir `pwd` hg18
# see do.log, cat.log
# Angie loaded with new table name, chr*_rmskRM327. Used -debug to
# make scripts, edited those.
cd /hive/data/genomes/hg18/bed/RMRunRMH
doRepeatMasker.pl -debug \
-continue install -buildDir `pwd` hg18
# Edit doLoad.csh: change table names: rmsk -> rmskRM327,
# nestedRepeats -> nestedRepeatsRM327
./doLoad.csh >& load.log & tail -f load.log
# Edit doSplit.csh: change -ending to .RM327.fa.out
./doSplit.csh >& split.log & tail -f split.log
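# (The two hand edits above could also have been scripted; a sketch of
#  equivalent sed commands, assuming the stock scripts from the -debug run:)
#   sed -i -e 's/\brmsk\b/rmskRM327/g' \
#          -e 's/\bnestedRepeats\b/nestedRepeatsRM327/g' doLoad.csh
#   sed -i -e 's/-ending=[^ ]*/-ending=.RM327.fa.out/' doSplit.csh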
doRepeatMasker.pl -continue cleanup -buildDir `pwd` \
-fileServer hgwdev hg18 >& cleanup.log & tail -f cleanup.log
# Compare coverage to original RepeatMasker run:
featureBits hg18 rmskRM327
#1457032101 bases of 2881515245 (50.565%) in intersection
featureBits hg18 rmsk
#1406290513 bases of 2881515245 (48.804%) in intersection
# Wow, Arian got his 50%! :)
# Compare Alu counts, since that is supposed to be an area of improvement:
grep SINE/Alu hg18.fa.out | wc -l
#1186885
ls /hive/data/genomes/hg18/?{,?}{,_*_hap[12]}/chr[0-9XYM]{,[0-9]}{,_random,*_hap[12]}.fa.out \
| uniq | xargs grep SINE/Alu | wc -l
#1189976
# A decrease... weird. OK, breaking it down chrom-by-chrom, the _random's
# have fewer and the regular chrom's have more Alu's. Sounds OK to me :)
featureBits hg18 rmsk \!rmskRM327
#12318974 bases of 2881515245 (0.428%) in intersection
featureBits hg18 rmskRM327 \!rmsk
#63060562 bases of 2881515245 (2.188%) in intersection
# hgTables: 49,804 rmskRM327 items (4,805,535 bases) have no overlap with rmsk
# Added download file 2/5/09:
cd /hive/data/genomes/hg18
zip -j bigZips/chromOut.RM3.2.7.zip */chr*.RM327.fa.out
ln -s /hive/data/genomes/hg18/bigZips/chromOut.RM3.2.7.zip \
/usr/local/apache/htdocs/goldenPath/hg18/bigZips/
#############################################################################
# GENOME VARIANTS - 1000 GENOMES (DONE 1/7/2009 giardine, adapted from an email to angie)
# December release from 1000 Genomes: SNP calls on four of the 6 high-cov
# individuals: a CEU trio and a YRI daughter.
# see ftp://ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/README_December2008_release
cd /hive/data/genomes/hg18/bed/pgSnp/
cat > trio2pg.pl <<'EOF'
#!/usr/bin/perl -w
use strict;
#split out individual SNPs from trio file
#format: chr loc ref alleles snp.Q av.max.map.Q depth.cov NA12891 NA12891.Q
#        NA12892 NA12892.Q NA12878 NA12878.Q hwe maf tdt display
my $ac = shift @ARGV; #allele column, zero based
if (!$ac) {
print "Usage: trio2pg.pl alleleColumn# < infile > outfile\n";
exit;
}
while (<>) {
chomp;
my @f = split(/\t/);
if ($f[0] eq 'chr') { next; }
$f[$ac] =~ s/([ATGC])\/\1/$1/;
if ($f[$ac] eq uc($f[2])) { next; } #reference allele only
print "chr$f[0]\t", ($f[1]-1), "\t$f[1]\t$f[$ac]\t";
my $c = ($f[$ac] =~ tr/\//\//) + 1;
my $s = $f[$ac+1];
if ($s !~ /\//) {
for (my $i = 1; $c > $i; $i++) { $s .= ",$f[$ac+1]"; }
}else {
$s =~ s/\//,/g;
if ($c == 1) { $s =~ s/,.*//; }
}
my $n = "0";
for (my $i = 1; $c > $i; $i++) { $n .= ",0"; } #allele count
print "$c\t$n\t$s\n";
}
exit;
'EOF'
# << emacs
chmod a+x trio2pg.pl
#convert to pgSnp
set relDir = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | ./trio2pg.pl 7 > NA12891.pgSnp
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | ./trio2pg.pl 9 > NA12892.pgSnp
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | ./trio2pg.pl 11 > NA12878.pgSnp
zcat $relDir/YRI.child.dec.intersect.calls.gz | ./trio2pg.pl 7 > NA19240.pgSnp
#gff for indels does not give nts, can't put in pgSnp format
hgLoadBed hg18 pgNA12878 NA12878.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
hgLoadBed hg18 pgNA12891 NA12891.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
hgLoadBed hg18 pgNA12892 NA12892.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
hgLoadBed hg18 pgNA19240 NA19240.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
#############################################################################
# GENOME VARIANTS - (DONE 1/7/09 giardine, adapted by angie from pgSnp/README)
# File pgVenter.bed placed in /hive/data/genomes/hg18/bed/pgSnp/ by
# Belinda.
cd /hive/data/genomes/hg18/bed/pgSnp/
grep "^chr" pgVenter.bed | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgVenter stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
# 3/11/09: fetching this file because I think it's the original data (angie)
wget ftp://ftp.jcvi.org/pub/data/huref/HuRef.InternalHuRef-NCBI.gff
#############################################################################
# GENOME VARIANTS - YRI NA18507 (DONE 1/9/09 giardine, adapted by angie from pgSnp/README)
# SNP calls made by Aakrosh Ratan at PSU.
# Files pgYri{2,3}.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by
# Belinda.
# yoruban snp calls (using solid software instead of maq)
# Loaded 11/4/08 according to hg18.history, but table status says created
# 1/7/09:
cd /hive/data/genomes/hg18/bed/pgSnp/
grep "^chr" pgYri2.txt | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgYoruban2 stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
#Another yoruban SNP set, same individual, Solexa reads, includes indels
# Loaded 11/7/08 according to hg18.history, but table status says created
# 1/7/09:
grep "^chr" pgYri3.txt | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgYoruban3 stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
#############################################################################
# GENOME VARIANTS - YH (DONE 2/24/09 giardine, adapted by angie from pgSnp/README)
#Asian individual (YH1) from Nature paper
#http://yh.genomics.org.cn/index.jsp
# File pgSnpYh.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by
# Belinda.
cd /hive/data/genomes/hg18/bed/pgSnp/
grep "^chr" pgSnpYh.txt | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgYh1 stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
# 3/11/09: fetching this file because I think it's the original data (angie)
wget -O "yhsnp_add.gff" \
'http://yh.genomics.org.cn/do.downServlet?file=data/snps/yhsnp_add.gff'
#############################################################################
# Initial import of LSSNP data for SNP and hgGene linking (2009-02-02 markd)
#############################################################################
# dump and load LSSNP databases from Johns Hopkins. This will be automated
# soon.
# download dump into tmp directory LSSNP; must load on bugle as the
# database is mysql 5
ssh bugle
hgsql -e 'create database LSSNP'
cat LSSNP/*.sql |hgsql LSSNP
hgsqlimport LSSNP `pwd`/LSSNP/*.txt
ssh hgwdev
hgLsSnpPdbLoad fetch bugle:LSSNP lsSnpPdb.tab
hgLsSnpPdbLoad load hg18 lsSnpPdb lsSnpPdb.tab
#############################################################################
#############################################################################
# HGDP GEOGRAPHIC SNP MAPS (DONE 2/5/09 angie)
# Project data downloaded and parsed in /hive/data/outside/hgdpGeo,
# see makeDb/doc/hgdpGeo.txt.
mkdir /hive/data/genomes/hg18/bed/hgdpGeo
cd /hive/data/genomes/hg18/bed/hgdpGeo
# Make an rsId-sorted snp coords file for joining with the hgdpGeo data.
grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \
../snp129/snp129.bed \
| awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3;}' \
| sort > snp129Coords.txt
wc -l snp129Coords.txt
#660280 snp129Coords.txt
# How many distinct SNPs in there? (compare to 657000 from HGDP):
cut -f 1 snp129Coords.txt |uniq | wc -l
#656496
# Join files to make a track table:
join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4 \
snp129Coords.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \
| sed -re 's/([AGTC])\*/\1/' \
| sort -k1,1 -k2n,2n \
> hgdpGeo.tab
wc -l hgdpGeo.tab
#660280 hgdpGeo.tab
grep ERROR hgdpGeo.tab | wc -l
#0
hgLoadBed hg18 hgdpGeo hgdpGeo.tab \
-sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql
#Loaded 660280 elements of size 7
#############################################################################
# HGDP HETEROZYGOSITY (DONE 2/12/09 angie, except for Bantu 3/12/09)
mkdir /hive/data/genomes/hg18/bed/hgdpHzy
cd /hive/data/genomes/hg18/bed/hgdpHzy
foreach continent (african americas easia european mideast oceania sasia)
wget --timestamping http://hgdp.uchicago.edu/data/hzy/$continent.gff.gz
end
wget --timestamping http://hgdp.uchicago.edu/data/hzy/allbantu.hzy.gff.gz
foreach continent (african allbantu americas easia european mideast oceania sasia)
set bedGraph = `echo $continent \
| sed -re 's/can$/ca/; s/pean$/pe/; s/asia/Asia/; s/allbantu/bantu/; \
s/(.*)/hgdpHzy\u\1.bedGraph/'`
echo $bedGraph
zcat $continent.gff.gz \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> $bedGraph
end
# 3/12/09: All of the original files' coords were intervals between SNPs,
# but the Bantu file had SNP coordinates, and one more line per chrom than
# the others. So (after getting OK from Joe) I am going to transform the
# Bantu SNP coords to intervals like the others.
perl -we 'while (<>) { \
chomp; ($c, $s, undef, $h) = split; \
if (defined $lastC) { \
if ($lastC eq $c) { \
print "$c\t$lastS\t$s\t$lastH\n"; \
} # Discarding last SNP on each chrom \
} \
($lastC, $lastS, $lastH) = ($c, $s, $h); \
}' \
hgdpHzyBantu.bedGraph > tmp
mv tmp hgdpHzyBantu.bedGraph
# Using bedGraph, not wig, because there are only 640k datapoints and
# some are over the 10Mbase wiggle item size limit.
foreach f (*.bedGraph)
hgLoadBed hg18 $f:r $f -bedGraph=4
end
# All have same size:
#Loaded 640676 elements of size 4
#############################################################################
# HGDP FST (DONE 2/12/09 angie)
mkdir /hive/data/genomes/hg18/bed/hgdpFst
cd /hive/data/genomes/hg18/bed/hgdpFst
wget --timestamping \
http://hgdp.uchicago.edu/data/FST/autosomal_illuminasnps7_pval.gff.gz
zcat autosomal_illuminasnps7_pval.gff.gz \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> hgdpFst.bedGraph
hgLoadBed hg18 hgdpFst hgdpFst.bedGraph -bedGraph=4
#Loaded 640676 elements of size 4
#############################################################################
# HGDP IHS (DONE 2/13/09 angie)
mkdir /hive/data/genomes/hg18/bed/hgdpIhs
cd /hive/data/genomes/hg18/bed/hgdpIhs
foreach continent (Bantu Americas E.Asia European MiddleEast Oceania S.Asian)
wget --timestamping \
http://hgdp.uchicago.edu/data/iHS/smoothed$continent.iHS.gff.gz
set bedGraph = `echo $continent \
| sed -re 's/pean$/pe/; s/\.Asian?/Asia/; \
s/MiddleEast/Mideast/; s/(.*)/hgdpIhs\1.bedGraph/'`
echo $bedGraph
zcat smoothed$continent.iHS.gff.gz \
| sed -e 's/^chr23/chrX/' \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> $bedGraph
end
foreach f (*.bedGraph)
hgLoadBed hg18 $f:r $f -bedGraph=4
end
#Reading hgdpIhsBantu.bedGraph
#Loaded 540438 elements of size 4
#Reading hgdpIhsAmericas.bedGraph
#Loaded 422167 elements of size 4
#Reading hgdpIhsEAsia.bedGraph
#Loaded 487801 elements of size 4
#Reading hgdpIhsEurope.bedGraph
#Loaded 543875 elements of size 4
#Reading hgdpIhsMideast.bedGraph
#Loaded 552277 elements of size 4
#Reading hgdpIhsOceania.bedGraph
#Loaded 425340 elements of size 4
#Reading hgdpIhsSAsia.bedGraph
#Loaded 550231 elements of size 4
#############################################################################
# HGDP XP-EHH (DONE 2/12/09 angie)
mkdir /hive/data/genomes/hg18/bed/hgdpXpehh
cd /hive/data/genomes/hg18/bed/hgdpXpehh
foreach continent (Bantu Americas E.Asia Europe Mideast Oceania S.Asia)
wget --timestamping \
http://hgdp.uchicago.edu/data/XPEHH/$continent.xpehh.forbrowser.gff.gz
set bedGraph = `echo $continent \
| sed -re 's/\.Asia?/Asia/; s/(.*)/hgdpXpehh\1.bedGraph/'`
echo $bedGraph
zcat $continent.xpehh.forbrowser.gff.gz \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> $bedGraph
end
foreach f (*.bedGraph)
hgLoadBed hg18 $f:r $f -bedGraph=4
end
#Reading hgdpXpehhBantu.bedGraph
#Loaded 636680 elements of size 4
#Reading hgdpXpehhAmericas.bedGraph
#Loaded 636143 elements of size 4
#Reading hgdpXpehhEAsia.bedGraph
#Loaded 635799 elements of size 4
#Reading hgdpXpehhEurope.bedGraph
#Loaded 636680 elements of size 4
#Reading hgdpXpehhMideast.bedGraph
#Loaded 636849 elements of size 4
#Reading hgdpXpehhOceania.bedGraph
#Loaded 637418 elements of size 4
#Reading hgdpXpehhSAsia.bedGraph
#Loaded 636773 elements of size 4
#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-03-06 - Hiram )
mkdir /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug hg18 hg19
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
hg18 hg19 > do.log 2>&1
# real 85m8.064s
#############################################################################
# HAPMAP REL22 RECOMBINATION RATES (PHASE II) (DONE 2/24/09 angie)
mkdir -p /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates
cd /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/00README.txt
cd rates
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/rates/\*
# Make bedGraph-formatted files.
mkdir -p /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36
cd /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36
cp /dev/null hapmapRecombRate.bedGraph
foreach f (/hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates/*.txt)
set chr = `echo $f:t:r | sed -e 's/^.*chr/chr/; s/_b36.*//;'`
echo $f $chr
perl -wpe 's/^position .*\n// && next; \
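# input columns: position(bp) rate ...; emit an interval ending at each \
# position, faking a 100bp first interval since there is no prior position \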
m/^(\d+) (\d+\.?\d*) .*/ || die $_; $end=$1; $rate=$2; \
$start=$end-100 unless (defined $start); \
$_ = "'$chr'\t$start\t$end\t$rate\n"; $start = $end;' \
$f >> hapmapRecombRate.bedGraph
end
# Some items are over the 10Mbase wiggle item size limit, so use bedGraph.
time hgLoadBed hg18 hapmapRecombRate hapmapRecombRate.bedGraph -bedGraph=4
#Loaded 3281323 elements of size 4
#14.688u 1.796s 0:31.99 51.4% 0+0k 0+0io 0pf+0w
# There are >3M items... try bigWig! :)
wigToBigWig hapmapRecombRate.bedGraph /hive/data/genomes/hg18/chrom.sizes \
hapmapRecombRate.bw
ln -s `pwd`/hapmapRecombRate.bw /gbdb/hg18/bbi/
hgsql hg18 -e 'drop table if exists hapmapRecombRateBW; \
create table hapmapRecombRateBW (fileName varchar(255) not null); \
insert into hapmapRecombRateBW values ("/gbdb/hg18/bbi/hapmapRecombRate.bw");'
#############################################################################
# HAPMAP REL27 GENOTYPES (MERGED PHASE II+III) (DONE 2/25/09 angie)
# First, download release to /hive/data/outside...
mkdir -p /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/{excluded,forward}
cd /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/00README.txt
cd excluded
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/excluded/\*
cd ../forward
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/forward/\*
# This directory's README refers to the README from the
# phaseIII-only 2009_01, which gives the file format and explains
# the population codes:
wget --timestamping -O 00README_2009-01_phaseIII.txt \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-01_phaseIII/00README.txt
# For details page... this is Coriell's NHGRI panel (all HapMap except
# CEPH): http://ccr.coriell.org/Sections/Collections/NHGRI/?SsId=11
# http://www.broad.mit.edu/mpg/hapmap3/
# Broad, BCM and Sanger have a nice phase3 writeup. Here is Broad's
# copy: http://www.broad.mit.edu/mpg/hapmap3/
# Now translate those into hapmapSnps* tables.
# NOTE FOR NEXT TIME: make this a cluster job. It takes ~half hour each pop!
# Could run the script on each downloaded file as a separate job, and then
# concatenate results (or just feed chr*_$pop to hgLoadBed).
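# A sketch of what that could look like (hypothetical wrapper script
# convertHapmapPop.csh around the perl one-liner below, writing one bed
# per input file; not actually run):
#   foreach f ($sourceDir/genotypes_chr*_*_r27_nr.b36_fwd.txt.gz)
#     echo "csh convertHapmapPop.csh $f {check out line+ out/$f:t:r:r.bed}" >> jobList
#   end
#   para create jobList; para try; para push
#   # then per pop: hgLoadBed hg18 hapmapSnps$pop out/genotypes_chr*_${pop}_*.bed ...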
mkdir -p /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
set sourceDir = /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/forward
foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI)
echo $pop
zcat $sourceDir/genotypes_chr*_${pop}_r27_nr.b36_fwd.txt.gz \
| perl -wpe 'chomp; \
if (/^rs# alleles c\w+ pos s\w+ a\w+# c\w+ protLSID assayLSID panelLSID QCcode NA/) { \
$_ = ""; # skip header lines \
} elsif (s/^(rs\d+) ([ACGT])\/([ACGT]) (chr\w+) (\d+) \+ ncbi_[bB]?36 .* QC\+ //) { \
($rsId, $obs1, $obs2, $chr, $end) = ($1, $2, $3, $4, $5); \
%compl = (A=>"T", C=>"G", G=>"C", T=>"A"); \
%hom = (); %het = (); \
# NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \
if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") { warn "Tweaking YRI rs7059622.\n"; } \
foreach my $al (split()) { \
next if ($al eq "NN"); \
$al =~ /^([ACGT])([ACGT])$/ || die "Unrecognized allele string $al"; \
($a1, $a2) = ($1, $2); \
# NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \
if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") \
{ $a1 = $compl{$a1}; $a2 = $compl{$a2}; } \
# The error that the trouble-maker triggered: \
if (($a1 !~ /^[$obs1$obs2]$/) || ($a2 !~ /^[$obs1$obs2]$/)) \
{ die "$rsId (${chr}_'$pop'): obs $obs1/$obs2 !~ $a1$a2!\n\t"; } \
if ($a1 eq $a2) { $hom{$a1}++; } else { $het{$a1}++; $het{$a2}++; } \
} \
$start = $end - 1; \
$hom1 = $hom{$obs1} || 0; $hom2 = $hom{$obs2} || 0; \
$het = $het{$obs1} || 0; $het2 = $het{$obs2} || 0; \
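# score = minor allele frequency x 1000: count of obs2 alleles over all \
# alleles, folded at 500 so the rarer allele's frequency is reported \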
$score = (1000 * (2*$hom2 + $het) / (2*($hom1 + $hom2 + $het))); \
if ($score >= 500) { $score = 1000 - $score; } \
$score = int($score + 0.5); \
if ($het != $het2) { die "het{$obs1} ($het{$obs1}) != het{$obs2} ($het{$obs2})"; } \
$_ = "$chr\t$start\t$end\t$rsId\t$score\t+\t$obs1/$obs2\t$obs1\t$hom1\t$obs2\t$hom2\t$het\n"; \
} else { \
die "Unrecognized format:\n$_\n\t"; \
}' > hapmapSnps$pop.bed
end
wc -l hapmapSnps*.bed
# 1561453 hapmapSnpsASW.bed
# 4030774 hapmapSnpsCEU.bed
# 4052336 hapmapSnpsCHB.bed
# 1306196 hapmapSnpsCHD.bed
# 1407877 hapmapSnpsGIH.bed
# 4052423 hapmapSnpsJPT.bed
# 1529764 hapmapSnpsLWK.bed
# 1410265 hapmapSnpsMEX.bed
# 1537638 hapmapSnpsMKK.bed
# 1419921 hapmapSnpsTSI.bed
# 3984356 hapmapSnpsYRI.bed
foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI)
hgLoadBed hg18 hapmapSnps$pop hapmapSnps$pop.bed -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/hapmapSnps.sql
end
#Reading hapmapSnpsASW.bed
#Loaded 1561453 elements of size 12
#Reading hapmapSnpsCEU.bed
#Loaded 4030774 elements of size 12
#Reading hapmapSnpsCHB.bed
#Loaded 4052336 elements of size 12
#Reading hapmapSnpsCHD.bed
#Loaded 1306196 elements of size 12
#Reading hapmapSnpsGIH.bed
#Loaded 1407877 elements of size 12
#Reading hapmapSnpsJPT.bed
#Loaded 4052423 elements of size 12
#Reading hapmapSnpsLWK.bed
#Loaded 1529764 elements of size 12
#Reading hapmapSnpsMEX.bed
#Loaded 1410265 elements of size 12
#Reading hapmapSnpsMKK.bed
#Loaded 1537638 elements of size 12
#Reading hapmapSnpsTSI.bed
#Loaded 1419921 elements of size 12
#Reading hapmapSnpsYRI.bed
#Loaded 3984356 elements of size 12
rm bed.tab; nice gzip *.bed
#############################################################################
# HAPMAP REL27 ORTHOLOGOUS ALLELES (DONE 3/4/09 angie)
# Similar procedure to snp129Ortho, but we make one table per species
# because they are independent subtracks of HapMap SNPs.
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $7 "|" $6, \
0, $6;}' \
hapmapSnps???.bed \
| sort -u -k1,1 -k2n,2n \
> hapmapSnpsForLiftOver.bed
wc -l hapmapSnpsForLiftOver.bed
#4165831 hapmapSnpsForLiftOver.bed
# Orthologous allele locations:
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../hapmapSnpsForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III/run.liftOChimp
para make jobList
#Completed: 167 of 167 jobs
#CPU time in finished jobs: 31364s 522.74m 8.71h 0.36d 0.001 y
#IO & Wait Time: 800s 13.33m 0.22h 0.01d 0.000 y
#Average job time: 193s 3.21m 0.05h 0.00d
#Longest finished job: 431s 7.18m 0.12h 0.00d
#Submission to last job: 442s 7.37m 0.12h 0.01d
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 167 of 167 jobs
#CPU time in finished jobs: 2482s 41.36m 0.69h 0.03d 0.000 y
#IO & Wait Time: 1361s 22.69m 0.38h 0.02d 0.000 y
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest finished job: 33s 0.55m 0.01h 0.00d
#Submission to last job: 97s 1.62m 0.03h 0.00d
# Concatenate the liftOver results, sorting by ortho pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is swizzled so that a glom of ortho coords is the first column,
# and then we sort by that for joining with base quality info.
# Ditto for macaque. ~5 minutes per species:
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/panTro2/panTro2.2bit \
| awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/rheMac2/rheMac2.2bit \
| awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt
# 4057739 panTro2.orthoGlom.txt
# 3750076 rheMac2.orthoGlom.txt
# Get base qualities -- ~12-16min per species.
cut -f 1 panTro2.orthoGlom.txt | sed -e 's/:/\t/g' \
| hgWiggle -db=panTro2 -lift=1 -doAscii -bedFile=stdin quality \
| varStepToBedGraph.pl stdin \
| awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \
| sort > panTro2.baseQuals.txt
#Processed 4003968 lines input, 4003685 data lines, 47 variable step declarations
cut -f 1 rheMac2.orthoGlom.txt | sed -e 's/:/\t/g' \
| hgWiggle -db=rheMac2 -lift=1 -doAscii -bedFile=stdin quality \
| varStepToBedGraph.pl stdin \
| awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \
| sort > rheMac2.baseQuals.txt
#Processed 3749772 lines input, 3749645 data lines, 21 variable step declarations
# Join the allele-glom with the base qual-glom and swizzle columns into
# the right order for a hapmapAllelesOrtho table.
join -a 1 -e 0 panTro2.orthoGlom.txt panTro2.baseQuals.txt \
| perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \
($oC, $oS, $oE) = split(":", $oG); \
($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \
unless (defined $bQ) { \
if ($oC =~ /^chr(21|Y|Y_random)$/) { $bQ = 98; } # per panTro2 quality track desc \
elsif ($oC eq "chrM") { $bQ = 0; } \
else { die "missing qual for $oC: $_\n\t"; } } \
$_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \
| sort -k1,1 -k2n,2n \
> hapmapAllelesChimp.bed
wc -l hapmapAllelesChimp.bed
#4057739 hapmapAllelesChimp.bed
join -a 1 -e 0 rheMac2.orthoGlom.txt rheMac2.baseQuals.txt \
| perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \
($oC, $oS, $oE) = split(":", $oG); \
($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \
unless (defined $bQ) { die "missing qual for $oC: $_\n\t"; } \
$_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \
| sort -k1,1 -k2n,2n \
> hapmapAllelesMacaque.bed
wc -l hapmapAllelesMacaque.bed
#3750076 hapmapAllelesMacaque.bed
# Load tables.
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
hgLoadBed hg18 hapmapAllelesChimp hapmapAllelesChimp.bed \
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql
#Loaded 4057739 elements of size 13
hgLoadBed hg18 hapmapAllelesMacaque hapmapAllelesMacaque.bed \
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql
#############################################################################
# HAPMAP REL27 SUMMARY FOR HGTRACKS FILTERING (DONE 3/5/09 angie)
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
time hapmapPhaseIIISummary .
#115.244u 5.009s 2:10.08 92.4% 0+0k 0+0io 2pf+0w
time hgLoadBed hg18 hapmapPhaseIIISummary hapmapPhaseIIISummary.bed \
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapPhaseIIISummary.sql
#Loaded 4166007 elements of size 18
#33.401u 3.275s 1:46.95 34.2% 0+0k 0+0io 0pf+0w
#############################################################################
# GERP Conservation scoring and elements for Ensembl 31-way alignments
# From Javier Guerroro
# ENCODE-related data (requested by Margulies, for use by ENCODE analysis group)
# (2009-03-05 kate)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir -p ensembl31wayGerp/lab
cd ensembl31wayGerp/lab
wget -r ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/31way_msa/
cd ..
bzcat lab/31way_gerp_elements.bed.bz2 | \
tail -n +2 | \
sed 's/31way_gerp_elem_365000000/gerp31./' | \
hgLoadBed hg18 ensembl31wayGerpElements stdin \
-sqlTable=$HOME/kent/src/hg/lib/encode/broadPeak.sql -renameSqlTable
# Loaded 1464897 elements of size 9
cat > we.csh << 'EOF'
foreach f (lab/*.wig.bz2)
echo $f
bzcat $f | tail -n +2 | wigEncode stdin temp.wig temp.wib
end
'EOF'
bzcat lab/*.wig.bz2 | tail -n +2 | \
wigEncode stdin ensembl31wayGerpScores.wig ensembl31wayGerpScores.wib
# load database
mkdir /gbdb/hg18/wib
ln -s `pwd`/ensembl31wayGerpScores.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 ensembl31wayGerpScores ensembl31wayGerpScores.wig
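# Optional spot check (not part of the original record): summary stats for
# one chromosome can be pulled straight from the loaded wiggle, with
# something like:
#   hgWiggle -db=hg18 -chr=chr21 -doStats ensembl31wayGerpScores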
############################################################################
# VEGA GENES UPDATE (BUILD 33) (DONE 2009-03-11 Andy)
mkdir /cluster/data/hg18/bed/vega33
cd /cluster/data/hg18/bed/vega33
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \
"ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz"
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep "^chr" > nonHaps.gtf
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep -v "^chr" > haps.gtf
awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keptHaps.gtf
liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keptHaps.gtf
cat nonHaps.gtf lifted.gtf > all.gtf
gzip all.gtf
rm *.gtf
gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf.gz stdout | gzip > all.gp.gz
/cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > ensGtp.tab
genePredCheck -db=hg18 all.gp.gz
#checked: 69859 failed: 0
zcat all.gtf.gz | grep -i pseudo > pseudo.gtf
zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=hg18 pseudo.gp
#checked: 6901 failed: 0
genePredCheck -db=hg18 not.pseudo.gp
#checked: 62958 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg18/bed/ucscToEnsembl
cd /hive/data/genomes/hg18/bed/ucscToEnsembl
awk '{printf "%s\t%s\n", $4, $2}' ../../jkStuff/ensGene.haplotype.lift \
> ucscToEnsembl.tab
cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
ucsc varchar(255) not null, # UCSC chromosome name
ensembl varchar(255) not null, # Ensembl chromosome name
#Indices
PRIMARY KEY(ucsc(21))
);
'_EOF_'
hgsql hg18 < ucscToEnsembl.sql
hgsql hg18 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
> ensemblLift.tab
cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
chrom varchar(255) not null, # Ensembl chromosome name
offset int unsigned not null, # offset to add to UCSC position
#Indices
PRIMARY KEY(chrom(6))
);
'_EOF_'
hgsql hg18 < ensemblLift.sql
hgsql hg18 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
##############################################################################
# FOX2 CLUSTERS (DONE 2009-04-08, Andy)
cp cluster.combine.bed /hive/data/genomes/hg18/bed/fox2ClipSeq
## (got the data as an attachment from Gene Yeo)
cd /hive/data/genomes/hg18/bed/fox2ClipSeq
grep chr cluster.combine.bed | cut -f1-4 | \
bedSort stdin fox2ClipClusters.hg17.bed
liftOver fox2ClipClusters.hg17.bed \
/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
fox2ClipClusters.bed unmapped.bed
hgLoadBed hg18 fox2ClipClusters{,.bed}
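# Optional sanity check (not part of the original record): see how many
# clusters survived the hg17 -> hg18 liftOver.
wc -l fox2ClipClusters.hg17.bed fox2ClipClusters.bed unmapped.bed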
##############################################################################
# RE-BUILD sno/miRNA TRACK (DONE, 2009-06-11 - 2009-06-13, hartera)
# The data in this track is out of date so update the track.
mkdir -p /hive/data/genomes/hg18/bed/wgRna-2009-06-11
cd /hive/data/genomes/hg18/bed/wgRna-2009-06-11
# Download GFF file of latest miRNA annotations from miRBase at the
# Wellcome Trust Sanger Institute (WTSI). This is Release 13.0 (March
# 2009)
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/hsa.gff
# Re-format, need to add "chr" to the beginning of each line.
sed -e 's/^/chr/' hsa.gff > hsMirBaseFormat.gff
# Remove extra "chr" in comment lines
perl -pi.bak -e 's/chr#/#/' hsMirBaseFormat.gff
# Change chrMT to chrM
perl -pi.bak -e 's/chrMT/chrM/' hsMirBaseFormat.gff
# Remove all but ID name in last field
sed -e 's/\";//g' hsMirBaseFormat.gff | sed -e 's/ID=\"//g' \
| sed -e 's/ACC=\"MI[0-9]*\s//' > hsMirBaseFormatIdOnly.gff
# use score 960 for + strand and 480 for - strand. This will show
# up black on the track for + strand and grey for - strand.
# Starts appear to be 1-based when compared to miRNAs in current track
# and those in Ensembl.
# Confirmed with Sam Griffith-Jones (one of the authors of miRBase,
# sam.griffith-jones@manchester.ac.uk) that these GFF coordinates
# are 1-based.
# Also add thickStart and thickEnd columns and "miRNA" for type.
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{if ($0 !~ /#/ && $7 == "+") \
print $1, $4-1, $5, $9, 960, $7, 0, 0, "miRNA"; \
else if ($0 !~ /#/ && $7 == "-") \
print $1, $4-1, $5, $9, 480, $7, 0, 0, "miRNA";}' \
hsMirBaseFormatIdOnly.gff > hsMirBaseFormatIdOnly.bed
# 2009-06-12
# snoRNAs are from snoRNABase at http://www-snorna.biotoul.fr/
# Download coordinates for hg18 from
# http://www-snorna.biotoul.fr/coordinates.php
# This is version 3 of the database.
# save as tab-separated file: snoRNABaseVersion3Coords.txt and remove
# first and last lines.
perl -pi.bak -e 's/\"//g' snoRNABaseVersion3Coords.txt
# Reformat to BED format with thickStart and thickEnd set to 0.
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{if ($4 == "+") \
print $1, $2-1, $3, $5, 960, $4, 0, 0,$6; \
else if ($4 == "-") \
print $1, $2-1, $3, $5, 480, $4, 0, 0,$6;}' \
snoRNABaseVersion3Coords.txt > snoRNABaseVersion3Coords.bed
# Merge the miRNA and snoRNA files together
cat hsMirBaseFormatIdOnly.bed snoRNABaseVersion3Coords.bed \
> wgRna20090611.bed
# Load into separate table rather than overwriting wgRna
cp -p /cluster/home/hartera/src/hg/lib/wgRna.sql wgRnaJun09.sql
perl -pi.bak -e 's/TABLE wgRna/TABLE wgRnaJun09/' wgRnaJun09.sql
hgLoadBed -sqlTable=wgRnaJun09.sql hg18 wgRnaJun09 wgRna20090611.bed
# Reading wgRna20090611.bed
# Loaded 1120 elements of size 9
# Sorted
# Creating table definition for wgRnaJun09
# Saving bed.tab
# Loading hg18
# Clean up
rm *.bak
hgsql -e 'select count(*) from wgRna;' hg18
# 1059
# for miRNAs: 685 (676 unique names)
# and others: 374 including 21 scaRNA
hgsql -e 'select count(*) from wgRnaJun09;' hg18
# 1120
# for miRNAs: 718 (705 unique)
# and others: 402 including 21 scaRNA
# 2009-06-13
# Renamed the old wgRna track to wgRnaOld and renamed the new wgRnaJun09
# track to wgRna. Will keep the old track around for a while until
# new track checked and QA'd.
hgsql -e 'alter table wgRna rename wgRnaOld;' hg18
hgsql -e 'alter table wgRnaJun09 rename wgRna;' hg18
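# Optional QA comparison (not part of the original record): check how much
# the rebuilt track overlaps the retired one, e.g.:
featureBits hg18 wgRnaOld wgRna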
##################
## Uniqueness Track: Step one (courtesy of John Castle, Rosetta)
## Make oligos of length XX
# Perl one-liner to make a batch file
# I've included the perl files CNV_makereads2.pl (simply uses substr on a
# chromosome) and fastagrep.pl (to remove sequences with Ns). The files
# chr$x.fa are the individual chromosomes.
perl -e 'for ($i = 1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} if ($i == 24) {$x = 'Y';} if ($i == 25) {$x = 'M';} print "~/DTcode/CNV_makereads2.pl 100 /info/genome/Projects/721/ref/chr$x.fa | fastagrep.pl -v n > chr$x.fa\n";}' > batch_chr_get
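# ---- ~/DTcode/CNV_makereads2.pl (contributed script referenced above, reproduced below) ----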
#!/usr/bin/perl -w
#---------------------------------------------------------------------
# C O P Y R I G H T N O T I C E
#---------------------------------------------------------------------
# Copyright (c) 2001 Rosetta Inpharmatics, Inc.
# 12040 115th Avenue NE, Kirkland, WA 98034-6900
# All Rights Reserved. Reproduction, adaptation, or
# translation without prior written permission of
# Rosetta Inpharmatics, Inc. is prohibited.
#---------------------------------------------------------------------
# CNV_makereads.pl
# $Id$
#use lib ('/home/castlej/perl/','/home/castlej/OSDTools/','/home/castlej/DTcode/');
#use strict;
my $oligo_length = $ARGV[0];
my $file = $ARGV[1];
open(IN,$file);
$/ = "\n>";# change input line separator to '>' to suck up FASTA sequences
while ($line= <IN>) {
$line =~ s/^>//m;
# remove '>' from end of $line
$line =~ s/>$//m;
# remove Unigene lines starting with '#'
$line =~ s/\n\#.*$//m;
# get sequence id
$line =~ /^\s*(\S+).*([^\0]*)/;
$id = $1;
$seq = $2;
$seq =~ s/\n//g;
}
if ($id =~ /(chr\S+)\.nib/) {
$chr = $1;
} elsif ($id =~ /(chr\S+)/) {
$chr = $1;
}
for ($i = 0; $i <length($seq)-$oligo_length; $i++) {
$a = substr($seq,$i,$oligo_length);
$j = $i+$oligo_length;
print ">$chr:$i-$j\n$a\n";
}
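# ---- fastagrep.pl (contributed script referenced above, reproduced below) ----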
#!/usr/bin/perl -w
#---------------------------------------------------------------------
# C O P Y R I G H T N O T I C E
#---------------------------------------------------------------------
# Copyright (c) 2000,2001,2002 Rosetta Inpharmatics, Inc.
# 12040 115th Avenue NE, Kirkland, WA 98034-6900
# All Rights Reserved. Reproduction, adaptation, or
# translation without prior written permission of
# Rosetta Inpharmatics, Inc. is prohibited.
#---------------------------------------------------------------------
#
# $Id$
#
# finds selected sequences in FASTA by regex matching in defline or sequence
use strict;
my( $option,
$regex,
@regexes,
%tofind,
$exceptflag,
$key,
$value,
$line,
);
$exceptflag = 0;
unless (scalar(@ARGV)) {
print "\nUsage: $0 [OPTION] PATTERN [FASTAFILE]\n";
print "$0 finds sequences by pattern matching in FASTA format data\n\n";
exit;
}
while ((scalar(@ARGV)) && ($ARGV[0] =~ /^-(\w+)/)) {
$option = $1;
shift(@ARGV);
if ($option =~ /v/) { # user wants sequences NOT matching regex(es)
$exceptflag = 1;
}
if ($option =~ /s/) { # regex on command line
push(@regexes, shift(@ARGV));
}
if ($option =~ /f/) { # user wants list of regexes from file
open(INHANDLE, "<$ARGV[0]") ||
die "$0: error, can't open regex list file $ARGV[0]\n";
while (defined($regex = <INHANDLE>)) {
chomp $regex;
push(@regexes, $regex);
}
shift(@ARGV);
}
}
if (scalar(@regexes) < 1) { push(@regexes, shift(@ARGV)); }
$/ = "\n>"; # change input line separator to suck up FASTA sequences
SEQUENCE:
while (defined($line = <>)) {
# remove '>' from start of first $line
$line =~ s/^>//m;
# stick '>' back on all $lines
$line = '>'.$line;
# remove '>' from end of $line
$line =~ s/>$//m;
# remove Unigene lines starting with '#'
$line =~ s/\n\#.*$//m;
foreach $regex (@regexes) {
if ($line =~ /$regex/) {
unless ($exceptflag) { print $line; }
next SEQUENCE;
}
}
if ($exceptflag) { print $line; }
}
# Submit batch file to cluster (we use LSF), each line is a submission
perl -ne 'chomp; $a = "bsub -q short64 \"$_\"\n"; system($a);' batch_chr_get
####################
# Uniqueness Step two
# I've used an older version of BWA. The newer version from sourceforge
# outputs a binary file which then must be converted to a text file.
# HG18 is the human genome
# I could include banything_2GBNew.pl but it is simply a cluster "chunk and submit" script
# Method 1 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "banything_2GbNew2.pl -a /ifs65/dtap/bin/bwa/bwa-0.2.0/bwa -z 1000000 -in chr$x.fa -o chr$x.bwa -stdout chr$x.bwa -pre \"aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 \" -suf \" \" \n";}' >! batch_banything
chmod +777 batch_banything
batch_banything
# Method 2 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "/ifs65/dtap/bin/bwa/bwa-0.2.0/bwa aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 chr$x.fa > chr$x.bwa\n"}' >! batch_banything
chmod +777 batch_banything
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_anything
#####################
# Uniqueness Step three
# I ran this one-liner from a higher level directory
perl -e '$pwd = `pwd`; chomp($pwd); @a = `ls`; foreach $dir (@a) {chomp ($dir); unless ($dir =~ /(\d+)mer_2nd/) {next;}; @b = `ls $dir/*fa.bwa`; foreach $file (@b) {chomp($file); $f = "$pwd/$file"; $f =~ /^(\S+chr[^\.]+)\.*/; $e = $1; print "~/DTcode/CNV_parseBWA_wiggle.pl 100 1 $f\* > $e.quality.100.wiggle\n";}}' > batch_wiggle
# Submit batch file to cluster (we use LSF), each line is a submission
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_wiggle
#!/usr/bin/perl -w
# John Castle
# May 19, 2009
# $Cap a maximum value to clip data with
# $Use_score whether to output the uniqueness score or the number of hits
# @FilesIn the BWA text output files to scan
# ** NOTE ** The newer BWA algorithm outputs a binary file that is then made into a text file using BWA again.
# However, the text file output has a slightly different format so the parsing will need to change.
($Cap, $Use_score, @FilesIn) = @ARGV;
if ($FilesIn[0] =~ /\.gz/) {
open(IN,"gzip -dc $FilesIn[0] |")
} else {
open(IN,$FilesIn[0]);
}
#### Description
@a = split("\t",<IN>);
$a[6] =~ /(\d+)/;
$len = $1;
close(IN);
### Wiggle header text
if ($Use_score == 0) {
print "track type=wiggle_0 name=\"Alignment scores of $len\mer as\" description=\"Unique $len mer alignments\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
} else {
print "track type=wiggle_0 name=\"$len\mer alignment scores\" description=\"$len\mer alignment scores from BWA/MAQ, where 37 indicates a unique alignment\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
}
### Parse through file(s)
foreach $file (@FilesIn) {
if ($file =~ /\.gz/) {
open(IN,"gzip -dc $file |");
} else {
open(IN,$file);
}
@a = split("\t",<IN>);
$a[0] =~ /(chr\S+):(\d+)/;
$Chr = $1;
$start = $2;
$score = $a[5];
$hits = $a[11];
if ($hits > $Cap) {$hits = $Cap;}
if ($Use_score == 1) {$value = $score;}
else {$value = $hits;}
while (<IN>) { # Make wiggle track, with start and end coordinates for same scoring regions
@a = split("\t",$_);
if ($#a <15) {
next;
}
$a[0] =~ /(chr\S+):(\d+)/;
$chr = $1;
$pos = $2;
$score = $a[5];
$hits = $a[11];
if ($hits > $Cap) {$hits = $Cap;}
if ($Use_score == 1) {$x = $score;
} else {$x = $hits;}
if ($x != $value) {
print "$Chr\t$start\t$pos\t$value\n";
$Chr = $chr;
$value = $x;
$start = $pos;
}
}
print "$Chr\t$start\t$pos\t$value\n";
close(IN);
}
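# The record above stops at the per-chromosome *.quality.100.wiggle files
# (4-column, bedGraph-style output from CNV_parseBWA_wiggle.pl).  If these
# were to be loaded as a UCSC wiggle track, the usual pattern would look
# something like the sketch below (not run here; the uniqueness100mer
# track/table name is hypothetical):
#   cat *mer_2nd/chr*.quality.100.wiggle | grep -v '^track' \
#       | wigEncode stdin uniqueness100mer.wig uniqueness100mer.wib
#   ln -s `pwd`/uniqueness100mer.wib /gbdb/hg18/wib
#   hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uniqueness100mer uniqueness100mer.wig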
############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
mkdir /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ_M=50
# TARGET: Human hg18
SEQ1_DIR=/scratch/data/hg18/bothMaskedNibs
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 582m47.015s
# failed due to power failure - Mon Jun 29 23:32:54 PDT 2009
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-continue=chainRun -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > chainRun.log 2>&1 &
# real 430m13.886s
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.hg18.swap
cd /hive/data/genomes/equCab2/bed/blastz.hg18.swap
time doBlastzChainNet.pl \
/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-swap -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 238m42.004s
cat fb.equCab2.chainHg18Link.txt
# 1622340736 bases of 2428790173 (66.796%) in intersection
############################################################################
# Fantom Cage 4 Track (2009-07-16)
cd /projects/compbiousr/sugnet/projects/cage-20090428
mkdir data
cd data
# Get the Human tags from Riken's download site.
wget -r -l 3 http://fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/
# Apparently time series with hours at:
# 4,5,6,8,10,11,15,21,22,27,28,33,34,35,37,40,42,43,45,47,48,49,51,52,53,57,59,61,62,63,64,65,69,73,74,91,92,93,h95 ctrls, i02, i03
# Go to the data directory
cd /projects/compbiousr/sugnet/projects/cage-20090428/data/fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/
# Unzip data
for bz in `ls *.bz2`; do \
echo "Unzipping $bz"; \
bunzip2 $bz; \
done
# From column headers it looks like the values of interest are:
# 0 = id
# 1 = library_count
# 2 = edit_string
# 3 = chrom
# 4 = strand
# 5 = start
# 6 = end
# Pull the raw scores into a single file
cat h*_mapping.tbl.txt | grep -v '^#' | grep -v 'library_count' | grep 'chr' | perl -ne '$l=$_; @w = split /\t/, $l; print "$w[3]\t$w[5]\t$w[6]\t$w[0]\t$w[1]\t$w[4]\n";' > all.wscores.bed
cat << '_EOF_' > toBed.pl
#!/usr/bin/perl
$prefix = shift(@ARGV);
$prefix =~ s/h/H/g;
while($l = <>) {
if(!($l=~ /^\#/) && !($l=~/^id/)) {
chomp($l);
@w = split /\t/, $l;
$score = 100 * $w[1];
if($score > 1000) {
$score = 1000;
}
$name = $prefix;
$size = $w[6] - $w[5];
print "$w[3]\t$w[5]\t$w[6]\t$prefix\t$score\t$w[4]\t$w[5]\t$w[6]\t0\t1\t$size,\t0,\n";
}
}
'_EOF_'
# << happy emacs
chmod 755 toBed.pl
# Make the top level bed track
for f in `ls *mapping.tbl.txt`; do
root=`basename $f .txt`;
prefix=`basename $f _mapping.tbl.txt`;
bed=$root.bed;
echo "Reading from $f into $bed with prefix $prefix";
toBed.pl $prefix < $f > $bed;
done;
# Call program in stats mode to generate summary statistics about how many reads there are in a sliding window around
# sites with tags
cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.scores -reverse=all.reverse.plaw.scores -stats-only
# Grab every 100th record to make a bite (byte?) sized chunk for R
cat all.forward.plaw.scores | perl -e '$c = 0; while($l=<>) { if($c++ % 100 == 0) { print "$l"; } }' > sample.txt
# Some R code (run interactively in R) to fit a power law model and get the
# coefficient via a log/log line fit
d = read.table('sample.txt');
# Grab all the data less than 200 counts (81% of data) as that is where the model really fits
dd = d$V4[d$V4 < 200]
# Use hist command to find counts at each bucket size
h = hist(dd, 200, plot=F)
# Take the logs
y = log10(h$counts)
x = log10(h$breaks[1:198])
# Fit a robust line
library(MASS)
r = rlm(y~x)
# Call:
# rlm(formula = y ~ x)
# Converged in 5 iterations
#
# Coefficients:
#(Intercept) x
# 3.987744 -1.196954
# Visually note that the data fits a power law nicely
plot(log10(h$breaks[1:198]),log10(h$counts), xlab="Log10 Tags In Window", ylab="Log10 Number of Times Occurring", main="Distribution of CAGE Tags in Sliding 35bp Window")
abline(r)
# Using the coefficient learned above predict the posterior probability of seeing this observation
cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.bg2 -reverse=all.reverse.plaw.bg2 -alpha=1.196954 -xmax=198
# Load up the bed graph tracks
hgLoadBed -bedGraph=4 hg18 FantomCageForwardPowerLawGraph all.forward.plaw.bg2
hgLoadBed -bedGraph=4 hg18 FantomCageReversePowerLawGraph all.reverse.plaw.bg2
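# Optional sanity check (not part of the original record): row counts for
# the two loaded bedGraph tables.
hgsql hg18 -e 'SELECT COUNT(*) FROM FantomCageForwardPowerLawGraph;'
hgsql hg18 -e 'SELECT COUNT(*) FROM FantomCageReversePowerLawGraph;'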
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# rnaBinding RNA Binding Proteins (2009-07-28 markd)
# contributor: Jeremy Sanford <sanford@biology.ucsc.edu>
# sfrs1Input BED table:
# need to drop color, as it's in the wrong column
# skip header
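# (tawk here is the local tab-delimited awk wrapper, roughly equivalent to
# awk -F'\t' -v OFS='\t'.)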
tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' Input_sequence_blocks.bed | hgLoadBed hg18 sfrs1Input stdin
# sfrs1Clip BED table:
# skip header
tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' SFRS1_CLIP_sequence_blocks.bed | hgLoadBed hg18 sfrs1Clip stdin
# SFRS1_consensus_sites.wig
tawk 'NR>1' SFRS1_consensus_sites.wig | wigEncode stdin sfrs1ConsensusSites.wig sfrs1ConsensusSites.wib
# Converted stdin, upper limit 11.63, lower limit -28.64
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 sfrs1ConsensusSites sfrs1ConsensusSites.wig
ln -s $(pwd -P)/sfrs1ConsensusSites.wib /gbdb/hg18/wib/
############################################################################
-# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-04, hartera)
+# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-08-04, hartera)
# Needs updating as the current version is build 33.
# Download the human VEGA Genes posted on ftp site on 2009-03-31
+# 2009-08-03 (hartera) - Added code to register track handler for
+# vegaGeneComposite.
+# 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons
+# on the configuration page for the track item labels. Modified code so it
+# can be shared with Ensembl to create the links to Vega transcript, gene
+# and protein reports on the details pages.
+# 2009-08-22 - Finished code for adding Vega report URLs to the details pages.
+# Loaded the vegaGtp table.
+
mkdir /hive/data/genomes/hg18/bed/vega35
cd /hive/data/genomes/hg18/bed/vega35
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \
"ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz"
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep "^chr" > nonHaps.gtf
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep -v "^chr" > haps.gtf
awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keptHaps.gtf
liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keptHaps.gtf
cat nonHaps.gtf lifted.gtf > all.gtf
# Do this to create the infoOut.txt file and extract the extra information
gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf stdout | gzip > tempAll.gp.gz
~/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > vegaGtp.tab
# Change the gene name to have the gene_id label so that this is in the
# name2 field of the extended genePred table. This can then be displayed
# at the track item label.
perl -pi.bak -e 's/gene_id/other_gene_id/' all.gtf
perl -pi.bak -e 's/gene_name/gene_id/' all.gtf
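# For example, an attribute list like this (IDs and symbol are hypothetical):
#   gene_id "OTTHUMG00000001094"; transcript_id "OTTHUMT00000003162"; gene_name "ACTB";
# becomes
#   other_gene_id "OTTHUMG00000001094"; transcript_id "OTTHUMT00000003162"; gene_id "ACTB";
# so gtfToGenePred -genePredExt puts the gene symbol into the genePred name2
# column, which the track then shows as the item label.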
gzip all.gtf
rm *.gtf tempAll.gp.gz
# create genePred files for loading into database
gtfToGenePred -genePredExt all.gtf.gz stdout | gzip > all.gp.gz
genePredCheck -db=hg18 all.gp.gz
# checked: 81244 failed: 0
zcat all.gtf.gz | grep -i pseudo > pseudo.gtf
zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=hg18 pseudo.gp
# checked: 8331 failed: 0
genePredCheck -db=hg18 not.pseudo.gp
# checked: 72913 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
# Added code to src/hg/hgTracks/simpleTracks.c to register a track handler
# for vegaGeneComposite, which is now used for this data. The handler uses
# vegaGeneMethods to display the name2 field (gene name) as the item label
# in the track.
+ # 2009-08-16 (hartera)
+ # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
+ # There is an index on the protein field so it can not be NULL.
+ # If there is no protein, the gene name is given.
+ # Added code to hgTracks.c and hgTrackUi.c to allow the use of
+ # radio buttons on the track configuration page to select the
+ # gene name, accession or both to be displayed in the track.
+ # The gene name is displayed by default.
+ # Added code to hgc.c so that Ensembl and Vega can share code to
+ # create links on the details pages to the Vega reports for transcript,
+ # gene and protein through these IDs. Created new function
+ # printEnsemblOrVegaCustomUrl().
+
+ # 2009-08-22 (hartera)
+ # Create a vegaGtp table using the vegaGtp.tab file above. Use ensGtp.sql
+ # to create the table. vegaGtp associates geneId/transcriptId/proteinId
+ # for the links to Vega reports from the details page. If there is no
+ # protein ID because the transcript is noncoding, the gene name is used
+ # instead. This field can not be NULL in the table as there is an index
+ # on it.
+ cd /hive/data/genomes/hg18/bed/vega35
+ cp ~/kent/src/hg/lib/ensGtp.sql .
+ # One of the gene names (used in place of a protein ID for a noncoding
+ # gene) is too long for the protein field, so change the protein field in
+ # ensGtp.sql to allow 40 characters instead of 20 and re-load the table.
+ hgsql -e 'drop table vegaGtp;' hg18
+ hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab
+ # Loaded successfully
+ # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in
+ # doVegaGene() to add the links to Vega reports on the details pages.
+
############################################################################