src/hg/makeDb/doc/hg17.txt 1.120
1.120 2009/03/11 18:31:05 angie
Updated DGV (v7).
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 35 (May 2004 freeze)
# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# Make gs.18 directory, gs.18/build35 directory, and gs.18/ffa directory.
ssh eieio
mkdir /cluster/store5/gs.18
mkdir /cluster/store5/gs.18/build35
mkdir /cluster/store5/gs.18/agp
mkdir /cluster/store5/gs.18/ffa
# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later ?)
cd /cluster/store1
ln -s /cluster/store5/gs.18 ./gs.18
ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17
# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
ln -s /cluster/store5/gs.18/build35 ~/oo
# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login.  Make sure your $HOME/.netrc
# file is mode 600 ('chmod 600 ~/.netrc') so that no one else can
# read the credentials.  (There were some early files that later
# moved into an OLD subdirectory.  They were broken.)
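# For reference, a $HOME/.netrc entry for this fetch looks like the
# following (standard .netrc syntax; the login/password shown are
# placeholders, not the real credentials):
#   machine ftp.ncbi.nlm.nih.gov
#   login anonymous
#   password your_email@example.com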
mkdir /cluster/store5/gs.18/ncbi
cd /cluster/store5/gs.18/ncbi
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/build_35/*
# FYI: agp file format documented at:
# http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html
# Fixup a couple of names for our own purposes here
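# For orientation, each AGP line has nine tab-separated columns:
# object, object_beg, object_end, part_number, component_type,
# component_id, component_beg, component_end, orientation.
# A made-up component line for illustration (not from this assembly):
#   chr1  1  36731  1  F  AC000001.1  1  36731  +
# component_type N marks a gap line rather than a clone.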
cd /cluster/store5/gs.18/agp
ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz .
sed -e "s#MT/NC_001807.4#NC_001807#" ../ncbi/chrMT.agp > chrM.agp
sed -e "s/NG_002392.2/NG_002392/" ../ncbi/DR52.agp > chr6_hla_hap1.agp
sed -e "s/NG_002433.1/NG_002433/" ../ncbi/DR53.agp > chr6_hla_hap2.agp
zcat ../ncbi/DR52.fa.gz | \
sed -e "s/gi|29124352|ref|NG_002392.2/ref|NG_002392/" | \
gzip > chr6_hla_hap1.fa.gz
zcat ../ncbi/DR53.fa.gz | \
sed -e "s/gi|28212470|ref|NG_002433.1/ref|NG_002433/" | \
gzip > chr6_hla_hap2.fa.gz
zcat ../ncbi/chrMT.fa.gz | \
sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \
gzip > chrM.fa.gz
# Put all the agp files together into one.
cd /cluster/store5/gs.18/build35
# The chrM sequence now has its own agp, remove it from
# ref_placed.agp
sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp
cat ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
../agp/chr6_hla_hap1.agp ../agp/chr6_hla_hap2.agp \
../ncbi/PAR.agp > ncbi_build35.agp
# and into ffa
cd /cluster/store5/gs.18/ffa
# There is a single bogus line at the end of ref_placed.fa.gz
# declaring the NC_001807 MT sequence; it was later replaced by
# chrMT.fa.gz, so remove that one line:
zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \
gzip > ref_placed.fa.gz
# (That's a 40 minute job)
# sequence.inf is usually here, symlink it
ln -s ../ncbi/sequence.inf
# put all the fa.gz files together in one big fa.gz
time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \
../agp/chr6_hla_hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \
> ncbi_build35.fa.gz
# real 37m42.208s
# user 37m3.490s
# sys 0m31.430s
# Make a listing of all the fasta record headers, just FYI:
cd /cluster/store5/gs.18/ffa
zcat ncbi_build35.fa.gz | grep "^>" > ncbi.fa.headers
# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence.  The NC_ prefix is new to this process
# and will have to be accounted for below.  The other two special
# prefixes are similar to what was seen before:
# from DR52.agp NG_002392
# Homo sapiens major histocompatibility complex, class II,
# DR52 haplotype (DR52) on chromosome 6
# and from DR53.agp NG_002433
# Homo sapiens major histocompatibility complex, class II,
# DR53 haplotype (DR53) on chromosome 6
# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera
# genome assembly. Filter those out. I don't know what the
# NT_07959[0-7] items are, but there are no definitions for them
# in the agp files and no sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
cd /cluster/store5/gs.18/build35
egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md | \
sed -e "s/6|NG_002392/6_hla_hap1/" \
-e "s/6|NG_002433/6_hla_hap2/" \
-e "s/^9606\tMT|NC_001807/9606\tM/" \
> temp_contig.md
# Get the randoms sorted in proper order.  createNcbiLifts
# does not work correctly if the randoms are not grouped together
# by chromosome.
grep -v "|" temp_contig.md > seq_contig.md
# This pulls out all the randoms and groups them within the
# same chrom, leaving them in the same order as they originally
# were (warning: this is BASH code ...)
grep "|" temp_contig.md | awk -F"|" '{print $1}' | \
awk '{print $2}' | sort -n -u | while read CHR
do
grep "[^0-9]${CHR}|" temp_contig.md
done >> seq_contig.md
# Sanity check, checkYbr was updated to handle the NC_ identifier
time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/checkYbr \
ncbi_build35.agp stdin seq_contig.md > check.seq_contig
# real 2m34.143s
# user 2m24.970s
# sys 0m8.900s
# result should be clean:
cat check.seq_contig
# Read 380 contigs from ncbi_build35.agp
# Verifying sequence sizes in stdin
# 0 problems detected
# Convert fa files into UCSC style fa files and place in "contigs"
# directory inside the gs.18/build35 directory
# (a check that can be done here is make a list of the contigs
# in this ./contigs directory before and compare it with the
# list of distributed contigs created after they have been
# disbursed.)
# faNcbiToUcsc was fixed to handle the NC_ identifier
cd /cluster/store5/gs.18/build35
# We've been through this often
mv contigs contigs.0
time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/faNcbiToUcsc \
-split -ntLast stdin contigs
# real 5m10.938s
# user 2m20.070s
# sys 0m51.020s
# If you want to compare anything to previous work, check now, then:
rm -fr contigs.0
# Determine the chromosome sizes from agps
# Watch carefully how chrY gets constructed. I'm not sure
# this chrom_sizes represents the whole length of chrY with
# the PAR added. We will see about that.
# Script updated to handle new chrom names:
# my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2');
cd /cluster/store5/gs.18/build35
/cluster/bin/scripts/getChromSizes ../agp
# Create chrom.lst list for use in foreach() loops
awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst
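# chrom.lst now holds one bare name per line (no "chr" prefix),
# e.g. 1 .. 22, X, Y, M, 6_hla_hap1, 6_hla_hap2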
# Create lift files (this will create chromosome directory structure) and
# inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .
# Create contig agp files (will create contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build35.agp .
# Create chromosome random agp files.
/cluster/bin/scripts/createNcbiChrAgp -randomonly .
# Copy the original chrN.agp files from the gs.18/agp directory
# into each of the chromosome directories since they contain better
# gap information. Delete the comments at top from these.
cd /cluster/store5/gs.18/build35
foreach c ( `cat chrom.lst` )
sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp
end
# chrM needs a name fixup
sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).
# Create inserts file from agp and lift files (new - added by Terry, 2004-07-12)
/cluster/bin/scripts/createInserts /cluster/data/hg17 > /cluster/data/hg17/inserts
# create global data link for everyone. No more home directory
# links required.
ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17
cd /cluster/data/hg17
/cluster/bin/scripts/distNcbiCtgFa contigs .
# Verify that everything was moved properly, the contigs directory
# should be empty:
ls contigs
# Nothing there, then remove it
rmdir contigs
# Make a list of the contigs for use later
rm -f contig.lst
touch contig.lst
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "${chrom}/${contig}/${contig}.fa" >> contig.lst
end
end
# For later comparisons, this is how many contigs we have:
wc -l contig.lst
# 380
# Note 2004-06-30 - there are some clone numbers left in some of
# the NCBI files that are incorrect. Due to version number
# changes, more than one version is listed. Namely for accession
# numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654
# The AGP files are correct, the sequence.inf file lists these
# twice: AC004491.1 AC004491.2
# AC004921.1 AC004921.2 AC004983.2 AC004983.3
# AC005088.2 AC005088.3 AC006014.2 AC006014.3
# AC099654.4 AC099654.5
# FILES ARE NOW READY FOR REPEAT MASKING - start that process as
# other steps here can proceed in parallel.
# Previous practice used to copy everything over for jkStuff from a
# previous build. Rather than do that, pick up whatever is needed
# at the time it is needed and verify that it is going to do what
# you expect.
cd /cluster/data/hg17
mkdir jkStuff
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file
# This is properly done below with a combination of psLayout
# alignments to create the contig_overlaps.agp file
# /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files
# jkStuff/liftGl.csh contig.gl
# CREATING DATABASE (DONE - 2004-05-20 - Hiram)
# RE-DONE for new NIBS - 2004-06-03
ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 303G 1.4T 19% /var/lib/mysql
# Create the database.
hgsql -e 'create database hg17' mysql
# Copy over grp table (for track grouping) from another database:
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" hg17
# ENCODE groups
# Added 2005-08-16 kate
echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg17
echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg17
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS
# (DONE - 2004-05-21 - Hiram)
# RE-DONE with new NIBS - 2004-06-03
# Make nib/, unmasked until RepeatMasker and TRF steps are done.
# Do this now so that the chromInfo table will exist and thus the
# trackDb tables can be built in the next step.
# These unmasked nibs will be replaced by the masked nibs after
# repeat mask and trf are done.
ssh eieio
cd /cluster/data/hg17
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
time ./jkStuff/chrFa.csh
# real 13m24.710s
# user 9m0.360s
# sys 1m15.820s
mkdir nib
foreach c (`cat chrom.lst`)
foreach f ($c/chr${c}{,_random}.fa)
if (-e $f) then
echo "nibbing $f"
/cluster/bin/i386/faToNib $f nib/$f:t:r.nib
endif
end
end
# Make symbolic links from /gbdb/hg17/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg17/nib
ln -s /cluster/data/hg17/nib/chr*.nib /gbdb/hg17/nib
# Load /gbdb/hg17/nib paths into database and save size info.
cd /cluster/data/hg17
hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
> chrom.sizes
# You can compare this chrom.sizes with the previously created
# chrom_sizes. Should be no difference
sort chrom_sizes > s0
sort chrom.sizes | grep -v random > s1
diff s0 s1
rm s0 s1
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2004-05-21 - Hiram)
# dbDb orderKey updated 2004-06-08 - Hiram
ssh hgwdev
# reset dbDb orderKey - these have never been ordered properly
# before; this will get them into a sensible order.
hgsql -e 'update dbDb set orderKey=11 where name = "hg16";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=12 where name = "hg15";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=13 where name = "hg13";' \
-h genome-testdb hgcentraltest
# Enter hg17 into hgcentraltest.dbDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg17", "May 2004", "/gbdb/hg17/nib", "Human", \
"chr4:56214201-56291736", 1, 10, "Human", "Homo sapiens", \
"/gbdb/hg17/html/description.html", 0, 0, "NCBI Build 35");' \
-h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit the makefile to add hg17 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2004-05-21 - Hiram)
# Re-DONE with new randoms - 2004-06-03 - Hiram)
cd /cluster/data/hg17
mkdir -p jkStuff
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
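# (Each liftAll.lft line has five columns: offset of the contig in
# the chrom, contig name, contig size, chrom name, chrom size; this
# is what liftUp uses to shift contig coords to chrom coords.)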
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
# mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# XXXX - appears to be unused, not done - Hiram
# REPEAT MASKING (DONE - 2004-05-24 - Hiram)
# The randoms were rearranged after this was first done,
# they are re-made below 2004-06-02)
# Record the RM version here:
# RepBase Update 8.12, RM database version 20040130
# as this changes over time and there is no record in the results
# Split contigs, run RepeatMasker, lift results
# This split takes about 8 minutes
ssh eieio
cd /cluster/data/hg17
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "splitting ${chrom}/${contig}/${contig}.fa"
faSplit size ${chrom}/${contig}/$contig.fa 500000 \
${chrom}/${contig}/${contig}_ \
-lift=${chrom}/${contig}/$contig.lft -maxN=500000
end
end
#- Make the run directory and job list:
cd /cluster/data/hg17
mkdir -p jkStuff
# According to RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human)
# This run script saves the .tbl file to be sent to Arian.  He uses
# those for his analysis.  Sometimes he needs the .cat and .align
# files for checking problems.  Krish needs the .align files; they
# are large.
cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg17/$2
/bin/cp $2 /tmp/hg17/$2/
cd /tmp/hg17/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg17/$2/$2.out ./
if (-e /tmp/hg17/$2/$2.align) /bin/cp /tmp/hg17/$2/$2.align ./
if (-e /tmp/hg17/$2/$2.tbl) /bin/cp /tmp/hg17/$2/$2.tbl ./
# if (-e /tmp/hg17/$2/$2.cat) /bin/cp /tmp/hg17/$2/$2.cat ./
/bin/rm -fr /tmp/hg17/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMHuman
ssh eieio
cd /cluster/data/hg17
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach d ( `cat chrom.lst` )
foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa )
set f = $c:t
set cc = $c:h
set contig = $cc:t
echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
/cluster/store5/gs.18/build35/${d}/${contig} $f \
'{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
>> RMRun/RMJobs
end
end
# We have 5970 jobs in RMJobs:
wc RMRun/RMJobs
# 5970 41790 1105804 RMRun/RMJobs
#- Do the run
ssh kk
cd /cluster/data/hg17/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#- While that is running, you can run TRF (simpleRepeat) on the small
# cluster. See SIMPLE REPEAT section below
# Completed: 5970 of 5970 jobs
# CPU time in finished jobs: 45189516s 753158.60m 12552.64h 523.03d 1.433 y
# IO & Wait Time: 141333s 2355.55m 39.26h 1.64d 0.004 y
# Average job time: 7593s 126.55m 2.11h 0.09d
# Longest job: 10268s 171.13m 2.85h 0.12d
# Submission to last job: 81484s 1358.07m 22.63h 0.94d
# Lift up the split-contig .out's to contig-level .out's
#
# If a mistake is made in the following it would be possible to
# destroy all the RM output. So, just to be paranoid, save all
# the RM output in bluearc for the time being:
ssh eieio
cd /cluster/data/hg17
mkdir /cluster/bluearc/hg17/RMOutput
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
set T = /cluster/bluearc/hg17/RMOutput/${d}
mkdir -p ${T}
cd ${d}
set contig = $d:t
cp -p ${contig}_?{,?,??}.fa.out ${T}
cd ../..
echo "${d} done"
end
end
# Make sure we got them all:
# (this count doesn't work later, since there are more *.fa.out
# files after the lifting; to find just these at that point, use:
# find . -name "N?_*_*.fa.out" -print | wc -l )
find . -name "*.fa.out" -print | wc -l
# 5970
find /cluster/bluearc/hg17/RMOutput -type f | wc -l
# 5970
# same count
# OK, now you can try this operation, do it in a script like this
# and save the output of the script for a record of what happened.
cat << '_EOF_' > jkStuff/liftRM.csh
#!/bin/csh -fe
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
cd ../..
end
end
'_EOF_'
chmod +x jkStuff/liftRM.csh
mkdir scriptsOutput
time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1
# real 4m37.572s
# user 1m19.130s
# sys 0m32.950s
# Check that they all were done:
grep "fa.out" scriptsOutput/liftRM.1 | wc -l
# 5959
# (note: 5959 here vs. the 5970 .fa.out files counted above)
#- Lift up RepeatMasker .out files to chromosome coordinates.
# Picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
# liftOut2.csh, and changed the line that does the chrom listing.
time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1
# real 9m46.780s
# user 1m18.900s
# sys 7m33.990s
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg17
time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
scriptsOutput/hgLoadOut 2>&1
# real 5m59.137s
# user 1m47.550s
# sys 0m15.410s
# errors during this load: (there are always a couple of these)
# Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
# Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
# Strange perc. field -18.6 line 77034 of 19/chr19.fa.out
# Verify we have similar results to previous assembly:
# featureBits hg17 rmsk
# 1391378842 bases of 2867328468 (48.525%) in intersection
# featureBits hg16 rmsk
# 1388770568 bases of 2865248791 (48.469%) in intersection
# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# following the SIMPLE REPEAT sections below
# Re-Running REPEAT_MASKER on the new Randoms (DONE - 2004-06-02 - Hiram)
ssh eieio
cd /cluster/data/hg17
grep "|" seq_contig.md | awk '{print $2}' | sed -e "s#|#/#" > randoms.lst
mkdir /cluster/data/hg17/RMRandoms
foreach r ( `cat randoms.lst` )
set d = $r:h
set contig = $r:t
foreach c ( ${r}/N{C,G,T}_*_*.fa )
set f = $c:t
echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
/cluster/store5/gs.18/build35/${d}/${contig} $f \
'{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
>> RMRandoms/RMJobs
end
end
ssh kk
cd /cluster/data/hg17/RMRandoms
para create RMJobs
para try, para check, para check, para push, para check,...
# Completed: 94 of 94 jobs
# CPU time in finished jobs: 221454s 3690.91m 61.52h 2.56d 0.007 y
# IO & Wait Time: 866s 14.43m 0.24h 0.01d 0.000 y
# Average job time: 2365s 39.42m 0.66h 0.03d
# Longest job: 9062s 151.03m 2.52h 0.10d
# Submission to last job: 9106s 151.77m 2.53h 0.11d
# Continuing with the paranoia theme, let's backup all the RM output
#
ssh eieio
cd /cluster/data/hg17
mkdir /cluster/bluearc/hg17/RMRandoms
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
set T = /cluster/bluearc/hg17/RMRandoms/${d}
mkdir -p ${T}
cd ${d}
set contig = $d:t
cp -p ${contig}_?{,?,??}.fa.out ${T}
cd ../..
echo "${d} done"
end
end
# Make sure we got them all:
find . -name "N?_*_*.fa.out" -print | wc -l
# 5959
find /cluster/bluearc/hg17/RMRandoms -type f | wc -l
# 5959
# same count
time jkStuff/liftRM.csh > scriptsOutput/liftRM2.1 2>&1
# real 4m46.302s
# user 1m18.260s
# sys 0m18.000s
# Check that they all were done:
grep "fa.out" scriptsOutput/liftRM2.1 | wc -l
# 5959
# same count as above
#- Lift up RepeatMasker .out files to chromosome coordinates.
# Picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
# liftOut2.csh, and changed the line that does the chrom listing.
time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2.1 2>&1
# real 2m46.347s
# user 1m18.650s
# sys 0m15.990s
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg17
time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
scriptsOutput/hgLoadOut 2>&1
# real 5m59.137s
# user 1m47.550s
# sys 0m15.410s
# errors during this load: (there are always a couple of these)
# Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
# Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
# Strange perc. field -18.6 line 77034 of 19/chr19.fa.out
# Verify we have similar results to previous assembly:
# featureBits hg17 rmsk
# 1390952984 bases of 2866216770 (48.529%) in intersection
# featureBits hg17 rmsk #with previous randoms:
# 1391378842 bases of 2867328468 (48.525%) in intersection
# featureBits hg16 rmsk
# 1388770568 bases of 2865248791 (48.469%) in intersection
# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# following the SIMPLE REPEAT sections below
# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-05-21 - Hiram)
# Re-done with new randoms, 2004-06-02 - Hiram
# Copy the contigs, first to the bluearc, then to /iscratch/i
ssh eieio
mkdir /cluster/bluearc/hg17
mkdir /cluster/bluearc/hg17/contigs
cd /cluster/data/hg17
foreach ctg ( `cat contig.lst` )
set c = $ctg:t
echo "$ctg > /cluster/bluearc/hg17/contigs/$c"
cp -p $ctg /cluster/bluearc/hg17/contigs/$c
end
# Check how much is there:
# du -hsc /cluster/bluearc/hg17/contigs
# 2.8G /cluster/bluearc/hg17/contigs
# Distribute contigs to /iscratch/i
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/unmaskedContigs
cd /iscratch/i/gs.18/build35/unmaskedContigs
cp -p /cluster/bluearc/hg17/contigs/* .
# Verify same amount made it there:
# du -hsc /iscratch/i/gs.18/build35/unmaskedContigs
# 2.8G /iscratch/i/gs.18/build35/unmaskedContigs
# Then send them to the other 7 Iservers
/cluster/bin/iSync
# Go to the small cluster for this business:
ssh kki
mkdir -p /cluster/data/hg17/bed/simpleRepeat
cd /cluster/data/hg17/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /iscratch/i/gs.18/build35/unmaskedContigs/*.fa > genome.lst
gensub2 genome.lst single gsub jobList
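# (gensub2 crosses genome.lst against the one-line "single"
# placeholder file and expands the gsub template once per contig;
# $(path1) is the full path and $(root1) is the file name minus
# directory and extension.)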
para create jobList
para try
para check
para push
para check
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 13230s 220.49m 3.67h 0.15d 0.000 y
# IO & Wait Time: 2078s 34.64m 0.58h 0.02d 0.000 y
# Average job time: 40s 0.67m 0.01h 0.00d
# Longest job: 1590s 26.50m 0.44h 0.02d
# Submission to last job: 2504s 41.73m 0.70h 0.03d
liftUp simpleRepeat.bed /cluster/data/hg17/jkStuff/liftAll.lft \
warn trf/*.bed > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/hg17/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed hg17 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 629076 elements of size 16
# Compare with previous assembly
featureBits hg17 simpleRepeat
# 54952425 bases of 2866216770 (1.917%) in intersection
# with previous randoms
featureBits hg17 simpleRepeat
# 54964044 bases of 3096628158 (1.775%) in intersection
featureBits hg16 simpleRepeat
# 54320136 bases of 2865248791 (1.896%) in intersection
# GAPS weren't in hg17 yet at this point; after gaps were added:
# featureBits hg17 simpleRepeat
# 54964044 bases of 2867328468 (1.917%) in intersection
# featureBits -countGaps hg17 simpleRepeat
# 54964044 bases of 3096628158 (1.775%) in intersection
###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir microsat
cd microsat
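# Filter simpleRepeat.bed down to perfect microsatellites.  Field
# guide for the awk below (simpleRepeat.bed, 16 columns, no bin):
# $5=period, $6=copyNum, $8=perMatch, $9=perIndel, $16=repeat motif.
# Keep di- and tri-nucleotide repeats with at least 15 copies, 100%
# match and no indels, and name them like "15xCA".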
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed hg17 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-05-21 - Hiram)
# re-done with new randoms - 2004-06-03 - Hiram
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh eieio
cd /cluster/data/hg17/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# EXPERIMENT, at a filter of <= 12, we have coverage:
# 20904399 bases of 2867328468 (0.729%) in intersection
# at a filter of <= 9, we have coverage:
# 19271270 bases of 2867328468 (0.672%) in intersection
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/hg17
mkdir bed/simpleRepeat/trfMaskChrom
foreach c ( `cat chrom.lst` )
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2004-05-25)
# -Hiram
# re-done with new randoms - 2004-06-03 - Hiram
# This used to be done right after RepeatMasking. Now, we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above,
# and after Repeat Masker is complete.
ssh eieio
cd /cluster/data/hg17
# copied these scripts from hg16 - reset the lines that make
# the chrom list to work on, reset the wild cards that find all the
# contig .fa's
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 &
# real 13m18.512s
# user 9m1.670s
# sys 1m7.290s
#- Soft-mask (lower-case) the contig and chr .fa's
time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1
# real 29m31.623s
# user 13m49.700s
# sys 5m58.750s
#- Make hard-masked .fa.masked files as well:
time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1
#- Create the bothMasksNib/ directory
time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1
# real 14m41.694s
# user 6m28.000s
# sys 1m42.500s
# Make symbolic links from /gbdb/hg17/nib to the real nibs.
ssh hgwdev
cd /cluster/data/hg17
mv nib nib.raw
mv bothMasksNib nib
rm /gbdb/hg17/nib/*.nib
ln -s `pwd`/nib/* /gbdb/hg17/nib
# Load /gbdb/hg17/nib paths into database and save size info.
hgsql hg17 < ~/kent/src/hg/lib/chromInfo.sql
cd /cluster/data/hg17
hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
# 3096628158 total bases
# Should be the same size as before
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
> chrom.sizes.masked
diff chrom.sizes chrom.sizes.masked
# should be no output at all, thus:
rm chrom.sizes.masked
# Copy the masked contig fa to /scratch and /iscratch
# And everything else we will need for blastz runs, etc ...
# Best to do this sequence first to /cluster/bluearc/scratch,
# which is going to be the source for the /scratch copy.
# And then from there to the /iscratch
# Make sure you are on the fileserver for the original source:
ssh eieio
mkdir -p /cluster/bluearc/scratch/hg/gs.18/build35
cd /cluster/bluearc/scratch/hg/gs.18/build35
# these copies take less than 2 minutes each
mkdir bothMaskedNibs
cp -p /cluster/data/hg17/nib/*.nib ./bothMaskedNibs
mkdir maskedContigs
foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
cp -p /cluster/data/hg17/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \
./maskedContigs
echo "done ${chrom}"
end
# make sure you have them all:
ls maskedContigs | wc -l
# 380
wc -l /cluster/data/hg17/contig.lst
# 380
mkdir rmsk
foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
cp -p /cluster/data/hg17/${chrom}/*.out ./rmsk
echo "done ${chrom}"
end
# Now, go to the destination for /iscratch and copy from the
# bluearc
ssh kkr1u00
mkdir -p /iscratch/i/gs.18/build35
cd /iscratch/i/gs.18/build35
# This takes about 5 minutes
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
time /cluster/bin/iSync
# real 7m27.649s
# request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch
# LOAD ctgPos table - Contig position track (DONE - 2004-06-08 - Hiram)
# After fixing up hgCtgPos to accept the -chromLst argument, simply:
cd /cluster/data/hg17
hgCtgPos -chromLst=chrom.lst hg17 .
# GOLD AND GAP TRACKS (DONE - 2004-05-21 - Hiram)
# RE-DONE with new randoms - 2004-06-03 - Hiram
ssh hgwdev
cd /cluster/data/hg17
hgGoldGapGl -noGl -chromLst=chrom.lst hg17 /cluster/data/hg17 .
# Disappointing to see this create so many tables ...
# _gap and _gold for each chrom
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file
cd /cluster/store5/gs.18/build35
# combine the various psLayout attempts on different sections of
# clones
./combineContigOverlaps.sh
# Turn contig_overlaps.agp into gl files
~hiram/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files (had to fix liftUp to do the NC_ properly)
jkStuff/liftGl.csh contig.gl
#
# Need to remove these PAR clone names from chrY.gl
rm -f /tmp/chrY.gl
sed -e "/^AL954722.18/d; /^BX537334.4/d; /^BX000483.7/d; \
/^BX908402.3/d; /^BX649635.3/d; /^BX119919.5/d; \
/^AC079176.15/d; /^AC097314.27/d; /^AC006209.25/d; \
/^AJ271735.1/d; /^AJ271736.1/d" Y/chrY.gl > /tmp/chrY.gl
rm -f Y/chrY.gl
mv /tmp/chrY.gl Y/chrY.gl
# After contig.gl files have been made from contig_overlaps.agp
# The sed fixes the Celera clones that are marked phase W
# Call that phase 3 instead,
# Delete the Celera AACC clones, they are not in this assembly,
# And fix the line of AC018743 to add it to the assembly, it was a
# last minute addition by Terry that didn't get carried into the
# NCBI sequence.inf file. And remove the older versions of five
# clones that got left in by mistake at NCBI
# AC004491.1=AC004491.2 AC004921.1=AC004921.2 AC004983.2=AC004983.3
# AC005088.2=AC005088.3 AC006014.2=AC006014.3 AC099654.4=AC099654.5
# And finally the grep selects only those things for_assembly
cd /cluster/data/hg17
egrep "for_assembly|AC018743" /cluster/store5/gs.18/ncbi/sequence.inf | \
sed -e "s/\tW\t/\t3\t/; /^AACC010000.*/d; /^AC004491.1.*/d; \
/^AC004921.1.*/d; /^AC004983.2.*/d; /^AC005088.2.*/d; \
/^AC006014.2.*/d; /^AC099654.4.*/d; \
s/AC018743.27\t31791062\t466818\t1\tD\tUn\t-\tBCM\tRP11-289M22\tSIZE:2big/AC018743.27\t31791062\t466818\t1\t-\t(12)\t-\tBCM\tRP11-289M22\tfor_assembly/" \
> sequence.inf
cd /cluster/data/hg17
hgGoldGapGl -chromLst=chrom.lst hg17 /cluster/store5/gs.18 build35
$HOME/bin/i386/hgClonePos -chromLst=chrom.lst hg17 \
/cluster/data/hg17 ./sequence.inf /cluster/store5/gs.18 -maxErr=3 \
-maxWarn=2000 2> clone.pos.errors
# Extract all the PAR clones for chrX from clonePos, change the X
# to Y, fixup the coordinates on the last three, and load this
# data in on the clonePos table in addition to what is there
# already.
cat << '_EOF_' > chrY.par.clonePos
BX640545.2 34821 3 chrY 0 34250 F
AL954722.18 37771 3 chrY 84821 122592 F
BX537334.4 36872 3 chrY 120592 157464 F
BX000483.7 15918 3 chrY 155466 171384 F
AL954664.17 39200 3 chrY 251384 290307 F
BX000476.5 33340 3 chrY 282188 315528 F
AL732314.18 218723 3 chrY 313528 532251 F
BX004827.18 119555 3 chrY 479050 600112 F
AL683871.15 175765 3 chrY 598112 773877 F
AL672311.26 115998 3 chrY 771877 887875 F
AL672277.20 131682 3 chrY 885875 1017557 F
BX908402.3 36556 3 chrY 1067557 1104113 F
BX649635.3 43709 3 chrY 1154113 1197822 F
BX649553.5 90286 3 chrY 1347822 1438108 F
BX296563.3 21008 3 chrY 1488108 1509117 F
BX119906.16 35666 3 chrY 1507116 1542782 F
AL683870.15 162377 3 chrY 1541782 1704175 F
AL691415.17 45085 3 chrY 1702175 1747265 F
AL683807.22 189825 3 chrY 1745260 1935086 F
AL672040.10 117297 3 chrY 1933086 2050383 F
BX004859.8 63432 3 chrY 2048380 2111815 F
BX119919.5 55442 3 chrY 2261815 2317257 F
AC079176.15 186278 3 chrY 2311674 2497952 F
AC097314.27 80501 3 chrY 2495948 2576449 F
AC006209.25 141759 3 chrY 2551122 2692881 F
AJ271735.1 240000 3 chrY 57302979 57543030 F
AJ271736.1 158661 3 chrY 57543030 57701691 F
'_EOF_'
hgsql -e 'load data local infile "chrY.par.clonePos" into table clonePos;' hg17
# We have the following errors
# Processing /cluster/data/hg17/Y/chrY.gl
# Clone BX640545 is on chromosomes chrX and chrY. Ignoring chrY
# Clone AL954722 is on chromosomes chrX and chrY. Ignoring chrY
# ... etc for all the PAR clones
# ... And there are an unknown number of these:
# AB000359 is in ./sequence.inf but not in ooDir/*/*.gl
# AB000360 is in ./sequence.inf but not in ooDir/*/*.gl
# gc5Base wiggle TRACK (DONE - 2004-05-22 - Hiram)
# This previously was a script that ran through each nib
# Recently transformed into a mini cluster run.
# Re-DONE with the new randoms - 2004-06-04
ssh kki
mkdir /cluster/data/hg17/bed/gc5Base
cd /cluster/data/hg17/bed/gc5Base
mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRun.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
/iscratch/i/gs.18/build35/bothMaskedNibs | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
-wibFile=wigData5/gc5Base_${chrom} \
-name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRun.sh
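# (hgGcPercent reports GC as parts per thousand in column 5; the awk
# in kkRun.sh divides by 10 to get a percentage and shifts $2 to a
# 1-based start position for wigAsciiToBinary.)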
ls /iscratch/i/gs.18/build35/bothMaskedNibs > nibList
cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsub jobList
para create jobList
para try, check, ... etc
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 5251s 87.51m 1.46h 0.06d 0.000 y
# IO & Wait Time: 130s 2.17m 0.04h 0.00d 0.000 y
# Average job time: 117s 1.95m 0.03h 0.00d
# Longest job: 413s 6.88m 0.11h 0.00d
# Submission to last job: 475s 7.92m 0.13h 0.01d
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/hg17/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base hg17 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/hg17/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/hg17/wib/gc5Base
# And then the zoomed data view
ssh kki
cd /cluster/data/hg17/bed/gc5Base
mkdir -p wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
/iscratch/i/gs.18/build35/bothMaskedNibs | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
-name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRunZoom.sh
cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsubZoom jobListZoom
para create jobListZoom
para try ... check ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 5216s 86.93m 1.45h 0.06d 0.000 y
# IO & Wait Time: 34s 0.57m 0.01h 0.00d 0.000 y
# Average job time: 114s 1.90m 0.03h 0.00d
# Longest job: 415s 6.92m 0.12h 0.00d
# Submission to last job: 469s 7.82m 0.13h 0.01d
# Then load these .wig files into the same database as above
ssh hgwdev
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base \
-oldTable hg17 gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/hg17/wib/gc5Base
ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg17/wib/gc5Base
# AUTO UPDATE GENBANK MRNA RUN (DONE - 2004-06-08 - Hiram)
ssh eieio
cd /cluster/data/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# hg17
hg17.genome = /scratch/hg/gs.18/build35/bothMaskedNibs/chr*.nib
hg17.lift = /cluster/store5/gs.18/build35/jkStuff/liftAll.lft
hg17.genbank.est.xeno.load = yes
hg17.mgcTables.default = full
hg17.mgcTables.mgc = all
hg17.downloadDir = hg17
# Do the refseqs first; they are the quick ones
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial hg17
# logFile: var/build/logs/2004.05.25-13:41:07.hg17.initalign.log
# checking that log, or watching the batch on kk, you can find
# where the batch is running; after it is done, get the time:
cd /cluster/store6/genbank/work/initial.hg17/align
para time > time
cat time
# Completed: 9500 of 9500 jobs
# CPU time in finished jobs: 62241s 1037.35m 17.29h 0.72d 0.002 y
# IO & Wait Time: 33719s 561.98m 9.37h 0.39d 0.001 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest job: 1062s 17.70m 0.29h 0.01d
# Submission to last job: 1063s 17.72m 0.30h 0.01d
# Load the results from the above
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
# To get the genbank run started, the above results need to be
# moved out of the way.  These can be removed if there are no
# problems to debug.
ssh eieio
cd /cluster/data/genbank/work
mv initial.hg17 initial.hg17.refseq.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial hg17
# logFile: var/build/logs/2004.06.04-10:47:21.hg17.initalign.log
# One job was hung up, after killing it on its node, the batch
# finished in a few minutes.
# Completed: 35720 of 35720 jobs
# CPU time in finished jobs: 5161424s 86023.74m 1433.73h 59.74d 0.164 y
# IO & Wait Time: 144149s 2402.48m 40.04h 1.67d 0.005 y
# Average job time: 149s 2.48m 0.04h 0.00d
# Longest job: 18306s 305.10m 5.08h 0.21d
# Submission to last job: 35061s 584.35m 9.74h 0.41d
ssh hgwdev
cd /cluster/data/genbank
# some kind of error happened here, had to remove a lock file to
# get this to proceed (this same thing happened again the second
# time around)
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
ssh eieio
cd /cluster/data/genbank/work
mv initial.hg17 initial.hg17.genbank.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg17
# Completed: 189240 of 189240 jobs
# CPU time in finished jobs: 97172120s 1619535.33m 26992.26h 1124.68d 3.081 y
# IO & Wait Time: 1507789s 25129.82m 418.83h 17.45d 0.048 y
# Average job time: 521s 8.69m 0.14h 0.01d
# Longest job: 33165s 552.75m 9.21h 0.38d
# Submission to last job: 126988s 2116.47m 35.27h 1.47d
ssh hgwdev
cd /cluster/data/genbank
time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
# real 440m42.750s
# user 69m7.810s
# sys 23m18.640s
# This is ~7.5 hours
# If the above is all OK, ask Mark to put this assembly on
# the daily updates.
# CPGISLANDS (DONE - 2004-05-25 - Hiram)
# Re-DONE with new randoms - 2004-06-04 - Hiram
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/cpgIsland
cd /cluster/data/hg17/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
mv cpglh.exe /cluster/data/hg17/bed/cpgIsland/
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh eieio
cd /cluster/data/hg17/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# the warnings:
# Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random
# Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# real 21m47.823s
# user 18m30.810s
# sys 1m13.420s
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/hg17/bed/cpgIsland
hgLoadBed hg17 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 27801 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg17
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2004-05-25 - Heather)
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans) \
VALUES("hg17", "blat12", "17778", "1"); \
INSERT INTO blatServers (db, host, port, isTrans) \
VALUES("hg17", "blat12", "17779", "0");' \
-h genome-testdb hgcentraltest
# PREPARE CLUSTER FOR BLASTZ RUNS (DONE - 2004-05-26 - Hiram)
# Re-DONE with new randoms - 2004-06-03 - Hiram
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
ln -s ../rmsk/*.out .
# This takes 40 minutes when run as a script; to hurry it up it has
# been converted to a mini cluster run
cat << '_EOF_' > runArian.sh
#!/bin/sh
for FN in *.out
do
echo /cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
${FN} -query human -comp rat -comp mouse
done
'_EOF_'
chmod +x runArian.sh
ssh kki
cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
./runArian.sh > jobList
para create jobList
para try, ... check ... push ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 668s 11.14m 0.19h 0.01d 0.000 y
# IO & Wait Time: 514s 8.56m 0.14h 0.01d 0.000 y
# Average job time: 26s 0.43m 0.01h 0.00d
# Longest job: 86s 1.43m 0.02h 0.00d
# Submission to last job: 108s 1.80m 0.03h 0.00d
# Now extract each one, 1 = Rat, 2 = Mouse
ssh eieio
cd /cluster/bluearc/scratch/hg/gs.18/build35
mkdir linSpecRep.notInRat linSpecRep.notInMouse
foreach f (rmsk.spec/*.out_rat_mus)
set base = $f:t:r:r
echo "$f -> $base.out.spec"
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInRat/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
# There is actually no difference at all between these two results.
# copy to iscratch
ssh kkr1u00
cd /iscratch/i/gs.18/build35
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
/cluster/bin/iSync
# request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch
# COPY DATA TO GOLDEN PATH LOCATIONS (DONE - 2004-06-04 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
cd /cluster/data/hg17
# Beware, this backgrounding of the gzips can be hard on hgwdev.
# You could wait until after the copy then run one gzip to do them all
foreach chrom ( `cat chrom.lst` )
cp -p ${chrom}/*.fa /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
gzip \
/usr/local/apache/htdocs/goldenPath/hg17/chromosomes/chr${chrom}*.fa &
echo "done ${chrom}"
end
cd /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
gzip *.fa
# FOSMID END PAIRS TRACK (2004-06-09 kate)
# Corrected upper size limit to 50Kbp, reran pslPairs,
# and reloaded (2004-07-15 kate)
# Use latest fosmid ends data prepared by Terry Furey.
# He says there is no on-going work on fosmid ends, so this
# should suffice indefinitely ? Move/link this stuff into
# central data area.
ssh eieio
cd /cluster/data/ncbi
mkdir -p fosends/human
ln -s /cluster/store1/fosends.3 fosends/human
cd fosends/human/fosends.3
faSize fosEnds.fa
# 579735181 bases (369769 N's 579365412 real) in 1087670 sequences
# 580M bases in 1M sequences
# create link in /gbdb/ncbi/fosends/human ?
# use pre-split fosend files, and associated list for cluster run
# Sequences are in /cluster/bluearc/hg/fosEnds
cp /cluster/bluearc/booch/fosends/fosEnds.lst /cluster/bluearc/hg/fosEnds
# run on rack9 since kilokluster is busy
ssh kk9
cd /cluster/data/hg17
mkdir -p bed/fosends
cd bed/fosends
mkdir -p run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa \
> contigs.lst
cp /cluster/bluearc/hg/fosEnds/fosEnds.lst fosEnds.lst
# 380 contigs vs 97 fosEnd files -> 40K jobs
# send output to kksilo, as it can better handle the NFS load
mkdir -p /cluster/store7/kate/hg17/fosends/out
ln -s /cluster/store7/kate/hg17/fosends/out ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/fosends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
gensub2 contigs.lst fosEnds.lst gsub jobList
foreach f (`cat fosEnds.lst`)
set d = $f:r:t
echo $d
mkdir -p /cluster/data/hg17/bed/fosends/out/$d
end
para create jobList
# 36860 jobs
para try
para check
para push
# CPU time in finished jobs: 1655943s 27599.05m 459.98h 19.17d 0.053 y
# IO & Wait Time: 101145s 1685.75m 28.10h 1.17d 0.003 y
# Average job time: 48s 0.79m 0.01h 0.00d
# Longest job: 1294s 21.57m 0.36h 0.01d
# Submission to last job: 19269s 321.15m 5.35h 0.22d
# sort, filter, and lift alignments
ssh eieio
cd /cluster/data/hg17/bed/fosends
pslSort dirs raw.psl temp out/fosEnds*
pslReps -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons raw.psl \
fosEnds.psl /dev/null
# Processed 84096767 alignments
# cleanup
rm -r temp
rm raw.psl
rm -fr out /cluster/store7/kate/hg17/fosends
mkdir lifted
liftUp lifted/fosEnds.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn fosEnds.psl
pslSort dirs fosEnds.sorted.psl temp lifted
rmdir temp
wc -l *.sorted.psl
# 1693693 fosEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/fosends/human/fosends.3
~/bin/i386/pslPairs -tInsert=5000 -minId=0.94 -noBin -min=30000 -max=50000 -slop -short -long -orphan -mismatch -verbose fosEnds.sorted.psl $ncbiDir/fosEnds.pairs all_fosends fosEnds
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header fosEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > fosEndPairs.bed
cat header fosEnds.slop fosEnds.short fosEnds.long fosEnds.mismatch \
fosEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > fosEndPairsBad.bed
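# An untested awk/sort equivalent of the rdb filtering above
# (score is bed column 5; the awk output needs no header/headchg):
#   awk -F'\t' '$5 >= 300' fosEnds.pairs | sort -k1,1 -k2,2n \
#     > fosEndPairs.bed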
extractPslLoad -noBin fosEnds.sorted.psl fosEndPairs.bed \
fosEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > fosEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/fosends
hgLoadBed hg17 fosEndPairs fosEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairs.sql
# Loaded 384558 elements
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed hg17 fosEndPairsBad fosEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairsBad.sql
# Loaded 30830 elements
#hgLoadPsl hg17 -nobin -table=all_fosends fosEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg17 -table=all_fosends fosEnds.load.psl
# load of all_fosends did not go as planned: 1526991 record(s), 0 row(s) skipped, 156 warning(s) loading psl.tab
# load sequences
mkdir -p /gbdb/hg17/fosends
ln -s /cluster/data/ncbi/fosends/human/fosends.3/fosEnds.fa \
/gbdb/hg17/fosends/fosEnds.fa
hgLoadSeq hg17 /gbdb/hg17/fosends/fosEnds.fa
# 1087670 sequences
# NOTE: extFile ID is 832625 (shouldn't be so large ??)
# may want to reset this.
# BAC END PAIRS TRACK (DONE - 2004-06-09 kate)
# Re-ran pslPairs with updated pairs file (2004-10-04 booch)
# Use latest BAC ends data from NCBI
# Checked ftp.ncbi.nih.gov/genomes/BACENDS/homo_sapiens,
# and files were unchanged from Terry's last download
# (to /cluster/store1/bacends.4)
# Link this stuff into central data area.
ssh eieio
cd /cluster/data/ncbi
mkdir -p bacends/human
ln -s /cluster/store1/bacends.4 bacends/human
cd bacends/human/bacends.4
faSize BACends.fa
# 400230494 bases (2743171 N's 397487323 real) in 832614 sequences
# 400M bases in 800K sequences
# use pre-split bacends files, and associated list for cluster run
ssh kk
cd /cluster/data/hg17
mkdir -p bed/bacends
cd bed/bacends
mkdir run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/hg/bacEnds/hs/*.fa > bacends.lst
# 380 contigs vs 98 bacends files -> 40K jobs
# send output to kksilo, as it can better handle the NFS load
# (these are quick jobs)
mkdir -p /cluster/store7/kate/hg17/bacends/out
ln -s /cluster/store7/kate/hg17/bacends/out ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/bacends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
gensub2 contigs.lst bacends.lst gsub jobList
foreach f (`cat bacends.lst`)
set d = $f:r:t
echo $d
mkdir -p /cluster/data/hg17/bed/bacends/out/$d
end
para create jobList
# 37240 jobs written to batch
para try
para check
para push
# CPU time in finished jobs: 1573932s 26232.19m 437.20h 18.22d 0.050 y
# IO & Wait Time: 122751s 2045.86m 34.10h 1.42d 0.004 y
# Average job time: 46s 0.76m 0.01h 0.00d
# Longest job: 3312s 55.20m 0.92h 0.04d
# Submission to last job: 7148s 119.13m 1.99h 0.08d
cd ../out/BACends000
pslCheck *.psl
#Error: invalid PSL: AZ519021:1-575 NT_004559:1306426-1608347 - NT_004559.BACends000.psl:1101
#AZ519021 query block 3 start 283 < previous block end 575
# NOTE: inquired with JK regarding these results
# lift alignments
ssh eieio
cd /cluster/data/hg17/bed/bacends
pslSort dirs raw.psl temp out/BACends*
# takes hours ?
# 37240 files in 98 dirs
# Got 37240 files 193 files per mid file
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null
# Processed 52291246 alignments
mkdir lifted
liftUp lifted/bacEnds.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn bacEnds.psl
pslSort dirs bacEnds.sorted.psl temp lifted
# cleanup
rmdir temp
rm -fr out /cluster/store7/kate/hg17/bacends
wc -l *.sorted.psl
# 2497227 bacEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.4
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
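# (Roughly: -min/-max bound the implied clone insert size at
# 25-350 kb and -hardMax=500000 is an absolute cutoff; pairs that
# fall outside the limits or pair up wrongly land in the
# bacEnds.slop/short/long/mismatch/orphan files catted below.)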
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/bacends
hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
# Loaded 201380
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 81773
#hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 441072 record(s), 0 row(s) skipped, 30 warning(s) loading psl.tab
# Reloaded table, 2004-07-21, and got more rows:
# load of all_bacends did not go as planned: 1698790 record(s),
# 0 row(s) skipped, 63 warning(s) loading psl.tab
# load BAC end sequences
mkdir -p /gbdb/hg17/bacends
ln -s /cluster/data/ncbi/bacends/human/bacends.4/BACends.fa \
/gbdb/hg17/bacends/BACends.fa
hgLoadSeq hg17 /gbdb/hg17/bacends/BACends.fa
# 158588 sequences
# Re-ran pslPairs with an updated pairs file to take advantage of a
# new feature allowing comma-separated lists of end accessions for
# each end of a clone
# First, create new pairs file (bacEndPairs.txt, bacEndSingles.txt)
mkdir /cluster/data/ncbi/bacends/human/bacends.5
cd /cluster/data/ncbi/bacends/human/bacends.5
cp ../bacends.4/cl_ac_gi_len .
/cluster/bin/scripts/convertEndPairInfo cl_ac_gi_len
# Next, re-create the bed file
mkdir /cluster/data/hg17/bed/bacends.update
cd /cluster/data/hg17/bed/bacends.update
ln -s /cluster/data/hg17/bed/bacends/bacEnds.sorted.psl ./bacEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.5
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
# wc *.bed
# 204884 2253724 20612402 bacEndPairs.bed
# 79401 873411 6527559 bacEndPairsBad.bed
# previous
# wc ../bacends/*.bed
# 201380 2215180 20280578 ../bacends/bacEndPairs.bed
# 81773 899503 6712402 ../bacends/bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/bacends.update
hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
# Loaded 204884
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 79401
#hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 1729146 record(s), 0 row(s) skipped, 70 warning(s) loading psl.tab
# PLACE ASSEMBLY CLONES - misc instructions, only somewhat relevant
# See PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE below
# A SECOND ATTEMPT AT CLONE ALIGNMENT
# Split the clones into 3K pieces into about 1000 fa files
# Example:
zcat Z99916.1.fa.gz Z99774.1.fa.gz Z99756.7.fa.gz | faSplit size stdin 3000 /tmp/name.fa -lift=/tmp/name.lft -oneFile
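# the 3K pieces can later be lifted back to whole-clone coordinates
# with the lift file, along these lines (a sketch; file names here
# are placeholders):
#   liftUp -pslQ lifted.psl /tmp/name.lft warn pieces.psl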
# Trying this idea in unPlacedBatch
ssh kk0
mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs > nibList
ls -1S /cluster/data/hg17/bed/contig_overlaps/blatClones > cloneList
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fastMap -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/bothMaskedNibs/$(path1)} {check in exists+ /cluster/data/hg17/bed/contig_overlaps/blatClones/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
mkdir psl
cat nibList | sed -e "s/.nib//" | while read D
do
mkdir psl/$D
done
gensub2 nibList cloneList gsub jobList
para create jobList
# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE (DONE - 2004-07-12 - Hiram)
ssh eieio
mkdir /cluster/data/hg17/bed/contig_overlaps
cd /cluster/data/hg17/bed/contig_overlaps
# find all the clones that were used in the assembly
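# (in AGP format, column 5 is the component type -- "N" marks a gap
# line -- and column 6 is the component accession)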
sed -e "/^#.*/d" /cluster/data/hg17/ncbi_build35.agp | \
awk '{if (!match($5,"N")) {print $6}}' | \
sort -u > placed_in_assembly.list
wc -l placed_in_assembly.list
# 26872 placed_in_assembly.list
# These may be available from the phases files at:
# ftp://ftp.ncbi.nih.gov/genbank/genomes/H_sapiens
# which are easily fetched with wget. However, I took a look
# at those and could not find all the clones in them. There may
# be a versioning problem, because these phases files are often
# updated.
# Fetch them from Genbank with the following three Perl scripts:
# [hiram@hgwdev /cluster/data/hg17/bed/contig_overlaps] ls -og *.pl
# -rwxrwxr-x 1 3047 May 24 18:43 bioPerlFetch.pl
# -rwxrwxr-x 1 2370 Jun 4 15:21 fetchGenbank.pl
# -rwxrwxr-x 1 700 May 24 21:47 foldEm.pl
# Which takes about 4 days ...
# Example,
cat << '_EOF_' > terrys.list
AC011841.7
AC018692.9
AC018743.27
AC037482.14
AL163540.11
'_EOF_'
# << this line makes emacs coloring happy
# only works on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/contig_overlaps
mkdir fasta
time ./fetchGenbank.pl terrys.list > fetchResult.out 2>&1
# There is a bit of behind-the-scenes hocus pocus going on here.
# This is a tedious task of comparing various lists with each
# other and making sure everything matches. Manual fixups are
# done for the newly named 6_hla_hap* items, and the PAR clones
# are duplicated so that X and Y both carry the same set of
# clones. The end result should be a directory hierarchy here
# with a directory for each chrom, each random, and the
# 6_hla_hap? items, where each directory contains the clones
# that belong to that chromosome. The leftovers are the unplaced
# clones, which end up in the directory called unPlaced. The
# instructions here are merely a guideline of possibilities.
# Care should be taken to make sure all listings are correct and
# everything gets in the right place.
ssh eieio
# And then make a list of all clones considered for assembly:
sed -e "/^#.*/d" /cluster/store5/gs.18/ncbi/sequence.inf | \
grep for_assembly | awk '{print $1}' | sort -u > sequence.list
wc -l sequence.list
# 46733 sequence.list
# Verify overlaps are correct:
comm -12 placed_in_assembly.list sequence.list > inBoth
comm -23 placed_in_assembly.list sequence.list > inAssemblyNotSequence
comm -13 placed_in_assembly.list sequence.list > inSequenceNotAssembly
wc in*
# 1 1 12 inAssemblyNotSequence
# 26871 26871 301709 inBoth
# 19862 19862 219050 inSequenceNotAssembly
# 46734 46734 520771 total
# This stray one is from Terry's five additions in the final fixup
# phase with Greg:
cat inAssemblyNotSequence
# AC018743.27
# Terry added: AC011841.7 AC018692.9 AC018743.27 AC037482.14 AL163540.11
#
# Generate a listing that relates clones to their contigs
sed -e "/^#.*/d" /cluster/store5/gs.18/build35/ncbi_build35.agp | \
./contigAcc.pl > disburseEm.list
#
# Using that list, sort the downloaded clones into their
# respective chrom directories:
./disburse.sh
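# disburse.sh is a local helper script; roughly, it does something
# along these lines (a sketch -- assumes disburseEm.list holds
# "chrom clone.fa.gz contig" per line, as used below):
#   grep -v "^#" disburseEm.list | while read CHR CLONE CTG
#   do
#       mkdir -p ./${CHR}
#       mv fasta/${CLONE} ./${CHR}/
#   done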
# Check the number of sequences obtained:
find ./? ./?? ./*_random ./6_hla* -type f | wc -l
# 26872
# So, why is this number one more than the inBoth list ?
# Because, the official NCBI sequence.inf file is missing one of
# the clones that Terry added: AC018743.27
# And it shows up in our check list above as inAssemblyNotSequence
# It isn't exactly missing, it just isn't marked "for_assembly"
# OK, with everything in place, we are ready to try to find
# all these items in the assembly by running a cluster job on
# each of the chroms, matching the items that are supposed to be
# included in that chrom. We need to get things set up on the
# Iservers: psLayout is heavy on disk I/O and it brings everything
# down if allowed to work on any NFS filesystems for input.
# It appears that psLayout wants an ooc file of tile size 10
# I tried making one for the whole assembly but it seemed to
# include too much for some contigs and it caused a lot of
# alignments to be missed. Thus, create an ooc file for each
# contig
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
cd /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
ls ../maskedContigs | sed -e "s/.fa//" | while read CONTIG
do
blat -repMatch=256 -makeOoc=${CONTIG}.10.ooc -tileSize=10 \
../maskedContigs/${CONTIG}.fa \
../maskedContigs/${CONTIG}.fa /dev/null
echo "done: ${CONTIG}"
done
# Copy that result to the Iservers:
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/contigOoc10
cd /iscratch/i/gs.18/build35/contigOoc10
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10/ .
# And, copy the clone sequences:
mkdir /iscratch/i/gs.18/build35/clones
cd /cluster/store5/gs.18/build35/bed/contig_overlaps
for D in ? ?? *_random 6_hla_hap?
do
rsync -arlv `pwd`/${D} /iscratch/i/gs.18/build35/clones
done
/cluster/bin/iSync
ssh kk
cd /cluster/data/hg17/bed/contig_overlaps
mkdir psl
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /cluster/data/hg17/bed/contig_overlaps/*/<clone>.fa.gz
# without the .fa.gz extension
# This stuff has been mirrored to:
# /iscratch/i/gs.18/build35/clones/*/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /cluster/store5/gs.18/build35/<chrom>/<contig>/<contig>.fa
#
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
FAZ=/iscratch/i/gs.18/build35/clones/${CHROM}/${CLONE}.fa.gz
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
mkdir -p psl/${CONTIG}
if [ ! -s ${FAZ} ]; then
echo "Can not find: ${FAZ}"
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}"
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}"
exit 255
fi
zcat ${FAZ} > /tmp/${CLONE}.fa
$HOME/bin/i386/psLayout ${TARGET} \
/tmp/${CLONE}.fa genomic ${OOC} psl/${CONTIG}/${CLONE}.psl
RET=$?
rm -f /tmp/${CLONE}.fa
exit ${RET}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runPsLayout.sh
# make up a listing of chrom, clone, contig from:
grep -v "^#" disburseEm.list | sed -e "s/.fa.gz//" > chr.clone.contig.list
wc -l chr.clone.contig.list
# 26872 chr.clone.contig.list
awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n",
$1, $2, $3, $3, $2
}' chr.clone.contig.list > jobList
# << this line makes emacs coloring happy
# To do a quick test, run just chr22:
grep -v "^22" chr.clone.contig.list | awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n",
$1, $2, $3, $3, $2
}' > jobList
para create jobList
para try ... check ... etc ...
# One run on chr22 took:
# Completed: 561 of 561 jobs
# CPU time in finished jobs: 927068s 15451.14m 257.52h 10.73d 0.029 y
# IO & Wait Time: 6295s 104.91m 1.75h 0.07d 0.000 y
# Average job time: 1664s 27.73m 0.46h 0.02d
# Longest job: 69745s 1162.42m 19.37h 0.81d
# Submission to last job: 69780s 1163.00m 19.38h 0.81d
# put the results together, filter, lift and load:
cd /cluster/data/hg17/bed/contig_overlaps/psl
pslSort dirs raw.psl tmp N*
pslReps -singleHit raw.psl repsSingle.psl /dev/null
liftUp chr22.psl /cluster/data/hg17/jkStuff/liftAll.lft \
warn repsSingle.psl
hgLoadPsl -table=cloneTest hg17 chr22.psl
# There are a number of clones listed in the sequence.inf file
# as status W with names beginning AACC AADB AADC AADD
# These are the whole-genome shotgun assemblies from the Celera genome.
# A few of them were used in the assembly of the NCBI genome, namely:
./11/AADB01066164.1.fa.gz
./11/AADC01095577.1.fa.gz
./11/AADD01116830.1.fa.gz
./11/AADD01118406.1.fa.gz
./11/AADD01116787.1.fa.gz
./11/AADD01112371.1.fa.gz
./11/AADD01116788.1.fa.gz
./11/AADD01115518.1.fa.gz
./11/AADD01118410.1.fa.gz
./11/AADD01117999.1.fa.gz
./21/AADD01172789.1.fa.gz
./21/AADD01172788.1.fa.gz
./21/AADD01209098.1.fa.gz
./21/AADD01172902.1.fa.gz
# These have been distributed properly into their corresponding
# chromosome directories. The rest of them, 26, all with names
# starting AACC, are in the directory here: celeraOnly
# To run the unPlaced alignments.
# Prepare scratch and iscratch
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
/cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
# request scratch sync to cluster admins
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/clones/unPlaced
rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
/iscratch/i/gs.18/build35/clones/unPlaced
/cluster/bin/iSync
ssh hgwdev
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
# There are too many to try them all; obtain guidelines from the
# hg16 clone-to-contig mapping:
hgsql -N -e "select name,chrom from clonePos;" hg16 > hg16.clone.chrom
hgsql -N -e "select contig,chrom from ctgPos;" hg16 > hg16.contig.chrom
ssh kk
mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
ls ../unPlaced | sed -e "s/.fa.gz//" > unPlaced.clone.list
wc -l unPlaced.clone.list
# 19836 unPlaced.clone.list
ls -1S /scratch/hg/gs.18/build35/maskedContigs > contig.list
wc -l contig.list
# 380 contig.list
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# kkiPsLayout.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
# /scratch/hg/gs.18/build35/clones/unPlaced
# without the .fa.gz extension
# <contig> is one of the contigs found in:
# /iscratch/i/gs.18/build35/maskedContigs
#
CLONE=$1
CONTIG=$2
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
FAZ=/scratch/hg/gs.18/build35/clones/unPlaced/${CLONE}.fa.gz
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
mkdir -p psl/${CONTIG}
if [ ! -s ${FAZ} ]; then
echo "Can not find: ${FAZ}"
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}"
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}"
exit 255
fi
zcat ${FAZ} > /tmp/${CLONE}.fa
$HOME/bin/i386/psLayout ${TARGET} \
/tmp/${CLONE}.fa genomic ${OOC} psl/${CONTIG}/${CLONE}.psl
RET=$?
rm -f /tmp/${CLONE}.fa
exit ${RET}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runPsLayout.sh
cat << '_EOF_' > gsub
#LOOP
./runPsLayout.sh $(path1) $(path2) {check out line+ psl/$(path2)/$(path1).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 unPlaced.clone.list contig.list gsub jobList
# XXXX - some time later ... 2004-07-12
# Bringing this sequence to a close. Difficulties encountered:
# Placed clones that did not survive the psLayout filter:
# AC006040.3 AC006328.5 AC007039.6 AC007241.3 AC007965.3
# AC009947.2 AC010682.2 AC012005.4 AC016707.2 AC016728.4
# AC016752.2 AC017005.7 AC025226.4 AC025246.6 AC055713.29
# AC068541.7 AC068601.8 AC068704.4 AC073649.3 AC073962.5
# AC091175.11 AC095381.1 AC104597.3 AC130223.2 AC130814.3
# AC133883.6 AC139103.3 AF003627.3 AF135405.3 AL021878.2
# AL137064.6 AL356803.2 AL390801.4 AL591480.8 AL901608.1
# AP005814.2 BX322790.2 Z84489.1 Z84814.1
# And placed clones that broke into two pieces during their
# psLayout alignment:
# AC006982.3 AC007742.4 AC023342.3 AC024183.4 AC025735.4
# AC095380.1 AL646104.4 BX293536.4
# For the above clones, their assignments in ref_placed.agp were
# used instead of trying to adjust the psLayout process.
# The PAR clones are a problem. They were placed properly, but
# during their load with hgClonePos there was a warning issued
# about their dual existence. hgClonePos said they were only
# going to be placed on chrX and not on chrY. However in the
# browser when chrY is viewed it issues errors about these not
# having proper coordinates in the clonePos table. These were
# removed from the coverage track to eliminate that error.
# AL954722.18 BX537334.4 BX000483.7 BX908402.3 BX649635.3 BX119919.5
# AC079176.15 AC097314.27 AC006209.25 AJ271735.1 AJ271736.1
#
# And finally, after many different types of alignment attempts,
# there remain 1489 un-placed clones that could not be located.
# While trying to figure out which contigs many clones belonged
# to, the following cluster run script was used to take a survey
# using blat:
#!/bin/sh
# runBlat.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
# /scratch/hg/gs.18/build35/clones/
# without the .fa.gz extension
# <contig> is one of the contigs found in:
# /iscratch/i/gs.18/build35/maskedContigs
#
# ./runBlat.sh unPlaced/AB000876.1.fa.gz NT_005612.fa {check out line+
# psl/NT_005612.fa/unPlaced/AB000876.1.fa.gz.psl}
#
HERE=`pwd`
CLONE=$1
CLONEDIR=`dirname ${CLONE}`
CLONENAME=`basename ${CLONE}`
CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz
CONTIG=$2
CONTIGBASE=${CONTIG/.fa/}
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
mkdir -p /tmp/${CLONEDIR}/${CLONENAME}
zcat ${CLONESRC} > /tmp/${CLONEDIR}/${CLONENAME}/${CLONENAME}.fa
cd /tmp/${CLONEDIR}
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONENAME}
ECOUNT=`cat error.convert | wc -l`
if [ "${ECOUNT}" -ne 0 ]; then
echo "Error during faToFfa, error.convert not empty" 1>/dev/stderr
exit 255
fi
rm -f error.convert
B=${CLONENAME/\.*/}
cd /tmp/${CLONEDIR}/${CLONENAME}
faSplit byname ${CLONENAME}.fa .
RET=0
export RET
for F in ${CLONENAME}_*.fa
do
FA=${F/_*.fa/}
A=${FA/.[0-9]*/}
P=${F/.fa/}
N=${P##*_}
rm -f t.fa
mv ${F} t.fa
cat t.fa | faSplit -oneFile size stdin 3000 ${A}_${N}
rm -f t.fa
blat ${TARGET} ${A}_${N}.fa -ooc=/scratch/hg/h/11.ooc ${A}_${N}.psl \
-t=dna -q=dna -fastMap -noHead
RET=$?
if [ "$RET" -ne 0 ]; then
echo "Error during blat ${TARGET} ${A}_${N}.fa" 1>/dev/stderr
break
fi
done
rm -f ${CLONENAME}.fa
rm -f ${B}_*.fa
cd ${HERE}
mkdir -p psl/${CONTIGBASE}
sed -e "s/${A}/${CLONENAME}/" /tmp/${CLONEDIR}/${CLONENAME}/*.psl > \
psl/${CONTIGBASE}/${CLONENAME}.psl
rm -f /tmp/${CLONEDIR}/${CLONENAME}/*.psl
rmdir --ignore-fail-on-non-empty /tmp/${CLONEDIR}/${CLONENAME}
rmdir --ignore-fail-on-non-empty /tmp/${CLONEDIR}
exit ${RET}
# The alignments with psLayout were done with the following cluster
# run script:
#!/bin/sh
# kkiPsLayout.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
# /scratch/hg/gs.18/build35/clones/unPlaced
# without the .fa.gz extension
# <contig> is one of the contigs found in:
# /iscratch/i/gs.18/build35/maskedContigs
#
# ./runPsLayout.sh unPlaced/AP001966.2 NT_016354 {check out exists
# psl/NT_016354/AP001966.2.psl}
#
HERE=`pwd`
CLONE=$1
CONTIG=$2
CLONEDIR=`dirname ${CLONE}`
CLONENAME=`basename ${CLONE}`
RESULT=psl/${CONTIG}/${CLONENAME}.psl
CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
mkdir -p /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}
zcat ${CLONESRC} > /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa
cd /tmp/${CONTIG}
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONEDIR}
cd ${HERE}
mkdir -p psl/${CONTIG}
$HOME/bin/i386/psLayout ${TARGET} /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa genomic ${OOC} ${RESULT}
RET=$?
rm -f /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa /tmp/${CONTIG}/error.convert
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}
exit ${RET}
# BUILD KNOWN GENES TABLES (DONE 6/8/04 Fan)
# Build sp040515 and proteins040515 DBs first.
hgsql hg17 -e "create database kgHg17"
cd /cluster/store6/kgDB/bed
mkdir kgHg17
cd /cluster/store6/kgDB/bed/kgHg17
~/src/hg/protein/KGprocess.sh kgHg17 hg17 040515
# The script was run successfully with the last message:
# Tue Jun 8 15:36:52 PDT 2004 DONE
# After initial inspection of tables in kgHg17, do the following
# from the mySql prompt:
alter table kgHg17.cgapAlias rename as hg17.cgapAlias;
alter table kgHg17.cgapBiocDesc rename as hg17.cgapBiocDesc;
alter table kgHg17.cgapBiocPathway rename as hg17.cgapBiocPathway;
alter table kgHg17.dupSpMrna rename as hg17.dupSpMrna;
alter table kgHg17.keggMapDesc rename as hg17.keggMapDesc;
alter table kgHg17.keggPathway rename as hg17.keggPathway;
alter table kgHg17.kgAlias rename as hg17.kgAlias;
alter table kgHg17.kgProtAlias rename as hg17.kgProtAlias;
alter table kgHg17.kgXref rename as hg17.kgXref;
alter table kgHg17.knownGene rename as hg17.knownGene;
alter table kgHg17.knownGeneLink rename as hg17.knownGeneLink;
alter table kgHg17.knownGeneMrna rename as hg17.knownGeneMrna;
alter table kgHg17.knownGenePep rename as hg17.knownGenePep;
alter table kgHg17.mrnaRefseq rename as hg17.mrnaRefseq;
alter table kgHg17.spMrna rename as hg17.spMrna;
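# Equivalently, these renames can be generated with a shell loop:
#   for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna \
#       keggMapDesc keggPathway kgAlias kgProtAlias kgXref knownGene \
#       knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
#   do
#       hgsql hg17 -e "alter table kgHg17.${T} rename as hg17.${T};"
#   done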
# hg17.knownGene has 43,401 entries and hg16.knownGene has 43,232 entries,
# and running featureBits shows:
featureBits hg17 knownGene
# 63983072 bases of 2866216770 (2.232%) in intersection
featureBits hg16 knownGene
# 63781799 bases of 2865248791 (2.226%) in intersection
# Connect to genome-testdb and use hgcentraltest DB.
# Add a new entry in gdbPdb table:
insert into gdbPdb values('hg17', 'proteins040515');
# CREATE LINEAGE-SPECIFIC REPEATS FOR BLASTZ WITH ZEBRAFISH
# (DONE, 2004-06-08, hartera)
# Treat all repeats as lineage-specific
mkdir /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
iSync
# PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-06-10 kate)
# split into 3K chunks
ssh eieio
set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
cat > split.csh << 'EOF'
set splitDir = /iscratch/i/hg17/liftOver/split
mkdir -p $splitDir
set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
foreach n (`ls /cluster/data/hg17/nib`)
set c = $n:r
# chrom fa files live in build dirs named after the bare chromosome
# (e.g. /cluster/data/hg17/1/chr1.fa), so strip "chr" and "_random"
set d = `echo $c | sed -e 's/^chr//' -e 's/_random$//'`
echo $c
faSplit -lift=$liftDir/$c.lft size \
/cluster/data/hg17/$d/$c.fa -oneFile 3000 $splitDir/$c
end
'EOF'
# << for emacs
csh split.csh >&! split.log &
tail -100f split.log
ssh kkr1u00
iSync
# STS MARKERS (DONE 2004-07-21 kate)
# MANUAL UPDATE OF D21S168 and D21S167 (DONE, 2005-02-11, hartera)
# FILTERED OUT noOoc ALIGNMENTS WITH tBaseInsert >=1000
# (DONE, 2005-02-17, hartera) AND RELOADED stsMap, stsInfo2 and all_sts_seq
# DATABASE TABLES AFTER ADDING FILTERED ALIGNMENTS TO all_sts_seq AND
# REMOVING DATA FROM stsMap and stsInfo2 FOR THE MARKERS REMOVED FROM THE
# FILTERED SET (DONE, 2005-02-18, hartera)
# UPDATE PSL ALIGNMENTS FOR D21S167 and D21S168 AND RELOAD INTO all_sts_seq
# (DONE, 2005-02-23, hartera)
# UPDATED stsAlias TABLE, REMOVING IDs OF FILTERED ALIGNMENTS (2005-02-24, hartera)
# Terry's sts.9 dir is in /cluster/store5/sts.2004-07.old
# remove this after verifying the newer version
# update from NCBI (booch)
ssh eieio
# use store5 for space
mkdir -p /cluster/store5/sts.2004-07
ln -s /cluster/store5/sts.2004-07 /cluster/data/ncbi
ln -s /cluster/data/ncbi/sts.2004-07 sts.9
cd /cluster/data/ncbi/sts.2004-07
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# incremental update from previous build
# NOTE: could mysql dump this, unless hand-updated (like hg16)
# First - copy from Terry's dir
ssh eieio
ln -s /cluster/store1/sts.8 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.9
# this time, snag from Terry's dir
cd /cluster/data/ncbi/sts.9
cp -p ~booch/tracks/update/all.STS.fa.prev .
cp -p ~booch/tracks/update/stsInfo2.bed stsInfo2.bed.prev
# Convert dbSTS.fa file to easier reading format, and get accessions
/cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=dbSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
dbSTS.sts dbSTS.aliases dbSTS.convert.fa new
# 129991 SWXD2599 99622 (0) not in dbSTS anymore
# 166473 D3S3812 154523 (0) not in dbSTS anymore
# 185776 RH83562 209614 (0) not in dbSTS anymore
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 92674 total sequences
/cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa
# Copy stsInfo2.bed and stsAlias.bed to the data directory because
# these will be loaded into the database later
mkdir -p /cluster/data/hg17/bed/sts
cp stsInfo2.bed /cluster/data/hg17/bed/sts/
cp stsAlias.bed /cluster/data/hg17/bed/sts/
# Create sts sequence alignments
mkdir -p /cluster/bluearc/sts.9/sts.split
faSplit sequence all.STS.fa 50 /cluster/bluearc/sts.9/sts.split/sts
cp /cluster/data/ncbi/sts.9/all.STS.fa /cluster/bluearc/sts.9
# create small ooc file to use with alignments (if not existing)
# NOTE: these were just used for experimenting; weren't used in
# final runs
ssh kolossus
cd /cluster/data/hg17/bed/sts
ls /cluster/bluearc/hg17/bothMaskedNibs/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.4096.ooc -repMatch=4096
blat nib.lst /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.16384.ooc -repMatch=16384
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir run
cd run
ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/sts.split/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/hg17/sts/sts/out
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /cluster/bluearc/hg17/sts/sts/out/$d
end
# create alignments
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/cluster/bluearc/hg/h/11.ooc -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 17860 jobs
para try
para check
para push
# CPU time in finished jobs: 216985s 3616.41m 60.27h 2.51d 0.007 y
# IO & Wait Time: 48790s 813.17m 13.55h 0.56d 0.002 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest job: 267s 4.45m 0.07h 0.00d
# Submission to last job: 2228s 37.13m 0.62h 0.03d
# Compile sts sequence results
ssh kolossus
cd /cluster/bluearc/hg17/sts/sts
pslSort dirs raw.psl temp out/*
rm -rf temp
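# pslReps flags: -nearTop keeps alignments scoring within that
# fraction of the best, -minCover is the minimum fraction of the
# query covered, -minAli the minimum alignment identity, and
# -noIntrons disallows intron-style gaps (markers are genomic)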
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
stsMarkers.psl /dev/null
# Processed 7121016 alignments
#cp stsMarkers.psl /cluster/data/hg17/bed/sts/run
# Lift them and get them ready to combine with primer alignments
#cd /cluster/data/hg17/bed/sts/run
#liftUp -nohead /cluster/data/hg17/bed/sts/run/stsMarkers.lifted.psl \
liftUp -nohead stsMarkers.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.psl
# missing some utilities for kolossus, so switch to fileserver
# NOTE: probably no longer true -- try on kolossus next time
ssh kksilo
cd /cluster/bluearc/hg17/sts/sts
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates <file>.initial
/cluster/bin/scripts/findAccession -agp stsMarkers.lifted.psl.initial \
/cluster/data/hg17
# "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
# Looks like it trys all _randoms (even one's that don't
# exist/aren't needed
# creates <file>.acc
#rm stsMarkers.lifted.psl.initial
sort -k 4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
#rm stsMarkers.lifted.psl.initial.acc
#cp stsMarkers.final stsMarkers.lifted.psl.initial /cluster/data/hg17/bed/sts
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 89532 stsMarkers.found
# out of 92674 total sequences
# extract sequences for markers not yet found, and
# blat w/o ooc to try to place more
comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.9/all.STS.id \
> stsMarkers.notFound
wc -l stsMarkers.notFound
# 3142 stsMarkers.notFound
faSomeRecords /cluster/data/ncbi/sts.9/all.STS.fa stsMarkers.notFound \
notFound.STS.fa
mkdir /cluster/bluearc/sts.9/sts.splitNotFound
faSplit sequence notFound.STS.fa 20 \
/cluster/bluearc/sts.9/sts.splitNotFound/sts
# blat with 11.ooc misses alignments, so reblat w/o the
# sequences that aren't found
# NOTE: after filtering, this step yields only 149 markers placed
# (out of 3142); not enough to justify this step next time
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir run.noOoc
cd run.noOoc
ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/sts.splitNotFound/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/hg17/sts/sts/out.noOoc
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /cluster/bluearc/hg17/sts/sts/out.noOoc/$d
end
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out.noOoc/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 7220 jobs written to batch
para try
para check
# process this set of alignments
ssh kolossus
cd /cluster/bluearc/hg17/sts/sts
pslSort dirs raw.noOoc.psl temp out.noOoc/*
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \
raw.noOoc.psl stsMarkers.noOoc.psl /dev/null
# Processed 4254094 alignments
#cp stsMarkers.psl /cluster/data/hg17/bed/sts/run
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.noOoc.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.noOoc.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl
# creates <file>.initial
/cluster/bin/scripts/findAccession -agp \
stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg17
# "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
# Looks like it trys all _randoms (even one's that don't
# exist/aren't needed
# creates <file>.acc
#rm stsMarkers.lifted.psl.initial
mv stsMarkers.final stsMarkers.ooc.final
sort -k 4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
sort -k 4n stsMarkers.lifted.psl.initial.acc \
stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 89681 stsMarkers.found
cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found
wc -l stsMarkers.extra.found
# 149 out of 3142 attempted
# out of 92674 total sequences
cp stsMarkers.final stsMarkers.lifted.psl stsMarkers.*lifted.psl.initial* stsMarkers.found \
/cluster/data/hg17/bed/sts
# Alignments from noOoc set were not added to all_sts_seq but info for the markers
# is in stsMap and stsInfo2. Some of the alignments are bad so filter by removing
# all alignments from noOoc psl file where tBaseInsert >=1000. Add the remaining
# alignments to the set of final alignments for stsMarkers. The information for the
# removed markers from the filtered set was also removed from stsMap and stsInfo2.
# (DONE, 2005-02-17, hartera)
ssh eieio
cd /cluster/data/hg17/bed/sts/fix
cp /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl .
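# (tBaseInsert is field 8 of a psl line, hence the $8 below)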
awk '{if ($8 < 1000) print;}' stsMarkers.noOoc.lifted.psl > stsMarkers.noOoc.lifted.filt1000.psl
wc -l *.filt*.psl
# 254 5334 26384 stsMarkers.noOoc.lifted.filt1000.psl
sort -k 4n /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
awk '{print $4;}' stsMarkers.extra | sort -n | uniq > extra.ids
# in psl file, the ids are the 10th field
awk '{print $10;}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
> noOoc.ids
diff extra.ids noOoc.ids
# there is no difference as expected
# get list of IDs from filtered file, filter < 1000
awk '{print $10;}' stsMarkers.noOoc.lifted.filt1000.psl \
| sort -n | uniq > filt1000.ids
foreach i (`cat filt1000.ids`)
awk 'BEGIN {OFS="\t"} \
{if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
stsMarkers.extra >> stsMarkers.extra.filt1000
end
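# a single-pass equivalent of the loop above (sketch): read the id
# list into awk first, then filter stsMarkers.extra once
#   awk 'BEGIN {OFS="\t"} NR==FNR {ids[$1]=1; next} \
#       ($4 in ids) {print $1,$2,$3,$4,$5,$6,$7}' \
#       filt1000.ids stsMarkers.extra > stsMarkers.extra.filt1000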
cp ../stsMarkers.final stsMarkers.final
# cat stsMarkers.extra.filt1000 >> stsMarkers.final2
# need to filter stsMarkers.final not just cat this on the end
# get list of alignments with tBaseInsert >= 1000 and remove these
cd /cluster/data/hg17/bed/sts/fix
awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl > stsMarkers.noOoc.lifted.filtToRemove.psl
wc -l *.filt*.psl
# 254 stsMarkers.noOoc.lifted.filt1000.psl
# 249 stsMarkers.noOoc.lifted.filt500.psl
# 448 stsMarkers.noOoc.lifted.filtToRemove.psl
# get list of IDs that need to be removed
awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
| uniq > noOoc.IdsToRemove.txt
# get chrom and co-ordinates for IDs to be removed
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
> sts.noOoc.filtToRemove.coords
# checked that the stsMarkers.final contain the noOoc alignments
# wrote perl script to remove lines with these IDs from stsMarkers.final
cat << '_EOF_' > removeIds.pl
#!/usr/bin/perl -w
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";
my %idsHash;
while (<IDS>) {
chomp;
my @a = split(/\t/);
my $chr = $a[0];
my $st = $a[1];
my $end = $a[2];
my $id = $a[3];
my $key = $id."_".$chr . "_" . $st . "_" . $end;
$idsHash{$key}->{chrom} = $chr;
$idsHash{$key}->{start} = $st;
$idsHash{$key}->{end} = $end;
}
close IDS;
while (<FILE>) {
chomp;
my $l = $_;
my $found = "FALSE";
my @f = split(/\t/, $l);
foreach my $k (keys(%idsHash)) {
# if the key begins with this id (keys are id_chrom_start_end)
if ($k =~ /^$f[3]_/) {
my $c = $idsHash{$k}->{chrom};
my $s = $idsHash{$k}->{start};
my $e = $idsHash{$k}->{end};
if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
print OUT "$c\t$s\t$e\t$f[3]\n";
$found = "TRUE";
}
}
}
if ($found eq "FALSE") {
print "$l\n";
}
}
'_EOF_'
chmod +x removeIds.pl
perl removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
> stsMarkers.final.new
wc -l stsMarkers.final*
# 92338 stsMarkers.final
# 91890 stsMarkers.final.new
# There are 448 ids and sets of co-ordinates in list of Ids to remove
# check that stsMarkers.final.new contains all the alignments that
# are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
> sts.noOoc.filt1000.coords
awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
stsMarkers.final.new | sort | uniq \
> sts.finalnew.coords
diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
grep '>' finalnewvsfilt1000
# there is nothing in sts.noOoc.filt1000.coords not found in the
# sts.finalnew.coords file therefore this contains all the alignments
# from the filtered noOoc file.
cp ../primers/primers.final .
awk '{print $4}' primers.final | sort | uniq > primers.ids
awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids
# primers
ssh eieio
cd /cluster/data/ncbi/sts.9
# strip out N's and wobbles (KS) from primers, as isPcr
# can't currently handle them
# strip out primers of 10 bases or fewer, as isPcr can't handle them
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir -p /cluster/bluearc/sts.9/primers
cd /cluster/bluearc/sts.9/primers
split -l 2000 /cluster/data/ncbi/sts.9/all.primers.ispcr primers_
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir primers
cd primers
mkdir run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/primers/primers_* > primers.lst
mkdir -p /cluster/bluearc/hg17/sts/primers/out
cat > template << 'EOF'
#LOOP
/cluster/home/kate/bin/i386/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/scratch/hg/h/10.ooc -stepSize=5 $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 26980 jobs
para try
para check
para push
#Completed: 26953 of 26980 jobs
#Crashed: 27 jobs
#CPU time in finished jobs: 1130353s 18839.22m 313.99h 13.08d 0.036 y
#IO & Wait Time: 86067s 1434.44m 23.91h 1.00d 0.003 y
#Average job time: 45s 0.75m 0.01h 0.00d
#Longest job: 1255s 20.92m 0.35h 0.01d
#Submission to last job: 2762s 46.03m 0.77h 0.03d
# 27 jobs seg faulted due to -minPerfect=2.
# Looks like a bug in isPcr -- till it's fixed,
# we'll rerun with -minPerfect=5. Terry determined they
# all complete with this (he used 3, 4, or 5, tuned individually
# for each job, but just using 5 should be adequate and
# less labor-intensive).
# NOTE: isPcr bug is fixed -- this shouldn't be necessary for
# next run
para crashed | grep isPcr | sed 's/minPerfect=2/minPerfect=5/' \
> jobList.minPerfect5
para create jobList.minPerfect5
# 28 jobs
# repeat with increasing minPerfect, till all complete succesfully
# Filter output file quickly based on simple parameters
ssh kolossus
cd /cluster/bluearc/hg17/sts/primers/
mkdir -p filter
pslQuickFilter -minMatch=26 -maxMismatch=5 -maxTinsert=5000 -verbose out/ filter/
# Note: there will be many messages saying files are empty - this is OK
pslSort dirs primers.psl.unlifted temp filter
# filter primer alignments and create not found primer file for ePCR run (booch)
pslFilterPrimers /cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
/cluster/data/ncbi/sts.9/all.primers primers.filter.unlifted.psl
# creates $3.notfound.primers
wc -l primers.filter.unlifted.psl.notfound.primers
# 21919 primers.filter.unlifted.psl.notfound.primers
# use Greg Schuler's ePCR to attempt alignment of primers missed
# by isPcr
mkdir -p /cluster/data/hg17/bed/sts/primers/run.epcr
mkdir -p /cluster/bluearc/hg17/sts/primers/epcr
cd /cluster/bluearc/hg17/sts/primers/epcr
split -l 2500 /cluster/data/hg17/bed/sts/primers/primers.filter.unlifted.psl.notfound.primers primers_
cd /cluster/data/hg17/bed/sts/primers/run.epcr
ls -1S /cluster/bluearc/hg17/sts/primers/epcr/primers_* > primers.lst
# create contig.lst based on split in build dir
# NOTE: should probably replace this with something more standard
# and faster. Also, this appears to cause load spikes on fileservers.
# Should get contigs from bluearc, iservers, or cluster local disk
# At least it's over pretty quick!
ssh eieio
cd /cluster/data/hg17/bed/sts/primers/run.epcr
/cluster/bin/scripts/splitContigList -ncbi /cluster/data/hg17 1
# next time... ls -1S /cluster/bluearc/hg17/contigs/* > contig.lst (?)
mkdir -p /cluster/bluearc/hg17/sts/primers/epcr/out
ssh kk
cd /cluster/data/hg17/bed/sts/primers/run.epcr
cat > template << 'EOF'
#LOOP
/cluster/bin/scripts/runEpcr $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/epcr/out/$(root1).$(root2).epcr}
#ENDLOOP
'EOF'
# << for emacs
gensub2 primers.lst contig.lst template jobList
para create jobList
# 3420 jobs
para try
para check
para push
# CPU time in finished jobs: 78897s 1314.95m 21.92h 0.91d 0.003 y
# IO & Wait Time: 254582s 4243.03m 70.72h 2.95d 0.008 y
# Average job time: 98s 1.63m 0.03h 0.00d
# Longest job: 647s 10.78m 0.18h 0.01d
# Submission to last job: 1112s 18.53m 0.31h 0.01d
# merge output
ssh eieio
cd /cluster/bluearc/hg17/sts/primers/epcr
cat out/*.epcr > all.epcr
wc -l all.epcr
# 3573
# use all.epcr file to re-filter alignments and determine which
# ePCR records to keep
cp all.epcr /cluster/data/hg17/bed/sts/primers
cd /cluster/data/hg17/bed/sts/primers
pslFilterPrimers -epcr=all.epcr -verbose=1 \
/cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
/cluster/data/ncbi/sts.9/all.primers primers.unlifted.epcr.psl
# convert to PSL and combine with other psl file (this takes a couple hours)
/cluster/bin/scripts/epcrToHgPsl epcr.not.found \
/cluster/data/ncbi/sts.9/all.primers /cluster/data/hg17
cat primers.unlifted.epcr.psl epcr.not.found.psl \
| sort -k 10n > primers.final.unlifted.psl
# Fix the query gap lengths so that they match the all.primers.fa
# file lengths
/cluster/bin/scripts/fixPrimersQueryGaps \
/cluster/data/ncbi/sts.9/all.primers primers.final.unlifted.psl \
> primers.final.unlifted.fix.psl
# lift results from contigs to chrom coordinates, and create final file
liftUp -nohead /cluster/data/hg17/bed/sts/primers/primers.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn \
primers.final.unlifted.fix.psl
# Extract relevant info, make alignments unique, and create final file to be merged
# with full sequence alignments
/cluster/bin/scripts/extractPslInfo primers.psl
/cluster/bin/scripts/findAccession -agp primers.psl.initial \
/cluster/data/hg17
#rm primers.psl.initial
/cluster/bin/scripts/getStsId /cluster/data/ncbi/sts.9/stsInfo2.bed \
primers.psl.initial.acc \
| sort -k 4n > primers.final
#rm primers.psl.initial.acc
wc -l primers.final
# 314713 primers.final
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
ssh kolossus
cd /cluster/data/hg17/bed/sts
/cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final primers/primers.final
# creates *_pos.rdb
/cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir -p /gbdb/hg17/sts.9/
ln -s /cluster/data/ncbi/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.STS.fa
ln -s /cluster/data/ncbi/sts.9/all.primers.fa \
/gbdb/hg17/sts.9/all.primers.fa
# Load all files
cd /cluster/data/hg17/bed/sts
hgLoadSeq hg17 /gbdb/hg17/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.primers.fa
hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
hgsql hg17 < ~kent/src/hg/lib/stsAlias.sql
cp /cluster/data/ncbi/sts.9/{stsInfo2.bed,stsAlias.bed} .
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
hg17 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_primer hg17 primers/primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
# update of information for D21S167 and D21S168 (2005-02-11, hartera)
# currently X52289 associated with D21S168
# and X53367 associated with D21S167 - these need to be switched as they
# are causing incorrect positioning
# On Terry's advice,
# first manually update the accession field stsInfo2.bed so that the
# corrected version is carried through to the next version
cd /cluster/data/hg17/bed/sts
# manually change accessions in this file so now X52289 is associated
# with D21S167 and X53367 is now associated with D21S168
# manually update the chromStart and chromEnd fields for these
# records in stsMap.bed
# this change was not carried through after filtering, so stsMap.bed
# was changed again and the table reloaded (DONE, 2005-02-18, hartera)
chr21 39867340 39867513 D21S167 1000 7888 AF064860
# becomes
chr21 37117635 37117858 D21S167 1000 7888 AF064860
chr21 37117635 37117858 D21S168 1000 103256 AP000699
# becomes
chr21 39867340 39867513 D21S168 1000 103256 AP000699
# then reload the stsMap.bed and stsInfo2.bed files
# copy this updated bed file back to ncbi directory
cp stsInfo2.bed /cluster/data/ncbi/sts.9/
# delete previous data before reloading tables
hgsql hg17 -e 'delete from stsInfo2'
hgsql hg17 -e 'drop table stsMap'
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
hg17 stsMap stsMap.bed
# (2005-02-19, hartera)
# also need to update the psl alignment file and reload into all_sts_seq
# for D21S168, the id is 103256, this is qName in the psl file
# for D21S167, the id is 7888
cd /cluster/data/hg17/bed/sts
# manually update the stsMarkers.lifted.psl file with the new
# co-ordinates as above.
# (2005-02-23) Correct alignments.
# need to swap the names for the alignments not just the start and end
# coords as before as now the rest of the alignment data fields in the
# table are incorrect. Change the start and end co-ordinates and just swap
# the names for D21S167 and D21S168 in the psl file then reload the table.
# sort on the ID field (qName)
sort -k 10n stsMarkers.lifted.psl > sts.lifted.sort
mv sts.lifted.sort stsMarkers.lifted.psl
hgsql hg17 -e 'drop table all_sts_seq'
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
# Add new information after filtering the noOoc files
# (DONE, 2005-02-17, hartera)
# latest psl file: stsMarker.lifted.new.psl is in fix dir
# Merge primer and sequence files to create final bed file
ssh kolossus
cd /cluster/data/hg17/bed/sts/fix
nice /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final.new \
../primers/primers.final
# creates *_pos.rdb
/cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
awk '{print $6;}' stsMap.bed | sort -n | uniq > stsMap.ids
diff stsMap.ids filt1000.ids
# There is only 1 id that does not make it into this set (109375)
# There are 38 of the IDs to remove that do not appear in stsMap.ids
# therefore 65 of them do appear in stsMap.bed: noOoctoremoveinStsMap
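# noOoctoremoveinStsMap was presumably built along these lines (a
# sketch; comm needs lexicographically sorted input, so re-sort):
#   sort noOoc.IdsToRemove.txt > toRemove.lex
#   sort stsMap.ids > stsMap.ids.lex
#   comm -12 toRemove.lex stsMap.ids.lex > noOoctoremoveinStsMap
#   comm -23 toRemove.lex stsMap.ids.lex > noOocnotinstsMap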
foreach i (`cat noOoctoremoveinStsMap`)
awk 'BEGIN {OFS = "\t"} {if ($10 == "'$i'" && $8 >= 1000) \
print $14, $16, $17, $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl \
>> stsMap.noOoc.toRemove.coords
end
sort stsMap.noOoc.toRemove.coords > stsMap.noOoc.toRemove.coords.sort
wc -l stsMap.noOoc.toRemove.coords.sort
# 122
# get the equivalent co-ordinates from stsMap.bed
foreach i (`cat noOoctoremoveinStsMap`)
awk 'BEGIN {OFS = "\t"} {if ($6 == "'$i'") print $1,$2,$3,$6;}' \
stsMap.bed >> stsMap.toRemove.coords
end
sort stsMap.toRemove.coords > stsMap.toRemove.coords.sort
wc -l stsMap.toRemove.coords.sort
# 68
diff stsMap.noOoc.toRemove.coords stsMap.toRemove.coords.sort
# They are different co-ordinates in each set although the same ID
# is represented.
# check whether any noOoc alignments are already in stsMarkers.lifted.psl:
cp ../stsMarkers.lifted.psl stsMarkers.lifted.psl
awk '{print $10}' stsMarkers.lifted.psl | sort -n | uniq > sts.liftedpsl.ids
# none of the noOoc alignments are in stsMarkers.lifted.psl so add
# the filtered version
cp stsMarkers.lifted.psl stsMarkers.lifted.new.psl
cat stsMarkers.noOoc.lifted.filt1000.psl >> stsMarkers.lifted.new.psl
wc -l stsMarkers.lifted.new.psl
# 91890
awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo2.ids
# diff with filt1000.ids and noOoc.IdsToRemove.txt
# all of these are in stsInfo2.bed
# need to remove info for the filtered-out set, but only for the 38
# ids that were removed from stsMap.bed (list: noOocnotinstsMap)
# create removeById.pl (below) and use it to remove those ids:
cat << '_EOF_' > removeById.pl
#!/usr/bin/perl -w
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of ids to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file from which lines with those ids are removed
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removedIds.txt") || die "Can not create removedIds.txt: $!\n";
my %idsHash;
while (<IDS>) {
chomp;
my @a = split(/\t/);
my $id = $a[0];
$idsHash{$id} = 1;
}
close IDS;
while (<FILE>) {
my $l = $_;
my $found = "FALSE";
my @f = split(/\t/, $l);
foreach my $k (keys(%idsHash)) {
# if the id is contained in the key
if ($k eq $f[0]) {
$found = "TRUE";
print OUT "$f[0]\n";
}
}
if ($found eq "FALSE") {
print $l;
}
}
'_EOF_'
# << emacs
chmod +x removeById.pl
perl removeById.pl noOocnotinstsMap stsInfo2.bed > stsInfo2.new.bed
# this removed data for all 38 of these Ids from stsInfo2.bed
# need to reload database tables (2005-02-18, hartera)
ssh hgwdev
cd /cluster/data/hg17/bed/sts/fix
hgsql hg17 -e 'drop table stsMap'
hgsql hg17 -e 'drop table all_sts_seq'
hgsql hg17 -e 'drop table stsInfo2'
mv stsInfo2.new.bed stsInfo2.bed
cp stsInfo2.bed /cluster/data/ncbi/sts.9/stsInfo2.bed
mv stsMap.new.bed stsMap.bed
mv stsMarkers.lifted.new.psl stsMarkers.lifted.psl
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
hg17 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
cd ..
mkdir old
mv stsMap.bed stsInfo2.bed stsMarkers.lifted.psl ./old
mv ./fix/stsMap.bed ./fix/stsInfo2.bed ./fix/stsMarkers.lifted.psl .
# Update of stsAlias table (DONE, 2005-02-24, hartera)
# remove the ids of filtered-out alignments from stsAlias; it
# should then have the same IDs as stsInfo2
ssh eieio
cd /cluster/data/hg17/bed/sts/fix
awk '{print $2;}' ../stsAlias.bed | sort -n | uniq > alias.ids
# 145985 alias.ids
awk '{print $6;}' ../stsMap.bed | sort -n | uniq > stsMap.new.ids.sort
awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo.new.ids.sort
# 16678 ids in stsInfo2 that are not in stsMap
# 16717 ids in stsAlias that are not in stsMap
# 38 ids in stsAlias that are not in stsInfo2
cat stsMap.new.ids.sort stsInfo.new.ids.sort | sort -n | uniq \
> stsMapandInfo.ids.sort
diff stsMapandInfo.ids.sort alias.ids | grep '>' > idstoremoveAlias
# there are 38 of these IDs to remove
perl -pi.bak -e 's/> //' idstoremoveAlias
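# the diff/grep/perl cleanup above could also be done with comm (a
# sketch; comm wants plain lexicographic sort, so re-sort first;
# temp file names are placeholders):
#   sort stsMapandInfo.ids.sort > a.lex
#   sort alias.ids > b.lex
#   comm -13 a.lex b.lex > idstoremoveAlias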
cp ../stsAlias.bed .
foreach i (`cat idstoremoveAlias`)
awk '{if ($2 != "'$i'") print;}' stsAlias.bed > stsAlias.tmp
mv stsAlias.tmp stsAlias.bed
end
# check that ids are removed from file and that they are the correct ones
# all looks good
cd /cluster/data/hg17/bed/sts
# save old stsAlias file and copy new one to sts dir and to ncbi sts dir
mv stsAlias.bed ./old
cp ./fix/stsAlias.bed .
cp stsAlias.bed /cluster/data/ncbi/sts.9/stsAlias.bed
ssh hgwdev
# remove old table data and reload
hgsql hg17 -e 'delete from stsAlias'
hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# PRUNE stsMap RECORDS (DONE 3/3/06)
hgsql hg17 -e 'delete from stsMap where chromEnd-chromStart > 5000'
# RECOMBINATION RATES (2004-07-13 Terry)
# (2004-07-21 kate)
# The STS Markers track must be completed prior to creating this track
ssh eieio
cd /cluster/data/hg17/bed
mv recombRate recombRate.terry
mkdir -p recombRate
cd recombRate
# Copy other necessary files here (in future, can take from previous version)
# NOTE: these are stable, and could be saved in a permanent spot
cp /projects/hg2/booch/psl/info/decode_all .
cp /projects/hg2/booch/psl/info/marshfield_all .
cp /projects/hg2/booch/psl/info/genethon_all .
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.9/stsAlias.bed \
/cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.9/stsAlias.bed \
/cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.9/stsAlias.bed \
/cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
/cluster/data/hg17/chrom.sizes 1000000 1000000 \
> decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
/cluster/data/hg17/chrom.sizes 1000000 1000000 \
> genethon_1mb_slide_1mb
# Marker number 2 at position 120005974 on chr9 is out of genetic distance order. DISCARDING
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
/cluster/data/hg17/chrom.sizes 1000000 1000000 \
> marshfield_1mb_slide_1mb
# Marker number 1 at position 124276104 on chr9 is out of genetic distance order. DISCARDING
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
/cluster/data/hg17/inserts \
/cluster/data/hg17 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
/cluster/data/hg17/inserts \
/cluster/data/hg17 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
/cluster/data/hg17/inserts \
/cluster/data/hg17 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
> recombRate.bed
hgLoadBed -noBin -tab \
-sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
hg17 recombRate recombRate.bed
# FISH CLONES (DONE 2004-07-22 Kate)
# Reloaded 2004-09-36 after Terry Furey reworked fishClones.c
# to improve scoring
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to
# creating this track
ssh eieio
mkdir -p /cluster/data/ncbi/fishClones/fishClones.2004-07/
cd /cluster/data/ncbi/fishClones/fishClones.2004-07/
# Download information from NCBI
# point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
# Get current clone/accession information
wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
mkdir -p /cluster/data/hg17/bed/fishClones
cd /cluster/data/hg17/bed/fishClones
# Copy previous sts info from fhcrc (take from previous build in future)
cp ~booch/tracks/fish/fhcrc.sts .
fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin hg17 \
/cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
/cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
/cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl \
fishClones_initial
# Get sequences for accessions not in genome
ssh eieio
mkdir -p /cluster/bluearc/hg17/fishClones/
cd /cluster/bluearc/hg17/fishClones/
# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select file "/cluster/data/hg17/bed/fishClones/fishClones_initial.acc"
# change output to FASTA format
# download results to "/cluster/bluearc/hg17/fishClones/notFound.fa"
# Align these using blat
cp ~booch/tracks/gs.17/build34/fish/convert.pl .
cp ~booch/tracks/gs.17/build34/fish/blatAll.pl .
# edited to use ooc file on bluearc, so can run on kolossus
convert.pl < notFound.fa > notFound.convert.fa
mkdir out
blatAll.pl /cluster/data/hg17 notFound.convert.fa out
# creates raw.psl, not.found.psl
# Make final fishClones file with this new clone placement info
cd /cluster/data/hg17/bed/fishClones
fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin \
-psl=/cluster/bluearc/hg17/fishClones/not.found.psl hg17 \
/cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
/cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
/cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl fishClones
# Load the track
ssh hgwdev
cd /cluster/data/hg17/bed/fishClones
hgLoadBed -noBin -tab \
-sqlTable=/cluster/home/kent/src/hg/lib/fishClones.sql \
hg17 fishClones fishClones.bed
# Loaded 10601 elements of size 16
# fixed bad table entry (2004-08-12 kate)
# NOTE: this won't be necessary in the future, as the fishClones program
# will now accommodate more bad input data.
hgsql hg17 -e "update fishClones set bandEnds='1q43,Yp' where name='RP11-188A4' and placeCount=2"
# CHROMOSOME BANDS TRACK (2004-07-13 Terry)
# This must wait until the Fish Clones track is done
mkdir -p /cluster/data/hg17/bed/cytoband
cd /cluster/data/hg17/bed/cytoband
# Copy in some necessary files (usually from previous version)
cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
/cluster/data/hg17/inserts /cluster/data/hg17 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt /cluster/data/hg17 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed > cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
/cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt
# Create bed file
/cluster/bin/scripts/runBander fishClones.txt \
ISCN800.txt setBands.txt /cluster/data/hg17
# Should be 862 bands
wc cytobands.bed
# 862 4310 30748 cytobands.bed
# Load track
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
hg17 cytoBand cytobands.bed
# Load ideogram table
hgLoadBed -noBin -tab -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
hg17 cytoBandIdeo cytobands.bed
# CHROMOSOME BANDS TRACK REDO (2004-07-22 Kate)
# Just to make sure we know the proper steps.
# The tables were not reloaded, as Terry has already
# sent the data to NCBI
# This must wait until the Fish Clones track is done
ssh kolossus
mkdir -p /cluster/data/hg17/bed/cytoband.kate
cd /cluster/data/hg17/bed/cytoband.kate
# Copy in some necessary files (usually from previous version)
cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
/cluster/data/hg17/inserts /cluster/data/hg17 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt \
/cluster/data/hg17 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed \
> cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
/cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt
# Create bed file
ssh eieio
cd /cluster/data/hg17/bed/cytoband.kate
/cluster/bin/scripts/runBander fishClones.txt \
ISCN800.txt setBands.txt /cluster/data/hg17
# NOTE: fails on kolossus (C++ compiler different ??)
# Should be 862 bands
wc -l cytobands.bed
# 862 cytobands.bed
# NOTE - don't load tracks, as Terry has already sent his
# versions to NCBI
# Load track
#hgLoadBed -noBin -tab \
# -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
# hg17 cytoBand cytobands.bed
# Load ideogram table
#hgLoadBed -noBin -tab \
# -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
# hg17 cytoBandIdeo cytobands.bed
# LOAD AFFYRATIO (DONE - 2004-07-14 - Hiram)
# Copied from Hg16 doc
# Set up cluster job to align consensus/exemplars to hg17
ssh eieio
mkdir /cluster/bluearc/hg17/affyGnf
cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /cluster/bluearc/hg17/affyGnf
ssh kkr1u00
mkdir -p /iscratch/i/affyGnf
cp -p /cluster/bluearc/hg17/affyGnf/* /iscratch/i/affyGnf
/cluster/bin/iSync
ssh kki
mkdir /cluster/data/hg17/bed/affyGnf.2004-06-09
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
ls -1 /iscratch/i/affyGnf/* > affy.lst
ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 allctg.lst affy.lst template.sub jobList
mkdir psl
para create jobList
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 2922s 48.70m 0.81h 0.03d 0.000 y
# IO & Wait Time: 1146s 19.10m 0.32h 0.01d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 80s 1.33m 0.02h 0.00d
# Submission to last job: 333s 5.55m 0.09h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
ssh eieio
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences: only use alignments
# that cover 30% of the sequence and have at least 95% identity in
# the aligned region.
# minAli=0.97 is too high; use a low minCover since there are a lot
# of N's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Eliminate the long names
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
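    # (optional check: the seds above should have removed all "U95Av2:"
    #  prefixes and semicolons, so expect zero matches)
    grep -c "U95Av2:" affyU95shortQname.psl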
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \
affyU95shortQname.psl \
/projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \
affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg17 \
affyRatio affyRatio.bed
# Loaded 12740 elements of size 15
mkdir affyU95
hgLoadPsl hg17 -table=affyU95 affyU95shortQname.psl
# sequences loaded 2004-08-06
hgLoadSeq -abbr=U95Av2: hg17 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# 12386 sequences
# Updating seq table
# Advisory lock has been released
# All done
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set. (DONE - 2004-07-14 - Hiram)
ssh kk
mkdir /cluster/data/hg17/bed/affyUclaNorm
cd /cluster/data/hg17/bed/affyUclaNorm
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
ls -1 /scratch/hg/gs.18/build35/maskedContigs/* > contig.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
mkdir psl
ls HG-U133AB_all.fa > affy.lst
gensub2 contig.lst affy.lst gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 20070s 334.51m 5.58h 0.23d 0.001 y
# IO & Wait Time: 162784s 2713.06m 45.22h 1.88d 0.005 y
# Average job time: 481s 8.02m 0.13h 0.01d
# Longest job: 735s 12.25m 0.20h 0.01d
# Submission to last job: 771s 12.85m 0.21h 0.01d
ssh eieio
cd /cluster/data/hg17/bed/affyUclaNorm
pslSort dirs hg17.affyU133AB_all.psl tmp psl
wc hg17.affyU133AB_all.psl
# 61022 1281401 12934919 hg17.affyU133AB_all.psl
liftUp hg17.affyU133AB_all.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn hg17.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \
-nearTop=0.005 hg17.affyU133AB_all.lifted.psl \
hg17.affyU133AB_all.lifted.pslReps.psl out.psr
# Processed 61017 alignments
affyUclaMergePslData -pslFile=hg17.affyU133AB_all.lifted.pslReps.psl \
-affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \
-bedOut=hg17.affyUcla.bed \
-expRecordOut=hg17.affyUcla.expRecords \
-expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg17.affyUcla.expRecords \
/projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg17.affyUcla.annotations.expRecords
# Load the databases
ssh hgwdev
cd /cluster/data/hg17/bed/affyUclaNorm
sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql \
> affyUclaNorm.sql
hgLoadBed hg17 affyUclaNorm hg17.affyUcla.bed -sqlTable=affyUclaNorm.sql
# MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2004-07-15 - Hiram)
# Someday the names can be fixed.
ssh hgwdev
mkdir /cluster/data/hg17/bed/affyU133
cd /cluster/data/hg17/bed/affyU133
ln -s ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl affyU133.psl
hgLoadPsl hg17 affyU133.psl
# hgsql -e "select count(*) from affyU133;" hg17
# row count in hg16: 45693, in hg17: 44620
hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# 44792 sequences
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & FUGU (DONE 2004-06-10 kate)
# In an email 2/13/04 to Angie, Arian said we could treat all
# human repeats as
# lineage-specific for human-chicken blastz.
# and Angie did the same for fugu.
# Lacking input from Arian, and using blastzSelf as a model,
# I'm also using all human repeats for the human/chimp blastz.
# Scripts expect *.out.spec filenames.
ssh kkr1u00
cd /cluster/data/hg17
mkdir /iscratch/i/hg17/linSpecRep.chicken
foreach f (/iscratch/i/hg17/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/hg17/linSpecRep.chicken/$f:t:r:r.out.spec
end
ln -s /iscratch/i/hg17/linSpecRep.chicken \
/iscratch/i/hg17/linSpecRep.fugu
ln -s /iscratch/i/hg17/linSpecRep.chicken \
/iscratch/i/hg17/linSpecRep.chimp
iSync
# BLASTZ FUGU (FR1) (DONE 2004-06-24 kate)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.fr1.2004-06-10
ln -s /cluster/data/hg17/bed/blastz.fr1.2004-06-10 \
/cluster/data/hg17/bed/blastz.fr1
cd /cluster/data/hg17/bed/blastz.fr1
# Set L=6000 (more relaxed than chicken) and abridge repeats.
# Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.fugu
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.fr1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash # if a csh/tcsh user
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 11935 jobs
para try
para check
para push
# Completed: 11935 of 11935 jobs
# CPU time in finished jobs: 4673316s 77888.60m 1298.14h 54.09d 0.148 y
# IO & Wait Time: 329249s 5487.48m 91.46h 3.81d 0.010 y
# Average job time: 419s 6.99m 0.12h 0.00d
# Longest job: 714s 11.90m 0.20h 0.01d
# Submission to last job: 5575s 92.92m 1.55h 0.06d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg17/bed/blastz.fr1
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 341 jobs
para try
para check
para push
# CPU time in finished jobs: 315s 5.26m 0.09h 0.00d 0.000 y
# IO & Wait Time: 4451s 74.18m 1.24h 0.05d 0.000 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest job: 107s 1.78m 0.03h 0.00d
# Submission to last job: 368s 6.13m 0.10h 0.00d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg17/bed/blastz.fr1
mkdir axtChrom pslChrom run.2
cd run.2
cat << 'EOF' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/hg17/bothMaskedNibs /iscratch/i/fr1/nib stdout \
| axtSort stdin ../../axtChrom/$chr.axt
axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'EOF'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
# 41 jobs
para try
para check
para push
# CHAIN FUGU BLASTZ (2004-06-11 kate)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.fr1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.fr1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
sed 's/ */\t/g' temp.gap > ../../fuguHumanTuned.gap
rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../fuguHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/fr1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# 46 jobs
para try
para check
para push
# 1 crashed job -- chr6_hla_hap1.chain is empty
# CPU time in finished jobs: 610s 10.16m 0.17h 0.01d 0.000 y
# IO & Wait Time: 1644s 27.40m 0.46h 0.02d 0.000 y
# Average job time: 50s 0.83m 0.01h 0.00d
# Longest job: 233s 3.88m 0.06h 0.00d
# Submission to last job: 339s 5.65m 0.09h 0.00d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
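    # (optional consistency check: the split files should together hold
    #  the same number of chain records as all.chain)
    grep -c "^chain" all.chain
    cat chain/*.chain | grep -c "^chain"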
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainFr1 $i
end
featureBits hg16 chainFr1Link
# 50709290 bases of 2865248791 (1.770%) in intersection
# ANCIENT REPEAT TABLE (2004-06-11 kate)
# The netClass operations requires an "ancientRepeat" table in one
# of the databases.
# This is a hand curated table obtained from Arian.
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/ancientRepeat
cd /cluster/data/hg17/bed/ancientRepeat
# mysqldump needs write permission to this directory
chmod 777 .
hgsqldump --all --tab=. hg15 ancientRepeat
chmod 775 .
hgsql hg17 < ancientRepeat.sql
echo "LOAD DATA LOCAL INFILE 'ancientRepeat.txt' into table ancientRepeat"\
| hgsql hg17
# NET FUGU BLASTZ (2004-06-11 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netClass noClass.net hg17 fr1 human.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netFr1 stdin
#netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyFr1 stdin
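    # (optional: measure net coverage with featureBits, as done for the
    #  chains above)
    featureBits hg17 netFr1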
# EXTRACT AXT'S AND MAF'S FROM THE NET (kate)
# NOTE: Redo 2005-08-16 to fix overlap problem (use 8/05 netToAxt)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netSplit human.net humanNet
mkdir -p ../axtNet ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/fr1/nib stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/fr1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=fr1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/fr1
# FUGU FR1 DOWNLOADS (DONE 2004-09-17 kate)
# REDO axtNet downloads for fix, above (2005-09-12 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
ln -s all.chain fugu.chain
mkdir gz
gzip -c fugu.chain > gz/fugu.chain.gz
gzip -c human.net > gz/fugu.net.gz
cd ../axtNet
nice gzip *.axt
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.fr1/axtNet
gzip *.axt
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsFr1
cd vsFr1
# Copy and edit README
cp /cluster/data/hg17/bed/blastz.fr1/axtChain/gz/*.gz .
md5sum *.gz > md5sum.txt
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.fr1/axtNet .
# PRODUCE FUGU BLAT ALIGNMENT (DONE - 2004-07-07 - Hiram)
# Use masked scaffolds from fr1 assembly (same sequence as
# previous BlatFugu, however it's repeat and TRF-masked).
ssh kk
mkdir /cluster/data/hg17/bed/blatFr1
cd /cluster/data/hg17/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f ( `cat /cluster/data/hg17/contig.lst` )
set c=$f:t:r
echo $c
mkdir psl/$c
end
# create cluster job
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg17/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << keep emacs happy
gensub2 human.lst fugu.lst gsub jobList
    para create jobList
# 219640 jobs
para try
para check
para push -maxQueue=300000 -maxPush=220000
para check
# Completed: 219640 of 219640 jobs
# CPU time in finished jobs: 5206945s 86782.41m 1446.37h 60.27d 0.165 y
# IO & Wait Time: 797791s 13296.52m 221.61h 9.23d 0.025 y
# Average job time: 27s 0.46m 0.01h 0.00d
# Longest job: 951s 15.85m 0.26h 0.01d
# Submission to last job: 7553s 125.88m 2.10h 0.09d
# cd psl
    # count files with alignments
# find . -not -size 427c | wc -l
# 44558
    # count files with no alignments
# find . -size 427c | wc -l
# 175463
# When cluster run is done, sort alignments
# into chrom directory
ssh eieio
cd /cluster/data/hg17/bed/blatFr1
pslCat -dir psl/N?_?????? | \
liftUp -type=.psl stdout \
/cluster/data/hg17/jkStuff/liftAll.lft warn stdin | \
pslSortAcc nohead chrom temp stdin
# 65 minutes ?
# Processed 216595 lines into 1 temp files
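    # (optional check: pslSortAcc nohead writes no headers, so a line
    #  count of the per-chrom files should match the 216595 rows above)
    cat chrom/*.psl | wc -l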
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg17/bed/blatFr1/chrom
foreach i (chr*.psl)
set r = $i:r
echo mv $i ${r}_blatFr1.psl
mv $i ${r}_blatFr1.psl
end
# lift fugu scaffolds to Fugu browser chrUn,
# so you can link to other browser. And don't need to load sequence
cd /cluster/data/hg17/bed/blatFr1
liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
hgLoadPsl -table=blatFr1 hg17 all.psl
# load of blatFr1 did not go as planned: 216595 record(s),
# 0 row(s) skipped, 3 warning(s) loading psl.tab
# featureBits hg17 blatFr1 refGene:CDS
# 13563544 bases of 2866216770 (0.473%) in intersection
# featureBits hg16 blatFr1 refGene:CDS
# 13547219 bases of 2865248791 (0.473%) in intersection
# featureBits hg15 blatFugu refGene:CDS
# 12427544 bases of 2866466359 (0.434%) in intersection
# BLASTZ RAT RN3 (DONE - 2004-06-14 - Hiram)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.rn3.2004-06-11
cd /cluster/data/hg17/bed
ln -s blastz.rn3.2004-06-11 blastz.rn3
cd blastz.rn3
cat << '_EOF_' > DEF
# rat vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/store5/gs.18/build35/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.rn3
source DEF
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
    # Completed: 41943 of 41943 jobs
    # CPU time in finished jobs: 15330421s 255507.02m 4258.45h 177.44d 0.486 y
    # IO & Wait Time: 673809s 11230.15m 187.17h 7.80d 0.021 y
    # Average job time: 382s 6.36m 0.11h 0.00d
    # Longest job: 4651s 77.52m 1.29h 0.05d
    # Submission to last job: 169197s 2819.95m 47.00h 1.96d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.rn3
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 1894s 31.56m 0.53h 0.02d 0.000 y
# IO & Wait Time: 6271s 104.52m 1.74h 0.07d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 131s 2.18m 0.04h 0.00d
# Submission to last job: 590s 9.83m 0.16h 0.01d
# Third cluster run to convert lav's to axt's
cd /cluster/data/hg17/bed/blastz.rn3
# The copy of this in mm4 was broken, fixed here
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y
# IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y
# Average job time: 168s 2.79m 0.05h 0.00d
# Longest job: 642s 10.70m 0.18h 0.01d
# Submission to last job: 642s 10.70m 0.18h 0.01d
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
mkdir pslChrom
set tbl = "blastzRn3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslChrom
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done: ${I}"
done
# this is a 55 minute job
# Check results
# featureBits hg16 blastzRn3
# 1013603401 bases of 2865248791 (35.376%) in intersection
# featureBits hg17 blastzRn3
# 1013003285 bases of 2866216770 (35.343%) in intersection
# CHAIN RN3 BLASTZ (DONE - 2004-06-14 - Hiram)
# re-worked with no 'axtFilter -notQ_random' on the axtChain step - 2004-06-23
# used to be: axtFilter -notQ_random $1 | axtChain stdin
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtChain/run1
cd /cluster/data/hg17/bed/blastz.rn3/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.rn3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 4645s 77.41m 1.29h 0.05d 0.000 y
# IO & Wait Time: 6840s 114.00m 1.90h 0.08d 0.000 y
# Average job time: 250s 4.16m 0.07h 0.00d
# Longest job: 1539s 25.65m 0.43h 0.02d
# Submission to last job: 3761s 62.68m 1.04h 0.04d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 36m42.170s
# user 4m55.970s
# sys 1m49.840s
time chainSplit chain all.chain
# real 13m54.860s
# user 4m50.370s
# sys 1m3.260s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainRn3 $i
echo done $c
end
# featureBits hg17 chainRn3
# 2827052992 bases of 2866216770 (98.634%) in intersection
# (with filter:) 2826192649 bases of 2866216770 (98.604%) in intersection
# featureBits hg16 chainRn3
# 2830563493 bases of 2865248791 (98.789%) in intersection
# NET RN3 (DONE - 2004-06-15 - Hiram)
# Re-done due to Chain being re-done 2004-06-23
# NOTE: Redo net axt's and net maf's to fix overlaps,
# (using 8/05 netToAxt). (2005-08-16 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2510467072, utime 19307 s/100, stime 3181
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
time netClass hNoClass.net hg17 rn3 rat.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInRat \
-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
# real 34m29.829s
# user 11m30.440s
# sys 1m52.730s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn rat.net > ratSyn.net
# real 16m25.640s
# user 7m41.330s
# sys 1m1.150s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
netFilter -minGap=10 rat.net | hgLoadNet hg17 netRn3 stdin
netFilter -minGap=10 ratSyn.net | hgLoadNet hg17 syntenyNetRn3 stdin
# real 37m0.199s
# user 15m13.770s
# sys 1m41.540s
# check results
# featureBits hg17 netRn3
# 2817656275 bases of 2866216770 (98.306%) in intersection
# (with axtFilter) 2816623107 bases of 2866216770 (98.270%) in intersection
# featureBits hg16 netRn3
# 2820958389 bases of 2865248791 (98.454%) in intersection
# featureBits hg17 syntenyNetRn3
# 2781748096 bases of 2866216770 (97.053%) in intersection
# (with axtFilter) 2780883450 bases of 2866216770 (97.023%) in intersection
# featureBits hg16 syntenyNetRn3
# 2784011730 bases of 2865248791 (97.165%) in intersection
# Add entries for net and chain to rat/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
mkdir ratNet
time netSplit rat.net ratNet
# real 12m1.478s
# user 8m35.050s
# sys 1m7.230s
# extract axts from net
mkdir ../axtNet ../mafNet
cat << 'EOF' > makeMaf.csh
foreach n (ratNet/chr*.net)
set c=$n:t:r
echo $c
netToAxt ratNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/rn3/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/rn3/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=rn3.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/rn3
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtBest
cd /cluster/data/hg17/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
gzip *.axt
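    # (optional: add an md5sum.txt as done for the other vs* download
    #  directories)
    md5sum *.axt.gz > md5sum.txt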
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestRn3.psl
echo "Done: ${c}_blastzBestRn3.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslBest
for I in chr*BestRn3.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# check results
# featureBits hg17 blastzBestRn3
# 975533772 bases of 2866216770 (34.036%) in intersection
# (with axtFilter) 970005525 bases of 2866216770 (33.843%) in intersection
# featureBits hg16 blastzBestRn3
# 976121391 bases of 2865248791 (34.068%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg17/axtBest/Rn3
cd /gbdb/hg17/axtBest/Rn3
ln -s /cluster/data/hg17/bed/blastz.rn3/axtNet/chr*.axt .
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Rn3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql
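    # (optional check: expect one rn3 row per axt file linked above)
    hgsql -e "select count(*) from axtInfo where species='rn3';" hg17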
# MAKING RAT SYNTENY (DONE - 2004-06-30 - Hiram)
# Re-Done after above done without the axtFilter
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyRn3
cd /cluster/data/hg17/bed/syntenyRn3
# Copy all the needed scripts from /cluster/data/hg16/bed/syntenyMm3
cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .
cp -p /cluster/data/hg16/bed/syntenyMm3/*.sh .
./syntenicBest.pl -db=hg17 -table=blastzBestRn3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestRn3
./synteny2bed.pl
# The five commands above
# real 196m2.565s
# user 0m21.170s
# sys 0m4.690s
# Used to load this in syntenyRn3, but that type is misleading to
# the table browser and fails the checkTableCoords check.
    # Better to use this ensRn3MusHom type:
sed -e 's/ensPhusionBlast/ensRn3MusHom/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRn3MusHom.sql
hgLoadBed hg17 ensRn3MusHom ucsc100k.bed -sqlTable=ensRn3MusHom.sql
# featureBits hg17 ensRn3MusHom
# 2592164486 bases of 2866216770 (90.439%) in intersection
# featureBits hg16 syntenyRn3
# 2595919851 bases of 2865248791 (90.600%) in intersection
# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2004-06-15 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
mkdir -p ../axtTight
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslTight
for I in chr*TightRn3.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# Compare results with previous assembly
# featureBits hg17 blastzTightRn3
# 153936720 bases of 2866216770 (5.371%) in intersection
# featureBits hg16 blastzTightRn3
# 153151903 bases of 2865248791 (5.345%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
    # REDO downloads with fixed axtNet's (2005-09-13 kate)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
mv axtNet axtNet.old
nice cp -rp /cluster/data/hg17/bed/blastz.rn3/axtNet .
cd axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
# BLASTZ RN3 CLEAN UP (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
# BLASTZ CHICKEN (GALGAL2) (DONE - 2004-06-14 - Fan)
ssh kk
mkdir /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
cd /cluster/data/hg17/bed
ln -s /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 blastz.galGal2
cd blastz.galGal2
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store5/gs.18/build35/bed/blastz.galGal2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.galGal2
bash
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
    # Completed: 41943 of 41943 jobs
    # CPU time in finished jobs: 15330421s 255507.02m 4258.45h 177.44d 0.486 y
    # IO & Wait Time: 673809s 11230.15m 187.17h 7.80d 0.021 y
    # Average job time: 382s 6.36m 0.11h 0.00d
    # Longest job: 4651s 77.52m 1.29h 0.05d
    # Submission to last job: 169197s 2819.95m 47.00h 1.96d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.galGal2
bash
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 1894s 31.56m 0.53h 0.02d 0.000 y
# IO & Wait Time: 6271s 104.52m 1.74h 0.07d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 131s 2.18m 0.04h 0.00d
# Submission to last job: 590s 9.83m 0.16h 0.01d
# Third cluster run to convert lav's to axt's
cd /cluster/data/hg17/bed/blastz.galGal2
# The copy of this in mm4 was broken, fixed here
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y
# IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y
# Average job time: 168s 2.79m 0.05h 0.00d
# Longest job: 642s 10.70m 0.18h 0.01d
# Submission to last job: 642s 10.70m 0.18h 0.01d
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2
mkdir pslChrom
set tbl = "blastzGalGal2"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2/pslChrom
bash
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done: ${I}"
done
    # GNF ATLAS 2 (DONE - 2004-07-14 - Hiram)
# Align probes from GNF1H chip.
ssh kk
cd /cluster/data/hg17/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
# This bluearc/geneAtlas2 directory already exists
# mkdir -p /cluster/bluearc/geneAtlas2
# cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.18/build35/maskedContigs > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.18/build35/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 10599s 176.65m 2.94h 0.12d 0.000 y
# IO & Wait Time: 3893s 64.88m 1.08h 0.05d 0.000 y
# Average job time: 38s 0.64m 0.01h 0.00d
# Longest job: 649s 10.82m 0.18h 0.01d
# Submission to last job: 663s 11.05m 0.18h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
contig.psl /dev/null
# Processed 80818 alignments
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneAtlas2
# Already symlinked
# ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \
# /gbdb/hgFixed/affyProbes
hgLoadPsl hg17 affyGnf1h.psl
hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl \
| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl /cluster/data/hg17/bed/geneAtlas2/affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 32857, multiply-mapped 1462, missed 49, unmapped 11839
hgLoadBed hg17 gnfAtlas2 gnfAtlas2.bed
# Loaded 34319 elements of size 15
    # LOAD SNPS (Daryl Thomas; November 7, 2004;
    #    snpExceptions added January 8, 2005;
    #    updated to build 124 on January 13, 2005;
    #    added affy snps March 5, 2005)
set db = hg17
set org = human
set build = 124
set dir = /cluster/bluearc/snp/$db/build$build
# ssh to some quiet machine with fast access to the bluearc
# it takes ~4.5 hours to download the data
# (build 124 directly to /cluster/bluearc/... from eieio)
# Check to make sure the chrMT file is included
mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
cd $dir
ln -s /cluster/data/$db/jkStuff/liftAll.lft .
screen
ftp ftp.ncbi.nih.gov
cd snp/$org/XML
prompt
mget ds_ch*.xml.gz
exit # screen
exit # machine
# TODO: check chromStart for each locType
cp -f {$HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
chmod 775 /cluster/bin/scripts/parseDbSnpXML
#ssh kk
touch jobList
foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
set out = $file:t:r
echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
end
# I removed ds_chMulti.xml.gz and ds_chNotOn.xml.gz from the job list
# para create jobList; para push; para check ...
#Completed: 25 of 25 jobs
#CPU time in finished jobs: 30120s 502.01m 8.37h 0.35d 0.001 y
#IO & Wait Time: 2533s 42.21m 0.70h 0.03d 0.000 y
#Average job time: 1306s 21.77m 0.36h 0.02d
#Longest job: 2611s 43.52m 0.73h 0.03d
#Submission to last job: 2611s 43.52m 0.73h 0.03d
exit # kk
    mv $dir /cluster/data/$db/bed/snp/build$build
set dir = /cluster/data/$db/bed/snp/build$build
cd $dir
# concatenate the details files to make it easier to lift (and load)
time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
# 33.380u 24.470s 1:54.79 50.3% 0+0k 0+0io 86pf+0w (hgwdev)
time gzip $db.build$build.contig.bed
# 251.160u 16.770s 12:40.77 35.2% 0+0k 0+0io 83pf+0w (hgwdev/bluearc - should have done it on eieio/store5)
# some of the NT contigs are not in the liftSpec - this is expected as snps that map to
# alternate assemblies (Celera) are in the original files, but we disregard their mappings.
time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
# 232.260u 30.050s 5:09.04 84.8% 0+0k 0+0io 379pf+0w (hgwdev/store5)
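    # (optional check: if liftUp skipped the alternate-assembly mappings
    #  noted above, the lifted file will have fewer rows than the input)
    zcat $db.build$build.contig.bed.gz | wc -l
    wc -l $db.build$build.bed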
time gzip hg17.build124.bed
# 141.980u 8.180s 2:34.43 97.2% 0+0k 0+0io 83pf+0w
# hgLoadBed is the important step - check to make sure there are no warnings
time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
# Loaded 9131054 elements of size 16
# 225.040u 37.030s 35:20.45 12.3% 0+0k 0+0io 308pf+0w
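    # (optional check: the table row count should match the element
    #  count reported by hgLoadBed)
    hgsql $db -e "select count(*) from snp"
    # expect 9131054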
# basic snp table is now loaded, but exception column needs to be updated
# ~ 3 hours wall clock time from here to end
# run queries from snpException.query against snp table
mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
cd /usr/local/apache/htdocs/qa/test-results/snpException/build$build
time snpException hg17 0 ${db}snpException > ${db}snpException.log
chmod o+rx .
chmod o+r *
# 10.610u 19.200s 53:59.98 0.9% 0+0k 0+0io 264pf+0w
# check alignment of flanking sequences
time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
# 5205.860u 216.570s 1:55:10.27 78.4% 0+0k 0+0io 72408pf+0w (hgwdev)
### NOTE: the pseudoautosomal snps are reported in the chrX files
### only, which causes problems for snpValid when checking the
### chrY snp mappings. I got around this by confirming that all
### of the 'missing flank' errors (#23) were in pseudoautosomal
### regions and ignoring them. I manually truncated the
### hg17snpException.23.bed file before continuing with the next
### step. This could/should be fixed in the next iteration.
# create list of statements to update the snp table and run them
time tail +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
# ~10 seconds
time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
# 7.250u 0.390s 0:07.87 97.0% 0+0k 0+0io 337pf+0w
time hgsql hg17 < updateExceptionList.sql
# 8.420u 10.370s 11:58.44 2.6% 0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process)
# 6.550u 9.370s 14:34.17 1.8% 0+0k 0+0io 413pf+0w build124
# > wc -l build12*/updateExceptionList.sql
# 387166 build123/updateExceptionList.sql
# 383759 build124/updateExceptionList.sql
# Add Affy SNPs from new submission
#!/bin/csh -fe
# rm -f log ; date ; ./loadAffySnps.csh > & log ; date ; cat log
set db = hg17
cd /cluster/data/$db/bed/snp/affy/latest
touch affy.txt affy.bed Affy.bed bed.tab
rm -f affy*.txt affy*.bed Affy.bed* bed.tab
# datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com]
tar xfz affyhg17maps_withstrand_alleles.tgz
wc -l affy*txt
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10K.txt > affy10K.bed
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt > affy10Kv2.bed
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n", $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt > affy50K_XbaI.bed
# this is a temporary kluge to fix some bad input data.
cat affy*.bed | sed 's/_par//' > Affy.bed
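    # (optional check, assuming "_par" occurs only in the bad names being
    #  fixed: expect zero matches after the sed)
    grep -c "_par" Affy.bed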
# the source enum for 'dbSnp' is 2; all of the affy* values are higher.
hgsql $db -e "delete from snp where source > 2 "
hgLoadBed $db snp Affy.bed -oldTable -tab
rm -f affy*.txt affy*.bed bed.tab
gzip Affy.bed
#mysql> select source, count(*) from hg17.snp group by source;
#+-----------------+----------+
#| source | count(*) |
#+-----------------+----------+
#| dbSnp | 9131054 |
#| Affy10K | 11344 |
#| Affy10Kv2 | 10032 |
#| Affy50K_HindIII | 56859 |
#| Affy50K_XbaI | 58494 |
#+-----------------+----------+
#March 7, 2005: fix pseudoautosomal snps:
#SNP_A-1606360
#SNP_A-1606329
#SNP_A-1666553
#SNP_A-1715750
#SNP_A-1726331
#SNP_A-1685712
#SNP_A-1735899
#SNP_A-1726272
#SNP_A-1660936
#SNP_A-1662285
#SNP_A-1680848
#SNP_A-1671440
#SNP_A-1719355
#SNP_A-1716499
#SNP_A-1643847
#SNP_A-1646007
#SNP_A-1715285
#SNP_A-1657714
#SNP_A-1725038
#SNP_A-1713938
#SNP_A-1708565
#SNP_A-1510243
#SNP_A-1510197
#SNP_A-1606356
delete from snp
where chrom = 'chrY'
and name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');
update snp
set chrom = 'chrX'
where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');
insert into snp
select bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand,
observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception
from snp
where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');
select chrom, count(*)
from snp
where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356')
    group by chrom;
## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
# Data from Rachel Karchin in the Andrej Sali lab at UCSF
# /cluster/data/hg17/bed/lssnp
hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
mysql> load data local infile "snp-human3-function-predictions.txt" into table lsSnpFunction;
Query OK, 24337 rows affected (1.27 sec)
mysql> load data local infile "snp-human3-structure-predictions.txt" into table lsSnpStructure;
Query OK, 34764 rows affected (2.36 sec)
# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]
# get data from ftp site, unpack in $dir:
# tar tvfz *gz | more
# -rw-r--r-- chris/admin 34405061 2005-06-03 13:22:15 AD.SNP.track
# -rw-r--r-- chris/admin 29869512 2005-06-03 13:22:30 ED.SNP.track
# -rw-r--r-- chris/admin 27154049 2005-06-03 13:22:41 XD.SNP.track
# -rw-r--r-- chris/admin 10948753 2005-06-02 21:12:27 AD.tajd.track
# -rw-r--r-- chris/admin 10928630 2005-06-02 21:12:39 ED.tajd.track
# -rw-r--r-- chris/admin 10926122 2005-06-02 21:12:51 XD.tajd.track
set db=hg17
set dir=/cluster/data/$db/bed/tajdpoly/latest
cd $dir
tar xvfz TajDtracks.tar.gz
mac2unix < AD.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpAd.bed
mac2unix < ED.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpEd.bed
mac2unix < XD.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpXd.bed
mac2unix < AD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdAd.bedGraph
mac2unix < ED.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdEd.bedGraph
mac2unix < XD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdXd.bedGraph
set chain = /cluster/data/hg17/bed/bedOver/hg17ToHg16.over.chain
foreach pop (Ad Ed Xd)
liftOver hg17.tajdSnp$pop.bed $chain hg16.tajdSnp$pop.bed hg17ToHg16.tajdSnp$pop.unmapped
liftOver hg17.tajd$pop.bedGraph $chain hg16.tajd$pop.bedGraph hg17ToHg16.tajd$pop.unmapped
foreach db (hg16 hg17)
hgLoadBed -bedGraph=4 $db tajd$pop $db.tajd$pop.bedGraph
hgLoadBed $db tajdSnp$pop $db.tajdSnp$pop.bed
end
end
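    # (optional check: liftOver writes records it could not map to the
    #  .unmapped files; count them to see how much was lost)
    wc -l hg17ToHg16.tajd*.unmapped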
set where1 = "where t.bin=g.bin and t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
set where2 = "t, chromInfo c where t.chromStart < 0 or (t.chrom=c.chrom and t.chromEnd > c.size)"
set list = "as pop, t.chrom, t.chromStart from"
foreach db (hg16 hg17)
rm -f $db.delete.sql
touch $db.delete.sql
foreach p (Ad Ed Xd SnpAd SnpEd SnpXd)
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo "select 'tajd$p' $list tajd${p} t,chr${c}_gap g $where1" | \
hgsql $db | \
grep -v pop | \
awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}' \
>> $db.delete.sql
end
echo "select 'tajd$p' $list tajd${p} $where2" | \
hgsql $db | \
grep -v pop | \
awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}'\
>> $db.delete.sql
end
hgsql $db < $db.delete.sql
end
# GENE SORTER (AKA: FAMILY BROWSER) (DONE - 2004-06-16 - Hiram)
# Added knownToU133Plus2 track (2004-10-14)
# to be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2004-06-15
ln -s /cluster/data/hg17/bed/geneSorter.2004-06-15 \
/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
/cluster/bluearc/hg17/blastp
# Had to pick up a new blastall binary (2004-06-15)
# Our old one would no longer run on our systems that have
# updated Linux versions
mkdir /cluster/bluearc/blast229
cd /cluster/bluearc/blast229
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/blast-2.2.9-ia32-linux.tar.gz
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ChangeLog.txt
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ReleaseNotes.txt
tar xvzf blast-2.2.9-ia32-linux.tar.gz
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
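    # (optional check: faSplit aims for about 8000 pieces; the actual
    #  count, 7749 here, shows up in the para stats below)
    ls split | wc -l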
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
    # a plain 'ls ../../split/*.fa' would overflow the command line,
    # hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
    # Completed: 7749 of 7749 jobs
    # CPU time in finished jobs: 182148s 3035.81m 50.60h 2.11d 0.006 y
    # IO & Wait Time: 22954s 382.56m 6.38h 0.27d 0.001 y
    # Average job time: 26s 0.44m 0.01h 0.00d
    # Longest job: 372s 6.20m 0.10h 0.00d
    # Submission to last job: 871s 14.52m 0.24h 0.01d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7749 files
# Loading database with 11799667 rows
# Hg16 was: 11376875 rows
# real 30m10.761s
# user 5m25.490s
# sys 1m0.630s
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed from 36078 in Hg16 to 36082
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count went from 36078 in Hg16 to 36082
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count dropped from 30467 in Hg16 to 29725
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
    # row count dropped from 35817 in Hg16 to 35739
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2
# Got 35739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count went from 35,817,000 in Hg16 to 35,739,000
# real 108m1.671s
# user 89m30.680s
# sys 3m6.800s
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count went from 37,634 in Hg16 to 36,795
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133
    # 211 genes, 42 weights, 26.500000 total weight
# Got 36795 unique elements in affyUclaNorm
# real 154m1.058s
# user 134m45.000s
# sys 3m1.990s
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count went from 18780 in Hg16 to 18796
    # the hgFixed.gnfHumanU95Exps argument is unused and need not exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 17711000 in Hg16 to 17710000
# real 21m37.703s
# user 13m35.110s
# sys 0m28.470s
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 9756 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# create table mapping knownGenes to affyU133Plus2 table (2004-10-14, hartera)
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# Make sure that GO database is up to date.
    # See README in /cluster/store1/geneOntology.
# I update this GO database very carefully, checking that all
# structures in it remain the same from release to release and
# backing up the current go DB in a backup database. In this case
# the backup is go040107 - when it was loaded for Mm4, and the new
# go database is based on data from Dec 17th 2003 and Feb 2004 according
# to the time stamp on the fetched data. This build was done in
# /cluster/store1/geneOntology/20040217
cd /cluster/data/hg17/bed/geneSorter
    # XXX - DO NOT YET HAVE ensGene table - must wait on Ensembl to release that
    # XXX - have not created the knownToEnsembl table yet - 2004-07-15 - Hiram
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp table (DONE Nov 11, 2004, Heather)
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 165728
# unique 34013
# approx. 5 minutes running time
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/ce1/blastp should have data
# The blast jobs below can be run on the kk or kk9 clusters
# Create the ceBlastTab
ssh kk9
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Only takes 10 minutes on an idle cluster
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 32023s 533.72m 8.90h 0.37d 0.001 y
# IO & Wait Time: 20643s 344.05m 5.73h 0.24d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 110s 1.83m 0.03h 0.00d
# Submission to last job: 1911s 31.85m 0.53h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# row count went from 27620 to 27616
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm5.doc for procedure
# the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 139041s 2317.34m 38.62h 1.61d 0.004 y
# IO & Wait Time: 21227s 353.79m 5.90h 0.25d 0.001 y
# Average job time: 21s 0.34m 0.01h 0.00d
# Longest job: 260s 4.33m 0.07h 0.00d
# Submission to last job: 1137s 18.95m 0.32h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# row count went from 36471 to 36638
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
#Completed: 7749 of 7749 jobs
#CPU time in finished jobs: 31035s 517.25m 8.62h 0.36d 0.001 y
#IO & Wait Time: 38472s 641.20m 10.69h 0.45d 0.001 y
#Average job time: 9s 0.15m 0.00h 0.00d
#Longest job: 75s 1.25m 0.02h 0.00d
#Submission to last job: 169s 2.82m 0.05h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7749 files
#Loading database with 25574 rows
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dr1
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 100217s 1670.28m 27.84h 1.16d 0.003 y
# IO & Wait Time: 23697s 394.95m 6.58h 0.27d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 233s 3.88m 0.06h 0.00d
# Submission to last job: 1667s 27.78m 0.46h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# row count went from 32971 to 33023
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 20738s 345.64m 5.76h 0.24d 0.001 y
# IO & Wait Time: 22018s 366.96m 6.12h 0.25d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest job: 39s 0.65m 0.01h 0.00d
# Submission to last job: 572s 9.53m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# row count went from 18286 to 18265
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 82022s 1367.03m 22.78h 0.95d 0.003 y
# IO & Wait Time: 21982s 366.37m 6.11h 0.25d 0.001 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 174s 2.90m 0.05h 0.00d
# Submission to last job: 1439s 23.98m 0.40h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# row count went from 29322 to 29341
#### Blat knownGene proteins to determine exons (braney 2004-06-20 DONE)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2004-06-20
    rm -f blat.hg17KG
ln -s blat.hg17KG.2004-06-20 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
hgPepPred hg17 generic blastKGPep00 known.fa
grep ">" known.fa | sed "s/>//" > kgName.lst
kgName hg17 kgName.lst blastKGRef00
    hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef00" | hgsql hg17
echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg17
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3000 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 133676 of 133676 jobs
# CPU time in finished jobs: 29661130s 494352.16m 8239.20h 343.30d 0.941 y
# IO & Wait Time: 2181179s 36352.99m 605.88h 25.25d 0.069 y
# Average job time: 238s 3.97m 0.07h 0.00d
# Longest job: 105972s 1766.20m 29.44h 1.23d
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
# BLASTZ MM4 (DONE - 2004-06-22 - Hiram)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.mm4.2004-06-21
cd /cluster/data/hg17/bed
ln -s blastz.mm4.2004-06-21 blastz.mm4
cd blastz.mm4
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 43648 of 43648 jobs
# CPU time in finished jobs: 16448001s 274133.36m 4568.89h 190.37d 0.522 y
# IO & Wait Time: 751666s 12527.76m 208.80h 8.70d 0.024 y
# Average job time: 394s 6.57m 0.11h 0.00d
# Longest job: 8323s 138.72m 2.31h 0.10d
# Submission to last job: 44244s 737.40m 12.29h 0.51d
    # Second cluster run to lift the raw alignments into lav files.
    # Running it on the big cluster brings the file server to its
    # knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 3925s 65.42m 1.09h 0.05d 0.000 y
# IO & Wait Time: 6208s 103.46m 1.72h 0.07d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest job: 289s 4.82m 0.08h 0.00d
# Submission to last job: 2800s 46.67m 0.78h 0.03d
# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 2389s 39.82m 0.66h 0.03d 0.000 y
# IO & Wait Time: 13374s 222.90m 3.71h 0.15d 0.000 y
# Average job time: 350s 5.84m 0.10h 0.00d
# Longest job: 1426s 23.77m 0.40h 0.02d
# Submission to last job: 1440s 24.00m 0.40h 0.02d
# chr19 failing due to out of memory. Run this job individually
# on kolossus, adjusting the location of the nib directories:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm4
sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
x86_64-chromlav2axt
chmod +x x86_64-chromlav2axt
time ./x86_64-chromlav2axt \
/cluster/data/hg17/bed/blastz.mm4/lav/chr19 \
/cluster/data/hg17/bed/blastz.mm4/axtChrom/chr19.axt \
/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
/cluster/bluearc/scratch/mus/mm4/softNib
# real 24m28.955s
# user 6m40.990s
# sys 1m16.500s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
mkdir -p pslChrom
set tbl = "blastzMm4"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
    # This takes more than an hour. To shorten it, change the command
    # above to a simple echo, collect the output in a file, split that
    # file into four parts, and run the four parts as shell scripts on
    # eieio so four processes run at once (a sketch follows). Load on
    # eieio gets up to about 20, which is reasonable.
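    # A sketch of that four-way split (job file names are arbitrary):
    foreach f (axtChrom/chr*.axt)
        set c=$f:t:r
        echo "/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl" \
            >> axtToPsl.jobs
    end
    split -l 12 axtToPsl.jobs axtToPslJob.
    foreach j (axtToPslJob.*)
        csh $j >& $j.log &
    end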
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslChrom
bash
for F in chr*_blastzMm4.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
echo "${F} done"
done
# this is a 55 minute job
# exit bash if you are tcsh
    # featureBits on blastzMm3 or blastzMm4 will not work on hgwdev; it
    # runs out of memory. But if you reset your ~/.hg.conf to use the
    # read-only user and point it at the hgwdev database host, you can
    # run the x86_64 featureBits elsewhere (a sketch of such a
    # ~/.hg.conf follows the numbers below):
# featureBits hg16 blastzMm4
# 1056761609 bases of 2865248791 (36.882%) in intersection
# featureBits hg17 blastzMm4
# 1056201417 bases of 2866216770 (36.850%) in intersection
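    # A minimal ~/.hg.conf sketch for that read-only setup (host, user,
    # and password below are placeholders, not real credentials):
    #   db.host=hgwdev
    #   db.user=hguser
    #   db.password=XXXXXXXX
    # keep it private: chmod 600 ~/.hg.conf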
# CHAIN MM4 BLASTZ (DONE - 2004-06-29 - Hiram)
# redone with the 'axtFilter -notQ_random' removed - 2004-06-23
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kk9
mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
cd /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.mm4/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/mm4/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 45 of 46 jobs
# CPU time in finished jobs: 6575s 109.58m 1.83h 0.08d 0.000 y
# IO & Wait Time: 9274s 154.57m 2.58h 0.11d 0.000 y
# Average job time: 352s 5.87m 0.10h 0.00d
# Longest job: 3121s 52.02m 0.87h 0.04d
# Submission to last job: 3121s 52.02m 0.87h 0.04d
# one job wouldn't finish due to memory usage
# run the chr19 job on kolossus, takes an hour, gets up to 4 Gb
# memory usage
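    # A sketch of that manual chr19 run (nib paths follow the kolossus
    # convention used for the chr19 lav->axt job above; illustration
    # only, not the recorded command):
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
    axtChain ../../axtChrom/chr19.axt \
        /cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
        /cluster/bluearc/scratch/mus/mm4/softNib \
        chain/chr19.chain > out/chr19.out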
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 17m17.639s
# user 9m54.240s
# sys 1m31.210s
# (1.9 Gb result file !)
time chainSplit chain all.chain
# real 27m32.278s
# user 9m46.970s
# sys 2m45.960s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainMm4 $i
echo done $c
end
# featureBits hg17 chainMm4
# 2829135227 bases of 2866216770 (98.706%) in intersection
# featureBits hg16 chainMm4
# 2828363353 bases of 2865248791 (98.713%) in intersection
# NET MM4 (DONE - 2004-06-29 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2504171520, utime 19373 s/100, stime 5906
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
time netClass hNoClass.net hg17 mm4 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman
# real 19m33.421s
# user 10m37.130s
# sys 1m45.630s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn mouse.net > mouseSyn.net
# real 13m24.885s
# user 7m37.100s
# sys 1m5.760s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg17 netMm4 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm4 stdin
# real 44m20.735s
# user 15m58.620s
# sys 1m58.720s
# check results
# featureBits hg17 netMm4
# 2824272033 bases of 2866216770 (98.537%) in intersection
# featureBits hg16 netMm4
# 2823565051 bases of 2865248791 (98.545%) in intersection
# featureBits hg17 syntenyNetMm4
# 2785830955 bases of 2866216770 (97.195%) in intersection
# featureBits hg16 syntenyNetMm4
# 2786960572 bases of 2865248791 (97.268%) in intersection
# Add entries for net and chain to mouse/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 12m1.478s
# user 8m35.050s
# sys 1m7.230s
# extract axt's from net, and convert to maf's (DONE - Kate - 2004-06-24)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
foreach f (mouseNet/chr*.net)
set c = $f:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/mm4/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/mm4/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm4.
echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
end
'_EOF_'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtBest
cd /cluster/data/hg17/bed/blastz.mm4/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm4.psl
echo "Done: ${c}_blastzBestMm4.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslBest
for I in chr*BestMm4.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# check results
# featureBits hg17 blastzBestMm4
# 1017319919 bases of 2866216770 (35.493%) in intersection
# featureBits hg16 blastzBestMm4
# 996722004 bases of 2865248791 (34.787%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg17/axtBest/Mm4
cd /gbdb/hg17/axtBest/Mm4
ln -s /cluster/data/hg17/bed/blastz.mm4/axtNet/chr*.axt .
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Mm4/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql
# MAKING MOUSE SYNTENY (DONE - 2004-06-29 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm4
cd /cluster/data/hg17/bed/syntenyMm4
    # Copy all the needed scripts from /cluster/data/hg17/bed/syntenyRn3
    # (they originally came from /cluster/data/hg16/bed/syntenyMm3)
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
./syntenicBest.pl -db=hg17 -table=blastzBestMm4
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm4
./synteny2bed.pl
# The five commands above
# real 220m16.227s
# user 0m22.940s
# sys 0m3.960s
    # Used to load this as syntenyMm4, but that type is misleading to
    # the table browser and fails the checkTableCoords check.
    # Better to use the ensPhusionBlast type, renamed here to
    # ensRatMm4Hom so the Mm4 table does not conflict with Rn3:
sed -e 's/ensPhusionBlast/ensRatMm4Hom/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRatMm4Hom.sql
hgLoadBed hg17 ensRatMm4Hom ucsc100k.bed -sqlTable=ensRatMm4Hom.sql
# featureBits hg17 ensRatMm4Hom
# 2549307611 bases of 2866216770 (88.943%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection
# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-06-29 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
mkdir -p ../axtTight
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslTight
for I in chr*TightMm4.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# Compare results with previous assembly:
# featureBits hg17 blastzTightMm4
# 166569246 bases of 2866216770 (5.811%) in intersection
# featureBits hg16 blastzTightMm4
# 162641577 bases of 2865248791 (5.676%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# BLASTZ MM4 CLEAN UP (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
# BLASTZ CHIMP panTro1 (DONE 2004-06-22 kate)
# NOTE: Ran with BLASTZ_ABRIDGE_REPEATS=0, although SMSK was set.
# This looked better than running with abridge=1, which produced
# very chopped-up alignments.
ssh kk
cd /cluster/data/hg17/bed
mkdir -p blastz.panTro1.2004-06-22
    rm -f blastz.panTro1
    ln -s blastz.panTro1.2004-06-22 blastz.panTro1
    cd blastz.panTro1.2004-06-22
cat << 'EOF' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# Specific settings for chimp
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/penn/human_chimp.q
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.chimp
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp
SEQ2_DIR=/scratch/chimp/panTro1/nib
# not currently used
SEQ2_RMSK=/iscratch/i/chimp/panTro1/linSpecRep.human
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.panTro1.2004-06-22
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'EOF'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
    # 160270 jobs written to batch
para try, check, push, check, ....
#CPU time in finished jobs: 2399227s 39987.11m 666.45h 27.77d 0.076 y
#IO & Wait Time: 503100s 8385.00m 139.75h 5.82d 0.016 y
#Average job time: 18s 0.30m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 2073s 34.55m 0.58h 0.02d
#Submission to last job: 10843s 180.72m 3.01h 0.13d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 341 jobs
para try, check, push, etc ...
# CPU time in finished jobs: 3458s 57.63m 0.96h 0.04d 0.000 y
# IO & Wait Time: 57996s 966.60m 16.11h 0.67d 0.002 y
# Average job time: 180s 3.00m 0.05h 0.00d
# Longest job: 483s 8.05m 0.13h 0.01d
# Submission to last job: 1498s 24.97m 0.42h 0.02d
# third run: lav -> axt -> psl
ssh kki
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt stdin \
/iscratch/i/hg17/bothMaskedNibs /iscratch/i/chimp/panTro1/nib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
for d in ../lav/chr*; do
echo "do.csh $d" >> jobList
done
para create jobList
# 46 jobs
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 38s 0.64m 0.01h 0.00d
#Longest job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 147s 2.45m 0.04h 0.00d
# Load database tables (takes an hour or so)
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/pslChrom
cat > load.csh << 'EOF'
foreach f (chr*.psl)
set table = $f:r_blastzPanTro1
echo "loading ${table}"
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 -table=${table} $f
end
'EOF'
# << for emacs
csh load.csh >&! load.log &
tail -100f load.log
# CHAIN CHIMP BLASTZ (6/23/04 kate)
# Run axtChain on little cluster
    # first copy input to bluearc, as eieio bogs down if even the
    # mini-cluster pulls input from it
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
cp -rp axtChrom /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom
ssh kki
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh -fe
set c = $1:r:t
axtChain $1 -scoreScheme=/cluster/data/blastz/human_chimp.q \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/chimp/panTro1/nib /tmp/$c.chain.$$ > /tmp/$c.out.$$
set ret = $status
mv -f /tmp/$c.chain.$$ $2
mv -f /tmp/$c.out.$$ $3
exit $ret
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# TODO
    rm -fr /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/*
    echo "remove after 7/1/04" > /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/README
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# TODO
rm run1/chain/*.chain
echo "remove after 7/1/04" > run1/chain/README
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainPanTro1 $i
end
# TODO
featureBits hg16 chainPanTro1Link
#2627280557 bases of 2865248791 (91.695%) in intersection
featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection
# NET CHIMP (DONE 2004-6-24 kate)
# Redone to make chimp.net on 2004-10-11 kate (other files have
# new times, but are the same as 6-24 versions)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
#chainPreNet all.chain ../S1.len ../S2.len stdout \
#| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
#| netSyntenic stdin noClass.net
time chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=10 ../S1.len ../S2.len human.net chimp.net
# 42.860u 2.080s 2:11.11 34.2%
netSyntenic human.net noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netClass noClass.net hg17 panTro1 human.net
rm noClass.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
# TODO
#rm noClass.net
# Make a 'syntenic' subset of these with
# NOTE: we used -chimpSyn filtering for the reciprocal best nets
# on hg16 -- perhaps should use for nets here as well
netFilter -chimpSyn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netPanTro1 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyPanTro1 stdin
# Add entries for chainPanTro1, netPanTro1 to
# human/hg17 trackDb
# save chimp net to downloads area
ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
nice gzip chimp.net
cp chimp.net.gz /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
cd /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
md5sum *.gz > md5sum.txt
# RECIPROCAL BEST CHAINS FOR ENSEMBL GENE BUILD (DONE 2004-10-11 kate)
# Starting with the chimp-reference net, which contains the best human
# alignments to chimp, extract the subset of chains in the net.
# (these are the "best" chains of human alignments to chimp).
# Net these chains and use the resulting human-reference net (the
# "reciprocal best" net). Extract the chains from this net to
# obtain "reciprocal best" chains of chimp alignments to human.
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
mkdir rBest
grep chain all.chain | wc -l
# extract "best" chains from the chimp-reference net
time chainSwap all.chain stdout | \
netChainSubset chimp.net stdin stdout | \
chainSort stdin rBest/chimp.best.chain
grep chain rBest/chimp.best.chain | wc -l
# 64396
# for comparison later, extract "best" chains from human-reference net
netChainSubset human.net all.chain stdout | \
chainSort stdin rBest/human.best.chain
cd rBest
# net the best chains from the chimp net and pull the human-ref net
# (Daryl accidentally deleted human.rbest.net and rebuilt it with the
# same command on 8/14/2005, resulting in a file of the same size)
time chainPreNet chimp.best.chain ../../S2.len ../../S1.len stdout | \
chainNet stdin -minSpace=10 ../../S2.len ../../S1.len \
/dev/null human.rbest.net
# extract "reciprocal best" chains from the "best" human-reference net
netChainSubset human.rbest.net ../all.chain stdout | \
chainSort stdin human.rbest.chain
# take a look
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
cd rBest
mkdir rBestChain
chainSplit rBestChain human.rbest.chain
hgLoadChain hg17 chr7_rBestChainPanTro1 rBestChain/chr7.chain
# Loading 1639 chains into hg17.chr7_rBestChainPanTro1
mkdir bestChain
chainSplit bestChain human.best.chain
hgLoadChain hg17 chr7_bestChainPanTro1 bestChain/chr7.chain
# Loading 6516 chains into hg17.chr7_bestChainPanTro1
# compare
hgsql hg16 -s -e "select count(*) from chr7_rBestChainPanTro1"
# 2416
# spot-checked by comparing chr7 best and rbest:
    # 1. for a chain appearing in rBest, click thru to human browser,
# then via chimp net back to human browser at same region
# 2. for a chain in "best", but not rBest, do the same, verify
# that it produces a different region in the human browser
# post pre-Q/A file for ensembl download
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/rBest
gzip human.rbest.chain
cp human.rbest.chain.gz \
/usr/local/apache/htdocs/kate/ensembl/hg17-panTro1.rbest.chain.gz
cd /usr/local/apache/htdocs/kate/ensembl
md5sum *.gz > md5sum.txt
    mv hg17-panTro1.rbest.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsPanTro1/hg17.panTro1.rbest.chain.gz
# save as reciprocal best liftover chain (2005-02-22 kate)
gunzip -c human.rbest.chain.gz > \
/cluster/data/hg17/bed/liftOver/hg17ToPanTro1.rbest.chain
# cleanup (TODO -- after QA)
ssh hgwdev
hgsql hg17 -e "drop table chr7_rBestChainPanTro1"
hgsql hg17 -e "drop table chr7_bestChainPanTro1"
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
mv rBest/human.rbest.chain.gz ..
rm -fr rBest
# RECIPROCAL BEST AXT'S FROM RECIPROCAL BEST CHAIN (2005-08-16 kate)
# (requested by Daryl)
cd /cluster/data/hg17/bed/blastz.panTro1
mkdir -p axtRBestNet
cat > makeRbestAxt.csh << 'EOF'
foreach f (axtChain/rBest/rBestChain/*.chain)
set c = $f:t:r
echo $c
chainToAxt $f /cluster/data/hg17/nib /cluster/data/panTro1/nib stdout \
| axtSort stdin axtRBestNet/$c.axt
end
'EOF'
# << for emacs
csh makeRbestAxt.csh >&! makeRbestAxt.log &
# GENERATE CHIMP MAF FOR MULTIZ FROM NET (DONE 2004-06-24 kate)
# Redo to fix overlap problem using 8/05 netToAxt (2005-08-16 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
# There was apparently a bad chr5 nib for a while...
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netSplit human.net net
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*.net)
set c = $f:t:r
netToAxt $f axtChain/chain/$c.chain /cluster/data/hg17/nib \
/cluster/data/panTro1/nib stdout | axtSort stdin axtNet/$c.axt
axtToMaf axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/panTro1/chrom.sizes \
mafNet/$c.maf -tPrefix=hg17. -qPrefix=panTro1.
end
'EOF'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp mafNet /cluster/bluearc/hg17/mafNet/panTro1
# MAKE PANTRO1 DOWNLOADABLES (DONE 2004-09-14 kate)
# Redo panTro1.net.gz (it was truncated) 2004-10-07 kate
# Redo axtNets with non-overlapped versions (2005-08-29 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1
# gzip chains and nets
mkdir gz
cd gz
nice gzip -c ../axtChain/all.chain > panTro1.chain.gz
nice gzip -c ../axtChain/human.net > panTro1.net.gz
wc -l *.gz
cd ../axtNet
time nice gzip *.axt
# 46 mins.
ssh hgwdev
# copy chains and nets to downloads area
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsPanTro1
cd vsPanTro1
mv /cluster/data/hg17/bed/blastz.panTro1/gz/*.gz .
md5sum *.gz > md5sum.txt
# copy in README and edit
rmdir /cluster/data/hg17/bed/blastz.panTro1/gz
mkdir -p axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.panTro1/axtNet/*.axt.gz .
md5sum *.gz > md5sum.txt
# RESCORE CHICKEN BLASTZ (DONE 6/23/04 angie)
# Webb noticed low scores when using non-default BLASTZ_Q scoring matrix
# and repeats abridged --
# PSU's restore_rpts program rescored alignments with default matrix
# instead of BLASTZ_Q matrix. Rescore them here so the chainer sees
# the higher scores:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.preRescore
mv axtChrom.rescore axtChrom
# CHAIN CHICKEN BLASTZ (DONE 6/23/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/galGal2/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
    # axtChrom/chr18_random.axt is empty, so the {check out line+} check failed:
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#Average job time: 46s 0.76m 0.01h 0.00d
#Longest job: 273s 4.55m 0.08h 0.00d
#Submission to last job: 519s 8.65m 0.14h 0.01d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainGalGal2 $i
end
# NET CHICKEN BLASTZ (DONE 6/23/04 angie)
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
netClass noClass.net hg17 galGal2 human.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netGalGal2 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyGalGal2 stdin
# Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to
# human/hg17 trackDb
# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
# see makeXenTro1.doc and search for zb.hg17
# The results of this are also symlinked under hg17/bed
# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 6/23/04 angie)
# Redo net axt's and maf's to fix overlap problem (use 8/05 netToAxt)
# (2005-08-16 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
netSplit human.net net
cd ..
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin axtNet/$chr.axt
axtToMaf axtNet/$chr.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/galGal2/chrom.sizes \
mafNet/$chr.maf -tPrefix=hg17. -qPrefix=galGal2.
end
'EOF'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp mafNet /cluster/bluearc/hg17/mafNet/galGal2
# MAKE VSGALGAL2 DOWNLOADABLES (REDONE 9/13/04 angie)
# REDO axtNet's to fix overlaps (2005-09-12 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
gzip -c all.chain > /cluster/data/hg17/zip/chicken.chain.gz
gzip -c human.net > /cluster/data/hg17/zip/chicken.net.gz
mkdir /cluster/data/hg17/zip/axtNet
foreach f (axtNet/chr*axt)
gzip -c $f > /cluster/data/hg17/zip/$f.gz
end
# Doh! above for loop didn't work because all axt's have been removed
# from this dir! :| Just this once, regenerate compressed axtNet on
# the fly:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/net
foreach f (*.net)
set chr = $f:t:r
echo $chr
netToAxt $f ../chain/$chr.chain /cluster/data/hg17/nib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin stdout \
| gzip -c > /cluster/data/hg17/zip/axtNet/$chr.axt.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
mv /cluster/data/hg17/zip/chicken*.gz .
mv /cluster/data/hg17/zip/axtNet .
md5sum *.gz */*.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# REDO axtNet downloads to fix overlaps (2005-09-13 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.galGal2/axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.galGal2/axtNet .
# 8-WAY MULTIZ MULTIPLE ALIGNMENT WITH MM5 (DONE 2004-07-13 kate)
# Redo, below to fix overlapping alignments (2005-08-16 kate)
ssh eieio
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir /cluster/bluearc/hg17/multiz8way
mkdir -p $workingDir
mkdir -p /cluster/data/hg17/bed/$multizDir
cd /cluster/data/hg17/bed/$multizDir
# wrapper script for multiz
    # NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
    # NOTE: next time, modify the script so it needs only one arg --
    # save the multiple dirname in a file for use by the next run
    # (a sketch of such a wrapper follows below)
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
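    # A sketch of that one-arg wrapper (hypothetical and untested;
    # prevDir.txt is an invented state file holding the current
    # multiple-alignment dirname, seeded with the first pairwise db):
cat << 'EOF' > doMultiz1.csh
#!/bin/csh -fe
# $1 is a pairwise maf, e.g. rn3/chr1.maf
set pair = $1
set prev = `cat prevDir.txt`        # e.g. panTro1mm5
set out = $prev$pair:h              # e.g. panTro1mm5rn3
mkdir -p $out
/cluster/bin/penn/multiz $pair $prev/$pair:t - > $out/$pair:t
'EOF'
    # << for emacs
    # after each species' run completes, echo the new dirname into
    # prevDir.txt for the next round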
ssh eieio
set workingDir = /cluster/bluearc/hg17/multiz.2004-07-13
# copy mafs to bluearc -- chimp
mkdir $workingDir/panTro1
cp /cluster/data/hg17/bed/blastz.panTro1/mafNet/*.maf \
$workingDir/panTro1
ls $workingDir/panTro1/*.maf > chrom.lst
# mouse
mkdir $workingDir/mm5
cp /cluster/data/hg17/bed/blastz.mm5/mafNet/chr*.maf $workingDir/mm5
# rat
mkdir $workingDir/rn3
cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
# dog
mkdir $workingDir/canFam1
foreach f (/cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet/chr*.maf)
set c = $f:r:r:t
echo $c
cp $f $workingDir/canFam1/$c.maf
end
# chicken
mkdir $workingDir/galGal2
foreach f (/cluster/data/hg17/bed/blastz.galGal2/mafNet/chr*.maf)
set c = $f:r:r:t
cp $f $workingDir/galGal2/$c.maf
end
# fugu
mkdir $workingDir/fr1
cp /cluster/data/hg17/bed/blastz.fr1/mafNet/chr*.maf $workingDir/fr1
# zebrafish
mkdir $workingDir/danRer1
cp /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet/chr*.maf \
$workingDir/danRer1
# first multiz - add in mm5 mouse to human/chimp
#
ssh kki
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
cd /cluster/data/hg17/bed/$multizDir
mkdir run.mm5
cd run.mm5
echo "mm5/panTro1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
# CPU time in finished jobs: 6620s 110.33m 1.84h 0.08d 0.000 y
# IO & Wait Time: 3685s 61.42m 1.02h 0.04d 0.000 y
# Average job time: 224s 3.73m 0.06h 0.00d
# Longest job: 819s 13.65m 0.23h 0.01d
# Submission to last job: 1474s 24.57m 0.41h 0.02d
cd ..
# rat
mkdir run.rn3
cd run.rn3
echo "rn3/panTro1mm5" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ..
# dog
mkdir run.canFam1
cd run.canFam1
echo "canFam1/panTro1mm5rn3" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ../
# chicken
mkdir run.galGal2
cd run.galGal2
echo "galGal2/panTro1mm5rn3canFam1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# no alignment file for chr18_random -- create one so we can create jobList
touch $workingDir/galGal2/chr18_random.maf
para create jobList
# 46 jobs
para try, check, push, check
# 1 crashed job for empty file chr18_random
cd ..
# fugu
mkdir run.fr1
cd run.fr1
echo "fr1/panTro1mm5rn3canFam1galGal2" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# create empty alignment file for missing one (no alignments)
touch /cluster/bluearc/hg17/multiz.2004-07-13/fr1/chr6_hla_hap1.maf
para create jobList
# 46 jobs
para try, check, push, check
# 1 crashed job for empty file chr6_hla_hap1
cd ..
# zebrafish
mkdir run.danRer1
cd run.danRer1
echo "danRer1/panTro1mm5rn3canFam1galGal2fr1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ..
# copy 8-way mafs to build directory
ssh eieio
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir/panTro1mm5rn3canFam1galGal2fr1danRer1 $workingDir/maf
cd /cluster/data/hg17/bed/multiz.2004-07-13
mkdir maf
cp $workingDir/maf/*.maf maf
# copy to download area (2004-07-27 angie)
# moved gzipped files to mafDownload dir and recreated symlinks
# (2006-04-23 kate)
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
# gzipped & copied maf files from /cluster/data/hg17/bed/multiz8way/maf
    # dumped table and gzipped for download (user request, after the
    # files were removed when the track was replaced by 18way).
cd /cluster/data/hg17/bed/multiz8way/mafDownloads
hgsqldump --all -c --tab=. hg17 multiz8way
ssh kkstore02 \
'gzip /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}'
ln -s /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}.gz \
/usr/local/apache/htdocs/goldenPath/hg17/multiz8way
# load summary table (2005-09-27)
cd /cluster/data/hg17/bed/multiz.2004-07-13/maf
time cat chr*.maf | hgLoadMafSummary hg17 multiz8waySummary stdin
# 30 minutes ?
# NOTE: this didn't improve track display time at 5MB, so
# I'm leaving out of trackDb (sticking with pairwise maf's) for now
# It may be that this helps performance only with larger numbers
# of species.
# Create upstream files for download (2004-09-13 kate)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz8way
echo hg17 panTro1 mm5 rn3 canFam1 galGal2 fr1 danRer1 > org.txt
# mafFrags takes a while
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
mafFrags hg17 multiz8way up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
ssh eieio
cd /cluster/data/hg17/bed/multiz8way
nice gzip upstream{1000,2000,5000}.maf
# 6 mins.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
ln -s mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 multiz8way
mv /cluster/data/hg17/bed/multiz8way/upstream*.maf.gz multiz8way
# PHYLO-HMM (PHASTCONS) CONSERVATION FOR 8-WAY WITH MM5 (DONE 2004-07-20 kate)
# (this was partially redone by acs using the new phastCons, 08-28;
# I've tried to merge the two sets of docs into one cohesive
# description)
# More revisions, acs, 09-13
ssh eieio
set path = ($path /cluster/bin/phast)
cd /cluster/data/hg17/bed/multiz.2004-07-13
mkdir cons
cd cons
#break up the genome-wide MAFs into pieces
mkdir /cluster/bluearc/hg17/chrom
cd /cluster/data/hg17
foreach f (`cat chrom.lst`)
echo $f
cp $f/*.fa /cluster/bluearc/hg17/chrom
end
ssh kki
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons
mkdir run.split
cd run.split
set WINDOWS = /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg17/chrom
WINDOWS=/cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,fr1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
[ $? -eq 0 ] || exit 1
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
[ $? -eq 0 ] || exit 1
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs
chmod +x doSplit.sh
rm -f jobList
foreach file (/cluster/bluearc/hg17/multiz.2004-07-13/maf/*.maf)
set c = $file:t:r
echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 46 jobs
para try
para check
para push
# 2 crashed jobs -- due to no alignments in input maf
# chr18_random, chr6_hla_hap1
cd ..
# now generate conservation scores and predicted elements
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons
mkdir run.elements
# despite the name, I've put the elements and the new conservation
# scores here
# first produce a rough starting model; in this case, we can just
# use the model previously estimated (see the entry below on PHYLOFIT/PHASTCONS)
cp /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1/hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod starting-tree.mod
# In other cases, it would be sufficient to choose an arbitrary
# input file from the WINDOWS directory (choose one with plenty of
# data, i.e., large NTUPLES) and run phyloFit on it with the
# correct tree topology, e.g.,
# phyloFit -i SS datafile.ss --tree \
# "(((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),(fr1,danRer1))" \
# --out-root starting-tree
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.294633 0.205082 0.205189 0.295097
# This implies a GC content of 0.205 + 0.205 = 0.410
# If you do *not* have a global tree model and you do not know
# your GC content, you can get it directly from the MAFs with
# a command like:
# msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,danRer1,fr1 \
# -i MAF --summary-only /cluster/data/hg17/bed/multiz.2004-07-13/maf/chr*.maf\
# > maf_summary.txt
# This will take a little while (30-60 min). Run on eieio.
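    # A quick check of that arithmetic (a sketch; assumes the
    # BACKGROUND frequencies are ordered A C G T as shown above):
    awk '/^BACKGROUND:/ {printf "GC: %.3f\n", $3 + $4}' starting-tree.mod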
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 12 --target-coverage 0.17 --quiet --log $2 --estimate-trees $3
EOF
# Be sure to substitute in the right G+C content. Also, notice the
# target coverage of 0.17. We actually want 5% coverage here but
# the final (posterior) coverage is only indirectly related to the
# expected (prior) coverage. One thing to consider is that we
# only have about 40% alignment coverage (excluding chimp, which
# doesn't help us much in identifying conserved regions). As far
# as phastCons is concerned, we want to aim for about 0.05 / 0.4 =
# 0.125 coverage. In this case, though, --target-coverage
# 0.125 resulted in only about 4.1% coverage. I had to iterate
# a couple of times (using only chromosome 1) to find a value that
# got me close to the target of 5%
chmod u+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobs.lst
# watch out: bash assumed below in a few places
for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do \
root=`basename $f .ss.gz` ;\
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
done
# run cluster job
ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ...
# takes about an hour
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls TREES/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
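    # A sketch of that sampling strategy (awk keeps roughly 1 fragment
    # in 10; adjust the fraction to get the sample size you want):
    awk 'BEGIN{srand()} rand() < 0.1' jobs.lst > jobs.sample.lst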
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. It's all downhill
# from here.
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/hg17/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS
rm -f jobs2.lst
for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
# run cluster job
ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ...
logout
# takes about 20 minutes
# combine predictions and transform scores to be in 0-1000 interval
# do in a way that avoids limits on numbers of args
find /cluster/bluearc/hg17/phastCons/ELEMENTS -name "*.bed" > files
rm -f splitfiles* all.raw.bed
split files splitfiles
for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
rm files splitfiles*
hgLoadBed hg17 phastConsElements all.bed
hgLoadBed -chrom=chr1 hg17 phastConsElements all.bed
# check coverage
featureBits hg17 phastConsElements
#137850739 bases of 2866216770 (4.810%) in intersection
# This should be close enough. If necessary, you can rerun the
# steps above with a different target coverage. When hitting the
# target is important, you may want to perform several iterations
# using a representative subset of the entire dataset (human chr1
# seems to work pretty well)
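    # A sketch of one such chr1-only iteration (list name is arbitrary;
    # rerun doPhastCons.sh with an adjusted --target-coverage, reload
    # the chr1 elements, then measure):
    grep '/chr1\.' jobs2.lst > jobs2.chr1.lst
    # ... run on the cluster, hgLoadBed -chrom=chr1 as above, then:
    featureBits -chrom=chr1 hg17 phastConsElements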
# set up wiggle
mkdir -p /cluster/bluearc/hg17/phastCons/wib
cat << 'EOF' > doWigAsciiToBinary.sh
#!/bin/sh
chr=$1
zcat `ls /cluster/bluearc/hg17/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/hg17/phastCons/wib/${chr}_phastCons stdin
EOF
chmod u+x doWigAsciiToBinary.sh
rm -f jobs3.lst
for chr in `ls /cluster/bluearc/hg17/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs3.lst ; done
# run a little wigAsciiToBinary cluster job
ssh kk, etc.
# copy wibs and wigs from bluearc
rsync -av /cluster/bluearc/hg17/phastCons/wib .
# load track
hgLoadWiggle hg17 phastCons -pathPrefix=/gbdb/hg17/phastCons/wib \
wib/chr*_phastCons.wig
mkdir -p /gbdb/hg17/phastCons/wib
rm -f /gbdb/hg17/phastCons/wib/chr*phastCons.wib
ln -s /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/wib/*.wib /gbdb/hg17/phastCons/wib
chmod 775 . wib /gbdb/hg17/phastCons /gbdb/hg17/phastCons/wib
chmod 664 wib/*.wib
# move postprobs over and clean up bluearc
rsync -av /cluster/bluearc/hg17/phastCons/POSTPROBS .
# (people sometimes want the raw scores)
rm -r /cluster/bluearc/hg17/phastCons/ELEMENTS /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/wib
# set up full alignment/conservation track ("multiz8way")
# load multiz maf tables
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-07-13
set mafDir = /gbdb/hg17/multiz8way/maf
set table = multiz8way
mkdir -p $mafDir/$table
ln -s `pwd`/maf/*.maf $mafDir/$table
cd maf
    hgLoadMaf hg17 -warn multiz8way -pathPrefix=$mafDir/$table
    # someone dropped this table from hgwdev
# reload (2007-03-19 kate)
nice hgLoadMaf hg17 -warn multiz8way -pathPrefix=/gbdb/hg17/multiz8wayFixed
cat /gbdb/hg17/multiz8wayFixed/*.maf | \
nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
multiz8waySummary stdin
# load blastz maf tables
# TODO: change mafWiggle to use db names instead of species names
# in speciesOrder
# link files into /gbdb table dir
ln -s /cluster/data/hg17/bed/blastz.panTro1/mafNet $mafDir/chimp_netBlastz
ln -s /cluster/data/hg17/bed/blastz.mm5/mafNet $mafDir/mouse_netBlastz
ln -s /cluster/data/hg17/bed/blastz.rn3/mafNet $mafDir/rat_netBlastz
ln -s /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet $mafDir/dog_netBlastz
ln -s /cluster/data/hg17/bed/blastz.galGal2/mafNet $mafDir/chicken_netBlastz
ln -s /cluster/data/hg17/bed/blastz.fr1/mafNet $mafDir/fugu_netBlastz
ln -s /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet $mafDir/zebrafish_netBlastz
# remove empty file, disliked by hgLoadMaf
# NOTE: these shouldn't be empty -- next time, make sure previous
# alignments are copied over to output maf (multiz won't if there's
# an empty input file).
rm chicken/chr18_random.maf
rm fugu/chr6_hla_hap1.maf
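    # A sketch of that suggested guard (hypothetical): pass the
    # multiple alignment through untouched when the pairwise maf is
    # empty, instead of writing an empty output.
cat << 'EOF' > doMultizGuard.csh
#!/bin/csh -fe
mkdir -p $3:h
if (-z $1) then
    cp $2 $3
else
    /cluster/bin/penn/multiz $1 $2 - > $3
endif
'EOF'
    # << for emacs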
# load tables
foreach s (chimp mouse rat dog chicken fugu zebrafish)
set table = ${s}_netBlastz
echo "$s $mafDir/$table"
~kate/bin/i386/hgLoadMaf hg17 -warn ${s}_netBlastz -pathPrefix=$mafDir/$table
end
# trackDb entry:
# track multiz8way
# shortLabel Conservation
# longLabel Chimp/Mouse/Rat/Dog/Chicken/Fugu/Zebrafish Multiz Alignments & Conservation
# group compGeno
# priority 149
# visibility pack
    # color 0, 10, 100
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons
# yLineOnOff Off
# autoScaleDefault Off
# pairwise netBlastz
# speciesOrder chimp mouse rat dog chicken fugu zebrafish
# PHASTCONS SCORES DOWNLOADABLES (REDONE 6/15/05 angie)
# Initially done 10/11/04, but using scores from run.cons -- which
# had been replaced by scores in run.elements, where I did not think
# to look for scores. :( !
ssh eieio
mkdir /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
cd /cluster/data/hg17/bed/multiz8way/cons/run.elements/POSTPROBS
foreach chr (`awk '{print $1;}' /cluster/data/hg17/chrom.sizes`)
echo $chr
nice zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/$chr.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons
# Doh! /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 is 11G now --
# too much to dump on hgwdev's / which is at 94%. So don't do this:
#mv /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 .
# make symbolic links instead:
mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
cd /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
ln -s /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/* .
md5sum *.gz > md5sum.txt
# make a README.txt.
# PHYLOFIT AND TREE-DOCTOR FOR 8-WAY: ESTIMATE PHYLOGENETIC TREE (acs)
# (This was originally done for phastCons but is not necessary with
# the new version. However, it may be useful for other purposes, so
# I'm leaving it in as a separate entry.)
# first estimate a model for the mammals
ssh eieio
cd /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1
# collect sufficient stats (takes maybe an hour)
for file in *.maf ; do echo $file ; msa_view -i MAF $file -o SS --order hg17,panTro1,rn3,mm5,canFam1 > `basename $file .maf`.ss ; done
ls *.ss | grep -v chr6_hla_hap2 > files
msa_view '*files' --aggregate hg17,panTro1,rn3,mm5,canFam1 -i SS -o SS > all.ss
# BTW, this can now be done in one step using something like:
# msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1 -i MAF -o SS *.maf > all.ss
# (modify to exclude certain files if necessary)
# estimate model, with rate variation (takes about a minute)
phyloFit all.ss --nrates 10 --tree "(((hg17,panTro1),(rn3,mm5)),canFam1)" --alpha 4.4 --EM --log log -i SS --out-root hprmc-rev-dg
# (Actually, --nrates 4 should be more than adequate for most purposes)
cat hprmc-rev-dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#TRAINING_LNL: -6889216721.159384
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
# -0.865237 0.159990 0.554805 0.150442
# 0.229851 -1.194646 0.168269 0.796526
# 0.796651 0.168182 -1.194919 0.230086
# 0.150205 0.553556 0.159985 -0.863747
#TREE: (((1:0.006523,2:0.007997):0.103779,(3:0.104867,4:0.078911):0.265676):0.112364,5:0.112364);
# now extrapolate to fish and chicken using tree_doctor and the CFTR 25 tree
# (replace numbers with names in hprmc-rev-dg.mod; this won't be necessary in the future)
tree_doctor --rename "1->hg17;2->panTro1;3->rn3;4->mm5;5->canFam1" hprmc-rev-dg.mod > hprmc-rev-dg.names.mod
# (obtain 8-way subtree from cftr25_hybrid.nh; also map names as necessary to match above)
tree_doctor /cluster/data/nisc/targets/cftr/phyloHMMcons25/cftr25_hybrid.nh --prune-all-but hg16,chimp,mm3,rn3,dog,chicken,fr1,zfish --rename "hg16->hg17;mm3->mm5;chimp->panTro1;dog->canFam1;chicken->galGal2;zfish->danRer1" > cftr8way.nh
# now merge (see tree_doctor help page for explanation)
tree_doctor hprmc-rev-dg.names.mod --merge cftr8way.nh > hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
cat hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
# -0.865237 0.159990 0.554805 0.150442
# 0.229851 -1.194646 0.168269 0.796526
# 0.796651 0.168182 -1.194919 0.230086
# 0.150205 0.553556 0.159985 -0.863747
#TREE: (((((hg17:0.006523,panTro1:0.007997):0.103779,(rn3:0.104867,mm5:0.078911):0.265676):0.019461,canFam1:0.205267):0.377150,galGal2:0.511134):0.536627,(danRer1:0.905323,fr1:0.922995):0.536627);
# CONSERVED NON-CODING (CNS) TRACK (acs 08/29/04)
# (depends on phastConsElements)
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements
featureBits hg17 -bed=possibleCoding.bed -or twinscan:exon xenoMrna mrna intronEst
# (add SGP, exoniphy, possib. others if available)
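# For example, if the sgpGene and exoniphy tables are available,
# the command would become something like (a sketch, not what was
# actually run):
# featureBits hg17 -bed=possibleCoding.bed \
#     -or twinscan:exon sgpGene:exon exoniphy xenoMrna mrna intronEst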
# now filter out all phastCons elements that overlap possible coding regions
overlapSelect -nonOverlapping possibleCoding.bed all.bed cns.bed
hgLoadBed hg17 cns cns.bed
# track cns
# shortLabel CNS
# longLabel Conserved Non-Coding (Cons Elements Minus Predicted Coding)
# priority 109.11
# group compGeno
# visibility hide
# type bed 5 .
# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-07-08 - Hiram)
# Needed to download a new binary for this run -- our Linux systems
# have been updated since the last time and the old binary would
# not run.
# XXX - actually, it turned out a new binary was not needed. It was
# already here in our hg3rdParty CVS project. All of this can be
# simply fetched from cvs: cvs co hg3rdParty/genscanlinux
# To fetch it from the source instead, go to: http://genes.mit.edu/GENSCAN.html
# and then to: http://genes.mit.edu/license.html
# Fill in the license agreement and you can then pick up the
# README and the Linux version: genscanlinux.tar.uue.tgz
# To uudecode that file, go to one of the Solaris home machines
# and use the uudecode command:
# uudecode genscanlinux.tar.uue.tgz
# That produces the file: genscanlinux.tar
# Which contains the files:
# drwxr-xr-x chris/burgelab 0 2003-02-17 11:48:44 ./
# -rw-r--r-- chris/burgelab 219056 2000-09-07 12:39:26 ./Arabidopsis.smat
# -rw-r--r-- chris/burgelab 6622 2000-09-07 12:39:26 ./HUMRASH
# -rw-r--r-- chris/burgelab 849 2000-09-07 12:39:26 ./HUMRASH.sample
# -rw-r--r-- chris/burgelab 219050 2000-09-07 12:39:26 ./HumanIso.smat
# -rw-r--r-- chris/burgelab 155735 2000-09-07 12:39:26 ./Maize.smat
# -rw-r--r-- chris/burgelab 24465 2000-09-07 12:39:26 ./README
# -rw-r--r-- chris/burgelab 6344 2000-09-07 12:39:27 ./HUMRASH.ps
# -rwxr-xr-x chris/burgelab 126365 2003-02-17 11:48:44 ./genscan
#
# I placed these currently in: /cluster/home/hiram/GENSCAN/
# I'll check with Angie where it should properly live ...
# XXX - it already lives in 'cvs co hg3rdParty/genscanlinux'
# These instructions should simply check it out right here in
# bed/genscan and make the gsub command refer to these copies.
ssh hgwdev
mkdir /cluster/data/hg17/bed/genscan
cd /cluster/data/hg17/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh eieio
cd /cluster/data/hg17/bed/genscan
# Make 3 subdirectories for genscan to put its output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not pure Ns* (due to heterochromatin and other
# unsequenceable stuff), which would cause genscan to run forever.
rm -f genome.list
bash
for f in `cat /cluster/data/hg17/contig.lst`
do
egrep '[ACGT]' /cluster/data/hg17/$f.masked > /dev/null
if [ $? = 0 ]; then
echo /cluster/data/hg17/$f.masked >> genome.list
fi
done
# exit your bash shell if you are [t]csh ...
# This egrep matched all the contigs in hg17. I guess none of
# them are complete Ns* at this point.
# Log into kki (not kk!). kki is the driver node for the small
# cluster (kkr2u00-kkr8u00). Genscan has problems running on the
# big cluster due to the limited memory and swap space on each
# processing node.
ssh kki
cd /cluster/data/hg17/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 379 of 380 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 79998s 1333.30m 22.22h 0.93d 0.003 y
# IO & Wait Time: 2989s 49.82m 0.83h 0.03d 0.000 y
# Average job time: 219s 3.65m 0.06h 0.00d
# Longest job: 2999s 49.98m 0.83h 0.03d
# Submission to last job: 8324s 138.73m 2.31h 0.10d
# Running the single failed job on kolossus with a smaller window:
/cluster/bin/x86_64/gsBig /cluster/data/hg17/5/NT_006576/NT_006576.fa.masked \
gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \
-subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000 to
# something lower. In build33, this was 22/NT_011519
# In build34 there were NO failures !
# Convert these to chromosome level files as so:
ssh eieio
cd /cluster/data/hg17/bed/genscan
$HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf
$HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \
warn subopt/N*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/hg17/bed/genscan
ldHgGene hg17 genscan genscan.gtf
# 35 minute job
# Read 42807 transcripts in 325994 lines in 1 files
# 42807 groups 46 seqs 1 sources 1 feature types
hgPepPred hg17 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg17 genscanSubopt genscanSubopt.bed
# Reading genscanSubopt.bed
# Loaded 517157 elements of size 6
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg17
# featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
# featureBits hg16 genscan
# 55333689 bases of 2865248791 (1.931%) in intersection
# featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# featureBits hg16 genscanSubopt
# 56082952 bases of 2865248791 (1.957%) in intersection
# Should be essentially zero intersection with rmsk
# featureBits -chrom=chr1 hg17 genscan rmsk
# 794 bases of 222827847 (0.000%) in intersection
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/1/05 angie)
# Originally done 7/1/04 for canFam1 -- redone 8/1/05 for canFam2.
ssh kolossus
cd /san/sanvol1/scratch/hg17/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Dog in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query human -comp dog -comp mouse
end
# Now extract dog (extra column 1), ignore mouse.
cd ..
mkdir linSpecRep.notInDog
foreach f (rmsk/*.out_canis-familiaris_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInDog/$base.out.spec
end
# Clean up.
rm rmsk/*.out_canis*
rsync -av /san/sanvol1/scratch/hg17/linSpecRep.notInDog \
/cluster/bluearc/scratch/hg/gs.18/build35/
# Ask cluster-admin for an rsync.
# BLASTZ DOG (CANFAM1) (DONE 7/8/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08
ln -s /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08 \
/cluster/data/hg17/bed/
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.canFam1.2004-07-08
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
# Moving the human chr19 jobs up to the top of the jobList probably
# would have shaved 4 hours off the total time! It was almost done
# after 6 hours, except for a few chr19 stragglers.
#Completed: 93775 of 93775 jobs
#Average job time: 202s 3.37m 0.06h 0.00d
#Longest job: 17806s 296.77m 4.95h 0.21d
#Submission to last job: 35523s 592.05m 9.87h 0.41d
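# Next time, front-loading the chr19 jobs would be something like this
# (a sketch; parasol works through the batch roughly in list order):
# grep chr19 jobList > jobList.sorted
# grep -v chr19 jobList >> jobList.sorted
# mv jobList.sorted jobList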
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 36s 0.61m 0.01h 0.00d
#Longest job: 302s 5.03m 0.08h 0.00d
#Submission to last job: 1143s 19.05m 0.32h 0.01d
# third run: lav -> axt
# (if non-default BLASTZ_Q is used in the future, put axtRescore in
# the pipe after lavToAxt)
ssh kki
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.18/build35/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 46 of 46 jobs
#Average job time: 300s 5.00m 0.08h 0.00d
#Longest job: 1669s 27.82m 0.46h 0.02d
#Submission to last job: 1689s 28.15m 0.47h 0.02d
# CHAIN DOG BLASTZ (DONE 7/9/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 46 of 46 jobs
#Average job time: 266s 4.43m 0.07h 0.00d
#Longest job: 3578s 59.63m 0.99h 0.04d
#Submission to last job: 3578s 59.63m 0.99h 0.04d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# Lots of chaff with scores in the 3000's. Many very-high-scoring
# chains. So filter the chain down somewhat...
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm chain/*
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainCanFam1 $i
end
# Coverage is significantly higher than mouse:
featureBits hg17 -chrom=chr1 chainCanFam1Link
#123999291 bases of 222827847 (55.648%) in intersection
# before filtering: 124750124 bases of 222827847 (55.985%) in intersection
featureBits hg17 -chrom=chr1 chainMm5Link
#83773012 bases of 222827847 (37.595%) in intersection
# NET DOG BLASTZ (DONE 7/9/04 angie)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
netClass noClass.net hg17 canFam1 dog.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn dog.net > dogSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
netFilter -minGap=10 dog.net | hgLoadNet hg17 netCanFam1 stdin
netFilter -minGap=10 dogSyn.net | hgLoadNet hg17 syntenyNetCanFam1 stdin
# Add entries for chainCanFam1, netCanFam1 to human/hg17 trackDb
# MAKE VSCANFAM1 DOWNLOADABLES (DONE 9/17/04 kate)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
ln -s all.chain dog.chain
mkdir gz
cd gz
gzip -c ../dog.chain > dog.chain.gz
gzip -c ../dog.net > dog.net.gz
gzip ../dogSyn.net > dogSyn.net.gz
# Angie's notes...
# Mike Zody asked for raw blastz in chain format, so figure out some
# way to translate axt or psl to chain and put it out there.
# Actually, it's probably just hg16-canFam1 that he wants for now -- ?
# Ask when we get to this point.
cd ../axtNet
time gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsCanFam1
cd vsCanFam1
mv /cluster/data/hg17/bed/blastz.canFam1/axtChain/gz/*.gz .
md5sum *.gz > md5sum.txt
mkdir -p axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.canFam1/axtNet/*.axt.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# REDO downloads of axtNet's to fix overlaps (2005-09-13 kate)
# Finally, replace bad chr5 files (2006-01-05 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.canFam1/axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam1
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.canFam1/axtNet .
# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/9/04 angie)
# Redo net axt's and maf's to fix overlaps (use 8/5 netToAxt)
# (2005-08-16 kate)
# and replace bad chr5 files (2006-01-05 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
netSplit dog.net net
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*)
set chr = $f:t:r
echo $chr
netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
/cluster/data/canFam1/nib stdout \
| axtSort stdin axtNet/$chr.axt
axtToMaf axtNet/$chr.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/canFam1/chrom.sizes \
mafNet/$chr.maf -tPrefix=hg17. -qPrefix=canFam1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp mafNet /cluster/bluearc/hg17/mafNet/canFam1
# BLASTZ MM5 (DONE - 2004-06-22 - Hiram)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.mm5.2004-07-01
cd /cluster/data/hg17/bed
ln -s blastz.mm5.2004-07-01 blastz.mm5
cd blastz.mm5
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
# notInRat OK as it is identical to notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.mm5
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.mm5
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 44330 of 44330 jobs
# CPU time in finished jobs: 16250628s 270843.80m 4514.06h 188.09d 0.515 y
# IO & Wait Time: 387936s 6465.60m 107.76h 4.49d 0.012 y
# Average job time: 375s 6.26m 0.10h 0.00d
# Longest job: 4417s 73.62m 1.23h 0.05d
# Submission to last job: 43754s 729.23m 12.15h 0.51d
# Second cluster run to convert the .out's to .lav's. You do NOT want
# to run this on the big cluster -- it brings the file server to its
# knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.mm5
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 2189s 36.48m 0.61h 0.03d 0.000 y
# IO & Wait Time: 7714s 128.57m 2.14h 0.09d 0.000 y
# Average job time: 29s 0.48m 0.01h 0.00d
# Longest job: 165s 2.75m 0.05h 0.00d
# Submission to last job: 830s 13.83m 0.23h 0.01d
# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/hg17/bed/blastz.mm5
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1638s 27.30m 0.46h 0.02d 0.000 y
# IO & Wait Time: 12068s 201.13m 3.35h 0.14d 0.000 y
# Average job time: 305s 5.08m 0.08h 0.00d
# Longest job: 1124s 18.73m 0.31h 0.01d
# Submission to last job: 2519s 41.98m 0.70h 0.03d
# chr19 takes too long, the axtSort becomes too large and the poor
# node ends up swapping forever. When you are down to that last
# job running, stop it and go to kolossus.
# Adjusting the location of the nib directories, and fixing the
# MACHTYPE on the commands in the blastz script:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm5
sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
x86_64-chromlav2axt
chmod +x x86_64-chromlav2axt
time ./x86_64-chromlav2axt \
/cluster/data/hg17/bed/blastz.mm5/lav/chr19 \
/cluster/data/hg17/bed/blastz.mm5/axtChrom/chr19.axt \
/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
/cluster/bluearc/scratch/mus/mm5/softNib
# real 7m41.719s
# user 2m2.850s
# sys 0m23.070s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5
mkdir -p pslChrom
set tbl = "blastzMm5"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# This takes more than an hour. You can shorten it by changing the
# axtToPsl command to a simple echo, putting the results into a file,
# splitting the file into four parts, and running the four parts as
# shell scripts on eieio to have four processes running at the same
# time (see the sketch below). Load on eieio gets up to about 20,
# which is reasonable.
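# That four-way split would look something like this (a sketch, not
# what was actually run):
# bash
# for f in axtChrom/chr*.axt; do
#     c=`basename $f .axt`
#     echo "/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_blastzMm5.psl"
# done > allJobs.sh
# split -l 12 allJobs.sh part.
# for p in part.*; do
#     sh $p > $p.log 2>&1 &
# done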
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/pslChrom
bash # for tcsh users
for F in chr*_blastzMm5.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
echo "${F} done"
done
# this is a 40 minute job
# exit bash if you are tcsh
# featureBits on blastzMm3 or 4 will not work on hgwdev -- it runs
# out of memory. But if you reset your ~/.hg.conf to use the
# read-only user and point it at the hgwdev database host, then you
# can run the x86_64 featureBits (e.g. on kolossus).
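# A minimal read-only ~/.hg.conf for that would look something like
# this (a sketch; fill in the actual read-only account):
# db.host=hgwdev
# db.user=<read-only user>
# db.password=<read-only password>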
# featureBits hg16 blastzMm5
# 1056761609 bases of 2865248791 (36.882%) in intersection
# featureBits hg17 blastzMm5
# 1052077141 bases of 2866216770 (36.706%) in intersection
# featureBits hg17 blastzMm4
# 1056201417 bases of 2866216770 (36.850%) in intersection
# CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
cd /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.mm5/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
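# If -minScore=5000 becomes necessary, the axtChain line in doChain
# would change to something like (a sketch):
# axtChain -minScore=5000 $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
#     /iscratch/i/mus/mm5/softNib $2 > $3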
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 4856s 80.94m 1.35h 0.06d 0.000 y
# IO & Wait Time: 20083s 334.71m 5.58h 0.23d 0.001 y
# Average job time: 542s 9.04m 0.15h 0.01d
# Longest job: 2929s 48.82m 0.81h 0.03d
# Submission to last job: 2929s 48.82m 0.81h 0.03d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 8m42.853s
# user 5m59.100s
# sys 0m40.320s
time chainSplit chain all.chain
# real 10m52.224s
# user 5m52.360s
# sys 0m34.870s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtChain/chain
bash # for tcsh users
for i in *.chain
do
c=${i/.chain/}
hgLoadChain hg17 ${c}_chainMm5 $i
echo done $c
done
# exit bash if you are tcsh
# This is a 50 minute job
# featureBits hg17 chainMm5
# 2834490112 bases of 2866216770 (98.893%) in intersection
# featureBits hg17 chainMm4
# 2829135227 bases of 2866216770 (98.706%) in intersection
# featureBits hg16 chainMm4
# 2828363353 bases of 2865248791 (98.713%) in intersection
# NET MM5 (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir preNet
cd chain
bash # for tcsh users
for i in *.chain
do
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
/cluster/data/mm5/chrom.sizes ../preNet/$i
done
# exit bash if you are tcsh
# 15 minute job
cd ..
mkdir n1
cd preNet
bash # for tcsh users
for i in *.chain
do
n=${i/.chain/}.net
echo primary netting $i $n
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
/cluster/data/mm5/chrom.sizes ../n1/$n /dev/null
done
# exit bash if you are tcsh
# 9 minute job
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2546110464, utime 16327 s/100, stime 3546
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
time netClass hNoClass.net hg17 mm5 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman
# real 16m38.098s
# user 11m38.490s
# sys 1m48.470s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn mouse.net > mouseSyn.net
# real 12m3.701s
# user 8m44.180s
# sys 1m1.610s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg17 netMm5 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm5 stdin
# check results
# featureBits hg17 netMm5
# 2830625630 bases of 2866216770 (98.758%) in intersection
# featureBits hg17 netMm4
# 2824272033 bases of 2866216770 (98.537%) in intersection
# featureBits hg16 netMm5
# 2823565051 bases of 2865248791 (98.545%) in intersection
# featureBits hg17 syntenyNetMm5
# 2799194300 bases of 2866216770 (97.662%) in intersection
# featureBits hg17 syntenyNetMm4
# 2785830955 bases of 2866216770 (97.195%) in intersection
# featureBits hg16 syntenyNetMm5
# 2786960572 bases of 2865248791 (97.268%) in intersection
# Add entries for net and chain to mouse/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 11m45.243s
# user 8m48.490s
# sys 1m13.490s
# extract axt's from net, and convert to maf's
# NOTE: Redo the net axt's and maf's using 8/05 netToAxt
# in order to remove overlaps (2005-08-16 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
#!/bin/csh -ef
foreach f (mouseNet/chr*.net)
set c = $f:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/mm5/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm5.
echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
end
'_EOF_'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/mm5
ssh hgwdev
mkdir /cluster/data/hg17/bed/blastz.mm5/axtBest
cd /cluster/data/hg17/bed/blastz.mm5/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# 32 minute gzip
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo -n "processing $c.axt -> ${c}_blastzBestMm5.psl ..."
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm5.psl
echo "Done"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/pslBest
for I in chr*BestMm5.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# check results
# featureBits hg17 blastzBestMm5
# 1013348528 bases of 2866216770 (35.355%) in intersection
# featureBits hg17 blastzBestMm4
# 1017319919 bases of 2866216770 (35.493%) in intersection
# featureBits hg16 blastzBestMm5
# 996722004 bases of 2865248791 (34.787%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg17/axtBest/Mm5
cd /gbdb/hg17/axtBest/Mm5
ln -s /cluster/data/hg17/bed/blastz.mm5/axtNet/chr*.axt .
cd /cluster/data/hg17/bed/blastz.mm5/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Mm5/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm5','Blastz Best in Genome','$chr','$f');" \
>>! axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql
# REDO: replace downloadable axtNet's to remove overlaps (2005-09-12 kate)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
mv axtNet axtNet.old
mkdir axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.mm5/axtNet/*.axt .
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
# HG17 TO MM5 LIFTOVER CHAIN (DONE 1/6/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset mouseNet/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain
done
rm -rf over/
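# The netChainSubset-and-concatenate pattern above repeats for each
# liftOver target below; a generic helper would look something like
# this (a sketch with a hypothetical script name, not what was run):
cat > makeOverChain.sh << '_EOF_'
#!/bin/bash
# usage: makeOverChain.sh netDir chainDir out.chain
netDir=$1; chainDir=$2; out=$3
rm -f $out
for net in $netDir/*.net; do
    chrom=`basename $net .net`
    # keep only the parts of each chrom's chains used in the net
    netChainSubset $net $chainDir/$chrom.chain stdout >> $out
done
_EOF_
chmod +x makeOverChain.sh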
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain .
gzip hg17ToMm5.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain /gbdb/hg17/liftOver/hg17ToMm5.over.chain
hgAddLiftOverChain -multiple hg17 mm5
# HG17 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.canFam1/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain .
gzip hg17ToCanFam1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain /gbdb/hg17/liftOver/hg17ToCanFam1.over.chain
hgAddLiftOverChain -multiple hg17 canFam1
# HG17 TO PANTRO1 LIFTOVER CHAIN (DONE 1/20/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain .
gzip hg17ToPanTro1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain /gbdb/hg17/liftOver/hg17ToPanTro1.over.chain
hgAddLiftOverChain -multiple hg17 panTro1
# HG17 TO RN3 LIFTOVER CHAIN (DONE 3/1/05 Andy)
#ssh kolossus
#cd /cluster/data/hg17/bed/blastz.rn3/axtChain
#mkdir over
#for file in chain/*.chain; do
# chrom=`basename $file .chain`
# netChainSubset ratNet/$chrom.net chain/$chrom.chain over/$chrom.over
# cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToRn3.chain
#done
#rm -rf over/
# Oh fancy that, there's already a hg17ToRn3.over.chain in the /cluster/data/hg17/bed/liftOver
# directory generated by Angie.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain .
gzip hg17ToRn3.over.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain /gbdb/hg17/liftOver/hg17ToRn3.over.chain
hgAddLiftOverChain -multiple hg17 rn3
# HG17 TO GALGAL2 LIFTOVER CHAIN (DONE 3/1/05 Andy)
# OK there's already a /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain file generated
# by Angie.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain .
gzip hg17ToGalGal2.over.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain /gbdb/hg17/liftOver/hg17ToGalGal2.over.chain
hgAddLiftOverChain -multiple hg17 galGal2
# HG17 TO MONDOM1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
ssh kksilo
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
netSplit human.net.gz net
ssh kolossus
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
mkdir over
for file in chain/*.chain.gz; do
chrom=`basename $file .chain.gz`
netChainSubset net/$chrom.net chain/$chrom.chain.gz over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain
done
rm -rf over/ net/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain .
gzip hg17ToMonDom1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain /gbdb/hg17/liftOver/hg17ToMonDom1.over.chain
hgAddLiftOverChain -multiple hg17 monDom1
# HG17 TO DANRER2 LIFTOVER CHAIN (DONE 3/2/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
chainSplit chain all.chain.gz
netSplit zfishdanRer2.net.gz net
mkdir over
# FAILED STEPS:
#for file in chain/*.chain; do
# chrom=`basename $file .chain`
# netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
# cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
#done
# Error:
#read 28019 of 28019 chains in chain/chr1.chain
#Processing chr1
#netChainSubset: netChainSubset.c:55: writeChainPart: Assertion `subChain != ((void *)0)' failed.
# OK instead of using the ones in the chain/ subdir, I'm using the ones in
# the chainAR/ subdir. These chain files had an additional step in the process of making
# them: Rachel used the chainAntiRepeat program.
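# (For reference, the chainAR files would have been made roughly like
# this -- a sketch; see the danRer2 make doc for the real commands:)
# chainAntiRepeat /cluster/data/hg17/nib /cluster/data/danRer2/nib \
#     chain/chr1.chain chainAR/chr1.chain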
for file in chain/*.chain; do
chrom=`basename $file .chain`
if [ $chrom = "chr1" ]; then
netChainSubset net/$chrom.net chainAR/$chrom.chain over/$chrom.over
else
netChainSubset net/$chrom.net chainAR/$chrom.chain.gz over/$chrom.over
fi
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain .
gzip hg17ToDanRer2.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain /gbdb/hg17/liftOver/hg17ToDanRer2.over.chain
hgAddLiftOverChain -multiple hg17 danRer2
# HG17 TO TETNIG1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset tetraodonNet/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain
done
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain .
gzip hg17ToTetNig1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain /gbdb/hg17/liftOver/hg17ToTetNig1.over.chain
hgAddLiftOverChain -multiple hg17 tetNig1
# HG17 TO BOSTAU1 LIFTOVER CHAIN (DONE Mar. 18, 2004, Heather)
ssh kolossus
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain
done
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain .
gzip hg17ToBosTau1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain /gbdb/hg17/liftOver/hg17ToBosTau1.over.chain
hgAddLiftOverChain -multiple hg17 bosTau1
# HG17 TO XENTRO1 LIFTOVER CHAIN (DONE 7/5/05 Andy)
ssh kolossus
cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
mkdir chain net over
chainSplit chain all.chain
netSplit human.net net
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain
done
rm -rf over/ chain/ net/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain .
gzip hg17ToXenTro1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain /gbdb/hg17/liftOver/hg17ToXenTro1.over.chain
hgAddLiftOverChain -multiple hg17 xenTro1
# ADD CHAIN AND NET TO VSMM5 AND VSRN3 DOWNLOAD AREAS (DONE 8/5/04 angie)
ssh hgwdev
cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.chain.gz
cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/mouse.net.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.net.gz
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.chain.gz
cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/rat.net.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.net.gz
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
# ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, heather)
ssh hgwdev
cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz
cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz
cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
# SWAP BLASTZ ZEBRAFISH-HUMAN (danRer1-hg17) to HUMAN-ZEBRAFISH (hg17-danRer1)
# USE RESCORED ALIGNMENTS (see makeDanRer1.doc)
# (DONE, 2004-06-22, hartera)
# CONVERT AXTs TO PSL AND LOAD INTO DATABASE (DONE, 2004-07-08, hartera)
ssh kolossus
mkdir /cluster/data/hg17/bed/blastz.danRer1.swap
cd /cluster/data/hg17/bed/blastz.danRer1.swap
# use rescored axtChrom from blastzHg17 on danRer1
set aliDir = /cluster/data/danRer1/bed/blastz.hg17
# swap the length files: hg17 (the query in aliDir) is the target here
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 19G /cluster/data/danRer1/bed/blastz.hg17/axtChrom
# 19G unsorted
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer1.swap
mkdir -p pslChrom
set tbl = "blastzDanRer1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl hg17 $f
echo "$f Done"
end
# CHAIN ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-23, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer1.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
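# NOTE: the ^V in the gap file above marks a literal tab character
# (typed as Ctrl-V Tab); the gap table fields must be tab-separated.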
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/danRer1/nib $2 > $3
'_EOF_'
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 3559s 59.32m 0.99h 0.04d 0.000 y
# IO & Wait Time: 934s 15.56m 0.26h 0.01d 0.000 y
# Average job time: 100s 1.66m 0.03h 0.00d
# Longest job: 502s 8.37m 0.14h 0.01d
# Submission to last job: 2969s 49.48m 0.82h 0.03d
# chr19.axt crashed - out of memory so try again on kolossus
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/run1
# need to use nibs on bluearc as iscratch not accessible to kolossus
cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/cluster/bluearc/hg17/bothMaskedNibs \
/cluster/bluearc/danRer1/nib $2 >& $3
'_EOF_'
chmod +x doChain2
doChain2 \
/cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/chr19.axt \
chain/chr19.chain out/chr19.out
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainDanRer1 $i
echo done $c
end
# tried minScore = 1000 and minScore = 10000 for axtChain;
# minScore = 5000 was best for reducing low-scoring chains without
# reducing the overlap with refGene CDS too much
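# The overlap checks were along these lines (a sketch, not the
# recorded commands):
# featureBits hg17 chainDanRer1Link refGene:cds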
# NET ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-24, hartera)
# REMAKE NET WITHOUT ANCIENT REPEATS (DONE, 2004-07-07, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 149086208, utime 868 s/100, stime 173
# Add classification info using db tables:
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
# netClass looks for ancient repeats in one of the databases;
# hg17 has this table - hand-curated by Arian - but it is only for
# human-rodent comparisons, so use the -noAr option
mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInHuman
mkdir -p /cluster/bluearc/hg17/linSpecRep.notInZebrafish
cp /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/* \
/cluster/bluearc/hg17/linSpecRep.notInZebrafish
cp /iscratch/i/danRer1/linSpecRep.notInHuman/* \
/cluster/bluearc/danRer1/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
# add -noAr option
# mkdir old
# mv zebrafish.net ./old/zebrafish.net.old
time netClass noClass.net hg17 danRer1 zebrafish.net \
-tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
-qNewR=/cluster/bluearc/danRer1/linSpecRep.notInHuman -noAr
# 83.410u 43.650s 3:09.94 66.8% 0+0k 0+0io 198pf+0w
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
netFilter -minGap=10 zebrafish.net | hgLoadNet hg17 netDanRer1 stdin
# EXTRACT AXT'S AND MAF'S FROM ZEBRAFISH (danRer1) NET
# (DONE, 2004-06-24, hartera) used the net where the hg17 ancient
# repeat table was used
# sorted the axts and remade the mafs, as multiz needs sorted axts
# (DONE, 2004-06-25, kate)
# Redone to fix overlaps using 8/05 axtToNet (2005-08-16 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh eieio
# create axts
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
netSplit zebrafish.net zebrafishNet
mkdir -p ../axtNet ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (zebrafishNet/chr*.net)
set c = $f:t:r
echo $c
netToAxt zebrafishNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/danRer1/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/danRer1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=danRer1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/danRer1
# BLASTZ ZEBRAFISH (danRer1) CLEAN UP (DONE, 2004-07-19, hartera)
# FURTHER CLEANUP (DONE, 2006-09-01, hartera)
ssh eieio
cd /cluster/data/hg17/bed/blastz.danRer1.swap
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &
# further cleanup (2006-09-01, hartera)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.danRer1.swap
rm -r axtNet.old axtNet.unsorted mafNet
cd axtChain
rm hist*
# remove chains and nets directories. These can be reconstructed with
# all.chain.gz and zebrafish.net.gz
rm -r old chain zebrafishNet preNet
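# reconstructing them later would be something like (a sketch):
# zcat all.chain.gz | chainSplit chain stdin
# zcat zebrafish.net.gz | netSplit stdin zebrafishNet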
rm noClass.net.gz
cd ..
rm pslChrom/psl.tab.gz
# ZEBRAFISH DANRER1 DOWNLOADS (WORKING 2004-09-17 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet
gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsDanRer1
cd vsDanRer1
cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/all.chain.gz zebrafish.chain.gz
cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/zebrafish.net.gz .
md5sum *.gz > md5sum.txt
mkdir -p axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet/*.axt.gz .
md5sum *.gz > md5sum.txt
# Copy and edit README.txt
# MAKING MOUSE SYNTENY (DONE - 2004-07-03 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm5
cd /cluster/data/hg17/bed/syntenyMm5
# Copy all the needed scripts (originally from
# /cluster/data/hg16/bed/syntenyMm3, now in hg17/bed/syntenyRn3)
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
./syntenicBest.pl -db=hg17 -table=blastzBestMm5
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm5
./synteny2bed.pl
# The five commands above
# real 209m28.161s
# user 0m21.040s
# sys 0m4.100s
# Used to load this as syntenyMm5, but that type is misleading to
# the table browser and fails the checkTableCoords check.
# Better to use an ensPhusionBlast-style type like ensRatMusHom.
# Need a new name here for the Mm5 version so it won't conflict with Rn3:
sed -e 's/ensPhusionBlast/ensRatMm5Hom/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRatMm5Hom.sql
hgLoadBed hg17 ensRatMm5Hom ucsc100k.bed -sqlTable=ensRatMm5Hom.sql
# featureBits hg17 ensRatMm5Hom
# 2649530748 bases of 2866216770 (92.440%) in intersection
# featureBits hg17 ensRatMm4Hom
# 2549307611 bases of 2866216770 (88.943%) in intersection
# featureBits hg16 syntenyMm5
# 2560252977 bases of 2865248791 (89.355%) in intersection
# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-02 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtNet
mkdir -p ../axtTight
bash # for tcsh users
for I in *.axt
do
echo $I
subsetAxt $I ../axtTight/$I \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
done
# exit bash if you are tcsh
# An 8 minute job
# translate to psl
cd ../axtTight
mkdir ../pslTight
bash # for tcsh users
for I in *.axt
do
C=${I/.axt/}
axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightMm5.psl
echo "Done: $I -> ${C}_blastzTightMm5.psl"
done
# exit bash if you are tcsh
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/pslTight
for I in chr*TightMm5.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# Compare results with previous assembly:
# featureBits hg17 blastzTightMm5
# 165862935 bases of 2866216770 (5.787%) in intersection
# featureBits hg17 blastzTightMm4
# 166569246 bases of 2866216770 (5.811%) in intersection
# featureBits hg16 blastzTightMm5
# 162641577 bases of 2865248791 (5.676%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# 4 minute gzip
# BLASTZ MM5 CLEAN UP (DONE 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5
nice rm -rf raw &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice rm axtChain/run1/chain/* &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
##############################################################################
# MAKING BLASTZ SELF (DONE - 2004-07-14 - Hiram)
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this human assembly as
# the lineage specific repeats for itself. Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz. In this case they are called notInHuman but they
# really mean InHuman. Yes, it is confusing, but that's just the
# nature of the game in this case.
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
cd /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
foreach f (../rmsk/*.fa.out)
set base = $f:t:r:r
echo $base.out.spec
ln -s $f $base.out.spec
end
# Same thing done on iscratch
# Not worried about pushing this scratch yet, it will get done
# sometime later. Using the actual /cluster/bluearc/scratch/
# location below.
ssh kk
mkdir /cluster/data/hg17/bed/blastzSelf.2004-07-01
cd /cluster/data/hg17/bed
ln -s blastzSelf.2004-07-01 blastzSelf
cd blastzSelf
cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/hg17/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastzSelf
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# you need a -maxPush=200000 on this one; it is more than the default
# push limit of 100000 jobs. Also be aware of maxQueue limits
# on the KK, may need something more than the default of 200000 if
# the KK is busy.
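# For example (a sketch):
# para push -maxPush=200000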
XXX - running 2004-07-01 11:26
##############################################################################
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2005-01-03 kate)
# swap hg16->hg17 chains
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2004-07-07 kate)
# run alignment
# NOTE: split hg16 to /iscratch/i is doc'ed in makeHg16.doc
ssh kk
cd /cluster/data/hg17
makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
hg16 /iscratch/i/gs.17/build34/liftOver/split
# Created parasol job in bed/blat.hg16.2004-07-07/run
# 1150 jobs
cd bed/blat.hg16.2004-07-07/run
para try
para check
para push
# GOT HERE
# lift results (use bash)
cd /cluster/data/hg17/bed/blat.hg16
for file in /cluster/data/hg16/nib/*.nib; do
chrom=`basename $file .nib`
liftUp -pslQ psl/$chrom.psl /cluster/bluearc/hg/gs.17/build34/liftOver/lift/$chrom.lft warn raw/chr*_${chrom}.psl
done
# There were some errors from not finding .lft files for the chr_random ones.
ssh kk9
cd ../liftOver
ln -s blat.hg16 blat.hg16.2005-01-22
makeLoChain-chain hg17 /cluster/data/hg17/nib hg16 /cluster/data/hg16/nib 2>chain.error.log >chain.log
ssh eieio
makeLoChain-net hg17 hg16
ssh hgwdev
makeLoChain-load hg17 hg16
# DROPUNDER CHAIN TO HG15 (DONE 2005-07-21 Andy)
# Split things up
ssh eieio
cd /cluster/bluearc
mkdir -p hg15/liftOver/split
cd hg15/liftOver/split/
mkdir ../lift
for c in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
echo $c
# strip the "_random" suffix and "chr" prefix to find the hg15
# subdirectory that holds this chrom's fa
num=${c%_random}
num=${num#chr}
faSplit -lift=../lift/${c}.lft size /cluster/data/hg15/${num}/${c}.fa -oneFile 3000 ${c}
done
# Move files to santest
ssh hgwdev
cd /santest/scratch
mkdir hg15
cd hg15/
cp -r /cluster/bluearc/hg15/liftOver .
# run alignment
ssh kk
cd /cluster/data/hg17
makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
hg15 /santest/scratch/hg15/liftOver/split
# Created parasol job in bed/blat.hg15.2005-07-21/run
# 2024 jobs written to batch
# *** IGNORE the batch created by the script.
ln -s bed/blat.hg15.2005-07-21 bed/blat.hg15
cd bed/blat.hg15/
mv run run.kk
mkdir run.kk9 run.kki
cd run.kk/
sed 's/\.fa\./\./g' spec > tmp; mv tmp spec
grep Un_random spec > ../run.kki/spec
grep -v Un_random spec > newspec
mv newspec spec
egrep "chr(1|19|X)(\.|_)" spec | grep -v random > ../run.kk9/spec
grep -Fv -f ../run.kk9/spec spec > newspec
mv newspec spec
wc -l spec ../run.kk9/spec ../run.kki/spec
# 1831 spec
# 147 ../run.kk9/spec
# 46 ../run.kki/spec
# 2024 total
# Checks out
# Run the thing on all 3 clusters.
para create spec
para push
#Completed: 1831 of 1831 jobs
#CPU time in finished jobs: 8556066s 142601.10m 2376.69h 99.03d 0.271 y
#IO & Wait Time: 60428s 1007.13m 16.79h 0.70d 0.002 y
#Average job time: 4706s 78.43m 1.31h 0.05d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 46724s 778.73m 12.98h 0.54d
#Submission to last job: 46725s 778.75m 12.98h 0.54d
ssh kk9
cd /cluster/data/hg17/bed/blat.hg15/run.kk9
para create spec
para push
#Completed: 147 of 147 jobs
#CPU time in finished jobs: 1698424s 28307.07m 471.78h 19.66d 0.054 y
#IO & Wait Time: 874s 14.56m 0.24h 0.01d 0.000 y
#Average job time: 11560s 192.66m 3.21h 0.13d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 31413s 523.55m 8.73h 0.36d
#Submission to last job: 31413s 523.55m 8.73h 0.36d
ssh kki
cd /cluster/data/hg17/bed/blat.hg15/run.kki
para create spec
para push
# OK I don't have para time stuff for this one, but it was the shortest
# by far.
# lift results
cd /cluster/data/hg17/bed/blat.hg15/lift.run
for chrom in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
liftUp -pslQ /cluster/bluearc/hg15/liftOver/psl/${chrom}.psl /cluster/bluearc/hg15/liftOver/lift/${chrom}.lft warn raw/chr*_${chrom}.psl
done
# Chain
# There's been some problems with store5.
ssh kk9
cd /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21
mkdir chainRun
mkdir -p /panasas/store/hg15/chainRaw
ln -s /panasas/store/hg15/chainRaw chainRaw
cd chainRun/
ls -1S ../psl/*.psl > in.lst
cat > chain.sh << "_EOF_"
#!/bin/bash
# chain the psl ($1) using the target ($2) and query ($3) nib dirs,
# writing to node-local /scratch first and copying the result ($4)
# back, to go easy on the file server
tmp=/scratch/`basename $4`
axtChain -psl $1 $2 $3 $tmp
cp $tmp $4
rm $tmp
_EOF_
chmod +x chain.sh
cat > gsub << "_EOF_"
#LOOP
./chain.sh $(path1) /scratch/hg/gs.18/build35/bothMaskedNibs /scratch/hg/gs.16/build33/chromTrfMixedNib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
_EOF_
# <<
gensub2 in.lst single gsub spec
para create spec
para push
#Completed: 44 of 44 jobs
#CPU time in finished jobs: 7448s 124.13m 2.07h 0.09d 0.000 y
#IO & Wait Time: 9591s 159.85m 2.66h 0.11d 0.000 y
#Average job time: 387s 6.45m 0.11h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1906s 31.77m 0.53h 0.02d
#Submission to last job: 1906s 31.77m 0.53h 0.02d
ssh kolossus
cd /panasas/store/hg15/chainRaw
chainMergeSort *.chain | chainSplit /scratch/andy/chain stdin
cd /scratch/andy
mkdir net over
cd chain/
for chain in *; do
c=${chain%.chain}
echo $c
chainNet $chain /cluster/store12/store5/gs.18/build35/chrom.sizes \
/cluster/store12/store5/gs.16/build33/chrom.sizes \
../net/${c}.net /dev/null
netChainSubset ../net/${c}.net $chain ../over/${c}.over
done
cd ../over/
cat * >> ../hg17ToHg15.over.chain
cd ../
cp -r hg17* over/ /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21/
cd ../
rm -rf andy/
rm -rf /panasas/store/hg15
cd /cluster/bluearc/hg15/liftOver/psl
for psl in *; do
gzip $psl
done
cd ../
# Completed: 116281 of 116281 jobs
# CPU time in finished jobs: 21807388s 363456.46m 6057.61h 252.40d 0.692 y
# IO & Wait Time: 2319383s 38656.39m 644.27h 26.84d 0.074 y
# Average job time: 207s 3.46m 0.06h 0.00d
# Longest job: 22063s 367.72m 6.13h 0.26d
# Submission to last job: 83402s 1390.03m 23.17h 0.97d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastzSelf
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 6344s 105.73m 1.76h 0.07d 0.000 y
# IO & Wait Time: 5413s 90.22m 1.50h 0.06d 0.000 y
# Average job time: 34s 0.57m 0.01h 0.00d
# Longest job: 505s 8.42m 0.14h 0.01d
# Submission to last job: 4521s 75.35m 1.26h 0.05d
# Third cluster run to convert lav's to axt's
# These self alignments do not work well as the usual third cluster job.
# Instead, a specialized job here that includes a DropSelf
# operation, and in individual lav pieces to avoid out of memory
# problems during axtSort
ssh kki
cd /cluster/data/hg17/bed/blastzSelf
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > runLavToAxt.sh
#!/bin/sh
BASE=/cluster/data/hg17/bed/blastzSelf
SEQ1_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ2_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
CHR=$1
OUT=axtChrom/$CHR.axt
cd ${BASE}/lav/${CHR}
for D in *.lav
do
smallout=$D.axt
lavToAxt $D $SEQ1_DIR $SEQ2_DIR stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
done
cat `ls -1 *.lav.axt | sort -g` > $BASE/$OUT
'_EOF_'
# << keep emacs coloring happy
chmod +x runLavToAxt.sh
cat << '_EOF_' > gsub
#LOOP
./runLavToAxt.sh $(path1) {check out line ../axtChrom/$(path1).axt}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
ls ../lav > chrList
gensub2 chrList single gsub jobList
para create jobList
para try
para push
# This is a tough load on eieio. Manageable, but the load should
# be monitored to make sure it isn't severe. I saw load averages of
# about 100 to 150.
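# A minimal way to watch it from another terminal (assuming ssh
# access to eieio):
# while true; do ssh eieio uptime; sleep 60; done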
# The chr19 job will not finish: even in parts it takes up too
# much memory, and the node it runs on ends up swapping endlessly.
# Need to go to kolossus to do chr19
para stop
para recover jobList chr19JobList
ssh kolossus
cd /cluster/data/hg17/bed/blastzSelf/run.2
time ./runLavToAxt.sh chr19
# real 43m14.797s
# user 12m56.670s
# sys 3m13.590s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf
mkdir pslChrom
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 70 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/pslChrom
bash # if a csh/tcsh user
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done: ${I}"
done
# exit bash if you are tcsh
# This is an 80 minute job
# Check results
# featureBits hg17 blastzSelf
# 252256266 bases of 2866216770 (8.801%) in intersection
# real 40m49.573s
# user 21m14.200s
# sys 2m10.420s
# featureBits hg16 blastzSelf
# 254410837 bases of 2865248791 (8.879%) in intersection
# CHAIN SELF BLASTZ (DONE - 2004-07-07 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/hg17/bed/blastzSelf/axtChain/run1
cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastzSelf/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 8519s 141.98m 2.37h 0.10d 0.000 y
# IO & Wait Time: 4795s 79.92m 1.33h 0.06d 0.000 y
# Average job time: 296s 4.93m 0.08h 0.00d
# Longest job: 2407s 40.12m 0.67h 0.03d
# Submission to last job: 3540s 59.00m 0.98h 0.04d
# chr19 did fail; retry it on kolossus:
ssh kolossus
cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
time axtChain /cluster/data/hg17/bed/blastzSelf/axtChrom/chr19.axt \
/cluster/data/hg17/nib \
/cluster/data/hg17/nib \
chain/chr19.chain > out/chr19.out
# 80 minute job, 1.5 Gb result:
# -rw-rw-r-- 1 1588795432 Jul 7 21:54 chr19.chain
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 27m38.935s
# user 23m18.540s
# sys 2m39.300s
# A 5 Gb file:
# -rw-rw-r-- 1 5267202936 Jul 7 22:23 all.chain
time chainSplit chain all.chain
# real 29m27.062s
# user 22m48.250s
# sys 1m57.910s
# optionally: rm run1/chain/*.chain
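# Optional sanity check before removing: the merged all.chain should
# have the same number of chain records as the inputs, e.g.:
# grep -c '^chain' all.chain
# cat run1/chain/*.chain | grep -c '^chain'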
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
bash # for tcsh users
for I in *.chain
do
c=${I/.chain/}
$HOME/bin/i386/hgLoadChain -normScore hg17 ${c}_chainSelf $I
echo done $c
done
# exit bash if you are tcsh
# This takes almost 3 hours to load
ssh kolossus
cd /cluster/data/hg17/bed/blastzSelf.2004-07-01
time HGDB_CONF=~/.hg.conf.read-only featureBits \
-noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 &
# real 56m34.802s
# 240976607 bases of 2851352871 (8.451%) in intersection
# featureBits hg17 chainSelf
# 682833453 bases of 2866216770 (23.824%) in intersection
# featureBits hg16 chainSelf
# 626345319 bases of 2865248791 (21.860%) in intersection
# DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
gzip chr*.chain
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
# fixup README file, request push
# NET SELF (DONE - 2004-07-13 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
mkdir preNet
cd chain
bash # for tcsh users
for I in *.chain
do
echo preNetting $I
/cluster/bin/i386/chainPreNet $I /cluster/data/hg17/chrom.sizes \
/cluster/data/hg17/chrom.sizes ../preNet/$I
done
# 23 minutes
cd ..
mkdir n1
cd preNet
for I in *.chain
do
N=${I/.chain/}.net
echo primary netting $I
/cluster/bin/i386/chainNet $I -minSpace=10 \
/cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
../n1/$N /dev/null
done
# exit bash if you are tcsh
# 5 minute job
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 206442496, utime 3009 s/100, stime 252
# memory usage 2510467072, utime 19307 s/100, stime 3181
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtChain
time netClass hNoClass.net hg17 hg17 human.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
# real 9m32.951s
# user 2m42.840s
# sys 1m23.460s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn human.net > humanSyn.net
# real 0m29.851s
# user 0m27.200s
# sys 0m2.120s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netSelf stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 syntenyNetSelf stdin
# check results
# featureBits hg17 netSelf
# 620827374 bases of 2866216770 (21.660%) in intersection
# featureBits hg16 netSelf
# 563788850 bases of 2865248791 (19.677%) in intersection
# featureBits hg15 selfNet
# 749177799 bases of 2866466359 (26.136%) in intersection
# featureBits hg17 syntenyNetSelf
# 404535376 bases of 2866216770 (14.114%) in intersection
# featureBits hg16 syntenyNetSelf
# 340871322 bases of 2865248791 (11.897%) in intersection
# Add entries for net and chain to human/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
mkdir humanNet
time netSplit human.net humanNet
# real 0m52.106s
# user 0m43.350s
# sys 0m5.170s
# extract axts from net - this should be combined with the sort and
# maf conversion below
mkdir ../axtNet
foreach n (humanNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt humanNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib \
/cluster/data/hg17/nib stdout > ../axtNet/$c.axt
echo "Complete: $c.net -> axtNet/$c.axt"
end
# sort axt's and convert to maf format
mkdir ../mafNet
foreach f (../axtNet/chr*.axt)
set c=$f:t:r
echo $c.axt
mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
rm ../axtNet/$c.unsorted.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=hg17.
end
# a 3 minute job
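# Both -tPrefix and -qPrefix are "hg17." because this is a self
# alignment; the target and query rows in the maf are distinguished
# only by their coordinates.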
XXXX - ! ! ! WE DO NOT NEED the Best and Tight tracks for Self ! ! !
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/blastzSelf/axtBest
cd /cluster/data/hg17/bed/blastzSelf/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area - XXX Do we need this for Self ?
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtNet
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
cd /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
nice gzip *.axt
nice md5sum *.gz > md5sum.txt
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestSelf.psl
echo "Done: ${c}_blastzBestSelf.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/pslBest
bash # if a csh/tcsh user
for I in chr*BestSelf.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# exit bash if you are tcsh
# check results
# featureBits hg17 blastzBestSelf
# 233978156 bases of 2866216770 (8.163%) in intersection
# featureBits hg16 blastzBestSelf
# 225819219 bases of 2865248791 (7.881%) in intersection
# MAKING HUMAN AXTTIGHT FROM AXTBEST (NOT TO BE DONE - 2004-07-13 - Hiram)
# XXXX - ! ! ! DO NOT NEED axtBest for Self alignments
# Been done anyway, Robert and Gill like to see it.
# BLASTZ SELF CLEAN UP (DONE - 2004-07-15 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
# CREATING BIG ZIPS (DONE - 2004-07-23 - Hiram)
ssh eieio
cd /cluster/data/hg17/jkStuff
time ./zipAll.sh > zipAll.out 2>&1
ssh hgwdev
# This has to be handled differently because these files
# update on a daily basis.
cd /usr/local/apache/htdocs/goldenPath/hg17/bigZips
featureBits hg17 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
rm upstream1000.fa
featureBits hg17 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
rm upstream2000.fa
featureBits hg17 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
rm upstream5000.fa
# ENCODE REGIONS (DONE 2004-07-28 kate)
ssh eieio
cd /cluster/data/hg17/bed
mkdir encodeRegions
cd encodeRegions
liftOver /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
encodeRegions.bed encodeRegions.unmapped
wc -l encodeRegions.*
# 44 encodeRegions.bed
# 0 encodeRegions.unmapped
ssh hgwdev
cd /cluster/data/hg17/bed/encodeRegions
hgLoadBed hg17 encodeRegions encodeRegions.bed -noBin
# H-INVITATIONAL GENE ANNOTATION DATABASE (WORKING 2004-07-28 kate)
# http://www.jbirc.aist.go.jp/hinv/top.html
# Create knownGene table to reference HINV gene ID's
# for link on knownGenes details page
# Also, create an HINV gene track
# download CDNA file release 1.5 (got release # from downloads page).
ssh kksilo
mkdir -p /cluster/data/hinv
cd /cluster/data/hinv
wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.1.5
# set up assembly work area
ssh eieio
cd /cluster/data/hg17
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
> accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
> ids.txt
paste accessions.txt ids.txt > queries.txt
wc -l ids.txt
# 41118 ids.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgsql hg17 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
ssh eieio
cd /cluster/data/hg17/bed/hinv
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
~kate/bin/i386/pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
# NOTE: generated with pslSelect.c v1.3 (1.4 is broken -- a test is
# set up in hg/pslSelect/tests & I requested Robert take a look)
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgLoadPsl hg17 -table=HInvGeneMrna hinv_mrna.psl
hgsql hg17 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
hgsql hg16 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg16.mrna
wc -l hg*.mrna
# 40998 hg16.mrna
# 41023 hg17.mrna
comm -1 -3 *.mrna > hg17.aligned
wc -l hg17.aligned
# 29 (transcripts newly aligned in hg17)
comm -2 -3 *.mrna > hg16.aligned
wc -l hg16.aligned
# 4 (transcripts no longer aligned in hg17)
comm -2 -3 ids.txt hg17.mrna > hg17.notaligned
wc -l hg17.notaligned
# 95 (transcripts not aligned in hg17 -- checking on why...)
# also make a table with various useful items for each transcript
ssh hgwdev
hgsql hg17 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg17/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.5 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg17
hgsql hg16 -s -e "select count(*) from HInv"
# 41118
hgsql hg17 -s -e "select count(*) from HInv"
# 41118
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# GENEID GENE PREDICTIONS (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneid
cd /cluster/data/hg17/bed/geneid
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null geneid.fa
foreach f (chr*.prot)
perl -wpe 's/^(>chr\S+)/$1.1/' $f >> geneid.fa
end
ldHgGene -gtf -genePredExt hg17 geneid *.gtf
hgPepPred hg17 generic geneidPep geneid.fa
# MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/mitopred
cd /cluster/data/hg17/bed/mitopred
wget http://mitopred.sdsc.edu/data/hum_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' hum_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
name varchar(10) not null, # SwissProt ID
confidence varchar(8) not null, # Confidence level
#Indices
PRIMARY KEY(name(6))
);
'_EOF_'
# << this line makes emacs coloring happy
hgsql hg17 < mitopred.sql
hgsql hg17 -e 'load data local infile "mitopred.tab" into table mitopred'
# NUCLEAR PROTEIN DATABASE (IN PROGRESS 7/30/04 angie)
ssh eieio
mkdir /cluster/data/hg17/bed/npd
cd /cluster/data/hg17/bed/npd
wget ftp://ftp.hgu.mrc.ac.uk/pub/npd/database.zip
unzip database.zip
# OK, it's one big .mdb (Microsoft Access DB) file.
# Googling... can buy a converter for $40... free trial .exe...
# CREATING REFFULL - DBTSS MRNA (DONE - 2004-08-02 - Hiram)
ssh eieio
mkdir /cluster/data/hg17/bed/refFull
cd /cluster/data/hg17/bed/refFull
wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/ref-full.fa.gz" .
wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/readme" .
# See also: http://dbtss.hgc.jp/index.html
# gunzip it and split the ref-full.fa file into about 50 pieces
# (faSplit can't do this job reading from a pipe, i.e. this does NOT work:
# zcat ref-full.fa.gz | faSplit sequence stdin 50 splitRefFull)
gunzip ref-full.fa.gz
faSplit sequence ref-full.fa 50 splitRefFull
gzip ref-full.fa
# copy to Iservers
ssh kkr1u00
cd /cluster/data/hg17/bed/refFull
mkdir /iscratch/i/gs.18/build35/refFull
cp -p split*.fa /iscratch/i/gs.18/build35/refFull
/cluster/bin/iSync
# no longer need these split files here
rm -f split*.fa
# run alignments on kluster
ssh kk
cd /cluster/data/hg17/bed/refFull
ls -1S /scratch/hg/gs.18/build35/maskedContigs > genome.lst
ls -1S /iscratch/i/gs.18/build35/refFull > refFull.lst
# Use BLAT to generate refFull alignments as so:
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/maskedContigs/$(path1)} {check in exists+ /iscratch/i/gs.18/build35/refFull/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
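# Note: -ooc=11.ooc makes blat skip 11-mers that are overrepresented
# in the genome, which speeds up the run and suppresses repeat hits.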
bash # if a csh/tcsh user
mkdir psl
cat genome.lst | sed -e "s/.fa//" | while read C
do
mkdir psl/${C}
done
# exit bash if you are tcsh
gensub2 genome.lst refFull.lst gsub jobList
para create jobList
# 18240 jobs written to batch
para try
para check
para push ... etc ...
# Completed: 18240 of 18240 jobs
# CPU time in finished jobs: 37011s 616.85m 10.28h 0.43d 0.001 y
# IO & Wait Time: 62630s 1043.84m 17.40h 0.72d 0.002 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 51s 0.85m 0.01h 0.00d
# Submission to last job: 850s 14.17m 0.24h 0.01d
# Process refFull alignments into near best in genome.
ssh eieio
cd /cluster/data/hg17/bed/refFull
pslSort dirs raw.psl tmp psl/*
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 \
-nearTop=0.001 raw.psl contig.psl /dev/null
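# pslReps keeps near-best-in-genome alignments: -minCover=0.2 and
# -minAli=0.965 drop fragmentary or low-identity hits, while
# -nearTop=0.001 keeps only alignments within 0.1% of the best
# score for each query.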
liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft warn contig.psl
pslSortAcc nohead chrom tmp all_refFull.psl
pslCat -dir chrom > refFullAli.psl
# Load refFull alignments into database
ssh hgwdev
cd /cluster/data/hg17/bed/refFull
hgLoadPsl hg17 -tNameIx refFullAli.psl
# VAR_MULTIZ HG17/MM5/RN3/GALGAL2/FR1 (acs 2004-08-12)
# This is a new, experimental version of multiz written by Minmei at
# PSU and sent by e-mail from Webb. This version allows for a
# progressive alignment strategy (i.e., alignment construction in a
# post-order traversal of the tree) using only pairwise alignments of
# each sequence with the reference sequence and without any need for
# "staging". Here's a little blurb about it from the header of
# var_multiz.v3.c.
# var_multiz.v3.c
#
# Variant to multiz program. It aligns two files of
# alignment blocks where top row is always the reference,
# assuming blocks are increasing ordered based on the
# start position on the reference sequence. Single-coverage
# on reference is required at this stage.
#
# Four arguments are required: char* arg1, char* arg2,
# int arg3, int arg4. arg1 and arg2 are two files need
# to be aligned together. The alignment of reference in
# two files are either fixed or not, determined from
# arguments arg3 and arg4. arg3 and arg4 are either 1
# or 0, but cannot be 1 at the same time. 1 means
# reference is fixed. v1 and v2 cannot be both 1.
# ...
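# For example, the first step in doVarMultiz.csh below combines the
# human-referenced rat and mouse pairwise mafs with neither
# reference fixed (arg3=0 arg4=0), roughly:
# var_multiz rn3/chr1.maf mm5/chr1.maf 0 0 > chr1.tmp1.maf
# (chr1 is illustrative; the script does this per chromosome.)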
mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12
# unpack source and compile
cp /cluster/home/acs/var_multiz.tar.gz /cluster/data/hg17/bed/var_multiz.2004-08-12
cd /cluster/data/hg17/bed/var_multiz.2004-08-12
tar xfz var_multiz.tar.gz
cd var_multiz_source
make
# NOTE (8/14): this version of the source is already out of date!
# Source is now checked in under hg3rdParty and updated binaries
# are being kept under /cluster/bin/penn/var_multiz
# script for creating the 5-way alignments for a given chromosome
# (acs, 8/20/04) below revised after e-mail exchange with Minmei
cat << '_EOF_' > doVarMultiz.csh
#!/bin/csh -fe
set chr = $1 # may include _random or _hla_hap[12]
set REF = hg17.$chr
set RAT = /cluster/bluearc/hg17/multiz8way/rn3/$chr.maf
set MOUSE = /cluster/bluearc/hg17/multiz8way/mm5/$chr.maf
set CHICKEN = /cluster/bluearc/hg17/multiz8way/galGal2/$chr.maf
set FISH = /cluster/bluearc/hg17/multiz8way/fr1/$chr.maf
set DEST = /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/$chr.maf
set VMZ = /cluster/bin/penn/var_multiz
set PROJECT = /cluster/bin/penn/var_multiz.2004.08.12/maf_project
mkdir -p $DEST:h
if ( -s $RAT && -s $MOUSE ) then
echo "Aligning $RAT $MOUSE..."
$VMZ $RAT $MOUSE 0 0 > /scratch/$chr.tmp1.maf
echo "Projecting on $REF..."
$PROJECT /scratch/$chr.tmp1.maf $REF > /scratch/$chr.hrm.maf
else if ( -s $RAT ) then
cp $RAT /scratch/$chr.hrm.maf
else if ( -s $MOUSE ) then
cp $MOUSE /scratch/$chr.hrm.maf
endif
if ( -s $CHICKEN && -s /scratch/$chr.hrm.maf ) then
echo "Adding $CHICKEN..."
$VMZ /scratch/$chr.hrm.maf $CHICKEN 1 0 > /scratch/$chr.tmp2.maf
echo "Projecting on $REF..."
$PROJECT /scratch/$chr.tmp2.maf $REF > /scratch/$chr.hrmc.maf
else if ( -s $CHICKEN ) then
cp $CHICKEN /scratch/$chr.hrmc.maf
else if ( -s /scratch/$chr.hrm.maf ) then
cp /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
endif
if ( -s $FISH && -s /scratch/$chr.hrmc.maf ) then
echo "Adding $FISH..."
$VMZ /scratch/$chr.hrmc.maf $FISH 1 0 > /scratch/$chr.tmp3.maf
echo "Projecting on $REF..."
$PROJECT /scratch/$chr.tmp3.maf $REF > $DEST
else if ( -s $FISH ) then
cp $FISH $DEST
else if ( -s /scratch/$chr.hrmc.maf ) then
cp /scratch/$chr.hrmc.maf $DEST
endif
echo "Done."
rm /scratch/$chr.tmp[123].maf /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
'_EOF_'
# << keep emacs coloring happy
chmod 755 doVarMultiz.csh
for file in `find /cluster/bluearc/hg17/multiz8way/rn3 /cluster/bluearc/hg17/multiz8way/mm5 /cluster/bluearc/hg17/multiz8way/galGal2 /cluster/bluearc/hg17/multiz8way/fr1 -name "chr*.maf"` ; do echo `basename $file .maf` ; done | sort -u > chrlist
rm -f jobs.lst
for chr in `cat chrlist` ; do echo "doVarMultiz.csh $chr" >> jobs.lst ; done
# run cluster job
ssh kk ; cd /cluster/data/hg17/bed/var_multiz.2004-08-12; para create jobs.lst ; para try ; para push
# (etc.)
# Completed: 46 of 46 jobs
# CPU time in finished jobs:      71302s    1188.36m    19.81h    0.83d  0.002 y
# IO & Wait Time:                  1162s      19.37m     0.32h    0.01d  0.000 y
# Average job time:                1575s      26.26m     0.44h    0.02d
# Longest job:                     6353s     105.88m     1.76h    0.07d
# Submission to last job:          6362s     106.03m     1.77h    0.07d
# for now just create an ordinary maf track (conservation later)
rm -rf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
mkdir -p /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
ln -s /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
/cluster/bin/i386/hgLoadMaf hg17 -warn varMultizMm5Rn3GalGal2Fr1 -pathPrefix=/gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
chmod 775 /gbdb/hg17/varMultiz /gbdb/hg17/varMultiz/maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 /cluster/data/hg17/bed/var_multiz.2004-08-12 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf
chmod 664 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf
# trackDb entry
# track varMultizMm5Rn3GalGal2Fr1
# shortLabel varMultiz5Way
# longLabel Human/Mouse/Rat/Chicken/Fugu Var-Multiz
# group compGeno
# priority 190
# visibility hide
# type maf
# elephant human blastz alignment by Robert Aug 11 2004
mkdir /cluster/bluearc/elephant
cd /cluster/bluearc/elephant
#get reads and qual scores from trace repository
for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/fasta.loxodonta_africana.$i.gz ; done
for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/qual.loxodonta_africana.$i.gz ; done
for i in `cat trace.lst` ; do zcat fasta.loxodonta_africana.$i.gz > loxodonta_africana.$i.fa ; done
#trim reads
for i in `cat trace.lst` ; do nice gunzip -c qual.loxodonta_africana.$i.gz > qual.loxodonta_africana.$i; faTrimRead loxodonta_africana.$i.fa qual.loxodonta_africana.$i tmp.$i.fa lift.$i.lft; mv -f tmp.$i.fa loxodonta_africana.$i.fa ; rm -f qual.loxodonta_africana.$i ; done
for i in `cat trace.lst`; do faSize -detailed=on loxodonta_africana.$i.fa > mac.$i.len ; done
cat mac.0*.len > S2.len
for i in `cat trace.lst`; do sed -e s/S2.len/mac.$i.len/ < DEF > DEF.$i ; done
#split fa reads into 10mb chunks for blastz run and distribute to i-servers.
ssh kkr1u00
for i in `cat trace.lst`; do nice faSplit about loxodonta_africana.$i.fa 10000000 /iscratch/i/elephant/${i}.mac. ; done
cd /iscratch/i/elephant
find split -name \*.fa > /cluster/bluearc/elephant/mac.lst
cd /cluster/bluearc/elephant
hgsql hg17 -N < chromLen.sql > S1.len
cd /iscratch/i/elephant
iSync
#setup cluster run to blastz reads to human genome
ssh kk
cd /cluster/bluearc/elephant
BlastZ_run0.sh
cd run.0
para create jobList
para push
#94798 jobs in batch
#149 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 94797 of 94798 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 14183153s 236385.89m 3939.76h 164.16d 0.450 y
#IO & Wait Time: 310938s 5182.30m 86.37h 3.60d 0.010 y
#Average job time: 153s 2.55m 0.04h 0.00d
#Longest job: 1770s 29.50m 0.49h 0.02d
#Submission to last job: 52186s 869.77m 14.50h 0.60d
ssh kk9
BlastZ_run1.sh
cd run.1
para create jobList
para push
#341 jobs in batch
#151 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs: 142914s 2381.91m 39.70h 1.65d 0.005 y
#IO & Wait Time: 14078s 234.63m 3.91h 0.16d 0.000 y
#Average job time: 460s 7.67m 0.13h 0.01d
#Longest job: 782s 13.03m 0.22h 0.01d
#Submission to last job: 1954s 32.57m 0.54h 0.02d
#generate lst and fa files for each chromosome for faster lavToAxt
cd /cluster/bluearc/elephant
echo "select chrom from chromInfo;" > chrom.sql
hgsql hg17 -B -N < chrom.sql > chrom.lst
for i in `cat chrom.lst` ; do grep -h '>' lav/$i/* | awk '{print $1}' | sed -e 's/"//g' | sed -e 's/>//g' > mac.$i.lst ; echo $i ; done
ssh kki
cd /cluster/bluearc/elephant
mkdir -p splitChrom
/bin/rm splitChrom/*
gensub2 trace.lst chrom.lst gsub.split spec.split
para create spec.split
para push
#322 jobs in batch
#Checking finished jobs
#Completed: 322 of 322 jobs
#CPU time in finished jobs: 819s 13.65m 0.23h 0.01d 0.000 y
#IO & Wait Time: 3278s 54.63m 0.91h 0.04d 0.000 y
#Average job time: 13s 0.21m 0.00h 0.00d
#Longest job: 51s 0.85m 0.01h 0.00d
#Submission to last job: 462s 7.70m 0.13h 0.01d
cd /cluster/bluearc/elephant/splitChrom
for i in `cat /cluster/bluearc/elephant/chrom.lst` ; do cat mac.*.$i.fa > mac.$i.fa ; echo $i ; done
#lav to axt run
ssh kk
cd /cluster/bluearc/elephant
mkdir -p run.2
#NOTE: chr19 must be run on kolossus with 64bit executables
#change SEQ1_DIR from /scratch to /iscratch for mini cluster
. DEF
echo "#LOOP" > run.2/gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/elephant/splitChrom/mac.$(root1).fa' >> run.2/gsub
echo "#ENDLOOP" >> run.2/gsub
cd run.2
gensub2 ../chrom.lst single gsub jobList
para create jobList
para push
#chrM has no data and crashed
#46 jobs in batch
#Checking finished jobs
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 249970s 4166.17m 69.44h 2.89d 0.008 y
#IO & Wait Time: 5407s 90.11m 1.50h 0.06d 0.000 y
#Average job time: 5675s 94.58m 1.58h 0.07d
#Longest job: 27065s 451.08m 7.52h 0.31d
#Submission to last job: 47744s 795.73m 13.26h 0.55d
#split reads by prefix so axtBest will fit in memory
mkdir axtByQ
cat mac*.lst | awk -F\| '{print substr($3,1,3)}' | sort -nu >prefix.lst
for i in `cat prefix.lst` ; do cat axtChrom/*.axt | axtFilter -qStartsWith=gnl\|ti\|$i stdin | axtSwap stdin S1.len S2.len stdout | axtSort stdin axtByQ/q$i.axt ; done
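# Read names look like gnl|ti|123456789; keying on the first three
# digits of the trace id (third |-separated field) splits the
# alignments into per-prefix files small enough for axtBest.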
mkdir axtByQBest
#lots of memory needed for reciprocal best
ssh kolossus
cd /cluster/bluearc/elephant/axtByQ
for i in `ls *.axt` ; do axtBest -quiet $i all stdout | axtSwap stdin ../S2.len ../S1.len ../axtByQBest/$i ; echo $i done ; done
cd /cluster/bluearc/elephant/axtByQBest
cat q*.axt | axtSplitByTarget stdin .
cd ..
mkdir axtRecipBest
for i in `cat chrom.lst` ; do axtSort axtByQBest/$i.axt stdout | axtBest stdin $i axtRecipBest/$i.axt ; echo $i ;done
for i in `cat chrom.lst` ; do axtToMaf axtRecipBest/$i.axt S1.len S2.len maf/$i.maf -tPrefix=hg17. -qPrefix=rm1. -scoreZero ; done
for i in `cat chrom.lst` ; do mafFilter -minScore=1000 maf/$i.maf > mafFilter/$i.maf ; done
# record coverage
cd mafFilter
for i in `cat ../chrom.lst` ; do nice mafCoverage hg17 $i.maf -count=2 > $i.cov ; echo done $i ; done
# CHIMP DELS FROM HG16 (DONE 2004-08-17 kate)
# NOTE: this track just for development -- it should be regenerated from the latest
# alignments instead of lifted.
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir -p chimpDels
cd chimpDels
hgsql -s hg16 -e "SELECT * FROM chimpDels" | cut -f 2- > chimpDels.hg16.bed
liftOver chimpDels.hg16.bed \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
chimpDels.bed chimpDels.unmapped
wc -l chimpDels.bed chimpDels.unmapped
# 27662 chimpDels.bed
# 132 chimpDels.unmapped
# 27794 total
hgLoadBed hg17 chimpDels chimpDels.bed -noBin
### CREATE chimpFixedDiff -- panTro1 (Daryl, August 18, 2005)
# Convert chimp quality scores from uncompressed to compressed
# chromosome format. This took 22 minutes on crow.
## previously done for hg16
# cd /cluster/data/panTro1
# cat */chr*.qa | qaToQac stdin chrom.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg17/bed
mkdir chimpFixedDiff
cd chimpFixedDiff
sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql
# chimpHiQualDiffs was changed to allow different
# quality parameters as command line options
set axtDir = /cluster/data/hg17/bed/blastz.panTro1/axtRBestNet
# This crashed twice at the same place, but ran successfully when
# each chromosome was run separately.
## time chimpFixedDiffs /$axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
mkdir chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
# rmdir chr*random
touch cfd.log
foreach f (chr*)
echo -n $f " "
ln -s $axtDir/$f.axt $f/$f.axt
time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
end
rm ../chimpFixedDiffs.bed
cat chr*bed > ../chimpFixedDiffs.bed
## The load (sort) ran out of memory on hgwdev, so I sorted the
## file first on kolossus (3 minutes) and then loaded it on hgwdev
ssh kolossus
hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg17 chimpFixedDiff chimpFixedDiffs.bed
exit
## hgwdev (37 minutes)
hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg17 chimpFixedDiff bed.tab
# TODO: need to filter out polymorphic sites (SNPs)
# Load firstEF track (DONE 2004-08-18 braney)
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/firstEF
cd /cluster/data/hg17/bed/firstEF
wget "http://bioinformatics.med.ohio-state.edu/downloads/firstEFMay04.bed.gz"
cat << '_EOF_' > sedScript
s/chr23/chrX/g
s/chr24/chrY/g
/^>/d
/^$/d
/^No/d
'_EOF_'
zcat firstEFMay04.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed
hgLoadBed hg17 firstEF firstEF.bed
rm firstEF.tab
gzip *.bed
#done firstEF
# GENE BOUNDS (RNACLUSTER) (DONE 08-18-2004 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd ~sugnet/store1/altSplice/hg17/
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs
foreach f (/cluster/data/hg17/nib/chr*.nib)
set c = $f:t:r
set out = chrom/$c.bed
# Exclude accessions in the RAGE file
echo clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
end
hgLoadBed hg17 rnaCluster chrom/*.bed
mkdir /cluster/data/hg17/bed/rnaCluster
cp -r chrom /cluster/data/hg17/bed/rnaCluster
# miRNA track (CORRECTION 2004-12-09 - Hiram)
# Received the following correction from Michel Weber:
# Could you please replace the two lines:
# chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
# chr6 72170017 72170040 hsa-mir-30a-5p 480 - 72170017 72170040
# by:
# chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
# chr6 72169974 72170045 hsa-mir-30a 480 - 72170017 72170040
# (The first line remains identical, only the second is changed. The
# repetition of the hsa-mir-30a entry means that both strands of its
# hairpin structure are matured into microRNAs, named hsa-miR-30a-3p and
# hsa-miR-30a-5p in Rfam database).
ssh hgwdev
cd /cluster/data/hg17/bed/miRNA
mv miRNA_hg17_1.bed miRNA_hg17_1.bed.0
cp miRNA_hg17_1.bed.0 miRNA_hg17_1.bed
# edit miRNA_hg17_1.bed to change the single line. Then:
mv hg17.bed hg17.bed.0
egrep -v "^track |^browser " miRNA_hg17_1.bed | \
sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
# Check that the edit is in place properly:
diff hg17.bed.0 hg17.bed
# and load it
hgLoadBed hg17 miRNA hg17.bed
# Loaded 221 elements of size 8
# featureBits remains the same:
featureBits hg17 miRNA
# 18052 bases of 2866216770 (0.001%) in intersection
# miRNA track (DONE 2004-09-03 - Hiram)(CORRECTED, see above 2004-12-09)
# The source data for this was received via email from Sam
# Griffiths-Jones to Donna 16 August 2004. In other email Michel
# Weber asked to add one more data line to that file.
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
cd /cluster/data/hg17/bed
mkdir miRNA
cd miRNA
# one name was missing the h in hsa-mir and one was miR instead of
# mir
egrep -v "^track |^browser " miRNA_hg17_1.bed | \
sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
hgLoadBed hg17 miRNA hg17.bed
# compare with previous results, should be relatively similar
# featureBits hg16 miRNA
# 16923 bases of 2865248791 (0.001%) in intersection
# featureBits hg17 miRNA
# 18052 bases of 2866216770 (0.001%) in intersection
# entry is already in trackDb/trackDb.ra
## blastz mRNA track for internal use - Robert 8/12/04
mkdir /cluster/bluearc/hg17/mrnaBlastz
cd /cluster/bluearc/hg17/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg -native
mkdir -p split
faTrimPolyA mrna.fa trim.fa
faSplit about trim.fa 1000000 split/mrna
cp -ip trim.fa /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz
faSize trim.fa -detailed=on > S2.len
hgsql hg17 < chromInfo.sql > S1.len
BlastZ_run0.sh
cd run.0
para push
para time
#113894 jobs in batch
#207911 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 113894 of 113894 jobs
#CPU time in finished jobs: 14423845s 240397.41m 4006.62h 166.94d 0.457 y
#IO & Wait Time: 334352s 5572.54m 92.88h 3.87d 0.011 y
#Average job time: 130s 2.16m 0.04h 0.00d
#Longest job: 38301s 638.35m 10.64h 0.44d
#Submission to last job: 59841s 997.35m 16.62h 0.69d
mkdir run.1
~angie/hummus/do.out2lav DEF > run.1/j
cd run.1
para create j
para push
para time
#341 jobs in batch
#208550 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs: 28990s 483.17m 8.05h 0.34d 0.001 y
#IO & Wait Time: 43139s 718.98m 11.98h 0.50d 0.001 y
#Average job time: 212s 3.53m 0.06h 0.00d
#Longest job: 2015s 33.58m 0.56h 0.02d
#Submission to last job: 2187s 36.45m 0.61h 0.03d
#!/bin/tcsh
set base="/cluster/bluearc/hg17/mrnaBlastz"
cd $base
mkdir -p pslRaw
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=$base/pslRaw/$chr.psl
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| lavToPsl stdin stdout \
| sed -e 's@scratch/hg/gs.18/build35/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out
popd
end
mkdir run.2
for i in `awk '{print $1}' S1.len` ; do echo doSortFilter.sh ../pslRaw/$i.psl ../pslFilter/$i.psl >> run.2/spec.dup ; done
cd run.2
para create spec.dup
para push
#46 jobs in batch
#3 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 46 of 46 jobs
#CPU time in finished jobs: 4409s 73.48m 1.22h 0.05d 0.000 y
#IO & Wait Time: 1082s 18.04m 0.30h 0.01d 0.000 y
#Average job time: 119s 1.99m 0.03h 0.00d
#Longest job: 3842s 64.03m 1.07h 0.04d
#Submission to last job: 3842s 64.03m 1.07h 0.04d
cd ..
for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.18/build35/bothMaskedNibs/ -faQ /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa chain/$i.chain >> spec.chain ; done
para create spec.chain
para push
cd run.3
para create spec.filter
para push
cd ..
ls /cluster/data/hg17/nib/*.nib > S1.lst
#Skip chainPreNet it is not good for mrna
#mkdir -p preNet
#
#cd chainFilter
#foreach i ( *.chain)
#chainPreNet $i ../S1.len ../S2.len ../preNet/$i
#end
mkdir run.4
cd run.4
for i in `awk '{print $1}' ../S1.len`; do echo "chainToPsl ../chainFilter/$i.chain ../S1.len ../S2.len ../S1.lst /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa ../psl/$i.psl" >> spec.chain2psl.new ; done
pslCat psl/*psl > mrnaBlastz.psl
hgLoadPsl hg17 mrnaBlastz.psl
cp trim.fa /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa
ln /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa /gbdb/hg17/mrnaBlastz/ -s
hgLoadSeq -prefix=bz hg17 /gbdb/hg17/mrnaBlastz/hg17Mrna.fa
## end of blastz Mrna track
#### BUILD RETROGENE TRACK (done Robert 8/26/2004)
#### REBUILD RETROGENE TRACK (done Robert 12/24/2004 - but no notes - kuhn)
# diffs before push to beta:
# 1640 hg17.pseudoGeneLink.devOnly
# 9639 hg17.pseudoGeneLink.betaOnly
# 15091 hg17.pseudoGeneLink.common
# RETROGENE TRACK data update - Robert - 2005-04-08
# (added by Jen 2006-01-31)
# - pushQ entry did not include pseudoMrna table. Old table is still
#   present on RR. New data has since been lost on dev.
#   User impact: ~1000 sequences missing links in browser
# - new all.joiner rule needed to link pseudoMrna to pseudoGeneLink table
# - current all.joiner rule between knownGene and pseudoGeneLink gives errors.
#   the data types appear to be mismatched. pseudoGeneLink.kgName is
#   a gene symbol, not the same identifier as in knownGene.name
# - data is to be regenerated soon and errors corrected at that time
mkdir /cluster/data/hg17/bed/pseudo
cd /cluster/data/hg17/bed/pseudo
ls /cluster/data/hg17/nib/*.nib > S1.lst
hgsql hg17 -N -B < allMrna.sql > allMrna.lst
cp /cluster/data/genbank/data/aligned/genbank.142.0/hg17/full/mrna.native.psl.gz .
gunzip mrna.native.psl.gz
awk '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl > mrnaBlat.psl
hgsql hg17 -N -B < refGene.sql > refGene.tab
hgsql hg17 -B -N < mgcGene.sql > mgcGene.tab
cat ../../*/*.fa.out | awk '$5~/^chr/{OFS="\t";print $5,$6,$7}' >rmsk.bed
cd /cluster/bluearc/hg17/mrnaBlastz/
zcat /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseSyn.net.gz | netToBed stdin mouseSyn.bed
hgsql hg17 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
ssh eieio
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg17/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' > blatBlastzHg17.psl
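# The first awk appends a crude score column (3*matches - misMatches,
# psl fields $1 and $2) so alignments sort best-first within each
# qName group (-k 10,10 -k 22nr); the second awk strips the extra
# column back off.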
ssh hgwdev
cp blatBlastzHg17.psl /scratch/
tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}' /cluster/data/kgDB/bed/hg17/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
#copy files to iServers for cluster run
ssh kkr1u00
/cluster/home/baertsch/bin/i386/pslSplit nohead -chunkSize=121 /iscratch/i/gs.18/build35/pseudo blatBlastzHg17.psl
cd /cluster/data/hg17/bed/pseudo
cp refGene.tab /iscratch/i/gs.18/build35/pseudo
cp /cluster/data/hg17/bed/simpleRepeat/simpleRepeat.bed /iscratch/i/gs.18/build35/pseudo
cp mrnaHg17.fa /iscratch/i/gs.18/build35/pseudo
cp sortedKnownGene.tab /iscratch/i/gs.18/build35/pseudo
cp rmsk.bed /iscratch/i/gs.18/build35/pseudo
cp all_mrna.psl /iscratch/i/gs.18/build35/pseudo
cp mouseSyn.bed /iscratch/i/gs.18/build35/pseudo
for i in `ls tmp*` ; do echo "doBuildkk.sh ${i%%.psl}" ; done | sed -e 's/tmp//g' > ~/hg17/pseudo/spec.kk
cd /iscratch/i/gs.18/build35/pseudo
iSync
para create spec.kk
para push
#post process
# run from eieio
BLUE=/cluster/bluearc/hg17/pseudo
echo catting output
cat $BLUE/pseudoGeneLink[0-9]*.bed | sort -k1,1 -k2,3n >pseudoGeneLinkSort.bed ; /bin/rm $BLUE/pseudoGeneLink[0-9]*.bed
cat $BLUE/pseudo[0-9]*.psl > pseudo.psl ; /bin/rm $BLUE/pseudo[0-9]*.psl &
echo Filtering pseudoGeneLinkSort.bed
tawk '$5 > 10 && $15 > 10000 && $35 > 650 {OFS="\t";print $0}' pseudoGeneLinkSort.bed > pseudoGeneLinkSortFilter.bed
echo Removing Overlaps
doSplit
cd /cluster/bluearc/hg17/pseudo/run.o
spec.overlap
cd ~/hg17/pseudo
cat /cluster/bluearc/hg17/pseudo/chr*pseudoNoOverlap.bed > pseudoGeneLinkNoOverlap.bed
echo Making psl
awk '{printf("%s\t%s\t%s\n", $5,$2,$3)}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkSelect.tab
## 350 is the sacred magic number and will probably change
tawk '$6>=350{print $0}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkNoOverlapFilter.bed
pslSelect -qtStart=pseudoGeneLinkSelect.tab pseudo.psl pseudoMrna.psl
echo Loading Bed
hgLoadBed hg17 pseudoGeneLink pseudoGeneLinkNoOverlapFilter.bed -hasBin -sqlTable=/cluster/home/baertsch/kent/src/hg/lib/pseudoGeneLink.sql
echo Loading Psl
hgLoadPsl hg17 pseudoMrna.psl
## end of retroGene track
# 3-WAY MULTIZ MULTIPLE ALIGNMENT (MM5, RN3) (DONE 2004-08-27 kate)
# HMR Maf's needed for regulatory potential track
ssh eieio
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
mkdir -p $workingDir
mkdir -p /cluster/data/hg17/bed/$multizDir
cd /cluster/data/hg17/bed
ln -s $workingDir /cluster/bluearc/hg17/multiz3way
ln -s $multizDir multiz3way
cd $multizDir
# wrapper script for multiz
# NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
# NOTE: next time, modify script so it only needs one arg -- saves the
# multiple dirname in a file for use by the next run
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
ssh eieio
set workingDir = /cluster/bluearc/hg17/multiz.2004-08-27
# copy mafs to bluearc -- mouse
mkdir $workingDir/mm5
cp /cluster/data/hg17/bed/blastz.mm5/mafNet/*.maf \
$workingDir/mm5
ls $workingDir/mm5/*.maf > chrom.lst
# rat
mkdir $workingDir/rn3
cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
# multiz - add in rn3 rat to human/mouse
#
ssh kki
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
cd /cluster/data/hg17/bed/$multizDir
mkdir run.rn3
cd run.rn3
echo "rn3/mm5" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 47 jobs
para try, check, push, check
# copy 3-way mafs to build directory
ssh eieio
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir/mm5rn3 $workingDir/maf
cd /cluster/data/hg17/bed/multiz.2004-08-27
mkdir maf
cp $workingDir/maf/*.maf maf
# BLASTZ TETRAODON (tetNig1) (DONE, 2004-08-26, hartera)
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon
foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon/$f:t:r:r.out.spec
end
mkdir /iscratch/i/tetNig1/linSpecRep.notInHuman
foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/tetNig1/linSpecRep.notInHuman/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20
ln -s /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20 \
/cluster/data/hg17/bed/blastz.tetNig1
cd /cluster/data/hg17/bed/blastz.tetNig1
# abridge repeats.
# Treat all repeats as lineage-specific.
cat << '_EOF_' > DEF
# human vs. Tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
# use same parameters as for danRer1-fr1
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.notInTetraodon
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon
SEQ2_DIR=/iscratch/i/tetNig1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.tetNig1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.hg17-tetNig1.2004-08-20
# make sure BlastZ_run0.sh, BlastZ_run1.sh and BlastZ_run2.sh scripts
# are in /cluster/data/hg17/jkStuff
# edit BlastZ_run0.sh so directory for blastz is /cluster/bin/penn
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 19437 of 19437 jobs
# CPU time in finished jobs: 3225816s 53763.60m 896.06h 37.34d 0.102 y
# IO & Wait Time: 174096s 2901.60m 48.36h 2.01d 0.006 y
# Average job time: 175s 2.92m 0.05h 0.00d
# Longest job: 709s 11.82m 0.20h 0.01d
# Submission to last job: 5324s 88.73m 1.48h 0.06d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, check etc.
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 280s 4.66m 0.08h 0.00d 0.000 y
# IO & Wait Time: 2183s 36.39m 0.61h 0.03d 0.000 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest job: 41s 0.68m 0.01h 0.00d
# Submission to last job: 469s 7.82m 0.13h 0.01d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 52s 0.87m 0.01h 0.00d 0.000 y
# IO & Wait Time: 256s 4.27m 0.07h 0.00d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 36s 0.60m 0.01h 0.00d
# Submission to last job: 275s 4.58m 0.08h 0.00d
# one job crashed because chr6_hla_hap1.axt is empty. Checked by running
# this again and then looking at the lav file, which has no alignments in it.
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p pslChrom
set tbl = "blastzTetNig1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f
echo "$f Done"
end
# original blastzTetNig1:
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
# featureBits -chrom=chr1 hg17 blastzTetNig1
# 6378680 bases of 222827847 (2.863%) in intersection
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1 -enrichment
# refGene:cds 1.246%, blastzTetNig1 2.863%, both 0.856%, cover 68.70%,
# enrich 24.00x
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment
# refGene:cds 1.246%, blastzDanRer1 3.934%, both 0.831%, cover 66.72%,
# enrich 16.96x
# comparable to zebrafish so good
# try same parameters with L=8000
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1L8k -enrichment
# refGene:cds 1.246%, blastzTetNig1L8k 2.095%, both 0.753%, cover 60.47%,
# enrich 28.87x
# load chr1 with blastz using just H=2000 and default parameters
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1Default -enrichment
# refGene:cds 1.246%, blastzTetNig1Default 1.630%, both 0.808%, cover 64.87%,
# enrich 39.80x
# rows in chr1_blastzTetNig1 tables
# blastzTetNig1 95156
# blastzTetNig1L8k 58015
# blastzTetNig1Default 71342
# The default values also used for danRer1 vs fugu give good coverage and
# higher enrichment than blastzTetNig1 with fewer alignments, so this will
# be used for the blastz track - now called blastzTetNig1.
# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# Make chains with rescored blastz
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
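# In this table the position row gives gap-length breakpoints, and
# qGap/tGap/bothGap give the gap penalties at each breakpoint; the
# penalties grow slowly for long gaps, which is what encourages the
# longer chains mentioned above.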
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 553s 9.22m 0.15h 0.01d 0.000 y
# IO & Wait Time: 102s 1.69m 0.03h 0.00d 0.000 y
# Average job time: 15s 0.24m 0.00h 0.00d
# Longest job: 56s 0.93m 0.02h 0.00d
# Submission to last job: 985s 16.42m 0.27h 0.01d
# one job crashed since chr6_hla_hap1.axt is empty - no alignments
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# only chr19 has a very large number of chains with score < 5000
# load chr 1 into table
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
hgLoadChain hg17 chr1_chainTetnig1 chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainTetnig1Link -enrichment
# refGene:cds 1.246%, chainTetnig1Link 1.582%, both 0.805%, cover 64.59%,
# enrich 40.83x
# try filtering with minScore of 5000
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm -r chain
chainSplit chain all.chain
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
hgLoadChain hg17 chr1_chainTetNig1Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainTetNig1Filt5kLink -enrichment
# refGene:cds 1.246%, chainTetNig1Filt5kLink 1.487%, both 0.789%, cover 63.33%,
# enrich 42.58x
# this cleans it up a lot with little reduction in coverage.
# check in browser - filtered version looks good.
# add all chains for minScore=5000 filtered chains
# remove test chain tables for chr1
ssh hgwdev
hgsql -e "drop table chr1_chainTetnig1;" hg17
hgsql -e "drop table chr1_chainTetnig1Link;" hg17
hgsql -e "drop table chr1_chainTetNig1Filt5k;" hg17
hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" hg17
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainTetNig1 $i
echo done $c
end
# NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 55373824, utime 415 s/100, stime 45
# Add classification info using db tables:
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/hg17/linSpecRep.notInTetraodon
mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInHuman
cp /iscratch/i/hg17/linSpecRep.notInTetraodon/* \
/cluster/bluearc/hg17/linSpecRep.notInTetraodon
cp /iscratch/i/tetNig1/linSpecRep.notInHuman/* \
/cluster/bluearc/tetNig1/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
time netClass noClass.net hg17 tetNig1 tetNig1.net \
-tNewR=/cluster/bluearc/hg17/linSpecRep.notInTetraodon \
-qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInHuman -noAr
# 54.100u 31.890s 2:20.01 61.4% 0+0k 0+0io 197pf+0w
netFilter -minGap=10 tetNig1.net | hgLoadNet hg17 netTetNig1 stdin
# featureBits hg17 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.978%, netTetNig1 25.095%, both 0.778%, cover 79.53%,
# enrich 3.17x
# TWINSCAN 1.3 GENE PREDICTIONS (Done, 2004-Aug-26, heather)
cd /cluster/data/hg17/bed
mkdir twinscan
tarFile=hg17_TS13_pseudomasked.tar.gz
wget http://genes.cs.wustl.edu/predictions/human/NCBI35/hg17_TS13_pseudomasked.tar.gz
wget http://genes.cs.wustl.edu/predictions/human/NCBI35/md5sum.txt
# check file transferred correctly
grep gz md5sum.txt > gz.sum
md5sum $tarFile | diff - gz.sum
# extract
tar xvfz $tarFile
unset tarFile
# check that files unzipped and untarred correctly
# expect no differences
cd chr_gtf
grep gtf ../md5sum.txt > md5sum.txt
cd ../chr_ptx
grep ptx ../md5sum.txt > md5sum.txt
cd ../chr_tx
grep tx ../md5sum.txt > md5sum.txt
cd ..
md5sum chr_gtf/* > gtf.sum
diff gtf.sum chr_gtf/md5sum.txt
md5sum chr_ptx/* > ptx.sum
diff ptx.sum chr_ptx/md5sum.txt
md5sum chr_tx/* > tx.sum
diff tx.sum chr_tx/md5sum.txt
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
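# e.g. a header like ">chr1.1 some description" would become
# ">chr1.1.a" (id shown here is illustrative), matching the
# transcript_id values in the gtf.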
ldHgGene hg17 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
hgPepPred hg17 generic twinscanPep chr_ptx/chr*-fixed.fa
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-08, hartera)
# Replace with gzipped versions (DONE 2004-09-14 kate)
ssh kksilo
# zip chains and nets
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
cp all.chain tetNig1.chain
zip -j /cluster/data/hg17/zip/tetNig1.chain.zip tetNig1.chain
rm tetNig1.chain
zip -j /cluster/data/hg17/zip/tetNig1.net.zip tetNig1.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p $gp/vsTetNig1
cd $gp/vsTetNig1
mv /cluster/data/hg17/zip/tetNig1*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChrom
mkdir -p $gp/vsTetNig1/axtChrom
cp -p *.axt $gp/vsTetNig1/axtChrom
cd $gp/vsTetNig1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ TETRAODON (tetNig1) CLEANUP (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1
nice rm -rf raw &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# regulatory potential 2X track (WORKING - 2004-09-14 - Hiram)
ssh eieio
mkdir /cluster/store3/gs.18/build35/bed/regPotential2X
mkdir /cluster/store3/gs.18/build35/bed/regPotential3X
cd /cluster/data/hg17/bed
ln -s /cluster/store3/gs.18/build35/bed/regPotential2X .
ln -s /cluster/store3/gs.18/build35/bed/regPotential3X .
cd regPotential2X
    wget --timestamping 'http://www.bx.psu.edu/~james/stuff/rp_kit.tgz'
tar xvzf rp_kit.tgz
    # fixup the hmr_rp_score.sh and hm_rp_score.sh scripts to change
    # RP_DIR=. to read: RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
    # and to fix the usage of SHIFT and WINDOW; the following diff shows
    # the changes (a sed sketch follows the diff):
# 5c5
# < RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
# ---
# > RP_DIR=.
# 8,9c8,9
# < MAPPING=rp_kit/hm_5a_mapping.txt
# < MATRIX=rp_kit/hm_5a+3_scoreMatrix.dat
# ---
# > MAPPING=hm_5a_mapping.txt
# > MATRIX=hm_5a+3_scoreMatrix.dat
# 12c12
# < SHIFT=1
# ---
# > SHIFT=5
# 24,25c24,25
# < --shiftAmount $SHIFT \
# < --windowSize $WINDOW \
# ---
# > --shiftAmount 5 \
# > --windowSize 100 \
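    # A sketch of applying those edits to hm_rp_score.sh with sed (assumes
    # GNU sed's -i; hand-editing works just as well, and hmr_rp_score.sh
    # needs the same treatment, though its MAPPING/MATRIX file names differ):
    sed -i.orig \
        -e 's#^RP_DIR=.*#RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit#' \
        -e 's#^MAPPING=#MAPPING=rp_kit/#' \
        -e 's#^MATRIX=#MATRIX=rp_kit/#' \
        -e 's/^SHIFT=5/SHIFT=1/' \
        -e 's/--shiftAmount 5/--shiftAmount $SHIFT/' \
        -e 's/--windowSize 100/--windowSize $WINDOW/' \
        rp_kit/hm_rp_score.sh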
mkdir maf
for A in `(cd /cluster/data/hg17/bed/blastz.mm5/axtNet; ls chr*.axt)`
do
C=${A/.axt}
echo "/cluster/data/hg17/bed/blastz.mm5/axtNet/${A} -> maf/${C}.maf.gz"
axtToMaf /cluster/data/hg17/bed/blastz.mm5/axtNet/${A} \
/cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
stdout | gzip > maf/${C}.maf.gz
done
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
# a valid java runtime is only on hgwdev. This is a java procedure
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential2X
mkdir rp_scores
# WARNING - the following loop takes almost 12 hours !
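    # Note on the redirection below: hm_rp_score.sh takes input and output
    # file arguments; with /dev/stderr as the output file, the subshell's
    # "2>&1 >/dev/null" routes the scores into the pipe while the script's
    # ordinary stdout is discarded.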
for M in maf/chr*.maf.gz
do
C=${M/.maf.gz}
C=${C#maf/}
echo "$M -> rp_scores/$C.score.gz"
(zcat ${M} | ./rp_kit/hm_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
gzip > rp_scores/${C}.score.gz
done
# real 709m55.805s
# user 754m51.030s
# sys 20m11.000s
# Back to the file server to create the wiggle data
ssh eieio
cd /cluster/data/hg17/bed/regPotential2X
mkdir wigData dataLimits
for S in rp_scores/chr*.score.gz
do
C=${S/.score.gz}
C=${C#rp_scores/}
echo "$S -> wigData/$C.wig"
zcat $S | sort -n | \
wigAsciiToBinary -chrom=$C -dataSpan=1 \
-wibFile=wigData/$C stdin 2> dataLimits/$C.limits
done
# real 313m0.567s
# user 285m37.319s
# sys 23m8.301s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential2X/wigData
mkdir /gbdb/hg17/wib/regPotential2X
ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential2X
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential2X \
hg17 regPotential2X chr*.wig
# real 2m29.668s
# user 0m33.380s
# sys 0m8.200s
# regulatory potential 3X track (WORKING - 2004-09-14 - Hiram)
# Expects groundwork done above in the 2X track
# a valid java runtime is only on hgwdev. This is a java procedure
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential3X
ln -s ../regPotential2X/rp_kit/hmr_rp_score.sh .
mkdir rp_scores
# WARNING - the following loop takes almost 12 hours !
for M in maf/chr*.maf.gz
do
C=${M/.maf.gz}
C=${C#maf/}
echo "$M -> rp_scores/$C.score.gz"
    (zcat ${M} | ./hmr_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
gzip > rp_scores/${C}.score.gz
done
# real 613m8.230s
# user 623m7.110s
# sys 20m24.550s
# Back to the file server to create the wiggle data
ssh eieio
cd /cluster/data/hg17/bed/regPotential3X
mkdir wigData dataLimits
for S in rp_scores/chr*.score.gz
do
C=${S/.score.gz}
C=${C#rp_scores/}
echo "$S -> wigData/$C.wig"
zcat $S | sort -n | \
wigAsciiToBinary -chrom=$C -dataSpan=1 \
-wibFile=wigData/$C stdin 2> dataLimits/$C.limits
done
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential3X/wigData
mkdir /gbdb/hg17/wib/regPotential3X
ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential3X
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential3X \
hg17 regPotential3X chr*.wig
# real 1m45.568s
# user 0m32.740s
# sys 0m6.140s
# regulatory potential 5X track (DONE - 2005-09-19 - Daryl)
ssh kkstore02
mkdir -p /cluster/data/hg17/bed/regPotential5X/rp_scores
cd /cluster/data/hg17/bed/regPotential5X/rp_scores
wget -r -l 1 -nH http://www.bx.psu.edu/~james/rp/hg17panTro1mm5rn3canFam1/all_truncate.tar
tar xvf all_truncate.tar
cd /cluster/data/hg17/bed/regPotential5X
mkdir -p wigData dataLimits
cd wigData
## 8 minutes
for S in ../rp_scores/chr*.scores.truncated.gz
do
C=${S/.scores.truncated.gz}
C=${C#../rp_scores/}
echo "$S -> wigData/$C.wig"
zcat $S | wigEncode stdin $C.wig $C.wib 2> ../dataLimits/$C.limits
done
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential5X/wigData
mkdir -p /gbdb/hg17/wib/regPotential5X
chmod o+rx /gbdb/hg17/wib/regPotential5X
ln -s /cluster/data/hg17/bed/regPotential5X/wigData/*.wib /gbdb/hg17/wib/regPotential5X
chmod o+r /gbdb/hg17/wib/regPotential5X/ch*wib
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential5X hg17 regPotential5X chr*.wig
# 57.720u 9.960s 2:26.05 46.3% 0+0k 0+0io 213pf+0w
# SGP GENES (DONE 9/17/04 angie)
ssh eieio
mkdir /cluster/data/hg17/bed/sgp
cd /cluster/data/hg17/bed/sgp
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null sgpPep.fa
foreach f (chr*.prot)
perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
end
ssh hgwdev
cd /cluster/data/hg17/bed/sgp
ldHgGene -gtf -genePredExt hg17 sgpGene chr*.gtf
hgPepPred hg17 generic sgpPep sgpPep.fa
# SGP GENES (UPDATE 1/18/2006)
    # sgpPep table dropped, replaced by hgc-generated protein seq in browser
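    # e.g. (a sketch; the exact command was not recorded):
    # hgsql hg17 -e 'drop table sgpPep'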
# LIFTOVER RNAGENE FROM HG16 (09/29/04, acs)
cd /cluster/data/hg17/bed
mkdir rnaGene
cd rnaGene
liftOver -gff /cluster/data/hg16/bed/rnaGene/all.gff \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
rnaGeneLift.gff rnaGeneMiss.gff
# 7204 records passed, 16 failed
hgsql hg17 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg17 rnaGeneLift.gff
# BUILD BioCyc TABLES (DONE 10/1/04 Fan)
    # Create bioCycMapDesc table:
CREATE TABLE bioCycMapDesc (
mapID varchar(40) NOT NULL default '',
description varchar(255) NOT NULL default '',
KEY mapID (mapID)
) TYPE=MyISAM;
    # Create bioCycPathway table:
CREATE TABLE bioCycPathway (
kgID varchar(40) NOT NULL default '',
geneID varchar(40) NOT NULL default '',
mapID varchar(40) NOT NULL default '',
KEY kgID (kgID),
KEY geneID (geneID),
KEY mapID (mapID)
) TYPE=MyISAM;
    # Using data files sent by Peter Karp from SRI; per Peter's email of
    # 10/1/04, they don't have a more recent update, so the data files
    # received last year are used.
    # Save the BioCyc Pathway name and description table as names.txt.
    # Save the pathway data file as gene-pathway.dat.
    # Make sure there is no extra ^M at the end of the lines.
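    # e.g., one way to strip stray carriage returns (a sketch):
    tr -d '\r' < names.txt > names.clean && mv names.clean names.txt
    tr -d '\r' < gene-pathway.dat > gp.clean && mv gp.clean gene-pathway.dat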
    hgsql hg17 -e 'LOAD DATA local INFILE "names.txt" into table bioCycMapDesc'
    # Run hgBioCyc program to generate the file bioCycPathway.tab.
hgBioCyc gene-pathway.dat hg17
    # Load into hg17.
hgsql hg17 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway'
# MAKING FOLDUTR TABLES (DONE - 2004-10-4 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/rnaStruct
cd /cluster/data/hg17/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 37244 of 37244 jobs
# CPU time in finished jobs: 1036479s 17274.64m 287.91h 12.00d 0.033 y
# IO & Wait Time: 112286s 1871.44m 31.19h 1.30d 0.004 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest job: 3370s 56.17m 0.94h 0.04d
# Submission to last job: 4355s 72.58m 1.21h 0.05d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 29817 of 29817 jobs
# CPU time in finished jobs: 98143s 1635.72m 27.26h 1.14d 0.003 y
# IO & Wait Time: 105763s 1762.71m 29.38h 1.22d 0.003 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 2133s 35.55m 0.59h 0.02d
# Submission to last job: 2465s 41.08m 0.68h 0.03d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
####### BUILD RGD HUMAN QTL TRACKS (DONE 10/7/04 Fan) ##############
mkdir -p /cluster/store8/rgd/human041007
ln -s /cluster/store8/rgd/human041007 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff
# remove extra line feed character at the end of lines
rmLf human_QTL.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab
# create rgdQtlLink.tab
awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.
#### AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2004-10-11, hartera)
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
# Go to
#http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# and download the consensus and exemplar sequences to this directory
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
unzip HG-U133_Plus_2_consensus.zip
unzip HG-U133_Plus_2_exemplar.zip
cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa
perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \
U133Plus2_all.fa
# remove ";" from probe set names
perl -pi.bak -e "s/;//" U133Plus2_all.fa
# clean up
rm *.zip *.bak
mkdir -p /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
cp U133Plus2_all.fa /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
    # Set up cluster job to align consensus/exemplars to hg17
ssh kkr1u00
mkdir -p /iscratch/i/affy
mv /cluster/data/hg17/bed/affyU133Plus2.2004-10-11/U133Plus2_all.fa \
/iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push .....
# para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 24533s 408.88m 6.81h 0.28d 0.001 y
# IO & Wait Time: 2180s 36.34m 0.61h 0.03d 0.000 y
# Average job time: 70s 1.17m 0.02h 0.00d
# Longest job: 751s 12.52m 0.21h 0.01d
# Submission to last job: 2425s 40.42m 0.67h 0.03d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
    # use filter parameters for these sequences: only use alignments that
    # cover 30% of the sequence and have at least 95% identity in the
    # aligned region. minAli=0.97 was too high; use a low minCover since
    # these sequences contain many Ns
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
# load into the database
ssh hgwdev
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
hgLoadPsl hg17 affyU133Plus2.psl
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
hgLoadSeq -abbr=U133+2: hg17 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab
# Added knownToU133Plus2 track (2004-10-14) - see GeneSorter section
#### MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
# First, get ranges of target coverage:
ssh eieio
mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
cat /cluster/data/hg17/bed/var_multiz.2004-08-12/maf.09-12-04/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.mafRanges.bed
# Get pairwise coverage as well.
ssh kolossus
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
cat /cluster/bluearc/hg17/multiz8way/rn3/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.rn3.mafRanges.bed
cat /cluster/bluearc/hg17/multiz8way/mm5/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.mm5.mafRanges.bed
cat /cluster/bluearc/hg17/multiz8way/galGal2/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.galGal2.mafRanges.bed
cat /cluster/bluearc/hg17/multiz8way/fr1/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.fr1.mafRanges.bed
ssh hgwdev
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
# To make subsequent intersections a bit quicker, output a bed with
# duplicate/overlapping ranges collapsed:
nice featureBits hg17 hg17.mafRanges.bed \
-bed=hg17.mafRangesCollapsed.bed
#1147548420 bases of 2866216770 (40.037%) in intersection
foreach other (mm5 rn3 galGal2 fr1)
nice featureBits hg17 hg17.$other.mafRanges.bed \
-bed=hg17.${other}.mafRangesCollapsed.bed
end
#1013348528 bases of 2866216770 (35.355%) in intersection
#975533772 bases of 2866216770 (34.036%) in intersection
#101623034 bases of 2866216770 (3.546%) in intersection
#46737824 bases of 2866216770 (1.631%) in intersection
# mafCoverage barfs currently, so pass on this for now:
#cat ../maf.09-12-04/*.maf \
#| nice mafCoverage -count=2 hg17 stdin > hg17.mafCoverage
# Intersect maf target coverage with gene regions --
# use Adam's knownGene region files:
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesCds.bed \
hg17.mafRangesCollapsed.bed \
-bed=hg17.mafCds.bed
#knownGenesCds.bed 1.166%, hg17.mafRangesCollapsed.bed 40.037%, both 1.111%, cover 95.36%, enrich 2.38x
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr3.bed \
hg17.mafRangesCollapsed.bed \
-bed=hg17.mafUtr3.bed
#knownGenesUtr3.bed 0.918%, hg17.mafRangesCollapsed.bed 40.037%, both 0.662%, cover 72.18%, enrich 1.80x
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr5.bed \
hg17.mafRangesCollapsed.bed \
-bed=hg17.mafUtr5.bed
#knownGenesUtr5.bed 0.266%, hg17.mafRangesCollapsed.bed 40.037%, both 0.198%, cover 74.42%, enrich 1.86x
# Intersect pairwise target coverages with gene regions:
foreach other (mm5 rn3 galGal2 fr1)
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesCds.bed \
hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Cds.bed
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr3.bed \
hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr3.bed
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr5.bed \
hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr5.bed
end
#knownGenesCds.bed 1.166%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 1.093%, cover 93.74%, enrich 2.65x
#knownGenesUtr3.bed 0.918%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.618%, cover 67.37%, enrich 1.91x
#knownGenesUtr5.bed 0.266%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.186%, cover 69.81%, enrich 1.97x
#knownGenesCds.bed 1.166%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 1.071%, cover 91.85%, enrich 2.70x
#knownGenesUtr3.bed 0.918%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.597%, cover 65.09%, enrich 1.91x
#knownGenesUtr5.bed 0.266%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.179%, cover 67.33%, enrich 1.98x
#knownGenesCds.bed 1.166%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.779%, cover 66.84%, enrich 18.85x
#knownGenesUtr3.bed 0.918%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.194%, cover 21.12%, enrich 5.96x
#knownGenesUtr5.bed 0.266%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.056%, cover 21.03%, enrich 5.93x
#knownGenesCds.bed 1.166%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.714%, cover 61.26%, enrich 37.57x
#knownGenesUtr3.bed 0.918%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.073%, cover 7.92%, enrich 4.86x
#knownGenesUtr5.bed 0.266%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.039%, cover 14.82%, enrich 9.09x
# ALTERNATIVE CPG ISLANDS (DONE 10/14/04 angie)
ssh eieio
nice tcsh
mkdir /cluster/data/hg17/bed/cpgIslandAlt
cd /cluster/data/hg17/bed/cpgIslandAlt
# Try cpg_ash (WUSTL program modified to not chop islands in half before
# scoring) with default params:
cp /dev/null cpg_ash.default.cpg
foreach f (../../?{,?}/chr*.fa.masked)
echo running on $f:t:r:r
~angie/cb/hg3rdParty/cpgIslands/cpg_ash.exe $f >> cpg_ash.default.cpg
end
awk -f ../cpgIsland/filter.awk cpg_ash.default.cpg > cpgIslandAlt.bed
# Run Andy Law's script on masked seq:
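    # (The perl step below shifts the script's 1-based starts to 0-based BED
    # starts and derives the %CpG and %GC columns from the raw counts.)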
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../?{,?}/chr*.fa.masked)
set chr = $f:t:r:r
echo running on $chr
/cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
    # Compare enrichment for knownGene upstream regions -- an uphill battle
    # for programs that come closer to meeting the stated length, GC, and
    # O/E params!
ssh hgwdev
nice featureBits hg17 -enrichment knownGene:upstream:1000 \
/cluster/data/hg17/bed/cpgIsland/cpgIsland.bed
#knownGene:upstream:1000 0.857%, cpgIsland.bed 0.741%, both 0.166%, cover 19.37%, enrich 26.13x
nice featureBits hg17 -enrichment knownGene:upstream:1000 \
/cluster/data/hg17/bed/cpgIslandAlt/cpgIslandAlt.bed
#knownGene:upstream:1000 0.857%, cpgIslandAlt.bed 1.075%, both 0.200%, cover 23.38%, enrich 21.76x
nice featureBits hg17 -enrichment knownGene:upstream:1000 \
/cluster/data/hg17/bed/cpgIslandAlt/cpgIslandGgfAndyMasked.bed
#knownGene:upstream:1000 0.857%, cpgIslandGgfAndyMasked.bed 1.964%, both 0.292%, cover 34.06%, enrich 17.34x
cd /cluster/data/hg17/bed/cpgIslandAlt
sed -e 's/cpgIslandExt/cpgIslandAlt/g' \
~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandAlt.sql
hgLoadBed -noBin -tab -sqlTable=cpgIslandAlt.sql \
hg17 cpgIslandAlt cpgIslandAlt.bed
#Loaded 29998 elements of size 10
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
#Loaded 80555 elements of size 10
# Quick length stats:
hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandExt'
#| 201 | 764.1913 | 40058 |
hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandAlt'
#| 200 | 1026.9194 | 32440 |
hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandGgfAndyMasked'
#| 200 | 698.8257 | 100308 |
# 1/26/05: Make better island names in cpgIslandGgfAndyMasked,
# for Dave Burt's cross-species island comparisons.
ssh eieio
cd /cluster/data/hg17/bed/cpgIslandAlt
mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig
perl -wpe '@w=split("\t"); $w[3] = "hg17.$w[0]." . ($w[1]+1) . ".$w[2]"; \
$_ = join("\t", @w);' \
cpgIslandGgfAndyMasked.bed.orig \
> cpgIslandGgfAndyMasked.bed
# Now liftOver islands from mm5, rn3, galGal2:
ssh kolossus
cd /cluster/data/hg17/bed/cpgIslandAlt
foreach match (50 95)
liftOver /cluster/data/mm5/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
/cluster/data/mm5/bed/bedOver/mm5Tohg17.chain -minMatch=0.$match \
cpgIslandGAMFromMm5_$match.bed cpgIslandGAMFromMm5_$match.unmapped
liftOver /cluster/data/rn3/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
/cluster/data/rn3/bed/bedOver/rn3ToHg17.over.chain -minMatch=0.$match \
cpgIslandGAMFromRn3_$match.bed cpgIslandGAMFromRn3_$match.unmapped
liftOver /cluster/data/galGal2/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
/cluster/data/galGal2/bed/bedOver/galGal2ToHg17.over.chain -minMatch=0.$match \
cpgIslandGAMFromGalGal2_$match.bed cpgIslandGAMFromGalGal2_$match.unmapped
end
    # Load up the renamed islands as well:
ssh hgwdev
cd /cluster/data/hg17/bed/cpgIslandAlt
hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
# MAKE UNIGENE/SAGE TRACK (DONE - 2004-10-15 Fan)
# First get SAGE data and determine which version of UniGene to use first
ssh hgwdev
cd ~/kent/src/hg/sage
make
# XXX = uniGene build for which SAGE was built -- not necessarily current!
# Figure out the build number by peeking at this file:
wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
# UniGene Build #44 Arabidopsis thaliana
# UniGene Build #61 Bos taurus
# UniGene Build #16 Caenorhabditis elegans
# UniGene Build #171 Homo sapiens
# UniGene Build #19 Medicago truncatula
# UniGene Build #138 Mus musculus
# UniGene Build #52 Oryza sativa
# UniGene Build #14 Pinus taeda
# UniGene Build #132 Rattus norvegicus
# UniGene Build #27 Sus scrofa
# UniGene Build #38 Triticum aestivum
# UniGene Build #11 Vitis vinifera
# UniGene Build #41 Zea mays
# From above info, set Version 171 for hg17
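    # e.g., the human build number can be pulled out directly (a sketch):
    wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null \
        | awk '/Homo sapiens/ {print $3}' | tr -d '#'
    # 171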
ls /projects/cc/hg/sugnet/uniGene
# set Version = XXX
set Version=171
mkdir /projects/cc/hg/sugnet/sage/sage.$Version
cd /projects/cc/hg/sugnet/sage/sage.$Version
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/Hs
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/readme.txt
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/extr
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/info
# That downloaded about 1 GB of data
cd map/Hs/NlaIII
unzip -j SAGEmap_tag_ug-rel.zip
cd ../../../extr/
../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
./SAGE_*
../../scripts/countsPerExp.pl expCounts.tab expList.tab
cd ../map/Hs/NlaIII/
cat << '_EOF_' > /tmp/t.pl
#!/usr/local/bin/perl
while (<>) {
chomp($_);
@p = split(/\t/, $_);
print "$p[2]\t$p[3]\t$p[0]\n";
}
'_EOF_'
chmod +x /tmp/t.pl
cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \
> SAGEmap_ug_tag-rel_Hs
cd ../../../extr
createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
tagExpArrays.tab sageSummary.sage
# Create the uniGene alignments
# /cluster/data/hg17/uniGene/hg17.uniGene.lifted.pslReps.psl
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
# substitute XXX -> the uniGene version used by SAGE.
# set Version = XXX
    set Version = 171   # (bash: export Version=171)
cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
gunzip Hs.seq.uniq.gz Hs.data.gz
../countSeqsInCluster.pl Hs.data counts.tab
../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
# Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
ssh kkstore
set Version = 171 # same as above
mkdir -p /iscratch/i/uniGene.$Version
cp -p \
/projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
/iscratch/i/uniGene.$Version
ssh kkr1u00
iSync
ssh kk
set Version = 171 # same as above
mkdir -p /cluster/data/hg17/bed/uniGene.$Version
cd /cluster/data/hg17/bed/uniGene.$Version
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > allctg.lst
ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
    /cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 allctg.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 35994s 599.91m 10.00h 0.42d 0.001 y
# IO & Wait Time: 1812s 30.19m 0.50h 0.02d 0.000 y
# Average job time: 99s 1.66m 0.03h 0.00d
# Longest job: 1497s 24.95m 0.42h 0.02d
# Submission to last job: 1551s 25.85m 0.43h 0.02d
ssh eieio
set Version = 171 # same as above
cd /cluster/data/hg17/bed/uniGene.$Version
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
| pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg17.uniGene.lifted.pslReps.psl /dev/null
# Processed 141416 alignments
# use hg17.uniGene.lifted.pslReps.psl for building UNIGENE/SAGE track.
ssh hgwdev
set Version = 171
cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
addAveMedScoreToPsls \
/cluster/data/hg17/bed/uniGene.$Version/hg17.uniGene.lifted.pslReps.psl \
sageSummary.sage uniGene.wscores.bed
hgLoadBed hg17 uniGene_2 uniGene.wscores.bed
hgsql hg17 < ~kent/src/hg/lib/sage.sql
echo "load data local infile 'sageSummary.sage' into table sage" \
| hgsql hg17
cd ../info
../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab
hgsql hg17 < ~/kent/src/hg/lib/sageExp.sql
echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg17
# update ~/kent/src/hg/makeDb/trackDb/human/hg17/uniGene_2.html
# with current uniGene date.
# CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/genomicSuperDups
cd /cluster/data/hg17/bed/genomicSuperDups
# A tar file containing files for both hg16 and hg17 was downloaded into
# /cluster/data/hg16/bed/genomicSuperDups; move over the hg17 part.
mv /cluster/data/hg16/bed/genomicSuperDups/bd35 .
cd bd35
# A note from Xinwei She about the contents:
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
# use tail +2 to skip past the header line:
# actually, celeraDupPositive.tab.gz has one extra bogus line so +3 for it:
zcat celeraDupPositive.tab.gz | tail +3 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
hg17 celeraDupPositive stdin
zcat genomicSuperDups.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
hg17 genomicSuperDups stdin
# clean up
rm bed.tab
# ECGENE TRACK (DONE, 2004-10-29, hartera)
ssh eieio
mkdir -p /cluster/data/hg17/bed/ECgene.2004-10-27
ln -s /cluster/data/hg17/bed/ECgene.2004-10-27 \
/cluster/data/hg17/bed/ECgene
cd /cluster/data/hg17/bed/ECgene
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_gene.txt.gz"
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_pep.txt.gz"
gunzip *.gz
# load database
ssh hgwdev
cd /cluster/data/hg17/bed/ECgene
ldHgGene -predTab hg17 ECgene v1.2_hg17_low_gene.txt
# 646778 gene predictions
hgPepPred hg17 tab ECgenePep v1.2_hg17_low_pep.txt
rm *.tab
nice gzip *.txt
# LOAD ENSEMBL GENES (DONE, 2004-11-19, hartera)
mkdir /cluster/data/hg17/bed/ensembl
cd /cluster/data/hg17/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the ouput. choose gzip compression. hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v ^DR52 \
| grep -v ^DR53 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
| sed -e 's/chrMT_NC_001807/chrM/' \
> ensGene.gtf
ssh hgwdev
    cd /cluster/data/hg17/bed/ensembl
/cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33666 transcripts in 696579 lines in 1 files
# 33666 groups 25 seqs 1 sources 4 feature types
# 33666 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.txt
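    # e.g. (a sketch): tail +2 ensGtp.txt > ensGtp.tmp && mv ensGtp.tmp ensGtp.txt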
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg17
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg17 ensembl ensemblPep.fa
# UPDATE KNOWN GENES TABLES (DONE 11/22/04 Fan)
# Make sure the protein databases (sp041115 and proteins041115) were built first.
hgsql hg17 -e "create database kgHg17B"
mkdir -p /cluster/store8/kg/kgHg17B
cd /cluster/store6/kgDB/bed
ln -s /cluster/store8/kg/kgHg17B kgHg17B
cd kgHg17B
~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115
# Found gbGetSeqs changed the format of mrna.fa output file
    # (extra version number). Updated KGprocess.sh and manually
    # re-ran the following:
grep "^>" mrna.fa |awk '{print $1}' > mrna.lis
kgGetPep 041115 > mrnaPep.fa
hgKgMrna kgH17BTemp mrna.fa mrna.ra tight_mrna.psl ll/loc2ref \
mrnaPep.fa ll/mim2loc ${PDB} > kgHg17BKgMrna.out 2> kgHg17BKgMrna.err
    # then run KGprocess.sh again to continue processing.
~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115
hgsql hg17 -e "select * from chromInfo" > chromInfo.tab
getDbTabDef hg17 chromInfo >chromInfo.sql
hgsql kgHg17B <chromInfo.sql
hgsql kgHg17B -e 'load data local infile "chromInfo.tab" into table chromInfo ignore 1 lines'
    # Build kgProtMap table. This table is needed by the Proteome Browser and
    # it should be built before all the KG tables are moved from kgHg17B to hg17.
~/src/hg/protein/kgProtMap.sh kgHg17B hg17 041115
# Completed: 7923 of 7923 jobs
# CPU time in finished jobs: 2502923s 41715.39m 695.26h 28.97d 0.079 y
# IO & Wait Time: 175358s 2922.63m 48.71h 2.03d 0.006 y
# Average job time: 338s 5.63m 0.09h 0.00d
# Longest job: 2403s 40.05m 0.67h 0.03d
# Submission to last job: 7164s 119.40m 1.99h 0.08d
# The script ran successfully with the last message:
# Mon Nov 22 17:11:59 PST 2004 DONE =========================
# Create database hg17Kg1 to store the old KG tables, just in case.
hgsql hg17
create database hg17Kg1;
alter table cgapAlias rename as hg17Kg1.cgapAlias;
alter table cgapBiocDesc rename as hg17Kg1.cgapBiocDesc;
alter table cgapBiocPathway rename as hg17Kg1.cgapBiocPathway;
alter table dupSpMrna rename as hg17Kg1.dupSpMrna;
alter table keggMapDesc rename as hg17Kg1.keggMapDesc;
alter table keggPathway rename as hg17Kg1.keggPathway;
alter table kgAlias rename as hg17Kg1.kgAlias;
alter table kgProtAlias rename as hg17Kg1.kgProtAlias;
alter table kgXref rename as hg17Kg1.kgXref;
alter table knownGene rename as hg17Kg1.knownGene;
alter table knownGeneLink rename as hg17Kg1.knownGeneLink;
alter table knownGeneMrna rename as hg17Kg1.knownGeneMrna;
alter table knownGenePep rename as hg17Kg1.knownGenePep;
alter table mrnaRefseq rename as hg17Kg1.mrnaRefseq;
alter table spMrna rename as hg17Kg1.spMrna;
alter table kgProtMap rename as hg17Kg1.kgProtMap;
# After initial inspection of tables in kgHg17B, do the following
# from mySql prompt:
alter table kgHg17B.cgapAlias rename as hg17.cgapAlias;
alter table kgHg17B.cgapBiocDesc rename as hg17.cgapBiocDesc;
alter table kgHg17B.cgapBiocPathway rename as hg17.cgapBiocPathway;
alter table kgHg17B.dupSpMrna rename as hg17.dupSpMrna;
alter table kgHg17B.keggMapDesc rename as hg17.keggMapDesc;
alter table kgHg17B.keggPathway rename as hg17.keggPathway;
alter table kgHg17B.kgAlias rename as hg17.kgAlias;
alter table kgHg17B.kgProtAlias rename as hg17.kgProtAlias;
alter table kgHg17B.kgXref rename as hg17.kgXref;
alter table kgHg17B.knownGene rename as hg17.knownGene;
alter table kgHg17B.knownGeneLink rename as hg17.knownGeneLink;
alter table kgHg17B.knownGeneMrna rename as hg17.knownGeneMrna;
alter table kgHg17B.knownGenePep rename as hg17.knownGenePep;
alter table kgHg17B.mrnaRefseq rename as hg17.mrnaRefseq;
alter table kgHg17B.spMrna rename as hg17.spMrna;
alter table kgHg17B.kgProtMap rename as hg17.kgProtMap;
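    # For the record, the same renames can be generated with a small loop
    # (a sketch; it just echoes statements to paste into the mySql prompt):
    for t in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna keggMapDesc \
        keggPathway kgAlias kgProtAlias kgXref knownGene knownGeneLink \
        knownGeneMrna knownGenePep mrnaRefseq spMrna kgProtMap
    do
        echo "alter table kgHg17B.$t rename as hg17.$t;"
    done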
# Old hg17.knownGene has 43,401 entries and the new one has 44,338 entries.
    # Now:
featureBits hg17 knownGene
# 65728598 bases of 2866216770 (2.293%) in intersection
# Previously, was:
# 63983072 bases of 2866216770 (2.232%) in intersection
# Connect to genome-testdb and use hgcentraltest DB.
# Update the entry in gdbPdb table:
delete from gdbPdb where genomeDb='hg17';
insert into gdbPdb values('hg17', 'proteins041115');
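    # Equivalently, from a shell (a sketch; assumes hgsql can reach the
    # hgcentraltest database on genome-testdb):
    # hgsql -h genome-testdb hgcentraltest -e \
    #     "delete from gdbPdb where genomeDb='hg17'; \
    #      insert into gdbPdb values('hg17', 'proteins041115');"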
# UPDATE KGSPALIAS TABLE TO BE USED BY PB (Done 12/20/04)
cd /cluster/data/hg17/bed/pb
mkdir new
cd new
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
    gzip hg17.kgSpAlias.tab
# Create hg17GeneList.html (to be used by Google).
# This step was done 12/08/04.
cd /cluster/data/hg17/bed
mkdir geneList
cd geneList
wget -O hg17GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=hg17"
cp -p hg17GeneList.html /usr/local/apache/htdocs/goldenPath
# Check this html file into CVS.
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
    # This depends on the go and uniProt databases as well as
    # the kgAlias and kgProtAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17F
mkdir index
cd index
hgKgGetText hg17 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ix /gbdb/hg17/knownGene.ix
ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ixx /gbdb/hg17/knownGene.ixx
# UPDATE TABLES NEEDED BY hgGene (DONE 11/30/04 Fan)
# UPDATE BioCyc TABLES
hgsql hg17 -e 'delete from bioCycPathway'
hgsql hg17 -e 'delete from bioCycMapDesc'
    # Using data files sent by Peter Karp from SRI; per Peter's email of
    # 10/1/04, they don't have a more recent update, so the data files
    # received last year are used.
    # Save the BioCyc Pathway name and description table as pathway-names.dat.
    # Save the pathway data file as gene-pathway.dat.
    # Make sure there is no extra ^M at the end of the lines.
# Run hgBioCyc program to generate the file bioCycPathway.tab.
hgBioCyc gene-pathway.dat hg17
    # Load results into hg17 (from the hgsql hg17 prompt):
LOAD DATA local INFILE 'pathway-names.dat' into table bioCycMapDesc;
LOAD DATA local INFILE 'bioCycPathway.tab' into table bioCycPathway;
    # REBUILD FOLDUTR TABLES (DONE - 2004-11-30 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mv rnaStruct rnaStruct.2004-10-04
mkdir -p /cluster/data/hg17/bed/rnaStruct.2004-11-30
ln -s rnaStruct.2004-11-30 rnaStruct
cd /cluster/data/hg17/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 38115 of 38115 jobs
# CPU time in finished jobs: 1101680s 18361.33m 306.02h 12.75d 0.035 y
# IO & Wait Time: 100275s 1671.25m 27.85h 1.16d 0.003 y
# Average job time: 32s 0.53m 0.01h 0.00d
# Longest job: 3645s 60.75m 1.01h 0.04d
# Submission to last job: 7007s 116.78m 1.95h 0.08d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 30524 of 30524 jobs
# CPU time in finished jobs: 116647s 1944.12m 32.40h 1.35d 0.004 y
# IO & Wait Time: 80477s 1341.28m 22.35h 0.93d 0.003 y
# Average job time: 6s 0.11m 0.00h 0.00d
# Longest job: 2449s 40.82m 0.68h 0.03d
# Submission to last job: 3386s 56.43m 0.94h 0.04d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
# Parsed 30525 files
    # Warning: load of foldUtr5 did not go as planned: 30525 record(s), 2 row(s) skipped, 0 warning(s) loading ./foldUtr5.tab
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Parsed 38115 files
    # Warning: load of foldUtr3 did not go as planned: 38115 record(s), 2 row(s) skipped, 0 warning(s) loading ./foldUtr3.tab
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2004-11-29 - Fan)
# This should be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2004-11-24
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2004-11-24 \
/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
/cluster/bluearc/hg17/blastp
# Had to pick up a new blastall binary (2004-06-15)
# Our old one would no longer run on our systems that have
# updated Linux versions
mkdir /cluster/bluearc/blast2210
cd /cluster/bluearc/blast2210
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/blast-2.2.10-ia32-linux.tar.gz
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/ChangeLog.txt
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/ReleaseNotes.txt
tar xvzf blast-2.2.10-ia32-linux.tar.gz
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
    # Completed: 7748 of 7748 jobs
    # CPU time in finished jobs: 191136s 3185.59m 53.09h 2.21d 0.006 y
    # IO & Wait Time: 66703s 1111.72m 18.53h 0.77d 0.002 y
    # Average job time: 33s 0.55m 0.01h 0.00d
    # Longest job: 370s 6.17m 0.10h 0.00d
    # Submission to last job: 747s 12.45m 0.21h 0.01d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 12810133 rows
# 306.480u 54.190s 26:35.50 22.6% 0+0k 0+0io 206pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
    # row count changed to 37611
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 37611
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 36302
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 36373
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 36373 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 36373000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 37299
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 37299 unique elements in affyUclaNorm
# 8212.320u 217.310s 2:38:07.84 88.8% 0+0k 0+0io 267pf+0w
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 18791
    # the hgFixed.gnfHumanU95Exps argument is unused, so the table need not exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 17682 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 17682000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 10273 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 40015
# Make sure that GO database is up to date.
# UPDATE GO DATABASE (DONE 11/24/04 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20041124
cd /cluster/store1/geneOntology/20041124
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200411-assocdb-data.gz
hgsql mysql <<end
create database go041124;
end
zcat go_*data.gz | hgsql go041124
    wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
zcat gene_association.goa_uniprot.gz | hgGoAssociation go041124 goaPart stdin
# Passed 4502016 of 5291097 of 5291097, 85.09%
# Ask sys-admin to switch the database pointer go to point to go041124.
cd /cluster/data/hg17/bed/geneSorter
    # (Stale notes from 2004-07-15 - Hiram: at that time the ensGene table
    # was not yet available and knownToEnsembl had not been created; ensGene
    # was loaded above on 2004-11-19.)
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp table (Heather did this table, Nov 29, 2004)
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 168336
# approx. 5 minutes running time
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# There was no /cluster/bluearc/ce1/blastp, so get the latest wormpep from Sanger
cd /cluster/data/ce1/bed/blastp
mkdir old
cp -p * old
wget --timestamp ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep134/wormpep134
mv wormpep134 wormPep.faa
formatdb -i wormPep.faa -t wormPep -n wormPep
#copy them to /cluster/bluearc
ssh kkr1u00
mkdir -p /cluster/bluearc/ce1/blastp
cp /cluster/data/ce1/bed/blastp/wormPep.p?? /cluster/bluearc/ce1/blastp
# The blast jobs below can be run on the kk or kk9 clusters
# Create the ceBlastTab
ssh kk9
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Only takes 10 minutes on an idle cluster
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 33235s 553.91m 9.23h 0.38d 0.001 y
# IO & Wait Time: 19891s 331.52m 5.53h 0.23d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 68s 1.13m 0.02h 0.00d
# Submission to last job: 653s 10.88m 0.18h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# row count changed to 28252
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm5.doc for procedure
# the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 141842s 2364.04m 39.40h 1.64d 0.004 y
# IO & Wait Time: 52251s 870.85m 14.51h 0.60d 0.002 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest job: 254s 4.23m 0.07h 0.00d
# Submission to last job: 540s 9.00m 0.15h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# row count changed to 37549
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 31786s 529.77m 8.83h 0.37d 0.001 y
# IO & Wait Time: 25795s 429.91m 7.17h 0.30d 0.001 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest job: 75s 1.25m 0.02h 0.00d
# Submission to last job: 157s 2.62m 0.04h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
#Loading database with 26133 rows
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# NOTE: data used to reside in /cluster/bluearc/dr1/blastp
mv /cluster/bluearc/dr1/blastp /cluster/bluearc/danRer1/blastp
# the directory: /cluster/bluearc/danRer1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/danRer1
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/danRer1/blastp/ensembl \
-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 102324s 1705.39m 28.42h 1.18d 0.003 y
# IO & Wait Time: 47203s 786.72m 13.11h 0.55d 0.001 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest job: 230s 3.83m 0.06h 0.00d
# Submission to last job: 427s 7.12m 0.12h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Loading database with 33852 rows
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 20983s 349.72m 5.83h 0.24d 0.001 y
# IO & Wait Time: 25513s 425.21m 7.09h 0.30d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest job: 37s 0.62m 0.01h 0.00d
# Submission to last job: 106s 1.77m 0.03h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 18489 rows
# Make Drosophila melanagaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 83377s 1389.62m 23.16h 0.97d 0.003 y
# IO & Wait Time: 39913s 665.21m 11.09h 0.46d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 167s 2.78m 0.05h 0.00d
# Submission to last job: 365s 6.08m 0.10h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 30067 rows
# update knownToHInv table
# Verified that there is now a new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 33236
#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-30 - Fan)
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select gene, transcript, protein,
#         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". Choose gzip compression. Hit export.
# Save as ensXref.txt
sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab
hgsql hg17 -e "drop table ensemblXref3"
hgsql hg17 < ~/src/hg/lib/ensemblXref3.sql
hgsql hg17 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
#### BUILD SUPERFAMILY RELATED TABLES (DONE - 2004-11-30 - Fan)
# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store8/superfamily/041128
ln -s /cluster/store8/superfamily/041128 /cluster/data/superfamily/041128
cd /cluster/data/superfamily/041128
# ftp over the following two files:
ass_28-Nov-2004.tab.gz
supfam_28-Nov-2004.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam041128"
hgsql superfam041128 < supfam_28-Nov-2004.sql
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam041128.
hgsql superfam041128 -e "create index id on des(id);"
hgsql superfam041128 < ~/src/hg/lib/sfAssign.sql
    hgsql superfam041128 -e 'load data local infile "ass_28-Nov-2004.tab" into table superfam041128.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/041128
hgsql hg17 -e 'load data local infile "ass_28-Nov-2004.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
hgsql superfam041128 -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table hg17.sfDes ignore 1 lines;'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab \
| hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper
### HG17 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2004-12-01 - Fan)
# These are instructions for rebuilding tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This update is based on proteins DBs dated 041115.
# Create the working directory
ssh hgwdev
mv /cluster/data/hg17/bed/pb /cluster/data/hg17/bed/pb.2004-06-11
mkdir /cluster/data/hg17/bed/pb.2004-12-01
cd /cluster/data/hg17/bed
ln -s /cluster/data/hg17/bed/pb.2004-12-01 pb
# Move the existing PB tables by:
hgsql hg17
create database hg17Sav2;
alter table hg17.pepCCntDist rename as hg17Sav2.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav2.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav2.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav2.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav2.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav2.pepMwAa;
alter table hg17.pepPi rename as hg17Sav2.pepPi;
alter table hg17.pepPiDist rename as hg17Sav2.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav2.pepResDist;
alter table hg17.pbAaDistA rename as hg17Sav2.pbAaDistA;
alter table hg17.pbAaDistC rename as hg17Sav2.pbAaDistC;
alter table hg17.pbAaDistD rename as hg17Sav2.pbAaDistD;
alter table hg17.pbAaDistE rename as hg17Sav2.pbAaDistE;
alter table hg17.pbAaDistF rename as hg17Sav2.pbAaDistF;
alter table hg17.pbAaDistG rename as hg17Sav2.pbAaDistG;
alter table hg17.pbAaDistH rename as hg17Sav2.pbAaDistH;
alter table hg17.pbAaDistI rename as hg17Sav2.pbAaDistI;
alter table hg17.pbAaDistK rename as hg17Sav2.pbAaDistK;
alter table hg17.pbAaDistL rename as hg17Sav2.pbAaDistL;
alter table hg17.pbAaDistM rename as hg17Sav2.pbAaDistM;
alter table hg17.pbAaDistN rename as hg17Sav2.pbAaDistN;
alter table hg17.pbAaDistP rename as hg17Sav2.pbAaDistP;
alter table hg17.pbAaDistQ rename as hg17Sav2.pbAaDistQ;
alter table hg17.pbAaDistR rename as hg17Sav2.pbAaDistR;
alter table hg17.pbAaDistS rename as hg17Sav2.pbAaDistS;
alter table hg17.pbAaDistT rename as hg17Sav2.pbAaDistT;
alter table hg17.pbAaDistV rename as hg17Sav2.pbAaDistV;
alter table hg17.pbAaDistW rename as hg17Sav2.pbAaDistW;
alter table hg17.pbAaDistY rename as hg17Sav2.pbAaDistY;
alter table hg17.pbAnomLimit rename as hg17Sav2.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav2.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav2.pbStamp;
quit
# Define pep* tables in hg17 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg17 < pepAll.sql
# Build the pepMwAa table
hgsql proteins041115 -e "select info.acc, molWeight, aaSize from sp041115.info, sp041115.accToTaxon where
accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg17 <<end
load data local infile "pepMwAa.tab" into table hg17.pepMwAa ignore 1 lines;
end
# Build the pepPi table
hgsql proteins041115 -e "select info.acc from sp041115.info, sp041115.accToTaxon where
accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
pbCalPi protAcc.lis sp041115 pepPi.tab
hgsql hg17 <<end
load data local infile "pepPi.tab" into table hg17.pepPi;
end
# Calculate and load pep distributions
pbCalDist sp041115 proteins041115 9606 hg17 >pbCalDist.out
cat pbCalDist.out
wc pbCalDist.out
hgsql hg17
load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg17.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd 041115 9606 hg17
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg17 < ~/src/hg/lib/pbStamp.sql
hgsql hg17Sav2 -e 'select * from pbStamp' > pbStamp.tab
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp ignore 1 lines;'
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
# Perform preliminary review of Proteome Browser for hg17, then
# notify QA for formal review.
#### Blat knownGene proteins to determine exons (braney DONE 12/11/04)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2004-12-08
rm blat.hg17KG
    ln -s blat.hg17KG.2004-12-08 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3010 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 134130 of 134136 jobs
# Crashed: 6 jobs
# CPU time in finished jobs: 29801114s 496685.23m 8278.09h 344.92d 0.945 y
# IO & Wait Time: 1983513s 33058.55m 550.98h 22.96d 0.063 y
# Average job time: 237s 3.95m 0.07h 0.00d
# Longest job: 63306s 1055.10m 17.59h 0.73d
# Submission to last job: 169384s 2823.07m 47.05h 1.96d
# did 6 crashed jobs on small cluster
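    # Job-count arithmetic as a sanity check: gensub2 crosses human.lst
    # with kg.lst, so assuming one nib per hg17 chromosome (46 of them),
    # 134136 jobs implies faSplit wrote 134136 / 46 = 2916 kgfa files --
    # faSplit treats the requested 3010 as an approximate target.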
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
kgName hg17 hg17KG.psl blastKGRef01
cut -f 10 hg17KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg17KG.fa
hgPepPred hg17 generic blastKGPep01 hg17KG.fa
ssh hgwdev
cd /cluster/data/hg17/bed/blat.hg17KG
hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef01" | hgsql hg17
echo "load data local infile 'blastKGRef01' into table blastKGRef01" | hgsql hg17
#### TIGR GENE INDEX (DONE 2004-12-04 Fan)
mkdir -p /cluster/data/hg17/bed/tigr
cd /cluster/data/hg17/bed/tigr
    wget --timestamping ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_build35.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
foreach o (mouse cow human pig rat)
echo $o
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ssh hgwdev
cd /cluster/data/hg17/bed/tigr
hgsql hg17 -e "drop table tigrGeneIndex"
hgsql hg17 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
echo Processing $f ...
/cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg17 tigrGeneIndex $f
hgsql hg17 -e "select count(*) from tigrGeneIndex"
end
# Total of 401322 entries created in tigrGeneIndex table.
hgsql hg17 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql hg17 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords hg17 tigrGeneIndex
gzip *.gff *TCs
# BLASTZ FOR ZEBRAFISH (danRer2) (DONE, 2004-12-09, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
# /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish exists (makeDanRer1.doc)
mkdir -p /iscratch/i/danRer2/linSpecRep.notInHuman
foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/danRer2/linSpecRep.notInHuman/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.danRer2.2004-12-08
ln -s /cluster/data/hg17/bed/blastz.danRer2.2004-12-08 \
/cluster/data/hg17/bed/blastz.danRer2
cd /cluster/data/hg17/bed/blastz.danRer2
# Set L=6000 and abridge repeats - these are the same parameters used
# for hg16 and Fugu and similar to those for hg16-galgal2
cat << '_EOF_' > DEF
# human (hg17) vs zebrafish (danRer2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human (hg17)
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: zebrafish (danRer2)
SEQ2_DIR=/iscratch/i/danRer2/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.danRer2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
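    # A rough gloss of the blastz knobs above, from the blastz docs (treat
    # as a reminder, not a spec): K = score threshold for ungapped seed
    # hits (MSPs), L = threshold for keeping gapped alignments, H = run a
    # higher-sensitivity interpolation pass between alignments scoring
    # over H, Y = gap-extension penalty, Q = substitution matrix file
    # (HoxD55.q here). BLASTZ_ABRIDGE_REPEATS=1 lifts out the
    # lineage-specific repeats named by SEQ?_SMSK before aligning and
    # restores them in the output coordinates afterwards.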
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg17-danRer2.2004-12-08
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.danRer2
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check ...etc.
# para time
# Completed: 58993 of 58993 jobs
# CPU time in finished jobs: 19583036s 326383.93m 5439.73h 226.66d 0.621 y
# IO & Wait Time: 471090s 7851.50m 130.86h 5.45d 0.015 y
# Average job time: 340s 5.67m 0.09h 0.00d
# Longest job: 885s 14.75m 0.25h 0.01d
# Submission to last job: 78245s 1304.08m 21.73h 0.91d
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer2
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 789s 13.14m 0.22h 0.01d 0.000 y
# IO & Wait Time: 2992s 49.87m 0.83h 0.03d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 34s 0.57m 0.01h 0.00d
# Submission to last job: 391s 6.52m 0.11h 0.00d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/danRer2/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.danRer2/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 99s 1.64m 0.03h 0.00d 0.000 y
# IO & Wait Time: 862s 14.37m 0.24h 0.01d 0.000 y
# Average job time: 21s 0.36m 0.01h 0.00d
# Longest job: 92s 1.53m 0.03h 0.00d
# Submission to last job: 456s 7.60m 0.13h 0.01d
# crashed job: chr6_hla_hap1.axt is empty - has no alignments
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir -p pslChrom
set tbl = "blastzDanRer2"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f
echo "$f Done"
end
# try different parameters for blastz with chr1 of hg17
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment
# refGene:cds 1.301%, blastzDanRer1 3.934%, both 0.874%, cover 67.23%,
# enrich 17.09x
# H=2000, Y=3400, L=6000, K=2200 and HoxD55.q scoring matrix
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2 -enrichment
# refGene:cds 1.301%, blastzDanRer2 3.845%, both 0.879%, cover 67.55%,
# enrich 17.57x
# same parameters as above but L=8000
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2L8k -enrichment
# refGene:cds 1.301%, blastzDanRer2L8k 2.309%, both 0.778%, cover 59.81%,
# enrich 25.91x
# enrichment went up but coverage dropped quite a bit.
# Default parameters with H=2000
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2Default -enrichment
# refGene:cds 1.301%, blastzDanRer2Default 1.701%, both 0.846%, cover 65.04%,
# enrich 38.24x
# same as first run but with no Y option set (default Y)
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2NoY -enrichment
# refGene:cds 1.301%, blastzDanRer2NoY 3.980%, both 0.877%, cover 67.47%,
# enrich 16.95x
# row count:
# danRer2 122160
# danRer2L8k 62815
# danRer2Default 75818
# danRer2NoY 124129
# can be pruned at the chaining step.
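    # Reading the featureBits -enrichment lines above: enrichment is
    # coverage divided by the track's share of the sequence, e.g. for the
    # first danRer2 run 67.55% / 3.845% ~= 17.57x. Raising L trades
    # coverage for enrichment (the L=8000 row), while the default-parameter
    # run gives the best ratio at slightly lower coverage.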
# trackDb - change Zebrafish Blastz to danRer1 Blastz and display this track
# for danRer2 as Zebrafish Blastz
# RESCORE DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# Low scores can occur with repeats abridged and using the
# HoxD55.q matrix. PSU's restore_rpts program rescored alignments
# with the default matrix instead of the BLASTZ_Q matrix.
# Rescore them here so the chainer sees the higher scores:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# psl files and blastz tables will be the same regardless of score so
# no need to reload
# CHAIN DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# RELOAD CHAINS WITH FILTERING (DONE, 2004-12-10, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-22, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
# create input list
ls -1S /cluster/data/hg17/bed/blastz.danRer2/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
    # (fields in the gap file are tab-separated)
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/danRer2/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1837s 30.62m 0.51h 0.02d 0.000 y
# IO & Wait Time: 441s 7.35m 0.12h 0.01d 0.000 y
# Average job time: 51s 0.84m 0.01h 0.00d
# Longest job: 106s 1.77m 0.03h 0.00d
# Submission to last job: 419s 6.98m 0.12h 0.00d
# crashed job is chr6_hla_hap1 which has no alignments
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# apart from chr19 not too many with chains with scores < 5000
# load chr1 chain into table and check
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chain
hgLoadChain hg17 chr1_chainDanRer2 chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.301%, chainDanRer2Link 3.676%, both 0.877%, cover 67.42%,
# enrich 18.34x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2 -enrichment
# refGene:cds 1.301%, chainDanRer2 32.611%, both 1.034%, cover 79.52%,
# enrich 2.44x
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
# load chr1 filtered chains and check
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chainFilt5k
hgLoadChain hg17 chr1_chainDanRer2Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5kLink -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5kLink 2.907%, both 0.870%, cover 66.86%,
# enrich 23.00x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5k -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5k 31.343%, both 1.028%, cover 79.02%,
# enrich 2.52x
# checked browser - when filtered on minScore=5000, the low scoring
# alignments removed are small and/or poor alignments so use this version.
# remove repeats from filtered chains and reload into database
# (2004-12-22, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
mv chainFilt5k chainRaw
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /cluster/bluearc/hg17/bothMaskedNibs \
/cluster/bluearc/danRer2/nib $f \
../chain/$c.chain
end
cd ..
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# load filtered chains and check
ssh hgwdev
echo 'drop table chr1_chainDanRer2Filt5k;' | hgsql hg17
echo 'drop table chr1_chainDanRer2Filt5kLink;' | hgsql hg17
# reload filtered chains instead of unfiltered (2004-12-10, hartera)
# reload filtered chains with repeats removed (2004-12-22, hartera)
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/
cd chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainDanRer2 $i
echo done $c
end
# trackDb - change Zebrafish Chain to danRer1 Chain and display this track
# for danRer2 as Zebrafish Chain.
# after chainAntiRepeat
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.304%, chainDanRer2Link 2.742%, both 0.872%, cover 66.81%,
# enrich 24.36x
# NET DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# RE-CREATE NET WITH FILTERED CHAINS (DONE, 2004-12-10, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
rm -r preNet
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 133443584, utime 905 s/100, stime 139
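    # In outline: chainPreNet drops chains that cannot contribute to the
    # net, chainNet builds target- and query-side nets (the query net is
    # discarded to /dev/null above), and netSyntenic annotates the merged
    # net with synteny info. For a single chrom this could be piped in one
    # line (untested sketch):
    #   chainPreNet chr1.chain ../../S1.len ../../S2.len stdout \
    #     | chainNet stdin -minSpace=1 ../../S1.len ../../S2.len stdout /dev/null \
    #     | netSyntenic stdin chr1.syn.net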
# Add classification info using db tables:
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInHuman
# linSpecRep.notInZebrafish exists for hg17
cp /iscratch/i/danRer2/linSpecRep.notInHuman/* \
/cluster/bluearc/danRer2/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
time netClass noClass.net hg17 danRer2 zfishdanRer2.net \
-tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
-qNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman -noAr
# 97.230u 54.290s 5:37.50 44.8% 0+0k 0+0io 217pf+0w
# load net into database
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
netFilter -minGap=10 zfishdanRer2.net | hgLoadNet hg17 netDanRer2 stdin
# trackDb - change Zebrafish Net to danRer1 Net and display this track
# for danRer2 as Zebrafish Net.
# after chainAntiRepeat:
# featureBits hg17 refGene:cds netDanRer2 -enrichment
# refGene:cds 1.015%, netDanRer2 22.898%, both 0.783%, cover 77.15%,
# enrich 3.37x
# index had NULL cardinality, analyze table to fix (2005-1-18, Heather)
hgsql hg17
analyze table netDanRer2
# LOAD ACEMBLY TRACK (DONE, 2005-01-24, hartera)
# ACEMBLY TABLE RELOADED AND FINISHED COLOR CODING CODE IN
# hgTracks (2005-01-28, hartera)
# FINISHED CODE FOR FILTERING BY GENE CLASS (2005-02-03, hartera)
mkdir -p /cluster/data/hg17/bed/acembly
cd /cluster/data/hg17/bed/acembly
# Data is obtained from
    # Danielle and Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_35.genes.gff.tar.gz
tar xvzf acembly.ncbi_35.genes.proteins.fasta.tar.gz
cd acembly.ncbi_35.genes.gff
# the acembly dataset for hg16 had problems with reverse blocks so
# check for these
cat << '_EOF_' > checkReversedBlocks
for i in x1*.gff
do
echo -n "$i working ..."
awk -F"\t" '
{
if ($4 > $5) {
printf "reverse blocks problem for $1"
printf "\n"
}
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x checkReversedBlocks
./checkReversedBlocks
ls -l *.fixed
# all *.fixed files are empty so remove - there is no reversing of blocks
rm *.fixed
foreach f (x1.acemblygenes.*.gff)
set c=$f:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end
#- Load into database - use extended genePred
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
# for entry with 28212470 from chr6.gff, change to chr6
# and for 29124352 in chr6.gff, change to chr6 (1/13/05)
echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
| hgsql hg17
echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
| hgsql hg17
# checkTableCoords and runGeneCheck to check data
# a number of errors so leave on hgwdev for the moment
# checkTableCoords:
# rah.acembly has 16 records with chrom not described in chromInfo.
# rah.acembly item RPL10A.sNov04 chr6:35544172-35546520: end of last block (35546519) is not the same as chromEnd (35546520).
# rah.acembly has 1 records with blockEnd[n-1] != end.
# rah.acembly has 1 records with end > chromSize.
# chr6 acembly exon 35545934 35546101 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 5
# chr6 acembly intron 35546102 35546520 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; intron_type fuzzy
# chr6 acembly CDS 35546335 35546384 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6
# chr6 acembly exon 35546335 35546519 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6
# chr6 acembly stop_codon 35546382 35546384 . +
# 0 gene_id RPL10A; transcript_id RPL10A.sNov04;
# here the intron overlaps exon 6 so take 35546519 to be txEnd
echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";'\
| hgsql hg17
# for record where end > chromSize
    echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom and c.size < a.txEnd;' | hgsql hg17
# KIR2DL5.bNov04 on chr19_random, chr19_random size is 301858,
# txEnd is 305266 delete this record
echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# from runGeneCheck:
# 5780 inFrameStop
# 110664 noStart
# 23085 badCdsSplice
# 23848 noStop
# 14957 badUtrSplice
# 3661 gap
# 4726 badFrame
# 261066 lines in genePred.tab
# e-mailed authors of data (2004-12-21, hartera)
# notiri.aNov04 - has ctg instead of atg at start. others have no start specified: sirora.nNov04
# sirora.zmNov04 - chr1:19389-19392 is AAC (gtt) (-)
# sirora.sNov04 - chr1:8925-8928 CAA (ttg) (-)
# sirora.rNov04 - chr1:8925-8928 CAA (ttg) (-)
# for entries with 28212470 and 29124352 instead of chr6 change to chr6
# Re-process this x1 file to chr6.gff (2005-01-24)
mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
sed -e "s/^28212470/6/" x1.acemblygenes.6.gff.broken | sed -e \
"s/^29124352/6/" > x1.acemblygenes.6.gff
grep -v ^6\| x1.acemblygenes.6.gff | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr6.gff
# Received a list of genes from Jean and Danielle Mieg
# showing genes that are "main", "putative" or "cloud" - there should be
# no "cloud" genes in our data set (2005-01-11)
# download acembly_gene_lists.tar.gz from e-mail
cd /cluster/data/hg17/bed/acembly
tar xvzf acembly_gene_lists.tar.gz
cd acembly_gene_lists
cat << '_EOF_' > getIDs.pl
#!/usr/bin/perl -w
use strict;
while (<STDIN>) {
my @f = split(/\s+/);
for (my $i =0; $i <= $#f; $i++) {
if ($f[$i] =~ /gene_id$/) {
# if field is ID type then next value is the ID
my $id = $f[$i+1];
# remove ";" at end and print ID
chop $id;
print "$id\n";
}
}
}
'_EOF_'
chmod +x getIDs.pl
# get gene IDs from gff files
foreach f (../acembly.ncbi_35.genes.gff/chr*.gff)
echo "Processing $f"
perl getIDs.pl < $f >> genesGffs.ids
end
# remove back slash from some names
sort genesGffs.ids | uniq > genesGffs.ids.uniq
# reformat gene list to get just the genes and remove first 2 lines and sort
foreach g (*.list)
sed -e 's/"//g;' $g | sed -e 's/Gene : //;' | sed -e '1,2d' \
| sort | uniq > $g.IDsort
end
# remove back slash from some names
perl -pi.bak -e 's/\\//' *.IDsort
# check if cloud genes appear in gff files list of genes
# list of genes in cloud but not in gff
comm -13 genesGffs.ids.uniq cloud_gene.list.IDsort > gffvscloud.out
diff gffvscloud.out cloud_gene.list.IDsort
# there is no difference so none of the cloud genes are in the gff files
# check if all the other genes in the main and putative lists are in gffs
comm -13 genesGffs.ids.uniq main_gene.list.IDsort > gffvsmain.out
comm -13 genesGffs.ids.uniq putative_gene.list.IDsort > gffvsputative.out
wc -l *.out
# 14 gffvsmain.out
# 0 gffvsputative.out
# there are 14 genes in the main set not in the gff files
# actually there are 12, as FCA/MR and SLA/LP are in the gff files
# all putative genes are in the gff set
wc -l main_gene.list.IDsort putative_gene.list.IDsort
# 52467 main_gene.list.IDsort
# 43978 putative_gene.list.IDsort
# 96445 total
wc -l genesGffs.ids.uniq
# 97042 genesGffs.ids.uniq
# check discrepancy
cat main_gene.list.IDsort putative_gene.list.IDsort > mp.ids
sort mp.ids > mp.sort
comm -23 genesGffs.ids.uniq mp.sort > gffNotMP.out
wc -l gffNotMP.out
# 609 gffNotMP.out
# create table of Acembly gene classifications
# see http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/index.html?human
# in FAQ, describes main, putative and cloud genes. The cloud genes are not
# well confirmed and so they are not in this data set.
# NEED TO FILTER GENES AND RELOAD TABLES:
# authors Jean and Danielle Mieg e-mailed back. The 12 genes in the
# main list that are not in the gff files were not exported
# as they did not find a single putative protein to describe so they
# were not added to the gffs. They will be added at a later date.
# Remove these from the acemblyClass table (2005-01-21, hartera)
# Reload acemblyClass table as problems with the gene names
# the class table has gene IDs and the acembly table has transcript IDs
# it is hard to look up class in the class table since just removing the
# transcript ID suffixes (e.g. "aNov04" after a ".") does not work as
# some gene IDs have a "." in them anyway.
ssh kksilo
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
comm -13 gffvsmain.out main_gene.list.IDsort > main_gene.list.filt
wc -l main_gene.list.filt
# 52455 main_gene.list.filt
ssh hgwdev
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
# drop acemblyClass table and recreate (2005-01-27, hartera)
echo 'drop table acemblyClass;' | hgsql hg17
# prepare a file of genes and classification
# use transcript IDs - get these and corresponding gene IDs from gff files
# if gene IDs were used, it would be hard to parse them out of the
# transcript ID (name column) of the acembly genePred table: e.g. for
# transcript ID notiri.aNov04 one could strip the suffix after "." to
# get the gene ID, but some gene names contain a "." themselves and
# not all names carry the suffix.
# 260446 transcript IDs (use allFiltered.gff - see below)
perl getClass.pl main_gene.list.filt putative_gene.list.IDsort \
../acembly.ncbi_35.genes.gff/allFiltered.gff
foreach f (main_gene.list.filt putative_gene.list.IDsort)
if ($f == "main_gene.list.filt") then
set t = "main"
endif
if ($f == "putative_gene.list.IDsort") then
set t = "putative"
endif
awk 'BEGIN {OFS="\t"} {print $1, "'$t'"}' $f >> class.txt
end
sort classes.txt | uniq > geneIDtxID.class
# get transcript ID and class fields for acemblyClass table
awk 'BEGIN {OFS="\t"} {print $2,$3}' geneIDtxID.class > acemblyClass.tab
wc -l acemblyClass.tab
# 260446 acemblyClass.tab
# make change to acemblyClass.as and check in:
# change name to be transcript ID instead of gene ID
cat << '_EOF_' > $HOME/kent/src/hg/lib/acemblyClass.as
table acemblyClass
"Class for Acembly genes"
(
string name; "Transcript ID for Acembly gene"
string class; "Class of gene"
)
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql acemblyClass.as acemblyClass
mv acemblyClass.h $HOME/kent/src/hg/inc
# do make to check it works and commit the .as, .sql, .c and .h files to CVS
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
echo "drop table acemblyClass" | hgsql hg17
hgsql hg17 < ~/kent/src/hg/lib/acemblyClass.sql
# reload table with transcript IDs
echo "load data local infile 'acemblyClass.tab' into table acemblyClass" \
| hgsql hg17
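    # Quick check of the reload (sketch; per-class counts should roughly
    # track the filtered list sizes above, scaled up by transcripts per
    # gene):
    #   hgsql hg17 -e 'select class, count(*) from acemblyClass group by class'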
# There were also 609 genes in the gff files that are not in the
# main, putative or cloud gene lists. Jean and Danielle Mieg say that
# these were filtered out from their data set but not from the gff files.
# Remove these from the gff files. (gffNotMP.out) (2005-01-24)
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
cat chr*.gff > all.gff
cat << '_EOF_' > removeGenes.pl
#!/usr/bin/perl -w
use strict;
my $genes = $ARGV[0];
my $gff = $ARGV[1];
open(GENES, $genes) || die "Can not open $genes:$!\n";
open(GFF, $gff) || die "Can not open $gff:$!\n";
open(OUT, ">removed.out") || die "Can not open removed.out:$!\n";
my %genes;
while (<GENES>) {
chomp;
my $g = $_;
$genes{$g} = 1;
}
close GENES;
while (<GFF>) {
my $l = $_;
my $id;
my @line = split(/\s+/);
for (my $i = 0; $i <= $#line; $i++) {
if ($line[$i] eq "gene_id") {
$id = $line[$i+1];
}
}
$id =~ s/;//;
print "id is now $id\n";
if (!exists($genes{$id})) {
print $l;
}
else {
print OUT $l;
}
}
'_EOF_'
perl removeGenes.pl ../acembly_gene_lists/gffNotMP.out all.gff \
> allFiltered.gff
# checked that gene IDs in the removed.out file are the same
# same as those in gffNotMP.out
# reload into the acembly table
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
echo 'drop table acembly;' | hgsql hg17
# Reloaded with filtered set 2005-01-23, reload again 2005-01-28 with
# the genePredExt option to get gene ID in name 2 field
ldHgGene -gtf -genePredExt hg17 acembly \
acembly.ncbi_35.genes.gff/allFiltered.gff
# Read 260446 transcripts in 3656676 lines in 1 files
# 260446 groups 41 seqs 1 sources 5 feature types
# 260446 gene predictions
# remove cdsStartStat, cdsEndStat and exonFrames fields
echo 'alter table acembly drop column cdsStartStat;' | hgsql hg17
echo 'alter table acembly drop column cdsEndStat;' | hgsql hg17
echo 'alter table acembly drop column exonFrames;' | hgsql hg17
# fix problem data found by checkTableCoords
# here the intron overlaps exon 6 so take 35546519 to be txEnd
echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";'\
| hgsql hg17
# for record where end > chromSize
    echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom and c.size < a.txEnd;' | hgsql hg17
# KIR2DL5.bNov04 on chr19_random, size is 301858, txEnd is 305266
# delete this record
echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# acembly peptide table
# need to just grab same sequences that are in acembly
cd ./acembly.ncbi_35.genes.proteins.fasta
echo 'select name from acembly;' | hgsql -N hg17 > acembly.name
cat *.fasta > allPep.fa
faSomeRecords allPep.fa acembly.name acemblyPep.fa
# PEPTIDE SEQUENCES NOT LOADED
# There are 236,554 peptide names that do not match transcript IDs in
# the acembly table and 110,278 transcript IDs in acembly that do not
    # have a corresponding peptide. Waiting for response about this from
# Jean and Danielle (2005-01-31)
# hgPepPred hg17 generic acemblyPep \
# acembly.ncbi_35.genes.proteins.fasta/*.fasta
# Edit hgTracks.c to get colour coded tracks based on the gene class
# for each gene as read from the acemblyClass table.
# Edits to hui.c, hgTrackUi.c and hgTracks.c to allow filtering of
# genes based on class.
# acembly trackDb entry:
# track acembly
# shortLabel Acembly Genes
# longLabel AceView Gene Models With Alt-Splicing
# group genes
# priority 41
# visibility dense
# color 155,0,125
# type genePred acemblyPep acemblyMrna
# url http://www.ncbi.nih.gov/IEB/Research/Acembly/av.cgi?db=hg17&l=$$
# itemClassTbl acemblyClass
# geneClasses main putative
# gClass_main 128,0,125
# gClass_putative 200,0,125
# urlLabel Transcript ID:
# search added:
# searchTable acembly
# searchType genePred
# searchMethod prefix
# termRegex [^[:space:]]+
# searchPriority 50
# Received data with gene product relationship from Jean Thierry-Mieg
# (2005-02-17)
ssh eieio
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.proteins.fasta
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/human/acembly.ncbi_35.gene2product.txt.gz
gunzip acembly.ncbi_35.gene2product.txt.gz
# these are gene ID and product mappings, need transcript ID to product
# mappings. E-mailed Jean Thierry-Mieg to ask for this information
# BUILD WGRNA TRACK (DONE, 2004-12-13, Fan)
# Grab data from original miRNA track and convert them into wgRna .tab format.
hgsql hg17 --skip-column-names -e 'select * from miRNA' >miRNA.out
cat miRNA.out | awk {'print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"'} >wgRna.tab
# Break the original custom track data file, hsa-snoRNA_track.txt, into two files j1 and j2,
# then remove header and blank lines.
cat j1 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""CDBox"'} >>wgRna.tab
cat j2 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""HAcaBox"'} >>wgRna.tab
# load into wgRna table
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# create and edit wgRna.html under src/hg/makeDb/trackDb/human/hg17.
# RELOADED wgRna DATA USING wgRNA_corrected.txt SENT BY MICHEL WEBER
# Manually removed the first header line, the first column (the bin
# field), and the last empty line.
cut -f 2- wgRNA_corrected.txt >wgRna.tab
vi wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# UPDATED WGRNA DATA PER EMAIL FROM WEBER (2004-12-14, Fan).
# Added the following 3 lines to j1
chr3 161715396 161715726 U90 480 -
chr11 93104316 93104387 Z40 480 -
chr11 93106041 93106114 mgh28S-2410 480 -
# Regenerated wgRna table
cat miRNA.out | awk {'print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"'} >wgRna.tab
cat j1 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""CDBox"'} >>wgRna.tab
cat j2 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""HAcaBox"'} >>wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Changed the following records to RNA type scaRna.
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U85"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U87"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U88"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U89"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U90"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U91"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U92"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U93"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U100"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA26"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA35"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA45"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA47"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA57"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="HBII-382"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-19/30"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-25/61"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-22/U4-8"'
# Updated .../trackDb/human/hg17/wgRna.html.
# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p $gp/vsDanRer2/axtChrom
cp -p *.axt $gp/vsDanRer2/axtChrom
cd $gp/vsDanRer2/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-22, hartera)
rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
gzip -c all.chain.antirepeat > \
/cluster/data/hg17/zip/zebrafishDanRer2.chain.gz
gzip -c zfishdanRer2.net > /cluster/data/hg17/zip/zebrafishDanRer2.net.gz
cd $gp/vsDanRer2
mv /cluster/data/hg17/zip/zebrafish*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# CLEANUP DANRER2 BLASTZ (DONE, 2004-12-14, hartera)
# RE-DONE (DONE, 2004-12-22, hartera)
# REMOVED RAW AND LAV DIRS (DONE, 2005-02-24, hartera)
ssh eieio
cd /cluster/data/hg17/bed/blastz.danRer2
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
nice rm -rf raw &
nice rm -rf lav &
# EXTRACT AXT'S AND MAF'S FROM TETRAODON (tetNig1) NET
# (DONE, 2004-12-15, hartera)
# Redo to remove overlaps (2006-04-07 kate)
ssh eieio
# create axts
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
netSplit tetNig1.net tetraodonNet
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (tetraodonNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt tetraodonNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/tetNig1/nib ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/hg17/bed/blastz.tetNig1
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh eieio
cd /cluster/data/hg17/bed/blastz.tetNig1
cd axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.tetNig1.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
../mafNet/$maf -tPrefix=hg17. -qPrefix=tetNig1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &
# redo axt's and maf's to remove overlaps (2006-04-07 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.tetNig1
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir -p axtNet mafNet
cd axtChain
cat > fix.csh << 'EOF'
date
foreach f (tetraodonNet/chr*.net)
set c = $f:t:r
echo $c
netToAxt tetraodonNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/tetNig1/nib stdout | \
axtSort stdin ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=tetNig1.
end
date
'EOF'
csh fix.csh >&! fix.log &
cd /san/sanvol1/scratch/hg17/mafNet
rm -fr tetNig1
cp -rp /cluster/data/hg17/bed/blastz.tetNig1/mafNet tetNig1
# 10-WAY MULTIZ -- 8-WAY PLUS FROG AND TETRA (DONE 2004-12-22 kate)
# Use older multiz (not v10) till bugs fixed
ssh eieio
cd /cluster/data/hg17/bed
rm multiz10way
mkdir multiz.2004-12-22
ln -s multiz.2004-12-22 multiz10way
cd multiz10way
cat > tree.nh << 'EOF'
((((((hg17,panTro1),(rn3,mm5)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))
'EOF'
mkdir /cluster/bluearc/hg17/multiz.2004-12-22
cd /cluster/bluearc/hg17
mkdir 2004-12-22
rm multiz10way
ln -s multiz.2004-12-17 multiz10way.v10
ln -s multiz.2004-12-22 multiz10way
# reuse pairwise MAF's on bluearc
mv multiz10way.v10/{canFam1,danRer1,fr1,galGal2,mm5,panTro1,rn3,tetNig1,xenTro1} multiz10way
# NOTE: pairwise mafs were moved to /cluster/bluearc/hg17/mafNet
# make output dir and run dir
ssh kk9
cd /cluster/data/hg17/bed
cd multiz10way
mkdir -p maf
mkdir -p run
cd run
# create scripts to run multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/$user/multiz10way.$c
set pairs = /cluster/bluearc/hg17/multiz10way
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
rm -fr $multi
exit
endif
set s1 = $2
set s2 = $3
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
endif
if (-d $pairs/$s2) then
set d2 = $pairs
endif
set f1 = $d1/$s1/$c.maf
set f2 = $d2/$s2/$c.maf
# write to output dir
set out = $multi/${s2}${s1}
mkdir -p $out
# check for empty input file
if (-s $f1 && -s $f2) then
echo "Aligning $f1 $f2"
/cluster/bin/penn/multiz $f1 $f2 - > $out/$c.maf
else if (-s $f1) then
cp $f1 $out
else if (-s $f2) then
cp $f2 $out
endif
'EOF'
# << for emacs
chmod +x oneMultiz.csh
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
oneMultiz.csh $c mm5 panTro1
oneMultiz.csh $c rn3 panTro1mm5
oneMultiz.csh $c canFam1 panTro1mm5rn3
oneMultiz.csh $c galGal2 panTro1mm5rn3canFam1
oneMultiz.csh $c xenTro1 panTro1mm5rn3canFam1galGal2
oneMultiz.csh $c fr1 panTro1mm5rn3canFam1galGal2xenTro1
oneMultiz.csh $c tetNig1 panTro1mm5rn3canFam1galGal2xenTro1fr1
oneMultiz.csh $c danRer1 panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1
# get final alignment file
cp /scratch/$user/multiz10way.$c/panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1danRer1/$c.maf /cluster/data/hg17/bed/multiz10way/maf/$c.maf
#cleanup
oneMultiz.csh $c
'EOF'
# << for emacs
chmod +x allMultiz.csh
cat > gsub << 'EOF'
#LOOP
allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz10way/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst
gensub2 chrom.lst single gsub jobList
para create jobList
para try; para check
para push
# post-process multiz maf with maf_project to "glue" short
# alignment blocks together
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir -p mafGlued
cd maf
foreach f (*.maf)
set c = $f:r
echo "gluing $f"
/cluster/bin/penn/maf_project $f hg17.$c > ../mafGlued/$c.maf
end
# filter out alignment blocks with no alignments in non-reference species,
# and low-scoring alignments based on Webb Miller's latest
# recommendations (score < -5 * ncol^2 * nrow)
# NOTE: Webb hasn't approved the filtered alignments yet,
# so leaving them in for now.
#mkdir -p mafFiltered
#cd ../mafGlued
#foreach f (*.maf)
#set c = $f:r
#echo "filtering $f"
#~kate/bin/i386/mafFilter -factor $f > ../mafFiltered/$c.maf
#end
#cd ..
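    # Worked instance of that cutoff: a block of 10 rows (species) and 50
    # columns would be dropped when score < -5 * 50^2 * 10 = -125000, so
    # larger blocks tolerate proportionally lower scores.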
grep score mafGlued/chr1.maf | wc -l
grep score mafFiltered/chr1.maf | wc -l
grep score mafGlued/bad | wc -l
# 43692
grep score=0.0 bad | wc -l
# 10206
# load alignments into tables
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8
set mafDir = /gbdb/hg17/mafNet
mkdir -p $mafDir
# multiple alignment
set mafDir = /gbdb/hg17/multiz10way/maf
mkdir -p $mafDir/multiz10way
cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued
ln -s `pwd`/*.maf $mafDir/multiz10way
hgLoadMaf hg17 -warn multiz10way -pathPrefix=$mafDir/multiz10way
# load summary table to replace pairwise
cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued/
time cat chr*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 hg17 multiz10waySummary stdin
# Processed 27314693 components in 9081437 mafs from stdin
# 30 minutes
# CONSERVATION SCORING WITH PHASTCONS (DONE 2005-01-14 kate)
# 1. Partition multiple alignment into windows, using "msa_split"
# 2. Create starting tree model, with branch lengths
# use "phyloFit" on alignments
# 3. Estimate GC avg. over all species, use "msa_view" on maf
# 4. Estimate other model params, using phastCons (via doEstimate script)
# NOTE: no alignment filtering done -- the scores don't look
# particularly meaningful w/ this version of multiz.
# Next time, run on "glued" (maf_projected)
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
set mafDir = /cluster/bluearc/hg17/multiz10way.v8/maf
mkdir -p $mafDir
cp -r maf/*.maf $mafDir
ssh kk9
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir cons
cd cons
# break up the genome-wide MAFs into pieces
# NOTE: chrom fasta files are already on the bluearc
# from previous run
mkdir /cluster/bluearc/hg17/chrom
cd /cluster/data/hg17
foreach f (`cat chrom.lst`)
echo $f
cp -r $f/*.fa /cluster/bluearc/hg17/chrom
end
cd /cluster/data/hg17/bed/multiz10way.v8/cons
mkdir run.split
cd run.split
set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg17/chrom
WINDOWS=/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,xenTro1,fr1,tetNig1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
[ $? -eq 0 ] || exit 1
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
[ $? -eq 0 ] || exit 1
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs
set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
chmod +x doSplit.sh
rm -f jobList
foreach file (/cluster/bluearc/hg17/multiz10way.v8/maf/*.maf)
set c = $file:t:r
echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 46 jobs
para try
para check
# TODO: cleanup
# rm -fr $mafDir
# now generate conservation scores and predicted elements
set path = ($path /cluster/bin/phast); rehash
cd /cluster/data/hg17/bed/multiz10way.v8/cons
mkdir run.elements
cd run.elements
# create a starting tree model from a chr1 ss files in WINDOWS dir.
ssh kolossus
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
gunzip -c /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/chr1.14996059-15998256.ss.gz \
> /tmp/phastCons.$$
phyloFit -i SS /tmp/phastCons.$$ --out-root starting-tree --tree \
"((((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))"
rm /tmp/phastCons.$$
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -2635749.517410
#BACKGROUND: 0.247225 0.248374 0.250827 0.253574
#RATE_MAT:
#-0.997890 0.201447 0.648573 0.147870
#0.200515 -1.020796 0.190184 0.630096
#0.639258 0.188324 -1.025170 0.197587
#0.144168 0.617176 0.195447 -0.956791
#TREE: ((((((hg17:0.006401,panTro1:0.008342):0.099376,(mm5:0.083404,rn3:0.105411):0.242694):0.020883,canFam1:0.221922):0.099131,galGal2:0.275759):0.041997,xenTro1:0.280306):0.064815,((fr1:0.137674,tetNig1:0.091463):0.118573,danRer1:0.250847):0.064815);
# estimate model parameters
# estimate avg. cross-species avg. GC content from chr1 maf's
ssh kolossus
set path = ($path /cluster/bin/phast); rehash
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,xenTro1,danRer1,tetNig1,fr1 \
-i MAF \
--summary-only /cluster/data/hg17/bed/multiz10way.v8/maf/chr1.maf\
> maf_summary.txt
awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
# 0.424
# generate models from random sample of genome (use 90 1Mb windows,
# to conveniently run on rack 9 100-node cluster)
# On first pass, used parameters from 8way alignment:
    # --expected-lengths 12 --target-coverage .17
# NOTE: there may be a cleverer way to select the first length param
# On second pass, used parameters below, based on consEntropy
# and featureBits coverage of elements, below
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.424 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 11 --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'EOF'
chmod u+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
ls /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.gz > all.windows
/cluster/bin/phast/chooseLines -k 90 all.windows > subset.windows
rm -f jobs.lst
foreach f (`cat subset.windows`)
set root = $f:t:r:r
echo doEstimate.sh /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/$f LOG/$root.log TREES/$root >> jobs.lst
end
# run cluster job (about an hour)
ssh kk9
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
para create jobs.lst
# 90 jobs written to batch
para try; para check
para push
# 2 jobs crashed with out-of-mem; as we are just taking a sample
# this is probably OK, but I've notified Adam
# Average job time: 1055s 17.58m 0.29h 0.01d
# Longest job: 3647s 60.78m 1.01h 0.04d
# NOTE: should have used ave.noncons.mod to improve parameter estimation
    # cp ave.noncons.mod starting-tree.mod
ls TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
grep TREE ave.cons.mod
# TREE: ((((((hg17:0.002313,panTro1:0.002931):0.036375,(mm5:0.029849,rn3:0.039008):0.095334):0.003258,canFam1:0.078205):0.047189,galGal2:0.158045):0.020103,xenTro1:0.169387):0.028857,((fr1:0.071610,tetNig1:0.057766):0.091165,danRer1:0.138905):0.028857);
ls TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave.noncons.mod
# TREE: ((((((hg17:0.007342,panTro1:0.009340):0.116009,(mm5:0.095037,rn3:0.124288):0.304355):0.010633,canFam1:0.249367):0.151476,galGal2:0.507037):0.064317,xenTro1:0.549121):0.094733,((fr1:0.231246,tetNig1:0.185161):0.296288,danRer1:0.446734):0.094733);
# analyze conservation genome-wide
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 11 --target-coverage 0.20 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS
rm -f jobs2.lst
foreach f (/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobs2.lst
end
# run cluster job (it's quick -- 10 minutes or so)
ssh kk
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
para create jobs2.lst
# 2932 jobs written to batch
para try; para check
para push
# Average job time: 80s 1.33m 0.02h 0.00d
# Longest job: 157s 2.62m 0.04h 0.00d
# Submission to last job: 583s 9.72m 0.16h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
# do in a way that avoids limits on numbers of args
rm -f splitfiles* all.raw.bed
find /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS -name "*.bed" > files
split files splitfiles
foreach s (splitfiles*)
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed
end
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
rm files splitfiles*
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
hgLoadBed hg17 phastConsElements10way all.bed
sort -rn -k 5 all.bed | sed -n '1,100000p' > top100K.bed
hgLoadBed hg17 phastConsElements10wayTop100K top100K.bed
# check coverage -- re-ran the estimation and conservation steps with new
# parameters until coverage was close to 5% and the expected-length parameter
# was close to the consEntropy-recommended length (one iteration of the
# check is sketched below)
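# (Sketch of one tuning iteration, not a verbatim record of the run: pick
# trial --target-coverage/--expected-lengths values, re-run the estimation
# and phastCons steps with them, then check both numbers; the tgt/len
# values here are illustrative)
set tgt = 0.20
set len = 11
featureBits hg17 phastConsElements10way         # want coverage near 5%
/cluster/bin/phast/consEntropy $tgt $len \
    ave.cons.mod ave.noncons.mod --NH 9.78      # want omega near $len
# repeat with adjusted values until both criteria are met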
featureBits hg17 phastConsElements10way
# first pass
# .17 12
# 132657993 bases of 2866216770 (4.628%) in intersection
# second pass -- used this
# .20 11
# 143386170 bases of 2866216770 (5.003%) in intersection
featureBits hg17 phastConsElements
# 137850739 bases of 2866216770 (4.810%) in intersection
# check expected-length parameter
# first pass
/cluster/bin/phast/consEntropy .17 12 \
ave.cons.mod ave.noncons.mod --NH 9.78
# recommended length 10.4
# second pass -- good enough according to Adam
/cluster/bin/phast/consEntropy .20 11 \
ave.cons.mod ave.noncons.mod --NH 9.78
#( Solving for new omega: 11.000000 12.243251 12.155776 12.155369 )
#Transition parameters: gamma=0.200000, omega=11.000000, mu=0.090909, nu=0.022727
#Relative entropy: H=1.263205 bits/site
#Required length: N=7.548911 sites
#Total entropy: NH=9.535821 bits
#Recommended expected length: omega=12.155369 sites (for NH=9.780000)
# create wiggle data files
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8/cons
# sort post-prob files by chrom position using filename, then
# use wigEncode to create binary files for wiggle
find /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS \
-name "*.pp.gz" | sort -t\. -k2,2n | xargs zcat | \
wigEncode stdin phastCons10way.wig phastCons10way.wib
hgWiggle -doHistogram -hBinSize=0.001 \
    -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons >histo.8way.data
hgWiggle -doHistogram -hBinSize=0.001 \
    -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons10way >histo.10way.data
hgWiggle -db=hg17 -doStats \
phastCons > stats.8way.data
hgWiggle -db=hg17 -doStats \
phastCons10way > stats.10way.data
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8/cons
set wibDir = /gbdb/hg17/multiz10way/wib/phastCons10way
mkdir -p $wibDir
ln -s `pwd`/phastCons10way.wib $wibDir
hgLoadWiggle hg17 phastCons10way phastCons10way.wig \
-pathPrefix=$wibDir
# create tree image:
# edit tree.nh to create species.nh with common names
/cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
# photoshop to enhance, then save as gif/jpg
cp /cluster/data/hg17/bed/multiz10way.v8/species10.jpg \
/usr/local/apache/htdocs/images/phylo/10way.jpg
# get stats on the track
ssh hgwdev
featureBits hg17 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.020%, phastConsElements10way 5.003%, both 0.711%, cover 69.73%, enrich 13.94x
# compare to previous elements (generated from 8way)
featureBits hg17 -enrichment refGene:cds phastConsElements
# refGene:cds 1.020%, phastConsElements 4.810%, both 0.747%, cover 73.22%, enrich 15.22x
# see how gluing reduces number of alignments
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir stats
grep score maf/chr22.maf | grep -v 0.0 | wc -l
#179576
grep score mafGlued/chr22.maf | grep -v 0.0 | wc -l
#110550
# look at distribution of alignment sizes after gluing
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir mafTemp
ln -s `pwd`/maf/chr1.maf mafTemp
# load temp table
hgLoadMaf hg17 -pathPrefix=mafTemp multiz10wayChr1
#Loaded 1246727 mafs
# again, compare to glued:
echo "SELECT COUNT(*) FROM multiz10way"
# 738030
# again, ~40% fewer
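# (illustrative arithmetic, not from the original run: the ~40% figure
# follows directly from the two counts above)
echo 1246727 738030 | awk '{printf "%.1f%% fewer\n", 100*($1-$2)/$1}'
# 40.8% fewer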
cd stats
echo "SELECT chromEnd - chromStart FROM multiz10way WHERE chrom='chr1'" | \
hgsql -N hg17 | sort -n > chr1.maf.glued.sizes
echo "SELECT chromEnd - chromStart FROM multiz10wayChr1"| \
hgsql -N hg17 | sort -n > chr1.maf.sizes
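# (Example sketch, not part of the original run: the two size distributions
# can be eyeballed with textHistogram, used elsewhere in this doc; the
# binSize here is illustrative)
textHistogram -binSize=100 chr1.maf.glued.sizes
textHistogram -binSize=100 chr1.maf.sizes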
# cleanup
hgsql hg17 -e "DROP TABLE multiz10wayChr1"
rm -fr ../mafTemp
# coverage of multiple alignment, and pairs
ssh kolossus
cd /cluster/data/hg17/bed/multiz10way.v8
cd stats
nice mafRanges -notAllOGap ../mafGlued/chr1.maf hg17 \
hg17.chr1.mafRanges.bed
nice mafRanges -notAllOGap /cluster/data/hg17/bed/multiz8way/maf/chr1.maf \
hg17 hg17.8way.chr1.mafRanges.bed
foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1)
echo $db
nice mafRanges /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf \
-notAllOGap hg17 $db.chr1.mafRanges.bed
ls /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf
end
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8/stats
nice featureBits -chrom=chr1 hg17 refGene:cds hg17.chr1.mafRanges.bed -enrichment
# refGene:cds 1.308%, hg17.chr1.mafRanges.bed 95.725%, both 1.307%, cover 99.94%, enrich 1.04x
nice featureBits -chrom=chr1 hg17 refGene:cds hg17.8way.chr1.mafRanges.bed -enrichment
# refGene:cds 1.308%, hg17.8way.chr1.mafRanges.bed 95.742%, both 1.307%, cover 99.97%, enrich 1.04x
foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1)
nice featureBits -chrom=chr1 -enrichment hg17 refGene:cds $db.chr1.mafRanges.bed
end
#refGene:cds 1.308%, panTro1.chr1.mafRanges.bed 93.472%, both 1.264%, cover 96.65%, enrich 1.03x
#refGene:cds 1.308%, canFam1.chr1.mafRanges.bed 55.377%, both 1.277%, cover 97.64%, enrich 1.76x
#refGene:cds 1.308%, mm5.chr1.mafRanges.bed 37.342%, both 1.280%, cover 97.92%, enrich 2.62x
#refGene:cds 1.308%, rn3.chr1.mafRanges.bed 35.429%, both 1.257%, cover 96.14%, enrich 2.71x
#refGene:cds 1.308%, galGal2.chr1.mafRanges.bed 3.840%, both 0.936%, cover 71.61%, enrich 18.65x
#refGene:cds 1.308%, xenTro1.chr1.mafRanges.bed 3.059%, both 0.881%, cover 67.36%, enrich 22.02x
#refGene:cds 1.308%, fr1.chr1.mafRanges.bed 1.892%, both 0.854%, cover 65.29%, enrich 34.50x
#refGene:cds 1.308%, tetNig1.chr1.mafRanges.bed 1.384%, both 0.805%, cover 61.57%, enrich 44.50x
#refGene:cds 1.308%, danRer1.chr1.mafRanges.bed 2.716%, both 0.847%, cover 64.81%, enrich 23.86x
# MAKE HG17-RN3 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie)
ssh kolossus
set chainDir = /cluster/data/hg17/bed/blastz.rn3/axtChain
netChainSubset $chainDir/rat.net.gz $chainDir/all.chain.gz \
/cluster/data/hg17/bed/bedOver/hg17ToRn3.over.chain
# MAKE HG17-GALGAL2 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie)
ssh kolossus
set chainDir = /cluster/data/hg17/bed/blastz.galGal2/axtChain
netChainSubset $chainDir/human.net $chainDir/all.chain \
/cluster/data/hg17/bed/bedOver/hg17ToGalGal2.over.chain
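# (Usage sketch, not from the original run: these over.chain files are the
# map inputs for liftOver; the bed file names here are hypothetical)
liftOver myHg17.bed /cluster/data/hg17/bed/bedOver/hg17ToRn3.over.chain \
    myRn3.bed myHg17.unMapped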
# DOWNLOADS FOR 10-WAY MULTIZ (2005-01-24 kate)
# Use "glued" mafs
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p multiz10way
cd multiz10way
foreach f (/cluster/data/hg17/bed/multiz10way.v8/mafGlued/*.maf)
set c = $f:r:t
echo $c
nice gzip -c $f > $c.maf.gz
end
# copy README and edit
# Create upstream files for download
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8
echo hg17 panTro1 mm5 rn3 canFam1 galGal2 xenTro1 fr1 tetNig1 danRer1 > org.txt
# mafFrags takes a while
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg17 multiz10way up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
nice gzip upstream{1000,2000,5000}.maf
# 6 mins.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mv /cluster/data/hg17/bed/multiz10way.v8/upstream*.maf.gz multiz10way
cd multiz10way
md5sum *.gz > md5sum.txt
# Create histogram of this phastCons data (Hiram - 2005-02-07)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-12-22/cons
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg17 phastCons > histogram.data 2>&1
# 34 minutes
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Hg17 Histogram phastCons track"
set xlabel "Hg17 phastCons score"
set ylabel "p-Value"
set y2label "Cumulative Probability Distribution"
set y2range [0:1]
set y2tics
plot "histogram.data" using 2:5 title " pValue" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CPD" with lines
'_EOF_'
gnuplot histo.gp > histo.png
display histo.png &
# BLASTZ BOREOEUTHERIAN (BOREUT1) (DONE 1/29/05 braney)
ssh kk
mkdir /cluster/data/borEut1/bed/zb.hg17
ln -s /cluster/data/borEut1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.borEut1
cd /cluster/data/hg17/bed/blastz.borEut1
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# human vs. boreoeutherian ancestor reconstruction (borEut1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Boreoeutherian (borEut1)
SEQ2_DIR=/iscratch/i/borEut1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.borEut1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para push
# Completed: 2728 of 2728 jobs
# CPU time in finished jobs: 621440s 10357.34m 172.62h 7.19d 0.020 y
# IO & Wait Time: 19079s 317.98m 5.30h 0.22d 0.001 y
# Average job time: 235s 3.91m 0.07h 0.00d
# Longest job: 2340s 39.00m 0.65h 0.03d
# Submission to last job: 2837s 47.28m 0.79h 0.03d
ssh kki
cd /cluster/data/hg17/bed/blastz.borEut1
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para push
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 95s 1.58m 0.03h 0.00d 0.000 y
# IO & Wait Time: 825s 13.75m 0.23h 0.01d 0.000 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest job: 10s 0.17m 0.00h 0.00d
# Submission to last job: 73s 1.22m 0.02h 0.00d
ssh kk
cd /cluster/data/hg17/bed/blastz.borEut1
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para push
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr18_random.axt is empty
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr19_random.axt is empty
# ..
# Completed: 44 of 46 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 104s 1.73m 0.03h 0.00d 0.000 y
# IO & Wait Time: 482s 8.04m 0.13h 0.01d 0.000 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 134s 2.23m 0.04h 0.00d
# Submission to last job: 142s 2.37m 0.04h 0.00d
# END BLASTZ BOREOEUTHERIAN
##########################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE braney 1/15/05)
# Questions? weirauch@soe.ucsc.edu or braney@soe.ucsc.edu
# tfbsConsSites table reloaded 2006-11-03 - Hiram - see below:
## reload tfbsCons table - it was based on a newer version of tfbs names that
ssh hgwdev
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons
# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz (Perl scripts) from Matt Weirauch (weirauch@soe.ucsc.edu)
set tarfile=/cluster/data/hg17/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &
ssh kk
mkdir /cluster/bluearc/braney/tfloc
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push
# When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.
ssh kksilo   # (or hgwdev, or whatever)
nice ./getBedFile.pl &
hgLoadBed -noSort hg17 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
tfbsConsSites.bed -tab
hgLoadBed -noSort hg17 tfbsConsFactors \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql \
tfbsConsFactors.bed -tab
# Feel free to delete or gzip anything in ./tmp
# (particularly the huge .maf and .bed files)
# after the final two bed files are successfully loaded
##########################################################################
# CHICKEN RECIPROCAL-BEST NET FOR STRINGENT LIFTOVER (DONE 2/3/05 angie)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2/axtChain
# Run chainNet again, this time keeping both of its outputs:
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin ../S1.len ../S2.len h_g.net g_h.net
# Get the chicken chains from the chicken-referenced (but human-centric)
# net:
chainSwap all.chain g_h.chain
netChainSubset g_h.net g_h.chain stdout \
| chainSort stdin g_h.subset.chain
# Net those (sorted) chicken chains, and keep both outputs, to get
# reciprocal best nets referenced to both species:
chainPreNet g_h.subset.chain ../S2.len ../S1.len stdout \
| chainNet stdin ../S2.len ../S1.len g_h.rbest.net h_g.rbest.net
# Get the chains from the recip-best nets for stringent liftOver:
netChainSubset g_h.rbest.net g_h.chain galGal2ToHg17.rbest.over.chain
netChainSubset h_g.rbest.net all.chain hg17ToGalGal2.rbest.over.chain
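# (Usage sketch, not from the original run: the rbest chains give stringent,
# roughly 1-to-1 lifting; raising -minMatch makes it stricter still. The
# bed file names here are hypothetical.)
liftOver -minMatch=0.95 myHg17.bed hg17ToGalGal2.rbest.over.chain \
    myGalGal2.bed myHg17.unMapped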
####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 2/5/05 Fan) ##############
mkdir -p /cluster/store8/rgd/human050205
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human050205 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff
# remove extra line feed character at the end of lines
# !!! manually corrected the AASTH7_H line because chromStart was greater than chromEnd
rmLf human_QTL.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab
# create rgdQtlLink.tab
awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.
# GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2005-02-08, hartera)
ssh eieio
mkdir -p /cluster/data/hg17/bed/ecoresTetNig1
cd /cluster/data/hg17/bed/ecoresTetNig1
wget --timestamp \
http://www.genoscope.cns.fr/externe//4ucsc/ExofishHs35Tnig1
# this is in gff format
# remove "Ecotig" from name field
sed -e 's/Ecotig EG/EG/g' ExofishHs35Tnig1 > ExofishHs35Tnig1.gff
# need to have tabs between fields not a space to load file into table
sed -e 's/ /\t/g' ExofishHs35Tnig1.gff > Hs35Tnig1format.gff
# if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
# correctly into the table.
sed -e 's/ecore/CDS/' Hs35Tnig1format.gff | sed -e 's/ecotig/transcript/' \
> Hg17vstetNig1.gff
# add "chr" in front of the chromsome name in first field (2005-02-08)
perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg17vstetNig1.gff
rm *.bak
# need to reload table
ssh hgwdev
cd /cluster/data/hg17/bed/ecoresTetNig1
echo 'drop table ecoresTetNig1;' | hgsql hg17
nice ldHgGene hg17 ecoresTetNig1 Hg17vstetNig1.gff
# Read 40172 transcripts in 186032 lines in 1 files
# 40172 groups 42 seqs 1 sources 2 feature types
# 40172 gene predictions
# added ecoresTetNig1 entry to trackDb.ra in trackDb/human
# and created ecoresTetNig1.html. Genoscope will not be maintaining this
# newest data in their Exofish comparative browser display.
# UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new human protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/hg17/bed/pb
mkdir newDisplayId
cd newDisplayId
hgsql proteome -e 'select hg17.kgSpAlias.kgID, hg17.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.tab
hgsql hg17 -e 'load data local infile "hg17.tab" into table hg17.kgSpAlias'
# UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new hg17 protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/hg17/bed/pb/newDisplayId
hgsql proteome -e 'select hg17.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.kgProtAlias.tab
# get rid of the header line at the end of the file
vi hg17.kgProtAlias.tab
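# (A non-interactive alternative to the vi edit, as a sketch: the header
# line begins with the column name "kgID")
grep -v '^kgID' hg17.kgProtAlias.tab > tmp.tab && mv tmp.tab hg17.kgProtAlias.tab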
hgsql hg17 -e 'load data local infile "hg17.kgProtAlias.tab" into table hg17.kgProtAlias'
# BLASTZ HUMAN TARGET, COW QUERY (DONE, Nov. 2004 - Jan. 2005, Heather)
ssh kk
# use /cluster/data/bosTau1 because more disk space there
cd /cluster/data/bosTau1/bed
mkdir zb.hg17
# create DEF file
# for now, not doing ABRIDGE_REPEATS
# this means I don't need to create lineage specific repeats
# This is because blastz-run wouldn't take advantage of these
# because my query is in scaffolds
cat << '_EOF_' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
#SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow
SEQ2_DIR=/iscratch/i/bosTau1/splitDir
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/zb.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash
cd /cluster/data/bosTau1/bed/zb.hg17
source DEF
mkdir $RAW run.0
# create S2.len so make-joblist doesn't have to
/cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
# check how many lines in j
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, para check, para push, para check....
# convert out to lav
ssh kki
cd /cluster/data/bosTau1/bed/zb.hg17
# run bash shell if not running it already
source DEF
mkdir -p $BASE/run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
head jobList
para create jobList
para try
para check
para push
# lavToAxt
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtTemp
cd lav
foreach i (*)
catDir $i | lavToAxt stdin /cluster/data/hg17/nib \
/cluster/data/bosTau1/bosTau1.2bit ../axtTemp/$i.axt
echo done $i
end
# axtChain
ssh kki
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/bosTau1/bed/zb.hg17/axtTemp/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/hg17/bothMaskedNibs /iscratch/i/bosTau1/nib/bosTau1.2bit $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try
para check
para push
# Completed: 46 of 46 jobs
# Average job time: 83s 1.39m 0.02h 0.00d
# Longest job: 1240s 20.67m 0.34h 0.01d
# Submission to last job: 1326s 22.10m 0.37h 0.02d
# mergesort
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
chainMergeSort run1/chainRaw/*.chain > all.chain.jan3
# chainAntiRepeat
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/run1
mkdir chainAntiRepeat
# test with just one
chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
chainRaw/chr18.chain chainAntiRepeat/chr18.chain
# do them all
foreach f (chainRaw/*.chain)
set f1 = $f:t
echo $f1
chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
$f chainAntiRepeat/$f1
end
# mergesort again
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
chainMergeSort run1/chainAntiRepeat/*.chain > all.chain.jan5
gzip all.chain.jan3
# split
mkdir chain
chainSplit chain all.chain.jan5
# look at the distribution
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=5000 /tmp/score.$f:t:r
echo ""
end
# see files histogram.out and histogram.interesting
# run chainFilter
chainFilter -minScore=5000 all.chain.jan5 > all.chain.jan5.filtered
gzip all.chain.jan5
# split
rm chain/*
chainSplit chain all.chain.jan5.filtered
gzip all.chain.jan5.filtered
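# (Quick check, not from the original run: compare chain counts before and
# after the -minScore=5000 filter)
zcat all.chain.jan5.gz | grep -c '^chain'
zcat all.chain.jan5.filtered.gz | grep -c '^chain'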
# load
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainBosTau1 $i
end
# featureBits -chrom=chr1 hg17 chainBosTau1Link
# 103272818 bases of 222827847 (46.346%) in intersection
# featureBits -chrom=chr2 hg17 chainBosTau1Link
# 105920345 bases of 237506229 (44.597%) in intersection
# featureBits -chrom=chr3 hg17 chainBosTau1Link
# 89582887 bases of 194635740 (46.026%) in intersection
# featureBits -chrom=chr4 hg17 chainBosTau1Link
# 77513949 bases of 187161218 (41.416%) in intersection
# featureBits -chrom=chr5 hg17 chainBosTau1Link
# 80428726 bases of 177702766 (45.260%) in intersection
# featureBits -chrom=chr6 hg17 chainBosTau1Link
# 71830264 bases of 167317699 (42.930%) in intersection
# featureBits -chrom=chr7 hg17 chainBosTau1Link
# 64561289 bases of 154759139 (41.717%) in intersection
# featureBits -chrom=chr8 hg17 chainBosTau1Link
# 55896735 bases of 142612826 (39.195%) in intersection
# featureBits -chrom=chr9 hg17 chainBosTau1Link
# 52068957 bases of 117781268 (44.208%) in intersection
# featureBits -chrom=chr10 hg17 chainBosTau1Link
# 57427282 bases of 131613628 (43.633%) in intersection
# featureBits -chrom=chr11 hg17 chainBosTau1Link
# 58412709 bases of 131130853 (44.545%) in intersection
# featureBits -chrom=chr12 hg17 chainBosTau1Link
# 56076163 bases of 130259811 (43.049%) in intersection
# featureBits -chrom=chr13 hg17 chainBosTau1Link
# 37951944 bases of 95559980 (39.715%) in intersection
# featureBits -chrom=chr14 hg17 chainBosTau1Link
# 39896970 bases of 88290585 (45.188%) in intersection
# featureBits -chrom=chr15 hg17 chainBosTau1Link
# 37507979 bases of 81341915 (46.112%) in intersection
# featureBits -chrom=chr16 hg17 chainBosTau1Link
# 33883573 bases of 78884754 (42.953%) in intersection
# featureBits -chrom=chr17 hg17 chainBosTau1Link
# 31871034 bases of 77800220 (40.965%) in intersection
# featureBits -chrom=chr18 hg17 chainBosTau1Link
# 30359555 bases of 74656155 (40.666%) in intersection
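# (The per-chromosome numbers above came from one featureBits run per
# chromosome; a loop form, as a sketch:)
foreach n (`seq 1 18`)
    featureBits -chrom=chr$n hg17 chainBosTau1Link
end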
# NET
# run in stages to avoid memory problems
ssh kolossus
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
# PRE
/cluster/bin/x86_64/chainPreNet all.chain.jan5.filtered ../S1.len ../S2.len chainPreNet.out
# chainNet
/cluster/bin/x86_64/chainNet chainPreNet.out \
-minSpace=1 ../S1.len ../S2.len bosTau1.net.raw /dev/null
# syntenic (using revision 1.6)
/cluster/home/heather/bin/x86_64/netSyntenic bosTau1.net.raw bosTau1.net.syn
# memory usage 2757492736, utime 13404 s/100, stime 616
# backup/compress
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
gzip bosTau1.net.raw
cp bosTau1.net.syn bosTau1.net.syn.backup
# netClass
# takes about 4 hours
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
netClass -noAr bosTau1.net.syn hg17 bosTau1 bosTau1.net
# backups
ssh kksilo
cp bosTau1.net bosTau1.net.backup
rm bosTau1.net.syn.backup
# load
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
netFilter -minGap=10 bosTau1.net | hgLoadNet hg17 netBosTau1 stdin
rm bosTau1.net.backup
# index has NULL cardinality; analyze to fix
hgsql hg17
analyze table netBosTau1;
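# (verification sketch, still inside the mysql session: confirm the index
# cardinality is no longer NULL)
show index from netBosTau1;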
# generate axts
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtNet
# split first (not required?)
cd axtChain
mkdir net
netSplit bosTau1.net.syn net
cd net
foreach i (*.net)
netToAxt $i ../chain/$i:r.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit ../../axtNet/$i:r.axt
end
gzip bosTau1.net.syn
gzip bosTau1.net
# axtSort (takes about 5 minutes)
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtNetSort
foreach f ( axtNet/*.axt )
set c = $f:t:r
echo "axtSort on $c"
axtSort $f axtNetSort/$c.axt
end
# make maf files
mkdir mafNet
foreach f (axtNetSort/*.axt)
set c = $f:t:r
echo "axtToMaf on $c"
axtToMaf $f /cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1.
end
# MAKE VSBOSTAU1 DOWNLOADABLES (DONE Feb. 15, 2005 Heather)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir vsBosTau1
cd vsBosTau1
mkdir axtNet
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.chain.gz
cp -p bosTau1.net.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.net.gz
cd ../axtNet
cp -p * /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1
# Make a README.txt which explains the files & formats.
md5sum *.gz > md5sum.txt
cd axtNet
md5sum *.gz > md5sum.txt
# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir pseudoYale
cd pseudoYale
# Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
ldHgGene hg17 pseudoYale pseudoYale.gtf
# Note - I'm guessing how this goes. Robert left no record. -jk
# added xenoRefGene track (markd ~2005-02-20)
# add the following to /cluster/data/genbank/genbank.conf:
hg17.refseq.mrna.xeno.load = yes
hg17.refseq.mrna.xeno.loadDesc = yes
# BUILD ccdsGene and ccdsInfo tables (markd 2005-02-25)
# download files to the genbank data area, as this will eventually
# be done automatically as part of the genbank build process.
cd /cluster/data/genbank
mkdir -p data/ccds/hg17/2005-02-25
cd data/ccds/hg17/2005-02-25
# get the basic text dumps of the data, rather than the database dumps
wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/
# ends up with:
# About-NcbiHinxton.txt
# NcbiHinxton.txt
# NcbiHinxtonAllAccessions.txt
# this is a preliminary release; it contained 2 PAR genes that had
# bad coordinates and 7 genes that were determined to be pseudogenes
# at the last minute. The accessions for these 9 genes were
# placed in skip.ccds and then removed:
fgrep -v -f skip.ccds /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.txt > /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.cleaned.txt
# create the tab files to load in the database
/cluster/data/genbank/bin/i386/ccdsImport NcbiHinxtonAllAccessions.cleaned.txt ccdsGene.gp ccdsInfo.tab
# load ccdsInfo
hgsql hg17 <../../../../../lib/ccdsInfo.sql
hgsql -e 'load data local infile "ccdsInfo.tab" into table ccdsInfo' hg17
# load ccdsGene.gp and check
ldHgGene -predTab -genePredExt hg17 ccdsGene ccdsGene.gp
checkTableCoords hg17 -verbose=2 ccdsGene
rm *.tab
gzip -9 NcbiHinxton*.txt
# BUILD refSeqKg TABLE TO SUPPORT CCDS GENES (RE-DONE, Fan 2/26/05)
hgsql hg17 -N -e "select * from knownGene" >kg.gp
hgsql hg17 -N -e "select * from refGene" >ref.gp
overlapSelect -inCds -strand -idOutput -fraction=fraction.out -selectCds -overlapSimilarity=0.90 -selectFmt=genePred -inFmt=genePred kg.gp ref.gp refSeqKg.90.tab
cat fraction.out|sort -u >refSeqKg.tab
hgsql hg17 -e 'drop table refSeqKg'
hgsql hg17 < ~/src/hg/lib/refSeqKg.sql
hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
rm fraction.out
# BUILD ccdsGene and ccdsInfo tables (markd, redone 2005-03-17)
cd /cluster/store5/genbank/data/ccds/hg17
wget ftp://ftp.ncbi.nlm.nih.gov/pub/hcds/Hs35.1/CDSTrackDB/CCDS.20050303.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/store5/genbank/data/ccds/hg17/CCDS.20050303.tar.gz
# import ccds database tables
hgsql -e 'create database ccds'
hgsql ccds </cluster/data/genbank/etc/createTables.sql
hgsql ccds </cluster/data/genbank/etc/createKeys.sql
/cluster/data/genbank/bin/i386/ccdsImport ccds data/[A-Z]*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/i386/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
# refSeqKg table
hgsql -N -e "select * from knownGene" hg17 >kg.gp
hgsql -N -e "select * from refGene" hg17 >ref.gp
overlapSelect -statsOutput -strand -inCds -selectCds -overlapSimilarity=0.90 kg.gp ref.gp stdout | tail +2 | sort -u >refSeqKg.tab
hgsql hg17 -e 'drop table refSeqKg'
hgsql hg17 < ~/compbio/kent/src/hg/lib/refSeqKg.sql
hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
cd ..
rm -r ccds
# COW BACENDS (Done, Heather, Mar. 21, 2005)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir bacendsCow
cd bacendsCow
# Obtain GFF file from Denis; unzip into BACendhg17.gff
# Convert into BED 6:
makebed.pl < BACendhg17.gff > BACendhg17.bed
hgLoadBed -noBin hg17 bacendsCow BACendhg17.bed
# 53403 warnings
# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra
# make map between ccds and known genes (markd 2005/03/08)
# this should be run whenever either known genes or ccds is updated
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# UPDATE WGRNA TRACK (DONE, 2004-12-13, Fan)
# Received updated data file, wg_track_april2005.txt, from Michel Weber by email.
cut -f 2-10 wg_track_april2005.txt |tail +2 >wg_track_april2005.tab
# Use editor to remove the last blank line.
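# (Non-interactive alternative to the manual edit, as a sketch: delete any
# blank lines in place, assuming GNU sed)
sed -i '/^$/d' wg_track_april2005.tab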
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wg_track_april2005.tab
# Asked Donna to update Reference section according to Michel's email.
## refresh vega tracks with vega build30 (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln -s /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible..sql.gz
gunzip *.gz
##create mysql database
mysql
create database vega30;
use vega30
source homo_sapiens_vega_30_35c_mysql40_compatible.sql
source dropMt.sql
source load.sql
exit
hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt
#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt
#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab
hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B
#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred
ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
####################################################################################
# RE-BUILD KNOWN GENES TABLES, 2ND TRIAL WITH VARIANT PROTEINS (Started 5/13/05 Fan)
# First build protein databases, sp050415 and proteins050415
# See makeProteins050415.doc for details.
# Create working subdirectories and temporary databases (kgHg17F)
ssh hgwdev
cd /cluster/store10/kg
mkdir kgHg17F
ln -s /cluster/store10/kg/kgHg17F /cluster/store6/kgDB/bed/kgHg17F
ln -s /cluster/store10/kg/kgHg17F /cluster/data/hg17/bed/kgHg17F
hgsql hg17 -e "create database kgHg17F"
hgsql hg17 -e "create database kgHg17FTemp"
mkdir /cluster/bluearc/kgDB/kgHg17F
mkdir /cluster/bluearc/kgDB/kgHg17F/protBlat
ln -s /cluster/bluearc/kgDB/kgHg17F/protBlat /cluster/store10/kg/kgHg17F/protBlat
cd /cluster/store10/kg/kgHg17F/protBlat
#################################################################
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# The protBlat.psl was built during the first KG II build trial
# The results are still valid, except that kgHg17E was used
# instead of kgHg17F
# Create working subdirectories and temporary databases (kgHg17E)
ssh hgwdev
cd /cluster/store10/kg
mkdir kgHg17E
ln -s /cluster/store10/kg/kgHg17E /cluster/store6/kgDB/bed/kgHg17E
ln -s /cluster/store10/kg/kgHg17E /cluster/data/hg17/bed/kgHg17E
hgsql hg17 -e "create database kgHg17E"
hgsql hg17 -e "create database kgHg17ETemp"
mkdir /cluster/bluearc/kgDB/kgHg17E
mkdir /cluster/bluearc/kgDB/kgHg17E/protBlat
ln -s /cluster/bluearc/kgDB/kgHg17E/protBlat /cluster/store10/kg/kgHg17E/protBlat
cd /cluster/store10/kg/kgHg17E/protBlat
# Get all human protein sequences
hgsql -N sp050415 -e \
'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="9606" and acc=accession' \
|awk '{print ">" $1;print $2}' >humanProt.fa
# Prepare and perform cluster run for protein/genome alignment
ssh kk
cd /cluster/data/hg17/bed/kgHg17E/protBlat
mkdir prot
faSplit sequence humanProt.fa 1000 prot/prot
ls /cluster/bluearc/kgDB/kgHg17E/protBlat/prot/* > prot.lis
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17E/protBlat
hgsql hg17 -N -e 'select chrom from chromInfo' > chrom.lis
exit
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/hg17/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg17E/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para try
para check
para push
para check ...
# many output .psl files will be empty; the warnings are OK.
# [kk:protBlat> para check
# 45494 jobs in batch
# 0 jobs (including everybody's) in Parasol queue.
# Checking finished jobs
# tracking errors: 1
# crashed: 12643
# ranOk: 32850
# total jobs in batch: 45494
# [kk:protBlat> para time
# 45494 jobs in batch
# 0 jobs (including everybody's) in Parasol queue.
# Checking finished jobs
# Completed: 32850 of 45494 jobs
# Crashed: 12643 jobs
# para.results: file not found. paraHub can't write to this dir?
# CPU time in finished jobs: 36153510s 602558.50m 10042.64h 418.44d 1.146 y
# IO & Wait Time: 1585456s 26424.27m 440.40h 18.35d 0.050 y
# Average job time: 1149s 19.15m 0.32h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 155120s 2585.33m 43.09h 1.80d
# Submission to last job: 276342s 4605.70m 76.76h 3.20d
# This cluster run took about 3 days. The crashed jobs are due to empty BLAT results, which is OK.
# collect BLAT results
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17E/protBlat
mkdir result2
mkdir result3
cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall
cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'
cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'
chmod +x do*
cp do1.1 do1
doall
cp do1.2 do1
doall
cat result3/*.psl >protBlat.psl
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The end of protBlat.psl build, using kgHg17E
################################################################################
############################################################################
# This part process the variant splice proteins.
# First build variant splice protein tables.
# Get all variant isoform human protein sequences
ssh hgwdev
cd /cluster/data/swissprot/050415/build
wget --timestamp \
ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
wget --timestamp \
ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl_varsplic.fasta.gz
gzip -d *varsplic.fasta.gz
faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab
faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab
cat splicTrembl.tab splicSprot.tab >varProtein.tab
hgsql sp050415 < ~/src/hg/lib/varProtein.sql
hgsql sp050415 -e 'load data local infile "varProtein.tab" into table varProtein'
cat varProtein.tab |cut -f 1>j1
cut -f 1 j1|sed -e 's/-/\t/g' >j2
paste j1 j2 >splicProt.tab
hgsql kgHg17FTemp -e 'drop table splicProt'
hgsql kgHg17FTemp <~/src/hg/lib/splicProt.sql
hgsql kgHg17FTemp -e 'load data local infile "splicProt.tab" into table splicProt'
hgsql kgHg17FTemp -N -e \
'select varAcc, varProtein.val from sp050415.varProtein,splicProt,proteins050415.spXref3 where accession=parAcc and varProtein.acc=splicProt.varAcc and division="9606"'| \
awk '{print ">" $1;print $2}' >humanVarProt.fa
cd /cluster/data/hg17/bed/kgHg17F
# get all Human splicProtBlat records
hgsql hg17 -N -e \
'select splicProtBlat.* from splicProtBlat,proteins050415.spXref3,kgHg17FTemp.splicProt where qName=splicProt.varAcc and parAcc=accession and division="9606"'\
|cut -f 2-22 \
>humanVarProtBlat.psl
# Combine the regular protein protBlat records with the variant protein psl records.
cd /cluster/store10/kg/kgHg17F
cat ../kgHg17E/protBlat/protBlat.psl humanVarProtBlat.psl >protBlat.psl
hgLoadPsl hg17 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 104064 record(s), 0 row(s) skipped, 1484 warning(s) loading psl.tab
# Looked into the cause of these 1484 warnings. They were due to qBaseInsert
# and tBaseInsert having negative values, probably because this is a protein alignment.
# create all_mrna.psl and tight_mrna.psl
hgsql hg17 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
all_mrna.psl tight_mrna.psl /dev/null
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis
# Load spMrna.tab into spMrna table in temp DB.
hgsql kgHg17FTemp < ~/src/hg/lib/spMrna.sql
hgsql kgHg17FTemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgHg17FTemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/hg17/bed/kgHg17F
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg17 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgHg17FTemp DB.
faToTab mrna.fa mrnaSeq.tab
hgsql kgHg17FTemp -e 'drop table mrnaSeq'
hgsql kgHg17FTemp <~/src/hg/lib/mrnaSeq.sql
hgsql kgHg17FTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
# Prepare files for cluster run
~/src/hg/protein/KG2.sh kgHg17F hg17 050415
# Perform cluster run of protein/mRNA alignment
~/src/hg/protein/KG3.sh kgHg17F hg17 050415
# Collect cluster run results
cd kgBestMrna
ls out | sed -e 's/prot/do1 prot/g' >doall
# create do1 with the following 2 lines:
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
wc protMrna.lis
# Load BLAT results into temp DB.
hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaBlat.sql
hgsql kgHg17FTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
hgsql kgHg17FTemp -e 'create index tName on protMrnaBlat(tName)'
# Create CDS files from protein/mRNA alignment results.
hgsql kgHg17FTemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
|sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds
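# (the resulting protMrna.cds has one line per pair in mrnaToGene -cdsFile
# form, e.g. with hypothetical accessions: "Q9XYZ1_NM_012345<TAB>101..1303")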
# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protMrna.out > j1.tmp
cut -f 32-42 ../protMrna.out > j2.tmp
cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
paste j1.tmp j3.tmp j2.tmp >protMrna.psl
rm j1.tmp j2.tmp j3.tmp
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
exit
# Prepare refGene and all_mrna gp files.
cd ..
hgsql hg17 -N -e 'select * from refGene' >ref.gp
hgsql hg17 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
-selectFmt=genePred ref.gp protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgsql kgHg17FTemp -e 'drop table spRef'
hgsql kgHg17FTemp <~/src/hg/lib/spRef.sql
hgsql kgHg17FTemp -e 'load data local infile "spRef.tab" into table spRef'
# Prepare and perform cluster runs for protein/RefSeq alignments
~/src/hg/protein/KGRef2.sh kgHg17F hg17 050415
~/src/hg/protein/KGRef3.sh kgHg17F hg17 050415
cd kgBestRef
ls out | sed -e 's/prot/do1 prot/g' >doall
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
wc protRef.lis
hgsql kgHg17FTemp -e 'drop table protRefBlat'
hgsql kgHg17FTemp < ~/src/hg/lib/protRefBlat.sql
hgsql kgHg17FTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
hgsql kgHg17FTemp -e 'create index tName on protRefBlat(tName)'
# Run gene-check to filter out invalid gp entries
cd /cluster/data/hg17/bed/kgHg17F
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg17/nib kgCandidate0.gp kgCandidate0.check
hgsql kgHg17FTemp -e 'drop table kgCandidate0'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg17FTemp -e 'drop table geneCheck'
hgsql kgHg17FTemp < ~/src/hg/lib/geneCheck.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg17FTemp hg17 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg17FTemp -e 'drop table kgCandidate'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg17FTemp -e 'create index alignID on kgCandidate(alignID)'
# ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST
# FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC.
# #######
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
hgsql kgHg17FTemp -e 'drop table kgCandidateX'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
kgResultBestMrna2 050415 kgHg17FTemp hg17|sort -u >protMrnaBlatScore.tab
kgResultBestRef2 050415 kgHg17FTemp hg17|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg17FTemp -e 'drop table protMrnaScore'
hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg17FTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg17FTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg17FTemp kgCandidateX jY.tmp
cat jY.tmp |sort -u >kgCandidateY.tab
rm jY.tmp
hgsql kgHg17FTemp -e 'drop table kgCandidateY'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg17FTemp kgCandidateZ.tab
hgsql kgHg17FTemp -e 'drop table kgCandidateZ'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg17FTemp -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg17FTemp hg17 proteins050415 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab
# Sort KG genes to make the kg3.gp table file.
~/kent/src/hg/protein/sortKg.pl kg3.tmp >kg3.gp
hgsql kgHg17FTemp -e 'drop table knownGene'
hgsql kgHg17FTemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg17FTemp -e 'load data local infile "kg3.gp" into table knownGene'
hgsql hg17 -e 'drop table kg3'
hgsql hg17 < ~/src/hg/lib/kg3.sql
hgsql hg17 -e 'load data local infile "kg3.gp" into table kg3'
# Perform analysis before renaming the kg3 table to knownGene.
# Load data into hg17 knownGene table.
hgsql hg17 -e 'drop table knownGene'
hgsql hg17 < ~/src/hg/lib/knownGene.sql
hgsql hg17 -e 'load data local infile "kg3.gp" into table knownGene'
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg17FTemp hg17 050415
hgsql hg17 -e 'drop table knownGeneMrna'
hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg17 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg17 -e 'drop table knownGenePep'
hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
hgsql hg17 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build kgXref table
kgXref2 kgHg17FTemp 050415 hg17
hgsql hg17 -e 'drop table kgXref'
hgsql hg17 < ~/src/hg/lib/kgXref.sql
hgsql hg17 -e 'load data local infile "kgXref.tab" into table kgXref'
# Build spMrna table
hgsql hg17 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab
hgsql hg17 -e 'drop table spMrna'
hgsql hg17 <~/src/hg/lib/spMrna.sql
hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415
# Update and clean up kgResultBestMrna2.c and then check it in.
#####################################
# Build alias tables. DONE 5/18/05 Fan.
# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
# proteins050415.hugo.withdraws, hg17.kgXref.kgID
# to create kgAliasM.tab and geneAlias.tab
# by picking out those kgID items from kgXref where
# kgXref.geneSymbol == hugo.symbol
kgAliasM hg17 proteins050415
# kgAliasKgXref reads from hg17.knownGene.proteinID,
# hg17.knownGene.name, hg17.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg17
# kgAliasRefseq reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg17
hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg17
hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.knownGene.alignID,
# proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
# to create kgProtAlias.tab
#
kgProtAlias hg17 050415
hgsql hg17 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg17 -N -e \
'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg17 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg17 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg17 -e "drop table kgProtAlias;"
hgsql hg17 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
# MAKE FOLDUTR TABLES (DONE 2005-05-19, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir rnaStruct.2005-05-18
rm rnaStruct
ln -s rnaStruct.2005-05-18 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 35692 of 35692 jobs
# CPU time in finished jobs: 1272085s 21201.42m 353.36h 14.72d 0.040 y
# IO & Wait Time: 102447s 1707.45m 28.46h 1.19d 0.003 y
# Average job time: 39s 0.64m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 6554s 109.23m 1.82h 0.08d
# Submission to last job: 9100s 151.67m 2.53h 0.11d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 33693 of 33693 jobs
# CPU time in finished jobs: 393764s 6562.74m 109.38h 4.56d 0.012 y
# IO & Wait Time: 126205s 2103.41m 35.06h 1.46d 0.004 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 51595s 859.92m 14.33h 0.60d
# Submission to last job: 52057s 867.62m 14.46h 0.60d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build KEGG pathway tables. DONE 5/19/05. Fan.
ssh hgwdev
cd /cluster/store10/kg/kgHg17F
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table keggMapDesc"
hgsql hg17 -e "drop table keggPathway"
hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
hgsql hg17 <~/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
# RELOAD cgapAlias TABLE AFTER REMOVING DUPLICATE ROWS (hartera, 2005-07-26)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS WERE REMOVED BEFORE (hartera, 2005-10-06)
cd ..
~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table cgapAlias"
hgsql hg17 -e "drop table cgapBiocDesc"
hgsql hg17 -e "drop table cgapBiocPathway"
hgsql hg17 <~/src/hg/lib/cgapAlias.sql
hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg17 -e 'load data local infile "cgapAlias.tab" \
into table cgapAlias'
hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
# RELOAD cgapAlias TABLE. Sort and reload alias tab file to remove
# duplicate rows. (hartera, 2005-07-26)
# DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n THEN uniq TO SORT ON THE IDs AND THEN UNIQ (hartera, 2005-10-06)
cd /cluster/store10/kg/kgHg17F
hgsql hg17 -e "drop table cgapAlias"
# cgapAlias.tab has replicated rows so sort and unique before loading
sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
hgsql hg17 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql hg17 -e 'load data local infile "cgapAliasSorted.tab" \
into table cgapAlias'
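# (Demonstration of the sort pitfall above, as a sketch: with -nu, sort
# compares only the numeric key, so distinct rows sharing an ID collapse
# into one)
printf '1\taliasA\n1\taliasB\n' | sort -nu          # keeps only one row
printf '1\taliasA\n1\taliasB\n' | sort -n | uniq    # keeps both rows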
# LOAD ENSEMBL GENES (DONE, 5/23/05, Fan)
# Ensembl changed things again! Please note there are two subtle changes to make it work.
mkdir /cluster/data/hg17/bed/ensembl
cd /cluster/data/hg17/bed/ensembl
mkdir new
cd new
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# This time, there are some extra lines, like ' 1;', that were causing
# problems, so an extra filter was added at the beginning to get rid of them.
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
cat ensemblGene.gtf |sed -e 's/\t\t/xxxxx/g' \
|grep -v xxxxx \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v ^DR52 \
| grep -v ^DR53 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/chrMT/chrM/g' \
| sed -e 's/\..\"/\"/g' \
>ensGene.gtf
ssh hgwdev
cd /cluster/data/hg17/bed/ensembl/new
/cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33581 transcripts in 699580 lines in 1 files
# 33581 groups 25 seqs 1 sources 4 feature types
# 33581 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg17 -e 'drop table ensGtp'
hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
hgsql hg17 -e 'load data local infile "ensGtp.txt" into table ensGtp ignore 1 lines'
# ensMart has some problem with the resulting ensemblPep.fa.gz, so use a
# different processing step instead:
    wget --timestamping \
ftp://ftp.ensembl.org/pub/current_human/data/fasta/pep/Homo_sapiens.NCBI35.may.pep.fa.gz
zcat Homo_sapiens.NCBI35.may.pep.fa.gz | sed -e "s/transcript:/\n>/g" | grep -v 'gene:' >ensPep.fa
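    # (assumption about the Ensembl header layout, ">ENSP... gene:ENSG...
    #  transcript:ENST...": the sed starts a new fasta record at each
    #  "transcript:" tag so the ENST id becomes the record name, and the
    #  grep drops the leftover gene: header fragments)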
faToTab -type=protein ensPep.fa ensPep.tab
hgsql hg17 -e 'drop table ensPep'
hgsql hg17 < ~/kent/src/hg/lib/ensPep.sql
hgsql hg17 -e 'load data local infile "ensPep.tab" into table ensPep'
# kept the following, in case Ensembl fixes the problem in the future
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
# gunzip ensemblPep.fa.gz
# hgPepPred hg17 ensembl ensemblPep.fa
# UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED - 2005-05-21, DONE 2005-05-23 - Fan)
# This should be done after knownGene tables are complete from the known
# gene build process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2005-05-21
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2005-05-21 \
/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
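    # (if so, a sketch assuming the usual kent makefile conventions:
    #    cd ~/kent/src/hg/near/pepPredToFa && make
    #  which installs the binary under ~/bin/$MACHTYPE)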
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
/cluster/bluearc/hg17/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
    # 'ls ../../split/*.fa' would make the command line too long, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
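    # (wordLine simply emits one word per line, e.g.
    #    echo a b c | wordLine stdin    # -> a, b, c on separate lines
    #  so the shell, not ls, expands the glob)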
gensub2 split.lst single gsub jobList
para create jobList
para try
    # Wait a couple of minutes and do a para check; if all is good,
    # then do a
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 150459s 2507.64m 41.79h 1.74d 0.005 y
# IO & Wait Time: 22325s 372.09m 6.20h 0.26d 0.001 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 198s 3.30m 0.06h 0.00d
# Submission to last job: 2019s 33.65m 0.56h 0.02d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7739 files
# Loading database with 9836439 rows
# 232.300u 42.580s 23:13.41 19.7% 0+0k 0+0io 205pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed 34667
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 34667
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 36010
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 32381
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32381 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32381000
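    # (hgExpDistance stores the 1000 closest expression neighbors per
    #  gene, hence 32381 genes x 1000 neighbors = 32381000 rows)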
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 32886
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32886 unique elements in affyUclaNorm
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 17501
    # the hgFixed.gnfHumanU95Exps argument is unused and does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16450 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16450000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8814 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 35055
#### UPDATE GO DATABASE (DONE 5/21/05 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20050521
cd /cluster/store1/geneOntology/20050521
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz
hgsql mysql <<end
create database go050521;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
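    # (the sed rewrites MySQL 4.1-style "ENGINE=... CHARSET=..." table
    #  options to the older "TYPE=MyISAM" syntax, presumably because the
    #  local MySQL server predates 4.1)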
hgsql go050521 <j.tmp
rm j.tmp
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
zcat gene_association.goa_uniprot.gz | hgGoAssociation go050521 goaPart stdin
# Passed 5589891 of 6584507 of 6584507, 84.89%
# Ask sys-admin to switch the database pointer go to point to go050521.
cd /cluster/data/hg17/bed/geneSorter
# Rebuilt Ensembl Gene tables. See documentation (5/23/05 Fan) above.
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# table row count went from previous version: 38251 to 35436
# Make knownToCdsSnp table
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 94394
# approx. 5 minutes running time
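    # (assumption per hgMapToGene usage: -all keeps every overlapping SNP
    #  rather than one best hit, and -cds restricts hits to coding regions)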
# C.ELEGANS BLASTP FOR GENE SORTER
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to iscratch/i
# if it doesn't exist already:
# The following section is done during mm6 build already.
# ssh eieio
# mkdir /cluster/data/ce2/bed/blastp
# cd /cluster/data/ce2/bed/blastp
# # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
# # to find out the latest version. Then use that in place of 142 below.
# wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
# formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
# ssh kkr1u00
# if (-e /iscratch/i/ce2/blastp) then
# rm -r /iscratch/i/ce2/blastp
# endif
# mkdir -p /iscratch/i/ce2/blastp
# cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
# iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/ce2/run/out
cd /cluster/data/hg17/bed/blastp/ce2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 60973s 1016.22m 16.94h 0.71d 0.002 y
# IO & Wait Time: 21292s 354.86m 5.91h 0.25d 0.001 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 50s 0.83m 0.01h 0.00d
# Submission to last job: 570s 9.50m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/ce2/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 25706 rows
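    # (-maxPer=1 keeps only the best-scoring hit per query protein, so
    #  each known gene gets at most one putative worm ortholog)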
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm6.doc for procedure
# the directory: /cluster/bluearc/scratch/mus/mm6/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 65337s 1088.95m 18.15h 0.76d 0.002 y
# IO & Wait Time: 20794s 346.56m 5.78h 0.24d 0.001 y
# Average job time: 11s 0.19m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 80s 1.33m 0.02h 0.00d
# Submission to last job: 598s 9.97m 0.17h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# row count changed to 32880
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 28325s 472.08m 7.87h 0.33d 0.001 y
# IO & Wait Time: 20416s 340.27m 5.67h 0.24d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 24s 0.40m 0.01h 0.00d
# Submission to last job: 617s 10.28m 0.17h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 24140 rows
# ZEBRAFISH BLASTP FOR GENE SORTER
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer2/bed/blastp
cd /cluster/data/danRer2/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
/scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer2/blastp) then
rm -r /iscratch/i/danRer2/blastp
endif
mkdir -p /iscratch/i/danRer2/blastp
cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out
cd /cluster/data/hg17/bed/blastp/danRer2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 113595s 1893.26m 31.55h 1.31d 0.004 y
# IO & Wait Time: 26231s 437.18m 7.29h 0.30d 0.001 y
# Average job time: 18s 0.30m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 99s 1.65m 0.03h 0.00d
# Submission to last job: 445s 7.42m 0.12h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/danRer2/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 30731 rows
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 18630s 310.50m 5.17h 0.22d 0.001 y
# IO & Wait Time: 20776s 346.27m 5.77h 0.24d 0.001 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 15s 0.25m 0.00h 0.00d
# Submission to last job: 295s 4.92m 0.08h 0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 16540 rows
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
# rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 73518s 1225.30m 20.42h 0.85d 0.002 y
# IO & Wait Time: 45038s 750.63m 12.51h 0.52d 0.001 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 762s 12.70m 0.21h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 27212 rows
# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851
# The new KG process no longer needs entries in knownGeneLink (used to
# store info for DNA-based RefSeqs), so clean out the old data in
# knownGeneLink.
hgsql hg17 -e "delete from knownGeneLink"
#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/050524
ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
cd /cluster/data/superfamily/050524
# ftp over the following two files:
ass_22-May-2005.tab.gz
supfam_22-May-2005.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam050524"
nice hgsql superfam050524 < supfam_22-May-2005.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam050524.
hgsql superfam050524 -e "create index id on des(id);"
hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table
superfam050524.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/050524
hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
hgsql superfam050524 -N -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
| hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper
# Build tables needed by pbGlobal in proteins050415
cd /cluster/data/superfamily/050524
hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'
hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'
cd /cluster/store10/kg/kgHg17F
hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'
# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
# CCDS <-> knownGene mapping needs to be updated  (markd 2005-05-29)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# AUGUSTUS GENES (DONE 6/1/2005 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir augustus
cd augustus/
wget http://augustus.gobics.de/predictions/hg17/hg17.allchr.augustus.gtf.gz
cp /cluster/data/dm2/bed/augustus/cleanAugustus.awk .
zcat hg17.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > hg17.allchr.augustus.clean.gtf.gz
ldHgGene -gtf hg17 augustus hg17.allchr.augustus.clean.gtf.gz
rm hg17.allchr.augustus.gtf.gz
# MAKE Mouse Proteins track (DONE for chr13 braney ~5/25/05)
ssh kkstore01
mkdir -p /cluster/data/hg17/blastDb
cd /cluster/data/hg17/blastDb
awk "{print \$2}" ../*/chr*/*.lft > subChr.lst
for i in `cat subChr.lst`
do
ln -s ../*/chr*/$i.fa
echo formatdb -i $i.fa -p F
formatdb -i $i.fa -p F
done
rm *.log *.fa list
cd ..
for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
ssh kkr1u00
rm -rf /iscratch/i/hg17/blastDb
mkdir -p /iscratch/i/hg17/blastDb
cd /cluster/data/hg17/blastDb
for i in nhr nin nsq; do cp *.$i /iscratch/i/hg17/blastDb ; echo $i; done
cd
iSync > sync.out
mkdir -p /cluster/data/hg17/bed/tblastn.mm6KG
cd /cluster/data/hg17/bed/tblastn.mm6KG
echo /panasas/store/hg17/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
# back to kkstore01
exit
cd /cluster/data/hg17/bed/tblastn.mm6KG
rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa
mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa
split -l 560 /cluster/data/mm6/bed/blat.mm6KG/mm6KG.psl /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa/kg
ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
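    # (each kg chunk holds 560 pslx rows; pslxToFa recovers the query
    #  protein sequences carried in the extra pslx sequence columns)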
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
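# try blastall at successively stricter e-values; a loose cutoff can
# fail (e.g. on oversized output), so break out of the loop after the
# first run that succeeds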
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subLiftAll.lft carry $f.2
liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/mm6/bed/blat.mm6KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/hg17/bed/tblastn.mm6KG
para create blastSpec
para push
# Completed: 214524 of 214524 jobs
# CPU time in finished jobs: 44907411s 748456.85m 12474.28h 519.76d 1.424 y
# IO & Wait Time: 712709s 11878.48m 197.97h 8.25d 0.023 y
# Average job time: 213s 3.54m 0.06h 0.00d
# Longest finished job: 1363s 22.72m 0.38h 0.02d
# Submission to last job: 75910s 1265.17m 21.09h 0.88d
# just for chr13
# Completed: 55290 of 55290 jobs
# CPU time in finished jobs:    1487547s   24792.46m   413.21h   17.22d  0.047 y
# IO & Wait Time:                148854s    2480.89m    41.35h    1.72d  0.005 y
# Average job time:                  30s       0.49m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              98s       1.63m     0.03h    0.00d
# Submission to last job:          3904s      65.07m     1.08h    0.05d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
ssh kki
cd /cluster/data/hg17/bed/tblastn.mm6KG
tcsh
cat << '_EOF_' > chainOne
(cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl)
'_EOF_'
chmod +x chainOne
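    # (simpleChain stitches the per-chromosome tblastn hits in each
    #  blastOut/kg?? batch into gene-sized chains, allowing gaps -- i.e.
    #  introns -- of up to 200 kb)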
for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec
para create chainSpec
para push
# CPU time in finished jobs: 90s 1.50m 0.03h 0.00d 0.000 y
# IO & Wait Time: 19151s 319.18m 5.32h 0.22d 0.001 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest finished job: 5s 0.08m 0.00h 0.00d
# Submission to last job: 1642s 27.37m 0.46h 0.02d
# Completed: 7695 of 7695 jobs
# CPU time in finished jobs: 48s 0.80m 0.01h 0.00d 0.000 y
# IO & Wait Time: 18931s 315.51m 5.26h 0.22d 0.001 y
# Average job time: 2s 0.04m 0.00h 0.00d
# Longest finished job: 6s 0.10m 0.00h 0.00d
# Submission to last job: 1618s 26.97m 0.45h 0.02d
exit
# back to kkstore01
cd /cluster/data/hg17/bed/tblastn.mm6KG/blastOut
for i in kg??
do
cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.90 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
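    # (psl fields: $1=matches, $11=qSize, $12=qStart, $13=qEnd, so c60
    #  keeps alignments spanning >60% of the query protein, pslUniq keeps
    #  the best-scoring unique ones, and m60 those matching >90% of the
    #  query residues)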
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n | uniq > /cluster/data/hg17/bed/tblastn.mm6KG/blastMm6KG.psl
cd ..
ssh hgwdev
cd /cluster/data/hg17/bed/tblastn.mm6KG
    hgLoadPsl hg17 blastMm6KG.psl
# 1425966 bases of 64944656 (2.196%)
# back to kkstore01
rm -rf blastOut
# End tblastn of mouse proteins
####################################################################################
# RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (DONE 6/5/05 Fan)
# Start from the step where gene-check is run and kgCandidate0.gp is produced.
cd
cd /cluster/store10/kg/kgHg17F
mkdir try3
cd try3
hgsql kgHg17FTempTry3 -e 'drop table kgCandidate0'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg17FTempTry3 -e 'drop table geneCheck'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/geneCheck.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg17FTempTry3 hg17 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidate'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg17FTempTry3 -e 'create index alignID on kgCandidate(alignID)'
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateX'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
# kgResultBestMrna2 050415 kgHg17FTempTry3 hg17|sort -u >protMrnaBlatScore.tab
# kgResultBestRef2 050415 kgHg17FTempTry3 hg17|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg17FTempTry3 -e 'drop table protMrnaScore'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg17FTempTry3 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg17FTempTry3 kgCandidateX jY.tmp1
cat jY.tmp1 |sort -u >kgCandidateY.tab
rm jY.tmp1
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateY'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg17FTempTry3 kgCandidateZ.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateZ'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg17FTempTry3 -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick the representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg17FTempTry3 hg17 proteins050415 kg3Try3.tmp dupSpMrna.tmp
cat kg3Try3.tmp | grep NM_ > jNM
cat kg3Try3.tmp | grep -v NM_ >jnoNM
cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1
cut -f 2-12 jnoNM >jnoNM2
paste jnoNM1 jnoNM2 > kg3Try3B.tmp
cat jNM >> kg3Try3B.tmp
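# (the cut/sed/paste above strips everything up to the first "_" from
#  non-RefSeq names, e.g. O75438_BC009691 -> BC009691, while NM_
#  accessions pass through unchanged)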
sort -u dupSpMrna.tmp >dupSpMrna.tab
hgsql hg17 -e 'drop table dupSpMrna'
hgsql hg17 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg17 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'
# Add entries in the put back list
# Obtain from Mark the put back list, kgPutBack.lis, for human RefSeq.
hgsql kgHg17FTempTry3 -e 'drop table kgPutBack'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgPutBack.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgPutBack.lis" into table kgPutBack'
kgPutBack kgHg17FTempTry3 hg17 proteins050415 kgPutBack kgPutBack.gp
# Sort KG genes to make the kg3Try3.gp table file.
cat kg3Try3B.tmp kgPutBack.gp >kg3Try3C.tmp
~/kent/src/hg/protein/sortKg.pl kg3Try3C.tmp >kg3Try3.gp
# Manually edit to correct a one-line problem with O75438_BC009691
hgsql kgHg17FTempTry3 -e 'drop table knownGene'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/knownGene.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kg3Try3.gp" into table knownGene'
hgsql hg17 -e 'drop table kg3Try3'
hgsql hg17 < ~/src/hg/lib/kg3Try3.sql
hgsql hg17 -e 'load data local infile "kg3Try3.gp" into table kg3Try3'
# Perform analysis before renaming the kg3Try3 table to knownGene.
# Load data into hg17 knownGene table.
hgsql hg17 -e 'drop table knownGene'
hgsql hg17 < ~/src/hg/lib/knownGene.sql
hgsql hg17 -e 'load data local infile "kg3Try3.gp" into table knownGene'
# Build knownGeneMrna and knownGenePep tables.
hgsql kgHg17FTempTry3 -e 'drop table mrnaSeq'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/mrnaSeq.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../mrnaSeq.tab" into table mrnaSeq'
kgPepMrna kgHg17FTempTry3 hg17 050415
hgsql hg17 -e 'drop table knownGeneMrna'
hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg17 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg17 -e 'drop table knownGenePep'
hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
hgsql hg17 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build spMrna table
hgsql hg17 -N -e 'select proteinID, name from knownGene' |sort -u >kgSpMrna.tab
hgsql hg17 -e 'drop table spMrna'
hgsql hg17 <~/src/hg/lib/spMrna.sql
hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgXref table
kgXref2 kgHg17FTempTry3 050415 hg17
hgsql hg17 -e 'drop table kgXref'
hgsql hg17 < ~/src/hg/lib/kgXref.sql
hgsql hg17 -e 'load data local infile "kgXref.tab" into table kgXref'
# MAKE FOLDUTR TABLES
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir rnaStruct.2005-06-05
rm rnaStruct
ln -s rnaStruct.2005-06-05 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 35774 of 35774 jobs
# CPU time in finished jobs: 1174534s 19575.57m 326.26h 13.59d 0.037 y
# IO & Wait Time: 98071s 1634.51m 27.24h 1.14d 0.003 y
# Average job time: 36s 0.59m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5409s 90.15m 1.50h 0.06d
# Submission to last job: 6712s 111.87m 1.86h 0.08d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 33765 of 33765 jobs
# CPU time in finished jobs: 341000s 5683.33m 94.72h 3.95d 0.011 y
# IO & Wait Time: 106605s 1776.75m 29.61h 1.23d 0.003 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30479s 507.98m 8.47h 0.35d
# Submission to last job: 30622s 510.37m 8.51h 0.35d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build kgProtMap table
# move all files under hg17Kg to old and copy try3/kgXref.tab up.
# Note: it is important that tight_mrna.psl is here!
    cp -p old/tight_mrna.psl .
~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415
# Update and clean up kgResultBestMrna2.c and then check it in.
# Build alias tables
# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
# proteins050415.hugo.withdraws, hg17.kgXref.kgID
# to create kgAliasM.tab and geneAlias.tab
# by picking out those kgID items from kgXref where
# kgXref.geneSymbol == hugo.symbol
kgAliasM hg17 proteins050415
# kgAliasKgXref reads from hg17.knownGene.proteinID,
# hg17.knownGene.name, hg17.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg17
# kgAliasRefseq reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg17
hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg17
hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.knownGene.alignID,
# proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
# to create kgProtAlias.tab
#
kgProtAlias hg17 050415
hgsql hg17 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg17 -N -e \
'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg17 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg17 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg17 -e "drop table kgProtAlias;"
hgsql hg17 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
# Build KEGG pathway tables
ssh hgwdev
cd /cluster/store10/kg/kgHg17F
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table keggMapDesc"
hgsql hg17 -e "drop table keggPathway"
hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
hgsql hg17 <~/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
cd ..
~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table cgapAlias"
hgsql hg17 -e "drop table cgapBiocDesc"
hgsql hg17 -e "drop table cgapBiocPathway"
hgsql hg17 <~/src/hg/lib/cgapAlias.sql
hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg17 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
# Build BioCyc pathway tables
# Download BioCyc DB, create and load bioCyc DB
# See makeBioCycDB.doc for details.
hgsql hg17 -e "drop table bioCycMapDesc"
hgsql hg17 <~/src/hg/lib/bioCycMapDesc.sql
hgsql hg17 -e 'load data local infile "bioCycMapDesc.tab" into table bioCycMapDesc'
kgBioCyc |sort -u > bioCycPathway.tab
hgsql hg17 -e "drop table bioCycPathway"
hgsql hg17 <~/src/hg/lib/bioCycPathway.sql
hgsql hg17 -e 'load data local infile "bioCycPathway.tab" into table bioCycPathway'
# CCDS <-> knownGene mapping needs to be updated  (Fan redone 2005-06-05)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
### HG17 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2005-06-05 - Fan)
# These are instructions for rebuilding tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This update is based on proteins DBs dated 050415.
# Create the working directory
ssh hgwdev
mkdir /cluster/store10/kg/kgHg17F/pb-2005-06-05
cd /cluster/data/hg17/bed
rm pb
ln -s /cluster/store10/kg/kgHg17F/pb-2005-06-05 pb
cd pb
# Move the existing PB tables by:
hgsql hg17
create database hg17Sav4;
alter table hg17.pepCCntDist rename as hg17Sav4.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav4.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav4.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav4.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav4.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav4.pepMwAa;
alter table hg17.pepPi rename as hg17Sav4.pepPi;
alter table hg17.pepPiDist rename as hg17Sav4.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav4.pepResDist;
alter table hg17.pbAnomLimit rename as hg17Sav4.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav4.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav4.pbStamp;
quit
# Define pep* tables in hg17 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg17 < pepAll.sql
# Build the pepMwAa table
hgsql proteins050415 -N -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg17 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins050415 -e \
"select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg17 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp050415 pepPi.tab
hgsql hg17 -e 'delete from pepPi'
hgsql hg17 -e 'load data local infile "pepPi.tab" into table hg17.pepPi'
# Calculate and load pep distributions
pbCalDist sp050415 proteins050415 9606 hg17 >pbCalDist.out
wc pbCalDist.out
hgsql hg17
load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg17.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp050415 9606 hg17
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg17 -e "drop table pbAnomLimit"
hgsql hg17 -e "drop table pbResAvgStd"
hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg17 -e "drop table pbStamp"
hgsql hg17 < ~/src/hg/lib/pbStamp.sql
hgsql hg17Sav4 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp'
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
# Perform preliminary review of Proteome Browser for hg17, then
# notify QA for formal review.
# RE-BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2005-06-04 - Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2005-06-04
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2005-06-04 /cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* /cluster/bluearc/hg17/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
    # 'ls ../../split/*.fa' would make the command line too long, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 142764s 2379.39m 39.66h 1.65d 0.005 y
# IO & Wait Time: 67623s 1127.06m 18.78h 0.78d 0.002 y
# Average job time: 27s 0.45m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 144s 2.40m 0.04h 0.00d
# Submission to last job: 392s 6.53m 0.11h 0.00d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7735 files
# Loading database with 9757382 rows
# 255.200u 50.520s 25:19.66 20.1% 0+0k 0+0io 247pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed 34667
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 > refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 34773
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 29171
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 32458
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32458 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32381000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 32965
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32965 unique elements in affyUclaNorm
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 17555
    # the hgFixed.gnfHumanU95Exps argument is unused and does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16501 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16450000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8827 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 35139
#### UPDATE GO DATABASE (THIS PART WAS DONE 5/21/05 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20050521
cd /cluster/store1/geneOntology/20050521
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz
hgsql mysql <<end
create database go050521;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go050521 <j.tmp
rm j.tmp
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
zcat gene_association.goa_uniprot.gz | hgGoAssociation go050521 goaPart stdin
# Passed 5589891 of 6584507 of 6584507, 84.89%
# Ask sys-admin to switch the database pointer go to point to go050521.
cd /cluster/data/hg17/bed/geneSorter
# Rebuilt Ensembl Gene tables. See documentation (5/23/05 Fan) above.
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# hgsql hg17 -e "select count(*) from knownToEnsembl"
# table row count 35521
# Make knownToCdsSnp table
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# hgsql hg17 -e "select count(*) from knownToCdsSnp"
# row count 94633
# approx. 5 minutes running time
# C.ELEGANS BLASTP FOR GENE SORTER
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to iscratch/i
# if it doesn't exist already:
# The following section is done during mm6 build already.
# ssh eieio
# mkdir /cluster/data/ce2/bed/blastp
# cd /cluster/data/ce2/bed/blastp
# # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
# # to find out the latest version. Then use that in place of 142 below.
# wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
# formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
# ssh kkr1u00
# if (-e /iscratch/i/ce2/blastp) then
# rm -r /iscratch/i/ce2/blastp
# endif
# mkdir -p /iscratch/i/ce2/blastp
# cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
# iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/ce2/run/out
cd /cluster/data/hg17/bed/blastp/ce2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
    # Initial run had 13 crashed jobs.
# crashed: 13
# ranOk: 7722
# para problems show the following typical message:
# total jobs in batch: 7735
# job: blastSome ../../../geneSorter/blastp/split/kg5911.fa out/kg5911.tab
# id: 209522384
# failure type: crash
# host: kkr2u28.kilokluster.ucsc.edu
# start time: Sat Jun 4 11:45:51 2005
# return: 0
# stderr:
# [blastall] FATAL ERROR: blast: Unable to open input file ../../../geneSorter/blastp/split/kg5911.fa
# para push again and these 13 ran fine.
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 60319s 1005.32m 16.76h 0.70d 0.002 y
# IO & Wait Time: 31239s 520.65m 8.68h 0.36d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 72s 1.20m 0.02h 0.00d
# Submission to last job: 199s 3.32m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/ce2/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 25574 rows
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already
# This already exists. See makeMm6.doc for procedure
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 85769s 1429.49m 23.82h 0.99d 0.003 y
# IO & Wait Time: 20587s 343.11m 5.72h 0.24d 0.001 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 78s 1.30m 0.02h 0.00d
# Submission to last job: 206s 3.43m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 32951 rows
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 27804s 463.40m 7.72h 0.32d 0.001 y
# IO & Wait Time: 30334s 505.56m 8.43h 0.35d 0.001 y
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 26s 0.43m 0.01h 0.00d
# Submission to last job: 119s 1.98m 0.03h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 24030 rows
# ZEBRAFISH BLASTP FOR GENE SORTER
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer2/bed/blastp
cd /cluster/data/danRer2/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
/scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer2/blastp) then
rm -r /iscratch/i/danRer2/blastp
endif
mkdir -p /iscratch/i/danRer2/blastp
cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out
cd /cluster/data/hg17/bed/blastp/danRer2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 111467s 1857.78m 30.96h 1.29d 0.004 y
# IO & Wait Time: 21159s 352.65m 5.88h 0.24d 0.001 y
# Average job time: 17s 0.29m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 95s 1.58m 0.03h 0.00d
# Submission to last job: 223s 3.72m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/danRer2/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 30651 rows
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 18194s 303.23m 5.05h 0.21d 0.001 y
# IO & Wait Time: 24452s 407.53m 6.79h 0.28d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 16s 0.27m 0.00h 0.00d
# Submission to last job: 120s 2.00m 0.03h 0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 16395 rows
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
# rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 72141s 1202.35m 20.04h 0.83d 0.002 y
# IO & Wait Time: 41717s 695.28m 11.59h 0.48d 0.001 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 57s 0.95m 0.02h 0.00d
# Submission to last job: 204s 3.40m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 27109 rows
# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851
# The new KG process no longer needs entries in knownGeneLink (used to store
# info for DNA-based RefSeqs), so clean out the old data in knownGeneLink.
hgsql hg17 -e "delete from knownGeneLink"
#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/050524
ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
cd /cluster/data/superfamily/050524
# ftp over the following two files:
ass_22-May-2005.tab.gz
supfam_22-May-2005.sql.gz
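    # (for example with wget; the FTP directory path on
    # supfam.mrc-lmb.cam.ac.uk is elided here, not verified:)
    # wget --timestamping 'ftp://supfam.mrc-lmb.cam.ac.uk/.../ass_22-May-2005.tab.gz'
    # wget --timestamping 'ftp://supfam.mrc-lmb.cam.ac.uk/.../supfam_22-May-2005.sql.gz'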
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam050524"
nice hgsql superfam050524 < supfam_22-May-2005.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam050524.
hgsql superfam050524 -e "create index id on des(id);"
hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table
superfam050524.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/050524
hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
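    # (for example:)
    # hgsql hg17 -e 'drop table if exists sfDes'
    # The same pattern applies to the superfamily and sfDescription
    # tables mentioned below.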
hgsql superfam050524 -N -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
| hgKnownToSuper hg17 hs stdin
# created 32906 rows in knownToSuper
# Build tables needed by pbGlobal in proteins050415
cd /cluster/data/superfamily/050524
hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'
hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'
cd /cluster/store10/kg/kgHg17F
hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'
# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
# CCDS <-> knownGene mapping need to be updated (markd 2005-05-29)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# Build targetScanS track - (DONE - 2005-06-22 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/targetScanS
cd /cluster/data/hg17/bed/targetScanS
wget --timestamp http://genes.mit.edu/targetscan/tracks/targetscan.bed
# Remove the first description line of targetscan.bed
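    # (one way to do that, assuming GNU sed:)
    # sed -i '1d' targetscan.bed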
hgLoadBed -tab hg17 targetScanS targetscan.bed
# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg17
# Update mrnaRefseq table (DONE - Fan 6/22/05)
# The old table contains non-human mrna/RefSeqs.
# The new table contains only human mrna/RefSeq and RefSeq/RefSeq.
# First build entrez DB tables, see makeMm6.doc for details.
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg17.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \
>mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg17 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgsql hg17 -e 'drop table mrnaRefseq'
hgsql hg17 < ~/src/hg/lib/mrnaRefseq.sql
hgsql hg17 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
# BUILD KNOWN GENE LIST FOR GOOGLE. DONE 6/27/05 Fan.
# make knownGeneLists.html hg17GeneList.html mm5GeneList.html rn3GeneList.html
cd /cluster/data/hg17/bed
rm -rf knownGeneList/hg17
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg17
hgKnownGeneList hg17
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg17
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg17
cp -Rfp knownGeneList/hg17/* /usr/local/apache/htdocs/knownGeneList/hg17
#### Blat knownGene proteins to determine exons (DONE braney 06-30-05)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2005-06-17
rm blat.hg17KG
ln -s blat.hg17KG.2005-06-17 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
hgPepPred hg17 generic blastKGPep02 known.fa
grep ">" known.fa | sed "s/>//" > kgName.lst
kgName hg17 kgName.lst blastKGRef02
hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef02" | hgsql hg17
echo "load data local infile 'blastKGRef02' into table blastKGRef02" | hgsql hg17
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3020 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 134320 of 134320 jobs
# CPU time in finished jobs: 22196680s 369944.67m 6165.74h 256.91d 0.704 y
# IO & Wait Time: 1712586s 28543.10m 475.72h 19.82d 0.054 y
# Average job time: 178s 2.97m 0.05h 0.00d
# Longest finished job: 7691s 128.18m 2.14h 0.09d
# Submission to last job: 608750s 10145.83m 169.10h 7.05d
# Completed: 133676 of 133676 jobs
# CPU time in finished jobs: 29661130s 494352.16m 8239.20h 343.30d 0.941 y
# IO & Wait Time: 2181179s 36352.99m 605.88h 25.25d 0.069 y
# Average job time: 238s 3.97m 0.07h 0.02d
# Longest job: 105972s 1766.20m 29.44h 1.23d
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
ssh hgwdev
cd /cluster/data/hg17/bed/blat.hg17KG
kgName hg17 hg17KG.psl blastKGRef02
cut -f 10 hg17KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg17KG.fa
hgPepPred hg17 generic blastKGPep02 hg17KG.fa
#end blat proteins
# MAKE Drosophila Proteins track (DONE 07-05-05 braney)
ssh kk
mkdir -p /cluster/data/hg17/bed/tblastn.dm2FB
cd /cluster/data/hg17/bed/tblastn.dm2FB
echo /panasas/store/hg17/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > target.lst
mkdir fbfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(264630/`wc target.lst| awk "{print \\\$1}"`\)
# 18929/(350000/5959) = 322.279746
split -l 322 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir -p /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut
ln -s /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
# try successively stricter e-values until a blastall run succeeds
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
# lift target coords from subcontig to contig, then contig to chrom
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/hg17/jkStuff/subLiftAll.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/hg17/jkStuff/liftAll.lft warn $f.3
# lift query (protein) coords via the protein lift file
liftUp -isPtoG -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 target.lst fb.lst blastGsub blastSpec
ssh kk
cd /cluster/data/hg17/bed/tblastn.dm2FB
para create blastSpec
para push
# Completed: 351581 of 351581 jobs
# CPU time in finished jobs: 30733031s 512217.19m 8536.95h 355.71d 0.975 y
# IO & Wait Time: 1035790s 17263.16m 287.72h 11.99d 0.033 y
# Average job time: 90s 1.51m 0.03h 0.00d
# Longest finished job: 816s 13.60m 0.23h 0.01d
# Submission to last job: 135367s 2256.12m 37.60h 1.57d
ssh kki
cd /cluster/data/hg17/bed/tblastn.dm2FB
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat $2.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para push
# Completed: 2714 of 2714 jobs
# CPU time in finished jobs: 222508s 3708.46m 61.81h 2.58d 0.007 y
# IO & Wait Time: 10577s 176.29m 2.94h 0.12d 0.000 y
# Average job time: 86s 1.43m 0.02h 0.00d
# Longest finished job: 9787s 163.12m 2.72h 0.11d
cd /cluster/data/hg17/bed/tblastn.dm2FB/blastOut
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.*.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/hg17/bed/tblastn.dm2FB/blastDm2FB.psl
cd ..
ssh hgwdev
cd /cluster/data/hg17/bed/tblastn.dm2FB
hgLoadPsl hg17 blastDm2FB.psl
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan.
ssh hgwdev
mkdir -p /cluster/store10/reactome/reactome050613
rm /cluster/data/reactome
ln -s /cluster/store10/reactome/reactome050613 /cluster/data/reactome
cd /cluster/data/reactome
wget --timestamp http://www.reactome.org/download/current/sql.gz
hgsql hg17 -e 'drop database reactome'
hgsql hg17 -e 'create database reactome'
zcat sql.gz| hgsql reactome
hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, hg17.kgXref where identifier=spID' >kgReactome.tab;
hgsql hg17 -e 'drop table kgReactome'
hgsql hg17 < ~/src/hg/lib/kgReactome.sql
hgsql hg17 -e 'load data local infile "kgReactome.tab" into table kgReactome'
# UPDATE WGRNA TRACK (DONE, 2005-07-05, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mv wgRna wgRna-2005-06-16
mkdir wgRna-2005-07-05
cd wgRna-2005-07-05
# Received the data file, wgtrack_july2005.txt, from Michel Weber's email (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-07-05.
cat wgtrack_july2005.txt|sed -e 's/ /\t/g' >wgRna.tab
# edit wgRna.tab to take out the first 5 lines of data field labels.
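    # (non-interactively, e.g. assuming GNU sed:)
    # sed -i '1,5d' wgRna.tab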
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (DONE 7/14/05, Fan)
# hgMapViaSwissProt.c was updated to support this.
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
## EVOFOLD (DONE, 2005-07-15, Jakob (jsp) )
# EvoFold is a new comparative method for predicting functional RNA
# secondary structures based on multiple sequence alignments. The
# predictions generated for the EvoFold track are based on the most
# conserved elements of the 8-way alignment (multiz8way). The current
# data is the result of a pilot study (ongoing research of mine), so the
# procedure used to generate the data will be simplified when
# forthcoming EvoFold tracks for other organisms are made. This
# documentation therefore skips the actual data generation and
# instead starts with a data file I provide.
ssh -C hgwdev
mkdir -p /cluster/data/hg17/bed/evofold
cd /cluster/data/hg17/bed/evofold
cp /cluster/home/jsp/data/rnass/genome-scan/vertebrate/folds_hg17.bed foldsHg17.bed
# The folds_hg17.bed is a 9-column bed file: columns 1-6 provide
# standard bed information, column 7 is element length, column 8 is
# the RNA secondary structure in parentheses format, and column 9
# is a comma-separated list of position-specific confidence scores
# (floats).
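# A quick sanity check of that format (a sketch; confirms column 7
# matches chromEnd - chromStart):
# awk -F'\t' '$7 != $3 - $2 {print "size mismatch: " $4}' foldsHg17.bed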
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg17 evofold foldsHg17.bed
##########################################################################
# TRANSFRAG PHASE 2 TABLES - lifted from hg15 (Jakob Skou Pedersen)
# Done: July 21, 2005
#
# These tables were lifted for use in my own research, but may be used
# for the 'Affymetrix Transcriptome Project Phase 2' tracks.
ssh -C hgwdev
mkdir -p /cluster/data/hg17/bed/transfrag
cd /cluster/data/hg17/bed/transfrag
# lifting transfrag tables from hg15 via hg16 to hg17
for name in A375CytosolicPolyAPlusTnFg FHs738LuCytosolicPolyAPlusTnFg HepG2CytosolicPolyAMinusTnFg HepG2CytosolicPolyAPlusTnFg HepG2NuclearPolyAMinusTnFg HepG2NuclearPolyAPlusTnFg JurkatCytosolicPolyAPlusTnFg NCCITCytosolicPolyAPlusTnFg PC3CytosolicPolyAPlusTnFg SKNASCytosolicPolyAPlusTnFg U87CytosolicPolyAPlusTnFg; do
echo "select chrom, chromStart, chromEnd, name from ${name};" | hgsql hg15 | sed -e 1d > ${name}Hg15.bed
liftOver ${name}Hg15.bed /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain ${name}Hg16.bed unmappedHg16.bed
liftOver ${name}Hg16.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain ${name}Hg17.bed unmappedHg17.bed
echo "hg16 unmapped count for ${name}: " `grep "#" unmappedHg16.bed | wc -l | awk '{print $1}'`
echo "hg17 unmapped count for ${name}: " `grep "#" unmappedHg17.bed | wc -l | awk '{print $1}'`
hgLoadBed hg17 ${name} ${name}Hg17.bed
# clean up
rm ${name}Hg15.bed ${name}Hg16.bed unmappedHg16.bed unmappedHg17.bed
done
# GLADSTONE ARRAY TRACK (DONE 7/19/2005 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir gladHumES
cd gladHumES/
cp /cluster/data/hg16/bed/geneAtlas2/geneAtlas2.bed .
cut -f1-12 geneAtlas2.bed > bed.hg16
liftOver bed.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain bed.hg17 /dev/null
hgMapMicroarray bed.hg17.data hgFixed.gladHumESRatio \
-bedIn bed.hg17
#Loaded 11087 rows of expression data from hgFixed.gladHumESRatio
#Mapped 10925, multiply-mapped 382, missed 23266, unmapped 162
hgLoadBed hg17 gladHumES bed.hg17.data
# PHASTODDS GENESORTER COLUMN (DONE 7/28/2005 Andy)
ssh kolossus
cd /panasas/store/andy
mkdir phastOdds
cd phastOdds/
export PATH=${PATH}:/cluster/bin/phast/x86_64
mkdir sso beds gtfs kg
cat > runChrom.sh << "_EOF_"
#!/bin/bash
c=$1
numDir=`echo ${c#chr} | sed 's/_random//'`
ALNDIR=/cluster/data/hg17/bed/multiz10way
echo msa_view $c
/cluster/bin/phast/x86_64/msa_view --in-format MAF ${ALNDIR}/maf/${c}.maf --refseq /cluster/data/hg17/${numDir}/${c}.fa > /tmp/${c}.sso
echo phastOdds $c
/cluster/bin/phast/x86_64/phastOdds -f ${ALNDIR}/cons/run.elements/ave.cons.mod -b ${ALNDIR}/cons/run.elements/ave.noncons.mod -g kg/${c}.bed /tmp/${c}.sso > /tmp/${c}.phastOdds.gtf
cp /tmp/${c}.sso sso/
rm /tmp/${c}.sso
cp /tmp/${c}.phastOdds.gtf gtfs/
rm /tmp/${c}.phastOdds.gtf
echo $c done
_EOF_
ssh hgwdev
cd /panasas/store/andy/phastOdds
genePredToGtf hg17 knownGene kg.gtf
exit
for c in `cut -f1 kg.gtf | sort | uniq`; do
grep "\b${c}\b" kg.gtf > kg/${c}.gtf;
done
for f in kg/*.bed; do
c=`basename $f .bed`;
echo $c;
./runChrom.sh $c;
addPhastOddsExons $f gtfs/$c.phastOdds.gtf beds/$c.bed
done
cat beds/* | sort -k4,4 -k1,1 -k2,2n -k3,3n > phastOdds.kg.bed
cat > phastOdds.sql << "EOF"
CREATE TABLE phastOdds (
bin smallint not null, # Speedup.
chrom varchar(255) not null, # Human chromosome or FPC contig
chromStart int unsigned not null, # Start position in chromosome
chromEnd int unsigned not null, # End position in chromosome
name varchar(255) not null, # Name of item
    score float not null, # phastOdds score.
    #Indices
index(chrom(8),bin),
index(name(10))
);
EOF
# <<
hgLoadBed -sqlTable=phastOdds.sql hg17 phastOdds phastOdds.kg.bed
# Actually I probably don't need that hg17 table.
echo create table phastOdds select name, score from hg17.phastOdds | hgsql hgFixed
echo create index nameIndex on phastOdds (name(10)) | hgsql hgFixed
##########################################################################
# Illumina SNPs (Heather, July 2005)
# Source: Jeff Ohmen, PhD, johmen@illumina.com, 858/232-2702
# using /cluster/store11 because /cluster/data/hg17 is on store5,
# which is currently being restored
cd /cluster/store11/heather/illumina
fix.pl < LinkageIVbSNP.txt > illumina.bed
hgLoadBed hg17 snpIllumina -tab -strict -sqlTable=snpIllumina.sql illumina.bed
# Reading illumina.bed
# Loaded 6008 elements of size 4
# Sorted
# Saving bed.tab
# Loading hg17
# note: 28 rows where chrom = "chrXY"
# reload rankProp and psiBlast gene sorter tables to link with new
# known genes (markd 2005-07-15)
(spLoadRankProp -noMapFile=max1k.nomap hg17 rankProp -- /cluster/bluearc/markd/rankprop/results/hs.sw+tr/max1k.rankp.gz) >&max1k.hg17.out
(spLoadPsiBlast hg17 spPsiBlast /cluster/bluearc/markd/rankprop/results/hs.sw+tr.eval.gz) >&pslBlast.hg17.out
# BLASTZ/CHAIN/NET CANFAM2 (DONE 8/2/05 angie - REDONE 12/12/05 angie - REDONE 2/6/06 angie)
# Unfortunately, this was done with a corrupted
# /san/sanvol1/scratch/hg17/nib/chr5.nib the first time around;
# also, a linSpecRep bug in blastz-run-ucsc has been fixed since then.
# Doh, then Kate pointed out that linSpecReps were not snipped properly --
# I had omitted the BLASTZ_ABRIDGE_REPEATS line from the DEF!!!
# Added an error message to doBlastzChainNet.pl to catch that next time.
# Therefore I'm moving aside the previous run:
mv /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam2{,.bak}
# And rerunning...
ssh kkstore02
mkdir /cluster/data/hg17/bed/blastz.canFam2.2006-02-06
cd /cluster/data/hg17/bed/blastz.canFam2.2006-02-06
cat << '_EOF_' > DEF
# human vs. dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInDog
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/san/sanvol1/scratch/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.canFam2.2006-02-06
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \
-workhorse pk \
-blastzOutRoot /san/sanvol1/scratch/blastzHg17CanFam2Out >& do.log &
tail -f do.log
rm -f /cluster/data/hg17/bed/blastz.canFam2
ln -s blastz.canFam2.2006-02-06 /cluster/data/hg17/bed/blastz.canFam2
# RE-RUN NETTOAXT, AXTTOMAF FOR CANFAM2 (DONE 10/26/05 angie)
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
# Obsoleted by re-run of hg17-canFam2 above 12/12/05 angie...
ssh kolossus
cd /cluster/data/hg17/bed/blastz.canFam2.2005-08-01/axtChain
netSplit hg17.canFam2.net.gz net
chainSplit chain hg17.canFam2.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/panasas/store/hg17/nib /iscratch/i/canFam2/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.hg17.canFam2.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.hg17.canFam2.net.axt.gz)
axtToMaf -tPrefix=hg17. -qPrefix=canFam2. $f \
/cluster/data/hg17/chrom.sizes /cluster/data/canFam2/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
############
# Sangamo/EIO DNaseI Hypersensitive Sites (2005-08-15 kate)
# (Sangamo Biosciences and European Inst. Oncology)
# Contact: Fyodor Umov (fumov@sangamo.com)
cd /cluster/data/hg17/bed
mkdir sangamo
cd sangamo
grep chr 3314_hs_sites_browser.bed | grep -v browser | \
hgLoadBed -noBin hg17 sangamoDnaseHs stdin
# Loaded 3314 elements of size 6
checkTableCoords -table=sangamoDnaseHs hg17
# use "antiword" to create plain text from .doc description file
# UPDATE WGRNA TRACK (DONE, 2005-08-24, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2005-08-24
cd wgRna-2005-08-24
# Received the data file, wgtrack_aug2005.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-08-24.
cut -f 2-10 wgtrack_aug2005.txt >wgRna.tab
vi wgRna.tab
# edit wgRna.tab to take out the first line of data field labels.
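    # (equivalently, without an editor, assuming GNU sed:)
    # sed -i '1d' wgRna.tab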
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to 7/5 data, one record updated, one record dropped, one record added, out of 741 records.
# Generate snpMask files (Done Heather Sept. 1, 2005)
# Takes about 10-15 minutes
# Consumes about 1 gig of disk
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir snpMask
cd snpMask
foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
snpMaskChrom hg17 ${chrom} /gbdb/hg17/nib/${chrom}.nib ${chrom}.ambigs.fa
gzip ${chrom}.ambigs.fa
end
#############################################################################
# BLASTZ Mm7 (WORKING - 2005-09-06 - Hiram)
# Experiment, try the alignments without the linage specific
# repeats
ssh pk
mkdir /cluster/data/hg17/bed/blastzMm7.2005-09-06
cd /cluster/data/hg17/bed
ln -s blastzMm7.2005-09-06 blastz.mm7
cd blastzMm7.2005-09-06
cat << '_EOF_' > DEF
# human vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human Hg17
SEQ1_DIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ1_CTGDIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ1_CTGLEN=/cluster/bluearc/hg17/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50
# QUERY: Mouse Mm7
SEQ2_DIR=/cluster/bluearc/mm7/mm7.2bit
SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGLEN=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.sizes
SEQ2_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastzMm7.2005-09-06
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cp -p /cluster/data/hg17/chrom.sizes ./S1.len
twoBitInfo /cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit S2.len
# establish a screen to control this job
screen
time ./doBlastzChainNet.pl -stop chainMerge \
-bigClusterHub=pk \
`pwd`/DEF > toChainMerge.run.out 2>&1 &
# STARTED - 2005-09-06 - 11:00
# detach from screen session: Ctrl-a Ctrl-d
# to reattach to this screen session:
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=cat -stop=cat \
`pwd`/DEF > catStep.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=chainRun \
`pwd`/DEF > continueChainRun.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=chainMerge -stop=chainMerge \
`pwd`/DEF > chainMerge.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=net -stop=net \
`pwd`/DEF > net.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=load -stop=load \
`pwd`/DEF > load.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-swap -stop=load \
`pwd`/DEF > swap.out 2>&1 &
# Create plain pslChrom files to load a simple blastz track
ssh kkstore02
cd /cluster/data/hg17/bed/blastzMm7.2005-09-06
mkdir -p pslChrom
(cd pslParts; ls | awk -F"." '{print $1}' | sort -u) | while read C
do
echo -n "working ${C} ... "
zcat pslParts/${C}.nib*.gz | gzip -c > pslChrom/${C}.psl.gz
echo "done"
done
# Load those alignments
ssh hgwdev
cd /cluster/data/hg17/bed/blastzMm7.2005-09-06
ls pslChrom | sed -e "s/.psl.gz//" | while read T
do
echo "hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz"
hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz
done
# After this same alignment was done with Hg17 query and Mm7
# target, came back to these swapped results in mm7 and manually loaded
# the swapped tables as: chainMm7LSR, chainMm7LSRLink and
# netMm7LSR
# 41,223,632 total rows in the chainMm7Link split tables
# 58,458,613 total rows in the chainMm7LSRLink table
time featureBits hg17 chainMm7LSRLink
# 959444893 bases of 2866216770 (33.474%) in intersection
# real 36m30.822s
# user 14m19.620s
# sys 5m13.910s
time featureBits hg17 chainMm7Link
# 955168137 bases of 2866216770 (33.325%) in intersection
# real 16m13.902s
# user 10m20.780s
# sys 3m42.810s
# And, their intersection:
ssh kolossus
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \
chainMm7LSRLink chainMm7Link
# 952667399 bases of 2866216770 (33.238%) in intersection
# real 38m53.448s
# user 8m38.853s
# sys 2m23.362s
# LOAD ACEMBLY TRACK (DONE, 2005-09-12, Fan)
mv /cluster/data/hg17/bed/acembly /cluster/data/hg17/bed/acembly_050217
mkdir -p /cluster/data/hg17/bed/acembly
cd /cluster/data/hg17/bed/acembly
# Data is obtained from Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_35g.genes.gff.tar.gz
tar xvzf acembly.ncbi_35g.genes.proteins.fasta.tar.gz
cd acembly.ncbi_35.genes.gff
# the acembly dataset for hg16 had problems with reverse blocks so
# check for these
cat << '_EOF_' > checkReversedBlocks
for i in x1*.gff
do
echo -n "$i working ..."
awk -F"\t" '
{
if ($4 > $5) {
printf "reverse blocks problem for $1"
printf "\n"
}
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x checkReversedBlocks
./checkReversedBlocks
ls -l *.fixed
# all *.fixed files are empty so remove - there is no reversing of blocks
rm *.fixed
foreach f (x1.acemblygenes.*.gff)
set c=$f:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end
#- Load into database - use extended genePred
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
# for entry with 28212470 from chr6.gff, change to chr6
# and for 29124352 in chr6.gff, change to chr6 (1/13/05)
echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
| hgsql hg17
echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
| hgsql hg17
# checkTableCoords and runGeneCheck to check data
checkTableCoords hg17 acembly
hgPepPred hg17 generic acemblyPep \
acembly.ncbi_35.genes.proteins.fasta/*.fasta
# create table of Acembly gene classifications
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
rm acemblyClass.tab
foreach f (x1.acemblygenes.*.gff)
cut -f 9 $f |sed -e 's/;/\t/g' |sed -e 's/transcript_id //' >j.tmp
cut -f 2 j.tmp >j2.tmp
cut -f 3 j.tmp >j3.tmp
paste j3.tmp j2.tmp|sed -e 's/Main_gene/main/g' |sed -e 's/Putative_gene/putative/g' |sed -e 's/ //g' >>acemblyClass.tab
end
rm *.tmp
hgsql hg17 -e 'drop table acemblyClass'
hgsql hg17 < ~/src/hg/lib/acemblyClass.sql
hgsql hg17 -e 'load data local infile "acemblyClass.tab" into table acemblyClass'
hgsql hg17 -e 'delete from acemblyClass where class!="main" and class!="putative"'
# build acemblyPep table
hgPepPred hg17 generic acemblyPep \
acembly.ncbi_35.genes.proteins.fasta/*.fasta
# Please note, per email from Jean Thierry-Mieg on 9/9/05,
# there are AceView genes (~10,000) without corresponding
# protein sequences. They will fix it next time.
###########################################################################
# LOADING AFFYTXNPHASE2 TRACK (sugnet)
# cd to where data is downloaded.
cd /cluster/store10/sugnet/affyTranscription/graphs/transcriptome.affymetrix.com/download/publication/polyA_minus/graphs
# lift data from hg16 to hg17. This takes a long time.
./liftWigFilesHg16ToHg17.sh
# make the .wib and .wig files. This takes a long time.
./makeWibWigHg17.sh
# Copy .wib files to /cluster/data/hg17/bed/affyTxnPhase2/wigData/
mkdir /cluster/data/hg17/bed/affyTxnPhase2/wigData/
cp `find ./ -name "*.hg17.wib"` /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 775 /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 664 /cluster/data/hg17/bed/affyTxnPhase2/wigData/*
# Make gbdb entry
mkdir /gbdb/hg17/wib/affyTxnPhase2
chmod 775 /gbdb/hg17/wib/affyTxnPhase2
cd /gbdb/hg17/wib/affyTxnPhase2
ln -s /cluster/data/hg17/bed/affyTxnPhase2/wigData/* .
cd -
# Load the database tables (using bash) this takes a while
for file in `find ./ -name "*hg17.wig"`; do
base=`basename $file .hg17.wig`
echo "Doing ${base}Txn"
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/affyTxnPhase2 hg17 ${base}Txn $file
done
# Do the transfrags
cd ../transfrags
./liftHg16ToHg17.sh
./loadHg17Tnfg.sh
# End of affyTxnPhase2
###########################################################################
# Creating download files for the affyTxnPhase2 data
# (DONE - 2006-11-20 - Hiram)
# Copy all of the data above to a temporary /san/sanvol1/scratch/
# location, and run the following script:
#!/bin/sh
mkdir -p rawData/hg17
TOP=`pwd`
export TOP
for dir in `find ./ -type d | grep '_' | grep -v A375_cytosolic_polyAPlus | grep -v FHs738Lu_cytosolic_polyAPlus | grep -v HepG2_CytosolVsNucleusDifferenceGraphs | grep -v HepG2_cytosolic_polyAPlus | grep -v HepG2_cytosolic_polyAMinus | sed -e "s#^./##"`; do
base=`echo $dir | sed -e 's/\.\///; s/\//_/g' | sed -e 's/polyA-/polyAMinus/g' | sed -e 's/-/_/g' | sed -e 's/\+/Plus/g' | $TOP/changeName.pl`
RAW=$TOP/rawData/hg17/$base.data
echo $RAW
cd $dir;
zcat `ls -1 *hg17.bed.gz` | bedSort stdin stdout | cut -f 1,2,3,4 | grep chr | $TOP/avgerizeBed.pl > $RAW
cd $TOP;
done
# Then copy the rawData/hg17/ results directory back to:
/cluster/data/hg17/bed/affyTxnPhase2/rawResults/
# And deliver to hgdownloads via symlinks on hgwdev:
cd /usr/local/apache/htdocs/goldenPath/hg17/affyTxnPhase2/
ln -s /cluster/data/hg17/bed/affyTxnPhase2/rawData/*.data.gz .
# Remove the san scratch data
###########################################################################
# ALTGRAPHX TRACK (sugnet)
cd /cluster/store1/sugnet/altSplice
mkdir hg17-2005.03.28
# First get the RNA clusters
cd hg17-2005.03.28
# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs
# Make spec file to run.
foreach c (`echo 'select chrom from chromInfo' | hgsql hg17 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log
cd ..
# Make script to setup parasol job file for raw altGraphX files on human
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql hg17 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg17 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg17.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg17/nib/$chrom.nib"
done
'_EOF_'
# << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh
# Minicluster down, have to run on hgwdev.
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
cat agxs/*.agx > hg17.agx
# make raw altGraphX files for mouse
mkdir ../mm5-2005.03.28/
cd ../mm5-2005.03.28/
# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Don't use RAGE libraries for clone bounds.
~/jk/hg/geneBounds/clusterRna/generateRageAccList.csh mm5 rage.libs
# Doing select on mm5 into mm5.rage.libs
# Done.
# Make spec file to run.
foreach c (`echo 'select chrom from chromInfo' | hgsql mm5 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm5.rage.libs mm5 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &
# Make the gene bounds in rnaCluster.
mkdir agxs
# This script generates the jobs, one per chromosome.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql mm5 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm5 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm5.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm5/nib/$chrom.nib"
done
'_EOF_'
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log & # Takes an hour or so...
# Consolidate all of the records into a single file.
cat agxs/*.agx > mm5.agx
# Make the orthologous splicing graphs.
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
# Get the exoniphy exons...
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed
# Set up the commands for the orthosplice run.
echo 'select chrom, size from chromInfo' | hgsql hg17 | grep -v chrom > chromSizes.tab
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseNet/ nets
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/chain/ chains
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg17.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../agxs/hg17.$w[0].agx -orthoAgxFile=../../mm5-2005.03.28/mm5.agx -db=hg17 -orthoDb=mm5 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg17.mm5.cons.t3.agx -reportFile=report/$w[0].hg17.report -edgeFile=report/$w[0].hg17.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << emacs
./makeRun.sh > orthoSplice.para.spec
ssh kki
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
para create orthoSplice.para.spec
para push
cat agx/*.agx > hg17.mm5.t3.exoniphy.agx
# Make bed file
agxToBed hg17.mm5.t3.exoniphy.agx hg17.mm5.t3.exoniphy.bed
# Load up files
hgLoadBed hg17 agxBed hg17.mm5.t3.exoniphy.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/sugnet/kent/src/hg/lib/altGraphX.sql hg17 altGraphX hg17.mm5.t3.exoniphy.agx
# end altGraphX track
# EXONWALK TRACK (sugnet)
# make altGraphX track (see above)
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
mkdir exonWalk
cd exonWalk
mkdir beds
# Make parasol script.
foreach file (`ls ../agx/*.agx`)
set base=`basename $file .agx`
echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg17 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
cat beds/*.bed > hg17.mm5.cons.t3.exoniphy.bed
# Predict orfs
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
splitFile ../../hg17.mm5.cons.t3.exoniphy.bed 500 exonWalk.
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa "
sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
./makeFa.sh beds/*
# Run borf lots of times...
./makeSpec.sh beds/* > para.spec
para create para.spec
para push
mkdir genePred
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
# << this line makes emacs coloring happy
chmod 755 makeGenePred.sh
./makeGenePred.sh beds/*
cat beds/* > hg17.mm5.exonWalk.bed
cat genePred/*.gp > hg17.mm5.exonWalk.gp
wc *.bed *.gp
# 155470 1865640 29956585 hg17.mm5.exonWalk.bed
# 98433 984330 32495119 hg17.mm5.exonWalk.gp
# Load it into the database.
ldHgGene -predTab hg17 exonWalk hg17.mm5.exonWalk.gp
# end exonWalk
####################################################################
### hapmapRecombRate (Daryl; September 19, 2005)
# Lifted from hg16; see makeHg16.doc for details
# Update (Jen; October 25, 2005)
# Data points that lifted to chroms other than 1-22 + X were removed
# before release to RR (confirmed with Daryl).
#   chr4_random: 11 data points
#   chr6_hla_hap1: 25 data points
### hapmapRecombHotspot (Daryl; September 19, 2005)
# Lifted from hg16; see makeHg16.doc for details
### HapMap SNPs (Daryl; February 4, 2006)
# most of this work was done in October and November 2005 for the ENCODE workshop
cd /cluster/store4/gs.17/build34/bed/hapmap/frequencies/2005-10/non-redundant/hapmapSnps
ln -sf ../hg17.daf.all/daf.txt.gz .
ln -sf ../hg17.panTro1.rheMac1.txt.gz .
zcat hg17.panTro1.rheMac1.txt | grep -v chrom | sort >! hg17.panTro1.rheMac1.sort.txt
zcat daf.txt | grep -v chrom | sort >! daf.sort.txt
# check that order matches; should be empty
paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '$1!=$17||$2!=$18||$3!=$19||$4!=$20||$5!=$21||$6!=$22||$7!=$23||$8!=$24||$11!=$25||$12!=$27||$15!=$26||$16!=$28{print $0;}'
paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '{printf "%s\t%d\t%d\t%s\t0\t%c\t%c\t%c\t%c\t%c\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n",$1,$2,$3,$4,$6,$7,$8,$11,$15,$12,$16,$29,$30,$31,$32,$33}' >! hapmapSnps.bed
hgLoadBed hg17 hapmapSnps -sqlTable=hapmapSnps.sql hapmapSnps.bed
############################################################################################
# HapMap SNPs rel21a (Feb. 2007, Heather)
# June 2007 [partial fix of hapmapAllelesSummary released 6/25/07:
# using hg17 instead of hg18 liftOver files... for most but not all
# chroms! :( not documented below; error found by user]
# 1/11/08, 1/24/08 (angie): regenerated hapmapAllelesSummary with corrected
# hapmapAllelesChimp.
# get files for each chrom, for each population
# these contain data for all individuals
# not using the JPT+CHB files
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/zips
wget http://www.hapmap.org/downloads/genotypes/2007-01/rs_strand/non-redundant/*
# get population data (needed to define children in CEU and YRI trios)
cd /cluster/store12/snp/hapmap
wget http://www.hapmap.org/downloads/samples_individuals/*gz
gunzip pedinfo2sample_CEU.txt.gz
filterPedigree.pl < pedinfo2sample_CEU.txt > CEU.filtered
cp CEU.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/CEU.list
gunzip pedinfo2sample_YRI.txt.gz
filterPedigree.pl < pedinfo2sample_YRI.txt > YRI.filtered
cp YRI.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/YRI.list
# Below is filterPedigree.pl
#!/usr/bin/env perl
while (<STDIN>) {
my @fields = split;
if ($fields[2] == 0 && $fields[3] == 0) {
@subfields = split /:/, $fields[6];
print $subfields[4];
print "\n";
}
}
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
zcat zips/*chr22_CEU* | head -1 > header.CEU
zcat zips/*chr22_YRI* | head -1 > header.YRI
# add line breaks to header.CEU and header.YRI so each sample id
# is on its own line
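# (one way, a sketch assuming the header ids are space-separated:)
# tr ' ' '\n' < header.CEU > header.CEU.tmp && mv header.CEU.tmp header.CEU
# tr ' ' '\n' < header.YRI > header.YRI.tmp && mv header.YRI.tmp header.YRI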
grep -n -f CEU.list header.CEU > offsets.CEU
grep -n -f YRI.list header.YRI > offsets.YRI
# delete ids in offsets.CEU and offsets.YRI so just column numbers remain
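# (grep -n output is of the form "lineNum:id", so for example with
# GNU sed:)
# sed -i 's/:.*//' offsets.CEU offsets.YRI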
# for each population, combine all chroms, and combine all individuals
# for CEU and YRI, filter out children from trios
# This creates CEU.merge, CHB.merge, JPT.merge, YRI.merge
./merge.csh
# Below is merge.csh
#!/bin/tcsh
rm -f CEU.merge
rm -f CHB.merge
rm -f JPT.merge
rm -f YRI.merge
foreach chrom (`cat chrom.list`)
echo $chrom
# CEU
echo "CEU"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_CEU_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterCEU.pl >> CEU.merge
# CHB
echo "CHB"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_CHB_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterCHB.pl >> CHB.merge
# JPT
echo "JPT"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_JPT_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterJPT.pl >> JPT.merge
# YRI
echo "YRI"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_YRI_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterYRI.pl >> YRI.merge
end
# Below is filterCEU.pl
# The others are very similar: YRI uses "offsets.YRI"
# CHB and JPT just read the input directly
#!/usr/bin/env perl
# read in a list of the columns that we are keeping
sub initList {
open LIST, "offsets.CEU";
chomp(@list = <LIST>);
close LIST;
$listSize = @list;
}
&initList;
while (<STDIN>) {
my @fields = split;
# skip header
if ($fields[0] eq "rs#") { next; }
# chrom
print $fields[2];
print " ";
# position: add zero-based start coord
print $fields[3] - 1;
print " ";
print $fields[3];
print " ";
# rsId
print $fields[0];
print " ";
# score
print "0 ";
# strand
print $fields[4];
print " ";
# observed
print $fields[1];
print " ";
@alleles = ();
for ( my $loop = 0; $loop < $listSize; $loop++ ) {
push (@alleles, $fields[$list[$loop]-1]);
}
# N is used for missing data
$nCount = 0;
# counts
$aCountHomo = 0;
$cCountHomo = 0;
$gCountHomo = 0;
$tCountHomo = 0;
$aCountHetero = 0;
$cCountHetero = 0;
$gCountHetero = 0;
$tCountHetero = 0;
foreach $allele (@alleles) {
$parent1 = substr($allele, 0, 1);
$parent2 = substr($allele, 1, 1);
# Ns must be together
if ($parent1 eq "N" && $parent2 ne "N") { die "Unexpected input"; }
if ($parent2 eq "N" && $parent1 ne "N") { die "Unexpected input"; }
if ($parent1 eq "N" && $parent2 eq "N") { $nCount++; next; }
if ($parent1 eq "A" && $parent2 eq "A") {
$aCountHomo = $aCountHomo + 2;
next;
}
if ($parent1 eq "C" && $parent2 eq "C") {
$cCountHomo = $cCountHomo + 2;
next;
}
if ($parent1 eq "G" && $parent2 eq "G") {
$gCountHomo = $gCountHomo + 2;
next;
}
if ($parent1 eq "T" && $parent2 eq "T") {
$tCountHomo = $tCountHomo + 2;
next;
}
if ($parent1 eq "A") { $aCountHetero++; }
if ($parent1 eq "C") { $cCountHetero++; }
if ($parent1 eq "G") { $gCountHetero++; }
if ($parent1 eq "T") { $tCountHetero++; }
if ($parent2 eq "A") { $aCountHetero++; }
if ($parent2 eq "C") { $cCountHetero++; }
if ($parent2 eq "G") { $gCountHetero++; }
if ($parent2 eq "T") { $tCountHetero++; }
}
print "A ";
print $aCountHomo;
print " ";
print $aCountHetero;
print " ";
print "C ";
print $cCountHomo;
print " ";
print $cCountHetero;
print " ";
print "G ";
print $gCountHomo;
print " ";
print $gCountHetero;
print " ";
print "T ";
print $tCountHomo;
print " ";
print $tCountHetero;
print " ";
print "\n";
}
# << emacs
# Switch to C programs from kent/src/hg/snp/snpLoad.
# Determine allele1 and allele2 (set allele2 to "none" if monomorphic)
# Alleles are in alphabetical order
# Calculate score (minor allele frequency)
# Log and skip if wrong number of elements in row
# Log and skip if triallelic or quadallelic
# Log and skip degenerate case (no alleles)
# No errors this run
# Still running on kkstore05
# Could rename "hapmap1" to "hapmapCondense"
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CEU.merge CEU.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CHB.merge CHB.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 JPT.merge JPT.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 YRI.merge YRI.condense
wc -l hapmap1.log
# save some space
gzip *merge
# load
ssh hgwdev
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
cp /cluster/home/heather/kent/src/hg/lib/hapmapSnps.sql .
# modify hapmapSnps for 4 populations
hgLoadBed hg17 hapmapSnpsCEU -sqlTable=hapmapSnpsCEU.sql CEU.condense
hgLoadBed hg17 hapmapSnpsCHB -sqlTable=hapmapSnpsCHB.sql CHB.condense
hgLoadBed hg17 hapmapSnpsJPT -sqlTable=hapmapSnpsJPT.sql JPT.condense
hgLoadBed hg17 hapmapSnpsYRI -sqlTable=hapmapSnpsYRI.sql YRI.condense
# save some more space
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
gzip *condense
# sanity check
mysql> select count(*) from hapmapSnpsCEU where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapSnpsCHB where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapSnpsJPT where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapSnpsYRI where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select max(score) from hapmapSnpsCEU;
+------------+
| max(score) |
+------------+
| 500 |
+------------+
# create indexes
mysql> alter table hapmapSnpsCEU add index name (name);
mysql> alter table hapmapSnpsCEU add index chrom (chrom, bin);
mysql> alter table hapmapSnpsCHB add index name (name);
mysql> alter table hapmapSnpsCHB add index chrom (chrom, bin);
mysql> alter table hapmapSnpsJPT add index name (name);
mysql> alter table hapmapSnpsJPT add index chrom (chrom, bin);
mysql> alter table hapmapSnpsYRI add index name (name);
mysql> alter table hapmapSnpsYRI add index chrom (chrom, bin);
# 2nd step in processing: create hapmapSnpsCombined
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap2 hg17
hgLoadBed hg17 hapmapSnpsCombined -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapSnpsCombined.sql hapmapSnpsCombined.tab
# create indexes (not used by browser)
mysql> alter table hapmapSnpsCombined add index name (name);
mysql> alter table hapmapSnpsCombined add index chrom (chrom, bin);
# errors
# nothing that isn't biallelic
# nothing with mixed positions
# over 500K that were not available in all 4 populations
# YRI: 187,485
# CEU: 129,359
# CHB and JPT: 97,095
# Also, 2 strand corrections done
grep -v missing hapmap2.errors
# different strands for rs1621378
# different strands for rs5768
# cleanup to save space
rm hapmapSnpsCombined.tab
# monomorphism
# YRI       867,835
# CEU     1,252,743
# CHB     1,496,438
# JPT     1,539,094
# combined  607,393
# observed strings
# why is A/T different from other transversions?
# A/G 1,344,043
# C/T 1,344,542
# A/C 352,875
# A/T 275,670
# C/G 354,299
# G/T 354,149
# triallelic 1,370
# quadallelic 403
# other 1,226
# some details on the others:
# 125 -/A/T
# 124 -/A/G
# 107 -/C/T
# 85 -/A/C
# 79 -/G/T
# 25 -/C/G
# 18 -/A/C/T
# 13 -/A/G/T
# 12 -/A/C/G
# 11 -/C/G/T
# 7 (LARGEINSERTION)
# 5 (LARGEDELETION)
# 6 microsat
# 2 het
# check for collisions (more than one SNP at the same location)
# none found
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg17 hapmapSnpsCombined > snpCheckCluster2.out
# check against hg17.snp125
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapLookup hg17 hapmapSnpsCombined snp125 snp125Exceptions
# 1817 total that are complex type from dbSNP (hapmapLookup.log)
# This is not based on observed string, only on size, class and locType
# 1176 class = mixed
# 616 class = single but locType != exact
# 11 class = named
# 6 class = insertion
# 4 class = deletion
# 2 class = microsat
# 2 class = het
# Generally if class = single the observed string is bi-allelic as expected
# Exceptions to that:
# rs700519 quad-allelic, locType = rangeDeletion
# rs1572672 tri-allelic, locType = between
# rs2357412 tri-allelic, locType = range
# rs2364671 tri-allelic, locType = rangeSubstitution
# rs3959788 quad-allelic, locType = between
# 74 items in hapmapLookup.error
# 59 reverse complement (that's okay)
# 7 multiple alignment (6 from chrX:154,219,000-154,220,500 which is close to PAR)
# Also rs6645103 which is PAR
mysql> select chrom, chromStart, chromEnd, strand, observed, class, locType, weight from snp125 where name = "rs6645103";
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrom | chromStart | chromEnd | strand | observed | class | locType | weight |
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrX_random | 273788 | 273789 | - | C/T | single | exact | 3 |
| chrX | 421141 | 421142 | + | C/T | single | exact | 3 |
| chrY | 421141 | 421142 | + | C/T | single | exact | 3 |
+-------------+------------+----------+--------+----------+--------+---------+--------+
# 4 observed with dbSNP complex, hapmap biallelic
# all positive strand, locType = between
# all cluster errors in dbSNP
# rs10485830
# rs7625205 (intronic)
# rs713582
# rs11403115 (class = insertion)
# 3 observed mismatch
# all dbSNP clustering error
# rs2230624 (tri-allelic)
# rs3963317 (monomorphic in hapmap, rangeSubstitution in dbSNP)
# rs5017503 (monomorphic in hapmap)
# a strange one
# rs731449
# dbSNP strand = -, hapmap strand = +
# dbSNP observed = G/T, hapmap observed = C/T
# dbSNP clustering error rs2321451, which is C/T
# hapmap monomorphic for T
# ortho A
# no repeats, no genes, no mRNAs, no conservation
# Counts of rows where 3 populations have one major allele, the 4th has the other
hapmapMixed hg17
# countCEU = 162931
# countCHB = 46543
# countJPT = 48791
# countYRI = 309105
# Generate summary table (used by filters)
# Summary table includes ortho allele and ortho qual score
# Summary table score is heterozygosity
# Individual zygosity is *not* preserved
ssh hgwdev
# 6/25/07: regenerated with mostly-corrected hapmapAllelesChimp
# 1/11/08 angie: regenerated with finally-corrected (I hope) hapmapAllelesChimp
# 1/24/08 angie: regenerated with finally-corrected (I hope!) hapmapAllelesChimp
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapSummary hg17 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque
hgLoadBed hg17 hapmapAllelesSummary -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesSummary.sql hapmapSummary.tab -tab
# sanity check
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCEU > totalAlleleCountCEU;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCHB > totalAlleleCountCHB;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select max(score) from hapmapAllelesSummary;
+------------+
| max(score) |
+------------+
| 500 |
+------------+
mysql> select count(*), popCount from hapmapAllelesSummary group by popCount;
+----------+----------+
| count(*) | popCount |
+----------+----------+
| 52479 | 1 |
| 72977 | 2 |
| 207643 | 3 |
| 3700478 | 4 |
+----------+----------+
mysql> select count(*), isMixed from hapmapAllelesSummary group by isMixed;
+----------+---------+
| count(*) | isMixed |
+----------+---------+
| 3192896 | NO |
| 840681 | YES |
+----------+---------+
# histogram of heterozygosity:
0 ************************************************************ 883400
25 ************** 204000
50 ************* 188703
75 *********** 157404
100 ********** 143119
125 ********* 131575
150 ********* 126916
175 ********* 128585
200 ******** 123440
225 ******** 119815
250 ******** 120646
275 ******** 120239
300 ******** 122654
325 ********* 128233
350 ********* 130069
375 ********** 144699
400 ********** 152829
425 ************ 172513
450 *************** 225645
475 ********************************** 503166
500 5927
############################################################################################
### HapMap LD (Daryl; February 11, 2006)
## start from the genotypes files, run Haploview, reformat, and load
mkdir -p /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant/para
cd /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant
# wget all genotype data:
# ftp://www.hapmap.org/genotypes/2006-01/non-redundant/genotypes_chr*_*.b35.txt.gz
# Haploview had to be recompiled because there was a missing JPT sample in the ped file
##runHaploview.csh
#!/bin/csh
if ( $#argv < 2 ) then
echo "usage: $0 <absolutePath> <genotypeFileName.gz> [<javaMaxMem>]"
echo " $0 /cluster/bin/foo bar.gz 2G"
exit 1
endif
set path = $1
set file = $2
set root = $file:r
set memFlag = ""
if ( $#argv >= 3 ) then
set memFlag = "-Xmx$3"
endif
cd /scratch
/bin/cp -f $path/$file .
/bin/gunzip -f $file
/usr/java/jre1.5.0_06/bin/java -d64 $memFlag -jar /cluster/home/daryl/haploview/haploview/Haploview.jar -c -d -n -maxDistance 250 -a $root >&! $root.log
/bin/gzip -f $root.LD $root.CHECK >>& $root.log
/bin/mv -f $root.LD.gz $root.CHECK.gz $root.log $path/
/bin/rm -f $root*
###
cd para
set hv = /cluster/home/daryl/scripts/runHaploview.csh
set ldDir = /cluster/store5/gs.18/build35/bed/hapmap/genotypes/2006-01/non-redundant
foreach pop (YRI CEU CHB JPT JPT+CHB)
foreach chrom (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
echo $hv $ldDir genotypes_chr${chrom}_{$pop}.b35.txt.gz 4G >> jobList
end
end
ssh pk
# para create, para try, para push -maxNode=25 ...
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 1564915s 26081.91m 434.70h 18.11d 0.050 y
#IO & Wait Time: 21862s 364.37m 6.07h 0.25d 0.001 y
#Average job time: 13223s 220.39m 3.67h 0.15d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 40742s 679.03m 11.32h 0.47d
#Submission to last job: 104809s 1746.82m 29.11h 1.21d
#### makeDcc.pl
#!/usr/bin/perl -W
$pop = shift || die "usage: makeDcc.pl <pop> <chr>\n";
$chrom = shift || die "usage: makeDcc.pl <pop> <chr>\n";
$geno = "geno/genotypes_${chrom}_${pop}.b35.txt.gz";
$ld = "ld/genotypes_${chrom}_${pop}.b35.txt.LD.gz";
$txt = "dcc/ld_${chrom}_${pop}.b35.txt";
open(GENO,"zcat $geno | " ) || die "can't open $geno";
open(LD, "zcat $ld | " ) || die "can't open $ld";
open(TXT, " > $txt " ) || die "can't open $txt";
<GENO>;#ignore header
while (<GENO>) { @fields = split / /; $pos{$fields[0]} = $fields[3]; }
close(GENO);
<LD>;#ignore header;
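# Haploview .LD columns are assumed to be: marker1 marker2 D' LOD r^2 ...;
# note the reordering below: output is D' ($fields[2]), r^2 ($fields[4]), LOD ($fields[3])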
while (<LD>) { @fields = split /\t/; $chromStart = $pos{$fields[0]}; $chromEnd = $pos{$fields[1]};
print TXT "$chromStart $chromEnd $pop $fields[0] $fields[1] $fields[2] $fields[4] $fields[3]\n"; }
close(LD);
close(TXT);
system("gzip $txt");
####
#### makeDcc.csh
#!/bin/csh
#set path = "/cluster/home/daryl/scripts";
set path = ".";
foreach pop (CEU CHB JPT YRI JPT+CHB)
foreach chr (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo $path/makeDcc.pl $pop chr$chr
end
end
####
#### makeLdBed.pl
#!/usr/bin/perl -W
sub min ($$)
{
my $a = shift @_;
my $b = shift @_;
if ($a<$b) {return $a;}
return $b;
}
sub encodeDprime($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
elsif ($val>=0) { $ret = ord('a') + $val*9;}
else { $ret = ord('A') - $val*9;}
return chr($ret);
}
sub encodeRsquared($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
return encodeDprime($val);
}
sub encodeLod($$)
{
my $lod = shift @_;
my $dPrime = shift @_;
$ret = ord('a');
if ($lod>=2) # high LOD
{
if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D' -> pink
else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
}
elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD -> blue
return chr($ret);
}
$inDir = shift||"data";
$outDir = shift||"bed";
$foo = "";
$bar = "";
@rest = ();
@pops = ("CEU", "CHB", "JPT", "YRI", "JPT+CHB");
printf("> Starting \t" . `date` . "\n");
foreach $pop (@pops)
{
opendir(DIR, $inDir) || die "can't open $inDir";
if ($pop eq "JPT+CHB") { @hmFiles = grep {/^ld_/ && /_JPT/ && /CHB.b35.txt.gz$/} readdir(DIR); }
else { @hmFiles = grep {/^ld_/ && /_${pop}.b35.txt.gz$/} readdir(DIR); }
closedir(DIR);
printf "POP:\t$pop\t$#hmFiles\n";
foreach $hmFile (sort @hmFiles)
{
($foo, $chrom, $bar) = split /_/, $hmFile;
$chrom =~ s/chrx/chrX/;
$chrom =~ s/chry/chrY/;
$outfile = "$outDir/${pop}_${chrom}.hg17.bed";
if ((-e $outfile)||(-e "$outfile.gz")) { next; }
$tmpFile = "/tmp/${pop}_${chrom}.hg17.bed";
printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
$line = <IN>;
if (!defined $line){next;}
chomp($line);
($chromStart, $chromEnd, $pop, $name, $marker2, $dprime, $rsquared, $lod, @rest) = split / /, $line;
$ldCount = 1;
while (<IN>)
{
chomp();
($chromStartNew, $chromEndNew, $pop, $nameNew, $marker2, $dprime, $rsquared, $lod, @rest) = split / /;
if ($chromStart ne $chromStartNew)
{
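# positions in the DCC files are 1-based; BED chromStart is 0-based, hence the decrement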
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
$chromStart = $chromStartNew;
$chromEnd = $chromEndNew;
$name = $nameNew;
$ldCount = 1;
$dprimeList = encodeDprime($dprime);
$rsquaredList = encodeRsquared($rsquared);
$lodList = encodeLod($lod, $dprime);
}
elsif ($chromEndNew-$chromStartNew<250000)
{
$chromEnd = $chromEndNew;
$ldCount++;
$dprimeList .= encodeDprime($dprime);
$rsquaredList .= encodeRsquared($rsquared);
$lodList .= encodeLod($lod, $dprime);
}
}
close(IN);
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
close(OUT);
system("gzip $tmpFile");
system("mv $tmpFile.gz $outDir");
}
}
printf("> Finished \t" . `date` . "\n");
####
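# For reference, makeLdBed.pl packs one character per SNP pair: D' in [0,1]
# maps to 'a'..'j', negative D' to 'A'..'J', and r^2 uses the same positive
# mapping.  A minimal sketch of that mapping (illustrative values only):
perl -e 'sub enc {chr(ord("a") + 9*$_[0])} printf "%s %s %s\n", enc(0), enc(0.5), enc(1);'
# a e j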
#### getMax.csh -- check for consistency by chrom and population
#!/bin/csh
set out = maxDist.txt
rm -f $out
touch $out
echo this takes about 4 hours to run completely >> $out
foreach f (dcc/ld_*.b35.txt.gz)
echo -n "$f " >> $out
zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
end
#### getSizes.csh -- should all be 249999
#!/bin/csh
set out = wcList.txt
rm -f $out
touch $out
echo "this takes about 2 hours to run completely"
foreach f (dcc/*.txt.gz)
echo -n $f:r:r " " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
end
#### load.csh
#!/bin/csh
set db = hg17
sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdChbJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
# about half an hour to an hour per population
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdCeu CEU_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChb CHB_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdJpt JPT_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdYri YRI_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChbJpt JPT+CHB_chr${c}.${db}.bed.gz
end
rm -f bed.tab
###
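# (optional) spot-check row counts in the five LD tables loaded above:
foreach t (hapmapLdCeu hapmapLdChb hapmapLdJpt hapmapLdYri hapmapLdChbJpt)
    hgsql hg17 -e "select count(*) from $t"
end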
# AFFYHUEX1 TRACK (sugnet Wed Oct 5 12:16:42 PDT 2005)
mkdir hg17
cd hg17
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg17
mkdir gff beds annot
cd gff
# download gff design files
# parse gff script...
#!/usr/bin/perl -w
if(scalar(@ARGV) == 0) {
print STDERR "parseGff.pl - Parse out affymetrixes gff annotation
probesets for human all exon design.
usage:
parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
exit(1);
}
sub splitField($) {
my $l = shift(@_);
my @w = split / /, $l;
return $w[1];
}
while($file = shift(@ARGV)) {
if(!($file =~ /(.+)\.gff/)) {
die "$file doesn't have .gff suffix\n";
}
$prefix = $1;
print STDERR "Doing file $file.\n";
open(IN, $file) or die "Can't open $file to read.";
open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
while($line = <IN>) {
# Only want the probeset records.
if($line =~ /\tprobeset\t/) {
$score = 0;
$cds = 0;
$bounded = 0;
chomp($line);
# strip any Microsoft line endings.
$line =~ s/\r$//;
@words = split /\t/, $line;
# This makes the evidence comma-separated.
$words[8] =~ s/\" \"/,/g;
# This gets rid of pesky quotes.
$words[8] =~ s/\"//g;
# Set the score based on the annotation type
if($words[8] =~ /full/) {
$score = 200;
}
elsif($words[8] =~ /extended/) {
$score = 500;
}
elsif($words[8] =~ /core/) {
$score = 900;
}
if($words[8] =~ /bounded/) {
$score -= 200;
}
if($words[8] =~ /cds/) {
$score += 100;
}
if($score <= 0) {
$score = 100;
}
# Print out the annotation fields.
@fields = split /; /,$words[8];
$id = splitField($fields[1]);
$f = shift(@fields);
$f = splitField($f);
print ANNOT "$f";
while($f = shift(@fields)) {
if($f =~ /^bounded/) {
$bounded = 1;
}
if($f =~ /^cds/) {
$cds = 1;
}
if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
$f = splitField($f);
print ANNOT "\t$f";
}
}
print ANNOT "\t$bounded\t$cds";
print ANNOT "\n";
print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
}
}
close(IN);
close(BED);
close(ANNOT);
}
./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
numIndependentProbes smallint not null,
probesetId int(11) not null,
exonClustId int(11) not null,
numNonOverlapProbes smallint not null,
probeCount smallint not null,
transcriptClustId int(11) not null,
probesetType smallint not null,
numXHybeProbe smallint not null,
psrId int(11) not null,
level varchar(10) not null,
evidence varchar(255) not null,
bounded smallint not null,
cds smallint not null,
PRIMARY KEY (probesetId)
);
hg17S -A < affyHuEx1Annot.sql
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg17S -A
# end AFFYHUEX1 track
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg17/bed/affyHumanExon
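# (the original load kept the GFF's 1-based starts; BED chromStart is 0-based,
# hence the $3-1 in the awk below)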
echo "select * from affyHuEx1" | hgsql hg17 | \
tail +2 | awk 'BEGIN{OFS="\t"}{print $2,$3-1,$4,$5,$6,$7}' \
> affyHuEx1.fixed.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.fixed.bed
##########################################################################
# NSCAN composite track - (2005-09-29 markd) loaded proteins 2005-10-13
cd /cluster/data/hg17/bed/nscan/
# obtained NSCAN and NSCAN-EST predictions from michael brent's group
# at WUSTL
wget http://genome.cse.wustl.edu/predictions/human/hg17_nscan_mm5_9_14_2005/hg17_nscan_mm5_9_14_2005.tar.gz
tar -zxf hg17_nscan_mm5_9_14_2005.tar.gz
wget http://genome.cse.wustl.edu/predictions/human/NCBI35_NSCAN_EST_4-16-2005.tar
gzip -9 NCBI35_NSCAN_EST_4-16-2005.tar
# change protein fasta file to have transcript id in header
foreach f (chr_ptx/*.ptx)
awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
foreach f (NCBI35_NSCAN_EST_4-16-2005/chr_ptx/*.ptx)
awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
# load tracks. Note that these have *utr features, rather than
# exon features.  Currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -gtf -genePredExt hg17 nscanGene chr_gtf/chr*.gtf
hgPepPred hg17 generic nscanPep chr_ptx/chr*.fix
rm -rf chr_* *.tab
ldHgGene -gtf -genePredExt hg17 nscanEstGene NCBI35_NSCAN_EST_4-16-2005/chr_gtf/chr*.gtf
hgPepPred hg17 generic nscanEstPep NCBI35_NSCAN_EST_4-16-2005/chr_ptx/chr*.fix
rm -rf NCBI35_NSCAN_EST_4-16-2005 *.tab
# update trackDb; need a hg17-specific page to describe informants
human/hg17/nscan.html
human/hg17/trackDb.ra
##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
# Submitted by Greg Crawford via web site,
# http://research.nhgri.nih.gov/DNaseHS/May2005/
# In addition, a file containing the 'randoms' was FTP'ed by Greg
# Submitted for hg16 -- lifted to hg17.
# Details of hg16 data prep are in makeHg16.doc
mkdir /cluster/data/hg17/bed/nhgri
cd /cluster/data/hg17/bed/nhgri
cp /cluster/data/hg16/bed/nhgri/hs.bed hs.hg16.bed
liftOver hs.hg16.bed /gbdb/hg16/liftOver/hg16ToHg17.over.chain \
hs.hg17.bed hs.unmapped
grep '^chr' hs.unmapped | wc -l
# 8 unmapped
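# (liftOver's .unmapped output pairs a '#reason' comment line with each failed
# record, so the grep '^chr' above counts only the records themselves)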
hgLoadBed hg17 nhgriDnaseHs hs.hg17.bed
# Loaded 14216 elements of size 5
checkTableCoords hg17 nhgriDnaseHs
##########################################################################
# UPDATE WGRNA TRACK (DONE, 2005-10-20, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2005-10-20
cd wgRna-2005-10-20
# Received the data file, wgtrack_no_bin_oct2005.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg17/bed/wgRna-2005-10-20.
cp wgtrack_no_bin_oct2005.txt wgRna.tab
vi wgRna.tab
# edit wgRna.tab to take out the first line of data field labels.
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to 8/24/05 data, a few records were changed.
##########################################################################
# REBUILD hg17.gnfAtlas2Distance TABLE. SOMEHOW IT HAD MUCH FEWER RECORDS. (DONE 10/27/05, Fan)
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32458000
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 10/29/05 JK)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg17
zcat /cluster/data/mm7/bed/bedOver/mm7.hg17.over.chain.gz \
| pslMap -chainMapFile -swapMap \
/cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \
| sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg17/nib \
/cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl
# Load the database
hgsql hg17 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg17 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg17 allenBrainAli.psl
mkdir /gbdb/hg17/allenBrain
ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg17/allenBrain/allProbes.fa
hgLoadSeq hg17 /gbdb/hg17/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg17 allenBrainAli -type=psl knownGene knownToAllenBrain
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 11/07/05 JK)
# Make directory on san for cluster job and copy in sequence
ssh pk
mkdir /san/sanvol1/scratch/hg17/nibbPics
cd /san/sanvol1/scratch/hg17/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /san/sanvol1/scratch/hg17/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > rna.lst
# Create parasol gensub template file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst rna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 46 of 46 jobs
#CPU time in finished jobs: 11818s 196.97m 3.28h 0.14d 0.000 y
#IO & Wait Time: 145s 2.41m 0.04h 0.00d 0.000 y
#Average job time: 260s 4.33m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1022s 17.03m 0.28h 0.01d
#Submission to last job: 1060s 17.67m 0.29h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's/..\/..\/nib\/chr/chr/' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg17/nibbPics/nibbImageProbes.psl .
# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg17/nibbImageProbes.fa
hgLoadSeq hg17 /gbdb/hg17/nibbImageProbes.fa
hgLoadPsl hg17 nibbImageProbes.psl
###########################################################################
# EXONIPHY WITH DOG (acs, 11/22/05) -- MM7, RN3, CANFAM2, HG17
# first build 4-way multiz alignment from syntenic nets (helps reduce
# false positive predictions due to paralogous alignments)
# (prepare mafNet files from syntenic nets and copy to
# /cluster/bluearc/hg17/mafNetSyn; do this for mm7, rn3, canFam2,
# and galGal2)
# make output dir and run dir
ssh pk
cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2
mkdir -p mafSyn runSyn
cd runSyn
# create scripts to run multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/$user/multiz.hg17Mm7Rn3CanFam2.$c
set pairs = /cluster/bluearc/hg17/mafNetSyn
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
rm -fr $multi
exit
endif
# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
cp $multi/$2/$c.maf $3
exit
endif
set s1 = $2
set s2 = $3
set flag = $4
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
endif
if (-d $pairs/$s2) then
set d2 = $pairs
endif
set f1 = $d1/$s1/$c.maf
set f2 = $d2/$s2/$c.maf
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out
# check for empty input file
if (-s $f1 && -s $f2) then
echo "Aligning $f1 $f2 $flag"
/cluster/bin/penn/multiz.v10.5 $f1 $f2 $flag > $out/$c.tmp.maf
echo "Ordering $c.maf"
/cluster/bin/penn/maf_project $out/$c.tmp.maf hg17.$c > $out/$c.maf
else if (-s $f1) then
cp $f1 $out
else if (-s $f2) then
cp $f2 $out
endif
'EOF'
# << for emacs
chmod +x oneMultiz.csh
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
oneMultiz.csh $c mm7 rn3 0
oneMultiz.csh $c mm7rn3 canFam2 1
# get final alignment file
oneMultiz.csh $c mm7rn3canFam2 /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf
#cleanup
oneMultiz.csh $c
'EOF'
# << for emacs
chmod +x allMultiz.csh
cat > gsub << 'EOF'
#LOOP
allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst
set path = (/parasol/bin $path);rehash
gensub2 chrom.lst single gsub jobList
para create jobList
# 46 jobs
para try; para check
para push
# build chromosome-by-chromosome SS files
cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2
mkdir run-ss-syn
cd run-ss-syn
mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn
cat > makeSS.csh << 'EOF'
#!/bin/csh -fe
set c = $1
/cluster/bin/phast/msa_view -i MAF -o SS /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa | gzip -c > /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$c.ss.gz
'EOF'
# << for emacs
chmod +x makeSS.csh
rm -f jobList
foreach chr (`cut -f 1 /cluster/data/hg17/chrom.sizes`)
echo "makeSS.csh $chr" >> jobList
end
para create jobList
# 46 jobs
para try; para check
para push
# now train hmm, with indel model
# note: commands below require bash
# first get a clean set of genes for training (with --indel-strict)
mkdir -p /cluster/data/hg17/bed/exoniphy/train
cd /cluster/data/hg17/bed/exoniphy/train
mkdir -p stats genes
CHROMS="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22"
for chr in ${CHROMS} ; do
echo $chr
zcat /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss.gz | clean_genes genes/refGene.$chr.gff - --stats stats/$chr.stats --conserved --indel-strict --groupby exon_id --offset3 4 --offset5 4 > genes/refGene.$chr.clean.gff
done
# get conserved noncoding seqs and add to GFFs
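# (note the $2+1 in the awk below: BED starts are 0-based, GFF starts are 1-based)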
mkdir -p cns
for chr in ${CHROMS} ; do
echo $chr
featureBits -bed=cns/$chr.bed -chrom=$chr hg17 phastConsElementsPaper \!knownGene:exon:100 \!refGene:exon:100 \!mrna \!ensGene \!intronEst \!twinscan
cp genes/refGene.$chr.clean.gff genes/refGene.$chr.withCNS.gff
awk '{printf "%s\tphastCons\tCNS\t%d\t%d\t.\t.\t.\texon_id \"CNS.%s\"\n", $1, $2+1, $3, $4}' cns/$chr.bed >> genes/refGene.$chr.withCNS.gff
done
# now train HMM
# note: actually have to unzip SS files before this step
rm -f alns gffs
for chr in ${CHROMS} ; do
echo /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss >> alns
echo genes/refGene.$chr.withCNS.gff >> gffs
done
hmm_train -m '*alns' -c ~/phast/data/exoniphy/default.cm -g '*gffs' -R exon_id -i SS -I CDS,background,CNS,5\'splice,3\'splice,prestart -t "((hg17,(mm7,rn3)),canFam2)" > indels.hmm
# training complete; now run exoniphy genome-wide
# first need to split up alignments
mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-split
cd /cluster/data/hg17/bed/exoniphy/test/run-split
cat > doSplit.csh << 'EOF'
#!/bin/csh -fe
set c = $1
mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c
/cluster/bin/phast/msa_split /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa -i MAF --windows 100000,0 --between-blocks 5000 --min-informative 1000 --out-format SS --out-root /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c --tuple-size 3
gzip /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c*.ss
'EOF'
# << for emacs
chmod +x doSplit.csh
rm -f jobList
for file in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/chr*.maf ; do echo doSplit.csh `basename $file .maf` >> jobList ; done
para create jobList
# 43 jobs
para try; para check
para push
# now set up exoniphy run
mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-exoniphy
cd /cluster/data/hg17/bed/exoniphy/test/run-exoniphy
cp -p ../../train/indels.hmm /cluster/bluearc/hg17/exoniphy/training
mkdir -p /cluster/bluearc/hg17/exoniphy/GFF
cat > doExoniphy.sh << 'EOF'
#!/usr/local/bin/bash
root=`basename $1 .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
no=`echo $root | awk 'BEGIN{FS="[-.]"} {printf "%d\n", ($2+10000)/100000}'`
if [ ! -d /cluster/bluearc/hg17/exoniphy/GFF/$chrom ] ; then
mkdir -p /cluster/bluearc/hg17/exoniphy/GFF/$chrom
fi
zcat $1 | /cluster/bin/phast/exoniphy - --hmm /cluster/bluearc/hg17/exoniphy/training/indels.hmm --reflect-strand --extrapolate default --score --indels --alias "hg17=human; mm7=mouse; rn3=rat; canFam2=dog" --seqname $chrom --idpref $chrom.$no > /cluster/bluearc/hg17/exoniphy/GFF/$chrom/$root.gff
'EOF'
# << for emacs
chmod +x doExoniphy.sh
rm -f jobList
for dir in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/* ; do find $dir -name '*.ss.gz' | awk '{printf "doExoniphy.sh %s\n", $1}' >> jobList ; done
para create jobList
# 27070 jobs
para try; para check
para push
#Completed: 27059 of 27070 jobs
#Crashed: 11 jobs
#CPU time in finished jobs: 8573545s 142892.41m 2381.54h 99.23d 0.272 y
#IO & Wait Time: 73412s 1223.54m 20.39h 0.85d 0.002 y
#Average job time: 320s 5.33m 0.09h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 593s 9.88m 0.16h 0.01d
#Submission to last job: 22823s 380.38m 6.34h 0.26d
# crashed jobs all on random chroms, chrM, etc., and appear to be
# due to all species not being present; okay to ignore
# collect predictions and create track
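# (the files.* batching below keeps each cat invocation safely under the
# shell's argument-list limit; some chroms have thousands of GFF fragments)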
rm -f exoniphy.gff
for dir in /cluster/bluearc/hg17/exoniphy/GFF/chr* ; do \
rm -f files.* tmp.gff ;\
find $dir -name "chr*.gff" > files ;\
split -l 1000 files files. ;\
for l in files.* ; do cat `cat $l` >> tmp.gff ; done ;\
refeature --sort tmp.gff >> exoniphy.gff ;\
done
ldHgGene -genePredExt -gtf hg17 exoniphyDog exoniphy.gff
##########################################################################
# COW SYNTENY (Done, Heather, Dec. 2005)
# Data from Harris A. Lewin <h-lewin@uiuc.edu>
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir syntenyCow
cd syntenyCow
hgLoadBed -noBin hg17 syntenyCow syntenyCow.bed
# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra
###########################################################################
# New Conservation track (WORKING 2005-12-15 kate)
# Pairwise alignments needed for: monDom2, danRer3, bosTau2
# Use existing alignments for:
# macaque_rheMac1
# rat_rn3
# mouse_mm7
# dog_canFam2
# chicken_galGal2
# xenopus_xenTro1
# fugu_fr1
# rabbit_oryCun1
# armadillo_dasNov1
# elephant_loxAfr1
# tenrec_echTel1
# tetraodon_tetNig1
#########################################################################
# BLASTZ danRer3 (DONE - 2005-12-20 kate)
# Includes both randoms
ssh pk
mkdir /cluster/data/hg17/bed/blastz.danRer3.2005-12-20
cd /cluster/data/hg17/bed
ln -s blastz.danRer3.2005-12-20 blastz.danRer3
cd blastz.danRer3
cat << 'EOF' > DEF
# human target, zebrafish query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for hg16-fr1, danrer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: Human hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/cluster/bluearc/hg17/linSpecRep.notInZebrafish
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
# QUERY: zebrafish danRer3
# Use all chroms, including both randoms (chrUn and chrNA)
SEQ2_DIR=/san/sanvol1/scratch/danRer3/nib
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=1000
BASE=/cluster/data/hg17/bed/blastz.danRer3.2005-12-20
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net \
`pwd`/DEF >& blastz.out &
# mistakenly started this in blastz.danRer3.2005-12-18 dir --
# need to move DEF file and blastz.out to 2005-12-20 dir.
# bogus stop at net step -- thinks it can't find chains
# I'm just restarting there
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net \
`pwd`/DEF >& blastz.2.out &
# stopped because vsDanRer3 downloads already there from
# previous run.
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/hg17/vsDanRer3"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF >& blastz.3.out &
# measurements
ssh hgwdev "featureBits hg17 chainDanRer2Link" >& danRer2.fb; cat danRer2.fb
# 70696998 bases of 2866216770 (2.467%) in intersection
ssh hgwdev "featureBits hg17 chainDanRer3Link" >& danRer3.fb; cat danRer3.fb
# 55625762 bases of 2866216770 (1.941%) in intersection
# not sure why there's lower coverage from the newer assembly.
# It's possibly due to different parameters used in the other
# alignment. Rachel is experimenting with hg18/danRer3, and
# if warranted, we might replace this later
#########################################################################
# BLASTZ bosTau2 (DONE - 2005-12-19 kate)
ssh pk
mkdir /cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
cd /cluster/data/hg17/bed
rm blastz.bosTau2
ln -s blastz.bosTau2.2005-12-19 blastz.bosTau2
cd blastz.bosTau2
cat << 'EOF' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.x86_64
# using the parameter recommended when not using lineage-specific repeat
# abridging.  This parameter restricts the number of matches used by
# dynamic masking. (We can't currently use LSR repeat abridging
# when either assembly sequence is in .2bit).
BLASTZ_M=50
# TARGET: Human (hg17)
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
# chunk it as we can't do whole-genome on 2bits
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=10000
BASE=/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF >& blastz.out &
ssh hgwdev "featureBits hg17 chainBosTau1Link" >& bosTau1.fb; cat bosTau1.fb
ssh hgwdev "featureBits hg17 chainBosTau2Link" >& bosTau2.fb; cat bosTau2.fb
# swapping to get the lift over file in the other direction (Hiram)
ssh pk
mkdir /cluster/data/bosTau2/bed/blastz.hg17.swap
cd /cluster/data/bosTau2/bed
ln -s blastz.hg17.swap blastz.hg17
cd blastz.hg17.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19/DEF > swap.out 2>&1 &
# this failed during the load of the tables, but that is OK, we
# just wanted the liftOver files from this
# manually cleaned this up since the run failed during the MySQL
# load due to out-of-space problems.  These tables do not need to
# be loaded anyway.
ssh kkstore02
cd /cluster/data/bosTau2/bed/blastz.hg17.swap
rm -fr psl/
rm -fr axtChain/run/chain/
rm -f axtChain/noClass.net
rm -fr axtChain/net/
rm -fr axtChain/chain/
#########################################################################
# BLASTZ rheMac2 (2006-02-08 kate)
ssh pk
mkdir /cluster/data/hg17/bed/blastz.rheMac2.2006-02-08
cd /cluster/data/hg17/bed
ln -s blastz.rheMac2.2006-02-08 blastz.rheMac2
cd blastz.rheMac2
cat << 'EOF' > DEF
# macaca mulatta vs. hg17
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# TARGET - hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
# QUERY - macaca mulatta
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes
BASE=/san/sanvol1/scratch/hg17/blastz.rheMac2/
RAW=$BASE/raw
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF >& blastz.out &
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainRun \
`pwd`/DEF >& continueChainRun.out &
# NOTE: must set -fileServer (e.g. to pk) if using base dir on SAN
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -fileServer=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=chainMerge \
`pwd`/DEF >& continueChainMerge.out &
# netClass was crashing as it expected a bin in the
# unsplit gap table. Robert added the bin field.
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -fileServer=pk \
-continue=download \
`pwd`/DEF >& continueDownload.out &
ssh hgwdev "featureBits hg17 chainRheMac1Link" >& rheMac1.fb; cat rheMac1.fb
ssh hgwdev "featureBits hg17 chainRheMac2Link" >& rheMac2.fb; cat rheMac2.fb
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.rheMac2
cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/rheMac2
# SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# (DONE, 2006-03-22, hartera)
# Do the swap of hg17/rheMac2 alignments over to rheMac2 to produce
# rheMac2/hg17 alignments.
ssh pk
cd /cluster/data/hg17/bed/blastz.rheMac2
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF >& swap.log &
# Took about 3 hours 40 minutes to run.
#############################################################################
# 17-WAY MULTIZ ALIGNMENTS (DONE - 2005-12-20 kate)
# # redo fix overlaps from xenTro1 and tetNig1 (2006-04-08 kate)
# copy net mafs to cluster-friendly storage for multiz run (2006-01-25 kate)
ssh kkstore01
cd /cluster/data/hg17/bed/blastz.monDom2
cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/monDom2
ssh kkstore02
cd /cluster/data/hg17/bed
mkdir -p multiz17way.2005-12-20
ln -s multiz17way.2005-12-20 multiz17way
cd multiz17way
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
# canFam2, fr1, galGal2, panTro1, rn3
mkdir -p /san/sanvol1/scratch/hg17/mafNet
cd /san/sanvol1/scratch/hg17/mafNet
ln -s /cluster/bluearc/hg17/mafNet/{*} .
# copy others
foreach s (rheMac1 oryCun1 dasNov1 \
loxAfr1 bosTau2 monDom1 xenTro1 tetNig1 danRer3)
echo $s
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
end
# a few more
set s = echTel1
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
set s = mm7
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
set s = canFam2
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
set s = rheMac2
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
# thanks for the tree, Hiram! Taken from mm7 17way...
# Hiram says this is derived from the latest ENCODE
# tree, with some species removed and branch lengths
# adjusted. The ENCODE tree from the Sept. freeze is:
# ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/phylo/tree_4d.tba.v2.nh
cd /cluster/data/hg17/bed/multiz17way
cat << '_EOF_' > 17way.nh
(((((((((
(human_hg17:0.006690,chimp_panTro1:0.007571):0.024272,
macaque_rheMac2:0.0592):0.023960,
((rat_rn3:0.081728,mouse_mm7:0.077017):0.229273,
rabbit_oryCun1:0.206767):0.1065):0.023026,
(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
armadillo_dasNov1:0.149862):0.015994,
(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
monodelphis_monDom2:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro1:0.782453):0.156067,
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
zebrafish_danRer3:0.782561):0.156067);
'_EOF_'
/cluster/bin/phast/draw_tree 17way.nh > 17way.ps
/cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
grep hg17 17way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
# edit distances.txt to include featureBits, and chain parameters
# from blastz run.
cat distances.txt
# 0.0143 chimp_panTro1
# 0.0902 macaque_rheMac2
# 0.2563 armadillo_dasNov1
# 0.2651 dog_canFam2
# 0.2677 elephant_loxAfr1
# 0.2766 cow_bosTau2
# 0.3682 rabbit_oryCun1
# 0.4226 tenrec_echTel1
# 0.4677 mouse_mm7
# 0.4724 rat_rn3
# use loose chain params and score from here, down (5000)
# 0.7119 monodelphis_monDom2
# 0.9847 chicken_galGal2
# 1.4357 xenopus_xenTro1
# 1.6577 tetraodon_tetNig1
# 1.6983 fugu_fr1
# 1.7480 zebrafish_danRer3
# the order in the browser display will be by tree topology,
# not by distance, so it will be:
# >> # 0.0143 chimp_panTro1
# >> # 0.0902 macaque_rheMac2
# >> # 0.4677 mouse_mm7
# >> # 0.4724 rat_rn3
# >> # 0.3682 rabbit_oryCun1
# >> # 0.2651 dog_canFam2
# >> # 0.2766 cow_bosTau2
# >> # 0.2563 armadillo_dasNov1
# >> # 0.2677 elephant_loxAfr1
# >> # 0.4226 tenrec_echTel1
# >> # 0.7119 monodelphis_monDom2
# >> # 0.9847 chicken_galGal2
# >> # 1.4357 xenopus_xenTro1
# >> # 1.6577 tetraodon_tetNig1
# >> # 1.6983 fugu_fr1
# >> # 1.7480 zebrafish_danRer3
# make output dir and run dir
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
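# quick check -- species.lst should be a single space-separated line:
cat species.lst
# hg17 panTro1 rheMac2 rn3 mm7 oryCun1 bosTau2 canFam2 dasNov1 loxAfr1 echTel1 monDom2 galGal2 xenTro1 tetNig1 fr1 danRer3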
mkdir -p maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = hg17
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == hg17) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz17way.2005-12-20/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/hg17/chrom.sizes > chrom.lst
# REDO FOR OVERLAPS (2006-04-07 kate)
mv ../maf ../maf.old
# edit spec file to fix maf dir path
gensub2 chrom.lst single spec jobList
para create jobList
# 46 files
para try
para check
para push
para time > run.time
# 36 hrs (not typical -- previous runs were ~16 hrs)
# PHASTCONS CONSERVATION (2006-01-05 kate)
# Redone when multiz redone to fix overlaps (2006-04-12)
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
#
# NOTE: Redone 3/20/06, adding rheMac2 to non-informative options,
# by recommendation of Adam Siepel, to correct unwanted
# high conservation in regions with primate-only alignments
# NOTE: reusing cluster-friendly chrom fasta files created earlier
#cd /cluster/data/hg17
#foreach f (`cat chrom.lst`)
#echo $f
#cp $f/*.fa /cluster/bluearc/hg17/chrom
#end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes from previous runs -- this
# reduces run-time on the split, slows down the actual phastCons
# enough so jobs don't crash (jobs are very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
# edit, changing rheMac1 -> rheMac2
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/hg17/bed/multiz17way.2005-12-20/maf
set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/hg17/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 46 jobs
para try
para check
para push
# CPU time in finished jobs: 9511s 158.52m 2.64h 0.11d 0.000 y
# IO & Wait Time: 5391s 89.85m 1.50h 0.06d 0.000 y
# Average job time: 324s 5.40m 0.09h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2354s 39.23m 0.65h 0.03d
# Submission to last job: 2358s 39.30m 0.66h 0.03d
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
# he ok'ed the results -- not necessary for next human run
ssh kolossus
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss/chr7/chr7.115000658-120000000.ss \
-o phyloFit.tree
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cd ..
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
# happy emacs
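# (the template args map to the doPhast.csh params above:
# expected-length=14, target-coverage=.008, rho=.28)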
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 333 jobs
para try
para check
para push
# NOTE: these jobs go fast -- some crashed apparently having
# difficulty accessing input files. Just restart them and
# they work
#CPU time in finished jobs: 15520s 258.67m 4.31h 0.18d 0.000 y
#IO & Wait Time: 15796s 263.27m 4.39h 0.18d 0.001 y
#Average job time: 94s 1.57m 0.03h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 180s 3.00m 0.05h 0.00d
#Submission to last job: 48266s 804.43m 13.41h 0.56d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir)
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons
hgLoadBed -strict hg17 phastConsElements17way mostConserved.bed
# Loaded 2212445 elements of size 5
# compare with previous tracks
hgsql hg17 -e "select count(*) from phastConsElements10way"
# 2011952
hgsql hg17 -e "select count(*) from phastConsElements"
# 1601903
# Try for 5% overall cov, and 70% CDS cov (used elen=14, tcov=.008, rho=.28)
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.065%, phastConsElements17way 5.116%, both 0.759%, cover 71.27%, enrich 13.93x
# compare with previous tracks
featureBits hg17 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg17 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x
# experiments
# previous tracks
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements
# refGene:cds 0.873%, phastConsElements 4.497%, both 0.630%, cover 72.10%, enrich 16.04x
hgsql hg17 -e "select count(*) from phastConsElements where chrom='chr7'"
# 81785
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements10way
# refGene:cds 0.873%, phastConsElements10way 4.700%, both 0.602%, cover 68.94%, enrich 14.67x
hgsql hg17 -e "select count(*) from phastConsElements10way where chrom='chr7'"
# 102959
# len=13, cov=.007, rho=.27
# looks best -- similar chr7 measurements to previous tracks
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_007_27
# refGene:cds 0.874%, phastConsElements17way_13_007_27 4.854%, both 0.607%, cover 69.43%, enrich 14.31x
hgsql hg17 -e "select count(*) from phastConsElements17way_13_007_27 where chrom='chr7'"
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_005_28
# refGene:cds 0.873%, phastConsElements17way_13_005_28 4.802%, both 0.612%, cover 70.12%, enrich 14.60x
hgsql hg17 -e "select count(*) from phastConsElements17way_13_005_28 where chrom='chr7'"
# 95203
# experiments with other parameters, below
# len=15, cov=.10
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_10
# refGene:cds 0.873%, phastConsElements17way 7.989%, both 0.627%, cover 71.77%, enrich 8.98x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_10 where chrom='chr7'"
# 217767
# => too much overall covg, and too many elements
# len=15, cov=.05
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_05
# refGene:cds 0.873%, phastConsElements17way_15_05 6.880%, both 0.627%, cover 71.77%, enrich 10.43x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_05 where chrom='chr7'"
# 166868
# len=15, cov=.01
# These values were used by Elliott for ENCODE
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_01
# refGene:cds 0.873%, phastConsElements17way_15_01 5.721%, both 0.628%, cover 71.89%, enrich 12.57x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_01 where chrom='chr7'"
# 106034
# len=20, cov=.01
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_01
# refGene:cds 0.873%, phastConsElements17way_20_01 7.751%, both 0.634%, cover 72.56%, enrich 9.36x
hgsql hg17 -e "select count(*) from phastConsElements17way_20_01 where chrom='chr7'"
# 106005
# -> wrong direction on coverage
# len=10, cov=.01
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_01
# refGene:cds 0.873%, phastConsElements17way_10_01 4.653%, both 0.616%, cover 70.48%, enrich 15.15x
hgsql hg17 -e "select count(*) from phastConsElements17way_10_01 where chrom='chr7'"
# 108279
# => looks good on coverage and element count, check smoothness in browser
# => undersmoothed
# len=10, cov=.05
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_05
# refGene:cds 0.873%, phastConsElements17way_10_05 5.365%, both 0.615%, cover 70.44%, enrich 13.13x
hgsql hg17 -e "select count(*) from phastConsElements17way_10_05 where chrom='chr7'"
# 178372
# => fragmented elements
# len=15, cov=.005
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_005
# refGene:cds 0.873%, phastConsElements17way_15_005 5.444%, both 0.628%, cover 71.93%, enrich 13.21x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_005 where chrom='chr7'"
# 90855
# len=20, cov=.005
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_005
# refGene:cds 0.873%, phastConsElements17way_20_005 7.373%, both 0.634%, cover 72.61%, enrich 9.85x
hgsql hg17 -e "select count(*) from phastConsElements17way_20_005 where chrom='chr7'"
# 91858
# len=17, cov=.005 rho=.3
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_17_005
# refGene:cds 0.873%, phastConsElements17way_17_005 6.126%, both 0.631%, cover 72.24%, enrich 11.79x
hgsql hg17 -e "select count(*) from phastConsElements17way_17_005 where chrom='chr7'"
# 91243
# len=12, cov=.01, rho=.28 -panTro1
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_12_01_28_p
# refGene:cds 0.873%, phastConsElements17way_12_01_28_p 4.829%, both 0.612%, cover 70.02%, enrich 14.50x
hgsql hg17 -e "select count(*) from phastConsElements17way_12_01_28_p where chrom='chr7'"
# 123638
# len=13, cov=.01, rho=.25 -panTro1
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_01_25_p
# refGene:cds 0.873%, phastConsElements17way_13_01_25_p 4.793%, both 0.594%, cover 67.99%, enrich 14.19x
hgsql hg17 -e "select count(*) from phastConsElements17way_13_01_25_p where chrom='chr7'"
# 131895
# len=14, cov=.008, rho=.28
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_14_008_28
# refGene:cds 0.874%, phastConsElements17way_14_008_28 5.227%, both 0.615%, cover 70.37%, enrich 13.46x
hgsql hg17 -e "select count(*) from phastConsElements17way_14_008_28 where chrom='chr7'"
# 106071
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
# GOT HERE ON REDO
# NOTE: remember to flip /gbdb link from cons.old to cons
#foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
#echo $chr
set chr = chr22
cat `ls -1 pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice wigEncode stdin phastCons17wayNewChr22.wig phastCons17wayNewChr22.wib
#end
date
cp -p phastCons17way.wi? /cluster/data/hg17/bed/multiz17way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons
ln -s /cluster/data/hg17/bed/multiz17way/cons/phastCons17way.wib \
/gbdb/hg17/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
phastCons17way phastCons17way.wig
############################################################################
## Run phastCons on Placental mammals
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
mkdir placental
mkdir run.cons.alt
cd run.cons.alt
# create pruned trees
set tree_doctor = /cluster/bin/phast/tree_doctor
sed 's/ /,/g' ../../species.lst
# hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3
mkdir placental
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
> placental/placental.mod
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# << emacs happy
chmod a+x doPhast.csh
# Create gsub file
cat > template << 'EOF'
#LOOP
# template for 5% cov
doPhast.csh $(root1) $(file1) 14 .2 .28 placental
#ENDLOOP
'EOF'
cat > template << 'EOF'
#LOOP
# template same as vertebrate
doPhast.csh $(root1) $(file1) 14 .008 .28 placental
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons.alt/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 333 jobs
para try
para check
para push
#.2
#CPU time in finished jobs: 15164s 252.74m 4.21h 0.18d 0.000 y
#IO & Wait Time: 14852s 247.53m 4.13h 0.17d 0.000 y
#Average job time: 90s 1.50m 0.03h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 170s 2.83m 0.05h 0.00d
#Submission to last job: 86364s 1439.40m 23.99h 1.00d
#.008
#CPU time in finished jobs: 13712s 228.53m 3.81h 0.16d 0.000 y
#IO & Wait Time: 14407s 240.12m 4.00h 0.17d 0.000 y
#Average job time: 84s 1.41m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 159s 2.65m 0.04h 0.00d
#Submission to last job: 5291s 88.18m 1.47h 0.06d
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir)
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons/placental
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/placental
hgLoadBed -strict hg17 phastConsElementsPlacental mostConserved.bed
# .2
# Loaded 3775983 elements of size 5
# .008
# Loaded 1290060 elements of size 5
# compare with vertebrate cons
hgsql hg17 -e "select count(*) from phastConsElements17way"
# 2212445
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
featureBits hg17 -enrichment refGene:cds phastConsElements17way
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
# refGene:cds 1.070%, phastConsElementsPlacental 3.844%, both 0.667%, cover 62.32%, enrich 16.21x
# refGene:cds 1.069%, phastConsElementsPlacental_14_008_28 3.844%, both 0.667%, cover 62.37%, enrich 16.22x
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
#refGene:cds 1.070%, phastConsElementsPlacental_14_2_28 5.223%, both 0.691%, cover 64.62%, enrich 12.37x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
#refGene:cds 1.070%, phastConsElements17way 5.116%, both 0.763%, cover 71.27%, enrich 13.93x
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastConsPlacental.wig phastConsPlacental.wib
# about 23 minutes for above
# GOT HERE ON REDO
# NOTE: remember to flip /gbdb link from cons.old to cons
cp -p phastConsPlacental.wi? \
/cluster/data/hg17/bed/multiz17way/cons/placental
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/placental
ln -s \
/cluster/data/hg17/bed/multiz17way/cons/placental/phastConsPlacental.wib \
/gbdb/hg17/multiz17way/phastConsPlacental.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
phastConsPlacental phastConsPlacental.wig
############################################################################
## Run phastCons on subgroups (mammals, placentals, and w/o low-cov)
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
mkdir run.cons.groups
cd run.cons.groups
# create pruned trees
set tree_doctor = /cluster/bin/phast/tree_doctor
sed 's/ /,/g' ../../species.lst
# hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3 \
> vertebrate-high.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2 \
> mammal-high.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2 \
> placental-high.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2 \
> mammal.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
> placental.mod
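# (optional sanity check: eyeball the TREE: line in each pruned .mod,
# e.g. "grep TREE placental.mod", to confirm only the requested
# species remain)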
foreach f (*.mod)
mkdir $f:r
mv $f $f:r
end
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
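# so a job line expands to, e.g. (hypothetical chunk):
#   doPhast.csh chr7 chr7.10 14 .2 .28 placental
# i.e. chrom, ss chunk, expected length, target coverage, rho, group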
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .21 .28 placental-high
doPhast.csh $(root1) $(file1) 14 .2 .28 placental
doPhast.csh $(root1) $(file1) 14 .11 .28 mammal
doPhast.csh $(root1) $(file1) 14 .1 .28 mammal-high
doPhast.csh $(root1) $(file1) 14 .0028 .28 vertebrate-high
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch for just chr7 (for test purposes) and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr7/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons.groups/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 80 jobs
para try
para check
para push
# 24 minutes
## create Alt Most Conserved track
ssh hgwdev
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cat > loadAltElements.csh << 'EOF'
set b = /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin \
> $b/$d/mostConserved.bed
set table = `echo $d | perl -wpe 's/(.*)/phastConsElements\u$1/;s/-(.*)/\u$1/'`
hgLoadBed -strict hg17 $table $b/$d/mostConserved.bed
featureBits hg17 -enrichment refGene:cds -chrom=chr7 $table
cd ..
end
'EOF'
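# (the perl expression camel-cases the directory name into a table
# name, e.g.:
#   echo mammal-high | perl -wpe 's/(.*)/phastConsElements\u$1/;s/-(.*)/\u$1/'
# prints phastConsElementsMammalHigh)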
csh loadAltElements.csh >&! loadAltElements.log &
grep refGene loadAltElements.log | sort -n -k4
# refGene:cds 0.884%, phastConsElementsPlacentalHigh 4.828%, both 0.606%, cover 68.51%, enrich 14.19x
# refGene:cds 0.884%, phastConsElementsMammal 4.869%, both 0.580%, cover 65.62%, enrich 13.48x
# refGene:cds 0.884%, phastConsElementsMammalHigh 4.887%, both 0.624%, cover 70.60%, enrich 14.45x
# refGene:cds 0.884%, phastConsElementsPlacental 4.904%, both 0.558%, cover 63.14%, enrich 12.88x
# refGene:cds 0.884%, phastConsElementsVertebrateHigh 4.965%, both 0.652%, cover 73.74%, enrich 14.85x
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way
# refGene:cds 0.884%, phastConsElements17way 4.851%, both 0.623%, cover 70.48%, enrich 14.53x
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > makeAltWiggle.csh << 'EOF'
set b = `pwd`
set san = /san/sanvol1/scratch/hg17/multiz17way/cons
pushd $san
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
echo $table
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin $table.wig $table.wib
mv $table.wig $table.wib $b/$d
cd ..
end
popd
'EOF'
csh makeAltWiggle.csh >&! makeAltWiggle.log &
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > loadAltWiggle.csh << 'EOF'
set b = `pwd`
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
echo $table
ln -s `pwd`/$table.wib /gbdb/hg17/multiz17way
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 $table $table.wig
cd ..
end
'EOF'
csh loadAltWiggle.csh >&! loadAltWiggle.log &
# Downloads (2006-02-22 kate)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg17 multiz17way up.bed upstream$i.maf \
-orgs=../species.lst
nice gzip upstream$i.maf
rm up.bed
end
date
'EOF'
time csh mafFrags.csh >&! mafFrags.log &
# ~1 hour
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
set c = $f:t:r
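# (:t strips the directory, :r the .maf extension,
#  e.g. ../maf/chr1.maf -> chr1)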
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh >&! downloads.log
# ~2 hours
# GOT HERE
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg17/multiz17way
mkdir $dir
ln -s /cluster/data/hg17/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir
# edit README
# PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-03-20 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/pp
foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg17/bed/multiz17way/phastConsDownloads/$chr.gz
end
date
'EOF'
csh downloads.csh >&! downloads.log &
# ~20 minutes
# << happy emacs
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/phastConsDownloads
md5sum *.gz > md5sum.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way
mkdir $dir
ln -s /cluster/data/hg17/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons/README.txt $dir
# edit
# UPDATE MONKEY DOWNLOADS (2006-01-12 kate)
# EXTRACT AXT'S AND MAF'S FROM THE RheMac1 NET
# The chr1 was hugely oversized -- the others were OK, but
# the axt's were numbered oddly.
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.rheMac1/axtChain
gunzip -c hg17.rheMac1.net.gz | netSplit stdin humanNet
gunzip -c hg17.rheMac1.all.chain.gz | chainSplit chain stdin
mkdir ../axtNet.new ../mafNet.new
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/rheMac1/rheMac1.2bit stdout | axtSort stdin ../axtNet.new/$c.axt
axtToMaf ../axtNet.new/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/rheMac1/chrom.sizes \
../mafNet.new/$c.maf -tPrefix=hg17. -qPrefix=rheMac1.
end
cp -rp ../mafNet.new /san/sanvol1/scratch/hg17/mafNet/rheMac1.new
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
pushd /san/sanvol1/scratch/hg17/mafNet
rm -fr rheMac1
mv rheMac1.new rheMac1
popd
rm -fr axtNet
mv axtNet.new axtNet
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# cleanup
cd ..
rm -fr chain humanNet
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.rheMac1/axtNet \
/usr/local/apache/htdocs/goldenPath/rheMac1/axtNet
# Request push to downloads server
# UPDATE OPOSSUM DOWNLOADS (2006-01-17 kate)
# Fix overlaps
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.monDom1
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir axtNet mafNet
cd axtChain/chain
nice gunzip *.gz
cd ..
nice gunzip -c human.net.gz | netSplit stdin humanNet
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/monDom1/monDom1.2bit stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/monDom1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=monDom1.
end
cp -rp ../mafNet /san/sanvol1/scratch/hg17/mafNet/monDom1.new
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
pushd /san/sanvol1/scratch/hg17/mafNet
rm -fr monDom1
mv monDom1.new monDom1
popd
# (axtNet was rebuilt in place this time, with the old files kept
# in axtNet.old, so no axtNet.new rename is needed here)
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# cleanup
cd ..
rm -fr chain humanNet
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.monDom1/axtNet \
/usr/local/apache/htdocs/goldenPath/monDom1/axtNet
# Request push to downloads server
# UPDATE COW DOWNLOADS (2006-01-17 kate)
# Fix overlaps
ssh kkstore2
cd /cluster/data/bosTau1/bed/zb.hg17
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (net/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt net/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.bosTau1/axtNet \
/usr/local/apache/htdocs/goldenPath/bosTau1/axtNet
# Request push to downloads server
##### UPDATE hg17 knownToVisiGene (2006-01-21 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
##### UPDATE hg17 mmBlastTab (2006-01-22 galt)
# Make the protein seqs from mm7.knownGenePep
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir mm7
cd mm7
pepPredToFa mm7 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known
mkdir -p /cluster/panasas/home/store/mm7/blastp/
cp known.* /cluster/panasas/home/store/mm7/blastp/
# Make parasol run directory
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp/mm7
mkdir run
cd run
mkdir out
# Make blast script
# NOTE!! left off " b 1" from the end of the script because
# we wanted to be able to get the near-best, not just the best one.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm7/blastp/known \
-i $1 -o $2 -e 0.001 -m 8
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# the echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
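# (wordLine puts each whitespace-separated word on its own line, e.g.
#   echo a.fa b.fa | wordLine stdin
# prints a.fa and b.fa on separate lines)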
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
Completed: 7735 of 7735 jobs
CPU time in finished jobs: 97096s 1618.26m 26.97h 1.12d 0.003 y
IO & Wait Time: 564656s 9410.94m 156.85h 6.54d 0.018 y
Average job time: 86s 1.43m 0.02h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 240s 4.00m 0.07h 0.00d
Submission to last job: 1272s 21.20m 0.35h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm7/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
Scanning through 7735 files
Loading database with 33306 rows
# changed mm6 to mm7 in src/hg/hgGene/hgGeneData/Human/hg17/otherOrgs.ra
# and checked it in.
# hgLoadBlastTab hg17 mmBlastTabTopN -maxPer=250 *.tab
# (not done, this was only used for research)
# hgLoadBlastTab hg17 mmBlastNearBest -topPercent=5 *.tab > hgMmNearBest.stats
# (this will be the new way to go)
Reading seq lengths from hg17.knownGenePep
Finding max gene combined-coverage scores in 7735 files
Scanning through 7735 files
Loading database with 51520 rows
##########################################################################
# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch hg17 gencodeGeneClassJun05 0508301200.00
# note - gencodeGeneClassJun05 table on dev only
sudo mytouch hg17 knownGeneLink 0506050000.00
sudo mytouch hg17 ensGtp 0505241200.00
sudo mytouch hg17 ccdsInfo 0505241200.00
##########################################################################
# BLASTZ OPOSSUM monDom2 (WORKING - 2006-01-23 - Hiram)
ssh kk
# running out of disk space on store5:
[hiram@kk /cluster/data/hg17/bed] df -h .
#Filesystem Size Used Avail Use% Mounted on
# 1.5T 1.3T 79G 95% /cluster/store5
# So, keep this elsewhere, and symlink it:
cd /cluster/data/hg17/bed
ln -s /cluster/store9/hg17/bed/blastzMonDom2.2006-01-23 \
./blastzMonDom2.2006-01-23
ln -s blastzMonDom2.2006-01-23 blastz.monDom2
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# distant-organism settings (originally tuned for chicken, per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg17)
SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom2
SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit
SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastzMonDom2.2006-01-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# real 1122m44.191s
# failed during the load of chr19
# hgLoadChain hg17 chr19_chainMonDom2 chr19.chain
# Out of memory needMem - request size 56 bytes
# So, go to kolossus:
ssh kolossus
# There isn't any hg17 db here yet, get it established with a
# chromInfo and a 2bit sequence:
hgsql -e "create database hg17;" mysql
cd /cluster/data/hg17
twoBitInfo hg17.2bit stdout |
awk '{printf "%s\t%s\t/gbdb/hg17/hg17.2bit\n", $1,$2}' \
> chromInfo.kolossus.tab
hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql hg17 \
-e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;'
# it appears /gbdb/hg17 already exists
ln -s /cluster/data/hg17/hg17.2bit /gbdb/hg17/hg17.2bit
# now, loading only chr19:
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain
hgLoadChain hg17 chr19_chainMonDom2 chain/chr19.chain
# real 33m31.689s
# while that is running, back on hgwdev, get the other chains loaded
ssh hgwdev
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain
cp loadUp.csh loadUp.noChr19.csh
# change the foreach line to eliminate the chr19.chain:
diff loadUp.csh loadUp.noChr19.csh
< foreach f (*.chain)
---
> foreach f (`ls *.chain | grep -v chr19.chain`)
# And then run that script
time ./loadUp.noChr19.csh > load.noChr19.out 2>&1
# real 76m8.757s
# When the kolossus load finishes, email to push-request and ask
# for the two tables to be pushed from kolossus to hgwdev:
# chr19_chainMonDom2
# chr19_chainMonDom2Link
# then, continuing:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=download -bigClusterHub=pk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > download.out 2>&1 &
# real 2m42.505s
# now, back on kolossus to run a featureBits
time featureBits hg17 chainMonDom2Link >fb.hg17.chainMonDom2Link 2>&1
# 355119482 bases of 2866216770 (12.390%) in intersection
featureBits hg17 chainMonDom1Link
# 456069062 bases of 2866216770 (15.912%) in intersection
# Then, to swap the results:
ssh kk
cd /cluster/data/hg17/bed/blastz.monDom2
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > swap.out 2>&1 &
# running 2006-01-30 11:25
# real 47m27.082s
# failed during the load - as with the Hg18 experiment, something
# is really huge about these results.
#########################################################################
# BUILD MAF ANNOTATION FOR MULTIZ17WAY (kate 2006-02-16)
# Redo to fix overlaps (2006-04-09 kate)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh kkstore01
cd /cluster/data/rheMac2
twoBitInfo -nBed rheMac2.2bit rheMac2.N.bed
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
foreach i (`cat /cluster/data/hg17/bed/multiz17way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
rm jobs.csh
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg17/hg17.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh >&! jobs.log &
# 1.5 hrs.
# 9 hours for redo -- something wrong ?
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/anno/maf
mkdir -p /gbdb/hg17/multiz17way/anno/maf
ln -s /cluster/data/hg17/bed/multiz17way/anno/maf/*.maf \
/gbdb/hg17/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg17/multiz17way/anno/maf \
hg17 multiz17way
date
'EOF'
csh loadMaf.csh >&! loadMaf.log &
# load summary table on kolossus, as it crashes on hgwdev
ssh kolossus
cd /cluster/data/hg17/bed/multiz17way/anno/maf
cat *.maf | \
nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz17waySummary stdin
# Created 3212623 summary blocks from 114139253 components and 17522217 mafs from stdin
# request push to hgwdev
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql hg17 -e "alter table multiz17waySummary drop index chrom_2"
hgsql hg17 -e "alter table multiz17waySummary drop index chrom_3"
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
set sanDir = /san/sanvol1/scratch/hg17/multiz17way/frames
mkdir -p $sanDir/maf
cp -rp maf/* $sanDir/maf
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
#edit Makefile to correct species names and set and sanDir
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/frames
make getGenes >&! getGenes.log &
# ~1 minute
make getFrames >&! getFrames.log &
# ~2 hours
# NOTE: if jobs get hung up (e.g. running for hours, when
# they should run for minutes, do 'para stop' so that
# the 'para make' can restart the job
make loadDb >&! loadDb.log &
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(cat ../maf/*.maf | genePredToMafFrames hg17 stdin stdout \
    bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz \
    danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz \
    galGal2 genes/galGal2.gp.gz hg17 genes/hg17.gp.gz \
    mm7 genes/mm7.gp.gz oryCun1 genes/oryCun1.gp.gz \
    panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz \
    rn3 genes/rn3.gp.gz xenTro1 genes/xenTro1.gp.gz \
    | gzip > multiz17way.mafFrames.gz) >& log &
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/frames
hgLoadMafFrames hg17 multiz17wayFrames multiz17way.mafFrames.gz >&log&
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 2/8/06 angie)
ssh kolossus
mkdir /cluster/data/hg17/rmsk
cd /cluster/data/hg17/rmsk
ln -s ../*/chr*.fa.out .
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Rat in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query human -comp rat -comp mouse
end
# Now extract rat (extra column 1), ignore mouse.
cd ..
mkdir linSpecRep.notInRat
foreach f (rmsk/*.out_rat*_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInRat/$base.out.spec
end
# Distribute and clean up.
rsync -av linSpecRep.notInRat /san/sanvol1/scratch/hg17/
rm -r rmsk
# BLASTZ/CHAIN/NET RN4 (DONE 2/10/06 angie)
ssh kkstore01
mkdir /cluster/data/hg17/bed/blastz.rn4.2006-02-08
cd /cluster/data/hg17/bed/blastz.rn4.2006-02-08
cat << '_EOF_' > DEF
# human vs. rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInRat
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat
SEQ2_DIR=/san/sanvol1/scratch/rn4/nib
SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.rn4.2006-02-08
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -chainLinearGap medium \
-bigClusterHub pk -smallClusterHub pk -workhorse pk \
-blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log &
tail -f do.log
rm -f /cluster/data/hg17/bed/blastz.rn4
ln -s blastz.rn4.2006-02-08 /cluster/data/hg17/bed/blastz.rn4
# UPDATE WGRNA TRACK (DONE, 2006-02-15, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2006-02-15
cd wgRna-2006-02-15
# Received the data file, wg_track_hg17_feb2006_completed.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg17/bed/wgRna-2006-02-15.
cp -p wg_track_hg17_feb2006_completed.txt wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to previous data, 2 records deleted, 27 records added.
########################################################################
# BLASTZ Opossum monDom4 (DONE - 2006-02-21 - 2006-02-26 - Hiram)
ssh pk
mkdir /cluster/data/hg17/bed/blastzMonDom4.2006-02-21
cd /cluster/data/hg17/bed
ln -s blastzMonDom4.2006-02-21 blastz.monDom4
cd blastzMonDom4.2006-02-21
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human (hg17)
SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastzMonDom4.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits hg17 chainMonDom4Link \
> fb.hg17.chainMonDom4Link 2>&1
time nice -n +19 featureBits monDom4 chainHg17Link \
> fb.monDom4.chainHg17Link 2>&1
########################################################################
## Measuring MonDom4 chain pile ups (DONE - 2006-02-26 - Hiram)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.monDom4/axtChain
# extract coordinates on the target genome of the chains
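# (chain header lines are: chain score tName tSize tStrand tStart tEnd
#  qName qSize qStrand qStart qEnd id -- so $3,$6,$7 below are target
#  chrom,start,end and $2 is the chain score)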
zcat hg17.monDom4.all.chain.gz | grep "^chain " \
| awk '{printf "%s\t%s\t%s\t%s\t%s\n", $3, $6, $7, $5, $2}' \
| gzip -c > target.chain.bed.gz
# turn that into a wiggle graph with bedItemOverlapCount
# use HGDB_CONF for read-only access to the hg17 DB in bedItemOverlapCount
# it wants to read chromInfo ...
export HGDB_CONF=~/.hg.conf.read-only
# ignore chains longer than 1,000,000
zcat target.chain.bed.gz | awk '$3-$2<1000000 {print}' \
| sort -k1,1 -k2,2n \
| bedItemOverlapCount hg17 stdin \
| wigEncode stdin monDom4PileUps.wig monDom4PileUps.wib
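# (the resulting wiggle is simple coverage: for each base, the number
#  of sub-megabase chains overlapping it)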
# Do the same for the query coordinates to find out where these
# chains are coming from
zcat hg17.monDom4.all.chain.gz | grep "^chain " \
| awk '{printf "%s\t%s\t%s\t%s\t%s\n", $8, $11, $12, $10, $2}' \
| gzip -c > query.chain.bed.gz
zcat query.chain.bed.gz | awk '$3-$2<1000000 {print}' \
| sort -k1,1 -k2,2n \
| bedItemOverlapCount monDom4 stdin \
| wigEncode stdin hg17PileUps.wig hg17PileUps.wib
# load those wiggles
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.monDom4/axtChain
ln -s `pwd`/monDom4PileUps.wib /gbdb/hg17/wib
ln -s `pwd`/hg17PileUps.wib /gbdb/monDom4/wib
hgLoadWiggle -verbose=2 hg17 monDom4PileUp monDom4PileUps.wig
hgLoadWiggle -verbose=2 monDom4 hg17PileUps hg17PileUps.wig
# add wiggle track type entries to the respective trackDb.ra files
# UPDATE hg17 knownToVisiGene (2006-03-07 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
############################################################################
# Add Landmark track (2006-03-08 giardine)
# Note: This track is for regulatory regions and other landmarks that are not
#included in other tracks. It is being gathered from the locus experts
#that are contributing data to the Human Mutation track. This should
#be helpful in understanding the data in the mutation track.
#table definitions for autoSql
autoSql landmark.as landmark -dbLink
#change index on bin to normal index not primary key
#move bin in struct so works as bed 4+
#copy autoSql files to hg/lib and hg/inc (add .o file to makefile)
#cat together landmark files from sources in landmark.bed then sort
grep "^chr" landmark.bed | sort -k1,1 -k2,2n > sortedLandmark.bed
#loading
hgsql hg17 < landmark.sql
hgLoadBed hg17 landmark sortedLandmark.bed -noSort -oldTable -tab
#add to trackDb.ra file (human hg17 level)
#changed landmark track to provide links and attributes in prep for ORegAnno
#data. Got set of test data by grabbing their .gff file used for custom
#tracks and converting to bed, then to landmarks format.
cd humPhen/landmarkData/June06/
#convert data to new formats then
cat newLandmark.txt landmarkORA.txt > allLandmarks.txt
grep "^chr" allLandmarks.txt | sort -k1,1 -k2,2n > sortedAllLandmark.txt
#start new tables
cd humPhen/kent/src/hg/lib/
autoSql landmark.as landmark -dbLink
#move bin in .h file to end of structure, to make load work
mv landmark.h ../inc/landmark.h
#change primary key to indexes where not unique, add index on landmarkId
#limit name, landmarkType, raKey size to 64
hgsql -e "drop table landmark;" hg17
hgsql hg17 < landmark.sql
cd ~giardine/humPhen/landmarkData/June06/
hgLoadBed hg17 landmark sortedAllLandmark.txt -noSort -oldTable -tab
hgsql hg17
load data local infile "landmarkAttrORA.txt" into table landmarkAttr;
load data local infile "landmarkAttrLinkORA.txt" into table landmarkAttrLink;
load data local infile "landmarkAttrCat.txt" into table landmarkAttrCat;
cd ../../kent/src/
make clean
make libs
cd hg
make cgi
cd makeDb/trackDb
make DBS=hg17 update
#test in hgwdev-giardine
#redo landmarks, moving categories out of database
convertORAformat < ORegAnnoBed
#start new tables
cd humPhen/kent/src/hg/lib/
autoSql landmark.as landmark -dbLink
#move bin in .h file to end of structure, to make load work
mv landmark.h ../inc/landmark.h
#change primary key to indexes, add primary key on landmarkId
#limit name, landmarkType, raKey size to 64
#only need to reload attributes rest of data & tables same
hgsql -e "drop table landmarkAttr;" hg17
hgsql -e "drop table landmarkAttrCat;" hg17
cd ../../../../landmarkData/June06/
hgsql hg17
#cut and paste in create table landmarkAttr
load data local infile "landmarkAttrORA.txt" into table landmarkAttr;
#Records: 2028 Deleted: 0 Skipped: 0 Warnings: 8 ???
cd ../../kent/src/
make clean
make libs
cd hg
make cgi
cd makeDb/trackDb
make DBS=hg17 update
#test in hgwdev-giardine
############################################################################
# hg15 -> hg17 LIFTOVER CHAINS (STARTED 3/9/06, DONE 3/10/06 Fan)
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg17. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg17 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# Split up hg17
ssh pk
cd /san/sanVol1/scratch/hg17
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
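# (each .lft file records where the 10kb chunks sit in the full
# chromosome, so blat results on the chunks can later be lifted back
# to chrom coordinates, e.g. with liftUp)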
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg15
# Copy 11.ooc files to hg15 subdirectory.
cp -p /cluster/store5/gs.16/build33/11.ooc hg15
## First, copy over scripts. (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATTING
# HG15
ssh pk
cd /cluster/data/hg15
# makeLoChain-align hg15 /scratch/hg/hg15/bothMaskedNibs hg17 \
makeLoChain-align hg15 /scratch/hg/hg15/chromTrfMixedNib hg17 \
/san/sanVol1/scratch/hg17/biggerSplits/split
cd bed
mv blat.hg17.2006-03-09 /san/sanVol1/scratch/hg15
cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg15ToHg17"}' > newspec
para create newspec
para try
para push
# Saw some failures, keep pushing again, they finally all finished.
# The problems were all from one node.
# Used "para remove machine ..." to remove that node from the cluster.
# Completed: 2376 of 2376 jobs
# CPU time in finished jobs: 626355s 10439.25m 173.99h 7.25d 0.020 y
# IO & Wait Time: 49512s 825.20m 13.75h 0.57d 0.002 y
# Average job time: 284s 4.74m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3693s 61.55m 1.03h 0.04d
# Submission to last job: 4165s 69.42m 1.16h 0.05d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg17SplitLift.sh
# change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random
vi hg17SplitLift.sh
cat << 'EOF' > hg17ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg17Lifts
pushd /scratch/fan/hg17Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg17ChainMergeSplit.sh
# HG15
cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/raw
/san/sanVol1/scratch/fan/hg17SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg15/chromTrfMixedNib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 3546s 59.10m 0.98h 0.04d 0.000 y
# IO & Wait Time: 895s 14.92m 0.25h 0.01d 0.000 y
# Average job time: 97s 1.61m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 270s 4.50m 0.07h 0.00d
# Submission to last job: 270s 4.50m 0.07h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg17Lifts
cd /scratch/fan/hg17Lifts
cp -r /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/
rm -rf chain
rm -rf chainRaw
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
# << emacs
chmod +x netOver.sh
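# usage: netOver.sh <chain file> <old assembly chrom.sizes>, e.g.
# (hypothetical): netOver.sh .../chain/chr1.chain /cluster/data/hg15/chrom.sizes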
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg15/chrom.sizes"}' >> spec
para create spec
para push
para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs: 427s 7.12m 0.12h 0.00d 0.000 y
# IO & Wait Time: 248s 4.13m 0.07h 0.00d 0.000 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 29s 0.48m 0.01h 0.00d
# Submission to last job: 46s 0.77m 0.01h 0.00d
# seems much faster than mm7.
########## FINISHING
ssh hgwdev
# HG15
cd /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/over
cat * >> ../hg15ToHg17.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg17.2006-03-09/ /cluster/data/hg15/bed
cd /cluster/data/hg15/bed
ln -s blat.hg17.2006-03-09 blat.hg17
ln -s `pwd`/blat.hg17/hg15ToHg17.over.chain liftOver/hg15ToHg17.over.chain
ln -s `pwd`/liftOver/hg15ToHg17.over.chain /gbdb/hg15/liftOver/hg15ToHg17.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg15/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg15/liftOver
cp /gbdb/hg15/liftOver/hg15ToHg17.over.chain .
gzip hg15ToHg17.over.chain
hgAddLiftOverChain hg15 hg17 /gbdb/hg15/liftOver/hg15ToHg17.over.chain
# UPDATED hg17.knownToVisiGene (2006-03-14 galt)
# after making sure hg17.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
########################################################################
### microRNA targets tracks (DONE - 2006-03-17 - 2006-04-27 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
ssh hgwdev
mkdir /cluster/data/hg17/bed/picTar
cd /cluster/data/hg17/bed/picTar
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_mammals_bed' -O newMammals.bed
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_mammals_chicken_bed' \
-O newMammalsChicken.bed
grep -v "^track" newMammals.bed \
| hgLoadBed -strict hg17 picTarMiRNA4Way stdin
# Loaded 205263 elements of size 9
grep -v "^track" newMammalsChicken.bed \
| hgLoadBed -strict hg17 picTarMiRNA5Way stdin
# Loaded 43081 elements of size 9
nice -n +19 featureBits hg17 picTarMiRNA4Way
# 608549 bases of 2866216770 (0.021%) in intersection
nice -n +19 featureBits hg17 picTarMiRNA5Way
# 109059 bases of 2866216770 (0.004%) in intersection
############################################################################
# dbSNP BUILD 125 (Heather, March 2006)
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSnp
mkdir 125
cd 125
mkdir shared
mkdir shared/data
mkdir shared/schema
mkdir organisms
mkdir organisms/human_9606
mkdir organisms/human_9606/rs_fasta
mkdir organisms/human_9606/database
mkdir organisms/human_9606/database/organism_data
mkdir organisms/human_9606/database/organism_data/hg17
mkdir organisms/human_9606/database/schema
# Get data from NCBI
# Shared data includes data dictionary,
# Shared data includes defined types such as validity, class, function, locType
# Actually this is independent of hg17 build and should go in separate makeDoc
cd shared/data
ftp ftp.ncbi.nih.gov
cd snp/database/organism_shared_data
mget *.gz
cd ../schema
ftp ftp.ncbi.nih.gov
cd snp/database/schema/shared_schema
mget *.gz
# using headers of fasta files for molType, class and observed
cd ../organisms/human_9606/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/rs_fasta
mget *.gz
cd ../database/organism_data/hg17
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_35_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_35_1.bcp.gz
get b125_ContigInfo_35_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_35_1.bcp.gz
# SNP has validation status and heterozygosity
get SNP.bcp.gz
# done with FTP
# rename
mv b125_SNPContigLoc_35_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_35_1.bcp.gz ContigLocusId.gz
mv b125_ContigInfo_35_1.bcp.gz ContigInfo.gz
mv b125_SNPMapInfo_35_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
# edit table descriptions
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
# get CREATE statements from human_9606_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# get header lines from rs_fasta
cd /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta
/bin/csh gnl.csh
# load on kkr5u00
ssh kkr5u00
hgsql -e 'create database dbSnpHumanBuild125' mysql
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
hgsql dbSnpHumanBuild125 < table.sql
cd ../organism_data/hg17
/bin/csh load.csh
# note rowcount
# ContigLoc 24135144
# SNP 10430754
# MapInfo 10271016
# ContigLocusId 9539145
# create working /scratch dir
cd /scratch/snp
mkdir 125
cd 125
mkdir human
cd human
# get hg17 ctgPos, load into dbSnpHumanBuild125, compare contig list between ctgPos and ContigInfo
# get gnl files
cp /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_haplotype"
# filter ContigLoc into ContigLocFilter
# this gets rid of alternate assemblies and poor quality alignments
# uses ContigInfo and MapInfo (weight == 10 || weight == 3)
# assumes all contigs are positively oriented
# will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | phys_pos_from | int(11) | NO | | | |
# | phys_pos | varchar(32) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocFilter 10113426
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 9161012
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocusIdFilter 5352542
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order
# will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense dbSnpHumanBuild125
# note rowcount
# expect about 50% for human
# ContigLocusIdCondense 4129899
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# snpLoadFasta.error will report all SNPs with "lengthTooLong"
# here we have 4428 SNPs with lengthTooLong
# these are noted as ObservedNotAvailable
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta dbSnpHumanBuild125
# split ContigLocFilter by chrom (could start using pipeline.csh here)
# pipeline.csh takes about 35 minutes to run
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom dbSnpHumanBuild125 ref_haplotype
# generate true coords using loc_type
# possible errors logged to snpLocType.error:
# "Missing quotes in phys_pos for range"
# "Chrom end <= chrom start for range"
# "Wrong size for exact"
# "Unknown locType"
# "Unable to get chromEnd"
# We got none of these
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# this run got just 40
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype dbSnpHumanBuild125 ref_haplotype
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# this run had 63 of these
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# this run has 512
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele dbSnpHumanBuild125 ref_haplotype
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort dbSnpHumanBuild125 ref_haplotype
# get hg17 nib files
# get hg17 chromInfo, load into dbSnpHumanBuild125 with edited path
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" dbSnpHumanBuild125
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC dbSnpHumanBuild125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 49763 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles dbSnpHumanBuild125
# add class, observed and molType from chrN_snpFasta tables
# log errors to snpReadFasta.errors
# errors detected: no data available, duplicate data
# This run we got:
# 49 no data available
# 226048 duplicate
# chrN_snpFasta has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpReadFasta dbSnpHumanBuild125
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | molType | varchar(255) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedNotAvailable
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved dbSnpHumanBuild125
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction dbSnpHumanBuild125
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# this run we got 8 missing
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP dbSnpHumanBuild125
# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable dbSnpHumanBuild125
# PAR SNPs
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR dbSnpHumanBuild125
hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp125Exceptions' dbSnpHumanBuild125
# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh
# load
hgsql dbSnpHumanBuild125 < /cluster/home/heather/kent/src/hg/lib/snp125.sql
hgsql -e 'load data local infile "snp125.tab" into table snp125' dbSnpHumanBuild125
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple dbSnpHumanBuild125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;
# run and review snpCompareLoctype (currently tuned for 124/125 differences)
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype dbSnpHumanBuild125 snp124subset snp125
# cat snpCompareLoctypeCounts.out
# exactToExact = 8310192
# exactToBetween = 107956
# exactToRange = 16200
# betweenToBetween = 206224
# betweenToExact = 4012
# betweenToRange = 715
# rangeToRange = 98648
# rangeToBetween = 3151
# rangeToExact = 6198
# oldToNew = 10224
# 12043 coord changes in exact (.1%)
# 1370 moved to different chroms
# 3664 coord changes in between (1.7%)
# 2260 off-by-one
# 13 moved to different chroms
# 22198 coord changes in range (22.5%)
# 19548 look like fixes: observedLengthOld != coordSpanOld && observedLengthNew == coordSpanNew
# 1296 look like errors: observedLengthOld == coordSpanOld && observedLengthNew != coordSpanNew
# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp
hgsql dbSnpHumanBuild125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/snp125Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp125.tab' into table snp125;
# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;
mysql> alter table snp125Exceptions add index name(name);
# create snp125ExceptionDesc table
cd /cluster/data/dbSnp
# add counts to exception.template
hgsql hg17 < snp125ExceptionDesc.sql
mysql> load data local file 'exception.template' into table snp125ExceptionDesc;
#######
# Add new case for ObservedWrongSize (Heather June 9, 2006)
# revisions 1.25 and 1.26 kent/src/hg/snp/snpLoad/snpCheckClassAndObserved.c
ssh kkr5u00
cd /scratch/snp/125/human
/bin/csh pipeline.csh
# wait 35 minutes
grep ObservedWrongSize snpCheckClassAndObserved.exceptions > ObservedWrongSize
grep ObservedWrongSize snpPARexceptions.tab >> ObservedWrongSize
cp ObservedWrongSize /cluster/home/heather/transfer/snp
ssh hgwdev
hgsql -e 'alter table snp125Exceptions drop index name' hg17
hgsql -e 'load data local infile "/cluster/home/heather/transfer/snp/ObservedWrongSize" into table snp125Exceptions' hg17
hgsql -e 'alter table snp125Exceptions add index name (name)' hg17
# fix counts
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' hg17
+----------+------------------------------------+
| count(*) | exception |
+----------+------------------------------------+
| 785903 | MultipleAlignments |
| 623 | NamedClassWrongLocType |
| 7686 | ObservedMismatch |
| 4333 | ObservedNotAvailable |
| 97 | ObservedWrongFormat |
| 73558 | ObservedWrongSize |
| 466 | RangeSubstitutionLocTypeExactMatch |
| 62 | RefAlleleMismatch |
| 99849 | RefAlleleNotRevComp |
| 1278 | RefAlleleWrongSize |
| 20749 | SingleClassBetweenLocType |
| 2306 | SingleClassQuadAllelic |
| 15639 | SingleClassRangeLocType |
| 19330 | SingleClassTriAllelic |
+----------+------------------------------------+
# edit /cluster/data/dbSNP/exception.template (need to automate this)
hgsql -e 'delete from snp125ExceptionDesc' hg17
hgsql -e 'load data local infile "/cluster/data/dbSNP/exception.template" into table snp125ExceptionDesc' hg17
###########################
# add rs_fasta to seq/extFile (Heather Nov 2006)
# use 126 rs_fasta files because I didn't save 125 version
ssh hgwdev
mkdir /gbdb/hg17/snp
ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg17/snp/snp.fa
cd /cluster/store12/snp/126/human/rs_fasta
hgLoadSeq hg17 /gbdb/hg17/snp/snp.fa
# clean up after hgLoadSeq
rm seq.tab
# look up id in extFile
# move into separate table
hgsql hg17 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 33852294' hg17
hgsql -e 'delete from seq where extFile = 33852294' hg17
hgsql -e 'alter table snpSeq add index acc (acc)' hg17
#############################################################
# Get panTro2 and rheMac2 allele for all SNPs (Heather, Dec 2006, Feb 2007 and
# June 2007 [partial fix released 6/25/07: using hg17 instead of hg18 liftOver
# files... for most but not all chroms! :( not documented below; error found
# by user]
# 1/11/08 (angie): re-running panTro2Qual and subsequent chimp & summary
# steps, so hg17 liftOver files will have been used for all outputs.
# Deletions will probably lift okay
# The insertions have start == end so none of them will lift
# 1/24/08 (angie): constant quality score of 98 for chimp chr{21,M,Y,Y_random}
# was previously put in score field -- corrected to orthoScore.
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2All
mkdir rheMac2All
mkdir input
cd input
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from snp125' hg17 > snp125.bed
lineFileSplit snp125.bed lines 100000 snp-
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/panTro2All/input
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/rheMac2All/input
cd ../panTro2All
./makeJobList.csh
mkdir output
mkdir unmapped
cd ../rheMac2All
./makeJobList.csh
mkdir output
mkdir unmapped
# cluster run
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 67758s 1129.29m 18.82h 0.78d 0.002 y
# IO & Wait Time: 961s 16.02m 0.27h 0.01d 0.000 y
# Average job time: 636s 10.60m 0.18h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1543s 25.72m 0.43h 0.02d
# Submission to last job: 61513s 1025.22m 17.09h 0.71d
cd /san/sanvol1/snp/liftOver/hg17/rheMac2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 1833s 30.56m 0.51h 0.02d 0.000 y
# IO & Wait Time: 1744s 29.06m 0.48h 0.02d 0.000 y
# Average job time: 33s 0.55m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 82s 1.37m 0.02h 0.00d
# Submission to last job: 59987s 999.78m 16.66h 0.69d
# add sequence
# next time do this at the same time as lift
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Seq
mkdir panTro2Seq/input
mkdir panTro2Seq/output
cp panTro2All/output/snp*out panTro2Seq/input
cd panTro2Seq
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
set baseName = $fileName:t
echo $baseName
echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/panTro2/panTro2.2bit output/$baseName" >> jobList
end
'EOF'
chmod +x makeJobList.csh
./makeJobList.csh
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Seq
mkdir rheMac2Seq/input
mkdir rheMac2Seq/output
cp rheMac2All/output/snp*out rheMac2Seq/input
cd rheMac2Seq
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
set baseName = $fileName:t
echo $baseName
echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/rheMac2/rheMac2.2bit output/$baseName" >> jobList
end
'EOF'
chmod +x makeJobList.csh
./makeJobList.csh
# cluster run for sequence
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 30509s 508.48m 8.47h 0.35d 0.001 y
# IO & Wait Time: 325s 5.42m 0.09h 0.00d 0.000 y
# Average job time: 286s 4.76m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 551s 9.18m 0.15h 0.01d
# Submission to last job: 1195s 19.92m 0.33h 0.01d
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 28517s 475.28m 7.92h 0.33d 0.001 y
# IO & Wait Time: 576s 9.61m 0.16h 0.01d 0.000 y
# Average job time: 269s 4.49m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 509s 8.48m 0.14h 0.01d
# Submission to last job: 1166s 19.43m 0.32h 0.01d
# quality scores
# This takes about 24 hours for each species!! Ugh.
# Solution is to use -bedFile argument to hgWiggle
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
cd panTro2Seq/output
cat << 'EOF' > concat.csh
#!/bin/tcsh
rm -f all.out
foreach fileName (`ls snp*.out`)
cat $fileName >> all.out
end
'EOF'
chmod +x concat.csh
./concat.csh
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Qual
cp panTro2Seq/output/all.sort panTro2Qual
cd panTro2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
# If we do this again, we should write a c program to read qac files into
# memory -- much faster than one hgWiggle process per line.
cat << 'EOF' > addQual.pl
#!/usr/bin/perl -W
$db=shift;
$chromName=shift;
while (<STDIN>)
{
my @fields = split;
my $chrom = $fields[0];
my $chromStart = $fields[1];
my $chromEnd = $fields[2];
my $name = $fields[3];
my $strand = $fields[5];
my $allele = $fields[6];
$cmd="hgWiggle -db=$db -chrom=$chromName -position=$chrom:$chromStart-$chromStart -rawDataOut quality";
open(RESULT, "$cmd |") or die "can't start '$cmd'\n";
while ($line = <RESULT>)
{
$score = int($line);
print "$chrom\t$chromStart\t$chromEnd\t$name\t$score\t$strand\t$allele\n";
}
}
'EOF'
cat << 'EOF' > getQual.csh
#!/bin/tcsh
foreach fileName (`ls input/*`)
set chromName = $fileName:t:r
echo $chromName
./addQual.pl panTro2 $chromName < $fileName > output/$chromName
end
'EOF'
# << emacs
chmod +x addQual.pl getQual.csh
./getQual.csh
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq/output
cp ../../panTro2Seq/output/concat.csh .
./concat.csh
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Qual
cp rheMac2Seq/output/all.sort rheMac2Qual
cd rheMac2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
# reuse the panTro2 scripts, switching the database argument to rheMac2
sed -e 's/panTro2/rheMac2/g' ../panTro2Qual/getQual.csh > getQual.csh
cp ../panTro2Qual/addQual.pl .
chmod +x getQual.csh addQual.pl
./getQual.csh
# concatenate, merge and load
# chimp has no qual scores for chr21, chrY and chrM, just use seq files
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual/output
grep chr21 ../../panTro2Seq/output/all.sort > chr21
grep chrY ../../panTro2Seq/output/all.sort | grep -v random > chrY
grep chrY ../../panTro2Seq/output/all.sort | grep random > chrY_random
grep chrM ../../panTro2Seq/output/all.sort > chrM
#-----------------------------------------------------------------------------
# 1/11/08: replace outputs for chroms that apparently were skipped in the June
# run, and re-run subsequent steps for chimp.
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual
mv output output-jun25
foreach f (output-jun25/chr*)
if ( "X"`cmp $f output-feb26/$f:t` == "X" ) then
echo $f:t
endif
end
#chr21
#chrM
#chrY
#chrY_random
# <<-- those are the ones that may not have actually been regenerated.
# It appears that the Feb. outputs, instead of the June Seq files, were copied
# to the June output for those chroms. oops!
# As a minor improvement, skip duplicate rows instead of just copying.
foreach chr (chr21 chrM chrY chrY_random)
echo $chr
uniq input/$chr.sort > output/$chr
end
# << emacs
mkdir output-jun25-incorrect
mv output-jun25/chr{21,M,Y,Y_random} output-jun25-incorrect
cat output-jun25/chr* output/chr* > output/qual.tab
# end 1/11/08 fix-specific; proceeding to post-concat.csh chimp steps.
#-----------------------------------------------------------------------------
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoPanTro2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoPanTro2.sql
#Loaded 9591230 elements of size 17
# previously 9590961
# add index
hgsql hg17
alter table snp125OrthoPanTro2 add index name (name);
alter table snp125OrthoPanTro2 add index chrom (chrom, bin);
# 1/24/08: these used to set score; should have set orthoScore all along.
# tweak to match panTro2 assembly
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21";
#Query OK, 129170 rows affected (25.37 sec)
#Rows matched: 129170 Changed: 129170 Warnings: 0
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY";
#Query OK, 22081 rows affected (25.16 sec)
#Rows matched: 22081 Changed: 22081 Warnings: 0
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random";
#Query OK, 155 rows affected (25.41 sec)
#Rows matched: 155 Changed: 155 Warnings: 0
# macaque
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Qual/output
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoRheMac2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoRheMac2.sql
# add index
alter table snp125OrthoRheMac2 add index name (name);
alter table snp125OrthoRheMac2 add index chrom (chrom, bin);
# get hapmap subset for chimp
# skip if lift wasn't size 1
# this run 124822 skipped
cd /cluster/data/hg17/bed/hapmap/rel21a
time /cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 \
hapmapSnpsCombined snp125OrthoPanTro2
#108.505u 16.869s 2:26.22 85.7% 0+0k 0+0io 4pf+0w
hgLoadBed hg17 hapmapAllelesChimp hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesChimp.sql
#Loaded 3930564 elements of size 13
hgsql hg17 -e 'alter table hapmapAllelesChimp add index name(name); \
alter table hapmapAllelesChimp add index chrom (chrom, bin);'
# get hapmap subset for macaque
# this run 106607 skipped
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 hapmapSnpsCombined snp125OrthoRheMac2
hgLoadBed hg17 hapmapAllelesMacaque hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesMacaque.sql
rm hapmapOrtho.tab
rm hapmapOrtho.err
rm bed.tab
hgsql hg17 -e 'alter table hapmapAllelesMacaque add index name(name); \
alter table hapmapAllelesMacaque add index chrom (chrom, bin);'
##############################################################################
# HapMap Recombination Rate Phase 2 (Heather Feb. 2006)
# Contacts:
# Gil McVean [mcvean@stats.ox.ac.uk]
# Colin Freeman [cfreeman@stats.ox.ac.uk]
# Simon Myers [smyers@broad.mit.edu]
# Data is missing chromEnd. I am setting chromEnd = chromStart + 1 as a
# kludge for now.
# Solution is to interpolate range but remove gaps.
## ****************************************
# This is a bad assumption about the data format -- here is a description.
## ****************************************
# The recombination rates are for the regions _between_ snps, so these
# files need to be processed slightly differently. For each line i in
# the file (except the header and the last line), the recombination
# rate is for the position on the current line minus 1 [pos(${i})-1] to
# the position on the subsequent line [pos({$i+1})]. The precision is
# a bit obnoxious and can be truncated to 3 or 4 significant figures.
# (Note that the recombination rate on the last line is 0, as this is a
# placeholder.) Here is an example:
#
# > head genetic_map_chr1.txt
# position COMBINED_rate(cM/Mb) Genetic_Map(cM)
# 45413 2.98182170902573 0
# 72434 2.08241435350679 0.0805718043995841
# 78032 2.08135840137317 0.0922291599505152
# 244859 2.88844902005393 0.439455937976397
# 604461 2.88749757426825 1.47814798248583
# 604484 2.88586385769306 1.47821439493004
# 605296 2.88389196108775 1.48055771638249
#
### BED format (like a bedGraph)
# chr1 45412 72434 2.982
# chr1 72433 78032 2.082
# chr1 78031 244859 2.081
# chr1 244858 604461 2.888
# chr1 604460 604484 2.887
# chr1 604483 605296 2.886
# chr1 605295 ..... 2.884
#
# See /cluster/data/hg16/bed/hapmap/recombination/Perlegen/makeBed.pl for an example. /cluster/data/hg16/bed/hapmap/recombination/Perlegen/cmds.csh is also useful.
## ****************************************
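# A minimal awk sketch of that interval scheme (illustrative only; the
# makeBed.pl used below applies the chromStart+1 kludge instead):
#   awk -v chrom=chr1 'NR > 1 { if (prev != "") \
#       printf "%s\t%d\t%d\t%.3f\n", chrom, prev - 1, $1, int(rate * 1000) / 1000; \
#     prev = $1; rate = $2 }' genetic_map_chr1.txt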
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap
mkdir recombination
cd recombination
mkdir phase2
cd phase2
wget --no-check-certificate -N https://mathgen.stats.ox.ac.uk/HapMap_Phase2_rates_hotspots/HapMap_Phase2_rates_hotspots.tgz
# data also available at
# http://www.hapmap.org/downloads/recombination/2006-10_rel21_phaseII
gunzip *.tgz
tar xvf *.tar
cat << 'EOF' > makeBed.csh
#!/bin/tcsh
rm -f recomb.bed
foreach chrom (`cat chrom.list`)
echo $chrom
set fileName=`echo $chrom | awk '{printf "genetic_map_%s.txt", $1}'`
makeBed.pl $chrom < $fileName >> recomb.bed
end
makeBed.pl chrX < genetic_map_chrX_par1.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_non-par.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_par2.txt >> recomb.bed
'EOF'
cat << 'EOF' > makeBed.pl
#!/usr/bin/env perl
$chromName = shift;
while (<STDIN>) {
my @fields = split;
# skip header
if ($fields[0] eq "position") { next; }
print $chromName;
print "\t";
print $fields[0];
print "\t";
print $fields[0] + 1;
print "\t";
my $val1000 = $fields[1] * 1000;
my $valRound = int($val1000);
my $newVal = $valRound / 1000.0;
print $newVal;
print "\n";
}
'EOF'
./makeBed.csh
hgLoadBed hg17 snpRecombRateHapmapPhase2 recomb.bed -tab -bedGraph=4
hgsql -e 'alter table snpRecombRateHapmapPhase2 add index chrom (chrom, bin)' hg17
############
# UPDATE hg17 knownToVisiGene (2006-04-05 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
#############################################################
# ADD A NEW TRACK GROUP (DONE, 6/3/06, Fan)
# Create a new track group, "phenDis".
echo 'INSERT INTO grp (name, label, priority) VALUES ("phenDis", "Phenotype and Disease Associations", 2.5)' \
| hgsql hg17
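# optional sanity check that the new group is in place:
#   hgsql hg17 -e 'select name,label,priority from grp order by priority'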
#############################################################
# hgMut - Human Mutation track - Belinda Giardine
# for the list of tables, run: show tables like 'hgMut%'
# summary of current load June 7, 2006
#table definitions for autoSql
autoSql hgMut.as hgMut -dbLink
#move bin in struct so works as bed 4+
#hgMut.sql: change INDEXes as needed, put in enums
#shrink mutId to 64 chars, plus acc to 48
#data files and details under ~giardine/humPhen/
cd humPhen/hgMutData/April2006/
cat hgMutHbVar.txt hgMutPah.txt hgMutBgmut.txt hgMutCftr.txt hgMutARdb.txt > hgMutUnsorted.txt
grep "^chr" hgMutUnsorted.txt | sort -k1,1 -k2,2n > hgMut.bed
#create tables
hgsql hg17 < ../../hgMut.sql
#loading
hgLoadBed hg17 hgMut hgMut.bed -noSort -oldTable -tab
#load small vocab control tables
hgsql hg17 < hgMutLink.sql
hgsql hg17 < hgMutAttrClass.sql
hgsql hg17 < hgMutAttrName.sql
hgsql hg17 < hgMutSrc.sql
#the following load statements are run from within hgsql hg17:
load data local infile "hgMutExtLinkHbVar.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkARdb.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkBgmut.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkCFTR.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkPah.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkSP.txt" into table hgMutExtLink;
load data local infile "hgMutAttrHbVar2.txt" into table hgMutAttr;
load data local infile "hgMutAttrHbvarProt2.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdb.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdbProt.txt" into table hgMutAttr;
load data local infile "hgMutAliasHbVar.txt" into table hgMutAlias;
load data local infile "hgMutAliasARdb.txt" into table hgMutAlias;
load data local infile "hgMutAliasBgmut.txt" into table hgMutAlias;
load data local infile "hgMutAliasPah.txt" into table hgMutAlias;
load data local infile "hgMutExtLinkHbVarOmim.txt" into table hgMutExtLink;
load data local infile "hgMutAttrLink.txt" into table hgMutAttrLink;
load data local infile "hgMutAttrSP.txt" into table hgMutAttr;
#############################################################
# gv* Belinda Giardine
# These tables are to replace the hgMut tables
# Most data is converted by me (on PSU machines) to loadable format and copied.
# The Swiss-Prot/UniProt data is generated from the UniProt database at UCSC,
# using perl scripts and table dumps.
# scripts in kent/src/hg/utils/gvParsers/swissProt/
# everything redone to not depend on the dv track in July 2006
#make list of variants from Swiss-Prot (make sure featureClass 23 is variant)
hgsql -N uniProt > spVars.txt <<end
select feature.acc, start, end-start, featureType.val, featureId.val from
feature, featureType, accToTaxon, featureId where featureClass=23 and
featureType=featureType.id and accToTaxon.acc=feature.acc and taxon=9606 and
feature.featureId=featureId.id;
end
#need list mapping 3 letter amino acids to 1 letter. (aminoInfoDump from PSU)
#known gene protein map (kgProtMap) has psl data from (blastp)
# with qName being the spId
hgsql -N hg17 > kgProtMapDump.txt <<end
select kgProtMap.* from kgProtMap, uniProt.feature where kgProtMap.qName =
uniProt.feature.acc;
end
#table join duplicates; perl script to throw out extra before use
uniqueRows < kgProtMapDump.txt > kgProtMapUniq.txt
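# uniqueRows is a PSU-side script; an equivalent generic dedup that keeps
# the first occurrence of each row would be:
#   awk '!seen[$0]++' kgProtMapDump.txt > kgProtMapUniq.txt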
#check variables for output and input file names
computeSpVars > errors.txt
#errors.txt will list variants that couldn't be mapped
#July 18, 2006
#37 gaps, 564 proteins (2228 variants) not in kgProtMap (test one did align)
#found 22389
#Swiss-Prot attributes:
hgsql hg17 < listSPconnections.sql > listSPconnections.txt
hgsql proteome < listSpXref2.sql > listSpXref2.txt
convertOmimTitle > gvLinkSPomim.txt
hgsql hg17 < listGeneVals.sql > listGeneVals.txt
convertDisPoly > gvLinkSPuni.txt
cat gvLinkSPuni.txt gvLinkSPomim.txt > gvLinkSp.txt
cp gvLinkSp.txt ../../../gv/gvData/
#creating gv* tables and loading
#June 27, 2006
autoSql gv.as gv -dbLink
#edit indexes and string lengths in .sql file:
#  id=48, srcId=48, raKey=48, attrType=48
#  primary key=index on bin and attr and link ids (id, attrType for attrs)
#do enums
#add unique index, to prevent doubles:
#  UNIQUE KEY (chrom(12), chromStart, chromEnd, name)
#added id field to gvPos struct so can keep ID when change name
# char *id; /* Added field to hold ID if change name */
#set to null in gv.c file
#reload data July 2006 with more data and corrected Swiss-Prot data
#also moved gv*(except gvPos) and omimTitle to hgFixed
#prep data: concatenate all the gvPos data, sort
cat gvPosSP.txt gvPosHbVar.txt gvPosARdb.txt gvPosBgmut.txt gvPosCftr.txt \
gvPosPah.txt gvPosSrd5a2.txt gvPosBrca.txt > gvPosAll.txt
grep "^chr" gvPosAll.txt | sort -k1,1 -k2,2n > gvPosSortedHg17.bed
#load tables
hgLoadBed hg17 gvPos gvPosSortedHg17.bed -noSort -oldTable -tab
hgsql hg17 < gvSrc.sql
hgsql hg17
load data local infile "gvBrca.txt" into table gv;
load data local infile "gvAttrBrca.txt" into table gvAttr;
load data local infile "gvLinkBrca.txt" into table gvLink;
load data local infile "gvLinkSP.txt" into table gvLink;
load data local infile "gvLinkSPgene.txt" into table gvLink;
load data local infile "gvSP.txt" into table gv;
load data local infile "gvAttrSP.txt" into table gvAttr;
load data local infile "gvAttrLongSP.txt" into table gvAttrLong;
load data local infile "gvLinkHbVar.txt" into table gvLink;
load data local infile "gvHbVar.txt" into table gv;
load data local infile "gvAttrHbVar.txt" into table gvAttr;
load data local infile "gvARdb.txt" into table gv;
load data local infile "gvAttrARdb.txt" into table gvAttr;
load data local infile "gvBgmut.txt" into table gv;
load data local infile "gvAttrBgmut.txt" into table gvAttr;
load data local infile "gvAttrLongBgmut.txt" into table gvAttrLong;
load data local infile "gvLinkBgmut.txt" into table gvLink;
load data local infile "gvCftr.txt" into table gv;
load data local infile "gvAttrCftr.txt" into table gvAttr;
load data local infile "gvPah.txt" into table gv;
load data local infile "gvAttrPah.txt" into table gvAttr;
load data local infile "gvLinkPah.txt" into table gvLink;
load data local infile "gvSrd5a2.txt" into table gv;
load data local infile "gvAttrSrd5a2.txt" into table gvAttr;
load data local infile "gvAttrConservedDisease.txt" into table gvAttr;
#get disease association predictions for conserved variants
#get list a variants that are already done
hgsql -N hg17 > gvWithDiseaseStatus.txt <<end
select id from gvAttr where attrType = 'disease';
end
#use table browser to get variants that intersect most conserved track
#set conserved variants that are null to likely
computeDiseaseAssocCons > gvAttrConservedDisease.txt
#Belinda Giardine Sept 2006
#reload tables, removed ones with sequence mismatches, added label and strand
#added new lsbd BTKbase
#Sequence mismatches were determined by using the position in the reference
#sequence to fetch the sequence affected by the variant. Then for substitutions
#and deletions with the nts deleted listed, the sequence was compared.
#Insertions and large deletions could not be checked.
#Belinda Giardine Dec 2006
#reload tables, additions to previous sources and more IDbases
#details in hg18 doc
#Belinda Giardine Jan 2007
#reload tables, additions and corrections, details in hg18 doc
#############################################################
# Illumina Hap300 (Heather, July 2006)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir illumina
cd illumina
trim.pl < Illumina_HumanHap300_SNPlist_01.13.2006.txt > trim.out
hgsql hg17 < illuminaTmp.sql
hgsql -e "load data local infile 'trim.out' into table illuminaTmp" hg17
# illuminaLookup generates bin
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg17 illuminaTmp snp125 snp125Exceptions illuminaLookup.out illuminaLookup.err
# errors:
# unexpected chrom chr1 for snp rs1291584
# unexpected chrom chr17 for snp rs3826555
# unexpected locType between for snp rs2036773
# unexpected locType between for snp rs2249255
# unexpected locType between for snp rs8051412
# unexpected locType between for snp rs1017238
# unexpected locType between for snp rs5019493
# 16 with locType = range*
# 402 not found!
# None that have multiple alignments.
hgsql hg17 < snpArrayIllumina300.sql
hgsql -e "load data local infile 'illuminaLookup.out' into table snpArrayIllumina300" hg17
hgsql -e "alter table snpArrayIllumina300 add index name (name)" hg17
hgsql -e "alter table snpArrayIllumina300 add index chrom (chrom, bin)" hg17
#############################################################
# Illumina Hap550 and Hap650 (Heather, April 2007)
# Transfer from hg18 for Bert Gold at NCI
ssh hgwdev
cd /cluster/data/hg17/bed/illumina
hgsql hg18 < getHg18-550.sql > 550.hg18
hgsql hg18 < getHg18-650.sql > 650.hg18
# get name, chrom, chromStart, chromEnd, strand observed from snp125
# where class = "single" and locType = "exact" and chromEnd = chromStart + 1
# Including tri/quad allelic and multiple-aligning for now
hgsql hg17 < getHg17.sql > snp125single.hg17
# sort and join
sort 550.hg18 > 550.hg18.sort
sort 650.hg18 > 650.hg18.sort
sort snp125single.hg17 > snp125single.hg17.sort
join 550.hg18.sort snp125single.hg17.sort > 550.join
join 650.hg18.sort snp125single.hg17.sort > 650.join
join -v 1 550.hg18.sort snp125single.hg17.sort > 550.missing
join -v 1 650.hg18.sort snp125single.hg17.sort > 650.missing
# join matches on the first (sorted) field, the rs ID; -v 1 reports hg18
# array SNPs with no qualifying hg17 snp125 match
# 560704 lines in 550.join
# 660137 lines in 650.join
# 687 lines in 550.missing
# 706 lines in 650.missing
# fix column order
awk '{print $2, $3, $4, $1, 0, $5, $6}' 550.join > 550.bed
awk '{print $2, $3, $4, $1, 0, $5, $6}' 650.join > 650.bed
# load
hgLoadBed hg17 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql
hgLoadBed hg17 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql
# indices
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
#############################################################
# Affy 500K (Heather, September 2006)
# look up rsId using position
ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/500K
# awk to create bed format from tsv files
/bin/csh cmds.csh
hgsql hg17 < affy250Nsp.sql
hgsql hg17 < affy250Sty.sql
hgsql -e "load data local infile 'Mapping250K_Nsp.bed' into table affy250Nsp" hg17
hgsql -e "load data local infile 'Mapping250K_Sty.bed' into table affy250Sty" hg17
# look up dbSNP rsIDs using position
# affy250Nsp
# 4311 missing, 7276 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Nsp snp125
mv affyLookup.out affy250Nsp.bed
mv affyLookup.err affy250Nsp.err
hgsql hg17 < snpArrayAffy250Nsp.sql
hgLoadBed hg17 snpArrayAffy250Nsp affy250Nsp.bed -sqlTable=snpArrayAffy250Nsp.sql -tab
hgsql -e "alter table snpArrayAffy250Nsp add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Nsp add index chrom (chrom, bin)" hg17
# affy250Sty
# 3540 missing, 6901 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Sty snp125
mv affyLookup.out affy250Sty.bed
mv affyLookup.err affy250Sty.err
hgsql hg17 < snpArrayAffy250Sty.sql
hgLoadBed hg17 snpArrayAffy250Sty affy250Sty.bed -sqlTable=snpArrayAffy250Sty.sql -tab
hgsql -e "alter table snpArrayAffy250Sty add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Sty add index chrom (chrom, bin)" hg17
#############################################################
# Affy 10K (Sept. 2006, Heather)
# look up rsId using position
ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/10K100Kagain
# affy10
# 14 missing, 807 multiple
cp affy10K.txt affy10Temp.bed
hgLoadBed hg17 affy10Temp affy10Temp.bed -sqlTable=affy10Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10Temp snp125
mv affyLookup.out affy10.bed
mv affyLookup.err affy10.err
hgLoadBed hg17 snpArrayAffy10 affy10.bed -sqlTable=snpArrayAffy10.sql -tab
# affy10v2
# 12 missing, 716 multiple
hgLoadBed hg17 affy10v2Temp affy10v2Temp.bed -sqlTable=affy10v2Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10v2Temp snp125
mv affyLookup.out affy10v2.bed
mv affyLookup.err affy10v2.err
hgLoadBed hg17 snpArrayAffy10v2 affy10v2.bed -sqlTable=snpArrayAffy10v2.sql -tab
# affy50HindIII
# 156 missing, 1396 multiple
hgLoadBed hg17 affy50HindIIITemp affy50HindIII.bed -sqlTable=affy50HindIIITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50HindIIITemp snp125
mv affyLookup.out affy50HindIII.bed
mv affyLookup.err affy50HindIII.err
hgLoadBed hg17 snpArrayAffy50HindIII affy50HindIII.bed -sqlTable=snpArrayAffy50HindIII.sql -tab
hgsql -e "alter table snpArrayAffy50HindIII add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50HindIII add index chrom (chrom, bin)" hg17
# affy50XbaI
# 115 missing, 1745 multiple
hgLoadBed hg17 affy50XbaITemp affy50XbaI.bed -sqlTable=affy50XbaITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50XbaITemp snp125
mv affyLookup.out affy50XbaI.bed
mv affyLookup.err affy50XbaI.err
hgLoadBed hg17 snpArrayAffy50XbaI affy50XbaI.bed -sqlTable=snpArrayAffy50XbaI.sql -tab
hgsql -e "alter table snpArrayAffy50XbaI add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50XbaI add index chrom (chrom, bin)" hg17
#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-14 - Hiram)
# download data from "James Taylor" <james@bx.psu.edu>
ssh kkstore02
mkdir /cluster/store11/hg17/bed/regPotential7X
cd /cluster/data/hg17/bed
ln -s /cluster/store11/hg17/bed/regPotential7X ./regPotential7X
cd regPotential7X
# This is a lot of data
time for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
done
# real 115m1.855s
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/trackDb.html" -O description.html
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
bzcat chr${C}.scores.truncated.bz2
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit -0.00
# real 33m48.487s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential7X
ln -s /cluster/data/hg17/bed/regPotential7X/regPotential7X.wib \
/gbdb/hg17/wib/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time hgLoadWiggle -tmpDir=/scratch/tmp \
hg17 regPotential7X regPotential7X.wig
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg17/bed/regPotential7X
time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
-hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
# real 2m48.810s
# 73 % of the data values are zero
# create download gzip files from the bz2 files:
for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg17.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz
echo
done
#########################################################################
####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 06/21/06 Fan) ##############
# DELETED RECORD FROM rgdQtlLink SO CONSISTENT WITH REMOVAL FROM rgdQtl
# (DONE, 2006-06-30, hartera)
ssh hgwdev
mkdir -p /cluster/store8/rgd/human12062005
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human12062005 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/rgd_human_qtl_12062005.gff
# remove extra line feed character at the end of lines
rmLf rgd_human_qtl_12062005.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' |sort -u > rgdQtl.tab
# create rgdQtlLink.tab
cat rgdQtl.gff |cut -f 9 |sed -e 's/; Note /\t/g'|\
sed -e 's/Alignment //' |sed -e 's/;Note /\t/' |\
sed -e 's/"//g' |sed -e 's/RGD://' >j.tmp
cut -f 2 j.tmp >j.1
cut -f 1,3 j.tmp >j.2
paste j.1 j.2 |sort -u >rgdQtlLink.tab
rm j.1 j.2 j.tmp
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# Got the following error messages:
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end < start.
hgsql hg17 -N -e 'select "do1", name, c.size from rgdQtl r, chromInfo c where chromEnd > c.size and r.chrom=c.chrom' >doall
cat << '_EOF_' > do1
hgsql hg17 -e "update rgdQtl set chromEnd = '${2}' where name='${1}'"
'_EOF_'
chmod +x do*
./doall
checkTableCoords hg17 rgdQtl
#hg17.rgdQtl has 1 records with end < start.
hgsql hg17 -e 'select * from rgdQtl where chromEnd < chromStart'
# bin chrom chromStart chromEnd name
# 9 chr10 7135612 371019 BW63_H
# 0 chr20 77628133 5242324 AASTH39_H
# Don't know why checkTableCoords only catches one of the two errors.
hgsql hg17 -e "update rgdQtl set chromStart = 271019 where name='BW63_H'"
hgsql hg17 -e "update rgdQtl set chromEnd = 7135612 where name='BW63_H'"
# Delete the following record. The RGD QTL is very questionable.
hgsql hg17 -e "delete from rgdQtl where name='AASTH39_H'"
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# Delete the record from rgdQtlLink table that was removed from the rgdQtl
# table above. (hartera, 2006-06-30)
hgsql hg17 -e "delete from rgdQtlLink where name='AASTH39_H'"
########################################################################
#########################################################################
#Reload omimTitle table Belinda Giardine June 28, 2006
#fetched omim.txt.Z from OMIM downloads.
#parse out title lines (*FIELD* TI)
zcat omim.txt.Z > omim.txt
convertTitle < omim.txt > omimTitle.txt
#load into omimTitle table (from within hgsql):
truncate table omimTitle;
load data local infile "omimTitle.txt" into table omimTitle;
#############################################################
# Lift SV track from hg16 (Heather, July 2006)
# hg16 SV track is comprised of 7 subtracks:
# cnpFosmid, cnpSebat, cnpIafrate, cnpSharp, delConrad, delMccarroll, delHinds
# Use the same table formats as hg16; pre-create
# (No bin for del tables)
cd /cluster/data/hg17/bed
mkdir svMixed
cd svMixed
# I got hg17 coords from Andy Sharp for cnpFosmid and delHinds
trimFosmid.pl < cnpFosmid.txt > cnpFosmid.bed
hgLoadBed -tab hg17 cnpFosmid cnpFosmid.bed
hinds.pl < hinds.txt > delHinds.bed
hgLoadBed -tab -noBin hg17 delHinds delHinds.bed
# (7-27-2006 Brooke Rhead -- edited the cnpFosmid table)
# According to Andy Sharp, the name='Gap' items should be removed from
# cnpFosmid. I dumped the table, removed the 'Gap' lines, then dumped the
# table again.
cd /cluster/data/hg17/bed/svMixed
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withGaps.bed
hgsql hg17
delete from cnpFosmid where name='Gap';
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withoutGaps.bed
# Simple lifts for delMccarroll
cat << '_EOF_' > liftMccarroll.csh
#!/bin/csh
hgsql -N -e 'select * from delMccarroll' hg16 > delMccarroll.hg16
liftOver -minMatch=0.7 delMccarroll.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delMccarroll.bed delMccarroll.err
hgLoadBed -sqlTable=delMccarroll.sql -tab -noBin hg17 delMccarroll delMccarroll.bed
'_EOF_'
csh liftMccarroll.csh
# Lift both chromStart/chromEnd and thickStart/thickEnd for delConrad and join
cat << '_EOF_' > liftConrad.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.1
hgsql -N -e 'select chrom, thickStart, thickEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.2
liftOver -minMatch=0.7 delConrad.hg16.1 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.1 delConrad.err.1
liftOver -minMatch=0.7 delConrad.hg16.2 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.2 delConrad.err.2
trimConrad.pl < delConrad.tmp.1 > delConrad.trim.1
trimConrad.pl < delConrad.tmp.2 > delConrad.trim.2
sort delConrad.trim.1 > delConrad.sort.1
sort delConrad.trim.2 > delConrad.sort.2
join delConrad.sort.1 delConrad.sort.2 > delConrad.join
awk '{print $2, $3, $4, $1, 1000, $5, $7, $8}' delConrad.join > delConrad.bed
hgLoadBed -sqlTable=delConrad.sql -noBin hg17 delConrad delConrad.bed
'_EOF_'
csh liftConrad.csh
# Andy Sharp says the Sebat data has already been lifted, so be conservative here
# Create hg16.cnpSebatLiftCandidate that excludes 5 rows that had wild proliferations
cat << '_EOF_' > liftSebat.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, probes, individuals from cnpSebatLiftCandidate' hg16 > cnpSebat.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpSebat.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSebat.bed cnpSebat.err
hgLoadBed -sqlTable=cnpSebat.sql -tab hg17 cnpSebat cnpSebat.bed
'_EOF_'
csh liftSebat.csh
# For Andy's data, use bacEndPairs first, then lift the remainder
cat << '_EOF_' > liftSharp.csh
#!/bin/csh
# assumes a copy of hg16.cnpSharp in hg17.cnpSharpHg16Copy
/cluster/home/heather/kent/src/hg/snp/snpLoad/cnpLookup hg17 bacEndPairs cnpSharpHg16Copy cnpSharpLookup.out cnpSharpLookup.lift cnpSharpLookup.log
sed -e 's/Gain and Loss/GainAndLoss/' cnpSharpLookup.lift > cnpSharpLookup.lift.fix
mv cnpSharpLookup.lift.fix cnpSharpLookup.lift
liftOver -minMatch=0.7 -bedPlus=4 cnpSharpLookup.lift /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSharp.bed cnpSharp.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpSharp.bed > cnpSharp.bed.fix
mv cnpSharp.bed.fix cnpSharp.bed
hgLoadBed -tab -sqlTable=cnpSharp.sql hg17 cnpSharp cnpSharpLookup.out
hgLoadBed -tab -oldTable hg17 cnpSharp cnpSharp.bed
'_EOF_'
csh liftSharp.csh
# For the Iafrate data, the BAC End lookup wasn't good, so just lift
# Create hg16.cnpIafrateLiftCandidate that excludes 2 rows that had wild proliferations
cat << '_EOF_' > liftIafrate.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, variationType, score from cnpIafrateLiftCandidate' hg16 > cnpIafrate.hg16
sed -e 's/Gain and Loss/GainAndLoss/' cnpIafrate.hg16 > cnpIafrate.hg16.fix
mv cnpIafrate.hg16.fix cnpIafrate.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpIafrate.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpIafrate.bed cnpIafrate.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpIafrate.bed > cnpIafrate.bed.fix
mv cnpIafrate.bed.fix cnpIafrate.bed
hgLoadBed -sqlTable=cnpIafrate.sql -tab hg17 cnpIafrate cnpIafrate.bed
'_EOF_'
csh liftIafrate.csh
##############################################################################
# Add HapMap CNVRs from Matt Hurles (Heather Dec 2006)
ssh hgwdev
cd /cluster/data/hg17/bed/svRedon
# File from Matthew Hurles (meh@sanger.ac.uk) was essentially bed 4
# I decided to use bed 6 with score always 0 and strand always +
awk '{printf "%st%d\t%d\tcnp%s\t0\t%s\n", $1, $4, $5, $3, $7}' input.gff > input.bed
hgLoadBed hg17 cnpRedon input.bed
##############################################################################
# dbRIP POLYALUL1SVA track added (2006-07-14 - DONE - Hiram)
# dbRIP polyAluL1SVA
# Data provider: Dr. Liang at the Liang lab:
# http://falcon.roswellpark.org/index.html
# Ping.Liang@roswellpark.org
# Adding this track is a new data type into our browser.
# data definitions for dbRIP and polyGenotype were added to
# the hg/lib/ directory:
# -rw-rw-r-- 1 351 Jul 13 12:20 polyGenotype.as
# -rw-rw-r-- 1 694 Jul 13 12:22 polyGenotype.sql
# -rw-rw-r-- 1 6398 Jul 13 12:22 polyGenotype.c
# -rw-rw-r-- 1 980 Jul 10 17:59 dbRIP.as
# -rw-rw-r-- 1 11408 Jul 13 11:16 dbRIP.c
# -rw-rw-r-- 1 1578 Jul 13 12:06 dbRIP.sql
# With associated .h files in hg/inc/
# -rw-rw-r-- 1 4600 Jul 10 18:00 dbRIP.h
# -rw-rw-r-- 1 4375 Jul 13 16:16 polyGenotype.h
# Changes in hgTracks and hgc to make this track appear as it does
# at their browser:
# http://falcon.roswellpark.org:9090/cgi-bin/hgTables
# For this first instance of the track, the data was obtained
# directly from their Genome browser via the tables browser,
# dumping the tables:
# hg17.polyAluL1 and hg17.polyGenotype
# saving these data dumps to:
# (after a couple of versions were used ...)
ssh hgwdev
mkdir /cluster/data/hg17/bed/dbRIP
cd /cluster/data/hg17/bed/dbRIP
# -rw-rw-r-- 1 994485 Aug 1 16:03 dbRIP.2006-08-01.txt.gz
# -rw-rw-r-- 1 18532 Aug 1 16:05 polyGenotype.2006-08-01.txt.gz
# Rearrange their data columns to more closely match the
# standard BED definitions, and split into three different
# data sets:
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_SVA_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.SVA.txt
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_L1_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.L1.txt
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_Alu_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.Alu.txt
# Create three specific sql table create definitions:
sed -e "s/dbRIP/dbRIP_SVA/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_SVA.sql
sed -e "s/dbRIP/dbRIP_L1/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_L1.sql
sed -e "s/dbRIP/dbRIP_Alu/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_Alu.sql
# And loading those three data tables:
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
-sqlTable=dbRIP_SVA.sql hg17 dbRIP_SVA dbRIP.SVA.txt
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
-sqlTable=dbRIP_L1.sql hg17 dbRIP_L1 dbRIP.L1.txt
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
-sqlTable=dbRIP_Alu.sql hg17 dbRIP_Alu dbRIP.Alu.txt
# And an associated table of genotype frequencies
# Add three extra rows to the original data to provide a better handle
# on MySQL lookups for allele Frequency
hgsql hg17 -e "drop table polyGenotype;"
hgsql hg17 < $HOME/kent/src/hg/lib/polyGenotype.sql
zcat polyGenotype.2006-08-01.txt.gz | headRest 1 stdin | \
awk -F'\t' '
{
sampleSize = $3 + $4 + $5
plus = ($3 * 2) + $4
minus = ($5 * 2) + $4
if ((plus + minus) < 1) { alleleFreq=0 } else
{ alleleFreq = plus / (plus + minus) }
if (sampleSize > 0) {
heteroZyg = (2 * alleleFreq * (1.0 - alleleFreq)) * ((sampleSize * 2)/((sampleSize * 2) - 1))
} else {
heteroZyg = 2 * alleleFreq * (1.0 - alleleFreq)
}
printf "%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\n", $1, $2, $3, $4, $5, sampleSize, alleleFreq, heteroZyg
}
' > polyGenotype.txt
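# Worked example of the genotype math above (hypothetical counts):
# 40 (+/+), 10 (+/-), 50 (-/-) gives sampleSize = 100, plus = 90,
# minus = 110, alleleFreq = 90/200 = 0.450, and
# heteroZyg = 2 * 0.45 * 0.55 * (200/199) = 0.497; the (2n/(2n-1)) factor
# is the unbiased-estimator correction for sample size.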
hgsql hg17 -e \
'load data local infile "polyGenotype.txt" into table polyGenotype;'
# A composite track was added to human/hg17/trackDb.ra to contain
# these three tracks, and search methods to get the name column
# participating in the search. Need to figure out how to get some
# of the other text-rich columns participating in the search.
##############################################################################
# hg17 -> hg15 LIFTOVER CHAINS (DONE 7/27/06 Fan)
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg15. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg15 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# Split up hg15
ssh pk
cd /san/sanVol1/scratch/hg15
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg15/?{,?}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
# cd /san/sanVol1/scratch
# mkdir -p hg17
# Copy 11.ooc files to hg17 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg17
## First, copy over scripts. (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/andy/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATING
# HG17
ssh kk
cd /cluster/data/hg17
#makeLoChain-align hg17 /scratch/hg/hg17/nib hg15 /san/sanVol1/scratch/hg15/biggerSplits/split
makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg15 /san/sanVol1/scratch/hg15/liftOver/biggerSplits/split
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs: 25651277s 427521.28m 7125.35h 296.89d 0.813 y
# IO & Wait Time: 74118s 1235.30m 20.59h 0.86d 0.002 y
# Average job time: 10755s 179.25m 2.99h 0.12d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 82545s 1375.75m 22.93h 0.96d
# Submission to last job: 82579s 1376.32m 22.94h 0.96d
ssh kkstore02
cd /cluster/data/hg17
cd bed
mv blat.hg15.2006-07-25 /san/sanVol1/scratch/hg17
ssh pk
cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg15"}' > newspec
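# e.g. an original spec line of the form
#   blat $(nib) $(fa) {check out line+ psl/$(out).psl} <blat flags>
# becomes
#   /san/sanVol1/scratch/fan/blat.sh $(nib) $(fa) {check out line+ psl/$(out).psl} hg17ToHg15
# (illustrative; blat.sh is the pre-existing wrapper script copied above)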
para create newspec
para try
para push
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs: 612316s 10205.26m 170.09h 7.09d 0.019 y
# IO & Wait Time: 12421s 207.02m 3.45h 0.14d 0.000 y
# Average job time: 261s 4.35m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3524s 58.73m 0.98h 0.04d
# Submission to last job: 3588s 59.80m 1.00h 0.04d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg15SplitLift.sh
# change andy to fan, mm7 to hg15, and chrX to chr2, and remove chrUn_random
vi hg15SplitLift.sh
cat << 'EOF' > hg15ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg15Lifts
pushd /scratch/fan/hg15Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg15ChainMergeSplit.sh
# HG17
cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/raw
/san/sanVol1/scratch/fan/hg15SplitLift.sh
# There was an extra file, nib22.fa, under /cluster/data/hg15/nib, which should not be there.
# -rw-rw-r-- 1 2429 protein 50466533 May 20 2003 nib22.fa
# This caused hg15SplitLift.sh to end abnormally.
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg15/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs: 3596s 59.94m 1.00h 0.04d 0.000 y
# IO & Wait Time: 919s 15.31m 0.26h 0.01d 0.000 y
# Average job time: 103s 1.71m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 274s 4.57m 0.08h 0.00d
# Submission to last job: 284s 4.73m 0.08h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg15Lifts
cd /scratch/fan/hg15Lifts
cp -r /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/
rm -rf chain
rm -rf chainRaw
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG15=/cluster/data/hg15/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG15 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 438s 7.30m 0.12h 0.01d 0.000 y
# IO & Wait Time: 118s 1.97m 0.03h 0.00d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 28s 0.47m 0.01h 0.00d
# Submission to last job: 67s 1.12m 0.02h 0.00d
########## FINISHING
ssh hgwdev
# HG17
cd /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/over
cat * >> ../hg17ToHg15.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg15.2006-07-25/ /cluster/data/hg17/bed
cd /cluster/data/hg17/bed
ln -s blat.hg15.2006-07-25 blat.hg15
ln -s `pwd`/blat.hg15/hg17ToHg15.over.chain liftOver/hg17ToHg15.over.chain
ln -s `pwd`/liftOver/hg17ToHg15.over.chain /gbdb/hg17/liftOver/hg17ToHg15.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
gzip /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain
ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /gbdb/hg17/liftOver/
cp -p /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /cluster/data/hg17/bed/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz hg17ToHg15.over.chain.gz
hgAddLiftOverChain hg17 hg15
############################################################################
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-08-15 markd)
cd /cluster/data/genbank/data/ccds/hg17
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
ftp> get CCDS.20060815.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/hg17/CCDS.20060815.tar.gz
# import ccds database tables
hgsql -e 'create database ccds'
hgsql ccds </cluster/data/genbank/etc/createTables.sql
hgsql ccds </cluster/data/genbank/etc/createKeys.sql
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
checkTableCoords hg17 -verbose=2 ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
rm -rf /scratch/tmp/ccds
# 2006-08-23 - found bug with some source genes missing from ccdsInfo, fixed ccdsMkTables
# and reload
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
##########################################################################
# hars 1 to 202 Sol 09/10/2006
set bedDir = /gbdb/hg17/haseq/bed
mkdir -p $bedDir/hars
pushd /projects/hg/wet/Sol/hars1to49
cp -p hars_1to202.hg17.bed $bedDir/hars/hars_1to202.bed
hgLoadBed hg17 hars $bedDir/hars/hars_1to202.bed
rm -f $bedDir/hars/hars_1to202.bed
popd
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06)
# First, build hprdToCdna.tab and hprdToUniProt.tab.
# See hg18.txt for details.
cd ~/data/hprd
mkdir hg17
cd hg17
hgsql hg17 -e 'drop table hprdToCdna'
hgsql hg17 <~/src/hg/lib/hprdToCdna.sql
hgsql hg17 -e 'load data local infile "../hprdToCdna.tab" into table hprdToCdna'
hgsql hg17 -e 'drop table hprdToUniProt'
hgsql hg17 <~/src/hg/lib/hprdToUniProt.sql
hgsql hg17 -e 'load data local infile "../hprdToUniProt.tab" into table hprdToUniProt'
# build knownToHprd table
hgsql hg17 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1
hgsql hg17 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
cat j.kg1 j.kg2 |sort -u >knownToHprd.tab
wc knownToHprd.tab
hgsql hg17 -e 'drop table knownToHprd'
hgsql hg17 <~/src/hg/lib/knownToHprd.sql
hgsql hg17 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg17 -e 'select count(*) from knownToHprd'
# 19,345 records created.
# remove temporary files.
rm j*
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-09-20 markd)
# Reloaded due to bug that results in multiple versions of the same accession
# in the ccdsInfo table.
cd /cluster/data/genbank/data/ccds/hg17
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20060920.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/hg17/CCDS.20060920.tar.gz
# import ccds database tables
hgsql -e 'drop database ccds; create database ccds'
hgsql ccds </cluster/data/genbank/etc/createTables.sql
hgsql ccds </cluster/data/genbank/etc/createKeys.sql
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
rm -rf /scratch/tmp/ccds
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
#########################################################
# BUILD GAD TRACK (Done, 9/26/06, Fan)
mkdir /cluster/store/gad060926
ln -s /cluster/store/gad060926 /cluster/data/gad
# Receive "allxlsAStxt.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov
hgsql hg17 -e 'drop table gadAll'
hgsql hg17 <~/src/hg/lib/gadAll.sql
hgsql hg17 -e 'load data local infile "allxlsAStxt.txt" into table gadAll ignore 2 lines'
hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg17 -N -e \
'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed
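# (the -N select emits a literal "chr" column followed by a tab; the sed
# glues that prefix onto the chromosome number, and the greps drop
# malformed chromosome values)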
hgLoadBed hg17 gad gad.bed
#####################################################################
# YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED
# USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-20, hartera)
# Data is from the paper: Bertone et al. Science 24 December 2004:
# Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale.
# Contact at Yale: Joel S. Rozowsky, joel.rozowsky@yale.edu
# The data consist of Transcriptionally Active Regions (TARs or TransFrags)
# found using Affymetrix genome tiling arrays. The data is from the lab
# of Mark Gerstein at Yale.
ssh kkstore02
mkdir /cluster/data/hg17/bed/yaleBertoneTars/
cd /cluster/data/hg17/bed/yaleBertoneTars/
# download Bertone et al. data from this URL:
#http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt
# and put it in this directory.
# The sequences used to design the microarrays were from
# UCSC hg13/NCBI Build 31 so the sequences
# should be aligned again using Blat since this is probably better
# than using liftOver across so many assemblies.
# Get sequences from TARs file and put in FASTA format:
# Remove Windows line-ending characters:
dos2unix TAR_data_NCBI31.txt
# The TARs are in order of IDs in the file so the first TAR has ID 1, the
# second is 2 up to the last which is 17517. These IDs are used to link
# to the DART database of TARs at Yale so use these IDs in the FASTA
# header lines. Need to add "TAR" as prefix to ID so that it is unique
# in the seq table.
awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \
TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa
ssh pk
mkdir -p /san/sanvol1/scratch/hg17/TARs/
cp /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/san/sanvol1/scratch/hg17/TARs/
# Set up to Blat the TAR sequences against hg17
cd /cluster/data/hg17/bed/yaleBertoneTars
ls -1 /san/sanvol1/scratch/hg17/TARs/yaleBertoneTARSeqs.fa > tars.lst
ls -1 /san/sanvol1/scratch/hg17/nib/*.nib > genome.lst
# output dir
mkdir psl
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg17/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 genome.lst tars.lst template.sub para.spec
para create para.spec
para try, para check, para push ...
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 429s 7.16m 0.12h 0.00d 0.000 y
# IO & Wait Time: 153s 2.54m 0.04h 0.00d 0.000 y
# Average job time: 13s 0.21m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 38s 0.63m 0.01h 0.00d
# Submission to last job: 107s 1.78m 0.03h 0.00d
# sort and then filter
pslSort dirs raw.psl tmp psl
# use these parameters as for Genbank alignments of native mRNAs
# for finished assemblies.
pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \
-minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \
raw.psl yaleBertoneTars.psl
# seqs aligns
# total: 17512 37530
# drop minNonRepSize: 121 254
# drop minIdent: 3827 14532
# drop minCover: 571 897
# weird over: 232 837
# kept weird: 197 201
# drop localBest: 2359 3896
# kept: 17498 17951
# 99.9% were kept.
# check how many aligned
grep '>' yaleBertoneTARSeqs.fa | wc -l
# 17517
# 99.89% of the original set of sequences are in this filtered PSL file.
pslCheck yaleBertoneTars.psl
# psl is ok
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/yaleBertoneTars
hgLoadPsl hg17 yaleBertoneTars.psl
# Add sequences to /gbdb/hg17 and to seq and extFile tables.
mkdir -p /gbdb/hg17/yaleTARs/
ln -s /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/gbdb/hg17/yaleTARs/
hgLoadSeq hg17 /gbdb/hg17/yaleTARs/yaleBertoneTARSeqs.fa
# trackDb.ra entry is in trackDb/human/trackDb.ra and
# a description exist already as this track is also on hg18.
######################################################################
## reload tfbsCons table - it was based on a newer version of tfbs names that
# are not yet public domain (2006-11-03 - Hiram)
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons
cp -p /cluster/store6/weirauch/TFLOC/hg17/tfbsConsSites.bed .
hgLoadBed -strict hg17 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
tfbsConsSites.bed -tab
# this leads to a bunch of extra names in Factors
hgsql -N -e "select name from tfbsConsSites;" hg17 | sort -u > names.new
hgsql -N -e "select name from tfbsConsFactors;" hg17 \
| sort -u > names.factors
comm -13 names.new names.factors > names.extra.factors
for N in `cat names.extra.factors`
do
echo "delete from tfbsConsFactors where name=\"${N}\";" hg17
hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg17
done
#########################################################
# BUILD GAD TRACK (Re-Re-Done, 12/12/06, Fan)
mkdir /cluster/store12/gad061211
rm /cluster/data/gad
ln -s /cluster/store12/gad061211 /cluster/data/gad
# Receive "GAD-Hg17DATA.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov
hgsql hg17 -e 'drop table gadAll'
hgsql hg17 <~/src/hg/lib/gadAll.sql
hgsql hg17 -e 'load data local infile "GAD-Hg17DATA.txt" into table gadAll ignore 1 lines'
hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg17 -N -e \
'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll
where chromStart <>0 and chromosome<>""'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed
hgLoadBed hg17 gad gad.bed
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11)
#
# Background: The xxBlastTab tables are made with a simple blastall
# (blastp with -b 1) which chooses the best match. Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage.
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking $db.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly. Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# I created a new utility script called synBlastp.csh since I have to do this
# several times.
#
# we want to update hg17 for rat and mouse,
# so check ./hgGeneData/Human/hg17/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh hg17 rn3
#hg17.rnBlastTab results:
#new number of unique query values:
#10728
#new number of unique target values
#5177
#old number of unique query values:
#24030
#old number of unique target values
#5535
synBlastp.csh hg17 mm7
#new number of unique query values:
#25506
#new number of unique target values
#13462
#old number of unique query values:
#32951
#old number of unique target values
#14803
#####################################################################
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg17
############
# UPDATE hg17 knownToVisiGene (DONE galt 2007-02-15)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
#########################################################
# Chimp Paralogy data from Eichlers lab (DONE Heather Feb. 2007)
cd /cluster/data/hg17/bed/eichler
hgLoadBed hg17 chimpParalogy chimpParalogy.bed -tab -sqlTable=chimpParalogy.sql
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
cd /cluster/data/genbank/data/ccds/
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20070228.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/CCDS.20070228.tar.gz
# import ccds database tables
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg17 ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# check in browser
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
# << emacs
#####################################################
# Vista Enhancers (galt 2007-02-23 done)
#
# Vista from Lawrence-Berkeley has assayed
# 301 human conserved non-coding intra- and inter-
# genic elements for their ability to promote
# lacZ expression in mouse embryos. A positive looks like
# a mouse with a purple spine.
#
# They provided a custom track with two tracks for pos and neg.
# http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi
# I am combining the tracks into one with high score for pos.
#
cd /cluster/data/hg17/bed
mkdir vistaEnhancers
cd vistaEnhancers
wget -O custTrk "http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi"
cat custTrk | head -116 | tail +2 > pos
cat custTrk | tail +118 > neg
cat pos | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t900"}' > bed5
cat neg | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t200"}' >> bed5
wc -l bed5
#301 bed5
hgLoadBed hg17 vistaEnhancers bed5
#Loaded 301 elements of size 5
# add to human/trackDb.ra
track vistaEnhancers
shortLabel Vista Enhancers
longLabel Vista HMR-Conserved Non-coding Human Enhancers from LBNL
group regulation
priority 93
visibility hide
color 50,70,120
type bed 5 .
useScore 1
url http://enhancer-test.lbl.gov/cgi-bin/imagedb.pl?form=presentation&show=1&experiment_id=$$
###
# UPDATES (2007-10-18, conodera)
# see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile
cd /projects/compbiousr/wet/browser/vista_enhancer/
# download data file from the vista browser (coordinates are for hg17)
# http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# save as enhancerbrowser.datadownload.txt
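# a scripted fetch along those lines (a sketch; the file can also be
# saved by hand from the browser page):
wget -O enhancerbrowser.datadownload.txt \
 'http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1'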
# give elements with positive label a score of 900,
# give elements with negative label a score of 200.
# print to 5-field bed file
vista_enhancer.hg17.txt: enhancerbrowser.datadownload.txt
grep ">" $< \
| sed -e 's/>//' \
| tr :- ' ' \
| sed -e 's/positive/900/'\
| sed -e 's/negative/200/' \
| awk '{print $$1"\t"$$2"\t"$$3"\telement_"$$6"\t"$$8}' \
> $@; \
hgLoadBed hg17 vistaEnhancers vista_enhancer.hg17.txt;
# loaded 446 elements of length 5
#########################################################################
# EPONINE-TSS (TRANSCRIPTION START SITE) PREDICTION
# (DONE, 2007-03-08, hartera)
# The Eponine software is version 2 and has not changed in several years
# (contact: Thomas Down at Sanger, td2@sanger.ac.uk). The version downloaded
# for hg16 should be the same as the current version but download again just
# to check. The application includes the TSS model file: eponine-tss2.xml
ssh kkstore02
# Eponine runs fine on a 2.5Mb contig, but barfs on much larger contigs;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
mkdir /san/sanvol1/scratch/hg17/chunks
cd /cluster/data/hg17
foreach f (?{,?}/NT_*/NT_??????.fa)
set ctg = $f:t:r
/cluster/bin/x86_64/faSplit -minGapSize=10 \
-lift=/san/sanvol1/scratch/hg17/chunks/${ctg}.lft \
gap $f 2500000 /san/sanvol1/scratch/hg17/chunks/${ctg}.chunk
end
# seems to ignore the chunk part of the file name
mkdir /cluster/data/hg17/bed/eponine
cd /cluster/data/hg17/bed/eponine
wget --timestamping \
http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
# file has the same date and same size as the one downloaded for hg16
# the script requires all of the path settings found in my .tcshrc file.
# Using only set path = (/usr/java/jre1.5.0_06/bin $path)
# as in the doEpo file for hg16 does not work.
cat << '_EOF_' > doEpo
#!/bin/csh -ef
set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \
/usr/local/bin . /cluster/home/hartera/bin/x86_64 \
/cluster/bin/x86_64 /projects/compbio/bin/x86_64 \
/projects/compbio/bin /projects/compbio/bin/x86_64-linux \
/cluster/bin/scripts)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
# << emacs
chmod a+x doEpo
cp /dev/null jobList
foreach f (/san/sanvol1/scratch/hg17/chunks/NT*.fa)
echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
>> jobList
end
mkdir out
ssh pk
cd /cluster/data/hg17/bed/eponine
/parasol/bin/para create jobList
/parasol/bin/para try, check, push, check etc.....
/parasol/bin/para time
# Completed: 1415 of 1415 jobs
# CPU time in finished jobs: 104501s 1741.68m 29.03h 1.21d 0.003 y
# IO & Wait Time: 6594s 109.91m 1.83h 0.08d 0.000 y
# Average job time: 79s 1.31m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 127s 2.12m 0.04h 0.00d
# Submission to last job: 488s 8.13m 0.14h 0.01d
# lift chunks -> contigs
mkdir contigs/
foreach l (/san/sanvol1/scratch/hg17/chunks/*.lft)
set ctg = $l:t:r
liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff /cluster/data/hg17/jkStuff/liftAll.lft \
warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
{printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
i = i + 1;}' \
eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg17/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
$HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg17 eponine eponine.bed -tab -sqlTable=eponine.sql
# Loaded 61013 elements of size 6
# trackDb.ra entry and eponine.html already exist in trackDb directory.
###########################################################################
# ACEScan Track (DONE 2007-03-15 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir acescan
cd acescan/
cp ~/acescan.gff .
tail +2 acescan.gff > acescan.nh.gff
ldHgGene -out=gp hg17 acescan acescan.nh.gff
rm *.gff
ldHgGene -predTab hg17 acescan acescan.hg17.gp
###########################################################################
# augustusHints track (DONE 2007-4-5 Mario)
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa
ldHgGene -bin hg17 augustusHints augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
hgPepPred hg17 generic augustusHintsPep augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa
###########################################################################
# augustus de novo track (DONE 2007-4-5 Mario)
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.gff
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.pep.aa
ldHgGene -bin hg17 augustusXRA augustus.hg17.Xp.RA.it.gff
hgPepPred hg17 generic augustusXRAPep augustus.hg17.Xp.RA.it.pep.aa
###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg17/bed/switchDbTss
cd /cluster/data/hg17/bed/switchDbTss
# (obtained from Nathan Trinklein <nathant@switchgeargenomics.com>)
cp ~/all_tss_switchdb_psgene.gz .
gunzip all_tss_switchdb_psgene.gz
cat << "EOF" > reformat.awk
BEGIN{FS="\t"}
{
if (NR > 1)
{
if ($9 !~ "^PSEUDO.*")
{
pseudo = "none";
}
else
{
pseudo = $9;
}
printf("%s\t%d\t%d\t%s\t1000\t%s\t%s\t%s\t%s\t%s\t%s\n", $2, $8, $8+1, $6, $5, $7, $1, $3, $4, pseudo);
}
}
EOF
awk -f reformat.awk all_tss_switchdb_psgene > switchDbTss.bed
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg17 switchDbTss switchDbTss.bed
############################################################################
# enable ORFeome track build. (markd 2007-05-02)
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add
hg17.orfeomeTables.hgwdev = yes
hg17.orfeomeTables.hgwbeta = yes
# will need to enable for rr later. In the future, this can just be enabled
# as part of the normal genbank build. Change above to:
hg17.orfeomeTables.default = yes
###########################################################################
# Transcriptome Phase 3 tracks (Andy 2007-06-10)
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy
mkdir transcriptome
cd transcriptome/
cp /var/ftp/encode/Affy_transcriptome_phase3.tar .
tar xfv Affy_transcriptome_phase3.tar
find . -name '*.bz2' -exec bunzip2 '{}' \;
cat > processWig.sh << "EOF"
#!/bin/bash
theDir=`dirname $1`;
theFile=`basename $1`;
table=affyTxnPhase3${theFile%.sig.wig};
tmp=/scratch/tmp/trans3rdPhase.$$
mkdir $tmp
cp $1 $tmp
pushd $tmp
head -n1 $theFile > $table.sig.track.txt
tail +2 $theFile > tmp; mv tmp $theFile
wigEncode $theFile $table.wig $table.wib
popd
cp $tmp/${table}.* $theDir
rm -rf $tmp
EOF
chmod +x processWig.sh
cat > gsub << "EOF"
#LOOP
./processWig.sh {check in line+ $(path1)} {check out exists $(dir1)/$(root1).track.txt}
#ENDLOOP
EOF
find . -name '*.sig.wig' > wig.lst
gensub2 wig.lst single gsub spec
ssh pk
cd /san/sanVol1/scratch/andy/transcriptome
para create spec
para push
exit
cd /cluster/data/hg17/bed
mkdir transcriptome3rdPhase/{wig,wib,bed}
cd transcriptome3rdPhase/wib/
cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wib .
pushd /gbdb/hg17/wib
ln -s /cluster/data/hg17/bed/transcriptome3rdPhase/wib/* .
popd
cd ../wig/
cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wig .
for f in *; do
hgLoadWiggle hg17 ${f%.wig} $f
done
cd ../bed
for f in /san/sanVol1/scratch/andy/transcriptome/transfrags/human_{long,short}_rna/*; do
newName=`basename $f`;
newName=${newName%.bz2};
bzcat $f | tail +2 > $newName;
tab=affyTxnPhase3Frags${newName%.bed};
hgLoadBed hg17 $tab $newName;
done
#######################################################################
# CLEANUP OF DANRER1 BLASTZ SWAP (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
rm -r run1
cd ../mafNet.new
gzip *.maf
# we don't tend to keep the Blastz PSLs anymore and this is an old
# zebrafish assembly so remove these.
cd ../
rm -r pslChrom
# this removed 1.2 G of data.
#######################################################################
# CLEANUP OF ACEMBLY_050217 DATA (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/store5/gs.18/build35/bed/acembly_050217
rm GeneCheck.out GeneCheck2 acembly acembly.chk acembly.details \
chrom.out genePred.tab hg16.name hg16Pep.name
cd acembly_gene_lists
rm test transcripts.names *.bak main_gene.list.IDsort mp.ids mp.sort ptest \
maintest gid.tab gid.tab.sort genesGffs.ids genesGffs.ids.uniq
cd ../
# remove fasta files as included in gzipped tar file
rm -r acembly.ncbi_35.genes.proteins.fasta
cd acembly.ncbi_35.genes.gff
gzip *.gff
#######################################################################
# CLEANUP OF DANRER2 BLASTZ SWAP (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/store5/gs.18/build35/bed/blastz.danRer2.2004-12-08
# remove old axtChrom directory
rm -r axtChrom.orig
cd axtChain
# chain directories can be recreated from all.chain files so remove
rm -r chain chainAR
# gzip net files
gzip net/*.net
# gzip .over files
gzip over/*.over
# removed ~1.3 G data
#############################################################################
# Duke DNaseI HS (2007-06-26 kate)
#
# Submitted by Terry Furey <terry.furey@duke.edu>
# in collaboration with Greg Crawford
# Resubmitted 9/26/07 from FTP site
# Resubmitted 10/25/07 from FTP site
ssh kkstore02
cd /cluster/data/hg17/bed
# download 19GB archive from Duke site, password protected,
# user=ucsc, password=dnase
mkdir -p dukeDnase/2007-10-25/lab
cd dukeDnase/2007-10-25/lab
sftp ucsc@sftp.igsp.duke.edu
mget *
# dukeDnaseHsCd4.bed
# dukeDnaseHsCd4Wiggle.tgz
# unpack and load wiggle (signal) data
nice tar xvfz dukeDnaseHsCd4Wiggle.tgz
# packaged as chr*_dukeDnaseHsCd4Wiggle.out
# fixedStep 1 files
# create wiggle and load into database
cd ..
cat lab/chr*.out | nice wigEncode stdin \
dukeDnaseCd4Signal.wig dukeDnaseCd4Signal.wib >&! wigencode.log &
# upper limit 25.74, lower limit -0.66
ssh hgwdev
cd /cluster/data/hg17/bed/dukeDnase/2007-10-25
rm -f /gbdb/hg17/wib/dukeDnaseCd4Signal.wib
ln -s /cluster/data/hg17/bed/dukeDnase/2007-10-25/dukeDnaseCd4Signal.wib \
/gbdb/hg17/wib
nice hgLoadWiggle hg17 dukeDnaseCd4Signal -pathPrefix=/gbdb/hg17/wib \
dukeDnaseCd4Signal.wig
# load bed file (sites)
ssh hgwdev
cd /cluster/data/hg17/bed/dukeDnase/2007-10-25/
set table = dukeDnaseCd4Sites
sed "s/bed5FloatScore/$table/" ~/kent/src/hg/lib/bed5FloatScore.sql > \
$table.sql
hgsql hg17 -e "DROP TABLE IF EXISTS $table"
hgsql hg17 < $table.sql
hgLoadBed -sqlTable=$table.sql hg17 $table lab/dukeDnaseHsCd4.bed
# Loaded 95723 elements of size 6
# min value: 0.000103164
# max value: 25.7442
#textHistogram -col=5 lab/dukeDnaseHsCd4.bed -binSize=50
# 300 ******************************** 11789
# 350 ************************************************************ 22253
# 400 ********************************************* 16854
# 450 ********************************* 12333
# 500 ************************* 9179
# 550 ********************* 7870
# 600 ************* 4987
# 650 ********* 3271
# 700 ******** 2789
# 750 ****** 2153
# 800 **** 1303
# 850 ** 567
# 900 * 219
# 950 85
# 1000 71
###########################################################################
# Stanford ChIP-seq (Apr - July 2007, Heather)
# Submitted 2007-03-14 by David Johnson <seasquirtdoctor@gmail.com>
# 25bp tags (Solexa sequencing of IP fragments)
# genome-wide, but funded by ENCODE, hence the location of the data
ssh hgwdev
cd /cluster/data/encode/stanford
mkdir -p 2007-03-14/lab
cd 2007-03-14/lab
sort NRSF_chipseq_hg17.bed > data.bed
sort NRSF_chipseq_control_hg17.bed > control.bed
fix.pl < data.bed > fix.bed
fix.pl < control.bed > control_fix.bed
hgLoadBed hg17 stanfordNRSFEnriched fix.bed -tab
hgLoadBed hg17 stanfordNRSFControl control_fix.bed -tab
############################################################################
# Stanford ChIP/chip
# Submitted 2007-07-11 by David Johnson (seasquirtdoctor@gmail.com)
# Replaces submission from 2007-03-23
# 12 subtracks
# genome-wide, but funded by ENCODE, hence the location of the data
ssh hgwdev
cd /cluster/data/encode/stanford/2007-07-11/lab
# Dave gave us bed 5, we need bed 4
./shrink.sh
./load.sh
#########################################################################
# REGULATORY POTENTIAL 7X UPDATED (DONE - 2007-08-01 - Hiram)
# download data from "James Taylor" <james@bx.psu.edu>
ssh kkstore02
mkdir /cluster/data/hg17/bed/regPotential7X.update
cd /cluster/data/hg17/bed/regPotential7X.update
## In theory, only chr4, chr8, chr9 and chrY have been updated; fetch them
## all and verify with ../regPotential7X
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
echo "DONE - chr${C}.scores.bz2"
done
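# a possible check that only those chroms actually changed, assuming the
# previous download still sits in ../regPotential7X (a sketch; the actual
# comparison used was not recorded):
for F in chr*.scores.truncated.bz2
do
    cmp -s ${F} ../regPotential7X/${F} && echo "${F} unchanged" \
        || echo "${F} updated"
done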
# create download gzip files from the bz2 files:
time for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg17.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz
echo "done"
done
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
zcat chr${C}.regPotential7X.hg17.gz
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit -0.00
# real 16m51.215s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential7X.update
mkdir /gbdb/hg17/wib/061116
ln -s /cluster/data/hg17/bed/regPotential7X.update/regPotential7X.wib \
/gbdb/hg17/wib/061116/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
-pathPrefix=/gbdb/hg17/wib/061116 hg17 regPotential7X regPotential7X.wig
# real 0m40.523s
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg17/bed/regPotential7X.update
time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \
-hBinCount=100 -hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
# real 3m3.829s
# 73 % of the data values are zero
# renaming file directory -- kuhn 08-17-2007
cd /gbdb/hg17/wib
mv 061116 regPot061116
hgsql -e " update regPotential7X SET file = \
/gbdb/hg17/wib/regPot061116/regPotential7X.wib" hg17
Query OK, 2366123 rows affected (31.46 sec)
Rows matched: 2366123 Changed: 2366123 Warnings: 0
###########################################################################
## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram)
ssh kkstore02
cd /cluster/data/hg17/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 \
hg17 /cluster/data/hg17/hg17.2bit 2> /dev/null \
| gzip > hg17.gc5Base.txt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
cd /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
ln -s /cluster/data/hg17/bed/gc5Base/hg17.gc5Base.txt.gz .
############################################################################
# INDEL-BASED CONSERVATION TRACK (DONE, 2007-10-02 - 2007-10-03, hartera)
# Data from Gerton Lunter (gerton.lunter@anat.ox.ac.uk), MRC
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
# Lunter G, Ponting CP and Hein J. Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
ssh kkstore02
mkdir -p /cluster/data/hg17/bed/consIndels/data
cd /cluster/data/hg17/bed/consIndels/
# Add a README.indels with the e-mail from Gerton Lunter, copied over
# from the hg18 consIndels directory
cp /cluster/data/hg18/bed/consIndels/README.indels .
# get the data
cd data
wget --timestamping \
http://wwwfgu.anat.ox.ac.uk/~gerton/IPS/IPSs.zip
# A 15 Mb zip file of GFF files. This contains data for hg17
# comparing it to mm5 (NCBI Build 33) and
# canFam1 (Broad Institute, July 2004). The chr*.mm5.GFF data is old
# data that can be removed.
unzip IPSs.zip
cd /cluster/data/hg17/bed/consIndels
rm ./data/*mm5.GFF
foreach f (./data/*.GFF)
set r = $f:r
echo $r
grep -v "track" $f > ${r}NoHeader.gff
end
# strip off the end of the name e.g. IGS0001:p=.26
# so that the name displayed is short, e.g. IGS0001.1. The score field
# is used to determine colouring, and this is calculated from the FDR.
ssh kkstore02
cd /cluster/data/hg17/bed/consIndels
perl -pi.bak -e \
's/(IGS[0-9a-z]+\.?[0-9XY]*):p=?<?\.[0-9]+/$1/' \
./data/chr*NoHeader.gff
# check this looks ok then clean up
rm *.bak
# makes sense to store this as a BED5 table in order to use the score
# for display.
foreach f (./data/*NoHeader.gff)
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$4,$5,$9,$6}' $f \
>> consIndelsHg17Mm5CanFam1.bed
end
# load data
ssh hgwdev
cd /cluster/data/hg17/bed/consIndels
hgLoadBed hg17 consIndelsHg17Mm5CanFam1 consIndelsHg17Mm5CanFam1.bed
# Loaded 593298 elements of size 5
# Get the IDs, posterior probabilities (p) for the segment being neutral,
# and the FDR from the original GFFs for a separate table. Some items
# have p<.001. Cannot do Table Browser queries restricting
# p to <, =, or > a specified value unless all values are floats.
# Contacted the data contributor, Gerton Lunter, and he said it would be
# ok to change all p<.001 to p=0.0005
ssh kkstore02
cd /cluster/data/hg17/bed/consIndels/
awk '{if ($1 !~ /random/) print $1;}' /cluster/data/hg17/chrom.sizes \
| sed -e 's/chr//' | sort -n > chrom.lst
grep -v 'hap' chrom.lst > tmp2
tail +4 tmp2 > tmp3
echo "X" >> tmp3
echo "Y" >> tmp3
mv tmp3 chrom.lst
rm tmp2
# chrom.lst has a list of chroms 1-22, then X and Y
foreach c (`cat chrom.lst`)
echo $c
foreach f (./data/chr${c}.GFF)
echo $f
awk 'BEGIN {FS="\t"} {OFS="\t"}{if ($9 ~ /IGS/) print $9,$6;}' $f \
| sed -e 's/:/\t/' \
| sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
>> consIndelsConf.txt
end
end
# Add the FDR.
# For this set, there is no false discovery rate (FDR) field but it
# can be related to the score. If score is 999 then FDR is 1% (0.01) and
# if score is 500 then FDR is 10% (0.10). Score is in column 6.
# there are no GFF files for the haplotype chroms
awk 'BEGIN {FS="\t"} {OFS="\t"} \
    {if ($3 ~ /500/) print $1, $2, "0.10"; \
     else if ($3 ~ /999/) print $1, $2, "0.01";}' \
    consIndelsConf.txt > consIndelsHg17Mm5CanFam1Conf.txt
# Create a table definition for the table of identifier, posterior
# probability and false discovery rate (FDR). Already created for hg18
# track (see hg18.txt). It is $HOME/kent/src/hg/lib/itemConf.as.
ssh hgwdev
cd /cluster/data/hg17/bed/consIndels
hgLoadSqlTab hg17 consIndelsHg17Mm5CanFam1Conf \
$HOME/kent/src/hg/lib/itemConf.sql \
consIndelsHg17Mm5CanFam1Conf.txt
# check that all items are in this table.
hgsql -N -e 'select distinct(name) from consIndelsHg17Mm5CanFam1;' hg17 \
| sort > consIndels.names.sort
hgsql -N -e 'select distinct(id) from consIndelsHg17Mm5CanFam1Conf;' hg17 \
| sort > consIndels.idsfromConf.sort
wc -l *.sort
# 593298 consIndels.idsfromConf.sort
# 593298 consIndels.names.sort
comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
# 593298
# so all element IDs are in both tables.
# cleanup
rm ./data/*.bak *.sort
# add trackDb/human/hg17/trackDb.ra entry and add description that
# was written by the data contributor. Add code to hgc.c to display
# the posterior probability and the FDR on the details page for
# track elements. Gerton Lunter provided a description for the data
# on 2007-09-12.
cd ~/kent/src/hg/makeDb/trackDb/human/hg17
cp ../hg18/consIndelsHg18Mm8CanFam2.html consIndelsHg17Mm5CanFam1.html
# check this is correct and add trackDb.ra track entry and search.
##############################################################
# NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt)
#
# See hg18.txt for details.
#############################################################
#############################################################
# CCC Genome Graphs (DONE 2007-Sept Andy)
#
# See hg18 make doc.
###############################################################
# Affy Transcriptome Phase 3 chrY fix (DONE 2007-12-10, Andy)
ssh kkstore05
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | head -n256994656 | gzip -c >tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | head -n256994656 | gzip -c > tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaTopStrand.wig.gz
ssh kolossus
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
wigEncode sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz affyTxnPhase3HeLaBottomStrand.{wig,wib}
wigEncode sRNA.affyTxnPhase3HeLaTopStrand.wig.gz affyTxnPhase3HeLaTopStrand.{wig,wib}
mv *.wig /cluster/data/hg17/bed/affyTxnPhase3/wig/
mv *.wib /cluster/data/hg17/bed/affyTxnPhase3/wib/
ssh hgwdev
cd /cluster/data/hg17/bed/affyTxnPhase3/wig
hgLoadWiggle hg17 affyTxnPhase3HeLaTopStrand{,.wig}
hgLoadWiggle hg17 affyTxnPhase3HeLaBottomStrand{,.wig}
###########################################################################
# Reload CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=hg17
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
# << emacs
############################################################################
# ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan)
# See hg18.txt for details.
############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=hg17
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
############################################################################
# CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan)
# See the HuGE section in hg18.txt for details.
############################################################################
############################################################################
-# DGV V6 (DATABASE OF GENOMIC VARIANTS) (DONE 2/23/09 angie)
+# DGV V7 (DATABASE OF GENOMIC VARIANTS) (DONE 3/11/09 angie)
+# DGV V6 thin regions dropped 2/23/09
# DGV V6 with useless thin regions done 11/12/08
# DGV V5 done 8/11/08
# DGV V4 done 5/9/08
ssh hgwdev
- mkdir /cluster/data/hg17/bed/dgv.v6
- cd /cluster/data/hg17/bed/dgv.v6
+ mkdir /hive/data/genomes/hg17/bed/dgv.v7
+ cd /hive/data/genomes/hg17/bed/dgv.v7
wget --timestamping \
- http://projects.tcag.ca/variation/downloads/variation.hg17.v6.nov.2008.txt
+ http://projects.tcag.ca/variation/downloads/variation.hg17.v7.mar.2009.txt
wget --timestamping \
- http://projects.tcag.ca/variation/downloads/indel.hg17.v6.nov.2008.txt
+ http://projects.tcag.ca/variation/downloads/indel.hg17.v7.mar.2009.txt
# shuffle fields into bed8+ (input has one start coord==0, but min
# nonzero size of 99 not 100 implies most coords are 1-based):
- foreach f (*.v6.*.txt)
+ foreach f (*.v7.*.txt)
tail -n +2 $f \
| perl -wpe 'chomp; \
($id, $landmark, $chr, $start, $end, $varType, \
undef, undef, undef, $ref, $pmid, $method, \
undef, undef, undef, undef, $sample) = split("\t"); \
$id =~ s/^Variation_//; \
$start-- unless ($start == 0); \
$landmark = "" if ($landmark =~ /^chr.*\d\.\.\d/); \
$rgb = "255,128,0"; \
$rgb = "200,0,0" if ($varType =~ /^Inv/); \
$rgb = "0,100,0" if ($varType eq "InDel"); \
$_ = join("\t", $chr, $start, $end, $id, 0, "+", \
$start, $end, $rgb, $landmark, $varType, \
$ref, $pmid, $method, $sample) . "\n";' \
> $f:r.bed
end
- hgsql hg17 -e 'rename table dgv to dgvV5'
+ hgsql hg17 -e 'rename table dgv to dgvV6'
hgLoadBed hg17 dgv *.bed \
-onServer -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -tab
-#Loaded 17479 elements of size 15
+#Loaded 17473 elements of size 15
############################################################################
# KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 6/10/08 angie)
# 8/11/08: Added kiddEichlerToNcbi (ID xref table).
ssh kkstore02
mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant
cd /cluster/data/hg17/bed/kiddEichlerDiscordant
wget --user=uuuu --password=ppppppp \
http://eichlerlab.gs.washington.edu/kiddj/downloads/fosmids.hg17.tgz
tar xvzf fosmids.hg17.tgz
cd bd35
# 8 clone-end linkedFeaturesSeries tracks and one bed custom track.
# bed has illegal coords (maybe for unplaced ends?).
# Load the tracks (translate bacEndPairs format to bed12):
ssh hgwdev
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/bd35
foreach f (abc*.txt)
set track = `echo $f:r \
| perl -wpe 's/^(G|abc)(\d+)discordant/kiddEichlerDisc\u$1$2/ || die;'`
if ($status != 0) break
perl -wpe 'next if s/^#.*\n$//; \
($c, $s, $e, $n, $sc, $st, undef, $bs, $bSt, $bSz)=split; \
@bSts = split(",", $bSt); @bSzs = split(",", $bSz); \
$s--; \
if ($n =~ /transchr/) { \
$bs = 1; \
$#bSts = 0; $#bSzs = 0; \
$bSts[0]--; $e--; \
$bSts[0] -= $s; \
} elsif ($n =~ /OEA/) { \
$bSts[0]--; \
die "bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \
$bE = $bSts[0] + $bSzs[0]; \
die "bE $bE != e $e\n" if ($bE != $e); \
$bSts[0] -= $s; \
} elsif ($bs == 2) { \
$bSts[0]--; $bSts[1]--; \
if ($bSts[0] > $bSts[1]) { \
# warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
$tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \
$tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \
} \
if ($bSts[0] != $s) { \
# warn "Tweaking $n start from $s to $bSts[0]\n"; \
$s = $bSts[0]; \
} \
$bE0 = $bSts[0] + $bSzs[0]; \
$bE1 = $bSts[1] + $bSzs[1]; \
$bE = $bE0 > $bE1 ? $bE0 : $bE1; \
if ($bE != $e) { \
# warn "Tweaking $n end from $e to $bE\n"; \
$e = $bE; \
} \
$bSts[0] -= $s; $bSts[1] -= $s; \
} else { die "#blks is $bs for $n\n"; } \
$bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \
$rgb = ($n =~ /deletion/) ? "224,0,0" : \
($n =~ /insertion/) ? "0,0,224" : \
($n =~ /inversion/) ? "0,224,0" : \
($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \
$_ = join("\t", $c, $s, $e, $n, $sc, $st, $s, $e, $rgb, \
$bs, $bSz, $bSt) . "\n";' $f \
| hgLoadBed -tab hg17 $track stdin
end
perl -pe 'next if s/^track .*\n$//; \
($c, $s, $e, $n, $sc, $st, $tS, $tE, $r, $bs, $bSz, $bSt) = split; \
@bSts = split(",", $bSt); @bSzs = split(",", $bSz); \
if ($n =~ /transchr/) { \
$bs = 1; \
$#bSts = 0; $#bSzs = 0; \
} elsif ($n =~ /OEA/) { \
$s--; # weird that this is required only for OEA here \
die "$n: bSts[0] $bSts[0] != 0\n" if ($bSts[0] != 0); \
$bE = $s + $bSts[0] + $bSzs[0]; \
die "$n: bE $bE != e $e\n" if ($bE != $e); \
} elsif ($bs == 2) { \
$bSts[0] += $s; $bSts[1] += $s; \
if ($bSts[0] > $bSts[1]) { \
# warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
$tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \
$tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \
} \
if ($bSts[0] != $s) { \
# warn "Tweaking $n start from $s to $bSts[0]\n"; \
$s = $bSts[0]; \
} \
$bE0 = $bSts[0] + $bSzs[0]; \
$bE1 = $bSts[1] + $bSzs[1]; \
$bE = $bE0 > $bE1 ? $bE0 : $bE1; \
if ($bE != $e) { \
# warn "Tweaking $n end from $e to $bE\n"; \
$e = $bE; \
} \
$bSts[0] -= $s; $bSts[1] -= $s; \
} else { die "#blks is $bs\n"; } \
$bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \
$tS = $s; $tE = $e; \
$rgb = ($n =~ /deletion/) ? "224,0,0" : \
($n =~ /insertion/) ? "0,0,224" : \
($n =~ /inversion/) ? "0,224,0" : \
($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \
$_ = join("\t", $c, $s, $e, $n, $sc, $st, $tS, $tE, $rgb, \
$bs, $bSz, $bSt) . "\n";' G248discordant.txt \
| hgLoadBed -tab hg17 kiddEichlerDiscG248 \
stdin
# 8/11/08: get clone ID -> NCBI acc mapping.
ssh kkstore02
mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
# Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions;
# get trace archive trace names for end reads:
foreach n (7 9 10 11 12 13 14)
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz
end
# ABC8 has _a and _b files:
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz
# That file is not available for G248.
gunzip *.gz
# Combine the relevant data from the .conversion files; keep only those
# IDs that are used in the tracks.
cut -f 4 ../bd35/*discordant.txt \
| egrep -v '^(#chrom|track|name)' \
| sed -e 's/,.*//' \
| sort -u > discIds.txt
perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \
s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \
warn "Parse line $.:\n$_";' \
*.conversion \
| sort > allEnds.tab
grep -wFf discIds.txt allEnds.tab > discEnds.txt
wc -l discIds.txt allEnds.tab discEnds.txt
# 223051 discIds.txt
# 17498527 allEnds.tab
# 573974 discEnds.txt
# discEnds.txt has 2 lines (forward & reverse) for most of its ids...
# ideally we would see 2*(223051) lines in discEnds.txt.
# Get a list of which discordant clone IDs don't have ends in *.conv*:
cut -f 1 allEnds.tab | uniq > all.tmp
comm -23 discIds.txt all.tmp > discNotInConv.txt
wc -l discNotInConv.txt
#16318 discNotInConv.txt
cat > combine.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
my ($cloneFile, $endsFile) = @ARGV;
open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n";
my %idInfo;
while(<CLONES>) {
(s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) || \
m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_";
my ($id, $acc) = ($1, $2);
$idInfo{$id}->[0] = $acc;
}
close(CLONES);
open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n";
while (<ENDS>) {
chomp; my ($id, $dir, $traceName) = split("\t");
if ($dir =~ /^F/) {
$idInfo{$id}->[1] = $traceName;
} elsif ($dir =~ /^R/) {
$idInfo{$id}->[2] = $traceName;
} else { die "What is this \$dir: $dir ?\n"; }
}
close(ENDS);
foreach my $id (sort keys %idInfo) {
my $infoRef = $idInfo{$id};
$infoRef->[0] = '' if (! defined $infoRef->[0]);
$infoRef->[1] = 0 if (! defined $infoRef->[1]);
$infoRef->[2] = 0 if (! defined $infoRef->[2]);
print join("\t", $id, @{$infoRef}) . "\n";
}
'_EOF_'
# << emacs
chmod a+x combine.pl
combine.pl clones_used_3nov.txt.accessions discEnds.txt \
| sort > kiddEichlerToNcbi.txt
# Load table:
ssh hgwdev
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
hgLoadSqlTab hg17 kiddEichlerToNcbi \
$HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt
# Add to makeDb/schema/all.joiner, then check:
runJoiner.csh hg17 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
# Vertebrate-wide transMap alignments were built; tracks are created and
# loaded by a single Makefile, available from:
# svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
# See doc/builds.txt for specific details.
############################################################################
############################################################################
# KIDD/EICHLER VALIDATED SITES (DONE 6/11/08 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/kiddEichlerValid
cd /cluster/data/hg17/bed/kiddEichlerValid
wget http://hgsv.washington.edu/general/download/validated_sites/Kidd_2008_sample_level_valided_sites.xls
# Open in Excel, save as Kidd_2008_sample_level_valided_sites.txt,
# move first 9 lines to Kidd_2008_sample_level_valided_sites.header.
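# The header split can be scripted like this (a sketch; this step was
# actually done by hand in a text editor):
head -9 Kidd_2008_sample_level_valided_sites.txt \
    > Kidd_2008_sample_level_valided_sites.header
tail +10 Kidd_2008_sample_level_valided_sites.txt > tmp.txt
mv tmp.txt Kidd_2008_sample_level_valided_sites.txt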
# Split into one file per individual:
foreach id (Abc7 Abc8 Abc9 Abc10 Abc11 Abc12 Abc13 Abc14 G248)
set ID = `echo $id | tr 'a-z' 'A-Z'`
grep ${ID}_ Kidd_2008_sample_level_valided_sites.txt \
| perl -wpe 'chomp; s/\r//; ($c, $s, $e, $n, $t) = split; \
$rgb = ($n =~ /deletion/) ? "224,0,0" : \
($n =~ /insertion/) ? "0,0,224" : \
($n =~ /inversion/) ? "0,224,0" : "0,0,0"; \
$t =~ s/:/,/g; \
$n =~ s/^'$ID'_//; $n = "$n,$t"; \
$_ = join("\t", $c, $s, $e, $n, "0", "+", $s, $e, $rgb) . \
"\n";' \
| hgLoadBed -tab hg17 kiddEichlerValid$id stdin
end
################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg17
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg17
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf:
hg17.upstreamGeneTbl = refGene
hg17.upstreamMaf = multiz17way /hive/data/genomes/hg17/bed/multiz17way/species.lst
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/mrnaPcr
cd /cluster/data/hg17/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
genePredToBed /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp > ucscGenes.bed
hgsql hg17 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=hg17 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp \
| genePredToFakePsl hg17 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb::
cd /cluster/data/hg17/bed/mrnaPcr
hgLoadPsl hg17 kgTargetAli.psl
mkdir /gbdb/hg17/targetDb
ln -s /cluster/data/hg17/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg17/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg17/targetDb/kgTargetSeq.2bit .
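# The request amounts to something like this on the blat server (a
# sketch of what cluster-admin runs; the exact invocation may differ):
# gfServer start blat13 17797 -stepSize=5 \
#     /gbdb/hg17/targetDb/kgTargetSeq.2bit &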
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg17Kg", "blat13", 17797, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg17Kg", "UCSC Genes", \
"hg17", "kgTargetAli", "", "", \
"/gbdb/hg17/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram)
mkdir /hive/data/genomes/hg17/bed/fox2ClipSeq
cd /hive/data/genomes/hg17/bed/fox2ClipSeq
# fetch data
wget --timestamping \
'http://www.snl.salk.edu/~geneyeo/stuff/FOX2.rmsk.BED.gz' \
-O FOX2.rmsk.BED.gz
# remove track line and sort
zcat FOX2.rmsk.BED.gz | grep -v "^track" | sort -k1,1 -k2,2n \
| gzip > sorted.bed.gz
# separate strand data, and turn the positive into blue
zcat sorted.bed.gz | awk '$6 == "+"' | sed -e "s/255,0,0/0,0,255/" \
| gzip > forwardStrand.bed.gz
zcat sorted.bed.gz | awk '$6 == "-"' | gzip > reverseStrand.bed.gz
# turn into wiggle density plot
zcat forwardStrand.bed.gz | bedItemOverlapCount hg17 stdin \
| wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \
fox2ClipSeqDensityForwardStrand.wib
# Converted stdin, upper limit 2401.00, lower limit 1.00
zcat reverseStrand.bed.gz | bedItemOverlapCount hg17 stdin \
| wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \
fox2ClipSeqDensityReverseStrand.wib
# Converted stdin, upper limit 1406.00, lower limit 1.00
# and load tables
zcat forwardStrand.bed.gz reverseStrand.bed.gz \
| hgLoadBed hg17 fox2ClipSeq stdin
# Loaded 4418298 elements of size 9
ln -s `pwd`/*.wib /gbdb/hg17/wib
hgLoadWiggle hg17 fox2ClipSeqDensityForwardStrand \
fox2ClipSeqDensityForwardStrand.wig
hgLoadWiggle hg17 fox2ClipSeqDensityReverseStrand \
fox2ClipSeqDensityReverseStrand.wig
# add composite track definitions to makeDb/trackDb/human/trackDb.ra
#############################################################################