src/hg/makeDb/doc/rheMac2.txt 1.20
1.20 2010/02/12 23:51:36 hiram
last runs to Marmoset calJac3
Index: src/hg/makeDb/doc/rheMac2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/rheMac2.txt,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 1000000 -r1.19 -r1.20
--- src/hg/makeDb/doc/rheMac2.txt 25 Nov 2009 21:48:42 -0000 1.19
+++ src/hg/makeDb/doc/rheMac2.txt 12 Feb 2010 23:51:36 -0000 1.20
@@ -1,1613 +1,1674 @@
# for emacs: -*- mode: sh; -*-
# Macaca Mulatta (rheMac2)
# Venter Institute final split merge assembly using washU and baylor
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | nscanGene | genePred nscanPep |
# +-------------+---------------------------------+
# DOWNLOAD SEQUENCE (DONE jan27 2006 Robert)
ssh kkstore01
mkdir /cluster/store2/rheMac2
mkdir /cluster/store2/rheMac2/final
cd /cluster/data
ln -s /cluster/store2/rheMac2/final rheMac2
cd /cluster/data/rheMac2
mkdir downloads
cd downloads
The macaque genome assembly (v. 1.0, Mmul051212) has been posted to the macaque genome ftp site:
ftp://macftp@ftp.macaquegenome.hgsc.bcm.tmc.edu/
The pw is analysis .
mkdir final
cd final
mkdir download
cd download
# A. The assembly itself is represented in three groups, 1) the main assembly, 2) the Chr#_random and ChrU set, and 3) the ChrUr set (a late addition to the ChrU set).
#AGP and Fasta chromosome files for these three sets are:
# 1) Main Assembly - these are scaffolds, not chromosomes.
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1-edit4.trimNoContam.agp.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.scf.fasta.bz2
# 2) Mapped to chromosomes but unplaced Chr#_random, and unmapped contigs ChrU
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.ctgs.agp.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.scfs.agp.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.random.chrm.fasta.bz2
# 3) Late addition of 353 contigs to ChrU
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.chrm.fasta.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.ctgs.agp.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.scfs.agp.bz2
# B. The contigs fasta and quality files that go into these are in two sets, the bulk of the files (those in parts 1 and 2 of the assembly above) and the few that were added back at the last minute (part 3 above).
# 1)
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.fasta.bz2
# 2)
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.qv.bz2
# 3) The 353 contigs that were added back at the last minute are included in the following files (other contigs in these files are not included in the assembly)
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.fasta.bz2
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.qv.bz2
# C. The final assembly file in the VI internal assembly format is:
wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.asm.bz2
#D. The chromosome files are below, note, all gaps are listed as "fragment yes" in the agp files, although some are spanned by clones and others are not (making use of human synteny or macaque map information instead). These files will be corrected to list the uncaptured gaps as "clone no", but the intervals and other information will not change, so use these for other information now.
wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.fasta.corr.bz2
wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.ctgs.agp.corr.bz2
wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.scfs.agp.corr.bz2
bunzip2 *.bz2
#wrap extra long lines in fasta
fold v1_edit4.random.chrm.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2_random.fa
fold v1.edit4.chrm.fasta.corr |sed -e 's/ChrUr/chrUr/' >rheMac2.fa
fold v1_edit4.scf.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2.scf.fa
mkdir ../Ur
fold v1.edit4.ChrUr.chrm.fasta |sed -e 's/ChrUr/chrUr/' > ../Ur/chrUr.fa
mv v1.edit4.chrome.ctgs.agp.corr v1.edit4.chrome.ctgs.corr.agp
mv v1.edit4.chrome.scfs.agp.corr v1.edit4.chrome.scfs.corr.agp
#*** NOTE FOR NEXT TIME *** use makeGenomeDb.pl instead of the following
# several sections:
# Check that IDs in .fa match IDs in .qual
ssh kkstore01
cd /cluster/data/rheMac2/downloads
# foreach Bin0 contigs linearScaffolds reads
grep ">" *.fasta > id.fa
grep ">" *.qv > id.qual
diff id.fa id.qual
# change chromosomes to lower case
sed -e 's/Chr/chr/' v1.edit4.chrome.ctgs.agp.final > v1.edit4.chrome.ctgs.final.fix2.agp
cat v1.edit4.chrome.scfs.agp.final |sed -e 's/Chr/chr/' > v1.edit4.final.fix2.agp
hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.final.fix2.agp
/cluster/bin/i386/checkAgpAndFa v1.edit4.final.fix.agp rheMac2.fa > check.out
# SPLIT UP chromosome and make 500k chunks for RM
ssh kkstore02
cd /cluster/data/rheMac2
faSplit byname downloads/rheMac2.fa chrom
for i in `awk '{print $1}' chrom.sizes` ; do echo $i ; mkdir -p ${i##chr} ; mv $i.fa ${i##chr} ; pushd ${i##chr} ; faSplit -lift=$i.lft size $i.fa 500000 split ; popd; done
# generate list
find scaffolds -print > scaffolds.list.0
grep fa scaffolds.list.0 > scaffolds.list
#### MAKE 2BIT NIB FILE
ssh kkstore01
nice /cluster/bin/i386/faToTwoBit rheMac2.fa rheMac2.2bit
nice /cluster/bin/i386/twoBitToFa rheMac2.2bit check.fa
mv rheMac2.2bit ..
# could also use cmp
# note: size match
faSize -detailed=on rheMac2.fa >mac.len
faSize -detailed=on check.fa >check.len
diff mac.len check.len
wc -l *.len
ssh hgwdev
mkdir /gbdb/rheMac2
ln -s /cluster/data/rheMac2/rheMac2.2bit /gbdb/rheMac2/rheMac2.2bit
#### CREATE DATABASE (DONE Jan 27 , 2006 Robert)
ssh hgwdev
hgsql hg17
create database rheMac2;
use rheMac2;
create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;
# add rows to hgcentraltest on hgwbeta in dbDb and defaultDb
# set orderKey past dog, before chimp
# LOAD GAP & GOLD TABLES FROM AGP
ssh hgwdev
hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.chrome.scfs.final.fix.agp
# CREATE GAP TABLE from v1.edit4.chrome.ctgs.final.fix.agp to get gaps
ssh hgwdev
cd kent/src/hg/lib
# remove bin column for now
hgsql rheMac2 < gap2.sql
cd /cluster/data/rheMac2/downloads
grep fragment v1.edit4.chrome.ctgs.final.fix.agp > fragment.txt
grep clone v1.edit4.chrome.ctgs.final.fix.agp >> fragment.txt
hgsql rheMac2
load data local infile 'fragment.txt' into table gap
# create trackDb/monkey/rheMac2/gap.html
# CREATE CONTIG TABLE (DONE Feb 15,2006 Robert)
ssh hgwdev
cd kent/src/hg/lib
cp ~/kent/src/hg/lib/ctgPos.sql .
# edit ctgPos.sql to increase size of primary key to 20
# this doesn't include strand or contig start/end
hgsql rheMac2 < ctgPos.sql
cd /cluster/data/rheMac2/downloads
grep -v clone v1.edit4.chrome.ctgs.final.fix.agp |grep -v fragment > Contig.out
ssh kkstore02
cd /cluster/data/rheMac2/downloads
getContig.pl < Contig.out > ctgPos
ssh hgwdev
load data local infile 'ctgPos' into table ctgPos
#convert to half open and fix bug in agp file
hgsql rheMac2 -e "update ctgPos set chromStart = chromStart -1 "
hgsql rheMac2 -e "update ctgPos set size = size -1 "
# CREATE CHROMINFO TABLE
# an improvement here would be to have getMaxCoord output the 2bit file name
ssh hgwdev
cd /cluster/data/rheMac2
# get coordinates from agp and put in database table
getcoords.pl < downloads/v1.edit4.chrome.scfs.final.fix.agp > scaffolds.coords
hgsql rheMac2 < /cluster/data/bosTau1/coords.sql
load data local infile 'scaffolds.coords' into table scaffoldCoords
faSize rheMac2.fa -detailed=on > chrom.sizes
# calculate maximum coords; check that start coord is always 1
## /cluster/home/heather/bin/i386/GetMaxCoord > getMaxCoord.rheMac2
edit chromInfo.sql; allow fileName to be null
cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
hgsql rheMac2 < chromInfo.sql
load data local infile 'chrom.sizes' into table chromInfo
update chromInfo set fileName = "/gbdb/rheMac2/rheMac2.2bit"
# REPEATMASKER
# using the split scaffold fa files generated earlier
#*** NOTE FOR NEXT TIME *** use doRepeatMasker.pl instead.
# note the version of RepeatMasker in use; will want this for download README
cat /cluster/bluearc/RepeatMasker/Libraries/version
# RM database version 20060120
# do a trial run
ssh hgwdev
cd /cluster/data/rheMac2/scaffolds/0/0/0
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec macaca Contig1000.fa
# configure
cd /cluster/data/rheMac2
mkdir jkStuff
cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMRhesus
# change references anoGam1 --> rheMac2
mkdir RMRun
# /bin/csh makeJoblist-RM.csh
# do the run (DONE Jan 30, 2006 Robert)
ssh pk
cd /cluster/data/rheMac2/RMRun
para create RMJobs.2
para try
para push
# concatenate into one output file; took about 90 minutes
ssh kkstore01
cd /cluster/data/rheMac2
mkdir repeats
/bin/tcsh concatRM.csh
cd repeats
# use grep -v to get rid of all the headers
# then, add back in an initial header
# this is so hgLoadOut is happy
grep chr repeats.all > repeats.clean
cat repeats.header repeats.clean > repeats.final
grep "There were no repetitive sequences " repeats.final > repeats.notfound
grep -v "There were no repetitive sequences " repeats.final > repeat.out
ssh hgwdev
hgLoadOut rheMac2 repeat.out 2>errors
# Strange perc. fields:
Processing repeats.out
Strange perc. field -6.4 line 652513 of repeat.out
Strange perc. field -2236.0 line 749163 of repeat.out
Strange perc. field -2637.9 line 749163 of repeat.out
Strange perc. field -99.3 line 749163 of repeat.out
Strange perc. field -0.3 line 1386112 of repeat.out
Strange perc. field -0.2 line 2045133 of repeat.out
Strange perc. field -0.4 line 2429164 of repeat.out
Strange perc. field -4.8 line 2778536 of repeat.out
Strange perc. field -0.4 line 2912076 of repeat.out
Strange perc. field -5.5 line 3498109 of repeat.out
Strange perc. field -1.0 line 3642974 of repeat.out
Strange perc. field -2.7 line 3642974 of repeat.out
Strange perc. field -263.9 line 3711627 of repeat.out
Strange perc. field -187.0 line 3711627 of repeat.out
Strange perc. field -3.4 line 3795344 of repeat.out
Strange perc. field -0.1 line 3932239 of repeat.out
Strange perc. field -0.2 line 4092279 of repeat.out
Strange perc. field -0.1 line 4240026 of repeat.out
Strange perc. field -0.1 line 4440809 of repeat.out
Strange perc. field -7915.8 line 4506386 of repeat.out
Strange perc. field -0.1 line 4514494 of repeat.out
Strange perc. field -4361.2 line 4667281 of repeat.out
Strange perc. field -840.4 line 4667281 of repeat.out
Loading up table repeats_rmsk
note: 997 records dropped due to repStart > repEnd
hgsql rheMac2
rename table repeats_rmsk to rmsk
# select count(*) from rmsk;
# SIMPLE REPEATS (DONE Jan 30, 2006 Robert)
# put the results throughout the scaffold 0/0/0 directories,
# same as RepeatMasker, to avoid too many files in the same directory
# *** BUG: FILTERING WAS NOT DONE --> OVERMASKING ***
#*** NOTE FOR NEXT TIME *** use doSimpleRepeat.pl instead.
ssh kkstore01
cd /cluster/data/rheMac2
mkdir bed
cd bed
mkdir simpleRepeat
cd simpleRepeat
/bin/csh makeJoblist-trf.csh
tcsh trf-run.csh > & ! trf.log &
# concatenate into one output file; took about an hour
cat chr*.bed > simpleRepeat.bed
# load
/cluster/bin/i386/hgLoadBed rheMac2 simpleRepeat simpleRepeat.bed \
-sqlTable = /cluster/home/heather/kent/src/hg/lib/simpleRepeat.sql
#Reading trf.bed
#Loaded 628275 elements of size 16
#Sorted
#Saving bed.tab
#Loading rheMac2
create index chrom_2 on simpleRepeat (chrom(16), chromStart);
create index chrom_3 on simpleRepeat (chrom(16), chromEnd);
# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES
ssh kkstore02
cd /cluster/data/rheMac2
/cluster/bin/i386/maskOutFa -soft downloads/rheMac2.fa repeats/repeat.out rheMac2.softmask.fa
WARNING: negative rEnd: -11 chr1:17040504-17040634 MLT2B4
WARNING: negative rEnd: -15 chr1:39400221-39400370 MLT2B4
WARNING: negative rEnd: -28 chr1:75866521-75866637 MLT2B4
WARNING: negative rEnd: -104 chr1:102866722-102866763 MLT2B4
WARNING: negative rEnd: -52 chr1:132930605-132930673 MLT2B4
WARNING: negative rEnd: -686 chr1:133186227-133186372 L1MCc
WARNING: negative rEnd: -203 chr1:133186415-133186749 L1MCc
WARNING: negative rEnd: -455 chr1:171379505-171379760 L1M3e
WARNING: negative rEnd: -17 chr1:216018086-216018189 L1MCb
WARNING: negative rEnd: -198 chr1:226920721-226920777 L1M4
WARNING: negative rEnd: -131 chr1:226921141-226921200 L1M4
WARNING: negative rEnd: -3 chr1:226921979-226922094 L1M4
WARNING: negative rEnd: -335 chr1:227071405-227071650 L1MCc
WARNING: negative rEnd: -278 chr2:16216076-16216253 L1P4b
WARNING: negative rEnd: -72 chr2:18621605-18622180 L1PBa
WARNING: negative rEnd: -285 chr2:24878971-24879214 L1MCc
WARNING: negative rEnd: -419 chr2:25206834-25207017 L1MCc
WARNING: negative rEnd: -183 chr2:25207033-25207203 L1MCc
WARNING: negative rEnd: -3 chr2:48529930-48529991 MER6
WARNING: negative rEnd: -426 chr2:80341345-80341639 L1M2c
WARNING: negative rEnd: -314 chr2:82400541-82400763 L1MDa
WARNING: negative rEnd: -1015 chr2:82731009-82731348 L1ME1
WARNING: negative rEnd: -282 chr2:82731554-82731973 L1ME1
WARNING: negative rEnd: -129 chr2:113802567-113802946 L1MCc
WARNING: negative rEnd: -528 chr2:129549508-129549693 L1M3e
WARNING: negative rEnd: -437 chr2:160075806-160076108 L1M2
WARNING: negative rEnd: -21 chr3:39033062-39033110 MER6
WARNING: negative rEnd: -271 chr3:58615645-58615852 L1MEb
WARNING: negative rEnd: -202 chr3:70911642-70911963 L1M5
WARNING: negative rEnd: -33 chr3:71817441-71818111 L1M3e
WARNING: negative rEnd: -316 chr3:112797698-112798023 L1PB3
WARNING: negative rEnd: -17 chr3:113476319-113476407 FRAM
WARNING: negative rEnd: -27 chr3:116802770-116802992 L1MB7
WARNING: negative rEnd: -4 chr3:126585635-126585722 FRAM
WARNING: negative rEnd: -632 chr3:183268961-183269045 L1M3b
WARNING: negative rEnd: -1132 chr4:16324742-16324819 L1MEb
WARNING: negative rEnd: -930 chr4:38188841-38189000 L1MEd
WARNING: negative rEnd: -448 chr4:122440460-122440750 L1M3e
WARNING: negative rEnd: -39 chr5:7580689-7581026 L1MEe
WARNING: negative rEnd: -1956 chr5:27414234-27414442 L1MB8
WARNING: negative rEnd: -1682 chr5:27414453-27414671 L1MB8
WARNING: negative rEnd: -513 chr5:66093155-66093311 L1M4b
WARNING: negative rEnd: -35 chr5:97364737-97364827 L1MCa
WARNING: negative rEnd: -1077 chr5:109355971-109356130 L1MEb
WARNING: negative rEnd: -20 chr5:109357351-109357727 L1MEb
WARNING: negative rEnd: -102 chr5:119072794-119072945 L1MB4
WARNING: negative rEnd: -9 chr5:144123391-144123455 L1M3de
WARNING: negative rEnd: -59 chr5:150681576-150681827 L1M4c
WARNING: negative rEnd: -461 chr5:170348753-170349060 L1M3e
WARNING: negative rEnd: -138 chr5:170614438-170615156 L1M1
WARNING: negative rEnd: -103 chr5:180911655-180912016 L1MEd
WARNING: negative rEnd: -38 chr5:181275921-181276086 L1M4
WARNING: negative rEnd: -219 chr6:326885-327048 L1MCc
WARNING: negative rEnd: -243 chr6:18237822-18238057 L1MC
WARNING: negative rEnd: -458 chr6:24563135-24563511 L1M4b
WARNING: negative rEnd: -14 chr6:27838867-27838984 MLT2B4
WARNING: negative rEnd: -92 chr6:33082868-33082962 L1M4
WARNING: negative rEnd: -505 chr6:60951240-60951389 L1PB3
WARNING: negative rEnd: -9 chr6:76795983-76796095 MLT2B4
WARNING: negative rEnd: -1039 chr6:153171051-153171255 L1MEb
WARNING: negative rEnd: -448 chr6:156038512-156038675 L1M4b
WARNING: negative rEnd: -1797 chr6:172500974-172501240 L1M4b
WARNING: negative rEnd: -717 chr7:6530435-6530561 L1M4b
WARNING: negative rEnd: -180 chr7:6530567-6530816 L1M4b
WARNING: negative rEnd: -395 chr7:17573119-17573355 L1M4b
WARNING: negative rEnd: -713 chr7:34841140-34841243 L1PA12
WARNING: negative rEnd: -917 chr7:36602478-36602794 L1MEd
WARNING: negative rEnd: -993 chr7:49724690-49725009 L1MEc
WARNING: negative rEnd: -519 chr7:64924143-64924287 L1PA15-16
WARNING: negative rEnd: -170 chr7:64924684-64924986 L1PA15-16
WARNING: negative rEnd: -1526 chr7:130257854-130258027 L1MDa
WARNING: negative rEnd: -78 chr8:18037533-18037781 L1M3e
WARNING: negative rEnd: -643 chr8:25868692-25868877 L1MCc
WARNING: negative rEnd: -5 chr8:37458013-37458057 FRAM
WARNING: negative rEnd: -420 chr8:40430422-40430587 L1MCc
WARNING: negative rEnd: -73 chr8:53713425-53713503 MLT2B4
WARNING: negative rEnd: -38 chr8:58045714-58046896 L1MEd
WARNING: negative rEnd: -46 chr8:70060490-70060930 L1M4
WARNING: negative rEnd: -3 chr8:71535410-71535460 FRAM
WARNING: negative rEnd: -781 chr8:71840470-71840637 L1MCc
WARNING: negative rEnd: -613 chr8:71841566-71841717 L1MCc
WARNING: negative rEnd: -17 chr9:17870182-17870240 FRAM
WARNING: negative rEnd: -1077 chr9:36754551-36754681 L1MEb
WARNING: negative rEnd: -354 chr9:36754801-36755003 L1MEb
WARNING: negative rEnd: -189 chr9:53706555-53706958 L1MCc
WARNING: negative rEnd: -28 chr9:55631220-55631257 MER6
WARNING: negative rEnd: -259 chr9:83699632-83700159 L1MA4A
WARNING: negative rEnd: -412 chr10:93305328-93305380 L1MDb
WARNING: negative rEnd: -290 chr10:93305684-93305806 L1MDb
WARNING: negative rEnd: -232 chr11:93725539-93725691 L1MCc
WARNING: negative rEnd: -3 chr11:102814963-102815011 FRAM
WARNING: negative rEnd: -614 chr12:24319914-24321380 L1M4b
WARNING: negative rEnd: -557 chr12:24323027-24323094 L1M4b
WARNING: negative rEnd: -259 chr12:100959333-100959494 L1M1
WARNING: negative rEnd: -529 chr12:102350218-102350466 L1M2
WARNING: negative rEnd: -654 chr13:53814747-53815048 L1M4b
WARNING: negative rEnd: -7 chr13:58060931-58061578 L1M4c
WARNING: negative rEnd: -1125 chr13:72166501-72166711 L1ME1
WARNING: negative rEnd: -6 chr13:72540001-72540050 FRAM
WARNING: negative rEnd: -397 chr13:82523197-82523664 L1M3e
WARNING: negative rEnd: -119 chr13:120528068-120528107 MLT2B3
WARNING: negative rEnd: -1934 chr14:27100172-27100323 L1MCc
WARNING: negative rEnd: -158 chr14:27101192-27101535 L1MCc
WARNING: negative rEnd: -281 chr14:60812899-60813046 L1MDa
WARNING: negative rEnd: -376 chr15:22274805-22275047 L1M4b
WARNING: negative rEnd: -669 chr15:23733251-23733307 L1M3e
WARNING: negative rEnd: -422 chr15:24246692-24246910 L1MCc
WARNING: negative rEnd: -290 chr15:24247095-24247225 L1MCc
WARNING: negative rEnd: -252 chr15:24247564-24247617 L1MCc
WARNING: negative rEnd: -11 chr15:24247667-24247964 L1MCc
WARNING: negative rEnd: -261 chr15:77304256-77304526 L1M4c
WARNING: negative rEnd: -121 chr15:80339141-80339347 L1M4b
WARNING: negative rEnd: -488 chr15:83773572-83773877 L1MEd
WARNING: negative rEnd: -101 chr15:83773914-83774074 L1MEd
WARNING: negative rEnd: -92 chr16:40096072-40096144 MLT2B4
WARNING: negative rEnd: -76 chr17:44749449-44749518 MLT2B4
WARNING: negative rEnd: -97 chr17:85835577-85836019 L1MCc
WARNING: negative rEnd: -112 chr18:12965293-12965685 L1MCc
WARNING: negative rEnd: -879 chr18:41292895-41293389 L1MEc
WARNING: negative rEnd: -322 chr18:41294544-41295033 L1MEc
WARNING: negative rEnd: -1137 chr18:59335523-59335630 L1MEb
WARNING: negative rEnd: -102 chr19:5934741-5934887 L1M4
WARNING: negative rEnd: -6 chr19:11007926-11008075 FRAM
WARNING: negative rEnd: -24 chr19:44501149-44501305 L1M3b
WARNING: negative rEnd: -145 chr20:45957350-45957401 L1MDa
WARNING: negative rEnd: -1106 chr20:74584188-74584318 L1ME1
WARNING: negative rEnd: -348 chr20:74584451-74584806 L1ME1
WARNING: negative rEnd: -711 chr20:80285832-80285915 L1MCc
WARNING: negative rEnd: -37 chrX:4743643-4743679 MER6
WARNING: negative rEnd: -1941 chrX:48147060-48147184 L1M4b
WARNING: negative rEnd: -99 chrX:54535186-54535349 L1M4
WARNING: negative rEnd: -606 chrX:80315181-80315657 L1P4b
WARNING: negative rEnd: -65 chrX:97720783-97720963 L1M4
WARNING: negative rEnd: -24 chrX:120523759-120523982 L1ME1
WARNING: negative rEnd: -13 chrX:121193467-121193615 MLT2B4
WARNING: negative rEnd: -939 chrX:153527918-153528288 L1M4
# matches select count(*) from rmsk where repEnd < 0;
+----------+
| count(*) |
+----------+
| 131 |
+----------+
# DOH -- THIS MASKS TOO MUCH! SHOULD HAVE USED FILTERED TRF!
/cluster/bin/i386/maskOutFa -softAdd rheMac2.softmask.fa bed/simpleRepeat/simpleRepeat.bed rheMac2.softmask2.fa
# hard masking (Ns instead of lower case) for download files
# split for use by genscan
/cluster/bin/i386/maskOutFa rheMac2.softmask2.fa hard rheMac2.hardmask.fa
mkdir hardmask-split
/cluster/bin/i386/faSplit about rheMac2.hardmask.fa 2000000 hardmask-split
# DOWNLOAD FILES
# REMAKE SOME BIGZIPS DOWNLOADS IN SAME FORMAT AS FOR OTHER ASSEMBLIES
# WITH ONE FILE PER CHROMOSOME (DONE, 2006-06-12, hartera)
#*** NOTE FOR NEXT TIME *** use makeDownloads.pl instead.
ssh hgwdev
cd /cluster/data/rheMac2
cp rheMac2.softmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
cp rheMac2.softmask2.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
cp rheMac2.hardmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
cd /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
nice gzip rheMac2.softmask.fa
nice gzip rheMac2.softmask2.fa
nice gzip rheMac2.hardmask.fa
# Remake some of the bigZips downlods in the same format as for
# other assemblies with one file per chrom (2006-06-12, hartera)
ssh kkstore01
cd /cluster/data/rheMac2
# use rheMac2.softmask2.fa as they contain masking from RepeatMasker
# and from trf.
cat << '_EOF_' > jkStuff/zipAll.csh
mkdir -p bigZips
# RepeatMasker files
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromTrf.tar.gz bed/simpleRepeat/chr*.bed
# get soft masked and hard masked sequences, one per chrom
mkdir softMask hardMask
foreach c (`awk '{print$1}' /cluster/data/rheMac2/chrom.sizes`)
faOneRecord /cluster/data/rheMac2/rheMac2.softmask2.fa \
$c > softMask/${c}.fa
faOneRecord /cluster/data/rheMac2/rheMac2.hardmask.fa \
$c > hardMask/${c}.fa.masked
end
tar cvzf bigZips/chromFa.tar.gz softMask/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz hardMask/chr*.fa.masked
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/zipAll.csh
csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
# Took about 50 minutes.
rm repeats.all.gz rheMac2.hardmask.fa.gz rheMac2.softmask.fa.gz \
rheMac2.chrom.fa.gz
md5sum *.gz > md5sum.txt
# link the *.gz and *.txt files to hgwdev:/usr/local/apache/...
ssh hgwdev
set gp = /usr/local/apache/htdocs/goldenPath/rheMac2
mkdir -p $gp/bigZips
rm repeats.all.gz rheMac2.hardmask.fa.gz rheMac2.softmask.fa.gz \
rheMac2.chrom.fa.gz md5sum.txt
ln -s /cluster/data/rheMac2/bigZips/{chrom*.gz,*.txt} $gp/bigZips
# Update README.txt to give correct file names.
# REGENERATE 2BIT NIB (DONE Jan 30, 2006 Robert)
ssh kkstore02
cd /cluster/data/rheMac2
/cluster/bin/i386/faToTwoBit rheMac2.softmask2.fa rheMac2.softmask.2bit
/cluster/bin/i386/twoBitToFa rheMac2.softmask.2bit check.softmask.fa
diff -q rheMac2.softmask2.fa check.softmask.fa
mv rheMac2.2bit rheMac2.2bit.unmasked
mv rheMac2.softmask.2bit rheMac2.2bit
# note: size match
# GENERATE 2bit for blastz
mkdir -p /san/sanvol1/scratch/rheMac2/
cd /cluster/data/rheMac2
cp rheMac2.2bit /san/sanvol1/scratch/rheMac2/ -p
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.rheMac2
# edit doBlastzChainNet.pl to add lines for hg17 glitch
# in function getDbFromPath
# if ($db eq "hg") {
# $db = "hg17";
# }
#doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/rheMac2/rheMacOut/ >& do.log &
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
# failed during a san hiccup, finish that off, then:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=load `pwd`/DEF > swap.load.out 2>&1 &
time nice -n +19 featureBits rheMac2 chainMm8Link
# 877906099 bases of 2646704109 (33.170%) in intersection
# This was done before the loadUp script was improved to load
# empty tables on chroms that have no chains, so sometime later:
ssh hgwdev
hgLoadChain rheMac2 chrUr_chainMm8 /dev/null
# Loading 0 chains into rheMac2.chrUr_chainMm8
# 2006-04-24
############################################################################
# QUALITY SCORES (Done March 9, 2006 Robert)
ssh kkstore01
wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu:21/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.qv.bz2
cd /cluster/data/rheMac2/downloads/
bunzip2 v1.edit4.chrm.qv.bz2 &
bunzip2 v1.edit4.chrUr.bz2 &
nice fold --spaces v1.edit4.chrm.qv | sed -e 's/Chr/chr/' > rheMac2.chrm.qual.qv
nice fold --spaces v1.edit4.chrUr.qv | sed -e 's/Chr/chr/' > rheMac2.chrUr.qv
cat rheMac2.chrm.qual.qv rheMac2.chrUr.qv > rheMac2.qual.qv
qaToQac rheMac2.qual.qv temp.qac
gzip rheMac2.qual.qv
qacToWig -fixed temp.qac rheMac2.wig
rm temp.qac
wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib
mv all.qv rheMac2.qual.qv
qaToQac rheMac2.qual.qv temp.qac
qacToWig -fixed temp.qac rheMac2.wig
wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib
ssh hgwdev
hgLoadWiggle rheMac2 quality rheMac2.qual.wig
cp -p rheMac2.qual.qv.gz /usr/local/apache/htdocs/rheMac2/bigZips
# RUN DATEREPEATS TO GET LINEAGE-SPECIFIC (DONE 3/22/06 angie)
ssh kkstore01
cd /cluster/data/rheMac2
# First make per-chrom .fa.out files as should have been done:
tail +4 repeats/repeat.out \
| perl -we '$prevChr = ""; \
while (<>) { \
@w = split; $chr = $w[4]; $c = $chr; $c =~ s/^chr//; \
if ($chr ne $prevChr) { \
print "creating $c/$chr.fa.out\n"; \
system("cat repeats/repeats.header > $c/$chr.fa.out"); \
close(OUT); open(OUT, ">>$c/$chr.fa.out") || die; \
} \
print OUT; \
$prevChr = $chr; \
}'
# Run DateRepeats to get outputs on rodent, non-rodent mammal:
mkdir linSpecRep
cd linSpecRep
ln -s ../*/chr*.fa.out .
foreach f (*.fa.out)
echo $f:r:r
/cluster/bluearc/RepeatMasker/DateRepeats $f -query human \
-comp mouse -comp dog
end
mkdir /cluster/bluearc/rheMac2/linSpecRep
mkdir /cluster/bluearc/rheMac2/linSpecRep/notIn{Rodent,NonRodentMammal}
foreach f (*.out_mus*)
set chr = $f:r:r
echo $chr
/cluster/bin/scripts/extractRepeats 1 $f \
> /cluster/bluearc/rheMac2/linSpecRep/notInRodent/$chr.out.spec
/cluster/bin/scripts/extractRepeats 2 $f \
> /cluster/bluearc/rheMac2/linSpecRep/notInNonRodentMammal/$chr.out.spec
end
# BLASTZ SWAP FOR HUMAN (hg17) (DONE, 2006-03-22, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# Documented in makeHg17.doc in the section:
# SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2)
# alignments are in: /cluster/data/rheMac2/bed/blastz.hg17.swap
# SWAP CHAINS/NET RN4 (DONE 3/23/06 angie)
ssh kkstore01
mkdir /cluster/data/rheMac2/bed/blastz.rn4.swap
cd /cluster/data/rheMac2/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.rheMac2/DEF \
>& do.log & tail -f do.log
ln -s blastz.rn4.swap /cluster/data/rheMac2/bed/blastz.rn4
###########################################################################
# 5 way multiz WITH RheMac2 (robert, 5/3/06) -- MM8, RN4, CANFAM2, HG18
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
# first build 5-way multiz alignment from syntenic nets (helps reduce
# false positive predictions due to paralogous alignments)
# (prepare mafNet files from syntenic nets and copy to
# /san/sanvol1/scratch/rheMac2/mafNetSyn; do this for mm8, rn4, canFam2, rheMac2
ssh kkstore02
cd /cluster/data/rheMac2/bed
mkdir exoniphy
cd exoniphy
cat > makeSynMaf.sh << 'EOF'
#!/bin/bash
fromdb=$1
todb=$2
DIR=$3
outdir=$4/$2
base=`pwd`
cut -f1 /cluster/data/$fromdb/chrom.sizes > chrom.lst
echo making syntenic mafs for $todb in $DIR from $base
cd $DIR
cd axtChain
nice netFilter -syn $fromdb.$todb.net.gz > $fromdb.$todb.syn.net
mkdir -p netSynSplit
echo splitting nets
nice netSplit $fromdb.$todb.syn.net netSynSplit
mkdir -p chain
echo splitting chains $fromdb.$todb.all.chain.gz
nice chainSplit chain $fromdb.$todb.all.chain.gz
echo making mafs
mkdir -p $outdir
for i in `cat $base/chrom.lst` ; do echo "netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf " ; nice netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf ; done
echo removing temporary files
rm -rf axtChain/chain
rm -rf axtChain/netSynSplit/
# makes syntenic net from net
# with $1 as reference and $2 as query
# uses blastz alignment in $3 then converts to maf written to $4
# sample usage
# makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn
'EOF'
# << for emacs
makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn
makeSynMaf.sh rheMac2 hg18 /cluster/data/rheMac2/bed/blastz.hg18 /san/sanvol1/scratch/rheMac2/mafNetSyn
makeSynMaf.sh rheMac2 mm8 /cluster/data/rheMac2/bed/blastz.mm8 /san/sanvol1/scratch/rheMac2/mafNetSyn
makeSynMaf.sh rheMac2 rn4 /cluster/data/rheMac2/bed/blastz.rn4 /san/sanvol1/scratch/rheMac2/mafNetSyn
# make output dir and run dir
ssh pk
mkdir -p /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2
cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2
mkdir -p mafSyn runSyn
cd runSyn
# create scripts to run multiz on cluster
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set path=(/cluster/bin/penn/multiz.v11.x86_64/ $path )
set db = rheMac2
set c = $1
set maf = $2
set run = /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/
set tmp = /scratch/tmp/$db/multiz.syn.$c
set pairs = /san/sanvol1/scratch/$db/mafNetSyn
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == rheMac2) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run $path); rehash
$run/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << for emacs
chmod +x autoMultiz.csh
cat > gsub << 'EOF'
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
cut -f 1 /cluster/data/rheMac2/chrom.sizes > chrom.lst
set path = (/parasol/bin $path);rehash
gensub2 chrom.lst single gsub jobList
para create jobList
# 46 jobs
para try; para check
para push
# Load into database
ssh hgwdev
cd /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn
mkdir -p /gbdb/rheMac2/multiz5waySyn/mafSyn
ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn/*.maf \
/gbdb/rheMac2/multiz5waySyn/maf
cat > loadMaf.csh << 'EOF'
time hgLoadMaf -pathPrefix=/gbdb/rheMac2/multiz5waySyn/maf rheMac2 multiz5waySyn
cat *.maf | \
nice hgLoadMafSummary rheMac2 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz5waySummary stdin
'EOF'
#<< happy emacs
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_2"
hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_3"
# expect lengthy load time for this -- a few hours ?
csh loadMaf.csh >&! loadMaf.log &
#Loading multiz5waySyn into database
#Loaded 6207706 mafs in 21 files from /gbdb/rheMac2/multiz5waySyn/maf
#Advisory lock created
#Advisory lock has been released
#455.360u 156.210s 13:51.54 73.5% 0+0k 0+0io 237pf+0w
#Indexing and tabulating stdin
#Created 795772 summary blocks from 15566976 components and 6207706 mafs from stdin
#Loading into rheMac2 table multiz5waySummary...
#Loading completeAdvisory lock has been released
#drop index to improve performance
hgsql rheMac2 -e " alter table multiz5waySummary drop index chrom_3"
# build chromosome-by-chromosome SS files
cd /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2
mkdir run-ss-syn
cd run-ss-syn
mkdir -p /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn
cat > makeSS.csh << 'EOF'
#!/bin/csh -fe
set c = $1
/cluster/bin/phast/msa_view -i MAF -o SS /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/rheMac2/chrom/$c.fa | gzip -c > /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn/$c.ss.gz
'EOF'
# << for emacs
chmod +x makeSS.csh
foreach chr (`cut -f 1 /cluster/data/rheMac2/chrom.sizes`)
echo "makeSS.csh $chr" >> jobList.ss
end
para create jobList.ss
# 20 jobs
para try; para check
para push
# gzip mafs for downloads area
ssh kkstore01
cd cluster/data/hg17/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../mafSyn/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh >&! downloads.log
# ~2 hours
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/rheMac2/multiz5waySyn
mkdir $dir
ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafDownloads/{*.gz,md5sum.txt} $dir
cp README.txt $dir
#make multi5wayFrames table for browser track
ssh kkstore01
cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/
set sanDir = /san/sanvol1/scratch/rheMac2/multiz5waySyn/frames
mkdir -p $sanDir/maf
cp -rp maf/* $sanDir/maf
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
#edit Makefile to correct species names and set and sanDir
ssh hgwdev
cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
make getGenes >&! getGenes.log &
# ~1 minute
make getFrames >&! getFrames.log &
# NOTE: if jobs get hung up (e.g. running for hours, when
# they should run for minutes, do 'para stop' so that
# the 'para make' can restart the job
make loadDb >&! loadDb.log &
#trackDb entry
#track multiz5waySyn
#shortLabel Conservation
#longLabel Human/Mouse/Rat/Dog Multiz Alignment & Conservation using syntenic alignments
#group compGeno
#priority 104.1
#visibility pack
#color 0, 10, 100
#altColor 0,90,10
#type wigMaf 0.0 1.0
#maxHeightPixels 100:40:11
#wiggle phastCons5way
#pairwiseHeight 12
#spanList 1
#yLineOnOff Off
#autoScale Off
#summary multiz5waySummary
#frames multiz5wayFrames
#irows on
#speciesGroups mammal
#sGroup_mammal hg18 mm8 rn4 canFam2
#speciesDefaultOff
# request push to hgwdev
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore01
cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(cat ../mafSyn/*.maf | time genePredToMafFrames rheMac2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz | gzip >multiz5wayFrames.mafFrames.gz)>&log
ssh hgwdev
cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
hgLoadMafFrames rheMac2 multiz5wayFrames multiz5wayFrames.mafFrames.gz >&log
###############################################################
# PHASTCONS CONSERVATION (NOT DONE yet, 2006-05-06 Robert)
# coverage is only 3% and needs more tweaking
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
# NOTE: reusing cluster-friendly chrom fasta files created earlier
ssh kkstore01
mkdir /cluster/bluearc/rheMac2/chrom
cd /cluster/data/rheMac2
foreach f (`cat chrom.lst`)
echo $f
cp $f/*.fa /cluster/bluearc/rheMac2/chrom
end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes from previous runs -- this
# reduces run-time on the split, slows down the actual phastCons
# enough so jobs don't crash (jobs are very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons
cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod .
# edit, change to hg18, mm8, and rn4.
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn
set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/rheMac2/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../mafSyn/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 49 jobs
para try
para check
para push
# estimate model parameters
# estimate avg. cross-species avg. GC content from chr1 maf's
ssh kolossus
set path = ($path /cluster/bin/phast); rehash
cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
msa_view --aggregate rheMac2,hg18,rn4,mm8,canFam2 \
-i MAF \
--summary-only /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/chr1.maf \
> maf_summary.txt
awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
# 0.424
# Be sure to substitute in the right G+C content. Also, notice the
# target coverage of 0.17. We actually want 5% coverage here but
# the final (posterior) coverage is only indirectly related to the
# expected (prior) coverage. One thing to consider is that we
# only have about 40% alignment coverage (excluding chimp, which
# doesn't help us much in identifying conserved regions). As far
# as phastCons is concerned, we want to aim for about 0.05 / 0.4 =
# 0.125 coverage. In this case, though, --target-coverage
# 0.125 resulted in only about 4.1% coverage. I had to iterate
# a couple of times (using only chromosome 1) to find a value that
# got me close to the target of 5%
ssh pk
mkdir -p /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements
cd /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements
mkdir -p TREES
mkdir -p LOG
ls /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr*/*ss > all.windows
for f in `cat all.windows` ; do root=`basename $f .ss.gz` ; echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ; done
para create jobs.lst
para push
# check tree model on 5MB chunk, using params recommended by Adam,
# he ok'ed the results -- not necessary for next human run
ssh kolossus
cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr7/chr7.110000001-120000000.ss \
-o phyloFit.tree
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# cd ..
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
# mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons/in.list
ssh pk
cd /cluster/store11/gs.19/build36/bed/multiz5waySyn/cons/run.cons
gensub2 in.list single template jobList
para create jobList
# 295 jobs
para try
para check
para push
#Completed: 295 of 295 jobs
#CPU time in finished jobs: 9160s 152.67m 2.54h 0.11d 0.000 y
#IO & Wait Time: 2969s 49.48m 0.82h 0.03d 0.000 y
#Average job time: 41s 0.69m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 78s 1.30m 0.02h 0.00d
#Submission to last job: 276s 4.60m 0.08h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/rheMac2/bed/multiz5waySyn/cons
# load into database
ssh hgwdev
cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
hgLoadBed -strict rheMac2 phastConsElements5way mostConserved.bed
#Loaded 602351 elements of size 5
# compare with previous tracks
hgsql rheMac2 -e "select count(*) from phastConsElements5way"
# 2260575
# hgsql hg18 -e "select count(*) from phastConsElements"
# hg18 does not have phastConsElements table
# 1601903
# Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x
# compare with previous tracks
featureBits hg18 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg18 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x
# Create merged posterier probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons5way.wig phastCons5way.wib
# about 23 minutes for above
cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
phastCons17way phastCons17way.wig
# ~ 3 minute load
# Downloads (2006-02-22 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh > downloads.log
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
# File emailed from Xinwei She <xws@u.washington.edu>
mkdir /cluster/data/rheMac2/bed/genomicSuperDups
cd /cluster/data/rheMac2/bed/genomicSuperDups
sed -e 's/\t_\t/\t-\t/' rheMac2_genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed rheMac2 genomicSuperDups stdin\
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
######
# ENSEMBL (done 6/20/06 robert)
wget http://www.sanger.ac.uk/Users/lec/macaque_data/gtf.tar.gz
wget http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_cdnas.fa.gz
wget -q http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_peptides.fa.gz
gunzip *.gz
for i in `ls ensembl_macaque_gtf_data/` ; do echo ${i%%.gtf} ; awk '{print "chr"$0}' ensembl_macaque_gtf_data/$i > chr$i ; done
rm -f chrscaffold.gtf
cp ensembl_macaque_gtf_data/scaffold.gtf scaffold.gtf
cat chr*.gtf > ensGene.gtf
grep -v pseudo ensAll.gtf > ensGene.gtf
ldHgGene rheMac2 ensGene ensGene.gtf -gtf -genePredExt
Reading ensGene.gtf
Read 38561 transcripts in 741623 lines in 1 files
38561 groups 22 seqs 8 sources 4 feature types
38561 gene predictions
hgsql rheMac2 < ~/kent/src/hg/lib/ensGtp.sql
awk '{print $10,$12,$16}' ensGene.gtf |grep ENSMMUP|awk 'NF==3{print $0}'|sed -e 's/\"//g' | sed -e 's/\;//g'|sed -e 's/ /\t/g' |sort -u > ensGtp.tab
echo "load data local infile 'ensGtp.tab' into table ensGtp ignore 1 lines" | hgsql -N rheMac2
sed -e 's/_/ /' macaque_ensembl_peptides.fa > ensPep.fa
hgPepPred rheMac2 ensembl ensPep.fa
grep pseudogene ensAll.gtf > ensPseudo.gtf
grep -v protein_coding ensAll.gtf |grep -v pseudo > ensRna.gtf
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-04 markd)
cd /cluster/data/rheMac2/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/chr_gtf
wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins
# clean up and rename downloaded directorys:
mv ardor.wustl.edu/jeltje/rheMac2/chr_gtf .
mv ardor.wustl.edu/jeltje/rheMac2/proteins chr_ptx
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz
# add .a suffix to match transcript id
# add .a suffix to match transcript id
hgPepPred -suffix=.a rheMac2 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a rheMac2-specific page to describe informants
rhesus/rheMac2/nscanGene.html (copy from panTro2 and edit)
rhesus/rheMac2/trackDb.ra
# QA found:
Checking keys on database rheMac2
rheMac2.nscanPep.name - hits 22192 of 24608
Error: 2416 of 24608 elements of rheMac2.nscanPep.name are not in key nscanGene.name line 524 of all.joiner
Example miss: CHR1.5.052.A
# due to WUStL giving us a mismatched protein file.
# 2006-08-11 markd - obtained new protein sequences
wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins
hgPepPred -suffix=.a rheMac2 generic nscanPep proteins/chr*.fa.gz
joinerCheck -keys -database=rheMac2 -identifier=nscanName ~/compbio/kent/src/hg/makeDb/schema/all.joiner
Checking keys on database rheMac2
rheMac2.nscanPep.name - hits 22003 of 22003 ok
# 2006-08-11 markd - N-SCAN track has UTR only predictions, reload discarding
# these
ldHgGene -bin -gtf -requireCDS -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz
# CPGISLANDS (DONE - 2006-09-12 - Robert)
ssh hgwdev
mkdir -p /cluster/data/rheMac2/bed/cpgIsland
cd /cluster/data/rheMac2/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
mv cpglh.exe /cluster/data/rheMac2/bed/cpgIsland/
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh kkstore02
cd /cluster/data/rheMac2/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/rheMac2/bed/cpgIsland
hgLoadBed rheMac2 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
#Loaded 24226 elements of size 10
#Sorted
#Saving bed.tab
#Loading rheMac2
##########################################################################
# Hg 18 Reciprocal Best Nets (8-13-07, kpollard)
ssh kolossus
cd /cluster/data/hg18/bed/blastz.rheMac2/axtChain
mkdir rBestNet
cp /cluster/data/rheMac2/chrom.sizes /cluster/bluearc/rheMac2/.
cat > rbest.csh << 'EOF'
set swapChain = rheMac2ToHg18.swap.chain
echo "merge and swap $swapChain"
zcat /cluster/data/rheMac2/bed/liftOver/rheMac2ToHg18.over.chain.gz \
| chainSwap stdin stdout | chainMergeSort stdin > $swapChain
echo "split chain"
chainSplit swapChain/ $swapChain
echo "run netter"
chainPreNet $swapChain /cluster/bluearc/hg18/chrom.sizes \
/cluster/bluearc/rheMac2/chrom.sizes stdout | \
chainNet stdin -minSpace=1 /cluster/bluearc/hg18/chrom.sizes \
/cluster/bluearc/rheMac2/chrom.sizes stdout /dev/null | \
netSyntenic stdin rbest.noClass.net
echo "split net"
netSplit rbest.noClass.net rBestNet
echo "extract chains from net"
netChainSubset -verbose=0 rbest.noClass.net $swapChain stdout | \
chainMergeSort stdin | gzip -c > hg18.rheMac2.rbest.chain.gz
echo "split extracted chain"
chainSplit rBestChain/ hg18.rheMac2.rbest.chain.gz
cd ..
mkdir axtRBestNet
echo "extract axts"
foreach f (axtChain/rBestNet/*.net)
echo $f:t:r
netToAxt $f axtChain/swapChain/$f:t:r.chain \
/san/sanvol1/scratch/hg18/hg18.2bit \
/cluster/bluearc/rheMac2/rheMac2.2bit stdout | \
axtSort stdin stdout | gzip -c \
> axtRBestNet/$f:t:r.hg18.rheMac2.net.axt.gz
end
'EOF'
csh rbest.csh >&! rbest.log &
#Not yet loaded
############################################################################
## BLASTZ swap from monDom4 alignments (2007-11-11 - markd)
ssh hgwdev
mkdir /cluster/data/rheMac2/bed/blastz.monDom4.swap
cd /cluster/data/rheMac2/bed/blastz.monDom4.swap
ln -s blastz.monDom4.swap ../blastz.monDom4
/cluster/bin/scripts/doBlastzChainNet.pl \
-swap /cluster/data/monDom4/bed/blastz.rheMac2/DEF >& swap.out&
# fb.rheMac2.chainMonDom4Link.txt:
# 611785347 bases of 2646704109 (23.115%) in intersection
#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-15 - Hiram)
ssh kkstore01
screen # use screen to control this job
mkdir /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
cd /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
cat << '_EOF_' > DEF
# Rhesus vs marmoset
BLASTZ_M=50
# TARGET: Rhesus RheMac2
SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit
SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > blastz.log 2>&1 &
# real 2115m6.616s
# failed during the load due to MySQL problems on hgwdev
# run loadUp.csh manually on hgwdev
# real 28m35.926s
cat fb.rheMac2.chainCalJac1Link.txt
# 2055107003 bases of 2646704109 (77.648%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=download -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > download.log 2>&1 &
mkdir /cluster/data/calJac1/bed/blastz.rheMac2.swap
cd /cluster/data/calJac1/bed/blastz.rheMac2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16/DEF \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > swap.log 2>&1 &
# real 349m36.073s
cat fb.calJac1.chainRheMac2Link.txt
# 2191300051 bases of 2929139385 (74.810%) in intersection
###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-11-16 - Hiram)
ssh kkstore01
screen # use screen to control this job
mkdir /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
cat << '_EOF_' > DEF
# Rhesus vs marmoset
BLASTZ_M=50
# TARGET: Rhesus RheMac2
SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit
SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > blastz.log 2>&1 &
# something got lost during the para make. A couple of jobs couldn't
# finish, due to memory limits. Finish them off on hgwdev
# real 1911m11.906s
# Completed: 108924 of 108928 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 20658708s 344311.80m 5738.53h 239.11d 0.655 y
# IO & Wait Time: 1801215s 30020.25m 500.34h 20.85d 0.057 y
# Average job time: 206s 3.44m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5401s 90.02m 1.50h 0.06d
# Submission to last job: 111924s 1865.40m 31.09h 1.30d
# continuing after those were done:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=cat -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > cat.log 2>&1 &
# some jobs failed due to memory limits on kk nodes
# finished them manually on kolossus and hgwdev, then continuing
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
# real 322m54.734s
# failed during load due to MySQL problems on hgwdev
# run loadUp.csh manually on hgwdev
# real 22m22.225s
cat fb.rheMac2.chainPonAbe2Link.txt
# 2333264093 bases of 2646704109 (88.157%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=download -bigClusterHub=kk > download.log 2>&1 &
# and, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > swap.log 2>&1 &
# real 289m9.479s
cat fb.ponAbe2.chainRheMac2Link.txt
# 2556010573 bases of 3093572278 (82.623%) in intersection
############################################################################
# SGP GENES (DONE - 2007-12-20 - Hiram)
ssh kkstore01
mkdir /cluster/data/rheMac2/bed/sgp
cd /cluster/data/rheMac2/bed/sgp
mkdir gtf
for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes`
do
wget --timestamping \
"http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/SGP/200603mm8/${C}.gtf" \
-O "gtf/${C}.gtf"
done
ssh hgwdev
cd /cluster/data/rheMac2/bed/sgp
ldHgGene -gtf -genePredExt rheMac2 sgpGene gtf/chr*.gtf
# Read 31416 transcripts in 264479 lines in 22 files
# 31416 groups 22 seqs 1 sources 3 feature types
# 31416 gene predictions
featureBits rheMac2 sgpGene
# 32556064 bases of 2646704109 (1.230%) in intersection
featureBits rheMac2 -enrichment refGene:CDS sgpGene
# refGene:CDS 0.017%, sgpGene 1.230%, both 0.014%, cover 83.53%, enrich 67.91x
#####################################################################
# LOAD GENEID GENES (DONE - 2007-12-20 - Hiram)
ssh kkstore01
mkdir -p /cluster/data/rheMac2/bed/geneid
cd /cluster/data/rheMac2/bed/geneid
mkdir gtf prot
for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes`
do
echo $C
wget --timestamping \
"http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.gtf" \
-O "gtf/${C}.gtf"
wget --timestamping \
"http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.prot" \
-O "prot/${C}.prot"
done
# Add missing .1 to protein id's
cd prot
foreach f (*.prot)
perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
end
ssh hgwdev
cd /cluster/data/rheMac2/bed/geneid
ldHgGene -genePredExt -gtf rheMac2 geneid gtf/*.gtf
# Read 30202 transcripts in 248094 lines in 22 files
# 30202 groups 22 seqs 1 sources 3 feature types
# 30202 gene predictions
hgPepPred rheMac2 generic geneidPep prot/chr*-fixed.prot
featureBits rheMac2 geneid
# 33978840 bases of 2646704109 (1.284%) in intersection
featureBits rheMac2 -enrichment refGene:CDS geneid
# refGene:CDS 0.017%, geneid 1.284%, both 0.013%, cover 77.14%, enrich 60.09x
##########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
rheMac2.upstreamGeneTbl = refGene
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
+
############################################################################
+# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
+ screen # use a screen to manage this multi-day job
+ mkdir /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
+ cd /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
+
+ # same kind of parameters as used in human<->marmoset
+ cat << '_EOF_' > DEF
+# Rhesus vs. marmoset
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Rhesus RheMac2
+SEQ1_DIR=/scratch/data/rheMac2/rheMac2.2bit
+SEQ1_LEN=/scratch/data/rheMac2/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Marmoset (calJac3)
+SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
+SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
+SEQ2_LIMIT=50
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << this line keeps emacs coloring happy
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 248m5.651s
+ cat fb.rheMac2.chainCalJac3Link.txt
+ # 1871513554 bases of 2646704109 (70.711%) in intersection
+
+ mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap
+ cd /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11/DEF \
+ -swap -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 142m34.894s
+ cat fb.calJac3.chainHg19Link.txt
+ # 1916431926 bases of 2752505800 (69.625%) in intersection
+############################################################################
+