src/hg/makeDb/doc/loxAfr1.txt 1.8
1.8 2009/07/21 21:01:44 markd
added transmap update blurb
Index: src/hg/makeDb/doc/loxAfr1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/loxAfr1.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/makeDb/doc/loxAfr1.txt 4 Dec 2008 00:59:07 -0000 1.7
+++ src/hg/makeDb/doc/loxAfr1.txt 21 Jul 2009 21:01:44 -0000 1.8
@@ -1,1048 +1,1057 @@
# for emacs: -*- mode: sh; -*-
# Elephant May 2005 Broad Assembly
#
# DOWNLOAD SEQUENCE
ssh kkstore01
mkdir /cluster/store9/loxAfr1
cd /cluster/data
ln -s /cluster/store9/loxAfr1 loxAfr1
cd /cluster/data/loxAfr1
mkdir jkStuff bed
mkdir broad
cd broad
# ftp'ed with password from Broad
# -rw-rw-r-- 1 braney protein 81083988 May 20 20:08 assembly.agp
# -rw-rw-r-- 1 braney protein 799096831 May 20 20:30 assembly.bases.gz
# -rw-rw-r-- 1 braney protein 1340880552 May 20 21:07 assembly.quals.gz
# -rw-rw-r-- 1 braney protein 408350312 May 21 09:05 unplaced.fasta.gz
# -rw-rw-r-- 1 braney protein 1258465506 May 21 09:40 unplaced.qual.gz
gunzip assembly.bases.gz
faSize assembly.bases
# 2295548473 bases (0 N's 2295548473 real 2295548473 upper 0 lower) in 918949 sequences in 1 files
# Total size: mean 2498.0 sd 2280.1 min 600 (contig_719849) max 181008 (contig_82) median 1929
# N count: mean 0.0 sd 0.0
# U count: mean 2498.0 sd 2280.1
# L count: mean 0.0 sd 0.0
ssh kolossus
cd /cluster/data/loxAfr1/broad
/cluster/bin/x86_64/agpAllToFaFile assembly.agp assembly.bases ../loxAfr1.fa
cd ..
faSize loxAfr1.fa
# 3707398801 bases (1411850328 N's 2295548473 real 2295548473 upper 0 lower) in 233134 sequences in 1 files
# Total size: mean 15902.4 sd 26684.7 min 601 (scaffold_233133) max 615624 (scaffold_0) median 5715
# N count: mean 6056.0 sd 12052.4
# U count: mean 9846.5 sd 16481.1
# L count: mean 0.0 sd 0.0
/cluster/bin/scripts/agpToLift < assembly.agp > ../jkStuff/assembly.lft
# PARTITION SCAFFOLDS FOR REPEATMASKER RUN
# glom the tiny scaffolds up into ~50k collections
ssh kkstore01
cd /cluster/data/loxAfr1
mkdir chunks50k
faSplit about broad/assembly.bases 50000 chunks50k/chunk_
cd chunks50k
for i in 0 1 2 3 4 5 6 7 8 9; do mkdir $i; mv *$i.fa $i; done
# RUN REPEAT MASKER
# make the run directory, output directory, and job list
ssh kkstore01
cd /cluster/data/loxAfr1
tcsh
cat << '_EOF_' > jkStuff/RMElephant
#!/bin/csh -fe
cd $1
/bin/mkdir -p /tmp/loxAfr1/$2
/bin/cp $3 /tmp/loxAfr1/$2/
pushd /tmp/loxAfr1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -species elephant $2
popd
/bin/cp /tmp/loxAfr1/$2/$2.out ./
/bin/rm -fr /tmp/loxAfr1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/loxAfr1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/loxAfr1
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMElephant
mkdir RMRun RMOut
cd RMOut
mkdir 0 1 2 3 4 5 6 7 8 9
cd ../chunks50k
for i in */*.fa
do
e=`dirname $i`
d="/cluster/data/loxAfr1"
c=`basename $i`
echo "../jkStuff/RMElephant $d/RMOut/$e $c {check in line+ $d/chunks50k/$i} {check out line+ $d/RMOut/$e/$c.out}"
done > ../RMRun/RMJobs
# do the run
ssh kk
cd /cluster/data/loxAfr1/RMRun
para create RMJobs
para try, check, push, check,...
# Completed: 44022 of 44022 jobs
# CPU time in finished jobs: 20439985s 340666.42m 5677.77h 236.57d 0.648 y
# IO & Wait Time: 171349s 2855.81m 47.60h 1.98d 0.005 y
# Average job time: 468s 7.80m 0.13h 0.01d
# Longest finished job: 2125s 35.42m 0.59h 0.02d
# Submission to last job: 44073s 734.55m 12.24h 0.51d
# Lift up the split-scaffold .out's to scaffold .out's
ssh kkstore01
cd /cluster/data/loxAfr1/RMOut
for i in 0 1 2 3 4 5 6 7 8 9; do echo $i; liftUp -nohead $i.out ../jkStuff/assembly.lft warn $i/*.fa.out>/dev/null; done
head -3 0.out > loxAfr1.out
for i in 0 1 2 3 4 5 6 7 8 9; do tail +4 $i.out; done >> loxAfr1.out
# Load the .out files into the database with:
ssh hgwdev
hgLoadOut loxAfr1 /cluster/data/loxAfr1/RMOut/loxAfr1.out
# Lots of strange perc messages
# Loading up table loxAfr1_rmsk
# note: 6 records dropped due to repStart > repEnd
hgsql loxAfr1 -e 'rename table loxAfr1_rmsk to rmsk'
# Fix up the indices too:
hgsql loxAfr1 -e 'drop index bin on rmsk; drop index genoStart on rmsk; drop index genoEnd on rmsk; \
create index bin on rmsk (genoName(16), bin); \
create index genoStart on rmsk (genoName(16), genoStart); \
create index genoEnd on rmsk (genoName(16), genoEnd);'
# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie)
ssh kkstore01
mkdir /cluster/data/loxAfr1/bed/fakeAgp
cd /cluster/data/loxAfr1/bed/fakeAgp
faGapSizes ../../downloads/scaffolds.fasta \
-niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
# Wow, all block of N's seem to be exactly 100bp long.
# hgFakeAgp's default -minContigGap of 25 will be fine.
hgFakeAgp ../../downloads/scaffolds.fasta fake.agp
ssh hgwdev
hgLoadGap -unsplit loxAfr1 /cluster/data/loxAfr1/bed/fakeAgp/fake.agp
# SIMPLE REPEATS (TRF)
ssh kkstore01
mkdir /cluster/data/loxAfr1/bed/simpleRepeat
cd /cluster/data/loxAfr1/bed/simpleRepeat
nice trfBig -trf=/cluster/bin/i386/trf ../../loxAfr1.fa \
/dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \
|& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
> trf.log &
# check on this with
tail -f trf.log
# Load this into the database as so
ssh hgwdev
hgLoadBed loxAfr1 simpleRepeat /cluster/data/loxAfr1/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# FILTER SIMPLE REPEATS (TRF) INTO MASK
# make a filtered version of the trf output:
# keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/loxAfr1/bed/simpleRepeat
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES
ssh kolossus
cd /cluster/data/loxAfr1
/cluster/bin/x86_64/maskOutFa -soft loxAfr1.fa bed/simpleRepeat/trfMask.bed loxAfr1.simple.fa
/cluster/bin/x86_64/maskOutFa -softAdd loxAfr1.simple.fa RMOut/loxAfr1.out loxAfr1.masked.fa
mv loxAfr1.fa loxAfr1.unmasked.fa
# Now clean up the unmasked split scaffolds to avoid confusion later.
rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft
# CREATING DATABASE
# Create the database.
ssh hgwdev
# Make sure there is at least 5 gig free for the database
df -h /var/lib/mysql
Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 915G 746G 56% /var/lib/mysql
hgsql '' -e 'create database loxAfr1'
# STORE SEQUENCE AND ASSEMBLY INFORMATION
# Translate to 2bit
ssh kkstore01
cd /cluster/data/loxAfr1
/cluster/bin/x86_64/faToTwoBit loxAfr1.masked.fa loxAfr1.2bit
# Make chromInfo.tab.
mkdir bed/chromInfo
twoBitInfo loxAfr1.2bit stdout | awk '{printf "%s\t%s\t/gbdb/loxAfr1/loxAfr1.2bit\n",$1,$2;}' \
| sort > bed/chromInfo/chromInfo.tab
# Make symbolic a link from /gbdb/loxAfr1 to the 2bit.
ssh hgwdev
mkdir -p /gbdb/loxAfr1
ln -s /cluster/data/loxAfr1/loxAfr1.2bit /gbdb/loxAfr1/
# Load chromInfo table.
hgsql loxAfr1 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql loxAfr1 -e 'load data local infile "/cluster/data/loxAfr1/bed/chromInfo/chromInfo.tab" into table chromInfo'
# Make chrom.sizes from chromInfo contents and check scaffold count.
hgsql loxAfr1 -N -e 'select chrom,size from chromInfo' > /cluster/data/loxAfr1/chrom.sizes
wc -l /cluster/data/loxAfr1/chrom.sizes
# 233134 /cluster/data/loxAfr1/chrom.sizes
# CREATING GRP TABLE FOR TRACK GROUPING
# Copy all the data from the table "grp"
# in an existing database to the new database
ssh hgwdev
hgsql loxAfr1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/4/04 angie)
# Warning: genome and organism fields must correspond
# with defaultDb values
echo 'INSERT INTO dbDb \
(name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values \
("loxAfr1", "May. 2005", "/gbdb/loxAfr1", "Elephant", \
"", 1, 57, "Elephant", "Loxodonta africana", "/gbdb/loxAfr1/html/description.html", \
0, 0, "Broad May 2005");' \
| hgsql -h genome-testdb hgcentraltest
echo 'INSERT INTO defaultDb (genome, name) values ("Elephant", "loxAfr1");' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit trackDb/makefile to add loxAfr1 to the DBS variable.
mkdir -p rabbit/loxAfr1
# Create a simple rabbit/loxAfr1/description.html file.
cvs add drosophila/loxAfr1
cvs add drosophila/loxAfr1/description.html
make update DBS=loxAfr1 ZOO_DBS=
# go public on genome-test
cvs ci makefile
cvs ci rabbit/loxAfr1
mkdir /gbdb/loxAfr1/html
# in a clean, updated tree's kent/src/hg/makeDb/trackDb:
make alpha
# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ
# First, agglomerate small scaffolds into chunks of ~100k median
# (many scaffolds are larger than that) so we don't have too many
# files for one dir, but keep a reasonably low job run time:
ssh kkstore01
cd /cluster/data/loxAfr1
mkdir chunksUnsplit
faSplit about scaffolds.fa 100000 chunksUnsplit/chunk_
ssh kkr1u00
mkdir /iscratch/i/loxAfr1
cp -pR /cluster/data/loxAfr1/chunksUnsplit /iscratch/i/loxAfr1/
cp -p /cluster/data/loxAfr1/loxAfr1.2bit /iscratch/i/loxAfr1/
iSync
# PRODUCING GENSCAN PREDICTIONS (DONE 11/4/04 angie)
ssh kkstore01
# Make hard-masked scaffolds and split up for processing:
cd /cluster/data/loxAfr1
maskOutFa scaffolds.fa hard scaffolds.fa.masked
mkdir chunksUnsplitMasked
faSplit about scaffolds.fa.masked 100000 chunksUnsplitMasked/chunk_
mkdir /cluster/data/loxAfr1/bed/genscan
cd /cluster/data/loxAfr1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
ls -1S ../../chunksUnsplitMasked/chunk*.fa > chunks.list
cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 chunks.list single gsub jobList
ssh kki
cd /cluster/data/loxAfr1/bed/genscan
para create jobList
para try, check, push, check, ...
#Completed: 463 of 463 jobs
#Average job time: 12s 0.21m 0.00h 0.00d
#Longest job: 317s 5.28m 0.09h 0.00d
#Submission to last job: 445s 7.42m 0.12h 0.01d
# If there are crashes, diagnose with "para problems".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
# Concatenate scaffold-level results:
ssh kkstore01
cd /cluster/data/loxAfr1/bed/genscan
cat gtf/*.gtf > genscan.gtf
cat subopt/*.bed > genscanSubopt.bed
cat pep/*.pep > genscan.pep
# Clean up
rm -r /cluster/data/loxAfr1/chunksUnsplitMasked
# Load into the database as so:
ssh hgwdev
cd /cluster/data/loxAfr1/bed/genscan
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf loxAfr1 genscan genscan.gtf
hgPepPred loxAfr1 generic genscanPep genscan.pep
hgLoadBed loxAfr1 genscanSubopt genscanSubopt.bed
# MAKE DOWNLOADABLE FILES (DONE 11/4/04 angie)
# RECREATE BIGZIPS DOWNLOADS AND README FILE (DONE, 2006-05-05, hartera)
# ADDED LIFTOVER CHAIN FILES FOR DOWNLOAD AND ADDED md5sum.txt FILE FOR
# THESE LIFTOVER DOWNLOADS AND CREATED CORRECT md5sum.txt FOR vsMm7
# DOWNLOADS (DONE, 2006-05-23, hartera)
ssh kkstore01
mkdir /cluster/data/loxAfr1/zips
cd /cluster/data/loxAfr1
zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out
zip -j zips/scaffoldFa.zip scaffolds.fa
zip -j zips/scaffoldFaMasked.zip scaffolds.fa.masked
zip -j zips/scaffoldTrf.zip bed/simpleRepeat/trfMask.bed
foreach f (zips/*.zip)
echo $f
unzip -t $f | tail -1
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/loxAfr1
cd /usr/local/apache/htdocs/goldenPath/loxAfr1
mkdir bigZips database
# Create README.txt files in bigZips/ and database/ to explain the files.
cd bigZips
cp -p /cluster/data/loxAfr1/zips/*.zip .
md5sum *.zip > md5sum.txt
# Add more bigZips downloads. Some of the above downloads don't exist
# anymore in bigZips in /usr/local/apache/htdocs/goldenPath/... on hgwdev
# (2006-05-05, hartera)
ssh kkstore04
mkdir /cluster/data/loxAfr1/bigZips
cd /cluster/data/loxAfr1
# soft-masked scaffolds sequences
cp -p loxAfr1.masked.fa.gz ./bigZips/scaffoldSoftMask.fa.gz
# assembly agp file
cp -p ./broad/assembly.agp ./bigZips/
# Simple Repeats
cp -p ./bed/simpleRepeat/trfMask.bed ./bigZips/trf.bed
# RepeatMasker output
cp -p ./RMOut/loxAfr1.out.gz ./bigZips/rmsk.out
# unmasked scaffolds sequences
cp -p loxAfr1.unmasked.fa.gz ./bigZips/scaffold.fa.gz
cd bigZips
gzip assembly.agp
gzip trf.bed
gzip rmsk.out
# check integrity of files
foreach f (*.gz)
echo $f
gunzip -t $f | tail -1
end
md5sum *.gz > md5sum.txt
# link the *.gz and *.txt files to hgwdev:/usr/local/apache/....
ssh hgwdev
set gp=/usr/local/apache/htdocs/goldenPath/loxAfr1
rm -r $gp/bigZips
mkdir -p $gp/bigZips
ln -s /cluster/data/loxAfr1/bigZips/{*.gz,*.txt} $gp/bigZips
# copy over README.txt and edit for bigZips
# Move over loxAfr1ToMm7.over.chain.gz from hgwdevold and then
# add this and loxAfr1ToHg18.over.chain.gz to goldenPath liftOver dir.
# Add liftOver md5sum.txt. loxAfr1ToMm7.over.chain.gz is in the md5sum.txt
# for the vsMm7 directory which is wrong so recreate this md5sum.txt file
# (hartera, 2006-05-23)
ssh hgwdev
set gp=/usr/local/apache/htdocs/goldenPath/loxAfr1
# copy over from hgwdevold
cd /cluster/data/loxAfr1/bed/liftOver
scp \
hgwdevold:/usr/local/apache/htdocs/goldenPath/loxAfr1/liftOver/loxAfr1ToMm7.over.chain.gz .
cd $gp/liftOver
ln -s /cluster/data/loxAfr1/bed/liftOver/*.gz $gp/liftOver
# create md5sum.txt
md5sum *.gz > md5sum.txt
# recreate md5sum.txt for vsMm7 files
cd $gp/vsMm7
rm md5sum.txt
md5sum *.gz > md5sum.txt
# check that vsMm7 README.txt is correct.
# QA NOTE on bigZips above (10/26/07 rhead): a user found that rmsk.out.gz was
# corrupted. It had been gzipped twice. Gunzipped the file
# /cluster/store9/loxAfr1/bigZips/rmsk.out.gz, then renamed it with the .gz
# extension. Also updated the md5sum.txt file.
# SWAP DM1-DROANA1 BLASTZ (DONE 11/4/04 angie)
ssh kkstore01
mkdir /cluster/data/loxAfr1/bed/blastz.dm1.swap.2004-11-03
ln -s blastz.dm1.swap.2004-11-03 /cluster/data/loxAfr1/bed/blastz.dm1
cd /cluster/data/loxAfr1/bed/blastz.dm1
set aliDir = /cluster/data/dm1/bed/blastz.loxAfr1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
# With 11k scaffolds, we don't want a directory with one file per
# scaffold. So just make one .axt with everything -- not too huge
# anyway, given these little insect genomes.
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSort stdin dm1.axt
du -sh $aliDir/axtChrom dm1.axt
#389M /cluster/data/dm1/bed/blastz.loxAfr1/axtChrom
#389M dm1.axt
# CHAIN MELANOGASTER BLASTZ (DONE 11/4/04 angie)
# Run axtChain on kolossus (one big dm1.axt input)
ssh kolossus
mkdir /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
axtChain -verbose=0 ../dm1.axt /cluster/data/loxAfr1/loxAfr1.2bit \
/cluster/data/dm1/nib stdout \
| chainAntiRepeat /cluster/data/loxAfr1/loxAfr1.2bit \
/cluster/data/dm1/nib stdin stdout \
| chainMergeSort stdin > all.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
hgLoadChain -tIndex loxAfr1 chainDm1 all.chain
# NET MELANOGASTER BLASTZ (DONE 11/4/04 angie)
ssh kkstore01
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
netClass -noAr noClass.net loxAfr1 dm1 melanogaster.net \
|& g -v "table gap doesn't exist"
# Make a 'syntenic' subset:
ssh kkstore01
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
rm noClass.net
netFilter -syn melanogaster.net > melanogasterSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
netFilter -minGap=10 melanogaster.net | hgLoadNet loxAfr1 netDm1 stdin
netFilter -minGap=10 melanogasterSyn.net \
| hgLoadNet loxAfr1 netSyntenyDm1 stdin
# MAKE AXTNET (DONE 11/4/04 angie)
ssh kkstore01
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
netToAxt melanogaster.net all.chain /cluster/data/loxAfr1/loxAfr1.2bit \
/cluster/data/dm1/nib stdout \
| axtSort stdin melanogasterNet.axt
# MAKE VSDM1 DOWNLOADABLES (DONE 11/4/04 angie)
ssh kkstore01
cd /cluster/data/loxAfr1/bed/blastz.dm1/axtChain
nice gzip *.{chain,net,axt}
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/loxAfr1/vsDm1
cd /usr/local/apache/htdocs/goldenPath/loxAfr1/vsDm1
cp -p /cluster/data/loxAfr1/bed/blastz.dm1/axtChain/all.chain.gz \
melanogaster.chain.gz
cp -p /cluster/data/loxAfr1/bed/blastz.dm1/axtChain/melanogaster.net.gz .
cp -p /cluster/data/loxAfr1/bed/blastz.dm1/axtChain/melanogasterNet.axt.gz .
# Make a README.txt which explains the files & formats.
md5sum *.gz */*.gz > md5sum.txt
# MAKE 11.OOC FILE FOR BLAT (DONE 11/4/04 angie)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# fly size is ~4.4% of human judging by gapless dm1 genome size from
# featureBits -- we would use 45, but bump that up a bit to be more
# conservative).
ssh kkr1u00
mkdir /cluster/bluearc/loxAfr1
blat /cluster/data/loxAfr1/loxAfr1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/loxAfr1/11.ooc -repMatch=100
#Wrote 9721 overused 11-mers to /cluster/bluearc/loxAfr1/11.ooc
cp -p /cluster/bluearc/loxAfr1/*.ooc /iscratch/i/loxAfr1/
iSync
# GET GENBANK mRNA AND EST COUNTS
# Go to the latest GenBank full release dir and get an idea of how
# many mRNAs and ESTs there are to align.
ssh eieio
cd /cluster/data/genbank/data/processed/genbank.144.0/full
awk '$4 == "Elephant" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c
# 9 Drosophila ananassae
# 1 Drosophila mojavensis
# 33 Drosophila virilis
# Wow, questionable whether we should have a native mRNA track here.
awk '$4 == "Drosophila" {print $4 " " $5;}' est*.gbidx | sort | uniq -c
# 382439 Drosophila melanogaster
# 4105 Drosophila simulans
# 779 Drosophila yakuba
# And a native EST track isn't even a possibility for the new flies
# at this point!
# AUTO UPDATE GENBANK MRNA RUN (DONE 11/16/04 angie)
ssh hgwdev
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# Edit etc/genbank.conf and add these lines (note scaffold-browser settings):
# loxAfr1 (D. ananassae)
loxAfr1.genome = /iscratch/i/loxAfr1/loxAfr1.2bit
loxAfr1.mondoTwoBitParts = 1000
loxAfr1.lift = no
loxAfr1.refseq.mrna.native.load = no
loxAfr1.refseq.mrna.xeno.load = yes
loxAfr1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
loxAfr1.genbank.mrna.xeno.load = yes
# GenBank has no D. ananassae ESTs at this point... that may change.
loxAfr1.genbank.est.native.load = no
loxAfr1.genbank.est.xeno.load = no
loxAfr1.downloadDir = loxAfr1
loxAfr1.perChromTables = no
cvs ci etc/genbank.conf
# Since D. ananassae is a new species for us, edit src/lib/gbGenome.c.
# Pick some other browser species, & monkey-see monkey-do.
cvs diff src/lib/gbGenome.c
make
cvs ci src/lib/gbGenome.c
# Edit src/align/gbBlat to add /iscratch/i/loxAfr1/11.ooc
cvs diff src/align/gbBlat
make
cvs ci src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
ssh `fileServer /cluster/data/genbank/`
cd /cluster/data/genbank
# This is an -initial run, (xeno) RefSeq only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial loxAfr1 &
tail -f [its logfile]
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad loxAfr1
featureBits loxAfr1 xenoRefGene
#16520385 bases of 165766797 (9.966%) in intersection
# Clean up:
rm -rf work/initial.loxAfr1
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial loxAfr1 &
tail -f [its logfile]
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad loxAfr1
featureBits loxAfr1 all_mrna
#19602 bases of 165766797 (0.012%) in intersection
featureBits loxAfr1 xenoMrna
#17295487 bases of 165766797 (10.434%) in intersection
# Clean up:
rm -rf work/initial.loxAfr1
# MAKE GCPERCENT (DONE 11/4/04 angie)
ssh hgwdev
mkdir /cluster/data/loxAfr1/bed/gcPercent
cd /cluster/data/loxAfr1/bed/gcPercent
# create and load gcPercent table
hgGcPercent loxAfr1 /cluster/data/loxAfr1
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 12/?/04 heather)
ssh hgwdev
echo 'insert into blatServers values("loxAfr1", "blat14", "17780", 1, 0); \
insert into blatServers values("loxAfr1", "blat14", "17781", 0, 1);' \
| hgsql -h genome-testdb hgcentraltest
# MAKE Drosophila Proteins track (DONE braney 11/17/04)
ssh kkstore01
mkdir -p /cluster/data/loxAfr1/blastDb
cd /cluster/data/loxAfr1/blastDb
faSplit sequence ../scaffolds.fa 400 x
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/loxAfr1/blastDb
cp /cluster/data/loxAfr1/blastDb/* /iscratch/i/loxAfr1/blastDb
(iSync) 2>&1 > sync.out
mkdir -p /cluster/data/loxAfr1/bed/tblastn.dm1FB
cd /cluster/data/loxAfr1/bed/tblastn.dm1FB
ls -1S /iscratch/i/loxAfr1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
exit
# back to kkstore01
cd /cluster/data/loxAfr1/bed/tblastn.dm1FB
mkdir fbfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
# 18735/(150000/396) = 49.460400
split -l 49 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp
exit 1
'_EOF_'
chmod +x blastSome
gensub2 bug.lst fb.lst blastGsub blastSpec
ssh kk
cd /cluster/data/loxAfr1/bed/tblastn.dm1FB
para create blastSpec
para try, push
# Completed: 151668 of 151668 jobs
# CPU time in finished jobs: 2932565s 48876.08m 814.60h 33.94d 0.093 y
# IO & Wait Time: 694006s 11566.77m 192.78h 8.03d 0.022 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 2721s 45.35m 0.76h 0.03d
# Submission to last job: 73860s 1231.00m 20.52h 0.85d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
# should run this on the mini-cluster or with my shove script
# so you can limit the number of jobs starting to 3 or 4
para try, push...
# Completed: 383 of 383 jobs
# CPU time in finished jobs: 327s 5.44m 0.09h 0.00d 0.000 y
# IO & Wait Time: 8218s 136.97m 2.28h 0.10d 0.000 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest job: 54s 0.90m 0.01h 0.00d
# Submission to last job: 674s 11.23m 0.19h 0.01d
exit
# back to kkstore01
cd /cluster/data/loxAfr1/bed/tblastn.dm1FB/blastOut
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/loxAfr1/bed/tblastn.dm1FB/blastDm1FB.psl
ssh hgwdev
cd /cluster/data/loxAfr1/bed/tblastn.dm1FB
hgLoadPsl loxAfr1 blastDm1FB.psl
# End tblastn
# SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 3/2/05 angie)
ssh kkstore01
mkdir /cluster/data/loxAfr1/bed/blastz.dm2.swap
cd /cluster/data/loxAfr1/bed/blastz.dm2.swap
doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.loxAfr1/DEF \
>& do.log &
tail -f do.log
# Add {chain,net}Dm2 to trackDb.ra if necessary.
# Add /usr/local/apache/htdocs/goldenPath/loxAfr1/vsDm2/README.txt
#########################################################################
# BLASTZ NOTE: with the advent of Angie's script to run the
# blastz process through to chains and nets loaded into the
# database and download files prepared, it is now a juggling act
# to see which klusters are available. The particular options to
# the script to make it go to one kluster or another are to be
# determined at run-time. The typical run-times listed here will
# be a factor in your choice of kluster to operate on.
#########################################################################
# BLASTZ HUMAN Hg17
ssh kk
mkdir /cluster/data/loxAfr1/bed/blastz.hg17
cd /cluster/data/loxAfr1/bed/blastz.hg17
cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse
SEQ1_DIR=/panasas/store/loxAfr1/nib
# not used
SEQ1_RMSK=/panasas/store/loxAfr1/rmsk
# not used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/panasas/store/loxAfr1/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Human
SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs
# RMSK not currently used
SEQ2_RMSK=
# FLAG not currently used
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/loxAfr1/bed/blastzHg17.2005_03_14
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << keep emacs coloring happy
cp /cluster/data/loxAfr1/chrom.sizes ./S1.len
sort -rn +1 /cluster/data/hg17/chrom.sizes > S2.len
# establish a screen to control this job
screen
time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \
blast.run.out 2>&1 &
# real 993m28.547s
# user 0m0.198s
# sys 0m0.171s
# detach from screen session: Ctrl-a Ctrl-d
# to reattach to this screen session:
ssh kksilo
screen -d -r
# STARTED - 2005-03-17 21:25
# FINISHED - 2005-03-18 14:00
# Completed: 45347 of 45347 jobs
# CPU time in finished jobs: 16921981s 282033.02m 4700.55h 195.86d 0.537 y
# IO & Wait Time: 2381711s 39695.18m 661.59h 27.57d 0.076 y
# Average job time: 426s 7.09m 0.12h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 9568s 159.47m 2.66h 0.11d
# Submission to last job: 58695s 978.25m 16.30h 0.68d
# Completed: 331 of 331 jobs
# CPU time in finished jobs: 272s 4.54m 0.08h 0.00d 0.000 y
# IO & Wait Time: 1145s 19.08m 0.32h 0.01d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest job: 24s 0.40m 0.01h 0.00d
# Submission to last job: 265s 4.42m 0.07h 0.00d
# The kki batch doChainRun.csh appears to have failed
# due to underlying changes in the location of hg17 items
# fixup the symlinks which are in a state of flux today, then,
# to recover:
ssh kki
cd /cluster/data/loxAfr1/bed/blastzHg17.2005_03_14/axtChain/run
rm -fr chain
time ./doChainRun.csh
# real 22m47.917s
# user 0m0.380s
# sys 0m0.630s
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 6373s 106.22m 1.77h 0.07d 0.000 y
# IO & Wait Time: 552s 9.20m 0.15h 0.01d 0.000 y
# Average job time: 173s 2.89m 0.05h 0.00d
# Longest job: 662s 11.03m 0.18h 0.01d
# Submission to last job: 1200s 20.00m 0.33h 0.01d
# That was the last part of the chainRun step, can now continue:
ssh kksilo
cd /cluster/data/loxAfr1/bed/blastzHg17.2005_03_14
time /cluster/bin/scripts/doBlastzChainNet.pl -continue chainMerge `pwd`/DEF > chainMerge.run.out 2>&1 &
# STARTED - 2005-03-18 15:00
# FINISHED 2005-03-18 16:33
# checking the numbers for sanity:
ssh hgwdev
# expect ~ 2m30 seconds for chain measurement
time featureBits loxAfr1 chainHg17
# 2596946329 bases of 2597150411 (99.992%) in intersection
time featureBits mm5 chainHg17
# 2507720521 bases of 2615483787 (95.880%) in intersection
# expect ~ 2m30s seconds for net measurement
time featureBits loxAfr1 netHg17
# 2579747741 bases of 2597150411 (99.330%) in intersection
time featureBits mm5 netHg17
# 2504056038 bases of 2615483787 (95.740%) in intersection
ssh kolossus
# expect ~ 20-22 minutes for the chainLink measurement
HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
featureBits loxAfr1 chainHg17Link
# 966916309 bases of 2597150411 (37.230%) in intersection
HGDB_CONF=~/.hg.conf.read-only /usr/bin/time --portability \
featureBits mm5 chainHg17Link
# 1025750185 bases of 2615483787 (39.218%) in intersection
# swap results to place loxAfr1 alignments onto Hg17
time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
swap.run.out 2>&1 &
# STARTED - 2005-03-29 - 15:58
# FINI - 2005-03-29 - 18:48
# real 171m26.172s
# user 0m2.270s
# sys 0m0.870s
ssh kolossus
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm6Link
# 969459954 bases of 2866216770 (33.824%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainMm5Link
# 1020106336 bases of 2866216770 (35.591%) in intersection
# A measurement script to do all featureBits combinations:
cd /cluster/data/loxAfr1/jkStuff
cat << '_EOF_' > netChainCheck.sh
#!/bin/sh
usage()
{
echo "usage: netChainCheck.sh <db0> <db1> <targetDb>"
echo " does: featureBits <db0> net<targetDb>"
echo " featureBits <db1> net<targetDb>"
echo " as well as the chain and chainLink tables,"
echo " and on the targetDb:"
echo " featureBits <targetDb> net<db0>"
echo " featureBits <targetDb> net<db1>"
echo " and the chain and chainLink tables."
echo -e "\texample: netChainCheck.sh loxAfr1 mm5 fr1"
}
doOne()
{
db=$1
tbl=$2
echo " featureBits $db $tbl"
echo -en " #\t"
time featureBits $db $tbl
}
ucFirstLetter()
{
ucString="$1"
fc=`echo "${ucString}" | sed -e "s/\(.\).*/\1/"`
rest=`echo "${ucString}" | sed -e "s/.\(.*\)/\1/"`
FC=`echo "${fc}" | tr '[a-z]' '[A-Z]'`
echo "${FC}${rest}"
}
if [ "$#" -ne 3 ]; then
usage
exit 255
fi
db0=$1
db1=$2
targetDb=$3
targetDB=`ucFirstLetter "${targetDb}"`
DB0=`ucFirstLetter "${db0}"`
DB1=`ucFirstLetter "${db1}"`
export db0 db1 targetDb targetDB DB0 DB1
# echo "${db0} ${db1} ${targetDb} ${targetDB} ${DB0} ${DB1}"
doOne "${db0}" net${targetDB}
doOne "${db1}" net${targetDB}
doOne "${db0}" chain${targetDB}
doOne "${db1}" chain${targetDB}
doOne "${db0}" chain${targetDB}Link
doOne "${db1}" chain${targetDB}Link
doOne ${targetDb} net${DB0}
doOne ${targetDb} net${DB1}
doOne ${targetDb} chain${DB0}
doOne ${targetDb} chain${DB1}
doOne ${targetDb} chain${DB0}Link
doOne ${targetDb} chain${DB1}Link
'_EOF_'
# << keep emacs coloring happy
# CHAIN AND NET SWAPPED HUMAN BLASTZ (WORKING 12/22/05 kate)
# Working in Brian's blastz dir (not doced).
# This procedure follows conventions in doBlastzNetChain.pl,
# so it can be used to complete the processing
ssh kki
cd /cluster/data/loxAfr1/bed/zb.hg17
ln -s `pwd` /cluster/data/hg17/bed/blastz.loxAfr1
mkdir -p axtChain/run/chain
ls -1S /cluster/data/loxAfr1/bed/zb.hg17/axtChrom/*.axt.gz | \
sed 's/.gz//' > axtChain/run/input.lst
cd axtChain/run
cat > chain.csh << 'EOF'
#!/bin/csh -ef
zcat $1 | \
axtChain -verbose=0 -minScore=3000 -linearGap=medium stdin \
/cluster/data/hg17/nib /cluster/data/loxAfr1/loxAfr1.2bit stdout | \
chainAntiRepeat /cluster/data/hg17/nib /cluster/data/loxAfr1/loxAfr1.2bit \
stdin $2
# delay so parasol check out will succeed on output file
sleep 5
'EOF'
# << happy emacs
cat > gsub << 'EOF'
#LOOP
chain.csh {check in exists $(path1).gz} {check out line+ chain/$(root1).chain}
#ENDLOOP
'EOF'
# << happy emacs
chmod a+x chain.csh
gensub2 input.lst single gsub jobList
para create jobList; para try
para check
para push
# wait for completion
para time > run.time
# finish pipeline
cd /cluster/data/hg17/bed/blastz.loxAfr1
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=chainMerge \
-bigClusterHub=pk \
`pwd`/DEF >& blastz.out &
############################################################################
## create missing hg18 nets (DONE 2006-05-12 markd)
## hg18 chains exist, but are not documented in this file.
# create the net filee
ssh hgwdev
cd /cluster/data/loxAfr1/bed/blastz.hg18.swap/axtChain
nice netClass -verbose=0 -noAr noClass.net loxAfr1 hg18 loxAfr1.hg18.net
nice gzip loxAfr1.hg18.net
############################################################################
## BLASTZ swap from mm9 alignments (2007-11-11 - markd)
ssh hgwdev
mkdir /cluster/data/loxAfr1/bed/blastz.mm9.swap
cd /cluster/data/loxAfr1/bed/blastz.mm9.swap
ln -s blastz.mm9.swap ../blastz.mm9
/cluster/bin/scripts/doBlastzChainNet.pl \
-swap /cluster/data/mm9/bed/blastz.loxAfr1/DEF >& swap.out&
# fb.loxAfr1.chainMm9Link.txt:
# 468255296 bases of 2295548473 (20.398%) in intersection
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# loxAfr1 - Elephant - Ensembl Genes version 51 (DONE - 2008-12-03 - hiram)
ssh kkr14u07
cd /hive/data/genomes/loxAfr1
cat << '_EOF_' > loxAfr1.ensGene.ra
# required db variable
db loxAfr1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
# names that are not in the UCSC assembly
skipInvalid yes
# ignore the one gene that has an invalid structure from Ensembl:
# 13002: ENSLAFT00000000586 no exonFrame on CDS exon 1
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=51 loxAfr1.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/loxAfr1/bed/ensGene.51
featureBits loxAfr1 ensGene
# 21666114 bases of 2295548473 (0.944%) in intersection
*** All done! (through the 'makeDoc' step)
*** Steps were performed in /hive/data/genomes/loxAfr1/bed/ensGene.51
############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
+
+vertebrate-wide transMap alignments were built Tracks are created and loaded
+by a single Makefile. This is available from:
+ svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################