src/hg/makeDb/doc/galGal3.txt 1.27
1.27 2009/03/13 23:55:40 markd
added galGal3 N-SCAN
Index: src/hg/makeDb/doc/galGal3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/galGal3.txt,v
retrieving revision 1.26
retrieving revision 1.27
diff -b -B -U 1000000 -r1.26 -r1.27
--- src/hg/makeDb/doc/galGal3.txt 25 Nov 2008 20:22:45 -0000 1.26
+++ src/hg/makeDb/doc/galGal3.txt 13 Mar 2009 23:55:40 -0000 1.27
@@ -1,2648 +1,2676 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on the
# Chicken (Gallus gallus) May 2006 release.
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
#########################################################################
# CREATE BUILD DIRECTORY (DONE 5/14/06 angie)
ssh kkstore03
mkdir /cluster/store6/galGal3
ln -s /cluster/store6/galGal3 /cluster/data/
#########################################################################
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 5/14/06 angie)
mkdir /cluster/data/galGal3/M
cd /cluster/data/galGal3/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "gallus mitochondrion genome". That shows the gi number:
# 5834843
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Gallus gallus mitochondrion complete genome, and then replace the
# header line with just ">chrM".
#########################################################################
# DOWNLOAD, UNPACK AND CHECK CHROM FASTA & AGP (DONE 5/14/06 angie)
mkdir /cluster/data/galGal3/downloads
cd /cluster/data/galGal3/downloads
wget ftp://genome.wustl.edu/pub/user/lhillier/private/chicken_060412.tar.gz
tar xvzf chicken_060412.tar.gz
cp /dev/null ../chrom.lst
foreach agp (*.agp)
set chr = $agp:r
echo $chr
set fa = $chr.fa
if (! -e $fa) then
echo "*** No fasta for $agp"
break
endif
set c = `echo $chr | sed -e 's/^chr//; s/_random$//;'`
if (! -d ../$c) mkdir ../$c
mv $agp $fa ../$c
end
cd ..
# checkAgpAndFa prints out way too much info -- keep the end/stderr only:
ls -1d ?{,?} E* | sed -e 's@/$@@' > chrom.lst
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c{,_random}.agp)
if (-e $agp) then
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
endif
end
end
faSize */chr*.fa
#Total size: mean 19306674.4 sd 39157213.7 min 1028 (chr32) max 200994015 (chr1) median 2031799
#########################################################################
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 5/14/06 angie)
ssh kkstore03
cd /cluster/data/galGal3
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c{,_random}.agp)
if (-e $agp) then
set fa = $agp:r.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
# splitFaIntoContigs makes new dirs for _randoms. Move their contents
# back into the main chrom dirs and get rid of the _random dirs.
foreach d (*_random)
set base = `echo $d | sed -e 's/_random$//'`
mv $d/lift/oOut.lst $base/lift/rOut.lst
mv $d/lift/ordered.lft $base/lift/random.lft
mv $d/lift/ordered.lst $base/lift/random.lst
rmdir $d/lift
mv $d/* $base
rmdir $d
end
# Un/ has Un_random but for some reason o* files were created. Rename:
foreach f (Un/lift/*)
set g = `echo $f | sed -e 's/ordered/random/; s@/o@/r@;'`
mv $f $g
end
# Make a "pseudo-contig" for processing chrM too:
mkdir M/chrM_1
sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
mkdir M/lift
echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
echo "chrM_1" > M/lift/ordered.lst
set mSize = `faSize M/chrM.fa | awk '{print $1;}'`
echo "0 M/chrM_1 $mSize chrM $mSize" > M/lift/ordered.lft
#########################################################################
# REPEAT MASKING (DONE 5/15/06 angie)
# RepeatMasker version and library version:
# March 20 2006 (open-3-1-5) version of RepeatMasker
grep RELEASE /cluster/bluearc/RepeatMasker060320/Libraries/RepeatMaskerLib.embl
#CC RELEASE 20060315; *
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kkstore03
cd /cluster/data/galGal3
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/galGal3
mkdir jkStuff
cat << '_EOF_' > jkStuff/RMChicken
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/galGal3/$2
/bin/cp $2 /tmp/galGal3/$2/
cd /tmp/galGal3/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2
popd
/bin/cp /tmp/galGal3/$2/$2.out ./
if (-e /tmp/galGal3/$2/$2.align) /bin/cp /tmp/galGal3/$2/$2.align ./
if (-e /tmp/galGal3/$2/$2.tbl) /bin/cp /tmp/galGal3/$2/$2.tbl ./
if (-e /tmp/galGal3/$2/$2.cat) /bin/cp /tmp/galGal3/$2/$2.cat ./
/bin/rm -fr /tmp/galGal3/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3
'_EOF_'
# << emacs
chmod +x jkStuff/RMChicken
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}{,_random}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/galGal3/jkStuff/RMChicken \
/cluster/data/galGal3/$d $f \
'{'check out line+ /cluster/data/galGal3/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh pk
cd /cluster/data/galGal3/RMRun
para make RMJobs
para time
#Completed: 2492 of 2492 jobs
#CPU time in finished jobs: 2613579s 43559.64m 725.99h 30.25d 0.083 y
#IO & Wait Time: 17626s 293.77m 4.90h 0.20d 0.001 y
#Average job time: 1056s 17.60m 0.29h 0.01d
#Longest finished job: 1335s 22.25m 0.37h 0.02d
#Submission to last job: 8741s 145.68m 2.43h 0.10d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore03
cd /cluster/data/galGal3
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/galGal3
hgLoadOut galGal3 */chr*.fa.out
# Compare coverage to previous release:
featureBits galGal3 rmsk
#102214417 bases of 1042591351 (9.804%) in intersection
# galGal2 coverage:
#104249260 bases of 1054197620 (9.889%) in intersection
# Uh-oh -- in the past, a drop in coverage has meant an RM problem.
# However, spot-checking the per-chrom coverage of galGal2 vs. galGal3,
# it seems like many small or random chroms have simply had a lot
# of repetitive sequence cut from them (significant drop in size as well
# as drop in coverage), while for many of the large chroms, the coverage
# has gone up. Some chroms grew but the rmsk coverage did not keep up.
# chrUn is quite a bit smaller (121M --> 53M). I saved the per-chrom
# coverage figures in /cluster/data/galGal3/RMCompare/ .
#galGal2:chr1 24219568 183744490 13.181
#galGal3:chr1 25804441 195192300 13.220
#galGal2:chr1_random 186317 1261352 14.771
#galGal3:chr1_random 12930 213918 6.044
#galGal2:chr10 807202 18954178 4.259
#galGal3:chr10 858376 20484937 4.190
#galGal2:chr10_random 525320 3515812 14.942
#galGal3:chr10_random 644 13679 4.708
#galGal2:chr2 16579726 143798269 11.530 [exception.
#galGal3:chr2 17182793 150358687 11.428 [
#galGal2:chr2_random 4670 53846 8.673 [
#galGal3:chr2_random 16589 127706 12.990 [
#galGal2:chr18 461248 8797585 5.243
#galGal3:chr18 507614 10513347 4.828
#galGal2:chrE22C19W28 2628 47202 5.568
#galGal2:chrE50C23 690 10171 6.784
#galGal3:chrE22C19W28_E50C23 42486 822662 5.164
#galGal2:chrUn 18697113 121198700 15.427
#galGal3:chrUn_random 11075578 53400422 20.741
#galGal2:chrW 530067 4135691 12.817
#galGal3:chrW 133707 233885 57.168
#galGal2:chrW_random 109531 229903 47.642
#galGal3:chrW_random 311315 625108 49.802
#galGal2:chrZ 4629721 30832492 15.016
#galGal3:chrZ 10599358 67536383 15.694
#galGal2:chrZ_random 2172389 14348615 15.140
#galGal3:chrZ_random 115568 309836 37.300
# So I'm inclined to think that it's most likely not a RepeatMasker bug
# but a change in the underlying assembly. Something to run by LaDeana
# after reading the release notes...
#########################################################################
# MAKE LIFTALL.LFT (DONE 5/15/06 angie)
ssh kkstore03
cd /cluster/data/galGal3
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
#########################################################################
# CREATING DATABASE (DONE 5/15/06 angie)
ssh hgwdev
echo 'create database galGal3' | hgsql ''
# old hgwdev 5/15:
# We are awfully tight here but at least not out of space:
df -h /var/lib/mysql
#/dev/sdc1 1.8T 1.6T 65G 97% /var/lib/mysql
# New hgwdev 5/18:
df -h /data/mysql
#/dev/sdc1 1.8T 685G 1.1T 40% /data/mysql
# Copy over the grp table from previous release:
echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
| hgsql galGal3
#########################################################################
# GOLD AND GAP TRACKS (DONE 5/15/06 angie)
ssh hgwdev
cd /cluster/data/galGal3
hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 .
# featureBits complains if there's no chrM_gap, so make one:
# echo "create table chrM_gap like chr1_gap" | hgsql galGal3
# oops, that won't work until v4.1, so do this for the time being:
echo "create table chrM_gap select * from chr1_gap where 0=1" \
| hgsql galGal3
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 5/15/06 angie)
# TRF runs pretty quickly now... it takes a few hours total runtime,
# so instead of binrsyncing and para-running, just do this on the
# local fileserver
ssh kkstore03
mkdir /cluster/data/galGal3/bed
mkdir /cluster/data/galGal3/bed/simpleRepeat
cd /cluster/data/galGal3/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/galGal3/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
end
csh -efx jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
#259 jobs.csh
ls -1 trf | wc -l
#259
endsInLf trf/*
#trf/chr12_random_1.bed zero length
#trf/chr13_random_1.bed zero length
# That's OK.
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/galGal3/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into the database:
ssh hgwdev
hgLoadBed galGal3 simpleRepeat \
/cluster/data/galGal3/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
featureBits galGal3 simpleRepeat
#9650062 bases of 1042591351 (0.926%) in intersection
# galGal2 coverage:
#8434365 bases of 1054197620 (0.800%) in intersection
###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/galGal3/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed galGal3 microsat microsat.bed
#########################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE 5/15/06 angie)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore03
cd /cluster/data/galGal3/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/galGal3
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# Here's the coverage for the filtered TRF:
ssh hgwdev
cat /cluster/data/galGal3/bed/simpleRepeat/trfMaskChrom/*.bed \
> /tmp/filtTrf.bed
featureBits galGal3 /tmp/filtTrf.bed
#4110365 bases of 1042591351 (0.394%) in intersection
# galGal2 coverage:
#4510381 bases of 1054197620 (0.428%) in intersection
#########################################################################
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 5/15/06 angie)
# Note: just to keep things consistent, redid chr1 and chr2 2/26 with
# the ProcessRepeats-only rerun results (only masking changes were
# 5bp in chr1 and 3bp in chr2)
ssh kkstore03
cd /cluster/data/galGal3
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
# Make 2bit for blat/browser usage:
faToTwoBit */chr*.fa galGal3.2bit
# Make soft-masked nib for blastz:
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
#########################################################################
# MAKE CHROMINFO TABLE WITH 2BIT (DONE 5/15/06 angie)
ssh kkstore03
cd /cluster/data/galGal3
mkdir bed/chromInfo
twoBitInfo galGal3.2bit stdout \
| awk '{print $1 "\t" $2 "\t/gbdb/galGal3/galGal3.2bit";}' \
> bed/chromInfo/chromInfo.tab
# Link to 2bit from /gbdb/galGal3/:
ssh hgwdev
mkdir /gbdb/galGal3
ln -s /cluster/data/galGal3/galGal3.2bit /gbdb/galGal3/
# Load /gbdb/galGal3/galGal3.2bit paths into database and save size info.
# Make a special chromInfo.sql with large index size so that
# chrE22C19W28_E50C23 and chrE22C19W28_E50C23_random don't get collapsed.
cd /cluster/data/galGal3/bed/chromInfo
perl -wpe 's/chrom\([0-9]+/chrom\(21/' \
$HOME/kent/src/hg/lib/chromInfo.sql > chromInfo.sql
hgLoadSqlTab galGal3 chromInfo chromInfo.sql chromInfo.tab
hgsql -N galGal3 -e "select chrom,size from chromInfo" \
> /cluster/data/galGal3/chrom.sizes
# take a look at chrom.sizes size
wc chrom.sizes
# 57 114 947 ../../chrom.sizes
#########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 5/15/06 angie)
# Use -repMatch=380 (based on size -- for human we use 1024, and
# chicken size is ~37% of human)
ssh kkr1u00
cd /cluster/data/galGal3
mkdir /cluster/bluearc/galGal3
blat galGal3.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/galGal3/11.ooc -repMatch=380
#Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc
blat galGal3.2bit /dev/null /dev/null -tileSize=10 \
-makeOoc=/cluster/bluearc/galGal3/10.ooc -repMatch=380
#Wrote 166633 overused 10-mers to /cluster/bluearc/galGal3/10.ooc
mkdir /iscratch/i/galGal3
cp -p /cluster/bluearc/galGal3/*.ooc /iscratch/i/galGal3/
iSync
#########################################################################
# PUT NIBS ON /SCRATCH (DONE 5/15/06 angie)
ssh kkstore03
mkdir /cluster/bluearc/scratch/hg/galGal3
rsync -av /cluster/data/galGal3/nib/* /cluster/bluearc/scratch/hg/galGal3/nib/
cp -p /cluster/data/galGal3/galGal3.2bit /cluster/bluearc/scratch/hg/galGal3/
# Ask cluster-admin to distribute to /scratch on big & small cluster
#########################################################################
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 5/15/06 angie)
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvsup
# Add trackDb directories and a description.html
mkdir chicken/galGal3
cvs add chicken/galGal3
cvs add chicken/galGal3/description.html
cvs ci -m "Initial description for galGal3." chicken/galGal3
# Edit that makefile to add galGal3 in all the right places and do
make update DBS=galGal3
mkdir /gbdb/galGal3/html
mkdir /cluster/data/galGal3/html
ln -s /cluster/data/galGal3/html/description.html /gbdb/galGal3/html/
cvs ci -m "Added galGal3." makefile
# Go public on genome-test. In a clean tree (no mods, up-to-date):
cvs up makefile
make alpha
# Note: hgcentral*.genome values must correspond
# with defaultDb.genome values
hgsql -h genome-testdb hgcentraltest \
-e 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values \
("galGal3", "May 2006", "/gbdb/galGal3", "Chicken", \
"chr5:57710001-57780000", 1, 30, "Chicken", \
"Gallus Gallus", "/gbdb/galGal3/html/description.html", \
0, 0, "Chicken Genome Sequencing Consortium May 2006 release");'
#########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE 5/16/06 angie)
ssh hgwdev
mkdir /cluster/data/galGal3/bed/genscan
cd /cluster/data/galGal3/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/galGal3/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
cp /dev/null genome.list
foreach f ( `ls -1S /cluster/data/galGal3/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
# 259 genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para make jobList
para time
#Completed: 255 of 259 jobs
#Crashed: 4 jobs
#CPU time in finished jobs: 82258s 1370.97m 22.85h 0.95d 0.003 y
#IO & Wait Time: 1053s 17.55m 0.29h 0.01d 0.000 y
#Average job time: 327s 5.45m 0.09h 0.00d
#Longest finished job: 20131s 335.52m 5.59h 0.23d
#Submission to last job: 23171s 386.18m 6.44h 0.27d
# If there are crashes, diagnose with "para problems" / "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
ssh kkr5u00
cd /cluster/data/galGal3/bed/genscan
/cluster/bin/x86_64/gsBig /cluster/data/galGal3/4/chr4_13/chr4_13.fa.masked gtf/chr4_13.fa.gtf -trans=pep/chr4_13.fa.pep -subopt=subopt/chr4_13.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/galGal3/3/chr3_14/chr3_14.fa.masked gtf/chr3_14.fa.gtf -trans=pep/chr3_14.fa.pep -subopt=subopt/chr3_14.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/galGal3/2/chr2_22/chr2_22.fa.masked gtf/chr2_22.fa.gtf -trans=pep/chr2_22.fa.pep -subopt=subopt/chr2_22.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/galGal3/Un/chrUn_random_3/chrUn_random_3.fa.masked gtf/chrUn_random_3.fa.gtf -trans=pep/chrUn_random_3.fa.pep -subopt=subopt/chrUn_random_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
# Convert these to chromosome level files as so:
ssh kkstore03
cd /cluster/data/galGal3/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/galGal3/bed/genscan
ldHgGene galGal3 genscan genscan.gtf
hgPepPred galGal3 generic genscanPep genscan.pep
hgLoadBed galGal3 genscanSubopt genscanSubopt.bed
#########################################################################
# MAKE GCPERCENT (DONE 5/15/06 angie)
ssh kolossus
mkdir /cluster/data/galGal3/bed/gc5Base
cd /cluster/data/galGal3/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 galGal3 \
/cluster/data/galGal3 \
| wigEncode stdin gc5Base.wig gc5Base.wib
ssh hgwdev
mkdir /gbdb/galGal3/wib
cd /cluster/data/galGal3/bed/gc5Base
ln -s `pwd`/gc5Base.wib /gbdb/galGal3/wib
hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 gc5Base gc5Base.wig
#########################################################################
# CPGISSLANDS (WUSTL) (DONE 5/15/06 angie)
ssh hgwdev
mkdir /cluster/data/galGal3/bed/cpgIsland
cd /cluster/data/galGal3/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/galGal3/bed/cpgIsland/
ssh kolossus
cd /cluster/data/galGal3/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
nice ./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/galGal3/bed/cpgIsland
hgLoadBed galGal3 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
#########################################################################
# CPGISLANDS (ANDY LAW) (DONE 5/15/06 angie)
# See notes in makeGalGal2.doc
ssh kolossus
mkdir /cluster/data/galGal3/bed/cpgIslandGgfAndy
cd /cluster/data/galGal3/bed/cpgIslandGgfAndy
# Build the preProcGgfAndy program in
# kent/src/oneShot/preProcGgfAndy into your ~/bin/
# Use unmasked sequence since this is not a mammal...
cp /dev/null cpgIslandGgfAndy.bed
foreach f (../../*/chr*.fa)
set chr = $f:t:r:r
echo preproc and run on unmasked $chr
~/bin/x86_64/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndy.bed
end
wc -l ../cpgIsland/cpgIsland.bed *bed
# 22806 ../cpgIsland/cpgIsland.bed
# 76488 cpgIslandGgfAndy.bed
# load into database:
ssh hgwdev
cd /cluster/data/galGal3/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
hgLoadBed galGal3 cpgIslandGgfAndy -tab \
-sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
featureBits galGal3 cpgIslandExt
#15533065 bases of 1042591351 (1.490%) in intersection
featureBits galGal3 cpgIslandGgfAndy
#62026174 bases of 1042591351 (5.949%) in intersection
#########################################################################
# MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 5/22/06 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific, but could exclude these from chicken as ancestral:
# L3, L3a, L3b, MIR, MIR3, MIRb, MIRm
ssh kkstore03
cd /cluster/data/galGal3
mkdir -p /san/sanvol1/galGal3/linSpecRep
foreach f (*/chr*.fa.out)
awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
> /san/sanvol1/galGal3/linSpecRep/$f:t:r:r.out.spec
end
# rebuild these as they appear to have become lost
mkdir /hive/data/staging/data/galGal3/linSpecRep
foreach f (*/chr*.fa.out)
awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
> /hive/data/staging/data/galGal3/linSpecRep/$f:t:r:r.out.spec
end
#########################################################################
# SWAP CHAINS/NET HG18 (DONE 5/23/06 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.hg18.swap
cd /cluster/data/galGal3/bed/blastz.hg18.swap
doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.galGal3/DEF \
>& do.log & tail -f do.log
ln -s blastz.hg18.swap /cluster/data/galGal3/bed/blastz.hg18
#########################################################################
# SWAP CHAINS/NET MM8 (DONE 5/24/06 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.mm8.swap
cd /cluster/data/galGal3/bed/blastz.mm8.swap
doBlastzChainNet.pl -swap /cluster/data/mm8/bed/blastz.galGal3/DEF \
>& do.log & tail -f do.log
ln -s blastz.mm8.swap /cluster/data/galGal3/bed/blastz.mm8
#########################################################################
# SWAP CHAINS/NET RN4 (DONE 5/25/06 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.rn4.swap
cd /cluster/data/galGal3/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.galGal3/DEF \
>& do.log & tail -f do.log
ln -s blastz.rn4.swap /cluster/data/galGal3/bed/blastz.rn4
#########################################################################
# SWAP CHAINS/NET MONDOM4 (DONE 5/26/06 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.monDom4.swap
cd /cluster/data/galGal3/bed/blastz.monDom4.swap
doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.galGal3/DEF \
>& do.log & tail -f do.log
ln -s blastz.monDom4.swap /cluster/data/galGal3/bed/blastz.monDom4
#########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 5/27/06 angie)
ssh pk
mkdir /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
cd /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
cat << '_EOF_' > DEF
# chicken vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/galGal3XenTro2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-05-26 /cluster/data/galGal3/bed/blastz.xenTro2
#########################################################################
# SWAP CHAINS/NET DANRER4 (DONE 5/31/06 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.danRer4.swap
cd /cluster/data/galGal3/bed/blastz.danRer4.swap
doBlastzChainNet.pl -swap /cluster/data/danRer4/bed/blastz.galGal3/DEF \
>& do.log & tail -f do.log
ln -s blastz.danRer4.swap /cluster/data/galGal3/bed/blastz.danRer4
#########################################################################
# GENBANK AUTO UPDATE (DONE 6/9/06 angie)
# align with revised genbank process. drop xeno ESTs.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add galGal3
# galGal3
galGal3.serverGenome = /cluster/data/galGal3/galGal3.2bit
galGal3.clusterGenome = /scratch/hg/galGal3/galGal3.2bit
galGal3.ooc = /cluster/bluearc/galGal3/11.ooc
galGal3.align.unplacedChroms = chr*_random
galGal3.lift = /cluster/data/galGal3/jkStuff/liftAll.lft
galGal3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
galGal3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
galGal3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
galGal3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
galGal3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
galGal3.refseq.mrna.native.load = yes
galGal3.refseq.mrna.xeno.load = yes
galGal3.genbank.mrna.xeno.load = yes
galGal3.downloadDir = galGal3
cvs ci etc/genbank.conf
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial galGal3 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad galGal3 &
# enable daily alignment and update of hgwdev
cd ~/kent/src/makeDb/genbank
cvsup
# add galGal3 to:
etc/align.dbs
etc/hgwdev.dbs
cvs commit
make etc-update
###########################################################################
# HUMAN (hg18) PROTEINS TRACK FOR hg18 (DONE, 2006-06-16, braney)
ssh kkstore03
bash # if not using bash shell already
# make Blast database for non-random chrom sequences
mkdir -p /cluster/data/galGal3/blastDb
cd /cluster/data/galGal3/blastDb
ls ../*/*/*.lft | grep -v lift | sed 's/lft/fa/' > list
ln -s `cat list` .
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.log *.fa list
cd /cluster/data/galGal3
for i in */*/*.lft
do cat $i ;
done > jkStuff/subChr.lft
mkdir -p /san/sanvol1/scratch/galGal3/blastDb;
cd /cluster/data/galGal3/blastDb
for i in nhr nin nsq;
do
cp *.$i /san/sanvol1/scratch/galGal3/blastDb;
done
mkdir /cluster/data/galGal3/bed/tblastnHg18KG
cd /cluster/data/galGal3/bed/tblastnHg18KG
echo /san/sanvol1/scratch/galGal3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 259 query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/259) = 38.049172
mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa
split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/galGal3/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/galGal3/jkStuff/liftAll.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# then run the Blast cluster jobs
ssh kk
cd /cluster/data/galGal3/bed/tblastn.hg18KG
para create blastSpec
para try, check, push, check etc.
# pushed 100,000 jobs at a time so need to do para push again later
para time
# Completed: 158767 of 158767 jobs
# CPU time in finished jobs: 9352133s 155868.88m 2597.81h 108.24d 0.297 y
# IO & Wait Time: 526347s 8772.45m 146.21h 6.09d 0.017 y
# Average job time: 62s 1.04m 0.02h 0.00d
# Longest finished job: 221s 3.68m 0.06h 0.00d
# Submission to last job: 104854s 1747.57m 29.13h 1.21d
ssh kkstore03
cd /cluster/data/galGal3/bed/tblastn.hg18KG
tcsh
mkdir chainRun
cd chainRun
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS \
/cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh pk
cd /cluster/data/galGal3/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 613 of 613 jobs
# CPU time in finished jobs: 26409s 440.15m 7.34h 0.31d 0.001 y
# IO & Wait Time: 7787s 129.79m 2.16h 0.09d 0.000 y
# Average job time: 56s 0.93m 0.02h 0.00d
# Longest finished job: 553s 9.22m 0.15h 0.01d
# Submission to last job: 1834s 30.57m 0.51h 0.02d
ssh kkstore03
cd /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut
bash # if using another shell
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl
pslCheck /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl
# this is ok.
# load table
ssh hgwdev
cd /cluster/data/galGal3/bed/tblastn.hg18KG
hgLoadPsl galGal3 blastHg18KG.psl
# check coverage
featureBits galGal3 blastHg18KG
# 19638830 bases of 1042591351 (1.884%) in intersection
featureBits galGal2 blastHg16KG
# 17057637 bases of 1054197620 (1.618%) in intersection
featureBits galGal3 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.504%, blastHg18KG 1.884%, both 0.417%, cover 82.71%, enrich 43.91x
featureBits galGal2 refGene:cds blastHg16KG -enrichment
# refGene:cds 0.497%, blastHg16KG 1.618%, both 0.335%, cover 67.42%, enrich 41.66x
ssh kkstore04
rm -rf /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
#end human proteins
#########################################################################
# BACEND PAIRS TRACK
# DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 6/20/06 angie)
ssh kkstore03
cd /cluster/data/galGal3
# Plenty of room at the moment:
df -h .
# 2.0T 1.6T 280G 86% /cluster/store6
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/gallus_gallus/\*
cd ..
# Extract unversioned accessions from headers and combine into one
# uncompressed file which we will link to from /gbdb/:
zcat ncbi/*ends*.mfa.gz \
| perl -wpe 'if (/^>/) { \
s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \
> cloneEnds.fa
# Make sure the sequences are intact after the header-munging:
faSize ncbi/*.mfa.gz
#146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 8 files
#Total size: mean 1084.6 sd 183.8 min 64 (gi|30716935|gb|CC322877.1|) max 2431 (gi|30609044|gb|CC263290.1|) median 1095
faSize cloneEnds.fa
#146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 1 files
#Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095
# Extract pairings from info files:
zcat ncbi/*info*.txt.gz \
| /cluster/bin/scripts/convertCloneEndInfo stdin
#55099 pairs and 22655 singles
# split for cluster run
mkdir /cluster/bluearc/galGal3/cloneEnds
faSplit sequence cloneEnds.fa 100 /cluster/bluearc/galGal3/cloneEnds/cloneEnds
# Check to ensure no breakage:
faSize /cluster/bluearc/galGal3/cloneEnds/*.fa
#146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 99 files
#Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/galGal3/cloneend
ln -s /cluster/data/galGal3/bed/cloneend/cloneEnds.fa /gbdb/galGal3/cloneend/
cd /tmp
hgLoadSeq galGal3 /gbdb/galGal3/cloneend/cloneEnds.fa
#135327 sequences
# BACEND SEQUENCE ALIGNMENTS (DONE 7/17/06 angie)
ssh kkstore03
# Make unmasked fasta.
cd /cluster/data/galGal3
mkdir /cluster/bluearc/galGal3/noMask
foreach f (?{,?}/chr*.fa)
echo $f:t:r
perl -wpe 'tr/a-z/A-Z/ if (! /^>/);' $f \
> /cluster/bluearc/galGal3/noMask/$f:t
end
# kluster run [san is down today so use kk this time]
ssh kk
mkdir /cluster/data/galGal3/bed/bacends
cd /cluster/data/galGal3/bed/bacends
mkdir out
# allow blat to run politely in /tmp while it writes output, then
# copy results to results file:
cat << '_EOF_' > runBlat.csh
#!/bin/csh -ef
set root1 = $1
set root2 = $2
set result = $3
rm -fr /scratch/tmp/${root1}_${root2}
mkdir /scratch/tmp/${root1}_${root2}
pushd /scratch/tmp/${root1}_${root2}
/cluster/bin/i386/blat /cluster/bluearc/galGal3/noMask/${root1}.fa \
/cluster/bluearc/galGal3/cloneEnds/${root2}.fa ${root1}.${root2}.psl \
-ooc=/cluster/bluearc/galGal3/10.ooc -tileSize=10
popd
mkdir -p out/${root2}
rm -f ${result}
mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /scratch/tmp/${root1}_${root2}
'_EOF_'
# << emacs
chmod +x runBlat.csh
cat << '_EOF_' > template
#LOOP
./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs
ls -1S /cluster/bluearc/galGal3/cloneEnds/cloneEnds???.fa > bacEnds.lst
ls -1S /cluster/bluearc/galGal3/noMask/chr*.fa > contig.lst
gensub2 contig.lst bacEnds.lst template jobList
para make jobList
#Completed: 5247 of 5247 jobs
#CPU time in finished jobs: 69864s 1164.39m 19.41h 0.81d 0.002 y
#IO & Wait Time: 614130s 10235.51m 170.59h 7.11d 0.019 y
#Average job time: 130s 2.17m 0.04h 0.00d
#Longest finished job: 3306s 55.10m 0.92h 0.04d
#Submission to last job: 3674s 61.23m 1.02h 0.04d
ssh kkstore03
cd /cluster/data/galGal3/bed/bacends
mkdir temp
time pslSort dirs raw.psl temp out/*
#9.349u 1.074s 0:10.54 98.7% 0+0k 0+0io 2pf+0w
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \
bacEnds.psl /dev/null
#4.838u 0.158s 0:05.09 97.8% 0+0k 0+0io 2pf+0w
# BACEND PAIRS TRACK (DONE 7/17/06 angie)
ssh kolossus
cd /cluster/data/galGal3/bed/bacends
time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
-minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../cloneend/cloneEndPairs.txt all_bacends bacEnds
#1.076u 0.292s 0:01.68 80.9% 0+0k 0+0io 3pf+0w
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
wc -l bacEnds.*
# 136054 bacEnds.psl
# 106769 bacEnds.load.psl
# 29879 bacEnds.pairs
# 5 bacEnds.long
# 99 bacEnds.lst
# 56 bacEnds.mismatch
# 20125 bacEnds.orphan
# 106 bacEnds.short
# 78 bacEnds.slop
# load into database
ssh hgwdev
cd /cluster/data/galGal3/bed/bacends
hgLoadBed -strict -notItemRgb galGal3 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#Loaded 29879 elements of size 11
# note - the "Bad" track isn't pushed to RR, just used for assembly QA.
hgLoadBed -strict -notItemRgb galGal3 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#Loaded 20360 elements of size 11
time hgLoadPsl galGal3 -table=all_bacends bacEnds.load.psl
#1.123u 0.104s 0:04.23 28.8% 0+0k 0+0io 3pf+0w
# Compares favorably to previous release (good cov up, bad cov down):
featureBits galGal3 all_bacends
#58629720 bases of 1042591351 (5.623%) in intersection
featureBits galGal2 all_bacends
#52184234 bases of 1054197620 (4.950%) in intersection
featureBits galGal3 bacEndPairs
#43710407 bases of 1042591351 (4.192%) in intersection
featureBits galGal2 bacEndPairs
#39206208 bases of 1054197620 (3.719%) in intersection
featureBits galGal3 bacEndPairsBad
#15784430 bases of 1042591351 (1.514%) in intersection
featureBits galGal2 bacEndPairsBad
#27538232 bases of 1054197620 (2.612%) in intersection
# Clean up.
rm psl.tab psl.tab.loaded raw.psl bed.tab
rm -r out
rmdir temp
rm -r /cluster/bluearc/galGal3/noMask/ /cluster/bluearc/galGal3/cloneEnds/
# end BACEND PAIRS TRACK
#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 7/13/06 angie - REDONE 8/4)
# REDONE 8/4 after makeDownloads.pl was fixed to not try to include complete
# paths in the tar files.
ssh hgwdev
cd /cluster/data/galGal3
~/kent/src/utils/makeDownloads.pl galGal3 -verbose=2 \
>& jkStuff/downloads.log & tail -f jkStuff/downloads.log
# Edit README.txt files when done:
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/galGal3/goldenPath/database/README.txt
# /cluster/data/galGal3/goldenPath/bigZips/README.txt
# /cluster/data/galGal3/goldenPath/chromosomes/README.txt
# (The htdocs/goldenPath/galGal3/*/README.txt "files" are just links to those.)
#########################################################################
# MAKE EMPTY CHRM_GOLD (DONE 7/14/06 angie)
# Kayla found that hgTracks was complaining about missing chrM_gold.
# We can use hgFakeAgp to make a fake chrM.agp, and load that.
# However, then the chrM Assembly track shows faked contents that do not
# match the track/assembly description. So remove the faked contents,
# leaving an empty table which is fine -- chrM is not part of the assembly.
ssh hgwdev
cd /cluster/data/galGal3
hgFakeAgp M/chrM.fa M/chrM.agp
hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 .
hgsql galGal3 -e 'delete from chrM_gold'
#########################################################################
# MULTIZ7WAY (DONE 7/18/06 angie)
# (((galGal3 ((hg18 (mm8 rn4)) monDom4)) xenTro2) danRer4)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
# copy MAF's to cluster-friendly server
mkdir /san/sanvol1/scratch/galGal3/mafNet
foreach s (rn4 mm8 hg18 monDom4 xenTro2 danRer4)
echo $s
rsync -av /cluster/data/galGal3/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/galGal3/mafNet/$s/
end
# Prune the hg17 17way tree to just these 9 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=rat_rn3,mouse_mm7,human_hg17,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \
--rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4 ; chicken_galGal2 -> chicken_galGal3 ; zebrafish_danRer3 -> zebrafish_danRer4 ; xenopus_xenTro1 -> xenopus_xenTro2" \
/cluster/data/hg17/bed/multiz17way/17way.nh > 7way.nh
# *carefully* edit 7way.nh to put galGal3 first.
/cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
grep galGal3 7way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
#0.9847 human_hg18
#1.0149 monodelphis_monDom4
#1.3425 mouse_mm8
#1.3472 rat_rn4
#1.3604 xenopus_xenTro2
#1.6727 zebrafish_danRer4
# the order in the browser display will be by tree topology,
# not by distance, but in this case (unlike human) they happen to match.
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 7way.nh \
> tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
ssh pk
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = galGal3
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/galGal3/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para make jobList
para time
#Completed: 57 of 57 jobs
#CPU time in finished jobs: 6545s 109.08m 1.82h 0.08d 0.000 y
#IO & Wait Time: 206s 3.43m 0.06h 0.00d 0.000 y
#Average job time: 118s 1.97m 0.03h 0.00d
#Longest finished job: 890s 14.83m 0.25h 0.01d
#Submission to last job: 891s 14.85m 0.25h 0.01d
# Make .gif for tree by pasting the contents of 7way.nh into Galt's
# cgi-bin/phyloGif tool. Save the .gif in your checkout of the browser/
# CVS tree as browser/images/phylo/galGal3_7way.gif .
ssh hgwdev
cd ~/browser/images/phylo
cvs add galGal3_7way.gif
# note: markd says it is best to use "cvs add -kb" for binary files - b0b
cvs ci -m "phyloGif-generated galGal3 7way tree image." galGal3_7way.gif
# Do a "make alpha" to install the file in htdocs/images/phylo/ :
cd ~/browser
cvs up -d -P
make alpha
# Include the file in the push request, and include
# "treeImage phylo/galGal3_7way.gif" in the multiz7way trackDb.ra entry.
#########################################################################
# ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE 7/18/2006 angie - REDONE 7/28)
# REDONE 7/28/06 after Kayla found overlaps and Brian fixed mafAddIRows.
ssh kolossus
mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/galGal3/bed/multiz7way.2006-07-18/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/galGal3/galGal3.2bit ../maf/$f:t >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
# 23min
# Run mafFilter -overlap to make sure everything is cool before loading:
mafFilter -overlap -reject=/tmp/chr1.rej.maf ../maf/chr1.maf > /dev/null
#rejected 0 blocks
# Load anno/maf
ssh hgwdev
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
mkdir -p /gbdb/galGal3/multiz7way/anno/maf
ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf/*.maf \
/gbdb/galGal3/multiz7way/anno/maf
nice hgLoadMaf -pathPrefix=/gbdb/galGal3/multiz7way/anno/maf \
galGal3 multiz7way
#Loaded 1488826 mafs in 57 files from /gbdb/galGal3/multiz7way/anno/maf
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
cat *.maf \
| nice hgLoadMafSummary galGal3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
-test multiz7waySummary stdin
#Created 518561 summary blocks from 3179814 components and 1488826 mafs from stdin
ssh hgwdev
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz7waySummary.sql
time nice hgLoadSqlTab galGal3 multiz7waySummary /tmp/multiz7waySummary.sql \
multiz7waySummary.tab
#0.000u 0.002s 0:08.00 0.0% 0+0k 0+0io 3pf+0w
rm *.tab
ln -s multiz7way.2006-07-18 /cluster/data/galGal3/bed/multiz7way
#########################################################################
# MULTIZ7WAY DOWNLOADABLES (DONE 7/18/2006 angie - REDONE 7/28)
# Annotated MAF is now documented, so use anno/maf for downloads.
ssh hgwdev
mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits galGal3 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags galGal3 multiz7way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
# << emacs
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#10.144u 15.154s 0:52.22 48.4% 0+0k 0+0io 5pf+0w
ssh kkstore03
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
# << emacs
time csh -efx downloads.csh >&! downloads.log
#355.340u 3.718s 5:59.55 99.8% 0+0k 0+0io 1pf+0w
nice gzip up*.maf
nice md5sum up*.maf.gz >> md5sum.txt
# Make a README.txt
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/galGal3/multiz7way
mkdir $dir
ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads/{*.gz,*.txt} $dir
#########################################################################
# MULTIZ7WAY MAF FRAMES (DONE 7/18/2006 angie - REDONE 7/31)
ssh hgwdev
mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
# The following is adapted from MarkD's Makefile used for mm7...
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: danRer4
mkdir genes
foreach queryDb (danRer4)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for rn4 mm8 hg18
# using refGene for galGal3
# using mgcGenes for xenTro2
# no genes for monDom4
# genePreds; (must keep only the first 10 columns for knownGene)
foreach queryDb (galGal3 rn4 mm8 hg18 xenTro2)
unset geneTbl range
if ($queryDb == "xenTro2") then
set geneTbl = mgcGenes
set range = 1-10
else if ($queryDb == "galGal3") then
set geneTbl = refGene
set range = 2-11
else
set geneTbl = knownGene
set range = 1-10
endif
echo $queryDb
hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f $range \
| genePredSingleCover stdin stdout \
| gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
end
#------------------------------------------------------------------------
# create frames
ssh kkstore03
nice tcsh # easy way to get process niced
cd /cluster/data/galGal3/bed/multiz7way/frames
(cat ../anno/maf/*.maf \
| time genePredToMafFrames galGal3 stdin stdout \
danRer4 genes/danRer4.gp.gz \
rn4 genes/rn4.gp.gz \
hg18 genes/hg18.gp.gz \
mm8 genes/mm8.gp.gz \
xenTro2 genes/xenTro2.gp.gz \
galGal3 genes/galGal3.gp.gz \
| gzip > multiz7way.mafFrames.gz) >& mafFrames.log & tail -f mafFrames.log
#------------------------------------------------------------------------
# load the database
ssh hgwdev
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
hgLoadMafFrames galGal3 multiz7wayFrames multiz7way.mafFrames.gz
#########################################################################
# PHASTCONS (DONE 7/20/2006 angie)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
ssh kkstore03
cd /cluster/data/galGal3
foreach f (`cat chrom.lst`)
echo $f
cp -p $f/*.fa /san/sanvol1/scratch/galGal3/chrom/
end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes from previous runs -- this
# reduces run-time on the split, slows down the actual phastCons
# enough so jobs don't crash (jobs are very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
/cluster/bin/phast/tree_doctor \
--prune-all-but=rn3,mm7,hg17,monDom2,galGal2,xenTro1,danRer3 \
--rename="galGal2 -> galGal3 ; rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4 ; xenTro1 -> xenTro2 ; danRer3 -> danRer4" \
/san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \
> elliotsEncodePruned.mod
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf
set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /san/sanvol1/scratch/galGal3/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para make jobList
para time
#Completed: 54 of 57 jobs
#Crashed: 3 jobs
#CPU time in finished jobs: 570s 9.49m 0.16h 0.01d 0.000 y
#IO & Wait Time: 229s 3.82m 0.06h 0.00d 0.000 y
#Average job time: 15s 0.25m 0.00h 0.00d
#Longest finished job: 112s 1.87m 0.03h 0.00d
#Submission to last job: 223s 3.72m 0.06h 0.00d
para crashed
#chr12_random, chr17_random and chr32 crashed because their maf's contain
#nothing but comments -- no alignments. That's OK, no windows for them.
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
ssh kolossus
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \
-o phyloFit.tree
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s REV \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \
-o phyloFit.rev.tree
# Comment from makeHg17.doc:
# # he ok'ed the results -- not necessary for next human run
# TODO: maybe run these by Adam... the numbers in the RATE_MAT and
# TREE are different between the phyloFit and elliotsEncode models,
# not sure if the differences are significant. I will proceed with
# elliotsEncode as in makeHg17.doc.
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# << emacs
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 12 .05 .20
#ENDLOOP
'EOF'
# << emacs
# Create parasol batch and run it
ssh pk
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
pushd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
ls -1S ss/chr*/chr*.ss \
| sed 's/.ss$//' \
> /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons/in.list
popd
gensub2 in.list single template jobList
para make jobList
para time
#Completed: 151 of 151 jobs
#CPU time in finished jobs: 2832s 47.20m 0.79h 0.03d 0.000 y
#IO & Wait Time: 644s 10.74m 0.18h 0.01d 0.000 y
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest finished job: 34s 0.57m 0.01h 0.00d
#Submission to last job: 181s 3.02m 0.05h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" \
| sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
| sort -k7,7 -k9,9n \
| sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
| xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements7way.bed
cp -p phastConsElements7way.bed /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
# Measure coverage. If good, load elements into database and proceed with wiggle.
# Try for 5% overall cov, and 70% CDS cov.
ssh hgwdev
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
featureBits galGal3 -enrichment refGene:cds phastConsElements7way.bed
featureBits galGal3 -enrichment xenoRefGene:cds phastConsElements7way.bed
# FIRST ITERATION: doPhast (len cov rho) = (12 .1 .27)
#refGene:cds 0.504%, phastConsElements7way.bed 7.891%, both 0.414%, cover 82.18%, enrich 10.41x
#xenoRefGene:cds 1.785%, phastConsElements7way_12_100_27.bed 7.891%, both 1.610%, cover 90.22%, enrich 11.43x
mv phastConsElements7way.bed phastConsElements7way_12_100_27.bed
# SECOND ITERATION: doPhast (len cov rho) = (12 .1 .3)
#refGene:cds 0.504%, phastConsElements7way.bed 8.336%, both 0.421%, cover 83.61%, enrich 10.03x
#xenoRefGene:cds 1.785%, phastConsElements7way.bed 8.336%, both 1.635%, cover 91.59%, enrich 10.99x
mv phastConsElements7way.bed phastConsElements7way_12_100_30.bed
# THIRD ITERATION: doPhast (len cov rho) = (12 .05 .27)
#refGene:cds 0.504%, phastConsElements7way.bed 7.650%, both 0.415%, cover 82.26%, enrich 10.75x
#xenoRefGene:cds 1.785%, phastConsElements7way.bed 7.650%, both 1.615%, cover 90.49%, enrich 11.83x
mv phastConsElements7way.bed phastConsElements7way_12_050_27.bed
# FOURTH ITERATION: doPhast (len cov rho) = (12 .05 .20)
#refGene:cds 0.504%, phastConsElements7way.bed 6.480%, both 0.392%, cover 77.82%, enrich 12.01x
#xenoRefGene:cds 1.785%, phastConsElements7way.bed 6.480%, both 1.535%, cover 86.02%, enrich 13.28x
mv phastConsElements7way.bed phastConsElements7way_12_050_20.bed
# The coverage doesn't want to drop too much and maybe it shouldn't --
# I don't like the idea of throwing extreme parameters in order to get
# the desired coverage figure from a model that doesn't want to go there.
# Load this up and see how it looks...
# When happy:
hgLoadBed -strict galGal3 phastConsElements7way \
phastConsElements7way_12_050_20.bed
# Create merged posterier probability file and wiggle track data files
ssh kolossus
cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
time find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons7way.wig phastCons7way.wib
cp -p phastCons7way.wi? \
/cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
ln -s `pwd`/phastCons7way.wib /gbdb/galGal3/multiz7way/phastCons7way.wib
hgLoadWiggle -pathPrefix=/gbdb/galGal3/multiz7way galGal3 \
phastCons7way phastCons7way.wig
#########################################################################
# PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE 7/20/2006 angie)
ssh kolossus
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/pp
foreach chr (`awk '{print $1}' /cluster/data/galGal3/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/$chr.gz
end
date
'EOF'
# << emacs
csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
# ~3min
md5sum *.gz > md5sum.txt
# Make a README.txt
ssh hgwdev
cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads
set dir = /usr/local/apache/htdocs/goldenPath/galGal3/phastCons7way
mkdir $dir
ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/{*.gz,*.txt} $dir
# Clean up after phastCons run.
ssh kkstore03
rm /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/*.tab
rm -r /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
#########################################################################
# SELF CHAINS (DONE 9/11/06 angie)
# requested by user
ssh kk
mkdir /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
cd /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
cat << '_EOF_' > DEF
# chicken vs chicken
BLASTZ=/cluster/bin/penn/i386/blastz
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: chicken galGal3
SEQ1_DIR=/scratch/hg/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: chicken galGal3
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
'_EOF_'
# << emacs
doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \
-blastzOutRoot=/cluster/bluearc/galGal3Self \
-bigClusterHub=kk -workhorse=kkr7u00 DEF \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-09-08 /cluster/data/galGal3/bed/blastz.galGal3
#########################################################################
# SNP & GENES FROM BEIJING GENOME INST. (DONE 12/4/06 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/bgi
cd /cluster/data/galGal3/bed/bgi
wget --timestamping \
'ftp://61.50.158.101/BGI/snps_by_strain/2006-11-28/*'
# (username & password in email from Yong Zhang 11/29)
# Each .tar.gz unpacks into identical sets of files so they will
# overwrite each other if unpacked into the same directory. So create
# a subdir for each strain, and unpack its files in the subdir:
foreach f (*.tar.gz)
set strain = `echo $f:r:r:r | perl -wpe 's/^(\w)/\u$1/'`
mkdir -p $strain
pushd $strain
tar -xvzf ../$f
popd
end
# Make coverage bed4:
cp /dev/null bgiCov.bed
foreach strain (Broiler Layer Silkie)
foreach f ($strain/ContigCov/ContigCov-chr*.txt)
set chr = `echo $f:t:r | sed -e 's/ContigCov-//'`
grep -v ^Covered $f \
| perl -wpe 'if (/^(\d+)\s+(\d+)\s*$/) { \
$s = $1-1; $_ = "'$chr'\t$s\t$2\t'$strain'.'$chr'.$1.$2\n"; \
} else {die};' \
>> bgiCov.bed
end
end
# Make bgiSnp bed+:
~/kent/src/hg/snp/bgi/bgiSnp.pl */SNPtables/*.txt
# The Layer files had the wrong strain ID code (last digit) -- the
# script substituted in the correct strain ID (Yong Zhang's suggestion).
# Load tables
ssh hgwdev
cd /cluster/data/galGal3/bed/bgi
hgLoadBed galGal3 bgiCov bgiCov.bed
hgLoadBed galGal3 bgiSnp bgiSnp.bed \
-sqlTable=$HOME/kent/src/hg/lib/bgiSnp.sql
# Got this warning but I think it's because the final column is always
# empty so bed.tab doesn't have the final column... table looks OK.
#load of bgiSnp did not go as planned: 3306633 record(s), 0 row(s) skipped, 3306633 warning(s) loading bed.tab
#########################################################################
# STS?
#########################################################################
#########################################################################
# SWAP STICKLEBACK GASACU1
ssh pk
mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap
cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# failed on the net step:
# HgStepManager: executing step 'net'.
# netChains: looks like previous stage was not successful
# (can't find [galGal3.gasAcu1.]all.chain[.gz]).
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
# real 7m11.286s
featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt
# 32781658 bases of 1042591351 (3.144%) in intersection
#######################################################################
## BLASTZ Chicken chroms vs Stickleback chroms and random contigs
## The above swap of gasAcu1 does *not* include the stickleback chrUn
## To get alignments on the stickleback chrUn, do this alignment of
## stickleback with chrUn to chicken, and swap them back to stickleback.
## Don't do any Db loading, pick out the stickleback randoms after the swap
## (WORKIN - 2007-01-11 - Hiram)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
cd /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
cat << '_EOF_' > DEF
# Chicken chroms vs. Stickleback chroms and chrUn in contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3 chroms only, no random sequence
# The largest is 200 million bases, there are 34 of them
SEQ1_DIR=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
# QUERY: Stickleback gasAcu1 chroms and chrUn in contigs
# The largest chrom is 32M bases, the largest contig 418,670 bases
# The smallest chrom chrM is 15,742 bases, smallest contig 60 bases
# there are 5,115 chroms and contig pieces
# total size 1,089,690,673 bases
# 47107538 N's 1042583135 real 834274605 upper 208308530 lower
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=20
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/galGal3ChrGasAcu1Un \
-stop=net -bigClusterHub=pk > do.log 2>&1 &
# real 116m59.888s
# And swapping:
mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.swap
cd /cluster/data/gasAcu1/bed/blastz.galGal3.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11/DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &
#######################################################################
## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram)
ssh kkstore05
mkdir /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
cd /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
cat << '_EOF_' > DEF
# chicken vs. lizard
# Use same params as used for galGal3-xenTro2
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > do.log 2>&1
time doBlastzChainNet.pl -verbose=2 DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net \
-blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > net.log 2>&1 &
time nice -n +19 featureBits galGal3 chainAnoCar1Link \
> fb.galGal3.chainAnoCar1Link.txt 2>&1
# real 0m43.752s
# 106743952 bases of 1042591351 (10.238%) in intersection
## swap documented in anoCar1.txt
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
#######################################################################
## BLASTZ FUGU FR2 - (DONE - 2007-03-09 - 2007-03-12 - Hiram)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.fr2.2007-03-09
cd /cluster/data/galGal3/bed/blastz.fr2.2007-03-09
cat << '_EOF_' > DEF
# chicken vs. fugu
# Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Fugu fr2
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.fr2.2007-03-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-qRepeats=windowmaskerSdust \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot=/cluster/bluearc/galGal3Fr2 > do.log 2>&1
cat fb.galGal3.chainFr2Link.txt
# 30935323 bases of 1042591351 (2.967%) in intersection
mkdir /cluster/data/fr2/bed/blastz.galGal3.swap
cd /cluster/data/fr2/bed/blastz.galGal3.swap
time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net -swap > swap_net.log 2>&1 &
# real 3m1.239s
cat fb.fr2.chainGalGal3Link.txt
# 36175581 bases of 393312790 (9.198%) in intersection
#######################################################################
## BLASTZ FUGU FR1 - (DONE - 2007-03-09 - 2007-03-12 - Hiram)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.fr1.2007-03-09
cd /cluster/data/galGal3/bed/blastz.fr1.2007-03-09
cat << '_EOF_' > DEF
# chicken vs. fugu
# Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Fugu fr1
SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.fr1.2007-03-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot=/cluster/bluearc/galGal3Fr1 > do.log 2>&1
cat fb.galGal3.chainFr1Link.txt
# 29604522 bases of 1042591351 (2.840%) in intersection
mkdir /cluster/data/fr1/bed/blastz.galGal3.swap
cd /cluster/data/fr1/bed/blastz.galGal3.swap
time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
/cluster/data/galGal3/bed/blastz.fr1.2007-03-09/DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
# real 3m14.494s
cat fb.fr1.chainGalGal3Link.txt
# 33536400 bases of 315518167 (10.629%) in intersection
#######################################################################
# BLASTZ PLATYPUS ORNANA1 (DONE 4/9/07 angie)
ssh kkstore03
mkdir /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
cd /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
cat << '_EOF_' > DEF
# chicken vs. platypus
# Use same params as used for galGal3-xenTro2 but back off L to 6000.
# If that causes us to get swamped, we can go back up to 8000.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/iscratch/i/galGal3/galGal3.2bit
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
>& do.log & tail -f do.log
ln -s blastz.ornAna1.2007-04-06 /cluster/data/galGal3/bed/blastz.ornAna1
#########################################################################
# BLASTZ/CHAIN/NET HORSE AND CHICKEN (DONE 2/27/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
cat << '_EOF_' > DEF
# Horse vs. Chicken
BLASTZ_M=50
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3
SEQ2_DIR=/scratch/hg/galGal3/galGal3.2bit
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /cluster/bluearc/equCab1/blastz.galGal3 >& do.log &
tail -f do.log
ssh hgwdev
cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
ln -s blastz.galGal3.2007-02-22 /cluster/data/equCab1/bed/blastz.galGal3
nice featureBits equCab1 -chrom=chr1 chainGalGal3Link
# 9136777 bases of 177498097 (5.148%) in intersection
bash
time nice -n 19 featureBits equCab1 chainGalGal3Link \
> fb.equCab1.chainGalGal3Link.txt 2>&1
# 114216918 bases of 2421923695 (4.716%) in intersection
ssh kkstore05
mkdir /cluster/data/galGal3/bed/blastz.equCab1.swap
cd /cluster/data/galGal3/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
tail -f swap.log
# the above script stopped with the following lines in the .log file:
# netToAxt axtChain/net/chrM.net axtChain/chain/chrM.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
# axtSort stdin stdout
# gzip -c
# Processing chrM
# end
# netToAxt axtChain/net/chrUn_random.net axtChain/chain/chrUn_random.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
# axtSort stdin stdout
# gzip -c
# Processing chrUn_random
# Hiram helped to created the following temp .csh:
cat finiNet.csh
#!/bin/csh -efx
# This script was automatically generated by /cluster/bin/scripts/doBlastzChainNet.pl
# from /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF
# It is to be executed on kolossus in /cluster/data/galGal3/bed/blastz.equCab1.swap/axtChain .
# It generates nets (without repeat/gap stats -- those are added later on
# hgwdev) from chains, and generates axt, maf and .over.chain from the nets.
# This script will fail if any of its commands fail.
cd /cluster/data/galGal3/bed/blastz.equCab1.swap
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.galGal3.equCab1.net.axt.gz
end
# Make mafNet for multiz: one .maf per galGal3 seq.
mkdir mafNet
foreach f (axtNet/*.galGal3.equCab1.net.axt.gz)
axtToMaf -tPrefix=galGal3. -qPrefix=equCab1. $f \
/cluster/data/galGal3/chrom.sizes /san/sanvol1/scratch/equCab1/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
ssh kolossus
cd /cluster/data/galGal3/bed/blastz.equCab1.swap
fininet.csh
ssh hgwdev
cd /cluster/data/galGal3/bed/blastz.equCab1.swap
bash
time nice -n 19 featureBits galGal3 chainEquCab1Link \
> fb.galGal3.chainEquCab1Link.txt 2>&1
# 104418042 bases of 1042591351 (10.015%) in intersection
###########################################################################
# ENSEMBL GENES (DONE 11/2/07 angie)
ssh hgwdev
mkdir /cluster/data/galGal3/bed/ensembl
cd /cluster/data/galGal3/bed/ensembl
# Go to www.ensembl.org and click around their evolving interface
# to get the following types of data:
# 1. a tab-separated file that relates Ensembl gene, transcript and
# protein IDs. Save as ensGtp.txt.gz
# 2. peptide fasta with only transcript IDs in header -> ensPep.fa.gz
# They have started dumping GTF files so we can download that directly:
# wget
# 'ftp://ftp.ensembl.org/pub/current_gallus_gallus/data/gtf/*'
# -- but I forgot to do that and just grabbed the GTF from Ensembl
# BioMart along with the GTP and Pep.
# Add "chr" to chrom names in the gene data gtf file to make
# it compatible with our software. Also change MT --> M.
zcat ensembl.gff.gz | sed -e 's/^MT/M/; s/^/chr/;' > ensGene.gtf
ldHgGene -gtf -genePredExt galGal3 ensGene ensGene.gtf
nice featureBits galGal3 ensGene
#30850267 bases of 1042591351 (2.959%) in intersection
# strip header line:
zcat ensGtp.txt.gz | tail +2 \
| hgLoadSqlTab galGal3 ensGtp ~/kent/src/hg/lib/ensGtp.sql \
/dev/stdin -notOnServer
zcat ensPep.fa.gz | sed -e 's/^>.*ENSGALT/>ENSGALT/' \
| hgPepPred galGal3 generic ensPep stdin
############################################################################
# BLASTZ petMar1 Lamprey (WORKING - 2008-04-11 - Hiram)
ssh kkstore03
screen # use a screen to control this multi-day job
mkdir /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
cat << '_EOF_' > DEF
# Chicken vs Lamprey
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=memk -verbose=2 > do.log 2>&1 &
# Completed: 83468 of 83468 jobs
# CPU time in finished jobs: 2199234s 36653.91m 610.90h 25.45d 0.070 y
# IO & Wait Time: 426454s 7107.56m 118.46h 4.94d 0.014 y
# Average job time: 31s 0.52m 0.01h 0.00d
# Longest finished job: 414s 6.90m 0.12h 0.00d
# Submission to last job: 103466s 1724.43m 28.74h 1.20d
# continuing after some kluster interruptions
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=cat -bigClusterHub=memk -verbose=2 > cat.log 2>&1 &
# real 13m54.957s
cat fb.galGal3.chainPetMar1Link.txt
# 20134896 bases of 1042591351 (1.931%) in intersection
# and the swap
mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap
cd /cluster/data/petMar1/bed/blastz.galGal3.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 &
# real 33m41.587s
cat fb.petMar1.chainGalGal3Link.txt
# 22308118 bases of 831696438 (2.682%) in intersection
############################################################################
# BLASTZ petMar1 Lanclet (WORKING - 2008-04-15 - Hiram)
ssh kkstore03
screen # use a screen to control this multi-day job
mkdir /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
cat << '_EOF_' > DEF
# Chicken vs Lanclet
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
# Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit
SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes
SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit
SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes
SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 168m49.357s
cat fb.galGal3.chainBraFlo1Link.txt
# 19795687 bases of 1042591351 (1.899%) in intersection
# and the swap
mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap
cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 &
# real 4m45.054s
cat fb.braFlo1.chainGalGal3Link.txt
# 30287175 bases of 923355587 (3.280%) in intersection
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
cd /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
cat << '_EOF_' > DEF
# Chicken vs. Horse
BLASTZ_M=50
# TARGET: Chicken calGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log &
failed b/c para.results got currupted:
para recover jobList recoverJobList
para create recoverJobList
Checking input files
0 jobs written to batch
so all jobs had successfully completed.
Generate run.time file manually:
time doBlastzChainNet.pl `pwd`/DEF -continue=cat \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >>& do.log &
but the batch file has disappeared ... I just decided to remove the psl
files and re-run from scratch:
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log &
0.248u 0.190s 6:10:43.78 0.0% 0+0k 0+0io 26pf+0w
ln -s blastz.equCab2.2008-04-18 /cluster/data/galGal3/bed/blastz.equCab2
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
#####################################################
# build 2bit with contigs from random chroms and chrUn (DONE braney 2008-08-30)
cd /cluster/data/galGal3
fgrep -h 'random
Un' */*.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6}' \
| while read chr start stop contig; \
do \
echo ">$contig"; twoBitToFa galGal3.2bit:$chr:$start-$stop stdout \
| tail -n +2; \
done > unRandom.fa
fgrep -v 'random
Un' chrom.sizes | awk '{print $1}' | \
twoBitToFa galGal3.2bit -seqList=stdin noUn.fa
cat noUn.fa unRandom.fa | faToTwoBit stdin galGal3.blastz.2bit
twoBitInfo galGal3.blastz.2bit galGal3.blastz.sizes
#########################################################################
# BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-04)
ssh swarm
screen
mkdir /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06
cd /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06
cat << _EOF_ > DEF
# chicken vs. zebra finch
BLASTZ_T=2
BLASTZ_M=50
# TARGET: Chicken galGal3
# SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit
# SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_DIR=/hive/data/genomes/galGal3/galGal3.2bit
SEQ1_LEN=/hive/data/genomes/galGal3/chrom.sizes
# SEQ1_CTGDIR=/hive/data/genomes/galGal3/galGal3.blastz.2bit
# SEQ1_CTGLEN=/hive/data/genomes/galGal3/galGal3.blastz.sizes
# SEQ1_LIFT=/hive/data/genomes/galGal3/jkStuff/liftAll.lft
# one chrom at a time
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Zebra finch taeGut1
# SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
# SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit
SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/galGal3/bed/blastz.taeGut1.2008-09-06
_EOF_
# << emacs
doBlastzChainNet.pl -syntenicNet \
-bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=swarm DEF -workhorse=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1 &
cd /cluster/data/galGal3/bed
rm -f blastz.taeGut1
ln -s blastz.taeGut1.2008-09-06 /cluster/data/galGal3/bed/blastz.taeGut1
############
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
galGal3.upstreamGeneTbl = refGene
galGal3.upstreamMaf = multiz7way /hive/data/genomes/galGal3/bed/multiz7way/species.lst
############################################################################
# QUALITY TRACK (DONE - 2008-11-24 - Hiram)
cd /hive/data/genomes/galGal3
# create an agp file
for C in `cut -f1 chrom.sizes | grep -v random | sed -e "s/chr//" | sort`
do
cat $C/chr${C}.agp
done > galGal3.agp
for C in `cut -f1 chrom.sizes | grep -v random | sed -e "s/chr//" | sort`
do
F=$C/chr${C}_random.agp
if [ -s $F ]; then
cat $F
fi
done >> galGal3.agp
cd /hive/data/genomes/galGal3/downloads
qaToQac contigs.fa.qual.gz assembly.qac
qacAgpLift -mScore=99 -verbose=2 ../galGal3.agp assembly.qac scaffolds.qac
mkdir /hive/data/genomes/galGal3/bed/qual
cd /hive/data/genomes/galGal3/bed/qual
qacToWig -fixed ../../downloads/scaffolds.qac stdout \
| wigEncode stdin qual.wig qual.wib
ln -s `pwd`/qual.wib /gbdb/galGal3/wib
hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 quality qual.wig
+
+#############################################################################
+# N-SCAN gene predictions (nscanGene) - (2009-03-13 markd)
+
+ # obtained NSCAN predictions from michael brent's group
+ # at WUSTL
+ cd /cluster/data/galGal3/bed/nscan/
+ http://mblab.wustl.edu/predictions/chicken/galGal3/
+
+ wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.gtf
+ wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.fa
+ wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/README
+ bzip2 galGal3.*
+ chmod a-w *
+ # load track
+ ldHgGene -bin -gtf -genePredExt galGal3 nscanGene galGal3.gtf.bz2
+ hgPepPred galGal3 generic nscanPep galGal3.fa.bz2
+ rm *.tab
+
+ # chicken/galGal3/trackDb.ra, add:
+ track nscanGene override
+ informant Chicken N-SCAN uses Zerba Finch (taeGut1) as the informant, updated with PASA clusters of chicken cDNAs.
+
+ searchTable nscanGene
+ searchType genePred
+ termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)\.[0-9]+(\.[0-9a-z]+)?
+ searchPriority 50
+