src/hg/makeDb/doc/equCab2.txt 1.29
1.29 2010/02/05 00:06:41 markd
enabled xeno refgene
Index: src/hg/makeDb/doc/equCab2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/equCab2.txt,v
retrieving revision 1.28
retrieving revision 1.29
diff -b -B -U 1000000 -r1.28 -r1.29
--- src/hg/makeDb/doc/equCab2.txt 20 Sep 2009 17:16:42 -0000 1.28
+++ src/hg/makeDb/doc/equCab2.txt 5 Feb 2010 00:06:41 -0000 1.29
@@ -1,2369 +1,2374 @@
# for emacs: -*- mode: sh; -*-
# Equus caballus - ??
# "$Id$"
#########################################################################
# DOWNLOAD SEQUENCE (...
# Looked at kkstore05; equCab1 was 52Gigs, and there was 150+ Gigs available on /cluster/store12,
# so hiram decided we had room to put equCab2 on /cluster/store12 too
ssh kkstore05
mkdir /cluster/store12/equCab2
ln -s /cluster/store12/equCab2 /cluster/data/equCab2
mkdir /cluster/data/equCab2/broad
cd /cluster/data/equCab2/broad
wget --timestamping -r -nd ftp://ftp.broad.mit.edu/pub/assemblies/mammals/horse/Equus2
# sanity check of AGP file:
zcat assembly.bases.gz | grep '^>' | sed -e 's/>//' | sort > contig_names
cat mapped.agp.chromosome.agp | egrep -v 'fragment|clone' | cut -f 6 | sort -u > contig_names2
zcat assembly.quals.gz | grep '^>' | sed -e 's/>//' | sort > contig_names3
diff contig_names contig_names2
diff contig_names contig_names3
# check for non-ATCG
zcat assembly.bases.gz | egrep -v '^>contig' | egrep '[^ATCG]' | wc
no diffs, so the assembly, quality and chromosome files agree.
# Unknown chromosome:
cut -f 1 mapped.agp.chromosome.agp | sort -u | grep '^chrUn' | wc
9604 9604 96040
i.e. lots of unknown fragments
cd /cluster/data/equCab2/broad/
egrep '^chrUn' mapped.agp.chromosome.agp | perl -e '$sum = 0; while(<>) { if(/^\S+\s+(\S+)\s+(\S+)/) { $n = $2 - $1 + 1; $sum += $n;}} print "sum: $sum\n"'
sum: 107858955
about 1/3 of the previous horse.
Coallesce many unknown chromosomes in Broad data to one chrUnk:
cat << '_EOF_' > unk.pl
#!/usr/bin/env perl
# Process unknown regions of mapped.agp.chromosome.agp to create one unknown chromosome.
#
# run thus:
# ./unk.pl < mapped.agp.chromosome.agp > test
use warnings;
use strict;
my $seq = 1;
my $end = 0; # fixed-up end position (field 3) of current chromosome
my $prevEnd = 0; # field three of previous "chromosome" (not necessarily the previous line)
my $count = 0; # current number of $prevEnd lines seen (used to detect single line Unk chromosomes).
my $lastNum = ""; # "Unk" number of previous line
my $ungappedLen = 1000;
my $debug = 0;
my $test = 0;
sub printUnGapped
{
# print a non-bridged gap for single line unknown chromosomes.
my ($prev) = @_;
print "chrUn\t";
print $prev + 1, "\t";
print $prev + 1000, "\t";
print $seq++, "\t";
print "N\t$ungappedLen\tclone\tno\n";
$test++;
return $prev + $ungappedLen;
}
while(<>) {
my @a = split /\s+/;
if($a[0] =~ /^chrUn(\d+)/) {
my $num = $1;
my $newChr = $num ne $lastNum;
# XXXX
if($newChr && $count) {
$end = printUnGapped($end);
}
if($newChr) {
print "chrUn$num\n" if($debug);
$prevEnd = $end;
}
# fixup fields 2 & 3
$a[1] += $prevEnd;
$a[2] += $prevEnd;
# fixup sequence
$a[3] = $seq++;
print "chrUn\t";
for my $i (1..8) {
if(defined($a[$i])) {
print $a[$i],"\t";
}
}
print "\n";
$end = $a[2];
$lastNum = $num;
$count = $newChr ? 1 : $count + 1;
}
}
print STDERR "test: $test\n" if($debug);
'_EOF_'
unk.pl < mapped.agp.chromosome.agp > test
# Sanity checks on unk.pl run:
cut -f 6 test | egrep ^contig | wc
12980 12980 168740
cut -f 6 test | egrep ^contig | sort | uniq | wc
12980 12980 168740
# create our fixed-up up map file (ucsc.agp)
cp test ucsc.agp
egrep -v '^chrUn' mapped.agp.chromosome.agp >> ucsc.agp
Run makeGenomeDb.pl:
makeGenomeDb.pl failed b/c we forgot to lift the QA files, so we manually lifted them thus:
# Lift contig quality scores to chrom coordinates:
qaToQac assembly.quals.gz stdout | qacAgpLift ucsc.agp stdin equCab2.qual.qac
cd /cluster/data/equCab2/bed/qual
qacToWig -fixed /cluster/data/equCab2/broad/equCab2.qual.qac stdout | wigEncode stdin qual.{wig,wib}
# run on hgwdev:
hgLoadWiggle -pathPrefix=/gbdb/equCab2/wib equCab2 quality qual.wig &
then we restarted makeGenomeDb.pl:
makeGenomeDb.pl -continue dbDb equCab2.config.ra > & makeGenomeDb.log2 &
and it finished correctly.
# Sanity checks on makeGenomeDb data:
twoBitToFa equCab2.unmasked.2bit stdout | faSize stdin
2510493021 bases (56068733 N's 2454424288 real 2454424288 upper 0 lower) in 34 sequences in 1 files
Total size: mean 73838030.0 sd 36872490.7 min 16660 (chrM) max 185838109 (chr1) median 80757907
N count: mean 1649080.4 sd 4017533.3
U count: mean 72188949.6 sd 35782988.9
L count: mean 0.0 sd 0.0
%0.00 masked total, %0.00 masked real
zcat assembly.bases.gz | faSize stdin
2428773513 bases (0 N's 2428773513 real 2428773513 upper 0 lower) in 55316 sequences in 1 files
Total size: mean 43907.3 sd 67000.2 min 601 (contig_28626) max 916837 (contig_34896) median 16149
N count: mean 0.0 sd 0.0
U count: mean 43907.3 sd 67000.2
L count: mean 0.0 sd 0.0
%0.00 masked total, %0.00 masked real
2454424288 - 2428773513 == 25650775
i.e. our file has 25,650,775 more real nucleotides:
zcat assembly.bases.gz | egrep '^>contig' | tail
>contig_55315
# consistent with faSize data and length of contig_names
zcat mapped.agp.chromosome.fasta.gz | faSize stdin &
2500873361 bases (46465733 N's 2454407628 real 2454407628 upper 0 lower) in 10096 sequences in 1 files
Total size: mean 247709.3 sd 1055871.2 min 3000 (Un9604.1-3000) max 5000000 (1.1-5000000) median 4235
N count: mean 4602.4 sd 24700.2
U count: mean 243106.9 sd 1042172.0
L count: mean 0.0 sd 0.0
%0.00 masked total, %0.00 masked real
This file is consistent with our gaps:
(56068733 - 46465733) - (9603 * 1000) == 0
Our file has extra real nucleotides:
2454424288 - 2454407628 == 16660
Which is OK, b/c that's the size of ChrM.
Problem is that their assembly uses the same contigs in different chromosomes;
see broad/dupes.txt; e.g. contig_31550 is at the end of chr27 and chr6
and broad/dupes.agp
egrep -v "^track" dupes.agp | awk '{print $3-$2+1}' | ave stdin
total 51268230.000000
51268230 / 2 == 25634115
So there are 25,634,115 duplicated bps.
((2454424288 - 16660) - 25634115) - 2428773513 == 0
i.e. if you take dupes and chrM into account, our data agrees with their raw data.
#########################################################################
# REPEATMASKER (DONE - 2008-04-03 - larrym)
ssh kkstore05
screen # use a screen to manage this job
mkdir /cluster/data/equCab2/bed/repeatMasker
cd /cluster/data/equCab2/bed/repeatMasker
doRepeatMasker.pl -buildDir=/cluster/data/equCab2/bed/repeatMasker -bigClusterHub=pk equCab2 >& do.log &
# XXXX Hiram, I had to run this on hgwdev to get access to mysql; does that make sense?
time featureBits equCab2 rmsk >& fb.equCab2.rmsk.txt &
# 1004742864 bases of 2454424288 (40.936%) in intersection
# RepeatMasker and lib version from do.log:
# RepeatMasker version development-$Id$
# Jan 11 2008 (open-3-1-9) version of RepeatMasker
# CC RELEASE 20071204
# Compare coverage to previous assembly:
featureBits equCab1 rmsk
994130286 bases of 2421923695 (41.047%) in intersection
#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-04-03 - larrym)
ssh kkstore05
screen # use a screen to manage this job
mkdir /cluster/data/equCab2/bed/simpleRepeat
cd /cluster/data/equCab2/bed/simpleRepeat
doSimpleRepeat.pl -buildDir=/cluster/data/equCab2/bed/simpleRepeat -bigClusterHub=memk \
equCab2 >& do.log &
# add simple repeats to the RepeatMasker repeats:
cd /cluster/data/equCab2
twoBitMask equCab2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed equCab2.2bit
twoBitToFa equCab2.2bit stdout | faSize stdin
# 2510493021 bases (56068733 N's 2454424288 real 1449746465 upper 1004677823 lower) in 34 sequences in 1 files
# Total size: mean 73838030.0 sd 36872490.7 min 16660 (chrM) max 185838109 (chr1) median 80757907
# N count: mean 1649080.4 sd 4017533.3
# U count: mean 42639601.9 sd 20847778.6
# L count: mean 29549347.7 sd 15784709.4
# %40.02 masked total, %40.93 masked real
twoBitToFa equCab2.rmsk.2bit stdout | faSize stdin
# 2510493021 bases (56068733 N's 2454424288 real 1450129420 upper 1004294868 lower) in 34 sequences in 1 files
# Total size: mean 73838030.0 sd 36872490.7 min 16660 (chrM) max 185838109 (chr1) median 80757907
# N count: mean 1649080.4 sd 4017533.3
# U count: mean 42650865.3 sd 20852722.8
# L count: mean 29538084.4 sd 15779986.2
# %40.00 masked total, %40.92 masked real
featureBits -countGaps equCab2 rmsk
# 1004742864 bases of 2510493021 (40.022%) in intersection
# Slightly different from above b/c some repeats include N's in gaps.
featureBits -bed=rmsk_and_gaps.bed -countGaps equCab2 rmsk gap &
cat rmsk_and_gaps.bed | awk "{print $3-$2}" | ave stdin
# total 448008
1004742864 - 448008 - 1004294868 = -12
We don't understand the -12 discrepancy
# Run on hgwdev:
rm /gbdb/equCab2/equCab2.2bit
ln -s /cluster/data/equCab2/equCab2.2bit /gbdb/equCab2/equCab2.2bit
########################################################################
# add ctgPos2 track (DONE - 2008-04-04 - larrym)
cd /cluster/data/equCab2/broad
./unk.pl < mapped.agp.chromosome.agp > /dev/null
./mkScaffolds.pl assembly.agp mapped.agp.chromosome.agp >> Contig.bed
awk '{size = $3-$2; printf "%s\t%d\t%s\t%d\t%d\tW\n", $4, size, $1, $2, $3}' Contig.bed > ctgPos2.tab
ssh hgwdev
cd /cluster/data/equCab2/broad
hgLoadSqlTab equCab2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-04-04 - larrym)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("equCab2", "blat10", "17784", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("equCab2", "blat10", "17785", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
## DEFAULT POSITION (DONE - 2008-04-04 - larrym)
# Set default position to be same as equCab1, as found by a blat search
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="chr11:52,970,942-52,973,091" \
where name="equCab2";' hgcentraltest
############################################################################
## gold.html, ctgPos2.html etc. (working - 2008-04-09 - larrym)
grep contig_ mapped.agp.chromosome.agp | awk '{print $3-$2+1}' | ave stdin
average 43973.978823
count 55815
total 2454407628.000000
# But there is only 55316 uniq contigs:
grep contig_ mapped.agp.chromosome.agp | awk '{printf "%s\t%s\t%s\n", $6, $7, $8}' | sort | uniq | wc -l
55316
# working with only unique contigs, we get numbers which agree with the Broad data:
grep contig_ mapped.agp.chromosome.agp | awk '{printf "%s\t%s\t%s\n", $6, $7, $8}' | sort | uniq | awk '{print $3-$2+1}' | ave stdin
average 43907.251302
total 2428773513.000000
#########################################################################
# Create OOC file for genbank runs (WORKING - 2008-04-09 - larrym)
1024 * (2.5 / 3.1) == 825.805824
# Use -repMatch=825 (based on size -- for human we use 1024, and
# horse size is 84% of human judging by twoBitInfo -noNs)
ssh kkstore05
cd /cluster/data/equCab2
blat equCab2.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=equCab2.11.ooc -repMatch=825
#########################################################################
# make equCab2.chrUn.lift, chrUn.fa and equCab2.UnScaffolds.2bit
ssh hgwdev
cd /cluster/data/equCab2/jkStuff
grep ^chrUn ../broad/Contig.bed | awk '{printf "%d\t%s\t%d\tchrUn\t156378573\n", $2, $4, $3-$2}' > equCab2.chrUn.lift
# oops, I copied over hiram's length in the above command, so redo this
grep ^chrUn ../broad/Contig.bed | awk '{printf "%d\t%s\t%d\tchrUn\t117461955\n", $2, $4, $3-$2}' > equCab2.chrUn.lift
# create FASTA file with chrUn, using Broad scaffold names
cp -p /cluster/data/gasAcu1/jkStuff/lft2BitToFa.pl .
./lft2BitToFa.pl ../equCab2.2bit equCab2.chrUn.lift > chrUn.fa &
faSize chrUn.fa
107858955 bases (14539925 N's 93319030 real 36162792 upper 57156238 lower) in 9604 sequences in 1 files
# create FASTA file w/o unknown chroms
cd ..
twoBitToFa equCab2.2bit stdout | \
perl -e 'my $print=0; while(<>) { if(/^>chr(.+)/) { $print = $1 ne "Un";} print if($print);}' > jkStuff/chr.fa &
faSize jkStuff/chr.fa
2393031066 bases (31925808 N's 2361105258 real 1413583673 upper 947521585 lower) in 33 sequences in 1 files
# corrected chr27:
2367070107 bases (31598964 N's 2335471143 real 1397621407 upper 937849736 lower) in 33 sequences in 1 files
2393031066 + 107858955 + (9603 * 1000) = 2510493021
# corrected chr27:
2367070107 + 107858955 + (9603 * 1000) = 2484532062
# i.e. correct size
cd jkStuff
faToTwoBit chr.fa chrUn.fa ../equCab2.UnScaffolds.2bit
cd ..
twoBitInfo equCab2.UnScaffolds.2bit equCab2.UnScaffolds.sizes
cp -p equCab2.UnScaffolds.2bit /san/sanvol1/scratch/equCab2
cp -p equCab2.UnScaffolds.sizes /san/sanvol1/scratch/equCab2
# run via hg18.txt
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-04-10 - larrym)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add equCab2 just before equCab1
# equCab2 (Equus caballus)
equCab2.serverGenome = /cluster/data/equCab2/equCab2.2bit
equCab2.clusterGenome = /scratch/data/equCab2/equCab2.2bit
equCab2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
equCab2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
equCab2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
equCab2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
equCab2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
equCab2.ooc = /cluster/data/equCab2/equCab2.11.ooc
equCab2.lift = /cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
equCab2.align.unplacedChroms = chrUn
equCab2.genbank.mrna.xeno.load = yes
equCab2.downloadDir = equCab2
cvs ci -m "Added equCab2" etc/genbank.conf
# update /cluster/data/genbank:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time /bin/nice -n 19 bin/gbAlignStep -initial equCab2 &
# logFile: var/build/logs/2008.04.09-11:33:43.equCab2.initalign.log
# failed (problem with a machine that didn't have proper rsync, so we rsync'ed that
# machine and re-ran the batch):
para -retries=6 push
time /bin/nice -n 19 bin/gbAlignStep -initial -continue=finish equCab2 &
# logFile: var/build/logs/2008.04.10-14:21:09.equCab2.initalign.log
# 2599.431u 1067.489s 1:03:48.15 95.7% 0+0k 0+0io 10pf+0w
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time /bin/nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad equCab2 &
# logFile: var/dbload/hgwdev/logs/2008.04.10-15:27:13.dbload.log
# 567.199u 198.673s 16:48.99 75.9% 0+0k 0+0io 4pf+0w
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add equCab2 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added equCab2 - Equus caballus" etc/align.dbs etc/hgwdev.dbs
make etc-update
##
###
############################################################################
# equCab2 - Horse - Ensembl Genes (DONE - 2008-04-02 - hiram)
ssh kkstore04
cd /cluster/data/equCab2
cat << '_EOF_' > equCab2.ensGene.ra
# required db variable
db equCab2
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
# translate Ensembl chrUnNNNN names to chrUn coordinates
liftUp /cluster/data/equCab2/jkStuff/chrUn.lift
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=49 equCab2.ensGene.ra
ssh hgwdev
cd /cluster/data/canFam2/bed/ensGene.49
featureBits equCab2 ensGene
# 39746402 bases of 2454424288 (1.619%) in intersection
############################################################################
# SWAP equCab2 -> hg18
mkdir /cluster/data/equCab2/bed/blastz.hg18.swap
cd /cluster/data/bosTau4/bed/blastz.hg18.swap
time /bin/nice -n 19 doBlastzChainNet.pl \
/cluster/data/hg18/bed/blastz.equCab2.2008-04-10/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet >& do.log &
0.191u 0.101s 2:20:34.06 0.0% 0+0k 0+0io 6pf+0w
featureBits equCab2 -chrom=chr1 chainHg18Link
126450610 bases of 183561855 (68.887%) in intersection
############################################################################
# SWAP equCab2 -> mm9 (DONE - 2008-04-17 - larrym)
mkdir /cluster/data/equCab2/bed/blastz.mm9.swap
cd /cluster/data/equCab2/bed/blastz.mm9.swap
time /bin/nice -n 19 doBlastzChainNet.pl \
/cluster/data/mm9/bed/blastz.equCab2.2008-04-15/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet >& do.log &
0.164u 0.128s 1:32:36.50 0.0% 0+0k 0+0io 6pf+0w
featureBits equCab2 -chrom=chr1 chainMm9Link
72255123 bases of 183561855 (39.363%) in intersection
############################################################################
# SWAP equCab2 -> canFam2 (DONE - 2008-04-22 - larrym)
mkdir /cluster/data/equCab2/bed/blastz.canFam2.swap
cd /cluster/data/equCab2/bed/blastz.canFam2.swap
time /bin/nice -n 19 doBlastzChainNet.pl \
/cluster/data/canFam2/bed/blastz.equCab2.2008-04-18/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet >& do.log &
0.191u 0.108s 2:44:26.23 0.0% 0+0k 0+0io 1pf+0w
############################################################################
# SWAP equCab2 -> galGal3 (DONE - 2008-04-22 - larrym)
mkdir /cluster/data/equCab2/bed/blastz.galGal3.swap
cd /cluster/data/equCab2/bed/blastz.galGal3.swap
time /bin/nice -n 19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastz.equCab2.2008-04-18/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet >& do.log &
0.170u 0.107s 12:27.33 0.0% 0+0k 0+0io 0pf+0w
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-22)
# (auto) establish native host of /cluster/data/<assembly>
df /cluster/data/equCab2
ssh kkstore05
# bash if not using bash shell already
sizes=equCab2.UnScaffolds.sizes
twobit=equCab2.UnScaffolds.2bit
mkdir /cluster/data/equCab2/blastDb
cd /cluster/data/equCab2
awk '{if ($2 > 1000000) print $1}' $sizes > 1meg.lst
if test -s 1meg.lst; then
twoBitToFa -seqList=1meg.lst $twobit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
fi
awk '{if ($2 <= 1000000) print $1}' $sizes > less1meg.lst
if test -s less1meg.lst; then
twoBitToFa -seqList=less1meg.lst $twobit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa
fi
wc 1meg.lst less1meg.lst
# 39 39 252 1meg.lst
# 9598 9598 95975 less1meg.lst
# 9637 9637 96227 total
wc $sizes
# 9637 19274 145695 equCab2.UnScaffolds.sizes
rm 1meg.lst less1meg.lst
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 3086
mkdir -p /san/sanvol1/scratch/equCab2/blastDb
cd /cluster/data/equCab2/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/equCab2/blastDb
done
mkdir -p /cluster/data/equCab2/bed/tblastn.hg18KG
cd /cluster/data/equCab2/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/equCab2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3086 query.lst
# we want around 100,000 jobs per gig (0.0001 jobs per base)
numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/equCab2/chrom.sizes`
# we want around numJobs jobs
numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
numQueries=`wc query.lst | awk '{print $1}'`
numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `
echo $numJobs
# 251049
mkdir -p /cluster/bluearc/equCab2/bed/tblastn.hg18KG/kgfa
split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/equCab2/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/equCab2/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/equCab2/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
if test -s /cluster/data/equCab2/blastDb.lft
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/equCab2/blastDb.lft carry $f.2
else
mv $f.2 $f.3
fi
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh pk
cd /cluster/data/equCab2/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 253052 of 253052 jobs
# CPU time in finished jobs: 23683489s 394724.82m 6578.75h 274.11d 0.751 y
# IO & Wait Time: 2116386s 35273.09m 587.88h 24.50d 0.067 y
# Average job time: 102s 1.70m 0.03h 0.00d
# Longest finished job: 405s 6.75m 0.11h 0.00d
# Submission to last job: 77580s 1293.00m 21.55h 0.90d
ssh kkstore05
cd /cluster/data/equCab2/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh memk
cd /cluster/data/equCab2/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 82 of 82 jobs
# CPU time in finished jobs: 86617s 1443.62m 24.06h 1.00d 0.003 y
# IO & Wait Time: 10159s 169.31m 2.82h 0.12d 0.000 y
# Average job time: 1180s 19.67m 0.33h 0.01d
# Longest finished job: 3153s 52.55m 0.88h 0.04d
# Submission to last job: 8853s 147.55m 2.46h 0.10d
ssh kkstore05
cd /cluster/data/equCab2/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/equCab2/bed/tblastn.hg18KG/preblastHg18KG.psl
cd ..
pslCheck preblastHg18KG.psl
liftUp -nosort -type=".psl" -nohead blastHg18KG.psl /cluster/data/equCab2/jkStuff/chrUn.lift carry preblastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/equCab2/bed/tblastn.hg18KG
hgLoadPsl equCab2 blastHg18KG.psl
# check coverage
featureBits equCab2 blastHg18KG
# 41055754 bases of 2454424288 (1.673%) in intersection
featureBits equCab2 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.017%, blastHg18KG 1.673%, both 0.016%, cover 90.76%, enrich 54.26x
featureBits equCab2 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 1.294%, blastHg18KG 1.673%, both 1.023%, cover 79.08%, enrich 47.27x
ssh kkstore05
rm -rf /cluster/data/equCab2/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/equCab2/bed/tblastn.hg18KG/blastOut
#end tblastn
##################################################
###########################################################################
# Blastz Platypus ornAna1 (DONE - 2008-04-28 - larrym)
cd /cluster/data/equCab2/jkStuff
cp -p equCab2.chrUn.lift /san/sanvol1/scratch/equCab2
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/equCab2/bed/blastz.ornAna1.2008-04-24
cd /cluster/data/equCab2/bed/blastz.ornAna1.2008-04-24
cat << '_EOF_' > DEF
# TARGET: Horse
SEQ1_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ1_LEN=/scratch/data/equCab2/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ1_LIFT=/san/sanvol1/scratch/equCab2/equCab2.chrUn.lift
SEQ1_CHUNK=20000000
SEQ1_LIMIT=200
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/equCab2/bed/blastz.ornAna1.2008-04-28
TMPDIR=/scratch/tmp
_EOF_
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -stop=partition \
-chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /cluster/bluearc/ornAna1/blastz.equCab2 >& do.log &
0.095u 0.050s 0:12.21 1.1% 0+0k 0+0io 16pf+0w
Ran twice to determine SEQ2_LIMIT=400 (to get number of jobs down to 107525)
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -continue=blastz \
-chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /cluster/bluearc/ornAna1/blastz.equCab2 >>& do.log &
failed on part262.lst b/c of a corrupted line in batch file; I fixed the line and
reran successfully (via para push).
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -continue=cat \
-chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /cluster/bluearc/ornAna1/blastz.equCab2 >>& do.log &
0.209u 0.115s 42:36.73 0.0% 0+0k 0+0io 0pf+0w
#########################################################################
## 6-Way Multiz (DONE - 2008-05-01 - larrym)
ssh hgwdev
mkdir /cluster/data/equCab2/bed/multiz6way
cd /cluster/data/equCab2/bed/multiz6way
# select the 6 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Dog_canFam2,Platypus_ornAna1,Chicken_galGal3,Horse_equCab1 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
> 6-way.fullNames.nh
((((Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763,(Dog_canFam2:0.149350,Horse_equCab1:0.099323):0.038613):0.332197,Platypus_ornAna1:0.488110):0.118797,Chicken_galGal3:0.488824);
rearranged to get horse on top:
cat << '_EOF_' > equCab2.6-way.nh
((((Horse_equCab2:0.099323,Dog_canFam2:0.149350):0.038613,
(Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763):0.332197,
Platypus_ornAna1:0.488110):0.118797,Chicken_galGal3:0.488824);
'_EOF_'
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' equCab2.6-way.nh
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that has 6 db names in it
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' equCab2.6-way.nh \
| sed -e "s/ / /g"` > tree.6.nh
# results:
# ((((equCab2 canFam2) (mm9 hg18)) ornAna1) galGal3)
# => for making the gif: ((((Horse,Dog), (Mouse, Human)), Platypus), Chicken)
wget -O equCab2_6way.gif 'http://genome.ucsc.edu/cgi-bin/phyloGif?hgsid=106951235&phyloGif_width=219&phyloGif_height=260&boolshad.phyloGif_branchLengths=1&boolshad.phyloGif_lengthLegend=1&boolshad.phyloGif_branchLabels=1&phyloGif_branchDecimals=2&phyloGif_branchMultiplier=1&boolshad.phyloGif_underscores=1&boolshad.phyloGif_htmlPage=1&phyloGif_tree=%28%28%28%28Horse%2CDog%29%2C+%28Mouse%2C+Human%29%29%2C+Platypus%29%2C+Chicken%29&phyloGif_submit=submit'
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/equCab2/bed/multiz6way
foreach db (`grep -v equCab2 species.list`)
set bdir = /cluster/data/equCab2/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
./listMafs.csh
canFam2 mafSynNet
galGal3 mafSynNet
hg18 mafSynNet
mm9 mafSynNet
ornAna1 mafNet
/cluster/bin/phast/all_dists equCab2.6-way.nh > 6way.distances.txt
grep -i equcab 6way.distances.txt | sort -k3,3n
Horse_equCab2 Dog_canFam2 0.248673
Horse_equCab2 Human_hg18 0.284600
Horse_equCab2 Mouse_mm9 0.483517
Horse_equCab2 Platypus_ornAna1 0.958243
Horse_equCab2 Chicken_galGal3 1.077754
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainPonAbe2Link chain linearGap
# distance on EquCab2 on other minScore
# 1 0.248673 Dog_canFam2 (% 70.910) (% 70.300) 3000 medium
# 2 0.284600 Human_hg18 (% 66.819) (% 57.162) 3000 medium
# 3 0.483517 Mouse_mm9 (% 37.218) (% 34.821) 3000 medium
# 4 0.958243 Platypus_ornAna1 (% 5.477) (% 7.217) 5000 loose
# 5 1.077754 Chicken_galGal3 (% 3.031) (% 6.568) 3000 medium
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference asemblies
ssh hgwdev
mkdir /cluster/data/equCab2/bed/multiz6way/run.split
cd /cluster/data/equCab2/bed/multiz6way/run.split
# this works by examining the rmsk table for likely repeat areas
# that won't be used in blastz
mafSplitPos equCab2 50 mafSplit.bed
ssh kki
cd /cluster/data/equCab2/bed/multiz6way/run.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set targDb = "equCab2"
set db = $1
set sdir = /san/sanvol1/scratch/$targDb/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/$targDb/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
endif
echo $mdir > $db.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
grep -v equCab2 ../species.list > split.list
cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
gensub2 split.list single template jobList
para create jobList
para -maxPush=3 push
para push
para time
5 jobs in batch
Completed: 5 of 5 jobs
CPU time in finished jobs: 3230s 53.84m 0.90h 0.04d 0.000 y
IO & Wait Time: 186s 3.10m 0.05h 0.00d 0.000 y
Average job time: 683s 11.39m 0.19h 0.01d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 1301s 21.68m 0.36h 0.02d
Submission to last job: 1336s 22.27m 0.37h 0.02d
# ready for the multiz run
ssh pk
cd /cluster/data/equCab2/bed/multiz6way
# actually, the result directory here should be maf.split instead of maf
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
setenv P /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit chunk in filename
# so we can use $(root) to get basename
bash
find /san/sanvol1/scratch/equCab2/splitStrictMafNet -type f \
| while read F; do basename $F; done \
| sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
exit
wc -l chromChunks.list
# 64 chromChunks.list
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = equCab2
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.6.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == equCab2) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.6.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/equCab2/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 chromChunks.list single template jobList
para create jobList
para try
para push
# 64 jobs in batch
# Completed: 64 of 64 jobs
# CPU time in finished jobs: 45922s 765.37m 12.76h 0.53d 0.001 y
# IO & Wait Time: 626s 10.43m 0.17h 0.01d 0.000 y
# Average job time: 727s 12.12m 0.20h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1141s 19.02m 0.32h 0.01d
# Submission to last job: 2752s 45.87m 0.76h 0.03d
# put the split maf results back together into single chroms
ssh kkstore04
cd /cluster/data/equCab2/bed/multiz6way
# here is where the result directory maf should have already been maf.split
mv maf maf.split
mkdir maf
# going to sort out the redundant header garbage to leave a cleaner maf
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
echo ${C}
head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/equCab2/multiz6way/maf
ln -s /cluster/data/equCab2/bed/multiz6way/maf/*.maf /gbdb/equCab2/multiz6way/maf
# this generates a large 1 Gb multiz6way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time /bin/nice -n 19 hgLoadMaf -pathPrefix=/gbdb/equCab2/multiz6way/maf equCab2 multiz6way
# 101.386u 33.294s 2:57.36 75.9% 0+0k 0+0io 2pf+0w
# Loaded 5609508 mafs in 34 files from /gbdb/equCab2/multiz6way/maf
# load summary table
time /bin/nice -n 19 cat /gbdb/equCab2/multiz6way/maf/*.maf | hgLoadMafSummary equCab2 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz6waySummary stdin &
# 0.112u 11.365s 4:13.79 4.5% 0+0k 0+0io 0pf+0w
# Created 705120 summary blocks from 12830585 components and 5609498 mafs from stdin
# Gap Annotation
# prepare bed files with gap info
ssh kkstore04
mkdir /cluster/data/equCab2/bed/multiz6way/anno
cd /cluster/data/equCab2/bed/multiz6way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v equCab2 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh kki
cd /cluster/data/equCab2/bed/multiz6way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/equCab2/bed/multiz6way
set c = $1
cat $dir/maf/${c}.maf | \
nice mafAddIRows -nBeds=nBeds stdin /cluster/data/equCab2/equCab2.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/equCab2/bed/multiz6way/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/equCab2/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try... para push...
# 34 jobs in batch
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 2287s 38.12m 0.64h 0.03d 0.000 y
# IO & Wait Time: 899s 14.98m 0.25h 0.01d 0.000 y
# Average job time: 94s 1.56m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 363s 6.05m 0.10h 0.00d
# Submission to last job: 363s 6.05m 0.10h 0.00d
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way/anno
mkdir -p /gbdb/equCab2/multiz6way/anno/maf
ln -s /cluster/data/equCab2/bed/multiz6way/anno/maf/*.maf /gbdb/equCab2/multiz6way/anno/maf
# by loading this into the table multiz6way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time /bin/nice -n 19 hgLoadMaf -pathPrefix=/gbdb/equCab2/multiz6way/anno/maf equCab2 multiz6way
122.861u 43.946s 3:23.21 82.0% 0+0k 0+0io 0pf+0w
Loaded 6251465 mafs in 34 files from /gbdb/equCab2/multiz6way/anno/maf
cat /cluster/data/equCab2/chrom.sizes | \
awk '{if ($2 > 1000000) { print $1 }}' |
while read C
do
echo /gbdb/equCab2/multiz6way/anno/maf/$C.maf
done | xargs cat | \
hgLoadMafSummary equCab2 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz6waySummary stdin
# Created 705120 summary blocks from 12830585 components and 6251455 mafs from stdin
#
# by loading this into the table multiz6waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz6way*.tab files in this /scratch/tmp directory
rm multiz6way*
#############################################################################
## Annotate 6-way multiple alignment with gene annotations
## (DONE - 2008-05-02 - larrym)
ssh hgwdev
mkdir /cluster/data/equCab2/bed/multiz6way/frames
cd /cluster/data/equCab2/bed/multiz6way/frames
# dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql equCab2 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql equCab2 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
chmod +x ./showGenes.csh
./showGenes.csh
# canFam2: Tables: ensGene: 29749, refGene: 889, Mrnas: 1667
# equCab2: Tables: ensGene: 27192, refGene: 354, Mrnas: 38402
# galGal3: Tables: ensGene: 22946, refGene: 4300, Mrnas: 27182
# hg18: Tables: ensGene: 55906, knownGene: 56722, mgcGenes: 28715, refGene: 26306, Mrnas: 231949
# mm9: Tables: ensGene: 43973, knownGene: 49409, mgcGenes: 22666, refGene: 21569, Mrnas: 222212
# ornAna1: Tables: ensGene: 30089, refGene: 10, Mrnas: 147
# from this information, conclusions:
# (hiram sez use ensGene rather than knownGene for hg18 (even though knownGene has more genes):
# use knownGene for mm9
# use ensGene for hg18, galGal3, canFam2 and ornAna1
mkdir genes
# knownGene
for DB in mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
# for DB in galGal3 canFam2 ornAna1
# for DB in hg18
for DB in equCab2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
ls -og genes
# -rw-r--r-- 1 1865308 May 2 13:10 canFam2.gp.gz
# -rw-r--r-- 1 1916011 May 16 15:11 equCab2.gp.gz
# -rw-r--r-- 1 1557911 May 2 13:10 galGal3.gp.gz
# -rw-r--r-- 1 2008806 May 2 13:09 hg18.gp.gz
# -rw-r--r-- 1 1965274 May 2 13:09 mm9.gp.gz
# -rw-r--r-- 1 1347330 May 2 13:10 ornAna1.gp.gz
after switching hg18 => ensGene:
# -rw-r--r-- 1 2151412 May 6 14:46 genes/hg18.gp.gz
ssh kkstore04
cd /cluster/data/equCab2/bed/multiz6way/frames
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames equCab2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz \
mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 35858 galGal3
# 236512 mm9
# 236761 hg18
# 249158 canFam2
# 252980 ornAna1
redo with changed hg18:
ssh kkstore04
cd /cluster/data/equCab2/bed/multiz6way/frames
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames equCab2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz \
mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 35858 galGal3
# 236512 mm9
# 249158 canFam2
# 252980 ornAna1
# 254414 hg18
# redo above with updated equCab2 (2008-05-16):
ssh kkstore04
cd /cluster/data/equCab2/bed/multiz6way/frames
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames equCab2 stdin stdout equCab2 genes/equCab2.gp.gz canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz \
mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 35858 galGal3
# 186570 equCab2
# 236512 mm9
# 249158 canFam2
# 252980 ornAna1
# 254414 hg18
# load the resulting file
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way/frames
time /bin/nice -n 19 hgLoadMafFrames equCab2 multiz6wayFrames multiz6way.mafFrames.gz
7.176u 0.735s 0:17.24 45.8% 0+0k 0+0io 2pf+0w
# re-run with equCab2:
8.523u 1.119s 4:46.07 3.3% 0+0k 0+0io 2pf+0w
# enable the trackDb entries:
# frames multiz6wayFrames
# irows on
#############################################################################
# phastCons 6-way (DONE - 2008-05-09 - larrym)
# split 6way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh kki
mkdir /cluster/data/equCab2/bed/multiz6way/msa.split
cd /cluster/data/equCab2/bed/multiz6way/msa.split
mkdir -p /san/sanvol1/scratch/equCab2/multiz6way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/equCab2/bed/multiz6way/maf
set WINDOWS = /san/sanvol1/scratch/equCab2/multiz6way/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/equCab2/equCab2.2bit /scratch/tmp/equCab2.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
/cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
-M /scratch/tmp/equCab2.$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $CLEAN_MAF /scratch/tmp/equCab2.$c.fa
popd
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 1600s 26.66m 0.44h 0.02d 0.000 y
# IO & Wait Time: 1183s 19.72m 0.33h 0.01d 0.000 y
# Average job time: 82s 1.36m 0.02h 0.00d
# Longest finished job: 241s 4.02m 0.07h 0.00d
# Submission to last job: 418s 6.97m 0.12h 0.00d
# take the cons and noncons trees from the mouse 30-way
# Estimates are not easy to make, probably more correctly,
# take the 30-way .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh memk
mkdir -p /cluster/data/equCab2/bed/multiz6way/cons/run.cons
cd /cluster/data/equCab2/bed/multiz6way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all gliers placentals
# Well, that's what it was when used in the Mm9 30-way,
# in this instance, there is only the directory "all"
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/equCab2/bed/multiz6way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/equCab2/multiz6way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/equCab2/multiz6way/cons/all/pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/equCab2/multiz6way/cons
find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" > /cluster/data/equCab2/bed/multiz6way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
../../mm9.30way.mod \
--prune-all-but=equCab1,hg18,galGal3,mm9,canFam2,ornAna1 \
> all.mod
# edit all.mod to s/equCab2/equCab1/;
# TREE: ((((mm9:0.325818,hg18:0.136019):0.019763,(canFam2:0.149350,equCab2:0.099323):0.038613):0.332197,ornAna1:0.488110):0.118797,galGal3:0.488824);
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/equCab2/multiz6way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 268 of 268 jobs
# CPU time in finished jobs: 8054s 134.23m 2.24h 0.09d 0.000 y
# IO & Wait Time: 2459s 40.98m 0.68h 0.03d 0.000 y
# Average job time: 39s 0.65m 0.01h 0.00d
# Longest finished job: 52s 0.87m 0.01h 0.00d
# Submission to last job: 543s 9.05m 0.15h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/equCab2/multiz6way/cons/all
time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/equCab2/bed/multiz6way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way/cons/all
time /bin/nice -n 19 hgLoadBed equCab2 phastConsElements6way mostConserved.bed
# 3.212u 0.439s 0:22.68 16.0% 0+0k 0+0io 0pf+0w
# Loaded 1036081 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits equCab2 phastConsElements6way
# 134628278 bases of 2454424288 (5.485%) in intersection
# Create merged posterier probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/equCab2/multiz6way/cons/all
mkdir -p phastCons6wayScores
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
for D in pp/chr*
do
C=${D/pp\/}
out=phastCons6wayScores/${C}.data.gz
echo "${D} > ${C}.data.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
chmod +x gzipAscii.sh
time /bin/nice -n 19 ./gzipAscii.sh
1244.107u 83.541s 22:46.45 97.1% 0+0k 0+0io 0pf+0w
# rereun to fix corrupted file
1310.778u 34.480s 22:56.12 97.7% 0+0k 0+0io 0pf+0w
# XXXX NOT DONE YET
# copy the phastCons8wayScores to:
# /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/phastConsScores
# for hgdownload downloads
# Create merged posterier probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/equCab2/multiz6way/cons/all
time /bin/nice -n 19 ls phastCons6wayScores/*.data.gz | xargs zcat | wigEncode -noOverlap stdin phastCons6way.wig phastCons6way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 23m55.813s
time /bin/nice -n 19 cp -p *.wi? /cluster/data/equCab2/bed/multiz6way/cons/all
# 0.010u 10.862s 0:49.38 22.0% 0+0k 0+0io 0pf+0w
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way/cons/all
ln -s `pwd`/phastCons6way.wib /gbdb/equCab2/multiz6way/phastCons6way.wib
time /bin/nice -n 19 hgLoadWiggle -pathPrefix=/gbdb/equCab2/multiz6way equCab2 phastCons6way phastCons6way.wig
# 12.758u 3.986s 0:42.43 39.4% 0+0k 0+0io 2pf+0w
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way/cons/all
time /bin/nice -n 19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=equCab2 phastCons6way > histogram.data 2>&1
# real 3m29.727s
cat << '_EOF_' > gnuplot.txt
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Horse EquCab2 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
gnuplot < gnuplot.txt > histo.png
#############################################################################
## Create downloads for 6-way business (DONE - 2008-05-21 - larrym)
ssh kkstore04
mkdir -p /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way
cd /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way
cp -p /san/sanvol1/scratch/equCab2/multiz6way/cons/all/phastCons6wayScores/* .
ln -s ../../cons/all/all.mod ./6way.mod
cp /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt .
# edit that README.txt to be correct for this 6-way alignment
cd ..
mkdir multiz6way
cd multiz6way
cp -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/README.txt .
# edit that README.txt to be correct for this 6-way alignment
ssh kkstore04
cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way
ln -s ../../equCab2.6-way.nh ./6way.nh
mkdir -p /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf
cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf
cp -p /cluster/data/equCab2/bed/multiz6way/anno/maf/*.maf .
time /bin/nice -n 19 gzip -v *.maf &
1796.135u 69.195s 31:47.39 97.7% 0+0k 0+0io 0pf+0w
# creating upstream files
ssh hgwdev
cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf
# creating upstream files from ensGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=equCab2
GENE=ensGene
NWAY=multiz6way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time /bin/nice -n 19 ./mkUpstream.sh
87.355u 87.965s 4:47.94 60.8% 0+0k 0+0io 6pf+0w
# -rw-rw-r-- 1 larrym protein 38856776 May 19 13:59 upstream5000.maf.gz
# -rw-rw-r-- 1 larrym protein 15604291 May 19 13:58 upstream2000.maf.gz
# -rw-r--r-- 1 larrym protein 8206298 May 19 13:57 upstream1000.maf.gz
# check the names in these upstream files to ensure sanity:
# should be a list of the other 4 species with a high count,
# then ensGene names.
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' | sort | uniq -c | sort -rn | head
7030 ornAna1
7030 mm9
7030 hg18
7030 galGal3
7030 canFam2
1 ENSECAT00000027191
1 ENSECAT00000027190
ssh kkstore04
cd /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf
md5sum *.maf.gz > md5sum.txt
cd ..
ln -s ../../equCab2.6-way.nh ./6way.nh
ln -s $HOME/kent/src/hg/makeDb/trackDb/horse/equCab2/multiz6way.html .
# copy README.txt from mm9 30-way downloads and edit to represent
# this directory
cd ..
mkdir -p phastCons6way/phastConsScores
cd phastCons6way/phastConsScores
cp -p /san/sanvol1/scratch/equCab2/multiz6way/cons/all/phastCons6wayScores/*.data.gz .
md5sum *.data.gz > md5sum.txt
cd ..
ln -s ../../cons/all/all.mod 6way.mod
# copy README.txt from mm9 30way downloads and edit to represent
# this directory
# enable them for hgdownload rsync transfer
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/equCab2
mkdir multiz6way phastCons6way
cd multiz6way
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/6way.nh .
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/*.html .
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/R*.txt .
mkdir maf
cd maf
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf/*.maf.gz .
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/multiz6way/maf/md5sum.txt
cd ../../phastCons6way
mkdir phastConsScores
cd phastConsScores
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/phastConsScores/*.gz .
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/phastConsScores/md5sum.txt .
cd ..
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/6way.mod .
ln -s /cluster/data/equCab2/bed/multiz6way/downloads/phastCons6way/README.txt .
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-04-29 markd)
# obtained NSCAN predictions from michael brent's group
# at WUSTL
cd /cluster/data/equCab2/bed/nscan/
wget http://mblab.wustl.edu/predictions/horse/equCab2/equCab2.gtf
wget http://mblab.wustl.edu/predictions/horse/equCab2/equCab2.prot.fa
# updated readme.html
wget http://mblab.wustl.edu/predictions/horse/equCab2/repl.html
bzip2 equCab2.*
chmod a-w *
# load track
gtfToGenePred -genePredExt equCab2.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt equCab2 nscanGene stdin
hgPepPred equCab2 generic nscanPep equCab2.prot.fa.bz2
rm *.tab
# update trackDb; need a equCab2-specific page to describe informants
horse/equCab2/nscanGene.html (copy from repl.html)
horse/equCab2/trackDb.ra
# set search regex to
termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+
#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
##########################################################################
# verify all.joiner entries (DONE - 2008-07-08 - larrym)
# try to get joinerCheck tests clean output on these commands
ssh hgwdev
cd ~/kent/src/hg/makeDb/schema
joinerCheck -database=equCab2 -tableCoverage all.joiner
joinerCheck -database=equCab2 -keys all.joiner
joinerCheck -database=equCab2 -times all.joiner
##########################################################################
# make downloads (DONE - 2008-07-08 - larrym)
# verify all tracks have html pages for their trackDb entries
cd /cluster/data/equCab2
/cluster/bin/scripts/makeDownloads.pl equCab2
# fix the README files
# re-build the upstream files:
cd /cluster/data/equCab2/goldenPath/bigZips
for size in 1000 2000 5000
do
echo $size
featureBits equCab2 ensGene:upstream:$size -fa=stdout \
| gzip -c > ensGene.upstream$size.fa.gz
done
##########################################################################
# create pushQ entry (DONE - 2008-07-08 - larrym)
# verify all tracks have html pages for their trackDb entries
ssh hgwdev
cd /cluster/data/equCab2/jkStuff
/cluster/bin/scripts/makePushQSql.pl equCab2 > equCab2.pushQ.sql
# equCab2 does not have seq
scp -p equCab2.pushQ.sql hgwbeta:/tmp
ssh hgwbeta
cd /tmp
hgsql qapushq < equCab2.pushQ.sql
# verify file sizes in the pushQ entries
# *** All done!
# *** Please edit the output to ensure correctness before using.
# *** 1. Resolve any warnings output by this script.
# *** 2. Remove any entries which should not be pushed.
# *** 3. Add tables associated with the main track table (e.g. *Pep tables
# for gene prediction tracks).
# *** 4. Add files associated with tracks. First, look at the results
# of this query:
# hgsql equCab2 -e 'select distinct(path) from extFile'
# Then, look at file(s) named in each of the following wiggle tables:
# hgsql equCab2 -e 'select distinct(file) from phastCons6way'
# hgsql equCab2 -e 'select distinct(file) from gc5Base'
# hgsql equCab2 -e 'select distinct(file) from quality'
# Files go in the second field after tables (it's tables, cgis, files).
# *** 5. This script currently does not recognize composite tracks. If equCab2
# has any composite tracks, you should manually merge the separate
# per-table entries into one entry.
# *** 6. Make sure that qapushq does not already have a table named equCab2:
# ssh hgwbeta hgsql qapushq -NBe "'desc equCab2;'"ssh hgwbeta hgsql qapushq -NBe "'desc equCab2;'"
# You *should* see this error:
# ERROR 1146 at line 1: Table 'qapushq.equCab2' doesn't exist
# If it already has that table, talk to QA and figure out whether
# it can be dropped or fixed up (by sql or the Push Queue web app).
# *** When everything is complete and correct, use hgsql on hgwbeta to
# execute the sql file. Then use the Push Queue web app to check the
# contents of all entries.
# *** If you haven't already, please add equCab2 to makeDb/schema/all.joiner !
# It should be in both $gbd and $chainDest.
XXXX TODO
# *** When equCab2 is on the RR (congrats!), please doBlastz -swap if you haven't
# already, and add Push Queue entries for those other databases' chains
# and nets to equCab2.
Added following files to pushQ entry:
/gbdb/equCab2/multiz6way/maf/chr*.maf
/gbdb/equCab2/multiz6way/anno/maf/chr*.maf
/gbdb/equCab2/multiz6way/phastCons6way.wib
/gbdb/equCab2/wib/gc5Base.wib
/gbdb/equCab2/wib/qual.wib
##########################################################################
##########################################################################
# Fix duplicate scaffold_34 (DONE - 2008-09-09 - larrym)
Start 2008-08-19:
insert into dbDb (name, description, nibPath, organism, defaultPos, active, genome, scientificName) values ('equCab2Copy', 'Sep. 2007 (test)', '/gbdb/equCab2Copy', 'Horse', 'chr11:52,970,942-52,973,091', 1, 'Horse', 'Equus caballus');
chr6 (correct location):
scaffold_34 - chr6:58759118-84719076
chr27:
scaffold_24 - chr27:1-39960074 - 39,960,074
scaffold_34 - chr27:39961075-65921033 (25959959 bases long)
Example of errors:
human protein FLJ13236 - aligns to chr6:67090333-67092571 and chr27:48292290-48294528
mRNA AJ514427 - aligns to chr6:67600011-67601519 and chr27:48801968-48803476
est CX605339 - aligns to chr6:67238940-67239361 and chr27:48440897-48441318
intronEst CD469838 - aligns to chr6:71553983-71575277 and chr27:52755940-52777234
xenoMrna BC033236 - aligns to chr6:67088771-67092632 and chr27:48290728-48294589
The region to delete starts at the gap precediing scaffold_34:
chr27: 39960074 (zero based)
65921033 - 39960075 + 1 = 25960959 bp's being deleted.
Tables that cross gap:
quality
nscanGene
...
Tracks to ignore while testing:
oligoMatch (Short Match)
a lot of tables are split, so I will have to write some C to do this:
run delete program (/cluster/data/equCab2/fixEquCab2.c):
#table:chromName:number rows deleted
blastHg18KG:tName:1138
chr27_chainCanFam2:tName:94250
chr27_chainGalGal3:tName:30247
chr27_chainHg18:tName:82111
chr27_chainMm9:tName:108140
chr27_chainOrnAna1:tName:49989
chr27_est:tName:823
chr27_gap:chrom:499
chr27_gold:chrom:499
chr27_intronEst:tName:429
chr27_mrna:tName:27
chr27_rmsk:genoName:37468
ctgPos2:chrom:1
ensGene:chrom:0
fixedStep:chrom:0
gc5Base:chrom:5052
multiz6way:chrom:71172
nestedRepeats:chrom:3894
netCanFam2:tName:37661
netGalGal3:tName:4908
netHg18:tName:31495
netMm9:tName:49546
netOrnAna1:tName:8094
nscanGene:chrom:351
phastConsElements6way:chrom:13664
quality:chrom:25353
refGene:chrom:7
simpleRepeat:chrom:2933
testBed:chrom:0
transMapAlnMRna:tName:8972
transMapAlnRefSeq:tName:933
transMapAlnSplicedEst:tName:177200
transMapAlnUcscGenes:tName:1713
xenoMrna:tName:21370
Things that apparently didn't get covered by the script:
multiz6wayFrames, multiz6waySummary, phastCons6way
do manually:
delete from multiz6wayFrames where chrom = 'chr27' and chromStart >= 39960074;
delete from multiz6waySummary where chrom = 'chr27' and chromStart >= 39960074;
delete from phastCons6way where chrom = 'chr27' and (chromStart >= 39960074 || (chromStart < 39960074 && chromEnd >= 39960074));
delete from all_mrna where tName = 'chr27' and tStart >= 39960074;
27 rows
delete from all_est where tName = 'chr27' and tStart >= 39960074;
823 rows
delete from estOrientInfo where chrom = 'chr27' and chromStart >= 39960074;
823 rows
delete from refSeqAli where tName = 'chr27' and tStart >= 39960074;
7 rows
select * from ensGene where chrom = 'chr27' and txStart >= 39960074;
0 rows
delete from refFlat where chrom = 'chr27' and txStart >= 39960074;
7 rows
delete from mrnaOrientInfo where chrom = 'chr27' and chromStart >= 39960074;
34 rows
sanity check:
dbSnoop equCab2Copy stdout | grep ^2008 | egrep -v 'chr[0-9MX]+_' | grep -v trackDb | grep -v hgFindSpec | grep -v chrUn | sort
Truncate chromInfo:
update chromInfo set size = 39960074 where chrom = 'chr27';
# re-build 2bit files:
# create /cluster/data/equCab2/seqList.txt to use as parameter to twoBitToFa:
chr1
...
chr27:0-39960074
chr28
...
chrX
chrUn
chrM
mv equCab2.2bit equCab2.2bit.old
mv equCab2.rmsk.2bit equCab2.rmsk.2bit.old
mv equCab2.unmasked.2bit equCab2.unmasked.2bit.old
mv equCab2.UnScaffolds.2bit equCab2.UnScaffolds.2bit.old
twoBitToFa -seqList=seqList.txt equCab2.2bit.bak equCab2.fa
faSize equCab2.fa
2484532062 bases (55741889 N's 2428790173 real 1433784199 upper 995005974 lower) in 34 sequences in 1 files
2510493021 - 25960959 == 2484532062
agrees!
# twoBitToFa is putting in "chr27:0-39960074, so we have to fix that up with the perl -ne
# all commands run on kkstore05:
twoBitToFa -seqList=seqList.txt equCab2.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.2bit
twoBitToFa -seqList=seqList.txt equCab2.rmsk.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.rmsk.2bit
twoBitToFa -seqList=seqList.txt equCab2.unmasked.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.unmasked.2bit
twoBitToFa -seqList=seqList.txt equCab2.UnScaffolds.2bit.old stdout | perl -ne 's/^>chr27:0-39960074/>chr27/;print;' | faToTwoBit stdin equCab2.UnScaffolds.2bit
2nd run of fixEquCab2:
overlaps: gc5Base chrom chromStart 1
overlaps: quality chrom chromStart 1
overlaps: nscanGene chrom txStart 1
overlaps: xenoMrna tName tStart 2
overlaps: blastHg18KG tName tStart 1
overlaps: nscanGene chrom txStart 1
select * from gc5Base where chrom = 'chr27' and chromStart < 39960074 && chromEnd > 39960074;
select * from quality where chrom = 'chr27' and chromStart < 39960074 && chromEnd > 39960074;
# Fixed gc5Base and quality below
delete from xenoMrna where tName = 'chr27' and tStart < 39960074 && tEnd > 39960074;
2 rows
delete from blastHg18KG where tName = 'chr27' and tStart < 39960074 && tEnd > 39960074;
1 row
1 overlapping row in nscanGene:
delete from nscanGene where chrom = 'chr27' and txStart < 39960074 and txEnd > 39960074;
1 row
Sent email to cluster-admin to reload blat from 2bit file.
After reload, a portion of previously duplicated DNA:
TATTGCAGGCAGGATGACCACTATTGCAACGCTCTCCGGCCTCGAAATGG
now shows up with only one hit.
# download fixed agp from broad
cd /cluster/data/equCab2/broad
mkdir 2008-04-15-updates
cd 2008-04-15-updates
wget ftp://ftp.broad.mit.edu/pub/assemblies/mammals/horse/Equus2/mapped.agp.chromosome.agp
cp -p ucsc.agp ucsc.agp.old
cp -p Contig.bed Contig.bed.old
cp -p ucsc.agp ucsc.agp.old
cp -p mapped.agp.chromosome.agp mapped.agp.chromosome.agp.old
# manually remove relevant section (line 51417-52415)
diff mapped.agp.chromosome.agp 2008-04-15-updates/mapped.agp.chromosome.agp
# no differences (which is good, means they changed what I expected them to change)
# on kkstore05
qaToQac assembly.quals.gz stdout | qacAgpLift ucsc.agp stdin equCab2.qual.qac
Read 55316 qacs from stdin
Got 33 chroms in ucsc.agp
...
chr27 size=39960074
...
cd /cluster/data/equCab2/bed/qual
qacToWig -fixed /cluster/data/equCab2/broad/equCab2.qual.qac stdout | wigEncode stdin qual.wig qual.wib
# run on hgwdev:
hgLoadWiggle -pathPrefix=/gbdb/equCab2/wib equCab2 quality qual.wig &
cd /cluster/data/equCab2/bed/gc5Base
hgGcPercent -wigOut -doGaps -win=5 -file=stdout -verbose=0 equCab2 /cluster/data/equCab2/equCab2.2bit | wigEncode stdin gc5Base.wig gc5Base.wib
hgLoadWiggle -pathPrefix=/gbdb/equCab2/wib equCab2 gc5Base gc5Base.wig &
hg18.chainEquCab2 etc.
grep contig_ mapped.agp.chromosome.agp | awk '{print $3-$2+1}' | ave stdin
count 55316
total 2428773513
grep contig_ mapped.agp.chromosome.agp | awk '{print $3-$2+1}' | sort -r -n | ~/bin/n50.pl 2428773513
count: 6246; len: 112381
cp -p Contig.bed Contig.bed.old
# manually remove extra contig in chr27
grep scaffold_ Contig.bed | awk '{print $3-$2}' | ave stdin
count 73
average 30724621.506849
total 2242897370
grep scaffold_ Contig.bed | awk '{print $3-$2}' | ~/bin/n50.pl 2242897370
count: 33; len: 53663962
checkTableCoords equCab2 -daysOld=1000
equCab2.chr27_chainCanFam2Link has 1604165 records with end > chromSize.
equCab2.chr27_chainGalGal3Link has 374961 records with end > chromSize.
equCab2.chr27_chainHg18Link has 1351976 records with end > chromSize.
equCab2.chr27_chainMm9Link has 1671652 records with end > chromSize.
equCab2.chr27_chainOrnAna1Link has 696995 records with end > chromSize.
equCab2.xenoMrna has 3 records with end > chromSize.
delete from chr27_chainCanFam2Link where tName = 'chr27' and tStart >= 39960074;
1604165 rows
delete from chr27_chainGalGal3Link where tName = 'chr27' and tStart >= 39960074;
374961 rows
delete from chr27_chainHg18Link where tName = 'chr27' and tStart >= 39960074;
1351976 rows
delete from chr27_chainMm9Link where tName = 'chr27' and tStart >= 39960074;
1671652 rows
delete from chr27_chainOrnAna1Link where tName = 'chr27' and tStart >= ;
696995 rows
I don't know how the xenoMrna records got missed:
delete from xenoMrna where tName = 'chr27' and tEnd > 39960074;
3 rows
hg18, mm9, canFam2, ornAna1, galGal3
chr27_chainCanFam2Link hg18, mm9, canFam2, ornAna1, galGal3
In a meeting we (kuhn, hiram et al.) decided to do the nets and chains later,
and focus on releasing core stuff now; i.e. I will re-run all net/chain
stuff from scratch later.
# misc fixes:
# fixup genoLeft column in rmsk table:
update chr27_rmsk set genoLeft = genoLeft + 25959959;
##########################################################################
# re-make downloads (DONE - 2008-09-09 - larrym)
Edit to remove end of chr27:
chrom.sizes, rmsk_and_gaps.bed
equCab2.agp, 27/chr27.agp, 27/chr27.fa.out
bed/simpleRepeat/simpleRepeat.bed, bed/simpleRepeat/bed.tab
bed/simpleRepeat/trfMask.bed, trfMaskChrom/chr27.bed
bed/repeatMasker/bed.tab, bed/repeatMasker/equCab2.fa.out
bed/chromInfo/chromInfo.tab
broad/ctgPos2.tab, broad/scaffolds.bed
# verify all tracks have html pages for their trackDb entries
cd /cluster/data/equCab2
cp -rp goldenPath goldenPath.old
/cluster/bin/scripts/makeDownloads.pl equCab2
# restore the older README files
cd /cluster/data/equCab2
for dir in bigZips database liftOver chromosomes
do
cp -p goldenPath.old/$dir/README.txt goldenPath/$dir/README.txt
done
# re-build the upstream files:
for size in 1000 2000 5000
do
echo $size
featureBits equCab2 ensGene:upstream:$size -fa=stdout \
| gzip -c > ensGene.upstream$size.fa.gz
done
#########################################################################
# Re-run creation of OOC file for genbank runs (DONE - 2008-08-27 - larrym)
1024 * (2.5 / 3.1) == 825.805824
# Use -repMatch=825 (based on size -- for human we use 1024, and
# horse size is 84% of human judging by twoBitInfo -noNs)
cd /cluster/data/equCab2
blat equCab2.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=equCab2.11.ooc -repMatch=825 &
#########################################################################
# REDO-GENBANK AUTO UPDATE (DONE - 2008-09-11 - larrym)
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time /bin/nice -n 19 bin/gbAlignStep -initial equCab2 &
# logFile: var/build/logs/2008.09.10-13:33:41.equCab2.initalign.log
# kill'ed b/c /scratch/data hadn't been rsync'ed
# email to cluster-admin:
# please rsync /hive/data/staging/data/equCab2 to /scratch/data/equCab2 on all cluster nodes
# gbAlignStep hadn't got the parasol step, so markd said I could just re-run it:
rm -rf /cluster/genbank/genbank/build/work/initial.equCab2
time /bin/nice -n 19 bin/gbAlignStep -initial equCab2 &
# logFile: var/build/logs/2008.09.10-15:27:53.equCab2.initalign.log
# 8970.392u 3173.177s 8:20:43.48 40.4% 0+0k 0+0io 1233pf+0w
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time /bin/nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad equCab2 &
# logFile: var/dbload/hgwdev/logs/2008.09.11-11:50:22.dbload.log
# 632.112u 222.505s 18:36.80 76.5%
# test some genbank stuff - look at chr6:67,086,866-71,661,987
mRNA AJ514427 - aligns just to chr6:67600011-67601519
est CX605339 - aligns just to chr6:67238940-67239361
intronEst CD469838 - aligns just to chr6:71553983-71575277
xenoMrna BC033236 - aligns just to chr6:67088771-67092632
#########################################################################
# QA by larrym
RefSeq Gene COL2A1 - why is it 1 base shorter now? - chr6:65629018-65660099 vs. chr6:65629017-65660099
http://hgwdev-larrym.cse.ucsc.edu/cgi-bin/hgc?hgsid=1559182&o=65629017&t=65660099&g=refGene&i=NM_001081764&c=chr6&l=62511743&r=76237109&db=equCab2
http://hgwbeta.cse.ucsc.edu/cgi-bin/hgc?hgsid=1501039&o=65629016&t=65660099&g=refGene&i=NM_001081764&c=chr6&l=62511743&r=76237109&db=equCab2
Alignment is on reverse strand; older version is one base longer; why?
Similar with RefSeq Gene IL23 (which is not on reverse strand).
select * from refGene where name = 'NM_001082522';
echo "select * from refGene where name = 'NM_001082522';" | hgsql equCab2
vs.
echo "select * from refGene where name = 'NM_001082522';" | hgsql equCab2Old
True on other chromosomes; e.g.: RefSeq Gene TPH2 - chr28:559352-671879
Update_time: 2008-09-11 12:08:02
markd says don't worry about it - probably a blat change.
##########################################################################
# re-verify all.joiner entries (DONE - 2008-09-15 - larrym)
ssh hgwdev
cd ~/kent/src/hg/makeDb/schema
joinerCheck -database=equCab2 -tableCoverage all.joiner
joinerCheck -database=equCab2 -keys all.joiner
Error: 352 of 19964 elements of equCab2.nscanPep.name are not in key nscanGene.name line 589 of all.joiner
Example miss: CHR27.199.1
equCab2.transMapInfoRefSeq.mappedId - hits 50099 of 51032
Error: 933 of 51032 elements of equCab2.transMapInfoRefSeq.mappedId are not in key transMapAlnRefSeq.qName line 2408 of all.joiner
Example miss: NM_000020.2-1.2
equCab2.transMapInfoUcscGenes.mappedId - hits 94700 of 96413
Error: 1713 of 96413 elements of equCab2.transMapInfoUcscGenes.mappedId are not in key transMapAlnUcscGenes.qName line 2413 of all.joiner
Example miss: UC001RME.1-1.2
equCab2.transMapInfoMRna.mappedId - hits 474538 of 483510
Error: 8972 of 483510 elements of equCab2.transMapInfoMRna.mappedId are not in key transMapAlnMRna.qName line 2418 of all.joinerw
Example miss: A10156.1-1.2
equCab2.transMapInfoSplicedEst.mappedId - hits 6221957 of 6399157
Error: 177200 of 6399157 elements of
Example miss: AA000021.1-1.2
select count(*) from nscanPep where name >= 'CHR27.199.1' and name <= 'CHR28';
352
delete from nscanPep where name >= 'CHR27.199.1' and name <= 'CHR28';
Query OK, 352 rows affected (0.01 sec)
~/src/delOrphans.pl transMapInfoRefSeq mappedId transMapAlnRefSeq qName
deleted: 933
~/src/delOrphans.pl transMapInfoUcscGenes mappedId transMapAlnUcscGenes qName
deleted: 1713
~/src/delOrphans.pl transMapInfoMRna mappedId transMapAlnMRna qName
deleted: 8972
~/src/delOrphans.pl transMapInfoSplicedEst mappedId transMapAlnSplicedEst qName
deleted: 177200
re-run:
joinerCheck -database=equCab2 -keys all.joiner
no errors
joinerCheck -database=equCab2 -times all.joiner
############################################################################
## RE-COMPUTE DEFAULT POSITION (DONE - 2008-09-16 - larrym)
# determine new default position using liftOver (via convert menu)
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="chr11:52,970,942-53,028,811" where name="equCab2";' hgcentraltest
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
equCab2.upstreamGeneTbl = refGene
equCab2.upstreamMaf = multiz6way /hive/data/genomes/equCab2/bed/multiz6way/species.list
################################################
# Fixed tSizes in transMap tracks (2008-10-15 markd)
update equCab2.transMapAlnUcscGenes set tSize=39960074 where tName="chr27" and tSize=65921033;
update equCab2.transMapAlnRefSeq set tSize=39960074 where tName="chr27" and tSize=65921033;
update equCab2.transMapAlnMRna set tSize=39960074 where tName="chr27" and tSize=65921033;
update equCab2.transMapAlnSplicedEst set tSize=39960074 where tName="chr27" and tSize=65921033;
############################################################################
# Re-Run GalGal3 lastz (DONE - 2009-07-01 - Hiram)
# primary run
cd /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29
cat fb.galGal3.chainEquCab2Link.txt
# 68476046 bases of 1042591351 (6.568%) in intersection
# running the swap
mkdir /hive/data/genomes/equCab2/bed/blastz.galGal3.swap
cd /hive/data/genomes/equCab2/bed/blastz.galGal3.swap
time doBlastzChainNet.pl \
/hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=pk \
-swap -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 10m1.381s
cat fb.equCab2.chainGalGal3Link.txt
# 73336799 bases of 2428790173 (3.019%) in intersection
############################################################################
# Re-Run ornAna1 alignment (DONE - 2009-07-01,02 - Hiram)
mkdir /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01
cd /hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01
cat << '_EOF_' > DEF
# Horse vs. Platypus
BLASTZ_M=50
# TARGET: Horse
SEQ1_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ1_LEN=/scratch/data/equCab2/chrom.sizes
SEQ1_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ1_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ1_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ1_CHUNK=20000000
SEQ1_LIMIT=100
SEQ1_LAP=0
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-workhorse=hgwdev \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
# real 1413m12.090s
cat fb.equCab2.chainOrnAna1Link.txt
# 132750149 bases of 2428790173 (5.466%) in intersection
mkdir /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap
cd /hive/data/genomes/ornAna1/bed/blastz.equCab2.swap
time doBlastzChainNet.pl \
/hive/data/genomes/equCab2/bed/lastzOrnAna1.2009-07-01/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-swap -workhorse=hgwdev \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
# real 115m47.611s
cat fb.ornAna1.chainEquCab2Link.txt
# 132776204 bases of 1842236818 (7.207%) in intersection
############################################################################
# lastz swap to Dog CanFam2 (DONE - 2009-07-01 - Hiram)
# the original lastz
cd /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
cat fb.canFam2.chainEquCab2Link.txt
# 1676663178 bases of 2384996543 (70.300%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.canFam2.swap
cd /hive/data/genomes/equCab2/bed/blastz.canFam2.swap
time doBlastzChainNet.pl \
/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29/DEF \
-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 286m51.658s
fb.equCab2.chainCanFam2Link.txt
# 1721407500 bases of 2428790173 (70.875%) in intersection
############################################################################
# lastz swap to Mouse Mm9 (DONE - 2009-07-02 - Hiram)
# the original lastz
cd /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
cat fb.mm9.chainEquCab2Link.txt
# 912421053 bases of 2620346127 (34.821%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.mm9.swap
cd /hive/data/genomes/equCab2/bed/blastz.mm9.swap
time doBlastzChainNet.pl \
/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29/DEF \
-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 122m25.314s
cat fb.equCab2.chainMm9Link.txt
# 902295813 bases of 2428790173 (37.150%) in intersection
############################################################################
# lastz swap to Opossum MonDom5 (DONE - 2009-07-02 - Hiram)
# the original lastz
cd /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29
cat fb.monDom5.chainEquCab2Link.txt
# 355004426 bases of 3501660299 (10.138%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.monDom5.swap
cd /hive/data/genomes/equCab2/bed/blastz.monDom5.swap
time doBlastzChainNet.pl \
/hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-swap -workhorse=hgwdev \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
# real 320m3.573s
cat fb.equCab2.chainMonDom5Link.txt
# 351787662 bases of 2428790173 (14.484%) in intersection
#########################################################################
# lastz swap to Human Hg18 (DONE - 2009-07-02 - Hiram)
# the original lastz
cd /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.hg18.swap
cd /hive/data/genomes/equCab2/bed/blastz.hg18.swap
time doBlastzChainNet.pl \
/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-swap -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 238m42.004s
cat fb.equCab2.chainHg18Link.txt
# 1622340736 bases of 2428790173 (66.796%) in intersection
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
+# xenoRefGene - (2010-02-04 markd)
+
+ # enable xeno RefSeq track per user request: add to etc/genbank.conf:
+equCab2.refseq.mrna.xeno.load = yes
+############################################################################