src/hg/makeDb/doc/panTro1.txt 1.8
1.8 2009/11/25 21:48:41 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/panTro1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/panTro1.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/makeDb/doc/panTro1.txt 25 Nov 2009 18:29:18 -0000 1.7
+++ src/hg/makeDb/doc/panTro1.txt 25 Nov 2009 21:48:41 -0000 1.8
@@ -1,2501 +1,2501 @@
#!/bin/csh
exit;
DONE First chimp release
# DOWNLOAD FILES FROM WHITEHEAD (kate)
#ftp-genome.wi.mit.edu
#Dir: Assembly.Nov13.2003
# Files:
# contigs.bases.gz
# contigs.quals.gz
# assembly.agp
gunzip *.gz
#The scaffold count is 37930
#% faSize contigs.bases
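# quick sanity checks on the inputs (sketch; not part of the original run):
# awk '{print $1}' assembly.agp | sort -u | wc -l   # scaffold names in AGP
# grep -c '^>' contigs.bases                        # contig records in fasta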
# Get AGP from LaDeana Hillier
ssh kksilo
cd /cluster/data/panTro1
wget http://genome.wustl.edu/private/lhillier/old/pan_troglodytes_agp.040119.tar.gz
gunzip pan_troglodytes_agp.040119.tar.gz
tar xvf pan_troglodytes_agp.040119.tar
rm pan_troglodytes_agp.040119.tar
# creates dir pan_troglodytes_agp, with an AGP per chrom
# REPEATMASK (2003-12-03 and 2004-01-20 kate)
# Split into 500KB chunks for RepeatMasking
ssh kksilo
cd /cluster/data/panTro1
mkdir -p split500K
faSplit sequence contigs.bases 10 split500K/
cd split500K
foreach f (*.fa)
set d = $f:t:r
mkdir -p $d
faSplit about $f 500000 $d/
rm $f
end
# Create RM script and cluster job
# NOTE: actual run was performed in two cluster jobs -- the second
# was RMRun.chimpLib, for the chimp-specific repeats
cd /cluster/data/panTro1
mkdir -p jkStuff
mkdir -p RMRun
rm -f RMRun/RMJobs
cat << '_EOF_' > jkStuff/RMChimp
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/panTro1/$2
/bin/cp $2 /tmp/panTro1/$2
cd /tmp/panTro1/$2
# mask with chimp lib
/cluster/bluearc/RepeatMasker/RepeatMasker -s -ali -nolow -lib /cluster/bluearc/RepeatMasker030619/Libraries/chimp.lib $2
popd
/bin/cp /tmp/panTro1/$2/$2.out ./$2.chimp.out
if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.chimp.align
/bin/rm -f /tmp/panTro1/$2/*
cd $1
pushd .
/bin/cp $2 /tmp/panTro1/$2
cd /tmp/panTro1/$2
# mask with default primate lib
/cluster/bluearc/RepeatMasker/RepeatMasker -s -ali $2
popd
/bin/cp /tmp/panTro1/$2/$2.out ./$2.out
if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.align
/bin/rm -fr /tmp/panTro1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1
'_EOF_'
chmod +x jkStuff/RMChimp
mkdir -p RMRun
rm -f RMRun/RMJobs
foreach d ( split500K/?? )
foreach f ( $d/*.fa )
set f = $f:t
echo /cluster/data/panTro1/jkStuff/RMChimp \
/cluster/data/panTro1/$d $f \
'{'check out line+ /cluster/data/panTro1/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
#5367 RMJobs
# Run cluster job
ssh kk
cd /cluster/data/panTro1/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
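# i.e. the usual parasol cycle, spelled out (sketch of typical usage):
# para try      # run a handful of jobs as a sanity check
# para check    # report finished/crashed/running counts
# para push     # submit the rest of the batch
# para time     # timing summary once the batch completes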
# TRF: Tandem Repeat Finder (DONE 2004-01-19 kate)
# create job list of 5MB chunks
# Redoing this instead of using the pt0 trf beds, so as to
# include the new and changed scaffolds in the Nov. 13 assembly
ssh kksilo
cd /cluster/data/panTro1
mkdir -p split5M
mkdir -p /cluster/bluearc/panTro1/split5M
faSplit about contigs.bases 5000000 /cluster/bluearc/panTro1/split5M/
cd /cluster/data/panTro1
mkdir -p bed/simpleRepeat
cd bed/simpleRepeat
mkdir trf
ls -1 /cluster/bluearc/panTro1/split5M/*.fa > genome.lst
touch single
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line+ /cluster/data/panTro1/bed/simpleRepeat/trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'EOF'
gensub2 genome.lst single gsub spec
ssh kk
cd /cluster/data/panTro1/bed/simpleRepeat
para create spec
# 546 jobs written to batch
para try
para push
# MAKE LINEAGE-SPECIFIC REPEATS FOR HUMAN (DONE 2004-06-21 kate)
# Lacking input from Arian, and using blastzSelf as a model,
# I'm using all chimp repeats for the human/chimp blastz.
# Scripts expect *.out.spec filenames.
ssh kkr1u00
cd /cluster/data/panTro1
mkdir /iscratch/i/chimp/panTro1/linSpecRep.human
foreach f (?{,?}/*.fa.out)
cp -p $f /iscratch/i/chimp/panTro1/linSpecRep.human/$f:t:r:r.out.spec
end
iSync
# FILTER SIMPLE REPEATS INTO MASK (DONE 2004-01-25 kate)
# make a filtered version of the trf output:
# keep trf's with period <= 12:
# NOTE: did chr1_random on 2004-01-26 (kate)
ssh kksilo
cd /cluster/data/panTro1/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
echo "filtering $f"
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# MASK CONTIGS WITH REPEATMASKER (DONE 2004-01-20 kate)
ssh kksilo
cd /cluster/data/panTro1
cd split500K
cat > maskRM.csh << 'EOF'
foreach d (??)
foreach f ($d/*.fa)
echo "Masking $f"
maskOutFa $f $f.out $f.rmsk1 -soft
maskOutFa $f.rmsk1 $f.chimp.out $f.rmsk -softAdd
end
end
'EOF'
csh maskRM.csh >&! maskRM.log &
tail -100f maskRM.log
#WARNING: negative rEnd: -92 contig_143568:1128-1159 MER46B
#WARNING: negative rEnd: -155 contig_143869:5374-5508 AluJb
# etc.
# Comment in rn3 doc (Hiram) indicates these are OK...
# MASK CONTIGS WITH TRF
# Merge 500K masked chunks into single file, then split into 5Mb chunks
# to prepare for TRF masking
ssh kksilo
cd /cluster/data/panTro1
cd split500K
foreach d (??)
echo "Contig dir $d"
foreach f ($d/?.fa.rmsk $d/??.fa.rmsk $d/???.fa.rmsk)
set g = $f:h
cat $f >> $g.fa
end
end
# check the split500K/??.fa masked files, then create single
# fasta file with repeatmasked sequence
cd /cluster/data/panTro1
cat split500K/??.fa > contigs.bases.rmsk
# check the rmsk file, then
rm split500K/??.fa
mkdir -p split5M.rmsk
faSplit about contigs.bases.rmsk 5000000 split5M.rmsk/
cat > bed/simpleRepeat/runTrf.csh << 'EOF'
foreach f (split5M.rmsk/*.fa)
echo "TRF Masking $f"
set b = $f:t:r
maskOutFa $f bed/simpleRepeat/trfMask/$b.bed $f.msk -softAdd
end
'EOF'
csh bed/simpleRepeat/runTrf.csh >&! bed/simpleRepeat/runTrf.log &
tail -100f bed/simpleRepeat/runTrf.log
# create single fasta file with repeatmasked and trf-masked sequence
cat split5M.rmsk/*.fa.msk > contigs.bases.msk
# MAKE SCAFFOLDS FROM CONTIGS (DONE 2004-01-20 kate)
agpAllToFaFile assembly.agp contigs.bases.msk scaffolds.msk.fa -sizes=scaffold
/cluster/bin/scripts/agpToLift < assembly.agp > assembly.lft
# MAKE CHROMS FROM SCAFFOLDS (DONE 2004-01-20 kate, except chrUn)
# NOTE: chr1_random lost somewhere -- redone 2004-01-26 kate
ssh kksilo
cd /cluster/data/panTro1
cp scaffolds.msk.fa /cluster/bluearc/panTro1
cat > jkStuff/makeChr.csh << 'EOF'
cd /cluster/data/panTro1/pan_troglodytes_agp
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y Un M)
echo $c
mkdir -p ../$c
if (-e ptr${c}.agp) then
# Remove comments as agpToFa can't handle them
# NOTE: chr names are "ptrN" in the AGP files
sed -e 's/#.*//' -e 's/ptr/chr/' ptr${c}.agp > ../$c/chr${c}.agp
~/bin/x86_64/agpToFa -simpleMultiMixed \
../$c/chr${c}.agp chr${c} \
/cluster/data/panTro1/$c/chr${c}.fa \
/cluster/bluearc/panTro1/scaffolds.msk.fa
endif
if (-e ptr${c}_random.agp) then
sed -e 's/#.*//' -e 's/ptr/chr/' ptr${c}_random.agp \
> ../$c/chr${c}_random.agp
~/bin/x86_64/agpToFa -simpleMultiMixed \
../$c/chr${c}_random.agp chr${c}_random \
/cluster/data/panTro1/$c/chr${c}_random.fa \
/cluster/bluearc/panTro1/scaffolds.msk.fa
endif
end
'EOF'
ssh kolossus
cd /cluster/data/panTro1
csh jkStuff/makeChr.csh >&! jkStuff/makeChr.log &
tail -100f jkStuff/makeChr.log
# Un
# Program error: trying to allocate 807899783 bytes in needLargeMem
# agpToFa: memalloc.c:82: needLargeMem: Assertion `0' failed.
# Abort (core dumped)
# This is an 800M chrom! (1500 scaffolds with 50K gaps)
# Have requested smaller gaps, will redo it
# REDO chrUn with revised AGP file (10K gaps) from LaDeana Hillier
# (IN PROGRESS 2004-01-23 kate)
ssh kolossus
cd /cluster/data/panTro1/pan_troglodytes_agp
set c = Un
sed -e 's/#.*//' -e 's/ptr/chr/' ptr${c}_random.agp \
> ../$c/chr${c}_random.agp
~/bin/x86_64/agpToFa -simpleMultiMixed \
../$c/chr${c}_random.agp chr${c}_random \
/cluster/data/panTro1/$c/chr${c}_random.fa \
/cluster/bluearc/panTro1/scaffolds.msk.fa
ssh kksilo
cd /cluster/data/panTro1
cat > jkStuff/makeNib.csh << 'EOF'
cd /cluster/data/panTro1
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y M Un)
echo $c
cd $c
foreach f (*.fa)
/cluster/bin/i386/faToNib -softMask $f ../nib/chr$f:r.nib
end
cd ..
end
'EOF'
csh jkStuff/makeNib.csh >&! jkStuff/makeNib.log &
tail -100f jkStuff/makeNib.log
# Make symbolic links from /gbdb/panTro1/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/panTro1/nib
foreach f (/cluster/data/panTro1/nib/chr*.nib)
ln -s $f /gbdb/panTro1/nib
end
# make lift files from scaffold->chrom agp's, with agpToLift that
# supports negative strand
ssh kksilo
cd /cluster/data/panTro1
foreach d (?{,?})
echo $d
if (-e $d/chr$d.agp) then
echo "lifting chr$d.agp"
jkStuff/agpToLift < $d/chr$d.agp > $d/chr$d.lft
endif
if (-e $d/chr${d}_random.agp) then
echo "lifting chr${d}_random.agp"
jkStuff/agpToLift < $d/chr${d}_random.agp > $d/chr${d}_random.lft
endif
end
cat ?{,?}/*.lft > jkStuff/scaffolds.lft
# CREATE DATABASE (IN PROGRESS 2004-01-21 kate, still needs chrUn in chromInfo)
ssh hgwdev
# use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 211G 1.5T 13% /var/lib/mysql
hgsql -e "CREATE DATABASE panTro1"
# if you need to delete this database: !!! WILL DELETE EVERYTHING !!!
# hgsql -e "drop database panTro1"
# create "grp" table for track grouping
# (copy the grp table from an existing database; hg16 here is illustrative)
hgsql panTro1 \
    -e "CREATE TABLE grp (PRIMARY KEY(NAME)) select * from hg16.grp"
# Load /gbdb/panTro1/nib paths into database and save size info.
hgsql panTro1 < ~/kent/src/hg/lib/chromInfo.sql
cd /cluster/data/panTro1
hgNibSeq -preMadeNib panTro1 /gbdb/panTro1/nib ?{,?}/chr?{,?}{,_random}.fa
echo "select chrom,size from chromInfo" | hgsql -N panTro1 > chrom.sizes
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (IN PROGRESS 2004-01-20 kate)
ssh hgwdev
echo 'INSERT INTO defaultDb VALUES ("Chimp", "panTro1");' \
| hgsql -h genome-testdb hgcentraltest
# Warning: genome and organism fields must correspond
# with defaultDb values, start as not ACTIVE
echo 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, \
scientificName, htmlPath, hgNearOK) values \
("panTro1", "Nov. 2003", "/gbdb/panTro1/nib", "Chimp", \
"chr6:115705331-115981791", 0, 15, "Chimp", \
"Pan troglodytes", "/gbdb/panTro1/html/description.html", 0);' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/src/hg/makeDb/trackDb
mkdir -p chimp/panTro1
# Add chimp/panTro1/description.html
cvs up -d -P
# Edit that makefile to add panTro1 in all the right places and do
make update
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2003-07-31 - Hiram)
ssh hgwdev
echo 'INSERT INTO blatServers (db, host, port, isTrans) \
VALUES ("panTro1", "blat8", "17778", "1"); \
INSERT INTO blatServers (db, host, port, isTrans) \
VALUES ("panTro1", "blat8", "17779", "0");' \
| hgsql -h genome-testdb hgcentraltest
# ASSEMBLY AND GAP TRACKS (DONE 2004-01-26 kate)
ssh kksilo
cd /cluster/data/panTro1
ssh hgwdev
hgGoldGapGl -noGl panTro1 /cluster/data/panTro1 .
# generate contig-level gaps from .gap files generated by scaffoldFaToAgp
ssh kksilo
cd /cluster/data/panTro1
~kate/bin/i386/liftUp assembly.chrom.gap jkStuff/scaffolds.lft warn assembly.scaffold.gap
# extract and merge contig-level and scaffold level gaps into .gap files
ssh hgwdev
cd /cluster/data/panTro1
cat > jkStuff/makeGaps.csh << 'EOF'
cd /cluster/data/panTro1
foreach c (?{,?})
echo "loading gaps for chr$c"
cd $c
if ( -e chr$c.agp ) then
sed -n "/^chr$c\t/p" ../assembly.chrom.gap | sort -n +1 > chr$c.frag.gap
grep 'N' chr$c.agp > chr$c.contig.gap
cat chr$c.frag.gap chr$c.contig.gap | sort -n +1 > chr$c.gap
endif
if ( -e chr${c}_random.agp ) then
grep "chr${c}_random" ../assembly.chrom.gap | sort -n +1 > chr${c}_random.frag.gap
grep 'N' chr${c}_random.agp > chr${c}_random.contig.gap
cat chr${c}_random.frag.gap chr${c}_random.contig.gap | sort -n +1 > chr${c}_random.gap
endif
rm chr*.frag.gap chr*.contig.gap
cd ..
end
'EOF'
csh jkStuff/makeGaps.csh > jkStuff/makeGaps.log &
tail -100f jkStuff/makeGaps.log
hgLoadGap panTro1 /cluster/data/panTro1
featureBits panTro1 gold
# 3109246583 bases of 3109246583 (100.000%) in intersection
featureBits panTro1 gap
# 1311128857 bases of 3109246583 (42.169%) in intersection
# REPEATMASKER TRACK (DONE 2004-01-21 kate)
# Split chrom into 5MB chunks
ssh kksilo
cd /cluster/data/panTro1
cat > jkStuff/splitChrom.csh << 'EOF'
foreach c (?{,?})
echo $c
if (-e $c/chr$c.fa) then
cp -p $c/chr$c.agp $c/chr$c.agp.bak
cp -p $c/chr$c.fa $c/chr$c.fa.bak
splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000
endif
if (-e $c/chr${c}_random.fa) then
cp -p $c/chr${c}_random.agp $c/chr${c}_random.agp.bak
cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak
splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . -nSize=5000000
mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
mv ${c}_random/lift/ordered.lft $c/lift/random.lft
mv ${c}_random/lift/ordered.lst $c/lift/random.lst
rmdir ${c}_random/lift
rm ${c}_random/chr${c}_random.{agp,fa}
rm -fr $c/chr${c}_random_1
rm -fr $c/chr${c}_random_2
mv ${c}_random/* $c
rmdir ${c}_random
endif
end
'EOF'
csh jkStuff/splitChrom.csh > jkStuff/splitChrom.log &
tail -100f jkStuff/splitChrom.log
# make liftall.lft
ssh kksilo
cd /cluster/data/panTro1
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Split these pseudo-contigs into 500Kb chunks
ssh kksilo
cd /cluster/data/panTro1
foreach d ( */chr*_?{,?} )
cd $d
echo "splitting $d"
set contig = $d:t
faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-maxN=500000
cd ../..
end
# Create RM script and cluster job
# NOTE: actual run was performed in two cluster jobs -- the second
# was RMRun.chimpLib, for the chimp-specific repeats
cd /cluster/data/panTro1
mkdir -p jkStuff
mkdir -p RMRun.chrom
cat << '_EOF_' > jkStuff/RMChimp.chrom
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/panTro1/$2
/bin/cp $2 /tmp/panTro1/$2
cd /tmp/panTro1/$2
# mask with chimp lib
/cluster/bluearc/RepeatMasker030619/RepeatMasker -s -ali -nolow -lib /cluster/bluearc/RepeatMasker030619/Libraries/chimp.lib $2
popd
/bin/cp /tmp/panTro1/$2/$2.out ./$2.chimp.out
if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.chimp.align
/bin/rm -f /tmp/panTro1/$2/*
cd $1
pushd .
/bin/cp $2 /tmp/panTro1/$2
cd /tmp/panTro1/$2
# mask with default primate lib
/cluster/bluearc/RepeatMasker030619/RepeatMasker -s -ali $2
popd
/bin/cp /tmp/panTro1/$2/$2.out ./$2.out
if (-e /tmp/panTro1/$2/$2.align) /bin/cp /tmp/panTro1/$2/$2.align ./$2.align
/bin/rm -fr /tmp/panTro1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/panTro1
'_EOF_'
chmod +x jkStuff/RMChimp.chrom
mkdir -p RMRun.chrom
rm -f RMRun.chrom/RMJobs
foreach d ( ?{,?}/chr*_?{,?} )
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}{,?}.fa )
set f = $f:t
echo /cluster/data/panTro1/jkStuff/RMChimp.chrom \
/cluster/data/panTro1/$d $f \
'{'check out line+ /cluster/data/panTro1/$d/$f.out'}' \
>> RMRun.chrom/RMJobs
end
end
#- Do the run
ssh kk
cd /cluster/data/panTro1/RMRun.chrom
para create RMJobs
para try, para check, para check, para push, para check,...
# additional run for chrUn in RMRun.chrUn (2004-01-24 kate)
# additional run for chr1_random in RMRun.chr1_random (2004-01-26 kate)
# 383 jobs written to batch
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kksilo
cd /cluster/data/panTro1
cat > jkStuff/liftPseudoContigOuts.csh << 'EOF'
foreach d ( ?{,?}/chr*_?{,?} )
cd $d
set contig = $d:t
echo $contig
liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out ${contig}_*.fa.chimp.out > /dev/null
cd ../..
end
'EOF'
csh jkStuff/liftPseudoContigOuts.csh >&! jkStuff/liftPseudoContigOuts.log &
tail -100f jkStuff/liftPseudoContigOuts.log
# Lift pseudo-contigs to chromosome level
cat > jkStuff/liftToChromOut.csh << 'EOF'
foreach i (?{,?})
echo lifting $i
cd $i
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$i.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null
else
echo "Can not find $i/lift/ordered.lft ."
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${i}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` > /dev/null
endif
cd ..
end
'EOF'
csh jkStuff/liftToChromOut.csh >&! jkStuff/liftToChromOut.log &
tail -100f jkStuff/liftToChromOut.log
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/panTro1
# ./jkStuff/dropSplitTable.csh rmsk
hgLoadOut panTro1 ?/*.fa.out ??/*.fa.out
# warnings: "Strange perc. field" in several chroms -- occurs w/ negative percent ins.
# 1_random, 2, 3, X,
featureBits panTro1 rmsk
# 1311281819 bases of 3109246583 (42.174%) in intersection
# SIMPLE REPEATS TRACK (DONE 2004-01-21 kate)
ssh kksilo
mkdir -p /cluster/data/panTro1/bed/simpleRepeat
cd /cluster/data/panTro1/bed/simpleRepeat
mkdir -p trf
rm -f jobs.csh
echo '#\!/bin/csh -fe' > jobs.csh
# create job list of 5MB chunks
foreach f \
(/cluster/data/panTro1/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}{,?}.fa \
/cluster/data/panTro1/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}{,?}.fa)
set fout = $f:t:r.bed
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
chmod +x jobs.csh
wc jobs.csh
# 773 4634 116496 jobs.csh
./jobs.csh >&! jobs.log &
# in bash: ./jobs.csh > jobs.log 2>&1 &
tail -f jobs.log
# Additional run for chrUn (2004-01-24 kate)
# Additional run for chr1_random (2004-01-26 kate)
# 27 jobs.chr1_random.csh
# When job is done lift output files
liftUp simpleRepeat.bed /cluster/data/panTro1/jkStuff/liftAll.lft warn trf/*.bed
# Load into the database
ssh hgwdev
cd /cluster/data/panTro1/bed/simpleRepeat
hgLoadBed panTro1 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 531061 elements of size 16
featureBits panTro1 simpleRepeat
# 53732632 bases of 3109246583 (1.728%) in intersection
featureBits panTro1 simpleRepeat
# 54320136 bases of 2865248791 (1.896%) in intersection
# MASK 5M PSEUDO-CONTIGS (DONE 2004-01-26 kate)
ssh kksilo
cd /cluster/data/panTro1
cat > jkStuff/maskContigs.csh << 'EOF'
foreach i (?{,?})
cd $i
echo masking $i pseudo-contigs
foreach j (chr*_*/chr${i}{,_random}_?{,?}.fa)
set c = $j:t:r
echo masking $j
/cluster/bin/i386/maskOutFa $j $j.out $j.rmsk -soft
/cluster/bin/i386/maskOutFa $j ../bed/simpleRepeat/trfMask/$c.bed $j.msk -softAdd
mv $j $j.nomask
mv $j.msk $j
/cluster/bin/i386/maskOutFa $j hard $j.hmsk
end
cd ..
end
'EOF'
csh jkStuff/maskContigs.csh > jkStuff/maskContigs.log &
tail -100f jkStuff/maskContigs.log
cat > jkStuff/hardMaskChrom.csh << 'EOF'
foreach i (?{,?})
cd $i
if (-e chr${i}.fa) then
echo hard-masking chr$i.fa
/cluster/bin/i386/maskOutFa chr$i.fa hard chr$i.fa.hmsk
endif
if (-e chr${i}_random.fa) then
echo hard-masking chr$i.fa
/cluster/bin/i386/maskOutFa chr${i}_random.fa hard \
chr${i}_random.fa.hmsk
endif
cd ..
end
'EOF'
csh jkStuff/hardMaskChrom.csh >&! jkStuff/hardMaskChrom.log &
tail -100f jkStuff/hardMaskChrom.log
# copy to bluearc for cluster runs
ssh kksilo
cd /cluster/data/panTro1
mkdir -p /cluster/bluearc/panTro1/trfFa
foreach d (*/chr*_?{,?})
cp $d/$d:t.fa /cluster/bluearc/panTro1/trfFa
end
# DOWNLOADS (DONE 2004-02-12 kate)
ssh hgwdev
cd /cluster/data/panTro1
# preliminary downloads of chromosomes for Evan Eichler's lab
# to verify
cat > jkStuff/zipChroms.csh << 'EOF'
cd /cluster/data/panTro1
set dir = /usr/local/apache/htdocs/goldenPath/panTro1/chromosomes
mkdir -p $dir
foreach f (?{,?}/*.fa)
echo "zipping $f"
set b = $f:t:r
nice zip -j $dir/${b}.fa.zip $f
end
'EOF'
csh jkStuff/zipChroms.csh >&! jkStuff/zipChroms.log &
tail -100f jkStuff/zipChroms.log
set dir = /usr/local/apache/htdocs/goldenPath/panTro1/chromosomes
cd $dir
md5sum *.zip > md5sum.txt
# copy a README from an existing assembly as a template, then edit
# (source path below is illustrative)
cp ../../hg16/chromosomes/README.txt .
# MORE HERE - 2/3/04 angie
ssh kksilo
cd /cluster/data/panTro1
mkdir zip
cat << '_EOF_' > jkStuff/zipOut.csh
rm -rf zip
mkdir zip
zip -j zip/chromOut.zip */chr*.fa.out
'_EOF_'
# << this line makes emacs coloring happy
csh ./jkStuff/zipOut.csh |& tee zip/zipOut.log
cd zip
#- Look at zipOut.log to make sure all file lists look reasonable.
#- Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
#- Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/panTro1/zip
set gp = /usr/local/apache/htdocs/goldenPath/panTro1
# create README.txt
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
cd $gp/bigZips
md5sum *.zip > md5sum.txt
# remake chrom quality files (bug fixed in chimpChromQuals that
# left out trailing zeroes for final gap)
# NOTE: not reloading quality track at this point
# (will do this later, if needed)
# 2004-07-06 kate
ssh kksilo
cd /cluster/data/panTro1
cat > jkStuff/chromQualsRedo.csh << 'EOF'
foreach c (?{,?})
echo "chr$c quality scores"
if (-e $c/chr$c.agp) then
chimpChromQuals $c/chr$c.agp scaffolds.qac $c/chr$c.qac
endif
if (-e $c/chr${c}_random.agp) then
chimpChromQuals $c/chr${c}_random.agp \
scaffolds.qac $c/chr${c}_random.qac
endif
end
'EOF'
#
csh jkStuff/chromQualsRedo.csh >& jkStuff/chromQualsRedo.log &
tail -100f jkStuff/chromQualsRedo.log
# quality scores
ssh kksilo
cd /cluster/data/panTro1
cat << '_EOF_' > jkStuff/zipQuals.csh
foreach f (?{,?}/*.qac)
echo $f
set b = $f:r
echo $b
qacToQa $f $b.qa
end
rm -rf zip
mkdir zip
echo "zipping zip/chromQuals.zip"
zip -j zip/chromQuals.zip ?{,?}/chr*.qa
'_EOF_'
# << this line makes emacs coloring happy
csh jkStuff/zipQuals.csh >&! zip/zipQuals.log &
tail -100f zip/zipQuals.log
#- Look at zipQuals.log to make sure all file lists look reasonable.
# cleanup uncompressed quality files
rm ?{,?}/chr*.qa
#- Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/panTro1/zip
set gp = /usr/local/apache/htdocs/goldenPath/panTro1
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
cd $gp/bigZips
md5sum *.zip > md5sum.txt
# other downloadables
ssh kksilo
cd /cluster/data/panTro1
cat << '_EOF_' > jkStuff/zipMost.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip ?{,?}/chr*.agp
zip -j zip/chromFa.zip ?{,?}/chr?{,?}{,_random}.fa
zip -j zip/chromFaMasked.zip ?{,?}/chr*.fa.hmsk
zip -j zip/scaffolds.lft.zip jkStuff/scaffolds.lft
'_EOF_'
# << this line makes emacs coloring happy
csh jkStuff/zipMost.csh >&! zip/zipMost.log &
tail -100f zip/zipMost.log
#- Look at zipMost.log to make sure all file lists look reasonable.
#- Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/panTro1/zip
set gp = /usr/local/apache/htdocs/goldenPath/panTro1
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
cd $gp/bigZips
md5sum *.zip > md5sum.txt
# AUTO UPDATE GENBANK MRNA RUN (IN PROGRESS - 2004-01-23 - kate)
# distribute masked nibs to cluster disks
ssh kk
mkdir -p /scratch/chimp/panTro1/nib
cp /cluster/data/panTro1/nib/chr*.nib /scratch/chimp/panTro1/nib
ssh eieio
cd /cluster/store5/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# chimp
panTro1.genome = /scratch/chimp/panTro1/nib/chr*.nib
panTro1.lift = /cluster/store6/panTro1/jkStuff/liftAll.lft
panTro1.downloadDir = panTro1
panTro1.genbank.est.xeno.load = yes
panTro1.refseq.mrna.xeno.load = yes
# NOTE: Mark D must add the new species to the auto-update code
ssh eieio
cd /cluster/bluearc/genbank/work
nice bin/gbAlignStep -iserver=no \
-srcDb=genbank -type=mrna -verbose=1 -initial panTro1
# Completed: 2945 of 2945 jobs
# CPU time in finished jobs: 21679s 361.32m 6.02h 0.25d 0.001 y
# IO & Wait Time: 14040s 234.00m 3.90h 0.16d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 65s 1.08m 0.02h 0.00d
# Submission to last job: 73s 1.22m 0.02h 0.00d
# Load the results from the above
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1
# To get this next one started, the above results need to be
# moved out of the way. These things can be removed if there are
# no problems to debug
ssh eieio
cd /cluster/bluearc/genbank/work
mv initial.panTro1 initial.panTro1.genbank.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -iserver=no \
-srcDb=refseq -type=mrna -verbose=1 -initial panTro1
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 panTro1
# the big EST alignment job
ssh eieio
cd /cluster/bluearc/genbank/work
mv initial.panTro1 initial.panTro1.refSeq.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial panTro1
# recovery
ssh kk
cd /cluster/bluearc/genbank/work/initial.panTro1/align
para push
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 \
-initial -continue=finish -iserver=no panTro1
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1
# GENSCAN PREDICTIONS (IN PROGRESS - 2004-02-11 kate)
ssh hgwdev
mkdir -p /cluster/data/panTro1/bed/genscan
cd /cluster/data/panTro1/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh kksilo
cd /cluster/data/panTro1/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir -p gtf pep subopt
# Generate a list file, genome.list, of all the pseudo-contigs
ls -1S /cluster/data/panTro1/?{,?}/chr*_*/chr*_*.fa.hmsk > genome.list
# don't use big cluster, due to limitation of memory and swap space on each
# processing node).
ssh kk9
cd /cluster/data/panTro1/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/home/kate/bin/RH7/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
# 568 jobs written to batch
para try
para check
para push
# DONE TO HERE
# NOTE: 2 jobs (chr6_7, chr19_10) failed with:
# "No overlap between a and b in mergeTwo" -- investigating,
# but continuing with the track
# Convert these to chromosome level files as so:
ssh kksilo
cd /cluster/data/panTro1/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/panTro1/bed/genscan
ldHgGene panTro1 genscan genscan.gtf
# Read 41471 transcripts in 303679 lines in 1 files
hgPepPred panTro1 generic genscanPep genscan.pep
hgLoadBed panTro1 genscanSubopt genscanSubopt.bed
# Loaded 535541 elements of size 6
# CPGISLANDS (DONE - 2004-01-26 - kate)
ssh kksilo
mkdir -p /cluster/data/panTro1/bed/cpgIsland
cd /cluster/data/panTro1/bed/cpgIsland
# Copy program as built for previous hg build:
mkdir cpg_dist
cp -p ~/hg15/bed/cpgIsland/cpg_dist/cpglh.exe ./cpg_dist
# This step used to read, but I do not immediately see the .tar
# file anywhere: (there is a copy in ~/rn3/bed/cpgIsland)
# Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
# copy the tar file to the current directory
# tar xvf cpg_dist.tar
# cd cpg_dist
# gcc readseq.c cpg_lh.c -o cpglh.exe
# cd ..
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
cat > ../../jkStuff/cpg.csh << 'EOF'
foreach f (../../?{,?}/chr?{,?}{,_random}.fa.hmsk)
set fout=$f:t:r:r.cpg
echo producing $fout...
./cpg_dist/cpglh.exe $f > $fout
end
'EOF'
csh ../../jkStuff/cpg.csh >&! ../../jkStuff/cpg.log &
tail -100f ../../jkStuff/cpg.log
# takes ~40 minutes
cat << '_EOF_' > filter.awk
# Input (tab separated): chr1\t1325\t3865\t754\tCpG: 183\t64.9\t0.7
# Transforms to (spaces below):
# chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7
{
width = $3-$2;
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n",
$1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/panTro1/bed/cpgIsland
hgLoadBed panTro1 cpgIsland -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed
# stringTab = 1
# Reading cpgIsland.bed
# Loaded 27596 elements
# Sorted
# Saving bed.tab
# Loading panTro1
# GC 5 BASE WIGGLE TRACK (DONE - 2004-01-28 - Hiram)
# working on kksilo, the fileserver for this data.
ssh kksilo
mkdir /cluster/data/panTro1/bed/gc5Base
cd /cluster/data/panTro1/bed/gc5Base
mkdir wigData5 dataLimits5
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 panTro1 ../../nib | grep -w GC | \
awk '
{
bases = $3 - $2
perCent = $5/10.0
printf "%d\t%.1f\n", $2+1, perCent
}' | wigAsciiToBinary -dataSpan=5 -chrom=${c} \
-wibFile=wigData5/gc5Base_${C} -name=${C} stdin > dataLimits5/${c}
echo "done ${c}"
done
# data is complete, load track on hgwdev
ssh hgwdev
cd /cluster/data/panTro1/bed/gc5Base
hgLoadWiggle panTro1 gc5Base wigData5/*.wig
# create symlinks for .wib files
mkdir /gbdb/panTro1/wib
ln -s `pwd`/wigData5/*.wib /gbdb/panTro1/wib
# the trackDb entry
track gc5Base
shortLabel GC Percent
longLabel GC Percent in 5 base windows
group map
priority 23.5
visibility hide
-autoScaleDefault Off
+autoScale Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
windowingFunction Mean
color 0,128,255
altColor 255,128,0
viewLimits 30:70
type wig 0 100
# GENERATE OOC FILE FOR CHIMP MRNA ALIGNMENTS (DONE 2004-01-30 kate)
# Note: have MarkD add ooc file to scripts, and redo chimp mrna alignments
ssh kksilo
cd /cluster/data/panTro1
ls ?{,?}/*.fa > jkStuff/chrom.lst
blat jkStuff/chrom.lst jkStuff/ooc.txt jkStuff/ooc.psl \
-tileSize=11 -makeOoc=11.ooc -repMatch=1024
cp 11.ooc /cluster/bluearc/hg/h/chimp11.ooc
cp 11.ooc /cluster/bluearc/panTro1/11.ooc
# redo chimp mrna's
ssh eieio
cd /cluster/bluearc/genbank/work
mv initial.panTro1 initial.panTro1.genbank.est
cd /cluster/data/genbank
nice bin/gbAlignStep -iserver=no \
-srcDb=genbank -type=mrna -verbose=1 -initial panTro1
# Load the results
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1
# restart EST's (process dropped out for some reason)
ssh eieio
cd /cluster/bluearc/genbank/work
mv initial.panTro1 initial.panTro1.genbank.mrna2
mv initial.panTro1.genbank.est initial.panTro1
cd /cluster/data/genbank
nice bin/gbAlignStep -iserver=no -continue=finish \
-srcDb=genbank -type=est -verbose=1 -initial panTro1
# CREATING QUALITY SCORE TRACK
# Create compressed scaffold quality file
ssh kksilo
cd /cluster/data/panTro1
zcat contigs.quals.gz | qaToQac stdin stdout | \
chimpChromQuals assembly.agp stdin scaffolds.qac
# qaToQac contigs.quals stdout | chimpChromQuals assembly.agp stdin scaffolds.qac
mkdir -p bed/qual/{wigData,dataLimits}
cat > jkStuff/chromQuals.csh << 'EOF'
foreach c (?{,?})
echo "chr$c quality scores"
if (-e $c/chr$c.agp) then
chimpChromQuals $c/chr$c.agp scaffolds.qac $c/chr$c.qac
qacToWig $c/chr$c.qac -name=chr$c stdout | \
wigAsciiToBinary -dataSpan=1 -chrom=chr$c \
-wibFile=bed/qual/wigData/qual_$c -name=$c stdin > \
bed/qual/dataLimits/chr$c
endif
if (-e $c/chr${c}_random.agp) then
chimpChromQuals $c/chr${c}_random.agp \
scaffolds.qac $c/chr${c}_random.qac
qacToWig $c/chr${c}_random.qac -name=chr${c}_random stdout | \
wigAsciiToBinary -dataSpan=1 -chrom=chr${c}_random \
-wibFile=bed/qual/wigData/qual_${c}r -name=${c}r stdin > \
bed/qual/dataLimits/chr${c}_random
endif
end
'EOF'
csh jkStuff/chromQuals.csh >& jkStuff/chromQuals.log &
tail -100f jkStuff/chromQuals.log
# takes 3-4 hours
# Note: can verify size of chrom qual files by comparing wc to
# chrom length -- subtracting out length of trailing gap when doing this
# as chimpChromQuals doesn't add quality scores for this
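# e.g. (sketch, chr1 shown; not part of the original run):
# qacToWig 1/chr1.qac -name=chr1 stdout | wc -l
# echo "select size from chromInfo where chrom='chr1'" | hgsql -N panTro1
# the wc count should equal the chrom size minus the trailing gap length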
# load wiggle into database
ssh hgwdev
cd /cluster/data/panTro1/bed/qual
hgLoadWiggle panTro1 quality wigData/*.wig
# create symlinks for .wib files
mkdir /gbdb/panTro1/wib
ln -s `pwd`/wigData/*.wib /gbdb/panTro1/wib
# the trackDb entry
track quality
shortLabel Quality Scores
longLabel Quality Scores
group map
priority 23.6
visibility hide
-autoScaleDefault Off
+autoScale Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
color 0,128,255
altColor 255,128,0
type wig 0 50
# This wiggle track needs some zoom tables to aid display on whole
# chromosome views
ssh kksilo
cd /cluster/data/panTro1
mkdir -p bed/qual/wigData1K
mkdir -p bed/qual/dataLimits1K
cat << '_EOF_' > jkStuff/qualZoom1K.sh
#!/bin/sh
for c in ? ??
do
if [ -f $c/chr${c}.qac ]; then
echo "chr${c} quality 1K zoom"
qacToWig $c/chr${c}.qac -name=chr${c} stdout |
wigZoom stdin | wigAsciiToBinary -dataSpan=1024 \
-chrom=chr${c} -wibFile=bed/qual/wigData1K/qc1K_${c} \
-name=${c} stdin > bed/qual/dataLimits1K/chr${c}
fi
if [ -f $c/chr${c}_random.qac ]; then
echo "chr${c}_random quality 1K zoom"
qacToWig $c/chr${c}_random.qac -name=chr${c}_random stdout |
wigZoom stdin | wigAsciiToBinary -dataSpan=1024 \
-chrom=chr${c}_random -wibFile=bed/qual/wigData1K/qc1K_${c}r \
-name=${c}_random stdin > bed/qual/dataLimits1K/chr${c}r
fi
done
'_EOF_'
chmod +x ./jkStuff/qualZoom1K.sh
time ./jkStuff/qualZoom1K.sh
# real 227m8.064s
# user 165m8.720s
# sys 3m58.780s
ssh hgwdev
cd /cluster/data/panTro1/bed/qual/wigData1K
# Do not need these for chrom M
rm -f qc1K_M.wig qc1K_M.wib qc1K_Mr.wig qc1K_Mr.wib
hgLoadWiggle -oldTable panTro1 quality *.wig
# create symlinks for .wib files
ln -s `pwd`/*.wib /gbdb/panTro1/wib
# HUMAN ALIGNMENTS (DONE 2004-02-11 kate)
ssh kksilo
cd /cluster/data/panTro1
# start with full chimp-reference chains from
# December Human/Chimp alignments (hg16/pt0, blastz+blat)
mkdir -p bed/blastz-blatHg16.pt0.swap
cd bed/blastz-blatHg16.pt0.swap
cp /cluster/data/pt0/bed/blastz-blatHg16/all.scaffold.chain .
# lift to chimp chromosome coordinates
# (need latest liftUp)
~kate/bin/i386/liftUp all.chain ../../jkStuff/scaffolds.lft \
warn all.scaffold.chain
# rechain to catch alignments spanning scaffold boundaries
ls -1S /cluster/data/panTro1/nib/*.nib > chimpNib.lst
ls -1S /cluster/data/hg16/nib/*.nib > humanNib.lst
chainToPsl all.chain ../../chrom.sizes /cluster/data/hg16/chrom.sizes \
chimpNib.lst humanNib.lst all.chain.psl
# in retrospect, I would have done a chainSplit first...
pslSortAcc nohead pslChain temp all.chain.psl
# Processed 9254050 lines into 38 temp files
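# the chainSplit-first approach mentioned above would look roughly like
# this (sketch; not what was actually run):
# chainSplit splitChain all.chain
# foreach f (splitChain/*.chain)
#   chainToPsl $f ../../chrom.sizes /cluster/data/hg16/chrom.sizes \
#       chimpNib.lst humanNib.lst pslChain/$f:t:r.psl
# end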
mkdir -p pslChain/run
cd pslChain/run
mkdir out chain
ssh kk
cd /cluster/data/panTro1
cd blastz-blatHg16.pt0.swap/pslChain/run
# copy nibs to iservers (seem to have disappeared from /scratch/chimp)
mkdir -p /iscratch/i/chimp/panTro1/nib
cp /cluster/bluearc/scratch/chimp/panTro1/nib/* \
/iscratch/i/chimp/panTro1/nib
iSync
ls -1S /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/pslChain/*.psl \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -psl $1 \
/scratch/chimp/panTro1/nib \
/iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# 52 jobs
para try
para push
# merge chains to give unique chain ID's in preparation for netting
ssh kksilo
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap
chainMergeSort pslChain/run/chain/*.chain | chainSplit netChain stdin
# NOTE: next time, save the merged chain file for later use
# (e.g. netChainSubset will need it)
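# e.g. (sketch):
# chainMergeSort pslChain/run/chain/*.chain > all.merged.chain
# chainSplit netChain all.merged.chain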
# load chains into browser
ssh hgwdev
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap/netChain
foreach i (*.chain)
set c = $i:r
hgLoadChain panTro1 ${c}_chainHg16 $i
echo done $c
end
# net the chains
ssh kksilo
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap/netChain
# needs Angie's mod to chainNet to suppress info messages
cat > net.csh << 'EOF'
chainMergeSort -saveId chr*.chain | \
chainPreNet stdin \
/cluster/data/panTro1/chrom.sizes \
/cluster/data/hg16/chrom.sizes stdout | \
~kate/bin/i386/chainNet stdin -minSpace=1 \
/cluster/data/panTro1/chrom.sizes \
/cluster/data/hg16/chrom.sizes stdout /dev/null | \
netSyntenic stdin hNoClass.net
'EOF'
# << this line makes emacs coloring happy
csh net.csh >&! net.log &
tail -100f net.log
# create all.chain file with new ID's, for swapping to human browser
chainMergeSort -saveId chr*.chain > ../all.newId.chain
ssh hgwdev
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap/netChain
netClass hNoClass.net panTro1 hg16 ../human.net
# load net into the browser
ssh hgwdev
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap
hgLoadNet -warn panTro1 netHg16 human.net
rm -r netChain/hNoClass.net
# 8/23/04 move aside pre-re-chaining files to avoid confusion (angie)
mkdir preChain
mv all.scaffold.chain all.chain humanNib.lst chimpNib.lst all.chain.psl \
preChain/
# CHAIN FILES FROM THE NET (DONE kate)
# for downloads area
ssh kksilo
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap
chainMergeSort -saveId *.chain | \
netChainSubset human.net stdin netChain/human.net.chain
chainSplit netChainSubset netChain/human.net.chain
ssh hgwdev
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap/netChainSubset
set dir = /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16/netChain
# add README to vsHg16 dir.
mkdir -p $dir
cp *.chain $dir
cd $dir
gzip *.chain
md5sum *.gz > md5sum.txt
cp ~/kent/src/hg/mouseStuff/chainFormat.doc .
# AXT FILES FROM THE NET (DONE kate)
# for a "Human Best" track that displays alignments at
# base level, and for download
ssh kksilo
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap
mkdir net axtNet
netSplit human.net net
cd axtNet
cat > makeAxt.csh << 'EOF'
cd ../net
foreach f (*.net)
set i = $f:r
echo $i.net
netToAxt $i.net ../netChain/$i.chain \
/cluster/data/panTro1/nib /cluster/data/hg16/nib \
../axtNet/$i.axt
end
'EOF'
# << this line makes emacs coloring happy
csh -f makeAxt.csh >&! makeAxt.log &
tail -100f makeAxt.log
# save in /gbdb and load into database
ssh hgwdev
mkdir -p /gbdb/panTro1/axtNetHg16
cd /gbdb/panTro1/axtNetHg16
foreach f (/cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/axtNet/*.axt)
ln -s $f /gbdb/panTro1/axtNetHg16
end
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap/axtNet
hgLoadAxt panTro1 axtNetHg16
# copy to download area
ssh kksilo
cd /cluster/data/panTro1
cd bed/blastz-blatHg16.pt0.swap/axtNet
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16/axtNet
cd /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16/axtNet
cp /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/axtNet/*.axt .
gzip *.axt
md5sum *.gz > md5sum.txt
# create README.txt
# add to axtInfo table for Comparative Sequence link on gene pred tracks
# (2004-04-27 kate)
ssh hgwdev
hgsql panTro1 < ~/kent/src/hg/lib/axtInfo.sql
foreach f (/gbdb/panTro1/axtNetHg16/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('hg16','Reciprocal Best','$chr','$f');" | hgsql panTro1
end
# Genbank ESTs completed, but needed to redo native ESTs/mRNAs with
# new ooc file.
# Remove native ESTs:
cd /cluster/data/genbank
find aligned/genbank.139.0/panTro1 -name 'est.*.native.*' |xargs rm -f
# realign native ESTs
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial panTro1&
# Reload mRNAs and load ESTS
ssh hgwdev
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad panTro1
# HUMAN REFSEQ MAPPINGS (Writeup incomplete, in progress jk)
liftOver ../../../bedOverHg16/refGene2/hg16.refGene.gtf ../../humanTarget.chain new.gtf unmapped.gtf -minMatch=0.5 -gff
ldHgGene panTro1 mappedRefSeq new.gtf
# HUMAN DELETIONS (kpollard, finished 2/19/04)
# 80-12000 bp indels in human/chimp alignments (from Tarjei's files)
#data
mkdir -p /cluster/data/panTro1/bed/indels
cd /cluster/data/panTro1/bed/indels
#ftp the file indels.human.chimp.tar.gz from ftp.broad.mit.edu
gunzip indels.human.chimp.tar.gz
tar -xvf indels.human.chimp.tar
mkdir arachne
mv indels_arachne.* arachne
gzip arachne/indels_arachne.chimp.fa
gzip arachne/indels_arachne.human.fa
#make .bed files from Tarjei's .fa files
/cluster/bin/i386/faSize detailed=on indels.chimp.fa > chimp.all.txt
/cluster/bin/i386/faSimplify -prefix="scaffold_" indels.chimp.fa _ , temp.fa
/cluster/bin/i386/faSize detailed=on temp.fa > chimp.scaff.txt
cat /cluster/data/panTro1/jkStuff/scaffolds.lft | gawk '{print $2"\t"$6}' > scaffStrand.txt
R
#commands in R
scaff<-read.table("chimp.scaff.txt") #read in scaffold and size
scaff.info<-read.table("scaffStrand.txt") #read in scaffold strand info
all<-read.table("chimp.all.txt") #read in full .fa header and size
long<-as.character(all[,1]) #extract full .fa header
longsplit<-matrix(unlist(strsplit(long,",")),nc=4,byrow=TRUE) #split
end<-cbind(as.numeric(longsplit[,4]),as.numeric(all[,2])) #end and size
both<-cbind(scaff,end) #concatenate scaff size end size
sum(both[,2]!=both[,4]) #check sizes agree
dimnames(both)<-list(1:18395,c("scaff","size","start","stop")) #name columns of both
dimnames(scaff.info)<-list(1:37931,c("scaff","strand")) #name rows and columns of scaff.info
both.strand<-merge(both,scaff.info,by="scaff",sort=FALSE) #merge scaff.info and both based on the column scaff
attach(both.strand) #attach dataframe
out<-both.strand #create data set out
out[strand=="+","stop"]<-start[strand=="+"]+size[strand=="+"] #stop for + strands
out[strand=="-","stop"]<-start[strand=="-"]-size[strand=="-"] #stop for - strands
out[strand=="-",]<-out[strand=="-",c(1,2,4,3,5)] #rearrange columns for - strands so start comes before stop
out<-out[,c(1,3,4,2)] #reorder columns: scaff start stop size
out[,4]<-paste("HD",1:length(out[,4]),"_",both[,4],sep="") #make name like HDN_size
write(t(out),"indels.chimp.new.bed",ncol=4)
q()
#back in shell: map to chromosomes
cat indels.chimp.new.bed | gawk '{print $1"\t"$2"\t"$3"\t"$4}' > indels.chimp.tab.bed
/cluster/bin/i386/liftUp -type=.bed indels.chimp.chr.bed /cluster/data/panTro1/jkStuff/scaffolds.lft warn indels.chimp.tab.bed
#load track into chimp browser
mkdir -p /gbdb/panTro1/humanDels
ln -s /cluster/data/panTro1/bed/indels/indels.chimp.chr.bed /gbdb/panTro1/humanDels
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/hgLoadBed panTro1 humanDels indels.chimp.chr.bed
#add description file humanDels.html
# to ~/kent/src/hg/makeDb/trackDb/chimp/panTro1
#add a track entry to trackDb.ra
# in ~/kent/src/hg/makeDb/trackDb/chimp/panTro1
# HUMAN REFSEQ ALIGNMENTS (markd)
# Added hack in genbank alignment code so that the xeno refseq track
# consists only of human refseqs. This had to be reloaded due to
# query type getting lost.
# TWINSCAN (genes: baertsch 2004-03-11, proteins: kate 2004-04-22)
# load twinscan predictions without filtering pseudogenes baertsch 3/11/04
# loaded frame info
cd /cluster/data/panTro1/bed
mkdir -p twinscan
cd twinscan
wget http://genes.cs.wustl.edu/predictions/chimp/panTro1_vs_mm4/chimp_panTro1_vs_mm4.tgz
tar xvfz *.tgz
cd chr_gtf
# load gene predictions
ldHgGene panTro1 twinscan chr*.gtf -gtf -frame -geneName
# load predicted proteins
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
hgPepPred panTro1 generic twinscanPep chr_ptx/chr*-fixed.fa
# GENEID GENES (2004-03-16 kate)
ssh hgwdev
mkdir -p /cluster/data/panTro1/bed/geneid/download
cd /cluster/data/panTro1/bed/geneid/download
# Now download *.gtf and *.prot from
set dir = genome.imim.es/genepredictions/P.troglodytes/golden_path_200311/geneid_v1.1/
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 X Y Un)
wget http://$dir/chr$c.gtf
wget http://$dir/chr${c}_random.gtf
wget http://$dir/chr$c.prot
wget http://$dir/chr${c}_random.prot
end
wget http://$dir/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
echo "done $f"
end
cd ..
#ldHgGene panTro1 geneid download/*.gtf -exon=CDS
#Read 34198 transcripts in 279022 lines in 51 files
# Compare to Hg16 was: Read 32255 transcripts in 281180 lines
# in 40 files
#34198 groups 51 seqs 1 sources 3 feature types
#34198 gene predictions
ldHgGene panTro1 geneid download/*.gtf -gtf
hgPepPred panTro1 generic geneidPep download/*-fixed.prot
# reload chr1 (bad original data) (2004-04-06 kate)
set dir = genome.imim.es/genepredictions/P.troglodytes/golden_path_200311/geneid_v1.1/
set c = 1
wget http://$dir/chr$c.gtf
wget http://$dir/chr$c.prot
set f = chr1.prot
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
cd ..
ldHgGene panTro1 geneid download/*.gtf -gtf
# 32432 gene predictions
hgPepPred panTro1 generic geneidPep download/*-fixed.prot
# REVISED HUMAN DELETIONS (kpollard, 4/13/04)
# new file emailed from Tarjei: humanDels.new.dat
# with corrected coordinates (based on chimp agp)
# saved in /cluster/data/panTro1/bed/indels
cd /cluster/data/panTro1/bed/indels
#has chr start end, need to add names
pr -t -n humanDels.new.dat > temp
cat temp | gawk '{size=$4 - $3; print $2"\t"$3"\t"$4"\t""HD"$1"_"size}' > humanDels.new.bed
#load track into chimp browser
ln -s /cluster/data/panTro1/bed/indels/humanDels.new.bed /gbdb/panTro1/humanDels
hgsql panTro1
delete from humanDels;
exit
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/hgLoadBed panTro1 humanDels humanDels.new.bed
# BAC Ends (2004-04-22 kate)
# download BAC ends from NCBI. The chimp set appears to
# be from Riken, Japan (source "dbj", DNA Data Bank of Japan).
# Ref: Fujiyama et al., Science, Jan 4, 2002
ssh eieio
mkdir -p /cluster/data/ncbi/bacends/chimp
cd /cluster/data/ncbi/bacends/chimp
mkdir bacends.chimp.1
cd bacends.chimp.1
wget ftp.ncbi.nih.gov/genomes/BACENDS/pan_troglodytes/AllBACends.mfa.gz
wget ftp.ncbi.nih.gov/genomes/BACENDS/pan_troglodytes/cl_acc_gi_len.gz
# problems with wget -- downloaded from browser
gunzip *.gz
# create pairs info file
/cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len
# 78846 pairs and 896 singles
# create sequence file
# edit convert.pl to allow "dbj" source instead of "gb"
# (DNA Data Bank of Japan vs. GenBank)
cp /cluster/store1/bacends.4/convert.pl .
convert.pl < AllBACends.mfa > BACends.fa
faCount BACends.fa
ssh hgwdev
mkdir -p /gbdb/panTro1/bacends
cd /gbdb/panTro1/bacends
ln -s /cluster/data/ncbi/bacends/chimp/bacends.chimp.1/BACends.fa BACends.chimp.fa
# 123470915 bases, 158588 sequences
# split to cluster dir -- break into 10 files for a total
# of 10 * ~38K (#scaffolds) = ~380K jobs
ssh eieio
cd /cluster/data/ncbi/bacends/chimp/bacends.chimp.1
mkdir -p /iscratch/i/chimp/bacends.1
# split sequence file and copy for cluster access
faSplit sequence BACends.fa 10 /iscratch/i/chimp/bacends.1/
ssh kkr1u00
/cluster/bin/iSync
# split scaffolds file for cluster access
cd /cluster/bluearc/panTro1
mkdir -p scaffolds
faSplit byName scaffolds.msk.fa scaffolds/
# blat vs. unmasked scaffolds
# use /cluster/bluearc/booch/bacends/Makefile
# edit bacends.lst
ssh kk
cd /cluster/data/panTro1/bed/bacends
mkdir run
cd run
ls -1S /cluster/bluearc/panTro1/scaffolds | sed 's.^./cluster/bluearc/panTro1/scaffolds/.' > scaffolds.lst
ls /iscratch/i/chimp/bacends.1/*.fa > bacends.lst
mkdir ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/panTro1/bed/bacends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
gensub2 scaffolds.lst bacends.lst gsub jobList
foreach f (`cat bacends.lst`)
set d = $f:r:t
echo $d
mkdir -p /cluster/data/panTro1/bed/bacends/out/$d
end
para create jobList
# 379310 jobs
para try
para check
para push
# Note -- these run quickly!
# Longest job: 595s 9.92m
# lift alignments
ssh kksilo
cd /cluster/data/panTro1/bed/bacends
pslSort dirs raw.psl temp out/??
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null
#Processed 16577599 alignments
mkdir lifted
liftUp lifted/bacEnds.lifted.psl \
/cluster/data/panTro1/jkStuff/scaffolds.lft warn bacEnds.psl
pslSort dirs bacEnds.sorted.psl temp lifted
rmdir temp
# 520377 bacEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/bacends/chimp/bacends.chimp.1
pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=600000 -slopval=10000 -hardMax=750000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
# 26805 bacEnds.pairs
# 96 w/ score=750
#   982 bacEnds.short
#     9 bacEnds.long
#  1179 bacEnds.slop
# 29071 bacEnds.orphan
#   190 bacEnds.mismatch
# 158588 sequences
# compared to hg16
# 203386 bacEnds.pairs
# 1524 w/ score=750
# 396 w/ score=500
# 234 w/ score=250
# 96 w/ score=125
#   6839 bacEnds.short
#     57 bacEnds.long
#   4388 bacEnds.slop
#  70851 bacEnds.orphan
#   1404 bacEnds.mismatch
# 603416 sequences
# TODO: replace w/ awk & sort
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
# note: need "rdb"
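# the awk/sort replacement suggested in the TODO above could look like
# this (sketch, untested; score is column 5 of the pairs files):
# awk -F'\t' '$5 >= 300' bacEnds.pairs \
#   | sort -k1,1 -k2,2n > bacEndPairs.bed
# cat bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
#   bacEnds.orphan | awk -F'\t' '$5 >= 300' \
#   | sort -k1,1 -k2,2n > bacEndPairsBad.bed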
# note: tune pslPairs variables for draft chimp
# ~150Kbp total length of clone for BAC's
# 50Kbp - 600Kbp for early human
# 25Kbp - 350Kbp for finished human
# minimize # falling into slop, short, long
# try hardMax 750K. tryout min/max variants
# try to get as many as possible, uniquely
# note - this track isn't pushed to RR, just used for assembly QA
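# one way to explore the min/max space noted above (sketch; directory
# names and values are illustrative, not what was actually run):
# foreach max (350000 600000 750000)
#   mkdir -p try$max
#   pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=$max \
#       -slopval=10000 -hardMax=750000 -slop -short -long -orphan \
#       -mismatch -verbose bacEnds.sorted.psl \
#       $ncbiDir/bacEndPairs.txt all_bacends try$max/bacEnds
#   wc -l try$max/bacEnds.pairs try$max/bacEnds.short try$max/bacEnds.long
# end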
# create table of all bac end alignments
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/panTro1/bed/bacends
hgLoadBed panTro1 bacEndPairs bacEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
# Loaded 26805
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed panTro1 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 31396
#hgLoadPsl panTro1 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl panTro1 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 441072 record(s), 0 row(s) skipped, 30 warning(s) loading psl.tab
# load BAC end sequences
#hgLoadSeq panTro1 /gbdb/ncbi/bacends/chimp/bacends.chimp.1/BACends.chimp.fa
hgLoadSeq panTro1 /gbdb/panTro1/bacends/BACends.chimp.fa
# 158588 sequences
# for analysis, you might want to run:
make bacEndBlock.bed
featureBits panTro1 bacEndPairs.bed \!gap -bed=bacEndBlocks.bed -noRandom
# 1791977681 bases of 2412409802 (74.282%) in intersection
# human hg16: 2827808812 bases of 2843433602 (99.450%) in intersection
featureBits panTro1 bacEndPairs.bed -bed=bacEndBlocksGap.bed -noRandom
# 2007151617 bases of 2412409802 (83.201%) in intersection
# human hg16: 2836036212 bases of 2843433602 (99.740%) in intersection
# added searchSpec for bacEndPairs to include chimp clone prefixes
# CONTIGS TRACK (2004-04-23 kate)
# make ctgPos (contig name, size, chrom, chromStart, chromEnd) from lift
ssh kksilo
cd /cluster/data/panTro1/bed
mkdir ctgPos
cd ctgPos
awk 'BEGIN {OFS="\t"} {print $4, $1, $3+$1, $2}' ../../assembly.lft \
> contig-scaffold.bed
liftUp contig-chrom.bed ../../jkStuff/scaffolds.lft warn contig-scaffold.bed
awk 'BEGIN {OFS="\t"} {print $4, $3-$2, $1, $2, $3}' contig-chrom.bed \
> ctgPos.tab
ssh hgwdev
cd /cluster/data/panTro1/bed/ctgPos
echo "drop table ctgPos" | hgsql panTro1
cat ~/kent/src/hg/lib/ctgPos.sql \
| sed 's/contig(12)/contig/' | hgsql panTro1
#hgsql panTro1 < ~/kent/src/hg/lib/ctgPos.sql
echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql panTro1
# FUGU BLAT ALIGNMENTS (2004-04-27 kate)
# reloaded table w/ new index 2004-05-13 kate)
ssh kk
mkdir /cluster/data/panTro1/bed/blatFr1
cd /cluster/data/panTro1/bed/blatFr1
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/chimp/panTro1/nib/*.nib > chimp.lst
cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
mkdir psl
gensub2 chimp.lst fugu.lst gsub spec
para create spec
para try, check, push, check, ...
#CPU time in finished jobs: 6324220s 105403.66m 1756.73h 73.20d 0.201 y
#IO & Wait Time: 2344146s 39069.11m 651.15h 27.13d 0.074 y
#Average job time: 288s 4.81m 0.08h 0.00d
#Longest job: 14817s 246.95m 4.12h 0.17d
#Submission to last job: 51381s 856.35m 14.27h 0.59d
# Sort alignments:
ssh kksilo
cd /cluster/data/panTro1/bed/blatFr1
pslCat -dir psl | pslSortAcc nohead chrom temp stdin
# Processed 782670 lines into 4 temp files
# lift query side to Fugu browser chrUn coordinates
liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
# load into database:
ssh hgwdev
cd /cluster/data/panTro1/bed/blatFr1
hgLoadPsl -fastLoad -table=blatFr1 panTro1 all.psl
# load problem with 2 rows -- qBaseInsert values were negative,
# and were set to 0 when loaded. Blat problem ?
# add to trackDb as type xeno psl fr1, with colorChromDefault off
# Reciprocal Best chain files for download
# just swap the human-reference file generated in makeHg16.doc
ssh kksilo
cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap
chainSwap /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBest.chain \
stdout | chainMergeSort stdin > rBest.chain
ssh hgwdev
cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap
set dir = /usr/local/apache/htdocs/goldenPath/panTro1/vsHg16
mkdir -p $dir
cp -p rBest.chain $dir/chimp.best.chain
cd $dir
gzip *.chain
# create md5sum.txt and README's
# split and load into table for QA purposes
ssh kksilo
cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap
chainSplit rBest rBest.chain
ssh hgwdev
cd /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap
cd rBest
foreach i (*.chain)
set c = $i:r
hgLoadChain panTro1 ${c}_rBestChainHg16 $i
echo done $c
end
# LOAD ENSEMBL GENES (DONE 6/16/04 angie)
ssh kksilo
mkdir /cluster/data/panTro1/bed/ensembl
cd /cluster/data/panTro1/bed/ensembl
# Get the ensembl gene data from
# http://www.ensembl.org/Pan_troglodytes/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Pan_troglodytes choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensembl.gff.gz
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
gunzip -c ensembl.gff.gz \
| perl -wpe 's/^([0-9]+|E\w+|[XY]|Un(_random)?)/chr$1/ \
|| die "Line $. doesnt start with chimp chrom:\n$_"' \
> ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip -c ensemblPep.fa.gz \
| perl -wpe 's/^(>ENSPTRT\d+\.\d+).*/$1/' \
> ensPep.fa
# load up:
ssh hgwdev
cd /cluster/data/panTro1/bed/ensembl
ldHgGene -gtf -genePredExt panTro1 ensGene \
/cluster/data/panTro1/bed/ensembl/ensGene.gtf
hgsql panTro1 < ~/kent/src/hg/lib/ensGtp.sql
hgsql panTro1 -e 'load data local infile "ensGtp.txt" into table ensGtp'
hgPepPred panTro1 generic ensPep ensPep.fa
# MAP ENCODE ORTHOLOG REGIONS -- FREEZE 1 (DONE 2004-07-09 kate)
ssh kksilo
mkdir /cluster/data/panTro1/bed/encodeRegions
cd /cluster/data/panTro1/bed/encodeRegions
# at .60, mapped 42, at .50 mapped all 44
liftOver -minMatch=.50 \
/cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
/cluster/data/hg16/bed/liftOver/hg16ToPanTro1.newId.over.chain \
encodeRegions.bed encodeRegions.unmapped
wc -l *
# 44 encodeRegions.bed
# 0 encodeRegions.unmapped
# 44 total
ssh hgwdev
cd /cluster/data/panTro1/bed/encodeRegions
hgLoadBed panTro1 encodeRegionsNew encodeRegions.bed -noBin
# TODO: remove this stuff
# swap hg16->panTro1 chains for display in chimp browser (temporary, for ENCODE development)
ssh kolossus
cd /cluster/data/hg16/bed/liftOver
chainSwap hg16TopanTro1.chain panTro1ToHg16.swp.chain
# TODO: remove this stuff
ssh hgwdev
cd /cluster/data/hg16/bed/liftOver
hgLoadChain panTro1 liftOverHg16SwpChain panTro1ToHg16.swp.chain
# TODO: remove this stuff
# also load "all chains" liftOver chain (not reciprocal best)
ssh kolossus
cd /cluster/data/hg16/bed/liftOver
chainSwap /cluster/data/hg16/blat-blastz.panTro1/over.chain \
panTro1ToHg16.all.swp.chain
ssh hgwdev
cd /cluster/data/hg16/bed/liftOver
hgLoadChain panTro1 liftOverHg16SwpAllChain panTro1ToHg16.all.swp.chain
# TODO: remove this stuff too
ssh kolossus
cd /cluster/data/hg16/bed/liftOver
chainSwap /cluster/data/hg16/bed/blastz-blat.panTro1/over.newId.chain \
panTro1ToHg16.newId.swp.chain
ssh hgwdev
cd /cluster/data/hg16/bed/liftOver
hgLoadChain panTro1 liftOverHg16SwpNewIdChain panTro1ToHg16.newId.swp.chain
# ACCESSIONS FOR CONTIGS (DONE 2004-08-27 kate)
# Used for Scaffold track details and ENCODE AGP's
ssh hgwdev
cd /cluster/data/panTro1/bed
mkdir -p contigAcc
cd contigAcc
# extract WGS contig to accession mapping from Entrez Nucleotide summaries
# To get the summary file, access the Genbank page for the project
# by searching:
# genus[ORGN] AND WGS[KYWD]
# At the end of the page, click on the list of accessions for the contigs.
# Select summary format, and send to file.
# output to file <species>.acc
contigAccession chimp.acc > contigAcc.tab
wc -l contigAcc.tab
# 361864 contigAcc.tab
# extract all records from chrN_gold, using table browser
# (includes header line)
faSize /cluster/data/panTro1/contigs.bases
        # 361864 sequences -- contig count matches the contigAcc.tab row count
hgsql panTro1 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
hgsql panTro1
hgsql panTro1 -e "SELECT COUNT(*) FROM contigAcc"
# 361864
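    # optional cross-check (sketch; assumes the contigAcc table columns
    # are named contig and acc): every contig in contigs.bases should
    # have an accession, so the comm below should print nothing
    grep '^>' /cluster/data/panTro1/contigs.bases \
        | sed -e 's/^>//; s/ .*//' | sort > /tmp/contig.names
    hgsql panTro1 -N -e 'select contig from contigAcc' | sort \
        | comm -3 /tmp/contig.names -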
# HUMAN HG17 NET AXT'S FOR DOWNLOAD (2005-02-11 kate)
# By user request
ssh kolossus
    # kksilo currently off-limits for logins
cd /cluster/data/panTro1/bed
mkdir -p blastz.hg17.swp
cd blastz.hg17.swp
mkdir axtChain
cd axtChain
cp /cluster/data/hg17/bed/blastz.panTro1/axtChain/chimp.net .
netSplit chimp.net chimpNet
chainSwap /cluster/data/hg17/bed/blastz.panTro1/axtChain/all.chain all.chain
chainSplit chain all.chain
mkdir -p ../axtNet
cat > makeAxt.csh << 'EOF'
foreach f (chimpNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt chimpNet/$c.net chain/$c.chain /cluster/data/panTro1/nib /cluster/data/hg17/nib stdout | axtSort stdin ../axtNet/$c.axt
end
'EOF'
csh makeAxt.csh >&! makeAxt.log &
tail -100f makeAxt.log
cd ../axtNet
nice gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/panTro1
mkdir -p vsHg17
cd vsHg17
mkdir -p axtNet
nice cp -rp /cluster/data/panTro1/bed/blastz.hg17.swp/axtNet/*.axt.gz axtNet
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-23 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzPanTro1.2006-02-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits panTro1 chainMm8Link
# 901976621 bases of 2733948177 (32.992%) in intersection
############################################################################
# LIFTOVER CHAINS TO PANTRO2
# Split (using makeLoChain-split) of panTro2 is doc'ed in makePanTro2.doc
# Do what makeLoChain-split says to do next (start blat alignment)
ssh kk
cd /cluster/data/panTro1/bed/liftOver
makeLoChain-align panTro1 /scratch/hg/panTro1/nib panTro2 \
/iscratch/i/panTro2/split10k \
/cluster/bluearc/panTro2/11.ooc >&! align.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/panTro1/bed/blat.panTro2.2006-04-04/run
para shove
para time >&! run.time
#CPU time in finished jobs: 906023s 15100.39m 251.67h 10.49d 0.029 y
#IO & Wait Time: 22074s 367.90m 6.13h 0.26d 0.001 y
#Average job time: 343s 5.72m 0.10h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 4260s 71.00m 1.18h 0.05d
#Submission to last job: 4965s 82.75m 1.38h 0.06d
# lift alignments
ssh kkr1u00
cd /cluster/data/panTro1/bed/liftOver
makeLoChain-lift panTro1 panTro2 >&! lift.log &
# chain alignments
ssh kki
cd /cluster/data/panTro1/bed/liftOver
makeLoChain-chain panTro1 /scratch/hg/panTro1/nib \
panTro2 /scratch/hg/panTro2/nib >&! chain.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/panTro1/bed/blat.panTro2.2006-04-04/chainRun
para shove
para time >&! run.time
#CPU time in finished jobs: 3884s 64.73m 1.08h 0.04d 0.000 y
#IO & Wait Time: 594s 9.91m 0.17h 0.01d 0.000 y
#Average job time: 86s 1.44m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 245s 4.08m 0.07h 0.00d
#Submission to last job: 401s 6.68m 0.11h 0.00d
# net alignment chains
ssh kkstore03
cd /cluster/data/panTro1/bed/liftOver
makeLoChain-net panTro1 panTro2 >&! net.log &
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/panTro1/bed/liftOver
makeLoChain-load panTro1 panTro2 >&! load.log &
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
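    # e.g. (sketch; the chain path below is the conventional /gbdb
    # symlink created by makeLoChain-load):
    printf 'chr1\t1000000\t1001000\n' > /tmp/convTest.bed
    liftOver /tmp/convTest.bed \
        /gbdb/panTro1/liftOver/panTro1ToPanTro2.over.chain.gz \
        /tmp/convTest.panTro2.bed /tmp/convTest.unmapped
    cat /tmp/convTest.panTro2.bed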
#######################################################################
# 2bit file to filesystems for kluster runs (2006-07-11 kate)
ssh kkr1u00
mkdir /iscratch/i/panTro1
cd /iscratch/i/panTro1
cp -p /cluster/data/panTro1/panTro1.2bit .
cp -p /cluster/data/panTro1/chrom.sizes .
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/panTro1/ kkr${R}u00:/iscratch/i/panTro1/
end
    # request push to /scratch/hg from cluster admins?
# GOT HERE -- need pk and SAN to be up to complete
ssh pk
mkdir /san/sanvol1/scratch/panTro1
cd /san/sanvol1/scratch/panTro1
cp -p /cluster/data/panTro1/panTro1.2bit .
cp -p /cluster/data/panTro1/chrom.sizes .
#######################################################################
# dbSNP BUILD 125 (Heather, August 2006)
# Our panTro1 has a ctgPos but it doesn't match the dbSNP ContigInfo,
# so I worked just with non-random chrom coords as provided in ContigLoc.
# dbSNP is using the new convention for chimp chroms, so I translated them:
# chr2A --> chr12
# chr2B --> chr13
# chr3 --> chr2
# chr4 --> chr3
# chr5 --> chr4
# chr6 --> chr5
# chr7 --> chr6
# chr8 --> chr7
# chr9 --> chr11
# chr10 --> chr8
# chr11 --> chr9
# chr12 --> chr10
# chr13 --> chr14
# chr14 --> chr15
# chr15 --> chr16
# chr16 --> chr18
# chr17 --> chr19
# chr18 --> chr17
# chr19 --> chr20
# chr20 --> chr21
# chr21 --> chr22
# chr22 --> chr23
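# (Illustrative sketch, not part of the original run: the same mapping,
# dbSNP names --> panTro1 names, as a single-pass awk over column 1 of a
# tab-separated file.  Because each line is translated with one lookup,
# the chr9<->chr11 style swaps cannot collide here.  The script name is
# hypothetical.)
cat << '_EOF_' > chimpChromMap.awk
BEGIN {
    FS = OFS = "\t";
    m["chr2A"]="chr12"; m["chr2B"]="chr13"; m["chr3"]="chr2";
    m["chr4"]="chr3";   m["chr5"]="chr4";   m["chr6"]="chr5";
    m["chr7"]="chr6";   m["chr8"]="chr7";   m["chr9"]="chr11";
    m["chr10"]="chr8";  m["chr11"]="chr9";  m["chr12"]="chr10";
    m["chr13"]="chr14"; m["chr14"]="chr15"; m["chr15"]="chr16";
    m["chr16"]="chr18"; m["chr17"]="chr19"; m["chr18"]="chr17";
    m["chr19"]="chr20"; m["chr20"]="chr21"; m["chr21"]="chr22";
    m["chr22"]="chr23";
}
{ if ($1 in m) $1 = m[$1]; print }
'_EOF_'
# usage: awk -f chimpChromMap.awk in.bed > out.bed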
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598
mkdir data
mkdir schema
mkdir rs_fasta
# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/chimpanzee_9598/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_1_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_1_1.bcp.gz
get b125_ContigInfo_1_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_1_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz
# Get schema from NCBI
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/chimpanzee_9598/database/organism_schema
get chimpanzee_9598_table.sql.gz
# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/chimpanzee_9598/rs_fasta
prompt
mget *.gz
# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/125/chimp/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/125/chimp/rs_fasta
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas.gz)
echo $file
zcat $file >> rsAll.fas
end
'_EOF_'
    csh concat.csh
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/panTro1/snp
ln -s /cluster/store12/snp/125/chimp/rs_fasta/snp.fa /gbdb/panTro1/snp/snp.fa
cd /cluster/store12/snp/125/chimp/rs_fasta
    hgLoadSeq panTro1 /gbdb/panTro1/snp/snp.fa
# look up id in extFile
# move into separate table
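    # e.g. (sketch): find the extFile id referenced below
    hgsql panTro1 -N -e \
      'select id from extFile where path = "/gbdb/panTro1/snp/snp.fa"'
    # 179085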
hgsql panTro1 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 179085' panTro1
hgsql -e 'delete from seq where extFile = 179085' panTro1
hgsql -e 'alter table snpSeq add index acc (acc)' panTro1
# clean up after hgLoadSeq
rm seq.tab
# Simplify names of data files
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/data
mv b125_ContigInfo_1_1.bcp.gz ContigInfo.gz
mv b125_SNPContigLoc_1_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_1_1.bcp.gz ContigLocusId.gz
mv b125_SNPMapInfo_1_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist
# edit table descriptions
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/schema
# get CREATE statements from chimpanzee_9598_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# get header lines from rs_fasta
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/rs_fasta
/bin/csh gnl.csh
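    # (gnl.csh itself is not reproduced in this doc; a minimal equivalent
    # sketch, assuming one .gnl file of fasta header lines per input file:)
    foreach f (rs_ch*.fas.gz)
        set b = $f:r          # strip .gz
        zcat $f | grep '^>' > $b:r.gnl
    end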
# load on kkr5u00
ssh kkr5u00
    hgsql -e 'create database panTro1snp125' mysql
cd /cluster/data/dbSNP/125/organisms/chimpanzee_9598/schema
hgsql panTro1snp125 < table.sql
cd ../data
/bin/csh load.csh
# note rowcount
# ContigLoc 1666593
# SNP 1542718
# MapInfo 1608359
# ContigLocusId 582130
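    # (load.csh itself is not reproduced here; each of the five files is
    # loaded along the same lines as the UniVariation load below, e.g.:)
    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
        zcat $t.gz | hgsql panTro1snp125 \
            -e 'load data local infile "/dev/stdin" into table '$t
    end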
# Get UniVariation from 126 (recently downloaded for mm8snp126)
cd /cluster/data/dbSNP/126/shared
hgsql panTro1snp125 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' panTro1snp125
# create working /scratch dir
cd /scratch/snp/125
mkdir chimp
cd chimp
# panTro1.ctgPos table uses different convention
# get gnl files
cp /cluster/data/dbSNP/125/organisms/chimpanzee_9598/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly"
# filter ContigLoc into ContigLocFilter
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | start | int(11) | NO | | | |
# | end | int(11) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 panTro1snp125 ref_assembly Arachne
# note rowcount
# ContigLocFilter 1354272
# how many are positive strand? hopefully 90%
    # We get ~94% here (1270940 of 1354272)
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 1270940
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# | 1469 | 1 |
# | 1350271 | 2 |
# | 2532 | 3 |
# +----------+----------+
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter panTro1snp125 ref_assembly
# note rowcount
    # ContigLocusIdFilter 559975
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense panTro1snp125
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# actually we get about 90% here
# ContigLocusIdCondense 539366
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
# convert to our panTro1 chrom convention
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta panTro1snp125
/bin/csh renameFasta.csh
# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)
# split ContigLocFilter by chrom
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom panTro1snp125 ref_assembly
# check coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType
# Between with allele != '-'
# Exact with end != start + 1
# Range with end < start
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run 4 exceptions/errors: couldn't handle N(100)
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 panTro1snp125 ref_assembly
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# 43 alleles expanded
# Couldn't handle 4 that had N(100)
# Fixed with revision 1.26 of snpExpandAllele.c and data edit.
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele panTro1snp125 ref_assembly
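    # (illustrative sketch of the expansion, assuming the run-length
    # notation is BASE(COUNT) as in the N(100) case above; not the actual
    # snpExpandAllele code:)
    echo 'AC(3)G' | perl -wpe 's/([ACGTN-])\((\d+)\)/$1 x $2/ge'
    # ACCCG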
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort panTro1snp125 ref_assembly
# rename MT --> M
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" panTro1snp125
    # chrom conversion!! (dbSNP names --> panTro1 names, per the table
    # above; the chr9/chr11 swap goes through a temporary chr11A name so
    # the renames don't collide)
hgsql -e 'rename table chr3_snpTmp to chr2_snpTmp' panTro1snp125
hgsql -e 'rename table chr4_snpTmp to chr3_snpTmp' panTro1snp125
hgsql -e 'rename table chr5_snpTmp to chr4_snpTmp' panTro1snp125
hgsql -e 'rename table chr6_snpTmp to chr5_snpTmp' panTro1snp125
hgsql -e 'rename table chr7_snpTmp to chr6_snpTmp' panTro1snp125
hgsql -e 'rename table chr8_snpTmp to chr7_snpTmp' panTro1snp125
hgsql -e 'rename table chr9_snpTmp to chr11A_snpTmp' panTro1snp125
hgsql -e 'rename table chr10_snpTmp to chr8_snpTmp' panTro1snp125
hgsql -e 'rename table chr11_snpTmp to chr9_snpTmp' panTro1snp125
hgsql -e 'rename table chr11A_snpTmp to chr11_snpTmp' panTro1snp125
hgsql -e 'rename table chr12_snpTmp to chr10_snpTmp' panTro1snp125
hgsql -e 'rename table chr2A_snpTmp to chr12_snpTmp' panTro1snp125
hgsql -e 'rename table chr22_snpTmp to chr23_snpTmp' panTro1snp125
hgsql -e 'rename table chr21_snpTmp to chr22_snpTmp' panTro1snp125
hgsql -e 'rename table chr20_snpTmp to chr21_snpTmp' panTro1snp125
hgsql -e 'rename table chr19_snpTmp to chr20_snpTmp' panTro1snp125
hgsql -e 'rename table chr17_snpTmp to chr19_snpTmp' panTro1snp125
hgsql -e 'rename table chr18_snpTmp to chr17_snpTmp' panTro1snp125
hgsql -e 'rename table chr16_snpTmp to chr18_snpTmp' panTro1snp125
hgsql -e 'rename table chr15_snpTmp to chr16_snpTmp' panTro1snp125
hgsql -e 'rename table chr14_snpTmp to chr15_snpTmp' panTro1snp125
hgsql -e 'rename table chr13_snpTmp to chr14_snpTmp' panTro1snp125
hgsql -e 'rename table chr2B_snpTmp to chr13_snpTmp' panTro1snp125
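    # sanity check (sketch): confirm the expected chrom table set after
    # the renames above
    hgsql panTro1snp125 -N -e 'show tables like "chr%_snpTmp"'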
# get nib files
ssh hgwdev
cd /cluster/data/panTro1/nib
cp *.nib /cluster/home/heather/transfer/snp/panTro1
ssh kkr5u00
cd /scratch/snp/125/chimp
mkdir nib
cp /cluster/home/heather/transfer/snp/panTro1/*.nib nib
rm /cluster/home/heather/transfer/snp/panTro1/*.nib
    # edit path in chromInfo, load into panTro1snp125 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC panTro1snp125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 6800 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles panTro1snp125
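    # e.g. (sketch): quick reverse-complement spot check from the shell
    echo AACG | rev | tr 'ACGTacgt' 'TGCAtgca'
    # CGTT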
# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 0 univar_id = 0 in SNP
# 1 no row in SNP for snp_id in chrN_snpTmp
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
# actually for this run all class = 'single'
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved panTro1snp125
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
# None of these this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved panTro1snp125
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction panTro1snp125
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP panTro1snp125
# add molType
# errors detected: missing or duplicate molType
# this run 4557 duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype panTro1snp125
# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable panTro1snp125 125
# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple panTro1snp125 125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;
# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp/panTro1
hgsql panTro1snp125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/panTro1/snp125Exceptions.tab
ssh hgwdev
cd transfer/snp/panTro1
mysql> load data local infile 'snp125.tab' into table snp125;
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;
# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> alter table snp125Exceptions add index name(name);
# create snp125ExceptionDesc table
cd /cluster/data/dbSNP
hgsql panTro1 < snp125ExceptionDesc.sql
# add counts to exception.panTro1.125, can start with exception.template
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' panTro1
mysql> load data local infile 'exception.panTro1.125' into table snp125ExceptionDesc;
mysql> select count(*), exception from snp125Exceptions group by exception;
+----------+---------------------------+
| count(*) | exception |
+----------+---------------------------+
| 24785 | MultipleAlignments |
| 675 | ObservedMismatch |
| 6800 | RefAlleleNotRevComp |
| 4 | RefAlleleWrongSize |
| 2532 | SingleClassBetweenLocType |
| 3 | SingleClassQuadAllelic |
| 1469 | SingleClassRangeLocType |
| 61 | SingleClassTriAllelic |
+----------+---------------------------+