src/hg/makeDb/doc/hg16.txt 1.10
1.10 2009/04/27 20:11:36 hiram
liftOver to hg19 done
Index: src/hg/makeDb/doc/hg16.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg16.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/makeDb/doc/hg16.txt 10 Nov 2008 20:28:17 -0000 1.9
+++ src/hg/makeDb/doc/hg16.txt 27 Apr 2009 20:11:36 -0000 1.10
@@ -1,11932 +1,11946 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 34 (July 18, 2003 freeze)
# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# Make gs.17 directory, gs.17/build34 directory, and gs.17/ffa directory.
mkdir /cluster/store4/gs.17
mkdir /cluster/store4/gs.17/build34
mkdir /cluster/store4/gs.17/agp
mkdir /cluster/store4/gs.17/ffa
# Make a symbolic link from /cluster/store1 to this location
cd /cluster/store1
ln -s /cluster/store4/gs.17 ./gs.17
# Make a symbolic link from your home directory to the build dir:
ln -s /cluster/store4/gs.17/build34 ~/oo
# NCBI download site:
ftp ftp.ncbi.nih.gov
# user and password from /cse/guests/kent/buildHg6.doc
cd build_34
# Download all finished agp's and fa's into gs.17/agp
mget chr*.agp
mget chr*.fa.gz
gunzip *.gz
# Download contig agp's into gs.17/build34
get ref_placed.agp # used to be in reference.agp
get ref_unplaced.agp # used to be in reference.agp
get DR51.agp
get PAR.agp # new for this build - PAR regions added to chrY
cat ref_placed.agp ref_unplaced.agp DR51.agp > ncbi_build34.agp
# Download contig fa's into gs.17/ffa
get ref_placed.fa.gz # used to be in reference.fa
get ref_unplaced.fa.gz # used to be in reference.fa
get DR51.fa.gz
get PAR.fa.gz # new for this build - PAR regions added to chrY
get sequence.inf
cat ref_placed.fa ref_unplaced.fa DR51.fa > ncbi_build34.fa
# Download assembly related files into gs.17/build34
get seq_contig.md
get contig_overlaps.agp
# Download questionable join certificates file
get e-certificates.txt
mkdir certificates
mv e-certificates.txt certificates
# Save a copy of the original seq_contig.md file
cp seq_contig.md seq_contig.md.orig
# For build34, edit the seq_contig.md file to remove the alternative chr7
# sequence supplied by the Toronto group: NT_079590, NT_079591, NT_079592,
# NT_079593, NT_079594, NT_079595, NT_079596, NT_079597
# Edit seq_contig.md to make the DR51 alternative haplotype look like a
# chr6_random sequence:
# 9606 6 32491690 32629063 + NG_002432 GI:28212469 CONTIG DR51 1
# to
# 9606 6|NG_002432 1 137374 + NG_002432 GI:28212469 CONTIG DR51 1
# Move this edited DR51 line next to other chr6_random contigs (for creating
# the lift file)
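# The chr7 removal and the DR51 rewrite can be scripted; a sketch only
# (contig IDs and the DR51 field values taken from the comments above,
# tab-separated field layout assumed; moving the DR51 line next to the
# other chr6_random contigs was still done by hand in an editor):
grep -v 'NT_07959[0-7]' seq_contig.md > seq_contig.md.tmp
awk 'BEGIN{FS=OFS="\t"} $6=="NG_002432"{$2="6|NG_002432";$3=1;$4=137374} {print}' \
    seq_contig.md.tmp > seq_contig.md
rm seq_contig.md.tmp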
# Sanity check
/cluster/bin/i386/checkYbr build34/ncbi_build34.agp ffa/ncbi_build34.fa \
build34/seq_contig.md
# Convert fa files into UCSC style fa files and place in "contigs" directory
# inside the gs.17/build34 directory
cd build34
mkdir contigs
/cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build34.fa \
contigs
# Copy over chrM contig from previous version
cd ~/oo
cp -r /cluster/store1/gs.16/build33/M .
# Determine the chromosome sizes from agps
/cluster/bin/scripts/getChromSizes ../agp
# Create lift files (this will create chromosome directory structure) and
# inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .
# Create contig agp files (will create contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build34.agp .
# Create chromosome random agp files.
/cluster/bin/scripts/createNcbiChrAgp -randomonly .
# Copy the original chrN.agp files from the gs.17/agp directory
# into each of the chromosome directories since they contain better
# gap information. Delete the comments at top from these.
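# A sketch of one way to do that copy (run from the build34 directory;
# AGP comment lines start with '#'):
foreach c (?{,?})
    grep -v '^#' ../agp/chr$c.agp > $c/chr$c.agp
end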
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).
# create global data link for everyone. No more home directory
# links required.
ln -s /cluster/store4/gs.17/build34 /cluster/data/hg16
cd /cluster/data/hg16
/cluster/bin/scripts/distNcbiCtgFa contigs .
rm -r contigs
# Copy over jkStuff from previous build (??)
mkdir jkStuff
cp /cluster/store1/gs.16/build33/jkStuff/*.sh jkStuff
cp /cluster/store1/gs.16/build33/jkStuff/*.csh jkStuff
cp /cluster/store1/gs.16/build33/jkStuff/*.gsub jkStuff
# Create contig gl files
/cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files
jkStuff/liftGl.sh contig.gl
# Files ready for repeat-masking and trf
# CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP
# (DONE 2003-07-23 Terry)
# Create directory structure to hold information for these tracks
cd /projects/hg2/booch/psl/
# Change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
make new
# Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files
# Makefiles in:
# /gs.17/build34/
# /gs.17/build34/bacends
# /gs.17/build34/cytobands
# /gs.17/build34/cytoPlots
# /gs.17/build34/fish
# /gs.17/build34/fosends
# /gs.17/build34/g2g
# /gs.17/build34/geneticPlots
# /gs.17/build34/primers
# /gs.17/build34/recombrate
# /gs.17/build34/sts
# /gs.17/build34/stsPlots
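# For example, the per-build variables at the top of those Makefiles
# end up along these lines (variable names from the comment above;
# exact spelling in the Makefiles assumed):
#   OOVERS = build34
#   GSVERS = gs.17
#   PREVOO = build33
#   PREVGS = gs.16
#   DATABASE = hg16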
# Create accession_info file *****
make accession_info.rdb
# UPDATE STS INFORMATION (DONE 2003-07-23 Terry)
# Download and unpack updated information from dbSTS:
cd /projects/hg2/booch/psl/update
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.Z
mv sts.Z dbSTS.FASTA.dailydump.Z
gunzip dbSTS.FASTA.dailydump.Z
# Make new directory for this info and move files there
mkdir /cluster/store1/sts.8
cp all.STS.fa /cluster/store1/sts.8
cp all.primers /cluster/store1/sts.8
cp all.primers.fa /cluster/store1/sts.8
# Copy new files to cluster
ssh kkstore
cd /cluster/store1/sts.8
cp /cluster/store1/sts.8/*.* /scratch/hg/STS
# Ask for propagation from sysadmin
# Load the sequences into the database (after database created)
ssh hgwdev
mkdir /gbdb/hg16/sts.8
cd /gbdb/hg16/sts.8
ln -s /cluster/store1/sts.8/all.STS.fa ./all.STS.fa
ln -s /cluster/store1/sts.8/all.primers.fa ./all.primers.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.STS.fa
hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.primers.fa
# CREATE STS MARKER ALIGNMENTS (DONE 2003-08-03 Terry)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/sts
# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
make new
make jobList
para create jobList
para push
# wait until alignments done
make stsMarkers.psl
# Copy files to final destination and remove originals
make copy.assembly
make clean
# Create primer alignments
ssh kk
cd /cluster/home/booch/primers
# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
make new
make jobList.scratch
para create jobList
para push
# Do an initial quick filter of results (takes a while, still) and create
# the final file - best done on eieio since its disks are local
ssh eieio
make filter
make primers.psl
# Copy files to final destination and remove
make copy.assembly
make clean
# Create ePCR alignments
ssh kk
cd /cluster/home/booch/epcr
# Update Makefile with latest OOVERS and GSVERS
make new
make jobList
para create jobList
para push
make all.epcr
# Copy files to final destination and remove
make copy.assembly
make clean
# CREATE AND LOAD STS MARKERS TRACK (DONE 2003-08-03 Terry)
# Copy in current stsInfo2.bed and stsAlias.bed files
cd /projects/hg2/booch/psl/gs.17/build34
cp ../update/stsInfo2.bed .
cp ../update/stsAlias.bed .
# Create final version of sts sequence placements
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/sts
make stsMarkers.final
# Create final version of primers placements
# Make sure PRIMERS variable in Makefile is pointing to current version
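# e.g. something like this, given the sts.8 update above (exact
# variable layout in the Makefile assumed):
#   PRIMERS = /cluster/store1/sts.8/all.primers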
cd /projects/hg2/booch/psl/gs.17/build34/primers
make primers.final
# Create bed file
cd /projects/hg2/booch/psl/gs.17/build34
make stsMap.bed
# Create database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < all_sts_primer.sql
hgsql hg16 < all_sts_seq.sql
hgsql hg16 < stsAlias.sql
hgsql hg16 < stsInfo2.sql
hgsql hg16 < stsMap.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.17/build34/sts/
echo 'load data local infile "stsMarkers.psl.filter.lifted" into table all_sts_seq;' | hgsql hg16
cd /projects/hg2/booch/psl/gs.17/build34/primers/
echo 'load data local infile "primers.psl.filter.lifted" into table all_sts_primer;' | hgsql hg16
cd /projects/hg2/booch/psl/gs.17/build34/
echo 'load data local infile "stsAlias.bed" into table stsAlias;' | hgsql hg16
echo 'load data local infile "stsInfo2.bed" into table stsInfo2;' | hgsql hg16
echo 'load data local infile "stsMap.bed" into table stsMap;' | hgsql hg16
# CREATE AND LOAD RECOMBINATION RATE TRACK (DONE 2003-08-05 Terry)
# (must be done after STS Markers track)
# Create bed file
cd /projects/hg2/booch/psl/gs.17/build34/recombrate
make recombRate.bed
# Create database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < recombRate.sql
# Load the table
cd /projects/hg2/booch/psl/gs.17/build34/recombrate/
echo 'load data local infile "recombRate.bed" into table recombRate;' | hgsql hg16
# UPDATE BACEND SEQUENCES (DONE 2003-07-23 Terry)
# **** Sequences were determined to not have changed since bacends.4 *****
# **** No new sequences downloaded - See makeHg15.doc for download instructions *****
# Load the sequences into the database (after database created)
ssh hgwdev
mkdir /gbdb/hg16/bacends.4
cd /gbdb/hg16/bacends.4
ln -s /cluster/store1/bacends.4/BACends.fa ./BACends.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg16 /gbdb/hg16/bacends.4/BACends.fa
# BACEND SEQUENCE ALIGNMENTS (DONE 2003-08-01 Terry)
# (alignments done without RepeatMasking)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/bacends
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# Compile alignments and lift the files (takes a while)
ssh eieio
make bacEnds.psl.lifted
# Copy files to final destination and remove
make copy.assembly
make clean # (may want to wait until sure they're OK)
# BACEND PAIRS TRACK (DONE 2003-08-01 Terry)
# Add /projects/compbiousr/booch/booch/scripts to your path
# Update Makefile with new location of pairs/singles
# files, if necessary (DONE)
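# e.g. in csh:
set path = ( /projects/compbiousr/booch/booch/scripts $path )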
cd /projects/hg2/booch/psl/gs.17/build34/bacends
# Make initial file of alignments
make bacEnds.rdb
# Try to fish out more pairs
make bacEndsMiss.psl
# Re-make bacEnds.rdb with new info
make bacEnds.rdb
# Create bacEndPairs track file
make bacEndPairs.bed
# Create bacEndPairsBad and bacEndPairsLong files
make bacEndPairsBad.bed
# Create psl file to load
make bacEnds.load.psl
# Create database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < all_bacends.sql
hgsql hg16 < bacEndPairs.sql
hgsql hg16 < bacEndPairsBad.sql
hgsql hg16 < bacEndPairsLong.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.17/build34/bacends/
echo 'load data local infile "bacEnds.load.psl" into table all_bacends;' | hgsql hg16
echo 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' | hgsql hg16
echo 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' | hgsql hg16
echo 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' | hgsql hg16
# FOSEND SEQUENCE ALIGNMENTS (DONE 2003-08-03 Terry)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/fosends
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# Compile alignments and lift the files (takes a while)
ssh eieio
cd /cluster/home/booch/fosends
make fosEnds.psl.lifted
# Copy files to final destination and remove
make copy.assembly
make clean
# FOSEND PAIRS TRACK (DONE 2003-08-01 Terry)
# Update Makefile with location of pairs files, if necessary
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/fosends
# Make initial file of alignments
make fosEnds.rdb
# Try to fish out more pairs
make fosEndsMiss.psl
# Re-make fosEnds.rdb with new info
make fosEnds.rdb
# Create fosEndPairs track file
make fosEndPairs.bed
# Create fosEndPairsBad and fosEndPairsLong files
make fosEndPairsBad.bed
# Create psl file to load
make fosEnds.load.psl
# Create database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < all_fosends.sql
hgsql hg16 < fosEndPairs.sql
hgsql hg16 < fosEndPairsBad.sql
hgsql hg16 < fosEndPairsLong.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.17/build34/fosends/
echo 'load data local infile "fosEnds.load.psl" into table all_fosends;' | hgsql hg16
echo 'load data local infile "fosEndPairs.bed" into table fosEndPairs;' | hgsql hg16
echo 'load data local infile "fosEndPairsBad.bed" into table fosEndPairsBad;' | hgsql hg16
echo 'load data local infile "fosEndPairsLong.bed" into table fosEndPairsLong;' | hgsql hg16
# Load the sequences (change fosends.# to match correct location) (done for hg15 early 4/9/2003)
mkdir /gbdb/hg15/fosends.3
cd /gbdb/hg15/fosends.3
ln -s /cluster/store1/fosends.3/fosEnds.fa ./fosEnds.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg15 /gbdb/hg15/fosends.3/fosEnds.fa
# UPDATE FISH CLONES INFORMATION (DONE 2003-07-23 Terry)
# Download the latest info from NCBI
# point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.20030723.table
# Format file just downloaded.
cd /projects/hg2/booch/psl/fish/
# Edit Makefile to point at file just downloaded (variables HBRC, HBRCFORMAT)
make HBRC
# (Manually added 21 results from FHCRC)
# Copy it to the new freeze location
cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.17/build34/fish/
# Save it as the new "gold" file
cp all.fish.format all.fish.format.gold
# CREATE AND LOAD FISH CLONES TRACK (DONE 2003-08-08 Terry)
# (must be done after Coverage, STS markers track and BAC end pairs track)
# Extract the file with clone positions from database
ssh hgwdev
echo 'select * into outfile "/tmp/booch/clonePos.txt" from clonePos' | hgsql hg16
mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.17/build34/fish
# Get current clone/accession information
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/fish
wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial placement file
cp /projects/hg2/booch/psl/gs.16/build33/fish/extract.pl .
make cyto.markers.bed
# Get sequences for accessions not in genome
# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select file "/projects/hg2/booch/psl/gs.17/build34/fish/not.found.acc"
# change output to FASTA format
# download results to "/projects/hg2/booch/psl/gs.17/build34/fish/not.found.fa"
# Place sequences against genome
make blat
# Try to incorporate new placements
make cyto.markers.bed2
# Create bed file
make fishClones.bed
# Create database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < fishClones.sql
# Load the table
cd /projects/hg2/booch/psl/gs.17/build34/fish/
echo 'load data local infile "fishClones.bed" into table fishClones;' | hgsql hg16
# CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE 2003-08-08 Terry)
# (must be done after FISH Clones track)
# Create bed file
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
make setBands.txt # NOTE: may get errors if inserts file out-of-sync with pctSetBands file
make cytobands.pct.ranges
make predict
# Create database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < cytoBand.sql
# Load the table
cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
echo 'load data local infile "cytobands.bed" into table cytoBand;' | hgsql hg16
# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For human cytoBandIdeo is just a replicate of the cytoBand track.
# Make the cytoBand track (above) and then:
echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql hg16
# CREATING DATABASE (DONE - 2003-07-26 - Hiram)
ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sda1 472G 416G 31G 93% /var/lib/mysql
# Create the database - connect through an existing database (e.g. hg15)
# to issue the statement:
echo 'create database hg16' | hgsql hg15
# make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
# (I have not seen a use for this in any procedures ? -Hiram)
# alias hg16 mysql -u hguser -phguserstuff -A hg16
# (use 'hgsql hg16' instead)
# Initialize the relational-mrna and external sequence info tables:
hgLoadRna new hg16
# Copy over grp table (for track grouping) from another database:
echo "create table grp (PRIMARY KEY(NAME)) select * from hg15.grp" \
| hgsql hg16
# add ENCODE track. Move Repeats lower in priority
echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encode", "ENCODE Tracks", 8)' | hgsql hg16
# New ENCODE groups
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg16
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2003-07-26 - Hiram)
ssh hgwdev
# Enter hg16 into hgcentraltest.dbDb so test browser knows about it:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName) \
values("hg16", "July 2003", "/gbdb/hg16/nib", "Human", \
"chr7:26828631-26938371", 1, 10, "Human", "Homo sapiens");' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
cd ~kent/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit that makefile to add hg16 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2003-07-26 - Hiram)
cd /cluster/data/hg16
mkdir -p jkStuff
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft
# to match. If no step 6.2.5 then no editing needed
# REPEAT MASKING (DONE - 2003-07-25 - Hiram, REDONE 2003-08-02)
# Split contigs, run RepeatMasker, lift results
# Notes:
# * Using new RepeatMasker in /cluster/bluearc/RepeatMasker030619
# Always check for new RepeatMasker before proceeding
# * Contigs (*/N{T,G}_*/N{T,G}_*.fa) are split into 500kb chunks to make
# RepeatMasker runs manageable on the cluster ==> results need lifting.
# * For the NCBI assembly we repeat mask on the sensitive mode setting
# (RepeatMasker -s)
#- Split contigs into 500kb chunks:
ssh eieio
cd /cluster/data/hg16
foreach chrom ( ?{,?} )
foreach c ( $chrom/N{T,G}_?????? )
set contig = $c:t
echo "splitting ${chrom}/${contig}/${contig}.fa"
faSplit size ${chrom}/${contig}/$contig.fa 500000 \
${chrom}/${contig}/${contig}_ -lift=${chrom}/${contig}/$contig.lft \
-maxN=500000
end
end
#- Make the run directory and job list:
cd /cluster/data/hg16
mkdir -p jkStuff
# According to RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human)
# This run script saves the .tbl file to be sent to Arian. He uses
# those for his analysis. Sometimes he needs the .cat and .align files for
# checking problems. Krish needs the .align files, they are large.
cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg16/$2
/bin/cp $2 /tmp/hg16/$2/
cd /tmp/hg16/$2
/cluster/bluearc/RepeatMasker030619/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg16/$2/$2.out ./
if (-e /tmp/hg16/$2/$2.align) /bin/cp /tmp/hg16/$2/$2.align ./
if (-e /tmp/hg16/$2/$2.tbl) /bin/cp /tmp/hg16/$2/$2.tbl ./
# if (-e /tmp/hg16/$2/$2.cat) /bin/cp /tmp/hg16/$2/$2.cat ./
/bin/rm -fr /tmp/hg16/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMHuman
ssh eieio
cd /cluster/data/hg16
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach d ( ?{,?} )
foreach c ( $d/N{T,G}_*/N{T,G}_*_*.fa )
set f = $c:t
set cc = $c:h
set contig = $cc:t
echo /cluster/store4/gs.17/build34/jkStuff/RMHuman \
/cluster/store4/gs.17/build34/${d}/${contig} $f \
'{'check out line+ /cluster/store4/gs.17/build34/${d}/${contig}/$f.out'}' \
>> RMRun/RMJobs
end
end
# We have 6015 jobs in RMJobs:
wc RMRun/RMJobs
# 6015 42105 1184896 RMRun/RMJobs
#- Do the run
ssh kk
cd /cluster/data/hg16/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#- While that is running, you can run TRF (simpleRepeat) on the small
# cluster. See SIMPLE REPEAT section below
# CPU time in finished jobs: 33575296s 559588.26m 9326.47h 388.60d 1.065 y
# IO & Wait Time: 238878s 3981.30m 66.36h 2.76d 0.008 y
# Average job time: 7513s 125.21m 2.09h 0.09d
# Longest job: 18457s 307.62m 5.13h 0.21d
# Submission to last job: 55537s 925.62m 15.43h 0.64d
#- Lift up the split-contig .out's to contig-level .out's
ssh eieio
cd /cluster/data/hg16
foreach d ( ?{,?}/N{T,G}_* )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
cd ../..
end
#- Lift up the contig-level RepeatMasker .out files to chromosome
# coordinates using jkStuff/liftOut2.sh, picked up from the hg15 build.
# Reset the liftUp command path from ~kent/bin/$MACHTYPE to
# /cluster/bin/i386, took the redirection to /dev/null off of the
# command, and capture the output here to see what errors we have.
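# For reference, liftOut2.sh amounts to a loop like this (a sketch
# only - the actual script was carried over from hg15 and may differ
# in detail):
#   foreach c (?{,?})
#       cd $c
#       if (-e lift/ordered.lft) then
#           /cluster/bin/i386/liftUp chr$c.fa.out lift/ordered.lft warn \
#               `awk '{printf "%s/%s.fa.out ", $1, $1}' lift/ordered.lst`
#       endif
#       if (-e lift/random.lft) then
#           /cluster/bin/i386/liftUp chr${c}_random.fa.out lift/random.lft warn \
#               `awk '{printf "%s/%s.fa.out ", $1, $1}' lift/random.lst`
#       endif
#       cd ..
#   end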
./jkStuff/liftOut2.sh > liftOut2.out 2>&1 &
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg16
hgLoadOut hg16 ?/*.fa.out ??/*.fa.out
# errors during this load:
# Processing 2/chr2.fa.out
# Strange perc. field -6.1 line 243430 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243430 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243432 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243432 of 2/chr2.fa.out
# Processing 5/chr5.fa.out
# Strange perc. field -0.3 line 4339 of 5/chr5.fa.out
# Processing 19/chr19.fa.out
# Strange perc. field -18.6 line 77032 of 19/chr19.fa.out
# SIMPLE REPEAT [TRF] TRACK (DONE - 2003-07-25 - Hiram)
# Distribute contigs to /iscratch/i
ssh kkr1u00
rm -rf /iscratch/i/gs.17/build34/contigs
mkdir -p /iscratch/i/gs.17/build34/contigs
cd /cluster/data/hg16
cp -p contigs/*.fa /iscratch/i/gs.17/build34/contigs
# Make sure the total size looks like what you'd expect:
du ./contigs /iscratch/i/gs.17/build34/contigs
# 2839768 ./contigs
# 2839768 /iscratch/i/gs.17/build34/contigs
~kent/bin/iSync
# Create cluster parasol job like so:
mkdir -p /cluster/data/hg16/bed/simpleRepeat
cd /cluster/data/hg16/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /iscratch/i/gs.17/build34/contigs/*.fa > genome.lst
gensub2 genome.lst single gsub spec
para create spec
para try
para check
para push
para check
# Completed: 472 of 472 jobs
# CPU time in finished jobs: 36177s 602.95m 10.05h 0.42d 0.001 y
# IO & Wait Time: 2038s 33.97m 0.57h 0.02d 0.000 y
# Average job time: 81s 1.35m 0.02h 0.00d
# Longest job: 6992s 116.53m 1.94h 0.08d
# Submission to last job: 10703s 178.38m 2.97h 0.12d
# When the cluster run is done, handle a couple of extra files not
# caught in the above sequence
./runTrf /cluster/store4/gs.17/build34/M/NT_999999/NT_999999.fa trf/NT_999999.bed
# That produces an empty .bed file, mark it so:
echo "# trf run produces nothing for this one" >> trf/NT_999999.bed
liftUp simpleRepeat.bed /cluster/data/hg16/jkStuff/liftAll.lft \
warn trf/*.bed > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/hg16/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed hg16 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# stringTab = 0
# Reading simpleRepeat.bed
# Loaded 627883 elements
# Sorted
# Saving bed.tab
# Loading hg16
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-07-27 - Hiram - REDONE 07-30)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh eieio
cd /cluster/data/hg16/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/hg16
mkdir -p bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2003-07-27)
# -Hiram
# This used to be done right after RepeatMasking. Now, we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
ssh eieio
cd /cluster/data/hg16
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg15/jkStuff - reset path from ~kent to
# /cluster for the ctgToChromFa command
tcsh ./jkStuff/chrFa.sh > chrFa.out 2>&1 &
# copied these three scripts from hg15 - fixup path names to
# reference /cluster/bin instead of ~kent/bin
#- Soft-mask (lower-case) the contig and chr .fa's
tcsh ./jkStuff/makeFaMasked.sh > maFaMasked.out 2>&1
#- Make hard-masked .fa.masked files as well:
tcsh ./jkStuff/makeHardMasked.sh > maHardMasked.out 2>&1
#- Rebuild the nib, mixedNib, maskedNib files:
tcsh ./jkStuff/makeNib.sh > maNib.out 2>&1
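# For reference, those three scripts are essentially wrappers around
# maskOutFa (a sketch; the hg15 scripts may differ in detail):
#   soft-mask:  maskOutFa chrN.fa chrN.fa.out chrN.fa -soft
#               maskOutFa chrN.fa trfMaskChrom/chrN.bed chrN.fa -softAdd
#   hard-mask:  maskOutFa chrN.fa hard chrN.fa.masked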
# Make symbolic links from /gbdb/hg16/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg16/nib
foreach f (/cluster/store4/gs.17/build34/nib/chr*.nib)
ln -s $f /gbdb/hg16/nib
end
# Load /gbdb/hg16/nib paths into database and save size info.
hgsql hg16 < ~/kent/src/hg/lib/chromInfo.sql
cd /cluster/data/hg16
hgNibSeq -preMadeNib hg16 /gbdb/hg16/nib ?{,?}/chr?{,?}{,_random}.fa
echo "select chrom,size from chromInfo" | hgsql -N hg16 > chrom.sizes
# Copy the masked contig fa to /iscratch and /scratch:
# And everything else we will need for blastz runs, etc ...
ssh kkr1u00
rm -rf /iscratch/i/gs.17/build34/trfFa
mkdir -p /iscratch/i/gs.17/build34/trfFa
cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /iscratch/i/gs.17/build34/trfFa
rm -rf /iscratch/i/gs.17/build34/bothMaskedNibs
mkdir -p /iscratch/i/gs.17/build34/bothMaskedNibs
cp -p /cluster/data/hg16/nib/*.nib /iscratch/i/gs.17/build34/bothMaskedNibs
rm -rf /iscratch/i/gs.17/build34/rmsk
mkdir -p /iscratch/i/gs.17/build34/rmsk
cp -p /cluster/data/hg16/?{,?}/*.out /iscratch/i/gs.17/build34/rmsk
~kent/bin/iSync
# ssh kkstore
# Since kkstore's /scratch is currently /cluster/bluearc/scratch, it is
# better to do this on eieio and copy to /scratch from there.
rm -rf /scratch/hg/gs.17/build34/trfFa
mkdir -p /scratch/hg/gs.17/build34/trfFa
cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /scratch/hg/gs.17/build34/trfFa
rm -rf /scratch/hg/gs.17/build34/bothMaskedNibs
mkdir /scratch/hg/gs.17/build34/bothMaskedNibs
cp -p /cluster/data/hg16/nib/*.nib /scratch/hg/gs.17/build34/bothMaskedNibs
rm -rf /scratch/hg/gs.17/build34/rmsk
mkdir -p /scratch/hg/gs.17/build34/rmsk
cp -p /cluster/data/hg16/?{,?}/*.out /scratch/hg/gs.17/build34/rmsk
# request rsync of kkstore /scratch
# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE - 2003-07-27)
# Store o+o info in database.
ssh eieio
cd /cluster/store4/gs.17/build34
if (-f contig_overlaps.agp) then
jkStuff/liftGl.sh contig.gl
else
ssh hgwdev
hgGoldGapGl -noGl hg16 /cluster/store4/gs.17 build34
echo ""
echo "*** Note from makeHg15.doc:"
echo "Come back to this step later when we have contig_overlaps.agp\!"
endif
ssh hgwdev
cd /cluster/store4/gs.17/build34
if (-f contig_overlaps.agp) then
hgGoldGapGl hg16 /cluster/store4/gs.17 build34
cd /cluster/store4/gs.17
/cluster/bin/i386/hgClonePos hg16 build34 ffa/sequence.inf /cluster/store4/gs.17 -maxErr=3
endif
cd /cluster/store4/gs.17
# (2/27/04 angie) re-loaded -- chr{1,4,8,15}_random lift files changed
# 7/30/04.
hgCtgPos hg16 build34
# CREATE NON-STANDARD JOIN CERTIFICATES WEB PAGE AND TABLE
# Filter certificates file to only contain those relevant to current assembly
cd ~/hg16/certificates
/cluster/bin/scripts/extractCertificates.pl e-certificates.txt ~/hg16 \
> e-certificates.filter.txt
# Create initial web page and table for loading into database
hgCert e-certificates.filter.txt > certificates.html
# Donna's edits to html page
# (3/2/04 angie: edit cert.tab to remove some extra tab characters in comments
# so mySql doesn't truncate them, & reload)
# Load cert table into database
ssh hgwdev
cd ~/hg16/certificates
echo "drop table certificate" | hgsql hg16
hgsql hg16 < ~/kent/src/hg/lib/certificate.sql
echo 'load data local infile "cert.tab" into table certificate;' \
| hgsql hg16
# AUTO UPDATE GENBANK MRNA RUN (WORKING - 2003-07-30 - Hiram)
ssh eieio
cd /cluster/store5/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# hg16
hg16.genome = /scratch/hg/gs.17/build34/bothMaskedNibs/chr*.nib
hg16.lift = /cluster/store4/gs.17/build34/jkStuff/liftAll.lft
hg16.genbank.est.xeno.load = yes
hg16.mgcTables.default = full
hg16.mgcTables.mgc = all
hg16.downloadDir = hg16
ssh eieio
cd /cluster/store5/genbank
nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
-srcDb=genbank -type=mrna -verbose=1 -initial hg16
# Completed: 49591 of 49591 jobs
# CPU time in finished jobs: 3853288s 64221.47m 1070.36h 44.60d 0.122 y
# IO & Wait Time: 246323s 4105.38m 68.42h 2.85d 0.008 y
# Average job time: 83s 1.38m 0.02h 0.00d
# Longest job: 21265s 354.42m 5.91h 0.25d
# Submission to last job: 22930s 382.17m 6.37h 0.27d
# Load the results from the above
ssh hgwdev
cd /cluster/store5/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg16
# To get this next one started, the above results need to be
# moved out of the way. These things can be removed if there are
# no problems to debug
ssh eieio
cd /cluster/bluearc/genbank/work
mv initial.hg16 initial.hg16.genbank.mrna
ssh eieio
cd /cluster/store5/genbank
nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
-srcDb=refseq -type=mrna -verbose=1 -initial hg16
# Completed: 68740 of 68740 jobs
# CPU time in finished jobs: 1253290s 20888.16m 348.14h 14.51d 0.040 y
# IO & Wait Time: 309126s 5152.10m 85.87h 3.58d 0.010 y
# Average job time: 23s 0.38m 0.01h 0.00d
# Longest job: 13290s 221.50m 3.69h 0.15d
# Submission to last job: 13609s 226.82m 3.78h 0.16d
# The iservers came back on-line, so use them for this run.
# The batch file can be found in:
# /cluster/store5/genbank/work/initial.hg16/align
ssh hgwdev
cd /cluster/store5/genbank
nice bin/gbDbLoadStep -verbose=1 hg16
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg16
# GC PERCENT (DONE 2003-07-31 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/gcPercent
cd /cluster/data/hg16/bed/gcPercent
hgsql hg16 < ~/kent/src/hg/lib/gcPercent.sql
hgGcPercent hg16 ../../nib
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2003-07-31 - Hiram)
ssh hgwdev
# Substitute BBB with the correct number for the hostname:
echo 'insert into blatServers values("hg16", "blat6", "17778", "1"); \
insert into blatServers values("hg16", "blat6", "17779", "0");' \
| hgsql -h genome-testdb hgcentraltest
# PRODUCING GENSCAN PREDICTIONS (DONE - 2003-08-01 - Hiram)
ssh eieio
mkdir -p /cluster/data/hg16/bed/genscan
cd /cluster/data/hg16/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir -p gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not pure Ns* (due to heterochromatin and other unsequenceable
# stuff) which would cause genscan to run forever.
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/store4/gs.17/build34/?{,?}/N{T,G}_*/N{T,G}_??????.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
# Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small
# cluster (kkr2u00-kkr8u00). Genscan has problems running on the
# big cluster, due to the limited memory and swap space on each
# processing node.
ssh kkr1u00
cd /cluster/data/hg16/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/home/hiram/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push
# Completed: 491 of 491 jobs (this was with only 6 CPUs available)
# CPU time in finished jobs: 216220s 3603.67m 60.06h 2.50d 0.007 y
# IO & Wait Time: 85597s 1426.62m 23.78h 0.99d 0.003 y
# Average job time: 615s 10.24m 0.17h 0.01d
# Longest job: 10986s 183.10m 3.05h 0.13d
# Submission to last job: 54395s 906.58m 15.11h 0.63d
# Issue either one of the following two commands to check the
# status of the cluster and your jobs, until they are done.
parasol status
para check
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000
# to 1200000. In build33, this was 22/NT_011519.
# In build34 there were NO failures !
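# For the record, such a by-hand re-run would look like this (a
# sketch, using build33's failed contig as the example):
# /cluster/home/hiram/bin/i386/gsBig \
#   /cluster/store4/gs.17/build34/22/NT_011519/NT_011519.fa.masked \
#   gtf/NT_011519.gtf -trans=pep/NT_011519.pep \
#   -subopt=subopt/NT_011519.bed \
#   -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan \
#   -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat \
#   -tmp=/tmp -window=1200000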
# Convert these to chromosome level files as so:
ssh eieio
cd /cluster/data/hg16/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N{T,G}*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/N{T,G}*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/hg16/bed/genscan
ldHgGene hg16 genscan genscan.gtf
# Reading genscan.gtf
# Read 42974 transcripts in 326300 lines in 1 files
# 42974 groups 41 seqs 1 sources 1 feature types
# 42974 gene predictions
hgPepPred hg16 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg16 genscanSubopt genscanSubopt.bed
# stringTab = 0
# Reading genscanSubopt.bed
# Loaded 518038 elements
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg16
# CPGISLANDS (DONE - 2003-08-01 - Hiram)
ssh eieio
mkdir -p /cluster/data/hg16/bed/cpgIsland
cd /cluster/data/hg16/bed/cpgIsland
# Copy program as built for previous hg build:
mkdir cpg_dist
cp -p ~/hg15/bed/cpgIsland/cpg_dist/cpglh.exe ./cpg_dist
# This step used to read, but I do not immediately see the .tar
# file anywhere: (there is a copy in ~/rn3/bed/cpgIsland)
# Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
# copy the tar file to the current directory
# tar xvf cpg_dist.tar
# cd cpg_dist
# gcc readseq.c cpg_lh.c -o cpglh.exe
# cd ..
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
set fout=$f:t:r:r.cpg
echo producing $fout...
./cpg_dist/cpglh.exe $f > $fout
end
cat << '_EOF_' > filter.awk
# Each input line looks like (tab separated):
#   chr1  1325  3865  754  CpG: 183  64.9  0.7
# Transforms to: (tab separated columns above, spaces below)
#   chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7
{
width = $3-$2;
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n",
$1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIsland
hgLoadBed hg16 cpgIsland -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed
# stringTab = 1
# Reading cpgIsland.bed
# Loaded 27596 elements
# Sorted
# Saving bed.tab
# Loading hg16
# VERIFY REPEATMASKER RESULTS (DONE - 2003-08-01 - Hiram)
# Run featureBits on hg16 and on a comparable genome build, and compare:
ssh hgwdev
featureBits hg16 rmsk
# --> 1388770568 bases of 2865697954 (48.462%) in intersection
# --> 1388044886 bases of 2865697954 (48.437%) in intersection
# --> 1388157103 bases of 2863665240 (48.475%) in intersection
featureBits hg15 rmsk
# --> 1386879340 bases of 2866466359 (48.383%) in intersection
featureBits hg13 rmsk
# --> 1383216615 bases of 2860907679 (48.349%) in intersection
# PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2003-08-05 - Hiram)
ssh eieio
# This is where kkstore /scratch is kept:
cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk
# The following will mark each line for rat and mouse
# Rat, first, will be column 1; Mouse, second, will be column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
${outfl} -query human -comp rat -comp mouse
end
# Now extract each one, 1 = Rat, 2 = Mouse
cd /cluster/bluearc/scratch/hg/gs.17/build34
mkdir linSpecRep.notInRat
mkdir linSpecRep.notInMouse
foreach f (rmsk/*.out_rat_mus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInRat/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
# That produced no difference at all between those two targets.
# Have requested confirmation from Arian
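# A quick way to see that for yourself (a sketch, run from the
# build34 scratch directory above):
foreach f (linSpecRep.notInRat/*.spec)
    cmp -s $f linSpecRep.notInMouse/$f:t || echo $f:t differs
end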
# BLASTZ MOUSE (DONE - 2003-08-07 - Hiram)
ssh eieio
cd /cluster/bluearc/mm3.RM030619
foreach f (rmsk.spec/*.out_rat_hum)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInHuman/$base.out.spec
end
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastz.mm3
cd /cluster/data/hg16/bed/blastz.mm3
cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/iscratch/i/mm3.RM030619/mixedNib/
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman/
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/store4/gs.17/build34/bed/blastz.mm3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.mm3-hg16.$DS
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
# source the DEF file to establish environment for following commands
bash
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 39663 jobs
para try
para check
para push
# ... etc ...
# With competition on the cluster:
# Completed: 39663 of 39663 jobs
# CPU time in finished jobs: 14365996s 239433.27m 3990.55h 166.27d 0.456 y
# IO & Wait Time: 681029s 11350.48m 189.17h 7.88d 0.022 y
# Average job time: 379s 6.32m 0.11h 0.00d
# Longest job: 9275s 154.58m 2.58h 0.11d
# Submission to last job: 53023s 883.72m 14.73h 0.61d
# post-process blastz
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 312 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3/run.1
para create jobList
para try
para check
para push
# etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 11666s 194.44m 3.24h 0.14d 0.000 y
# IO & Wait Time: 69155s 1152.58m 19.21h 0.80d 0.002 y
# Average job time: 238s 3.97m 0.07h 0.00d
# Longest job: 1332s 22.20m 0.37h 0.02d
# Submission to last job: 1497s 24.95m 0.42h 0.02d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.mm3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.mm3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mm3.RM030619/mixedNib/
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.mm3/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
cd /cluster/data/hg16/bed/blastz.mm3/run.2
para create jobList
para try
para check
para push
# The two crashed jobs are chr19 and chr19_random.
# The chr19_random .fa file is almost all masked sequence,
# so its resulting .axt file is empty. chr19 itself is too big.
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#CPU time in finished jobs: 1908s 31.80m 0.53h 0.02d 0.000 y
#IO & Wait Time: 22178s 369.64m 6.16h 0.26d 0.001 y
#Average job time: 602s 10.04m 0.17h 0.01d
#Longest job: 1723s 28.72m 0.48h 0.02d
#Submission to last job: 1802s 30.03m 0.50h 0.02d
# To fix up the chr19 axtSort problem:
# sometimes alignments are so huge that they cause axtSort to run out
# of memory. Run them in two passes like this:
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.mm3
set base=/cluster/data/hg16/bed/blastz.mm3
set seq1_dir=/iscratch/i/gs.17/build34/bothMaskedNibs
set seq2_dir=/iscratch/i/mm3.RM030619/mixedNib/
foreach c (lav/chr19)
pushd $c
set chr=$c:t
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
foreach d (*.lav)
set smallout=$d.axt
lavToAxt $d $seq1_dir $seq2_dir stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
end
cat `ls -1 *.lav.axt | sort -g` > $base/$out
popd
end
# Remove the empty axtChrom/chr19_random.axt file to avoid future
# processing errors
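# i.e.:
rm -f axtChrom/chr19_random.axt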
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
mkdir -p pslChrom
set tbl = "blastzMm3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
# chr19 came along later
ssh kkr1u00
set tbl = "blastzMm3"
foreach f (axtChrom/chr19.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
set tbl = "blastzMm3"
cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# This takes 30 minutes to an hour
# and later chr19
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr19_${tbl}.psl
# create trackDb/human/hg16 and get a trackDb.ra file started with:
# track blastzMm3
# shortLabel Mouse Blastz
# longLabel Blastz All Mouse (Feb. 03) Alignments
# group compGeno
# priority 130
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno mm3
# otherDb mm3
# remake trackDb tables
# redo chr1 (featureBits shows 7% lower alignments than hg15)
# (DONE 2003-09-09 kate)
# blastz run ended prematurely -- .tmp files leftover, not moved to .out's
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
bash
. ./DEF
cd $BASE
mkdir run.chr1
# create job list for human chr1, with parasol output file validation
~angie/hummus/make-joblist $DEF | \
/cluster/bin/scripts/blastz-clusterjob.pl $BASE | \
grep 'run chr1.nib' | \
sed -e 's#^#/cluster/bin/penn/#' \
> $BASE/run.chr1/spec
grep 'chr1/' $BASE/xdir.sh > $BASE/xdir.chr1.sh
mv raw/chr1 raw/chr1.old
mkdir raw/chr1
sh xdir.chr1.sh
cd run.chr1
para create spec
# 2925 jobs
para try
para check
para push
# ... etc ...
ssh eieio
bash
cd /cluster/data/hg16/bed/blastz.mm3
. DEF
mv lav/chr1 lav/chr1.old
mkdir run.chr1.lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
| grep 'lav chr1 ' > run.chr1.lav/jobList
cd run.chr1.lav
wc -l jobList
# 25 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3/run.chr1.lav
para create jobList
para try
para check
para push
# etc.
# convert lav files to chrom axt
/cluster/bin/scripts/blastz-chromlav2axt /cluster/data/hg16/bed/blastz.mm3/lav/chr1 /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
mv pslChrom/chr1_blastzMm3.psl pslChrom/chr1_blastzMm3.psl.old
/cluster/bin/i386/axtToPsl axtChrom/chr1.axt S1.len S2.len \
pslChrom/chr1_blastzMm3.psl
# reload database table
ssh hgwdev
hgsql hg16 -e "drop table chr1_blastzMm3"
cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr1_blastzMm3.psl
# make chain
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
mv chain/chr1.chain chain/chr1.chain.old
mv out/chr1.out out/chr1.out.old
axtFilter -notQ=chrUn_random /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt | axtChain stdin \
/cluster/data/hg16/nib \
/cluster/data/mm3/mixedNib chain/chr1.chain > out/chr1.out
# sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mv all.chain all.chain.old
chainMergeSort run1/chain/*.chain > all.chain
mv chain chain.old
mkdir chain
chainSplit chain all.chain
# reload chr1 chain into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
hgLoadChain hg16 chr1_chainMm3 chr1.chain
# Loading 510456 chains into hg16.chr1_chainMm3
# make net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
cd chain
/cluster/bin/i386/chainPreNet chr1.chain /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../preNet/chr1.chain
cd ..
cd preNet
mv ../n1/chr1.net ../n1/chr1.net.old
/cluster/bin/i386/chainNet chr1.chain -minSpace=1 \
/cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../n1/chr1.net /dev/null
cd ..
cp hNoClass.net hNoClass.net.old
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netClass hNoClass.net hg16 mm3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
# rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
mv mouseSyn.net mouseSyn.net.old
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
# make tight subset of net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mv ../axtNet/chr1.axt ../axtNet/chr1.old.axt
netToAxt mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib \
/cluster/data/mm3.RM030619/mixedNib ../axtNet/chr1.axt
mv ../axtTight/chr1.axt ../axtTight/chr1.axt.old
cd ../axtNet
subsetAxt chr1.axt ../axtTight/chr1.axt \
/cluster/data/subsetAxt/coding.mat 3400
# translate to psl
cd ../axtTight
axtToPsl chr1.axt ../S1.len ../S2.len ../pslTight/chr1_blastzTightMm3.psl
# Load table into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/pslTight
hgLoadPsl -noTNameIx hg16 chr1_blastzTightMm3.psl
# $ featureBits -chrom=chr1 hg16 chr1_blastzTightMm3.psl
# 14052627 bases of 221562941 (6.342%) in intersection
# hg15: 13990547 bases of 218713898 (6.397%) in intersection
# make axtNet300
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netSplit mouse.net mouseNet
mv ../axtNet300/chr1.axt ../axtNet300/chr1.axt.old
netToAxt -maxGap=300 mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../axtNet300/chr1.axt
# create 2-way maf file for humor alignment
set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
cd /cluster/data/hg16
set mouseDir = bed/blastz.mm3/axtNet300
axtSort $mouseDir/chr1.axt $mouseDir/chr1.axt.sorted
mv $mouseDir/chr1.axt.sorted $mouseDir/chr1.axt
axtToMaf $mouseDir/chr1.axt \
/cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes \
$multizDir/maf/chr1.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/chr1.mm3.maf.unfixed > $multizDir/maf/chr1.mm3.maf
rm $multizDir/maf/chr1.mm3.maf.unfixed
# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2490523648, utime 15421 s/100, stime 3665
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
# make net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir mouseNet
netSplit mouse.net mouseNet
foreach n (mouseNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/data/mm3.RM030619/mixedNib \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
# MAKE BLASTZ BEST MOUSE MM3 (DONE - 2003-08-26 - Hiram)
# IMPORTANT NOTE - This axtBest process has been replaced by the
# chain to net to axt process. Note: the procedure below continues
# after the chain and nets have been produced.
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
# chr19 came along later:
cp -p chr19.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.mm3/S1.len \
/cluster/data/hg16/bed/blastz.mm3/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
# 41 jobs
head jobList
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
cd run.3
para create jobList
para try
para check
para push
# With the chr19 situation, went back and reran this step.
# For some unknown reason the first time it had 9 failed jobs:
# Completed: 32 of 41 jobs
# Crashed: 9 jobs
# CPU time in finished jobs: 827s 13.78m 0.23h 0.01d 0.000 y
# IO & Wait Time: 1299s 21.65m 0.36h 0.02d 0.000 y
# Average job time: 66s 1.11m 0.02h 0.00d
# Longest job: 361s 6.02m 0.10h 0.00d
# Submission to last job: 1195s 19.92m 0.33h 0.01d
# And then rerunning those 9 failed jobs, only chr19 failed:
# Completed: 8 of 9 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 748s 12.47m 0.21h 0.01d 0.000 y
# IO & Wait Time: 2290s 38.16m 0.64h 0.03d 0.000 y
# Average job time: 380s 6.33m 0.11h 0.00d
# Longest job: 1247s 20.78m 0.35h 0.01d
# Submission to last job: 1261s 21.02m 0.35h 0.01d
# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir mouseNet
netSplit mouse.net mouseNet
foreach n (mouseNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/data/mm3.RM030619/mixedNib \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtBest
cd /cluster/data/hg16/bed/blastz.mm3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area (DONE 2003-09-24 kate)
cd /cluster/data/hg16/bed/blastz.mm3/axtNet
gzip *.axt
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestMm3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm3.psl
echo "Done: ${c}_blastzBestMm3.psl"
end
# Load tables
ssh hgwdev
set base="/cluster/data/hg16/bed/blastz.mm3"
set tbl="blastzBestMm3"
cd $base/pslBest
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# check results
# the original axtBest stuff from the axtBest operation:
# featureBits hg16 blastzBestMm3
# 1027438291 bases of 2865248791 (35.859%) in intersection
# After going through the chain->net->axt operation:
# featureBits hg16 blastzBestMm3
# 991468768 bases of 2865248791 (34.603%) in intersection
# And finally after fixing a blastz execution problem on chr1:
# 1007362800 bases of 2865248791 (35.158%) in intersection
# featureBits hg15 blastzBestMm3
# 1035090465 bases of 2866466359 (36.110%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestMm3
cd /gbdb/hg16/axtBestMm3
foreach f (/cluster/data/hg16/bed/blastz.mm3/axtNet/chr*.axt)
ln -s $f .
end
cd /cluster/data/hg16/bed/blastz.mm3/axtNet
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestMm3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-08-25 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtNet
mkdir -p ../axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm3.psl
    # copy axt's to download area (DONE 2003-09-24 kate)
cd /cluster/data/hg16/bed/blastz.mm3/axtTight
gzip *.axt
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
# add README.txt file to dir, if needed
# CHAIN MOUSE BLASTZ (DONE 2003-08-28 - Hiram)
# Run axtChain on little cluster
ssh kkr1u00
mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.mm3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# axtFilter -notQ=chrUn_random $1 | axtChain stdin
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn_random $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/mm3.RM030619/mixedNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 31379s 522.98m 8.72h 0.36d 0.001 y
# IO & Wait Time: 10761s 179.35m 2.99h 0.12d 0.000 y
# Average job time: 1028s 17.13m 0.29h 0.01d
# Longest job: 10327s 172.12m 2.87h 0.12d
# Submission to last job: 10327s 172.12m 2.87h 0.12d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainMm3 $i
echo done $c
end
# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2490523648, utime 15421 s/100, stime 3665
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
# Add entries for net and chain to human/hg16 trackDb
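    # For reference, minimal entries along these lines would do (a sketch
    # only -- labels and ordering are illustrative, not the exact entries
    # committed to trackDb.ra; compare the blastzRn3 entry shown below):
    # track chainMm3
    # shortLabel Mouse Chain
    # longLabel Mouse (Feb. 2003/Mm3) Chained Alignments
    # group compGeno
    # visibility hide
    # type chain mm3
    # otherDb mm3
    #
    # track netMm3
    # shortLabel Mouse Net
    # longLabel Mouse (Feb. 2003/Mm3) Alignment Net
    # group compGeno
    # visibility hide
    # type netAlign mm3 chainMm3
    # otherDb mm3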
# MAKE HUMAN-MOUSE MM3 OVER.CHAIN FOR LIFTOVER (2004-07-09 kate)
ssh eieio
    set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
    # the nets used below must be uncompressed; the chains stay gzipped
    cd $chainDir/mouseNet
    gunzip *.gz
ssh kolossus
set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
cd $chainDir
mkdir subset
cat > makeSubset.csh << 'EOF'
set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
foreach f ($chainDir/chain/*.chain.gz)
set c = $f:t:r:r
echo subsetting $c
gunzip -c $f | netChainSubset $chainDir/mouseNet/$c.net stdin \
subset/$c.chain
end
'EOF'
# << for emacs
csh makeSubset.csh >&! makeSubset.log &
tail -100f makeSubset.log
cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain
# test reciprocal best on chr6 for ENr233
ssh kkstore
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
# renumber chains to assure unique ID's,
# since netting splits some (should redo the liftOver chain with new ID's)
# then sort by score for netter
mkdir uniqueSubset
chainMergeSort subset/chr6.chain > uniqueSubset/chr6.chain
mkdir swappedSubset
chainSwap uniqueSubset/chr6.chain swappedSubset/chr6.chain
mkdir recipBestTest
cd recipBestTest
chainSort ../uniqueSubset/chr6.chain stdout | \
chainNet stdin \
/cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes\
human.chr6.net mouse.chr6.net
netChainSubset mouse.chr6.net ../swappedSubset/chr6.chain stdout | \
chainSwap stdin chr6.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/recipBestTest
hgLoadChain hg16 rBestChainMm3 chr6.chain
    # didn't filter enough -- perhaps didn't rechain in the proper direction
# BLASTZ RAT (DONE - 2003-08-07 - Hiram)
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastz.rn3
cd /cluster/data/hg16/bed/blastz.rn3
cat << '_EOF_' > DEF
# rat vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/store4/gs.17/build34/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.rn3-hg16.$DS
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 39663 jobs
para try
para check
para push
# ... etc ...
# Completed: 41697 of 41697 jobs
# CPU time in finished jobs: 14155946s 235932.43m 3932.21h 163.84d 0.449 y
# IO & Wait Time: 1005629s 16760.49m 279.34h 11.64d 0.032 y
# Average job time: 364s 6.06m 0.10h 0.00d
# Longest job: 4310s 71.83m 1.20h 0.05d
# Submission to last job: 35086s 584.77m 9.75h 0.41d
# post-process blastz
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3/run.1
para create jobList
para try
para check
para push
# etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 6562s 109.37m 1.82h 0.08d 0.000 y
# IO & Wait Time: 154475s 2574.58m 42.91h 1.79d 0.005 y
# Average job time: 475s 7.92m 0.13h 0.01d
# Longest job: 924s 15.40m 0.26h 0.01d
# Submission to last job: 933s 15.55m 0.26h 0.01d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.rn3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.rn3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.rn3/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try
para check
para push
# ... etc ...
# The crashed job is again chr19_random
# Completed: 41 of 42 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1507s 25.12m 0.42h 0.02d 0.000 y
# IO & Wait Time: 17520s 292.00m 4.87h 0.20d 0.001 y
# Average job time: 464s 7.73m 0.13h 0.01d
# Longest job: 1214s 20.23m 0.34h 0.01d
# Submission to last job: 1214s 20.23m 0.34h 0.01d
# Remove the empty axtChrom/chr19_random.axt file to avoid future
# processing errors
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p pslChrom
set tbl = "blastzRn3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
# Load database tables
ssh hgwdev
set tbl = "blastzRn3"
cd /cluster/data/hg16/bed/blastz.rn3/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# This takes 30 minutes to an hour
# New entry in human/hg16/trackDb.ra
# track blastzRn3
# shortLabel Rat Blastz
# longLabel Merged Blastz Rat (June 03) Alignments
# group compGeno
# priority 142
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno rn3
# otherDb rn3
# MAKE BLASTZ BEST RAT RN3 (DONE - 2003-08-08 - Hiram - Redone 08-26)
# IMPORTANT NOTE - this axtBest process has been replaced by
# the chain -> net -> netToAxt process. So, after chains and
# nets have been created, pick up this best process below.
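#     In outline, the replacement pipeline is (a summary of the steps
#     documented in the mouse section above and repeated for rat below):
#       axtChain -> chainMergeSort -> chainSplit -> chainPreNet -> chainNet
#         -> netSyntenic -> netClass -> netSplit -> netToAxt -> axtToPsl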
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.rn3/S1.len \
/cluster/data/hg16/bed/blastz.rn3/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/pslBest/$(root1)_blastzBestRn3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
# 41 jobs
head jobList
cd /cluster/data/hg16/bed/blastz.rn3
cd run.3
para create jobList
para try
para check
para push
# 106 minutes, almost all I/O time:
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 2225s 37.09m 0.62h 0.03d 0.000 y
# IO & Wait Time: 36349s 605.81m 10.10h 0.42d 0.001 y
# Average job time: 941s 15.68m 0.26h 0.01d
# Longest job: 6415s 106.92m 1.78h 0.07d
# Submission to last job: 6417s 106.95m 1.78h 0.07d
# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
mkdir ratNet
netSplit rat.net ratNet
mkdir ../axtNet
foreach n (ratNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt -maxGap=25 ratNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/bluearc/rat/rn3/softNib \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
mkdir -p /cluster/data/hg16/bed/blastz.rn3/axtBest
cd /cluster/data/hg16/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area (DONE 2003-09-24 kate)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
gzip *.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestRn3.psl
echo "Done: ${c}_blastzBestRn3.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/pslBest
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestRn3.psl
# check results
# Via the netToAxt process:
# featureBits hg16 blastzBestRn3
# 976121391 bases of 2865248791 (34.068%) in intersection
# With the original axtBest process, before the netToAxt process:
# featureBits hg16 blastzBestRn3
# 1002119325 bases of 2865248791 (34.975%) in intersection
# Hg15 results:
# featureBits hg15 blastzBestRn3
# 992724355 bases of 2866466359 (34.632%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestRn3
cd /gbdb/hg16/axtBestRn3
ln -s /cluster/data/hg16/bed/blastz.rn3/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestRn3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
# Already done above. This table needs definition only once
# hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2003-08-26 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
mkdir -p ../axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightRn3.psl
# copy axt's to download area (DONE 2003-09-24 kate)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtTight
gzip *.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
cd /cluster/data/hg16/bed/blastz.rn3/axtTight
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
# add README.txt file to dir, if needed
# CHAIN RAT BLASTZ (DONE 2003-08-08 - Hiram)
# Run axtChain on little cluster
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.rn3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# axtFilter -notQ=chrUn_random $1 | axtChain stdin
cat << '_EOF_' > doChain
#!/bin/sh
axtFilter $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# With only 6 CPUs available:
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 21791s 363.19m 6.05h 0.25d 0.001 y
# IO & Wait Time: 12491s 208.18m 3.47h 0.14d 0.000 y
# Average job time: 857s 14.28m 0.24h 0.01d
# Longest job: 2724s 45.40m 0.76h 0.03d
# Submission to last job: 5875s 97.92m 1.63h 0.07d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
/cluster/bin/i386/chainMergeSort run1/chain/*.chain > all.chain
/cluster/bin/i386/chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainRn3 $i
echo done $c
end
# CREATE bigZips stuff for release (DONE 2003-08-01, 08-06, 08-08 - Hiram)
# make bigZips/mrna.zip (markd 8 aug 2003)
# on hgbeta:
cd /genbank
./bin/i386/gbGetSeqs -get=seq -db=hg16 -native genbank mrna download/hg16/bigZips/mrna.fa
zip download/hg16/bigZips/mrna.zip download/hg16/bigZips/mrna.fa
rm download/hg16/bigZips/mrna.fa
ssh hgwdev
    # These files have to be made in a different way because the refGene
    # table they derive from updates on a daily basis. - (DONE 2003-08-09 - Hiram)
cd /usr/local/apache/htdocs/goldenPath/hg16/bigZips
featureBits hg16 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
rm upstream1000.fa
featureBits hg16 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
rm upstream2000.fa
featureBits hg16 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
rm upstream5000.fa
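    # The three zips above could equally be produced with a loop, in the
    # style used for the humorMm3Rn3 upstream maf files below (a sketch):
    #   foreach i (1000 2000 5000)
    #     featureBits hg16 refGene:upstream:$i -fa=upstream$i.fa
    #     zip upstream$i.zip upstream$i.fa
    #     rm upstream$i.fa
    #   end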
# MAKING MOUSE AND RAT SYNTENY (MOUSE done 2003-09-16)(RAT Done 2003-08-28)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenyMm3
cd /cluster/data/hg16/bed/syntenyMm3
# Copy all the needed scripts from /cluster/data/hg15/bed/syntenyMouse
cp -p /cluster/data/hg15/bed/syntenyMouse/*.pl .
./syntenicBest.pl -db=hg16 -table=blastzBestMm3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg16 -table=blastzBestMm3
./synteny2bed.pl
hgLoadBed hg16 syntenyMm3 ucsc100k.bed
# And for the Rat, same thing, different directory:
mkdir ../syntenyRn3
cd ../syntenyRn3
../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestRn3
    # smooth.pl overwrites the genomeBest2phase file created by the
    # Mm3 run above. Runs quickly.
../syntenyMm3/smooth.pl
# joinsmallgaps.pl overwrites genomeBest3phase created above. Runs quickly.
../syntenyMm3/joinsmallgaps.pl
# fillgap.pl creates genomeBestFinal
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestRn3
# synteny2bed.pl creates ucsc100k.bed
../syntenyMm3/synteny2bed.pl
hgLoadBed hg16 syntenyRn3 ucsc100k.bed
# Loaded 1537 elements
# NET RAT BLASTZ (WORKING - 2003-08-11 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2511495168, utime 15658 s/100, stime 3383
    # The netClass operation requires an "ancientRepeat" table to exist
# in either hg16 or rn3. So, create the table:
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/ancientRepeat
cd /cluster/data/hg16/bed/ancientRepeat
# mysqldump needs write permission to this directory
# and you need to use your read/write enabled user with password
chmod 777 .
hgsqldump --all --tab=. hg15 ancientRepeat
chmod 775 .
hgsql hg16 < ancientRepeat.sql
mysqlimport -u<r/w user> -p<r/w pass> hg16 ancientRepeat.txt
# This is a hand curated table obtained from Arian.
# The mouse.net argument here should have been rat.net
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
/cluster/bin/i386/netClass hNoClass.net hg16 rn3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInRat \
-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
rm -r n1 hNoClass.net
# The arguments here should have been rat.net and ratSyn.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# The mouse.net argument here should have been rat.net from the
# netClass operation above.
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netRn3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetRn3 stdin
# Add entries for net and chain to human/hg16 trackDb
# MAKE HUMAN-RAT OVER.CHAIN FOR LIFTOVER (2004-07-09 kate)
ssh eieio
set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
cd $chainDir/ratNet
gunzip *.gz
ssh kolossus
cd /cluster/data/hg16/bed/liftOver
mkdir hg16Torn3
cd hg16Torn3
set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
mkdir subset
cat > makeSubset.csh << 'EOF'
set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
foreach f ($chainDir/chain/*.chain)
set c = $f:t:r:r
echo subsetting $c
netChainSubset $chainDir/ratNet/$c.net $f subset/$c.chain
end
'EOF'
# << for emacs
csh makeSubset.csh >&! makeSubset.log &
tail -100f makeSubset.log
cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Torn3.chain
# Make Known Genes Track
# This task has many steps and currently it is described by two documents:
# 1. makeProteins072003.doc
#    describes how the protein databases, biosql072003 and proteins072003,
#    were built
# 2. makeKgHg16.doc
#    describes how the Known Genes related database tables
#    were built for hg16. makeKgHg16.doc could be merged
#    with makeHg16.doc after minor editing of the format style.
# LIFTING REPEATMASKER .ALIGN FILES
# for this work, I had to delete some comments that were in the .align files.
# The edited files were
# NT_008046_01.fa.align (around line 10586)
# NT_008046_75.fa.align (around line 3320)
# The lines I deleted are:
#
# These elements can be clipped out with the options is_clip or is_only.
# The latter does not run the 'normal' RepeatMasker routine and positions in the current
# .out file will not correspond with the -is_only reconstructed sequence.
#
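# A scripted equivalent of those hand edits (a sketch only; the fixed
# strings below are assumptions taken from the deleted text quoted above,
# and the loop is meant to run in the directory holding the .align files):
#   foreach f (NT_008046_01.fa.align NT_008046_75.fa.align)
#     egrep -v 'is_clip or is_only|RepeatMasker routine|reconstructed sequence' \
#       $f > $f.clean
#     mv $f.clean $f
#   end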
foreach d (?{,?}/NT_??????)
set c=$d:t
cd $d
echo $c to $c.fa.align
/cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
cd ../..
end
foreach chr (?{,?})
cd $chr
echo making symbolic links for chr$chr NT .fa.align files
foreach ctg (NT_??????)
ln -s $ctg/$ctg.fa.align
end
cd ..
if (-e $chr/lift/ordered.lft) then
echo making $chr/chr$chr.fa.align
/cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
> $chr/chr$chr.fa.align
endif
if (-e $chr/lift/random.lft) then
echo making $chr/chr${chr}_random.fa.align
/cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
> $chr/chr${chr}_random.fa.align
endif
echo removing symbolic links for chr$chr NT .fa.align files
rm $chr/NT_??????.fa.align
end
# TWINSCAN 1.3 GENE PREDICTIONS (2003-12-12 braney)
cd /cluster/data/hg16/bed
rm -fr twinscan
mkdir twinscan.2003-12-12
ln -s twinscan.2003-12-12 twinscan
cd twinscan
tarFile=Hs-NCBI34-TS13-pseudo-masked.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/Hs-NCBI34-TS13-pseudo-masked.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
hgPepPred hg16 generic twinscanPep chr_ptx/chr*-fixed.fa
# RAW TWINSCAN 1.3 GENE PREDICTIONS, WITHOUT FILTERING OF PSEUDOGENES
# (2004-01-11 acs)
cd /cluster/data/hg16/bed
mkdir twinscan_raw.2004-01-11
    ln -s twinscan_raw.2004-01-11 twinscan_raw
cd twinscan_raw
tarFile=NCBI34_Hs_TS13_11_11_03.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/$tarFile
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene hg16 twinscan_raw chr_gtf/chr*.gtf -gtf
hgPepPred hg16 generic twinscanrawPep chr_ptx/chr*-fixed.fa
# LOAD GENEID GENES (DONE - 2003-09-02 - Hiram RELOADED -gtf 2004-04-02 kate)
mkdir -p /cluster/data/hg16/bed/geneid/download
cd /cluster/data/hg16/bed/geneid/download
# Now download *.gtf and *.prot from
set dir = genome.imim.es/genepredictions/H.sapiens/golden_path_200307/geneid_v1.1/
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un)
wget http://$dir/chr$c.gtf
wget http://$dir/chr${c}_random.gtf
wget http://$dir/chr$c.prot
wget http://$dir/chr${c}_random.prot
end
wget http://$dir/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
echo "done $f"
end
cd ..
ldHgGene hg16 geneid download/*.gtf -gtf
# Read 32255 transcripts in 281180 lines in 40 files
# 32255 groups 40 seqs 1 sources 3 feature types
# 32255 gene predictions
hgPepPred hg16 generic geneidPep download/*-fixed.prot
# QA NOTE: [ASZ 2007-10-02] sudo mytouch hg16 geneidPep 200404021400.00
# HUMAN/MOUSE/RAT ALIGNMENT USING HUMOR (MULTIZ) (IN PROGRESS 2003-08-29 kate)
# Multiple alignment with Mm3, Rn3
ssh eieio
# make mouse axtNet300
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseNet
mkdir -p ../../axtNet300
foreach f (chr*.net)
set c = $f:r
echo "mouse axtNet300 on $c"
netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../../axtNet300/$c.axt
end
# make rat axtNet300
cd /cluster/data/hg16/bed/blastz.rn3/axtChain/ratNet
mkdir -p ../../axtNet300
foreach f (chr*.net)
set c = $f:r
echo "rat axtNet300 on $c"
netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/rn3/nib ../../axtNet300/$c.axt
end
# create 2-way maf files
#set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
mkdir -p $multizDir/maf
cd /cluster/data/hg16
set mouseDir = bed/blastz.mm3/axtNet300
set ratDir = bed/blastz.rn3/axtNet300
foreach c (`cut -f 1 chrom.sizes`)
echo "making mouse mafs on $c"
# NOTE: this sort should probably be earlier in the pipeline
axtSort $mouseDir/$c.axt $mouseDir/$c.axt.sorted
mv $mouseDir/$c.axt.sorted $mouseDir/$c.axt
axtToMaf $mouseDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes $multizDir/maf/$c.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/$c.mm3.maf.unfixed > $multizDir/maf/$c.mm3.maf
echo "making rat mafs on $c"
axtSort $ratDir/$c.axt $ratDir/$c.axt.sorted
mv $ratDir/$c.axt.sorted $ratDir/$c.axt
axtToMaf $ratDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/rn3/chrom.sizes $multizDir/maf/$c.rn3.maf.unfixed -tPrefix=hg16. -qPrefix=rn3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/$c.rn3.maf.unfixed > $multizDir/maf/$c.rn3.maf
rm $multizDir/maf/*.unfixed
end
# copy maf files to bluearc for cluster run
set clusterDir = /cluster/bluearc/hg16/bed
mkdir $clusterDir/blastz.mm3/mafNet300
cp $multizDir/maf/*.mm3.maf $clusterDir/blastz.mm3/mafNet300
mkdir /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
cp $multizDir/maf/*.rn3.maf $clusterDir/blastz.rn3/mafNet300
# create scripts to run on cluster
# run "humor"
cd $multizDir
mkdir hmr
mkdir run
cd run
cat << EOF > doHumor.kk
/cluster/bin/penn/humor.v4 $clusterDir/blastz.mm3/mafNet300/\$1.mm3.maf $clusterDir/blastz.rn3/mafNet300/\$1.rn3.maf > $multizDir/hmr/\$1.hmr.maf
EOF
chmod +x doHumor.kk
cat << EOF > gsub
#LOOP
doHumor.kk \$(root1) {check out line+ $multizDir/hmr/\$(root1).hmr.maf}
#ENDLOOP
EOF
cd $clusterDir/blastz.mm3/mafNet300
# NOTE: probably want a better way to make the chrom list
ls *.maf | awk -F. '{print $1}' > $multizDir/run/chrom.list
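    # one simpler possibility (a sketch): derive the list directly from
    # chrom.sizes, which the maf-building loop above already iterates over:
    #   cut -f 1 /cluster/data/hg16/chrom.sizes > $multizDir/run/chrom.list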
cd $multizDir/run
gensub2 chrom.list single gsub jobList
# run jobs
ssh kkr9u01
#set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
cd $multizDir/run
para create jobList
para try
para check
para push
# longest job 27 minutes
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/humorMm3Rn3
cd /gbdb/hg16/humorMm3Rn3
foreach f ($multizDir/hmr/*.maf)
ln -s $f .
end
# load into database
    # cd $multizDir/hmr
/cluster/bin/i386/hgLoadMaf -warn hg16 humorMm3Rn3
# copy files to download area (2003-10-24 kate)
set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
mkdir -p $dir
cp -p /gbdb/hg16/humorMm3Rn3/*.maf $dir
cd $dir
gzip *
    # edit downloads page to add link to humorMm3Rn3
# add pairwise mafs to downloads page (2003-11-25 kate)
set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
mkdir $dir/{rn3,mm3}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
# Create upstream files (kent)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
echo hg16 mm3 rn3 > org.txt
foreach i (1000 2000 5000)
featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
mafFrags hg16 humorMm3Rn3 up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
# MAKING BLASTZ SELF (DONE - 2003-08-08 - Hiram)
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this human assembly as
# the lineage specific repeats for itself. Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz. In this case they are called notInHuman but they
# really mean InHuman. Yes, it is confusing, but that's just the
# nature of the game in this case.
ssh eieio
mkdir -p /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
cd /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
foreach f (../rmsk/*.fa.out)
set base = $f:t:r:r
echo $base.out.spec
ln -s $f $base.out.spec
end
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastzSelf
cd /cluster/data/hg16/bed/blastzSelf
cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/store4/gs.17/build34/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.hg16-hg16.$DS
ssh kk
cd /cluster/data/hg16/bed/blastzSelf
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
# 114921 j
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 114921 jobs
para try
para check
para push
# ... etc ...
# With some cluster difficulties, bluearc hangups, etc:
# Completed: 114921 of 114921 jobs
# CPU time in finished jobs: 19898031s 331633.85m 5527.23h 230.30d 0.631 y
# IO & Wait Time: 42606494s 710108.24m 11835.14h 493.13d 1.351 y
# Average job time: 544s 9.06m 0.15h 0.01d
# Longest job: 111877s 1864.62m 31.08h 1.29d
# Submission to last job: 344744s 5745.73m 95.76h 3.99d
# post-process blastz
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastzSelf/run.1
para create jobList
para try
para check
para push
# etc.
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 21101s 351.68m 5.86h 0.24d 0.001 y
#IO & Wait Time: 74915s 1248.58m 20.81h 0.87d 0.002 y
#Average job time: 283s 4.72m 0.08h 0.00d
#Longest job: 2028s 33.80m 0.56h 0.02d
#Submission to last job: 2993s 49.88m 0.83h 0.03d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastzSelf
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastzSelf/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/gs.17/build34/bothMaskedNibs
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastzSelf/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
cd /cluster/data/hg16/bed/blastzSelf/run.2
para create jobList
para try
para check
para push
    # We have two crashed jobs here. The chr7 and chr19 data are too
    # large for this processing step. Have to run those separately on
    # the file server eieio.
# Completed: 40 of 42 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 4737s 78.95m 1.32h 0.05d 0.000 y
# IO & Wait Time: 57154s 952.57m 15.88h 0.66d 0.002 y
# Average job time: 1547s 25.79m 0.43h 0.02d
# Longest job: 7969s 132.82m 2.21h 0.09d
# Submission to last job: 8029s 133.82m 2.23h 0.09d
# Fixup chr7 and chr19 by running them in two passes like this:
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
set base=/cluster/data/hg16/bed/blastzSelf
set seq1_dir=/cluster/data/hg16/nib
set seq2_dir=/cluster/data/hg16/nib
foreach c (lav/chr19 lav/chr7)
pushd $c
set chr=$c:t
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
foreach d (*.lav)
set smallout=$d.axt
lavToAxt $d $seq1_dir $seq2_dir stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
end
cat `ls -1 *.lav.axt | sort -g` > $base/$out
popd
end
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
    # Need to drop overlaps to eliminate diagonals.
    # axtDropOverlap seems to drop more than axtDropSelf above
    mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
    foreach f (axtChrom/chr*.axt)
	set c=$f:t:r
/cluster/bin/i386/axtDropOverlap axtChrom/$c.axt \
/cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/$c.axt
echo "Done: $c"
end
cd /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
gzip *.axt
    # Needed a delivery of these right away: (REMOVED 2005-01-27)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cd /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cp -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt.gz .
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastzSelf/pslChrom
cd /cluster/data/hg16/bed/blastzSelf
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
zcat /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
/cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
XXXX Pick this up tomorrow, 03-09-12 with pslChromDroppedFix
# Load database tables
ssh hgwdev
set tbl = "blastzSelf"
cd /cluster/data/hg16/bed/blastzSelf/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzSelf.psl
# This takes 30 minutes to an hour
    # create trackDb/human/hg16 and get a trackDb.ra file started,
    # then remake the trackDb tables
# PRODUCE FUGU BLAT ALIGNMENT (IN PROGRESS 2003-08-22 kate)
    # Use masked scaffolds from the fr1 assembly (same sequence as the
    # previous BlatFugu, but now repeat- and TRF-masked).
# NOTE: can't access /iscratch/i from fileserver
ssh kk
mkdir /cluster/data/hg16/bed/blatFr1
cd /cluster/data/hg16/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f (/cluster/data/hg16/?{,?}/NT_??????/NT_??????.fa)
set c=$f:t:r
echo $c
mkdir -p psl/$c
end
# special case for NG_002432
mkdir -p psl/NG_002432
# create cluster job
    mkdir run
    cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg16/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
gensub2 human.lst fugu.lst gsub spec
para create spec
# 283798 jobs
para try
para check
para push
para check
# cd psl
    # count files with alignments
    # find . -not -size 427c | wc -l
    #   89878
    # count files with no alignments
    # find . -size 427c | wc -l
    #   195265
# When cluster run is done, sort alignments
# into chrom directory
ssh eieio
cd /cluster/data/hg16/bed/blatFr1
pslCat -dir psl/N?_?????? | \
liftUp -type=.psl stdout \
/cluster/data/hg16/jkStuff/liftAll.lft warn stdin | \
pslSortAcc nohead chrom temp stdin
# 15 minutes ?
# Processed 855648 lines into 4 temp files
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg16/bed/blatFr1/chrom
rm -f chr*_blatFr1.psl
foreach i (chr?{,?}{,_random}.psl)
set r = $i:r
echo $r
mv $i ${r}_blatFr1.psl
end
# Next assembly, lift fugu scaffolds to Fugu browser chrUn,
# so you can link to other browser. And don't need to load sequence
# liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
hgLoadPsl -noTNameIx hg16 *.psl
# $ featureBits hg16 blatFr1 refGene:CDS
# 12787423 bases of 2865248791 (0.446%) in intersection
# $ featureBits hg15 blatFugu refGene:CDS
# 12427544 bases of 2866466359 (0.434%) in intersection
# Edit trackDb.ra to include blatFr1
# NOTE: already in top-level trackDb.ra
# Make fugu /gbdb/ symlink and load Fugu sequence data.
# NOTE: don't need to do this in next assembly
mkdir /gbdb/hg16/fuguSeq
cd /gbdb/hg16/fuguSeq
ln -s /cluster/data/fr1/fugu_v3.masked.fa
# hide .tab file
cd /cluster/store2/tmp
hgLoadSeq hg16 /gbdb/hg16/fuguSeq/fugu_v3.masked.fa
# MAKE BLASTZ BEST SELF (RE-DONE - 2003-08-28 - Hiram)
# Pick up on this process below after chain and nets have been
# done. This run.3 business is obsolete
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastzSelf/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastzSelf
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastzSelf/S1.len \
/cluster/data/hg16/bed/blastzSelf/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastzSelf/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
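# NOTE: the pslBest output names above mistakenly say blastzBestMm3
# rather than blastzBestSelf (copied from the mouse gsub); the error
# listing below reflects those names, and this run.3 approach was
# abandoned anyway in favor of the chain->net->netToAxt process.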
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
ssh kkr1u00
cd /cluster/data/hg16/bed/blastzSelf/run.3
para create jobList
para try
para check
para push
# Completed: 38 of 42 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 1884s 31.41m 0.52h 0.02d 0.000 y
# IO & Wait Time: 8421s 140.34m 2.34h 0.10d 0.000 y
# Average job time: 271s 4.52m 0.08h 0.00d
# Longest job: 2061s 34.35m 0.57h 0.02d
# Submission to last job: 2277s 37.95m 0.63h 0.03d
# Some of these files are getting too big for this operation
# We will have to get back to these via the chains, nets and a
# netToAxt trick
# Problems:
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr19.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr19_blastzBestMm3.psl is empty
# Out of memory - request size 1564 bytes
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr7.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr7_blastzBestMm3.psl is empty
# Out of memory - request size 634045604 bytes
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr1.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr1_blastzBestMm3.psl is empty
# Out of memory - request size 984185908 bytes
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr2.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr2_blastzBestMm3.psl is empty
# Out of memory - request size 973662824 bytes
# Here is the replacement process for the above sequence
# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
mkdir humanNet
mkdir ../axtNet
netSplit human.net humanNet
foreach n (humanNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt humanNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtBest
cd /cluster/data/hg16/bed/blastzSelf/axtBest
ln -s ../axtNet/chr*.axt .
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestSelf.psl
echo "Done: ${c}_blastzBestSelf.psl"
end
# Load tables
ssh hgwdev
set base="/cluster/data/hg16/bed/blastzSelf"
set tbl="blastzBestSelf"
cd $base/pslBest
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# check results
# After going through the chain->net->axt operation:
# featureBits hg16 blastzBestSelf
# 1388295977 bases of 2865248791 (48.453%) in intersection
# Hg15 doesn't have a BestSelf, gave this a try with the following
# result:
# featureBits hg15 blastzSelf
# Out of memory - request size 6 bytes
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestSelf
cd /gbdb/hg16/axtBestSelf
ln -s /cluster/data/hg16/bed/blastzSelf/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastzSelf/axtNet
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestSelf/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('hg16','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
# This table has already been created above
# hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKE BLASTZ BEST SELF (NOT NECESSARY - NOT USEFUL - NOT NEEDED - NOT DONE)
# MAKING CHAIN SELF BLASTZ (DONE - 2003-08-27 - Hiram)
# MAKING CHAIN SELF BLASTZ (RE-DONE - 2003-09-04 - Hiram)
# 2003-09-04 - with dropped overlap axtChrom
# Run axtChain on little cluster
ssh kkr1u00
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1
cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1
mkdir out chain
ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# The -notQ_random (new argument to axtFilter) will omit any
# *_random from the query.
cat << '_EOF_' > doChain
#!/bin/csh
~/bin/i386/axtFilter -notQ_random $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
# edit jobList and remove the first one that does chr19
# It is a job that would fail anyway after more than an
# hour of run time. It will be done separately below
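    # one way to make that edit non-interactively (a sketch):
    #   grep -v 'chr19\.axt' jobList > jobList.no19
    #   mv jobList.no19 jobList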
para create jobList
# 41 jobs
para try
para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 27107s 451.78m 7.53h 0.31d 0.001 y
# IO & Wait Time: 16236s 270.60m 4.51h 0.19d 0.001 y
# Average job time: 1057s 17.62m 0.29h 0.01d
# Longest job: 4989s 83.15m 1.39h 0.06d
# Submission to last job: 240988s 4016.47m 66.94h 2.79d
# The chr19 recovery process:
ssh kk
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19
cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19
cat << '_EOF_' > gsubQ
#LOOP
doChainQ.sh $(path2) $(path1) {check out line+ chain/$(root1).$(path2).chain} {check out line+ out/$(root1).$(path2).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChainQ.sh
#!/bin/sh
~/bin/i386/axtFilter -notQ_random -q=$1 $2 | axtChain stdin \
/cluster/store4/gs.17/build34/nib \
/cluster/store4/gs.17/build34/nib $3 > $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doChainQ.sh
# This is a mistake, this should have been chr19.axt only
ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst
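    # what input.lst should have been (chr19 only), per the note above:
    #   ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/chr19.axt > input.lst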
pushd /cluster/data/hg16
ls -d ?{,?} | sed -e "s/^/chr/" | grep -v chr19 \
> /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19/chrom19.lst
popd
mkdir out chain
gensub2 input.lst chrom19.lst gsubQ spec19
para create spec19
para try
para check
para push
    # ... etc ...
# Completed: 948 of 1050 jobs
# Crashed: 102 jobs
# CPU time in finished jobs: 45918s 765.30m 12.75h 0.53d 0.001 y
# IO & Wait Time: 1700328s 28338.80m 472.31h 19.68d 0.054 y
# Average job time: 1842s 30.70m 0.51h 0.02d
# Longest job: 13247s 220.78m 3.68h 0.15d
# Submission to last job: 13268s 221.13m 3.69h 0.15d
# the "crashed 102" jobs are empty chains.
# This mistakenly did them all, the input.lst should have been
# chr19 only.
# So, copy the chr19 results to the ../run1/chain result location
cp -p chain/chr19*.chain ../run1/chain
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainSelf $i
echo done $c
end
# DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
gzip chr*.chain
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
# fixup README file, request push
# NET SELF BLASTZ (RE-DONE 2003-09-09 - DONE - 2003-08-27 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/hg16/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
# Probably OK to make this minSpace=10, used to be 1
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=10 \
/cluster/data/hg16/chrom.sizes \
/cluster/data/hg16/chrom.sizes ../n1/$n /dev/null
end
# The above takes about 5 minutes
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 200167424, utime 2489 s/100, stime 161
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain
~/bin/i386/netClass hNoClass.net hg16 hg16 human.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netSelf stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 syntenyNetSelf stdin
# Add entries for net and chain to human/hg16 trackDb
# MAKING SELF AXTTIGHT FROM AXTCHROM (DONE - 2003-09-09 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChrom
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i /cluster/data/hg16/bed/blastzSelf/axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/90.mat 5000
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightSelf.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightSelf.psl
# MAKING SELF SYNTENY - Can be done after Best (NEEDS TO BE REDONE 2003-09-09)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenySelf
cd /cluster/data/hg16/bed/syntenySelf
# Use the scripts that were already copied to ../syntenyMm3
    # The first one takes 3 to 4 hours.
../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestSelf > synBest.out 2>&1
XXXX - Running 2003-08-27 21:32
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestSelf
../syntenyMm3/synteny2bed.pl
# Load results
hgLoadBed hg16 syntenySelf ucsc100k.bed
# SGP GENE PREDICTIONS vs Mm4 (DONE - 2003-12-30 - Hiram)
mkdir -p /cluster/data/hg16/bed/sgp_mm4/download
cd /cluster/data/hg16/bed/sgp_mm4/download
foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
set chr = $f:t:r
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.prot
end
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.gtf -O chrUn_random.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.prot -O chrUn_random.prot
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
    # since this is a reload of this table, updating the data
    # from Mm3 to Mm4. First check what is there:
# featureBits hg16 sgpGene
# 39781330 bases of 2865248791 (1.388%) in intersection
# now drop that table, and reload
hgsql -e "drop table sgpGene;" hg16
# This used to be done with -exon=CDS but it will do the same
# thing _AND_ add stop codons when done with -gtf, so do this
# with -gtf
ldHgGene -gtf hg16 sgpGene download/*.gtf
# Read 42880 transcripts in 322086 lines in 39 files
# 42880 groups 39 seqs 1 sources 3 feature types
# 42880 gene predictions
hgsql -e "drop table sgpPep;" hg16
hgPepPred hg16 generic sgpPep download/*-fixed.prot
# featureBits hg16 sgpGene
# 39698249 bases of 2865248791 (1.386%) in intersection
# featureBits hg15 sgpGene
# 40395614 bases of 2866466359 (1.409%) in intersection
# SGP GENE PREDICTIONS - Mm3 (DONE - 2003-09-14 - Hiram - to be verified)
mkdir -p /cluster/data/hg16/bed/sgp/download
cd /cluster/data/hg16/bed/sgp/download
foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
set chr = $f:t:r
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.prot
end
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.gtf -O chrUn_random.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.prot -O chrUn_random.prot
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene hg16 sgpGene download/*.gtf -exon=CDS
# Read 43109 transcripts in 323911 lines in 39 files
# 43109 groups 39 seqs 1 sources 3 feature types
# 43109 gene predictions
hgPepPred hg16 generic sgpPep download/*-fixed.prot
# featureBits hg16 sgpGene
# 39781330 bases of 2865248791 (1.388%) in intersection
# featureBits hg15 sgpGene
# 40395614 bases of 2866466359 (1.409%) in intersection
# SGP GENES (UPDATE 1/18/2006)
    # sgpPep table dropped, replaced by hgc generated protein seq in browser
# LOAD NCI60 (DONE: Fan 10/20/2003)
    ssh hgwdev
cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
mkdir hg16
cd hg16
findStanAlignments hg16 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg16.image.psl >& hg16.image.log
cp ../experimentOrder.txt ./
sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
egrep -v unknown hg16.image.psl > hg16.image.good.psl
stanToBedAndExpRecs hg16.image.good.psl hg16.nci60.exp hg16.nci60.bed `cat epo.txt`
hgsql hg16 < ../../scripts/nci60.sql
echo "load data local infile 'hg16.nci60.bed' into table nci60" | hgsql hg16
mkdir /cluster/store4/gs.17/build34/bed/nci60
mv hg16.nci60.bed /cluster/store4/gs.17/build34/bed/nci60
rm *.psl
# LOAD AFFYRATIO [GNF in progress jk Sept 19, 2003]
# LOAD AFFYRATIO U95Av2 sequences [DONE hartera Feb 2, 2004]
# Used consensus/exemplar sequences instead of target sequences
# LOAD AFFYRATIO [in progress, Feb 4, 2004]
# changed pslReps parameters as minAli = 0.97 was too stringent
    # Set up cluster job to align consensus/exemplars to hg16
ssh kkr1u00
cd /cluster/data/hg16/bed
rm -rf affyGnf.2004-02-04/
mkdir affyGnf.2004-02-04
cd affyGnf.2004-02-04/
mkdir -p /iscratch/i/affy
cp /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg16/bed/affyGnf.2004-02-04
ls -1 /iscratch/i/affy/HG-U95Av2_all.fa > affy.lst
ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# para time 2/4/04
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 8344s 139.06m 2.32h 0.10d 0.000 y
#IO & Wait Time: 2281s 38.02m 0.63h 0.03d 0.000 y
#Average job time: 22s 0.36m 0.01h 0.00d
#Longest job: 289s 4.82m 0.08h 0.00d
#Submission to last job: 388s 6.47m 0.11h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg16/bed/affyGnf.2004-02-04
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 affyU95.psl /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg16 affyRatio affyRatio.bed
# This affyU95 load was later changed to eliminate the long names
# hgLoadPsl hg16 affyU95.psl
# by the following:
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
hgLoadPsl hg16 -table=affyU95 affyU95shortQname.psl
# Clean up
rm -r psl tmp err affyRatio.bed affyRatio.exr bed.tab scores.tab *.debug batch.bak contig.psl raw.psl
# LOAD AffyUclaRatio [in progress jk Sept 19, 2003]
# LOAD AffyUclaRatio and AFFY U133A and U133B sequences [DONE hartera Feb 3, 2004]
# Used consensus/exemplar sequences instead of target sequences
# Set up cluster job to align consensus/exemplars to hg16
ssh kkr1u00
cd /cluster/data/hg16/bed
rm -rf affyUcla.2004-02-04/
mkdir affyUcla.2004-02-04
cd affyUcla.2004-02-04/
mkdir -p /iscratch/i/affy
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg16/bed/affyUcla.2004-02-04/
ls -1 /iscratch/i/affy/HG-U133AB_all.fa > affy.lst
ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# on 2/4/04:
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 23137s 385.61m 6.43h 0.27d 0.001 y
#IO & Wait Time: 23057s 384.29m 6.40h 0.27d 0.001 y
#Average job time: 94s 1.57m 0.03h 0.00d
#Longest job: 617s 10.28m 0.17h 0.01d
#Submission to last job: 747s 12.45m 0.21h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133.psl.
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133.psl ../../jkStuff/liftAll.lft warn contig.psl
# Merge with spot data and load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/affyUcla.2004-02-04/
# added code to hashPsls to process shorter Affy probe set names:
# it assumes that names have 2 colons, but when shortened to fit in the seq
# database, there is only 1.
# e.g. full name: "consensus:HG-U133A:212933_x_at;" short name: "HG-U133A:212933_x_at;"
affyUclaMergePslData -pslFile=affyU133.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/030602_ucla_normal_human_tissue_snapshot.txt -bedOut=affyUcla.bed -expRecordOut=affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames -toDiffFile=toDiff.txt
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyUcla.sql hg16 affyUcla affyUcla.bed
hgLoadPsl hg16 affyU133.psl
# Clean up
rm -r psl tmp err affyUcla.bed affyUcla.expRecords bed.tab *.debug batch.bak contig.psl raw.psl
# Add in sequence data for affyU95 and affyU133 tracks.
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa .
ln -s /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
# use perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
# in the HG-U95Av2_all.fa sequence file
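# e.g., a sketch (run against the source file the /gbdb symlink points to):
# perl -pi.bak -e 's/;/ /' HG-U95Av2_all.fa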
# reload sequences with "U95Av2" prefix removed so acc matches name used
# in other dependent tables for affyU95Av2 only
hgLoadSeq -abbr=U95Av2: hg16 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# QA repush 2006-02-08 seq/extFile to correct mismatched ID for affyU133 alignment data (Jen)
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set.
mkdir -p ~sugnet/store1/hg16/affyUcla
cd ~sugnet/store1/hg16/affyUcla
ssh kk
cd /cluster/store1/sugnet/hg16/affyUcla
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all ./
ls -1 /scratch/hg/gs.17/build34/trfFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
echo "HG-U133AB_all" > affy.lst
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Checking input files
# 491 jobs written to batch
# updated job database on disk
para push
# Wait until jobs run...
exit
pslSort dirs hg16.affyU133AB_all.psl tmp psl
# Lots of messages
# writing hg16.affyU133AB_all.psl
# Cleaning up temp files
wc hg16.affyU133AB_all.psl
# 60962 1280141 13677509 hg16.affyU133AB_all.psl
ls /cluster/data/hg16/jkStuff/liftAll.lft
# /cluster/data/hg16/jkStuff/liftAll.lft
liftUp hg16.affyU133AB_all.lifted.psl /cluster/data/hg16/jkStuff/liftAll.lft warn hg16.affyU133AB_all.psl
# Got 491 lifts in /cluster/data/hg16/jkStuff/liftAll.lft
# Lifting hg16.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 -nearTop=0.005 hg16.affyU133AB_all.lifted.psl hg16.affyU133AB_all.lifted.pslReps.psl out.psr
# Processing hg16.affyU133AB_all.lifted.psl to hg16.affyU133AB_all.lifted.pslReps.psl and out.psr
# Processed 60957 alignments
affyUclaMergePslData -pslFile=hg16.affyU133AB_all.lifted.pslReps.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt -bedOut=hg16.affyUcla.bed -expRecordOut=hg16.affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
# Reading psls from: hg16.affyU133AB_all.lifted.pslReps.psl
# Outputing beds:
# ............................................
# Freeing Memory.
# Done.
addUclaAnnotations.pl hg16.affyUcla.expRecords /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg16.affyUcla.annotations.expRecords
# Load the databases
cp ~/jk/hg/lib/affyRatio.sql ./
sed -e 's/affyRatio/affyUclaNorm/' < affyRatio.sql > affyUclaNorm.sql
# Just use the hgLoadBed program specifying sqlFile
hgLoadBed hg16 affyUclaNorm hg16.affyUcla.bed -sqlTable=affyUclaNorm.sql
# Reading hg16.affyUcla.bed
# Loaded 44446 elements of size 15
# Sorted
# Saving bed.tab
# Loading hg16
cp ~/jk/hg/lib/expRecord.sql ./
sed -e 's/expRecord/affyUclaNormExps/' < expRecord.sql > affyUclaNormExps.sql
hgFixedS -A < affyUclaNormExps.sql
echo "load data local infile 'hg16.affyUcla.annotations.expRecords' into table affyUclaNormExps" | hgFixedS -A
# Cleanup
rm HG-U133AB_all
# DO FAMILY BROWSER VERSIONS OF AFFYUCLANORMAL TRACK (In Progress -jk 3/2/2004)
# (This is suspended because GNF Gene Atlas data is available and public!)
# Create affyU133Orient table data
ssh eieio
cd /cluster/data/hg16/bed/affyUcla.2004-02-04
pslSortAcc nohead chrom temp affyU133.psl
rm -r temp
cd chrom
#This loop takes about 15 minutes
foreach i (*.psl)
polyInfo $i /cluster/data/hg16/nib/$i:r.nib \
/projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa \
$i:r.polyInfo
echo done $i
end
cat *.polyInfo > ../affyU133OrientInfo.bed
rm *.polyInfo
# Load orientation table data
ssh hgwdev
cd /cluster/data/hg16/bed/affyUcla.2004-02-04
sed 's/mrnaOrientInfo/affyU133OrientInfo/' \
$HOME/kent/src/hg/lib/mrnaOrientInfo.sql > affyU133OrientInfo.sql
hgLoadBed hg16 affyU133OrientInfo affyU133OrientInfo.bed \
-sqlTable=affyU133OrientInfo.sql > /dev/null
# Do clustering (this takes about 10 minutes to run)
clusterRna hg16 u133Cluster.bed /dev/null -noEst -noRefSeq -group=u133Group.tab -mRNAOrient=affyU133OrientInfo -rna=affyU133
# GNF ATLAS 2 [Done jk 3/29/2004]
# Align probes from GNF1H chip.
ssh kk
cd /cluster/data/hg16/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
mkdir -p /cluster/bluearc/geneAtlas2
cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.17/build34/trfFa/ > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
para time
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 10718s 178.63m 2.98h 0.12d 0.000 y
#IO & Wait Time: 1499s 24.99m 0.42h 0.02d 0.000 y
#Average job time: 25s 0.41m 0.01h 0.00d
#Longest job: 652s 10.87m 0.18h 0.01d
#Submission to last job: 723s 12.05m 0.20h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg16/bed/geneAtlas2
ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /gbdb/hgFixed/affyProbes
hgLoadPsl hg16 affyGnf1h.psl
hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUcla.2004-02-04/affyU133.psl | sed 's/exemplar://' \
| sed 's/consensus://' \
| sed 's/HG-U133A://' | sed 's/;//' > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl /cluster/data/hg16/bed/geneAtlas2/affyGnf1h.psl
# Note that the ~11,000 unmapped records are from all-N sequences.
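# A sanity check might look like this (a sketch; faCount reports
# per-sequence base counts, so all-N records have N count == length):
# faCount /cluster/bluearc/geneAtlas2/gnf1h.fa | awk '$7 == $2'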
hgLoadBed hg16 gnfAtlas2 gnfAtlas2.bed
# GENE BOUNDS (RNACLUSTER) (DONE 10-05-03 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo created but not checked in)
cd /cluster/store4/gs.17/build34/
# Create a list of accessions that come from RAGE libraries and need to
# be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg16 \
rage.libs
mkdir -p bed/rnaCluster/chrom
# Exclude accesions in the RAGE file
foreach f (?{,?}/chr*.fa)
set c = $f:t:r
set out = bed/rnaCluster/chrom/$c.bed
echo clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
end
cd bed/rnaCluster
hgLoadBed hg16 rnaCluster chrom/*.bed > /dev/null
# MAKE UNIGENE ALIGNMENTS (DONE - 2003-10-09 - Hiram)
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
# substitute XXX -> the uniGene version used by SAGE, if building the
# uniGene/SAGE track; or just the latest uniGene version in
# /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
# set Version = XXX
set Version = 162    # (bash: export Version=162)
cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
gunzip Hs.seq.uniq.gz Hs.data.gz
../countSeqsInCluster.pl Hs.data counts.tab
../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
# Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
ssh kkstore
set Version = 162 # same as above
mkdir -p /iscratch/i/uniGene.$Version
cp -p \
/projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
/iscratch/i/uniGene.$Version
ssh kkr1u00
~kent/bin/iSync
ssh kk
set Version = 162 # same as above
mkdir -p /cluster/data/hg16/bed/uniGene.$Version
cd /cluster/data/hg16/bed/uniGene.$Version
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > allctg.lst
ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 allctg.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Checking finished jobs
# Completed: 491 of 491 jobs
# CPU time in finished jobs: 39689s 661.49m 11.02h 0.46d 0.001 y
# IO & Wait Time: 38269s 637.81m 10.63h 0.44d 0.001 y
# Average job time: 159s 2.65m 0.04h 0.00d
# Longest job: 1805s 30.08m 0.50h 0.02d
# Submission to last job: 1972s 32.87m 0.55h 0.02d
# ssh eieio
set Version = 162 # same as above
cd /cluster/data/hg16/bed/uniGene.$Version
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
| pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg16.uniGene.lifted.pslReps.psl /dev/null
# use hg16.uniGene.lifted.pslReps.psl for building SAGE track (next).
# LOAD SAGE DATA (TBD)
ssh hgwdev
cd ~/kent/src/hg/sage
make
# XXX = uniGene build for which SAGE was built -- not necessarily current!
# Figure out the build number by peeking at this file:
wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
# Or, look at the contents of this directory:
ls /projects/cc/hg/sugnet/uniGene
# set Version = XXX
set Version=162
mkdir /projects/cc/hg/sugnet/sage/sage.$Version
cd /projects/cc/hg/sugnet/sage/sage.$Version
ncftp ftp://ftp.ncbi.nih.gov/pub/sage
mget -R map/readme.txt map/info.txt extr info map/Hs
quit
# That downloaded about 380 Mb of data
mkdir map
mv Hs map
cd map/Hs/NlaIII
unzip -j SAGEmap_tag_ug-rel.zip
cd ../../../extr/
../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
./SAGE_*
../../scripts/countsPerExp.pl expCounts.tab expList.tab
cd ../map/Hs/NlaIII/
cat << '_EOF_' > /tmp/t.pl
#!/usr/local/bin/perl
while (<>) {
chomp($_);
@p = split(/\t/, $_);
print "$p[2]\t$p[3]\t$p[0]\n";
}
'_EOF_'
chmod +x /tmp/t.pl
cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \
> SAGEmap_ug_tag-rel_Hs
cd ../../../extr
createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
tagExpArrays.tab sageSummary.sage
# Create the uniGene alignments
# /cluster/data/hg16/uniGene/hg16.uniGene.lifted.pslReps.psl
# -- see "MAKE UNIGENE ALIGNMENTS" above
# continuing from above, we are already in this extr directory
cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
addAveMedScoreToPsls \
/cluster/data/hg16/bed/uniGene.$Version/hg16.uniGene.lifted.pslReps.psl \
sageSummary.sage uniGene.wscores.bed
hgLoadBed hg16 uniGene_2 uniGene.wscores.bed
hgsql hg16 < ~kent/src/hg/lib/sage.sql
echo "load data local infile 'sageSummary.sage' into table sage" \
| hgsql hg16
cd ../info
../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab
hgsql hg16 < ~/kent/src/hg/lib/sageExp.sql
echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg16
# update ~/kent/src/hg/makeDb/trackDb/human/hg16/uniGene_2.html
# with current uniGene date.
# MAKING FOLDUTR TABLES (DONE - jk - 2003-10-14, REDONE jk 2004-04-07)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/rnaStruct
cd /cluster/data/hg16/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg16 knownGene utr3 utr3/utr.fa
utrFa hg16 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg16/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
#CPU time in finished jobs: 842416s 14040.26m 234.00h 9.75d 0.027 y
#IO & Wait Time: 78541s 1309.02m 21.82h 0.91d 0.002 y
#Average job time: 32s 0.53m 0.01h 0.00d
#Longest job: 3318s 55.30m 0.92h 0.04d
#Submission to last job: 4282s 71.37m 1.19h 0.05d
#
#Completed: 37250 of 37250 jobs
#CPU time in finished jobs: 1028594s 17143.24m 285.72h 11.91d 0.033 y
#IO & Wait Time: 125807s 2096.78m 34.95h 1.46d 0.004 y
#Average job time: 31s 0.52m 0.01h 0.00d
#Longest job: 3396s 56.60m 0.94h 0.04d
#Submission to last job: 4422s 73.70m 1.23h 0.05d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
#Completed: 25808 of 25808 jobs
#CPU time in finished jobs: 51700s 861.67m 14.36h 0.60d 0.002 y
#IO & Wait Time: 114430s 1907.16m 31.79h 1.32d 0.004 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest job: 1044s 17.40m 0.29h 0.01d
#Submission to last job: 1164s 19.40m 0.32h 0.01d
#
#Completed: 29770 of 29770 jobs
#CPU time in finished jobs: 100407s 1673.45m 27.89h 1.16d 0.003 y
#IO & Wait Time: 93019s 1550.32m 25.84h 1.08d 0.003 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest job: 2209s 36.82m 0.61h 0.03d
#Submission to last job: 2596s 43.27m 0.72h 0.03d
# Load database
ssh hgwdev
cd /cluster/data/hg16/bed/rnaStruct/utr5
hgLoadRnaFold hg16 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg16 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# TBA (Webb Miller's Threaded Blockset Aligner) Alignments (CFTR region) 2003-10-17 kate
# 9-way alignment: human, chimp, baboon, mouse, rat, dog, cat, cow, pig
# Using sequences from browser (human, mouse, rat), and from
# Elliot Margulies at NISC (via Webb)
# unrolled sequences and ran TBA in /cluster/data/nisc/targets/cftr/tba9Mammal
ssh kksilo
mkdir -p /cluster/data/hg16/bed/nisc/cftr
ln -s /cluster/data/nisc/targets/cftr/tba9Mammal/human.out \
/cluster/data/hg16/bed/nisc/cftr/tba9Mammal.maf
# setup external files for database reference
ssh hgwdev
set table = tba9MammalCFTR
mkdir -p /gbdb/hg16/$table
cd /gbdb/hg16/$table
ln -s /cluster/data/hg16/bed/nisc/cftr/tba9Mammal.maf tba.maf
# load into database
cd /cluster/data/hg16/bed/nisc/cftr
/cluster/bin/i386/hgLoadMaf -WARN hg16 $table
# TBA with Non-mammalian species included (Fugu & Chicken) 2003-10-20 kate
ssh hgwdev
ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/human.out \
/cluster/data/hg16/bed/nisc/cftr/tbaFishBird.maf
set table = tbaFishBirdCFTR
mkdir -p /gbdb/hg16/$table
cd /gbdb/hg16/$table
ln -s /cluster/data/hg16/bed/nisc/cftr/tbaFishBird.maf tba.maf
cd /cluster/data/hg16/bed/nisc/cftr
/cluster/bin/i386/hgLoadMaf -WARN hg16 $table
# 1072 warnings (mostly score=0's, a few negative scores)
# 4377 rows
# TBA 25-species CFTR region (DONE 2003-10-28 kate)
# run in /cluster/data/nisc/targets/cftr/25way, using makefile
ssh hgwdev
ln -s /cluster/data/nisc/targets/cftr/25way/human.maf \
/cluster/data/hg16/bed/nisc/cftr/tba25.maf
set table = tba25CFTR
mkdir -p /gbdb/hg16/$table
cd /gbdb/hg16/$table
ln -s /cluster/data/hg16/bed/nisc/cftr/tba25.maf tba.maf
cd /cluster/data/hg16/bed/nisc/cftr
/cluster/bin/i386/hgLoadMaf -WARN hg16 $table
# 22267 rows
# 24 warnings
# MAKE HG16-PANTRO1 MAF FOR MULTIZ/TBA (DONE 3/8/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
# use the combined blastz-blat reciprocal best human-pt0 chain, but
# assign unique IDs:
chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \
| chainMergeSort stdin \
| chainSplit pt0RBestChain stdin
# re-net with the new IDs:
mkdir pt0RBestNet
foreach f (pt0RBestChain/*.chain)
echo chaining $f
chainNet $f /cluster/data/hg16/chrom.sizes \
/cluster/data/pt0/scaffold.sizes pt0RBestNet/$f:t:r.net /dev/null
end
# Now lift chain to panTro1 coords:
mkdir rBestChain
foreach f (pt0RBestChain/*.chain)
liftUp -chainQ rBestChain/$f:t \
/cluster/data/panTro1/jkStuff/scaffolds.lft warn $f
end
# re-net with panTro1 coords (liftUp -netQ doesn't like - strand lifting):
mkdir rBestNet
foreach f (rBestChain/*.chain)
echo chaining $f
chainNet $f /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes rBestNet/$f:t:r.net /dev/null
end
# make axt and maf from the hg16-panTro1 net:
mkdir axtRBestNet mafRBestNet
foreach f (rBestNet/chr*.net)
set chr = $f:t:r
netToAxt $f rBestChain/$chr.chain /cluster/data/hg16/nib \
/cluster/data/panTro1/nib stdout \
| axtSort stdin axtRBestNet/$chr.axt
axtToMaf axtRBestNet/$chr.axt /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes mafRBestNet/$chr.maf \
-tPrefix=hg16. -qPrefix=panTro1.
end
# copy reciprocal net axt's for download (2004-10-04 kate)
cd axtRBestNet
gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
mkdir axtRBestNet
cd axtRBestNet
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet/*.gz .
md5sum *.gz > md5sum.txt
# load renumbered chains into database (2004-03-14 kate)
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_rBestChainPanTro1 $i
echo done $c
end
# save for download (2004-05-14 kate)
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
chainMergeSort -saveId *.chain > ../rBest.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
set dir = /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
mkdir -p $dir
cp -p ../rBest.chain $dir/human.best.chain
cd $dir
gzip *.chain
# copy README file
# load net into database (2004-03-14 kate)
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
cat rBestNet/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
netClass noClass.net hg16 panTro1 human.net
netFilter -chimpSyn human.net > rBest.net
hgLoadNet -warn hg16 rBestNetPanTro1 rBest.net
# EXPERIMENT: TBA WHOLE CHROM 5 SPECIES (DONE ENOUGH 3/8/04 angie)
# Put 2-ways in /cluster/bluearc
ssh eieio
mkdir /cluster/bluearc/hg16/tba
mkdir /cluster/bluearc/hg16/tba/{hp,hg}
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr*.maf \
/cluster/bluearc/hg16/tba/hp
# hg16-mm3 already in /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300
# hg16-rn3 already in /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
/cluster/bluearc/hg16/tba/hg
ssh kolossus
mkdir /cluster/data/hg16/bed/tbaExperiment
cd /cluster/data/hg16/bed/tbaExperiment
# tba needs to run multiz, so make sure they're in $PATH:
set path = (/cluster/bin/penn $path)
# Try just one chromosome:
set chr = chr16
# tba needs filenames to correspond to its tree input, so make links to
# maf and fasta:
rm -f human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf \
human
mafSort /cluster/bluearc/hg16/tba/hp/$chr.maf > human.chimp.maf
mafSort /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300/$chr.mm3.maf > \
human.mouse.maf
mafSort /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300/$chr.rn3.maf > \
human.rat.maf
mafSort /cluster/bluearc/hg16/tba/hg/$chr.hg.maf > human.chicken.maf
ln -s /cluster/data/hg16/?{,?}/$chr.fa human
tba "(((human chimp) (mouse rat)) chicken)" \
human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf
# Doh -- looks like tba wants *all* pairwise inputs, and how do we
# tell which rat-chicken alignments to include for a given human chr??
# The error that tba is dying with is this:
# pair2tb.v4: alignments of human out of order around 172596-175110
# ... even though inputs are sorted...? Oh well, clean up:
rm human*
rm -r /cluster/bluearc/hg16/tba/
# CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc. Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin
# BLASTZ CHICKEN (done, 11/3/2003, Adam)
# (adapted from BLASTZ mouse/rat, above)
# NOTE: this first time we're using the contigs that Terry has
# installed at /cluster/bluearc/gg0 (see fa and split100
# subdirectories). When we have an assembly, things should be able to
# proceed more as with mouse and rat
ssh kk
mkdir -p /cluster/data/hg16/bed/blastz.gg0
cd /cluster/data/hg16/bed/blastz.gg0
# first it looks like we need to run TRF on the contigs (realizing
# this on second time through!)
mkdir trf
cd trf
rm -rf jobList
foreach file (/cluster/bluearc/gg0/split100/*.fa)
set root=$file:t:r
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $file /dev/null -bedAt=/cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed -tempDir=/tmp" >> jobList
end
#(run jobList on cluster) -- took 2.5 min.
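# i.e., roughly (standard parasol usage assumed; jobList is in this trf dir):
# para create jobList
# para try
# para push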
# add new softmasking to reflect TRF output
mkdir /cluster/bluearc/gg0/split100_with_trf
rm -rf jobList
foreach file (/cluster/bluearc/gg0/split100/*.fa)
set root=$file:t:r
echo "/cluster/bin/i386/maskOutFa -softAdd $file /cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed /cluster/bluearc/gg0/split100_with_trf/${root}.fa" >> jobList
end
# (run jobList on cluster) -- took <1 min.
# now set up for BLASTZ (picking up with instructions above for
# mouse and rat)
cat << '_EOF_' > DEF
# chicken vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/penn/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Chicken
SEQ2_DIR=/cluster/bluearc/gg0/split100_with_trf
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=
SEQ2_LAP=
BASE=/cluster/store4/gs.17/build34/bed/blastz.gg0
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.gg0-hg16.$DS
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
para try
para check
para push
# ... etc ...
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs: 11426279s 190437.98m 3173.97h 132.25d 0.362 y
#IO & Wait Time: 212940s 3549.01m 59.15h 2.46d 0.007 y
#Average job time: 347s 5.78m 0.10h 0.00d
#Longest job: 4036s 67.27m 1.12h 0.05d
#Submission to last job: 16433s 273.88m 4.56h 0.19d
# post-process blastz
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0/run.1
para create jobList
para try
para check
para push
# etc.
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 8611s 143.52m 2.39h 0.10d 0.000 y
#IO & Wait Time: 106450s 1774.17m 29.57h 1.23d 0.003 y
#Average job time: 339s 5.66m 0.09h 0.00d
#Longest job: 456s 7.60m 0.13h 0.01d
#Submission to last job: 465s 7.75m 0.13h 0.01d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create custom version of blastz-chromlav2axt with -fa option,
# because nibs aren't available for chicken
cp /cluster/bin/scripts/blastz-chromlav2axt .
# (hand edit: add -fa option to call to lavToAxt)
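# A sketch of that hand edit (the script's internal call is assumed):
#   lavToAxt     <lav-file> <seq1-dir> <seq2-dir> <out.axt>   # before
#   lavToAxt -fa <lav-file> <seq1-dir> <seq2-dir> <out.axt>   # after
# (-fa says the query is a fasta file rather than a nib directory)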
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/store4/gs.17/build34/bed/blastz.gg0/run.2/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.gg0/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.gg0/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/fa/chicken_with_trf.fa
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.gg0/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try
para check
para push
# ... etc ...
#Completed: 39 of 42 jobs
#Crashed: 3 jobs
#CPU time in finished jobs: 32763s 546.05m 9.10h 0.38d 0.001 y
#IO & Wait Time: 48182s 803.03m 13.38h 0.56d 0.002 y
#Average job time: 2076s 34.59m 0.58h 0.02d
#Longest job: 5291s 88.18m 1.47h 0.06d
#Submission to last job: 5291s 88.18m 1.47h 0.06d
# The crashes are three of the "randoms" (chr8, 18, 19) -- parasol
# thinks they crashed because of 0-length output files
# This run took quite a bit longer than with mouse and rat, presumably
# because of the use of the fa file
# Remove the empty axtChrom/chr*_random.axt files to avoid future
# processing errors
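# e.g. (a sketch; the three randoms named in the crash report above):
# rm axtChrom/chr8_random.axt axtChrom/chr18_random.axt axtChrom/chr19_random.axt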
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.gg0
mkdir -p pslChrom
set tbl = "blastzGg0"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# (~5 minutes)
# Load database tables
ssh hgwdev
set tbl = "blastzGg0"
cd /cluster/data/hg16/bed/blastz.gg0/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# New entry in human/hg16/trackDb.ra
# track blastzGg0
# shortLabel Chicken Blastz
# longLabel Blastz Chicken (Gg0-contigs, 5.2x coverage)
# group compGeno
# priority 145.9
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno
# MAKE BLASTZ BEST CHICKEN (finished, Adam, 11/3/03)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastz.gg0/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.gg0/S1.len \
/cluster/data/hg16/bed/blastz.gg0/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
# NOTE: in a subsequent run, we have used -minScore=6000 and added
# the -matrix option to use HoxD55.q (need to add a line with gap
# penalties to the bottom of the score matrix file, e.g., "O =
# 400, E = 30"; see
# /cluster/data/hg16/bed/blastz.gg0/run.3.2003-11-11). These new
# options should be considered part of the standard procedure, at
# least for now.
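# A sketch of that modified invocation (the matrix filename here is an
# assumption -- HoxD55.q with the gap-penalty line appended):
# /cluster/bin/i386/axtBest in.axt chrN out.axt -minScore=6000 \
#     -matrix=HoxD55.withGaps.q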
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/pslBest/$(root1)_blastzBestGg0.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
cd /cluster/data/hg16/bed/blastz.gg0
cd run.3
para create jobList
para try
para check
para push
#Checking finished jobs
#Completed: 39 of 39 jobs
#CPU time in finished jobs: 1111s 18.52m 0.31h 0.01d 0.000 y
#IO & Wait Time: 7775s 129.58m 2.16h 0.09d 0.000 y
#Average job time: 228s 3.80m 0.06h 0.00d
#Longest job: 1375s 22.92m 0.38h 0.02d
#Submission to last job: 1375s 22.92m 0.38h 0.02d
# create human/chicken mafs
cd /cluster/data/hg16/bed/blastz.gg0
mkdir maf
foreach file (axtBest/*.axt)
set root=$file:t:r
echo $root
/cluster/bin/i386/axtToMaf $file S1.len S2.len maf/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=gg0.
/cluster/bin/scripts/fixmaf.pl < maf/${root}.maf.unfixed > maf/${root}.maf
end
# MULTIZ HUMAN/MOUSE/RAT/CHICKEN (Finished, Adam, 11/3)
# (chicken added to human/mouse/rat alignments described above [HUMOR])
ssh kk
mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0
mkdir hmrc
# wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz
# put the MAFs on bluearc
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hc
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr
cp /cluster/data/hg16/bed/blastz.gg0/maf/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hc
logout # back to kk
# set up joblist
rm -f jobList
foreach file (/cluster/bluearc/multiz.hg16mm3rn3gg0/hmr/*.maf)
set root=`echo $file:t:r | sed 's/\.hmr//'`
echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0/hc/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/${root}.maf" >> jobList
end
# (run on cluster) 41 jobs, ~10 min
# FIXME: maybe should run on the common denominator of the two
# sets, then copy over remaining MAFs (?) In this case, copied
# chr8_random and chr18_random from hmr
# clean up bluearc (these are big files!)
rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3Gg0
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/*.maf /gbdb/hg16/multizMm3Rn3Gg0
# load into database
# cd $multizDir/hmr/*.maf
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Gg0
# add dummy entry to dbDb so that name shows up as "Chicken"
echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName) values ("gg0", "November 2003", "", "Chicken", "", 0, 0, "Chicken", "Gallus gallus");' | hgsql -h genome-testdb hgcentraltest
# BLASTZ Mm4 (DONE - 2003-10-31 - Hiram)
ssh kk
mkdir -p /cluster/data/hg16/bed/blastz.mm4.2003-10-29
cd /cluster/data/hg16/bed
ln -s blastz.mm4.2003-10-29 blastz.mm4
cd blastz.mm4
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/gs.17/build34/rmsk
# FLAG not currently used
SEQ1_FLAG=-primate
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg16/bed/blastz.mm4
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 43390 of 43392 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 15770466s 262841.10m 4380.69h 182.53d 0.500 y
# IO & Wait Time: 626227s 10437.11m 173.95h 7.25d 0.020 y
# Average job time: 378s 6.30m 0.10h 0.00d
# Longest job: 8052s 134.20m 2.24h 0.09d
# Submission to last job: 45886s 764.77m 12.75h 0.53d
# the two crashed jobs:
# /cluster/home/angie/schwartzbin/blastz-run chr10.nib 40000001 50010000 chrX.nib 120000001 150000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# blastz: Illegal character '@' in sequence file.
# /cluster/home/angie/schwartzbin/blastz-run chr18.nib 1 10010000 chr15.nib 60000001 90000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# seq_read(/tmp/blastz.zstcGa/s1.fa): Input/output error
# unusual errors. Simply try them again and they work
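# i.e., just re-push; parasol retries the crashed jobs:
# para push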
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.mm4
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 15434s 257.23m 4.29h 0.18d 0.000 y
# IO & Wait Time: 2393s 39.89m 0.66h 0.03d 0.000 y
# Average job time: 53s 0.88m 0.01h 0.00d
# Longest job: 1128s 18.80m 0.31h 0.01d
# Submission to last job: 2561s 42.68m 0.71h 0.03d
# Third cluster run to convert lav's to axt's
source DEF
cd /cluster/data/hg16/bed/blastz.mm4
/cluster/data/mm4/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 38 of 42 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 1826s 30.44m 0.51h 0.02d 0.000 y
# IO & Wait Time: 9781s 163.01m 2.72h 0.11d 0.000 y
# Average job time: 305s 5.09m 0.08h 0.00d
# Longest job: 1489s 24.82m 0.41h 0.02d
# Submission to last job: 5125s 85.42m 1.42h 0.06d
# FAILED: chr1, chr19, chr19_random, chr5
# try these on kolossus
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm4/run.2
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr1 \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr1.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr19 \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr19_random \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19_random.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr5 \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr5.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
# about 26 minutes total time for those four
# chr19_random.axt is still empty, remove it to avoid errors later
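# e.g. (a sketch):
# rm -f /cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19_random.axt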
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4
mkdir -p pslChrom
set tbl = "blastzMm4"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzMm4.psl
# this is a 55 minute job
# featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
# memory. But if you reset your ~/.hg.conf to use the read-only
# user and contact the hgwdev host, and build featureBits as a
# x86_64 binary, you can run it on kolossus:
# featureBits hg16 blastzMm3
# 1050190071 bases of 2865248791 (36.653%) in intersection
# featureBits hg16 blastzMm4
# 1056761609 bases of 2865248791 (36.882%) in intersection
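# A minimal sketch of such a ~/.hg.conf (host field is real; user/password
# here are placeholders, not the actual read-only account):
# db.host=hgwdev
# db.user=readonlyuser
# db.password=XXXXXX
# then on kolossus, with an x86_64 featureBits build:
# featureBits hg16 blastzMm4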
# CHAIN Mm4 BLASTZ (DONE - 2003-11-03 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
# in this case, it was run on the kk kluster
ssh kkr1u00
mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtChain/run1
cd /cluster/data/hg16/bed/blastz.mm4/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.mm4/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ_random $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/mm4/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 24547s 409.12m 6.82h 0.28d 0.001 y
# IO & Wait Time: 3955s 65.91m 1.10h 0.05d 0.000 y
# Average job time: 695s 11.59m 0.19h 0.01d
# Longest job: 7336s 122.27m 2.04h 0.08d
# Submission to last job: 8251s 137.52m 2.29h 0.10d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 10m5.525s
# user 8m9.350s
# sys 0m48.450s
time chainSplit chain all.chain
# real 10m23.201s
# user 7m51.930s
# sys 0m53.910s
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainMm4 $i
echo done $c
end
# NET Mm4 (DONE - 2003-11-03 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../preNet/$i
end
# real 11m58.018s
# user 4m10.390s
# sys 2m10.780s
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2505211904, utime 15891 s/100, stime 3245
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
time netClass hNoClass.net hg16 mm4 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman
# real 14m2.042s
# user 10m6.450s
# sys 1m46.950s
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# real 9m44.445s
# user 6m42.660s
# sys 1m10.100s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm4 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm4 stdin
# real 12m53.070s
# user 6m6.540s
# sys 0m50.580s
# check results
# featureBits hg16 netMm4
# 2823565051 bases of 2865248791 (98.545%) in intersection
# featureBits hg16 netMm3
# 2834484276 bases of 2865248791 (98.926%) in intersection
# featureBits hg16 syntenyNetMm3
# 2804467412 bases of 2865248791 (97.879%) in intersection
# featureBits hg16 syntenyNetMm4
# 2786960572 bases of 2865248791 (97.268%) in intersection
# Add entries for net and chain to mouse/hg16 trackDb
# make net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 10m44.479s
# user 6m43.680s
# sys 1m20.860s
mkdir ../axtNet
foreach n (mouseNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/data/hg16/nib \
/cluster/data/mm4/nib \
../axtNet/$c.axt
echo "Complete: $c.net -> axtNet/$c.axt"
end
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtBest
cd /cluster/data/hg16/bed/blastz.mm4/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
gzip *.axt
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm4.psl
echo "Done: ${c}_blastzBestMm4.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslBest
time /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestMm4.psl
# real 10m47.853s
# user 2m48.700s
# sys 0m24.250s
# check results
# featureBits hg16 blastzBestMm4
# 996722004 bases of 2865248791 (34.787%) in intersection
# featureBits hg16 blastzBestMm3
# 1007362800 bases of 2865248791 (35.158%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestMm4
cd /gbdb/hg16/axtBestMm4
ln -s /cluster/data/hg16/bed/blastz.mm4/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestMm4/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg16 < axtInfoInserts.sql
# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-11-04 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
mkdir ../axtTight
tcsh
foreach i (*.axt)
echo subsetAxt $i ../axtTight/$i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm4.psl
# check results
# featureBits hg16 blastzTightMm4
# 162641577 bases of 2865248791 (5.676%) in intersection
# featureBits hg16 blastzTightMm3
# 164148288 bases of 2865248791 (5.729%) in intersection
# copy to axt's to download area
cd /cluster/data/hg16/bed/blastz.mm4/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
gzip *.axt
# add README.txt file to dir, if needed
# RUNNING AXTBEST (DONE 12/2/03 angie)
# Penn State complained of a loss in coverage when using axtNet instead
# of axtBest. So run axtBest for them, and axtToMaf in prep for multiz.
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
# I removed links from axtBest/* to axtNet/*
foreach f (axtChrom/chr*.axt)
set chr=$f:t:r
echo axtBesting $chr
axtBest $f $chr axtBest/$chr.axt -minScore=300
end
# As usual, ran out of mem on chr19, so use kolossus & 2 passes:
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
set chr = chr19
foreach d (lav/$chr/*.lav)
set smallout=$d.axt
lavToAxt $d /cluster/data/hg16/nib /cluster/data/mm4/nib stdout \
| axtSort stdin $smallout
axtBest $smallout $chr $smallout:r.axtBest
end
cat `ls -1 lav/$chr/*.axtBest | sort -g` \
> lav/$chr/$chr.axtBestPieces
axtBest lav/$chr/$chr.axtBestPieces $chr axtBest/$chr.axt
rm lav/$chr/*.axt*
# MAKE MAF FROM AXTBEST FOR PENN STATE (DONE 12/2/03 angie)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
mkdir mafBest
foreach f (axtBest/chr*.axt)
set maf = mafBest/$f:t:r.hm.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/hg16/chrom.sizes /cluster/data/mm4/chrom.sizes \
$maf -tPrefix=hg16. -qPrefix=mm4.
end
# MAKING MOUSE MM4 SYNTENY (DONE 2003-11-05 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenyMm4
cd /cluster/data/hg16/bed/syntenyMm4
# updating the scripts in use here from
# /cluster/data/hg16/bed/syntenyMm3
cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .
# fix the syntenicBest script to not try and work on empty
# results from its queries. Also, set the db and table name
# in the script itself so the arguments are not needed
./syntenicBest.pl
# on the order of 3 to 4 hours to complete syntenicBest
# almost no time, or only a few minutes at most for any of
# the rest
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
# set db and table name in fillgap.pl
./fillgap.pl
../syntenyMm3/synteny2bed.pl
hgLoadBed hg16 syntenyMm4 ucsc100k.bed
# featureBits hg16 syntenyMm3
# 2651945520 bases of 2865248791 (92.556%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection
# hgTracks.c needed to be updated to recognize syntenyMm4 so it
# would color properly.
# TIGR GENE INDEX (DONE 2004-05-20 Fan)
mkdir -p /cluster/data/hg16/bed/tigr
cd /cluster/data/hg16/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_hg16_05-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
foreach o (mouse cow human pig rat)
echo $o
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ssh hgwdev
cd /cluster/data/hg16/bed/tigr
hgsql hg16 -e "drop table tigrGeneIndex"
hgsql hg16 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
echo Processing $f ...
/cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg16 tigrGeneIndex $f
hgsql hg16 -e "select count(*) from tigrGeneIndex"
end
# Total of 354491 entries created in tigrGeneIndex table.
hgsql hg16 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql hg16 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords hg16 tigrGeneIndex
gzip *.gff *TCs
# LOAD VEGA GENES AND PSEUDOGENES (DONE 2003-11-11 braney )
#####
##### WARNING: vega procedure changed, use process later in file
#####
mkdir ~/hg16/bed/vega
cd ~/hg16/bed/vega
wget "http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_core_4_0.gtf.gz"
gunzip vega_homo_sapiens_core_4_0.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks
awk '$2 != "Pseudogene" && $2 != "Ig_Pseudogene_Segment" && $2 != "Ig_Segment" {print "chr"$0}' \
vega_homo_sapiens_core_4_0.gtf > vega_fixed.gtf
awk '$2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment" {print "chr"$0}' \
vega_homo_sapiens_core_4_0.gtf > vega_pseudo.gtf
ldHgGene hg16 vegaGene vega_fixed.gtf -gtf
ldHgGene hg16 vegaPseudoGene vega_pseudo.gtf -gtf
wget "http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz"
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
echo "load data local infile 'vegaInfo.tab' into table vegaInfo" | hgsql hg16
# Set cdsStart and cdsEnd to 0 if method is Novel_Transcript
foreach ntname (`echo 'select name from vegaGene,vegaInfo \
where vegaGene.name = vegaInfo.transcriptId AND \
vegaInfo.method = "Novel_Transcript"' \
| hgsql -N hg16`)
echo "update vegaGene set cdsStart = 0 where name = '$ntname'" \
| hgsql hg16
echo "update vegaGene set cdsEnd = 0 where name = '$ntname'" \
| hgsql hg16
end
# LOAD FIRSTEF TRACK Done 2003-07-31 braney
# Create firstEF track from Zhang lab at CSHL
# contacts
# Gengxin Chen <cheng@cshl.edu>
# Ivo Grosse <grosse@ipk-gatersleben.de>
# Michael Zhang <mzhang@cshl.edu>
mkdir /cluster/data/hg16/bed/firstEF
cd /cluster/data/hg16/bed/firstEF
# Got firstEF.txt from Gengxin 7/30/03
hgLoadBed hg16 firstEF firstEF.txt
# Load chicken sequence loaded & processed by booch & acs (2003-11-4 kate)
hgLoadSeq hg16 /gbdb/gg0/chicken.fa
# 73234 sequences
# LOAD ENSEMBL GENES (DONE 2003-11-07 angie)
mkdir /cluster/data/hg16/bed/ensembl
cd /cluster/data/hg16/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
> ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg16 ensGene \
/cluster/data/hg16/bed/ensembl/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg16 ensembl ensemblPep.fa
# LOAD GENOMIC DUPES (DONE - 2003-11-11 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg16/bed/genomicDups
cd /cluster/data/hg16/bed/genomicDups
# pick up Build34GenomicDups.gz from
# http://humanparalogy.cwru.edu/build34/files_for_ucsc/build34_ucsc.htm
# it has a user and password login. you can use this wget command
# with the user/password:
wget --http-user=X --http-passwd=X \
"http://humanparalogy.cwru.edu/build34/files_for_ucsc/Build34GenomicDups.gz"
gunzip *.gz
# awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
hgsql hg16 < ~/kent/src/hg/lib/genomicDups.sql
hgLoadBed hg16 -oldTable genomicDups Build34GenomicDups
# load of genomicDups did not go as planned: 57702 record(s), 0 row(s) skipped, 57702 warning(s) loading bed.tab
# There was an error in this data delivery. To fixup:
hgsql -e \
'update genomicDups set name = concat(otherChrom,":",otherStart);' \
hg16
# LOAD CHIMP NET (2003-11-20 kate)
# NOTE: Net preparation doc'ed in makePt0.doc
ssh hgwdev
cd /cluster/data/pt0/bed/blastz.hg16/axtChain
netFilter -minGap=10 chimp.net | hgLoadNet hg16 netPt0 stdin
netFilter -minGap=10 chimpSyn.net | hgLoadNet hg16 syntenyNetPt0 stdin
# CHIMP BEST CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
# NOTE: start with scaffold-based human-reference reciprocal best chains
# doc'ed in makePt0.doc, then lift using scaffold lift file in panTro1
# NOTENOTENOTE: Angie redid this with chain renumbering
ssh kksilo
mkdir -p /cluster/data/hg16/bed/blastz-blat.panTro1
cd /cluster/data/hg16/bed/blastz-blat.panTro1
liftUp -chainQ best.chain \
/cluster/data/panTro1/jkStuff/scaffolds.lft \
warn /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain
chainSplit bestChain best.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1/bestChain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg16 ${c}_bestChainPanTro1 $i
end
# CHIMP ALL CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1
liftUp -chainQ all.chain \
/cluster/data/panTro1/jkStuff/scaffolds.lft \
warn /cluster/data/pt0/bed/blastz-blatHg16/all.chain
chainSplit chain all.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainPanTro1 $i
echo done $c
end
# CHIMP RECIPROCAL BEST NET, IN CHROMOSOME COORDS (kate)
# Redo the netting on chrom-based chain files
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
~/bin/x86_64/chainNet all.chain -minSpace=10 \
/cluster/data/hg16/chrom.sizes /cluster/data/panTro1/chrom.sizes \
human.net chimp.net
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1
chainSwap all.chain all.swap.chain
~/bin/i386/netChainSubset chimp.net all.swap.chain stdout | \
chainSort stdin chimpNet.chain
# UPDATE WOODY BINARIES (see PHYLOHMM CONSERVATION entries below)
# done, acs, 2003-11-19
ssh hgwdev
cd /cluster/data/woody # better place? don't have permission in /cluster/install
cvs update -dP
cd src
make
# make sure Makefile has INSTALLDIR = /cluster/bin/woody
make install
# CFTR PHYLOHMM CONSERVATION
# done, acs, 2003-11-19 (currently using 9-way alignment)
# NOTE: essentially the same procedure applies for any Zoo or ENCODE
# target, as long as a suitable tree topology is available for the
# species in question (when distant species are included, e.g.,
# chicken and fish, the branch-length estimation procedure may need to
# be adapted slightly -- details to come)
ssh hgwdev
# (update woody binaries, if necessary -- see above)
# make sure /cluster/bin/penn/tbaBin and /cluster/bin/woody in path
mkdir -p /cluster/data/nisc/targets/cftr/phyloHMMcons
cd /cluster/data/nisc/targets/cftr/phyloHMMcons
# extract sufficient stats for phylog. analysis from MAF file
CFTR_START=115365025 # boundaries of CFTR region in hg16 coords
CFTR_END=117242450 # (these don't have to be perfect)
maf_project /cluster/data/nisc/targets/cftr/tba9way.maf /cluster/data/nisc/targets/cftr/tba9Mammal/human > cftr9_humanref.maf
msa_view cftr9_humanref.maf -i MAF -o SS -s ${CFTR_START} -e ${CFTR_END} -r 1 -O hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog > cftr9.ss
head cftr9.ss
#NSEQS = 9
#LENGTH = 2063003
#TUPLE_SIZE = 1
#NTUPLES = 57302
#NAMES = hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog
#ALPHABET = ACGTN
#IDX_OFFSET = 115365024
#NCATS = -1
#
#0 C-------- 26480
# fit a phylogenetic model to the data, with rate variation
echo "((((1,2),3),(4,5)),((6,7),(8,9)))" > cftr9.nh
# (indexes refer to sequences in the order of the NAMES line in
# the *.ss file)
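# (e.g., here 1=hg16, 2=chimp, 3=baboon, 4=mm3, 5=rn3, 6=cow,
# 7=pig, 8=cat, 9=dog, per the NAMES line shown above)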
fit_tree_model -m cftr9.ss -i SS -t cftr9.nh -s REV -o cftr9_rev -E -l fit.log -k 5 -a 4.8 -T -p MED
# (takes about 5 min. Watch log file for convergence --
# single lines correspond to outer maximization algorithm,
# interleaved sets of lines correspond to inner maximization
# algorithms [see http://www.cse.ucsc.edu/~acs/Siepel-03-0304.pdf
# for background])
# Note: k=5 is adequate for a good estimate of the alpha
# parameter, even though we'll use k=10 in the next step. The -a
# argument just provides a reasonable starting value for alpha, to
# speed convergence
# (check estimated branch lengths to be sure they make sense)
cat cftr9_rev.nh
#((((hg16:0.005601,chimp:0.005707):0.019356,baboon:0.034458):0.080743,(mm3:0.072487,rn3:0.079445):0.287368):0.035643,((cow:0.107791,pig:0.102431):0.040419,(cat:0.074444,dog:0.104476):0.053251):0.035643);
# (small deviations from one data set to the next are normal)
# you can also do "draw_tree cftr9_rev.nh > cftr9_rev.ps" to get a
# simple postscript rendering of the tree. Zero- or
# near-zero-length branches usually indicate a problem, e.g.,
# incorrect topology
# (also check cftr9_rev.mod; look in particular at ALPHA)
cat cftr9_rev.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 5
#ALPHA: 4.778715
#TRAINING_LNL: -6471907.615171
#BACKGROUND: 0.304536 0.191156 0.191907 0.312401
#RATE_MAT:
# -0.848833 0.150792 0.552489 0.145552
# 0.240232 -1.259134 0.166198 0.852704
# 0.876738 0.165547 -1.285792 0.243507
# 0.141887 0.521764 0.149586 -0.813238
#TREE: ((((1:0.005601,2:0.005707):0.019356,3:0.034458):0.080743,(4:0.072487,5:0.079445):0.287368):0.035643,((6:0.107791,7:0.102431):0.040419,(8:0.074444,9:0.104476):0.053251):0.035643);
# now compute the posterior probabilities of interest, according to
# a phylo-HMM
label -m cftr9.ss -d cftr9_rev.mod -i SS -o cftr9 -k 10 -L 0.9 -A -p 0 -j 1 -x -s chr7
# (takes 12 min)
# (check postprob file)
wc cftr9.postprob
#1752168 3504336 31539024 cftr9.postprob
head cftr9.postprob
#115370785 0.0664
#115370786 0.0583
#115370787 0.0448
#115370788 0.0271
#115370789 0.0217
#115370790 0.0232
#115370791 0.0331
#115370792 0.0396
#115370793 0.0417
#115370794 0.0557
# load as a (Hiramesque) wiggle track
cd /cluster/data/nisc/targets/cftr/phyloHMMcons
zcat cftr9.postprob.gz | wigAsciiToBinary -chrom=chr7 -binsize=1024 \
-dataSpan=1 -wibFile=chr7_phyloHMMcons_CFTR -name=cftr9 stdin
rm -r /gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib
ln -s \
/cluster/data/nisc/targets/cftr/phyloHMMcons/chr7_phyloHMMcons_CFTR.wib \
/gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib
hgLoadWiggle hg16 chr7_phyloHMMcons_CFTR chr7_phyloHMMcons_CFTR.wig
chmod 664 chr7_phyloHMMcons_CFTR.wib
chmod 775 .
# add trackDb.ra entry, e.g.,
#track phyloHMMcons_CFTR
#shortLabel phyloHMMcons CFTR
#longLabel phylo-HMM-based conservation, CFTR (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScaleDefault Off
# adapt HTML for details page, if necessary (e.g., copy an existing
# phyloHMMcons*.html page to phyloHMMcons_CFTR.html, edit to
# reflect data set, do "make update", don't forget to cvs add and
# commit)
# cleanup
rm cftr9.ss cftr9_humanref.maf # easy to regenerate
gzip cftr9.postprob
# CFTR PHYLOHMM CONSERVATION, 25-way alignment
# done, acs, 2003-11-21
# This can be done exactly as above for the 9-way alignment, except
# that the tree estimation procedure has to be adjusted to circumvent
# the problem that the distant species align only in conserved regions
# (so that a tree estimated from the whole data set will have
# disproportionally short branches to and among these species). The
# procedure I've used is semi-manual and somewhat ad hoc, but I'll
# record the main steps here for completeness. I'll only cover the
# tree estimation procedure (running 'label' and loading the track is
# the same as before).
ssh hgwdev
mkdir /cluster/data/nisc/targets/cftr/phyloHMMcons25
cd /cluster/data/nisc/targets/cftr/phyloHMMcons25
# extract sufficient statistics for two data sets: all sites for
# mammals and sites in 3rd codon positions for all species. I'm
# not including platypus with the mammals (it's technically a
# mammal, but a monotreme, and quite distant) because it seems to
# align mostly in conserved regions
maf_project /cluster/data/nisc/targets/cftr/25way/tba.maf /cluster/data/nisc/targets/cftr/25way/human > cftr25_humanref.maf
setenv CFTR_START 115365025
setenv CFTR_END 117242450
setenv SPEC_ORDER hg16,chimp,orangutan,baboon,macaque,vervet,lemur,rabbit,rn3,mm3,cow,pig,horse,cat,dog,ajbat,cpbat,hedgehog,opossum,dunnart,platypus,chicken,zfish,tetra,fr1
msa_view cftr25_humanref.maf -i MAF -o SS -s $CFTR_START -e $CFTR_END -r 1 -O $SPEC_ORDER > cftr25.ss
# whole data set, ordered suff stats -- use this for 'label'
msa_view cftr25.ss -i SS -o SS -z -l 21,22,23,24,25 -x > cftr20.ss
# exclude non-mammals (plus platypus)
/bin/echo -e 'NCATS = 3\ncds 1-3' > cats.cm # category map for cds sites
/cluster/home/acs/woody/scripts/refFlat2gff.pl -S -P -A hg16 -w 'chrom="chr7" and cdsStart > 115365025 and cdsEnd < 117242450' | sed 's/chr7/hg16/' | egrep -v 'NM_152829|NM_001233|NM_018412' > cftr.gff
# gets refseq annotations for this region as a gff; the egrep
# explicitly removes some duplicate entries (should have a
# better way of doing this); the sed changes the seq name so
# that msa_view recognizes it's the same as the name in the
# alignment
msa_view cftr25_humanref.maf -i MAF -o SS -z -c cats.cm -g cftr.gff -O $SPEC_ORDER > cftr25.3.ss
# now fit a tree model to each data set
echo "((((((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20)),21),22),(23,(24,25)))" > cftr25.nh
fit_tree_model -m cftr25.3.ss -C 3 -i SS -t cftr25.nh -s REV -o cftr25 -E -l cftr25.3.log -T -p MED -k 5 -a 1.8
# (this next one may take an hour or two -- run it on a fast
# workstation or be sure to nice if on hgwdev; you can speed it up
# by giving it a good starting *.mod file based on the above [-M option])
echo "(((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20))" > cftr20.nh
fit_tree_model -m cftr20.ss -i SS -t cftr20.nh -s REV -o cftr20 -E -l cftr20.log -T -p MED -k 5 -a 4
cp cftr20.mod cftr25_hybrid.mod
# Now edit cftr25_hybrid.mod by hand. Copy the tail end of the
# TREE line from cftr25.3.mod, corresponding to all nodes and
# branches outside of the clade for the non-monotreme mammals, and
# append it to the TREE line in cftr25_hybrid.mod (adjusting
# parens as necessary). Then multiply each one of these new
# branch lengths by a factor of 1.2 (computed as the sum of all
# branch lengths in cftr25.mod divided by the sum of the
# corresponding branch lengths in cftr25.3.mod). The resulting
# tree is well supported within the clade of non-monotreme mammals and
# includes a reasonable approximation of the non-mammal branch
# lengths. Proceed with 'label' using cftr25_hybrid.mod.
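# (hypothetical helpers for the manual edit above -- a sketch only,
# assuming branch lengths always follow a ':' on the TREE line as in
# the *.mod files shown earlier; 'nonmammal_piece.txt' is an
# illustrative name for a file holding just the copied tail of the
# TREE line)
# sum the branch lengths on a TREE line, for computing the scale factor:
perl -wne 'if (/^TREE:/) { my $s = 0; $s += $1 while /:(\d+\.\d+)/g; print "$s\n" }' cftr20.mod
# scale every branch length in the copied piece by 1.2:
perl -wpe 's/:(\d+\.\d+)/sprintf(":%.6f", 1.2*$1)/ge' nonmammal_piece.txt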
# cleanup
rm cftr25_humanref.maf cftr*.ss # easy to regenerate
# HMR PHYLOHMM CONSERVATION
# (started, acs, 2003-11-11, finished 11-19)
ssh hgwdev
# (update woody binaries, if necessary -- see above)
# (also, make sure /cluster/bin/woody in path)
mkdir /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
# estimate a phylog. model using the entire genome-wide alignments
# first extract sufficient statistics by chromosome
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
foreach file (/cluster/data/hg16/bed/humor/hmr/*.maf)
set prefix = $file:t:r
msa_view -i MAF $file -o SS -z -O hg16,mm3,rn3 > $prefix.ss
end
logout
# NOTE: may be worth doing the above as a small cluster job instead
# (put the mafs on bluearc -- end up doing this below anyway)
# now combine suff stats across chromosomes
# (back on hgwdev)
ls chr*.ss > files
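# (the '*files' argument apparently tells msa_view to read its list
# of input filenames from ./files, written by the ls above)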
msa_view -i SS -o SS -A hg16,mm3,rn3 '*files' > all.ss
# estimate the model (very fast, now that suff stats are avail)
echo "(1,(2,3));" > tree.nh
fit_tree_model -i SS -m all.ss -t tree.nh -s REV -k 10 -o rev_dg
cat rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.428803
#TRAINING_LNL: -448054115.568696
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
# -0.891523 0.166770 0.574850 0.149902
# 0.223389 -1.146311 0.153784 0.769137
# 0.769591 0.153699 -1.147159 0.223869
# 0.149605 0.573055 0.166888 -0.889548
#TREE: (1:0.192598,(2:0.076303,3:0.083043):0.192598);
# now, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/humor
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
logout
ssh kk
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/WINDOWS
maf=$1
prefix=`echo $maf | awk -F\/ '{print $NF}' | awk -F\. '{print $1}'`
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$prefix.fa -O hg16,mm3,rn3 -w 1000000,0 -r /scratch/msa_split/$prefix -o SS -I 1000 -d 1 -B 5000
cd /scratch/msa_split
for file in ${prefix}.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/${prefix}.*.ss
EOF
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/humor/*.maf)
echo "doSplit $$file" >> jobs.lst
end
para create jobs.lst
# etc ... (run cluster job)
# now setup and run the cluster job to compute the conservation scores
# NOTE: the TMP dir should perhaps be set to something other than
# /scratch, since /scratch is local to each node and is not shared
# between cluster nodes
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/scratch/phyloHMMcons
file=$1
root=`echo $file | awk -F\/ '{print $NF}' | sed 's/\.ss\.gz//'`
chrom=`echo $root | awk -F\. '{print $1}'`
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
EOF
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
echo "doPostProbs $file" >> jobs2.lst
end
para create jobs2.lst
# etc ... (run cluster job)
logout
# finally, make the track (working in the
# phyloHMMcons.hg16mm3rn3.2003-11-11 dir)
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
mkdir wibLimits
mkdir wib
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chrom -binsize=1024 \
-dataSpan=1 -wibFile=wib/${chrom}_phyloHMMcons -name=hmr \
stdin > wibLimits/${chrom}
end
ssh hgwdev
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
hgLoadWiggle hg16 phyloHMMcons_HMR wib/*_phyloHMMcons.wig
ln -s `pwd`/wib/chr*_phyloHMMcons.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
# add entry to trackDb.ra
#track phyloHMMcons_HMR
#shortLabel phyloHMMcons HMR
#longLabel phylo-HMM-based conservation, human-mouse-rat (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScaleDefault Off
# cleanup (only when you're pretty sure you're done!)
rm -r chr*.ss WINDOWS wiggle.tab para.results batch*
# CHICKEN BLAT (translated)
# (done, acs, 2003-11-19)
# (using repeat- and TRF-masked files already created -- see
# CHICKEN BLASTZ, above)
ssh kk
# set up main dir
cd /cluster/data/hg16/bed
mkdir blat.gg0.2003-11-19
ln -s blat.gg0.2003-11-19 blat.gg0
cd blat.gg0
# warning: I'm writing this up in a rush -- watch for errors!
# set up cluster job
cat << 'EOF' > make-joblist.pl
#!/usr/bin/perl
# script to create a job list for translated blat of human
# vs. another species; assumes directory of fa files for the xeno
# species. Output directories are set up as a side effect.
# USAGE: make-joblist.pl <hg-nibs-dir> <xeno-fa-dir> <hg-chr-lengths-file> <output_root_dir>
$SIZE=10000000; # partitioning params for human
$OVERLAP=10000;
# read lengths of chromosomes
open(LENF, $ARGV[2]);
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);
@falist = <$ARGV[1]/*.fa>;
foreach $nib (<$ARGV[0]/*.nib>) {
$nib =~ /.*(chr.*)\.nib/ || die();
$chr = $1;
$l = $length{$chr};
for ($start = 1; $start <= $l; $start += $SIZE) {
$end = $start + $SIZE + $OVERLAP - 1;
if ($end > $l) { $end = $l; }
$dir = sprintf("%s/%s/%d_%d", $ARGV[3], $chr, $start, $end);
foreach $fa (@falist) {
$fa =~ /.*\/([^\/]+)\.fa/ || die();
$name = $1;
printf "/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax %s:%d-%d %s {check out line+ %s/%s_%d_%d_%s.psl}\\n", $nib, $start, $end, $fa, $dir, $chr, $start, $end, $name;
}
`mkdir -p $dir`; # set up output directories
}
}
EOF
# NOTE: there's a slight error above with indexing. Next time use
# something like:
# for ($start = 0; $start < $l; $start += $SIZE) {
# $end = $start + $SIZE + $OVERLAP;
# if ($end >= $l) { $end = $l; }
# The "make-lift.pl" script below should be changed also to be
# consistent (should be enough to change exactly the same lines)
chmod +x make-joblist.pl
cp /cluster/data/hg16/bed/blastz.gg0/S1.len . # just borrow existing lens
mkdir -p run
./make-joblist.pl /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/split100_with_trf S1.len /cluster/data/hg16/bed/blat.gg0/psl > run/jobs.lst
# make sure directory structure is created under psl
cd run
para create jobs.lst ; para try ; para check ; para push ; # etc...
#33561 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs: 14432527s 240542.12m 4009.04h 167.04d 0.458 y
#IO & Wait Time: 147210s 2453.50m 40.89h 1.70d 0.005 y
#Average job time: 434s 7.24m 0.12h 0.01d
#Longest job: 14117s 235.28m 3.92h 0.16d
#Submission to last job: 31483s 524.72m 8.75h 0.36d
# post process psl files
cd .. # back to main blat.gg0 dir
cat << 'EOF' > make-lift.pl
#!/usr/bin/perl
# create a lift spec to map psl files for windows to chromosome coords
# USAGE: make-lift.pl <hg-nibs-dir> <hg-chr-lengths-file>
$SIZE=10000000;
$OVERLAP=10000;
open(LENF, $ARGV[1]);
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);
foreach $nib (<$ARGV[0]/*.nib>) {
$nib =~ /.*(chr.*)\.nib/ || die();
$chr = $1;
$l = $length{$chr};
for ($start = 1; $start <= $l; $start += $SIZE) {
$end = $start + $SIZE + $OVERLAP - 1;
if ($end > $l) { $end = $l; }
printf "%d\t%s:%d-%d\t%d\t%s\t%d\n", $start, $chr, $start, $end, $end-$start, $chr, $l;
}
}
EOF
chmod +x make-lift.pl
./make-lift.pl /iscratch/i/gs.17/build34/bothMaskedNibs S1.len > psl.lft
mkdir -p pslChrom
foreach dir ( psl/* )
set chrom = $dir:t
echo $chrom
/cluster/bin/i386/pslCat -dir $dir/* | /cluster/bin/i386/liftUp pslChrom/${chrom}_blatGg0.psl psl.lft warn stdin
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blat.gg0/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*.psl
# New entry in human/hg16/trackDb.ra
# track blatGg0
# shortLabel Chicken Blat
# longLabel Chicken Translated Blat (Gg0-contigs, 5.2x coverage)
# group compGeno
# priority 145.95
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno
# look at coverage
featureBits hg16 blatGg0 knownGene:CDS
#18205137 bases of 2865248791 (0.635%) in intersection
featureBits hg16 knownGene:CDS
#31268809 bases of 2865248791 (1.091%) in intersection
# RELOAD ENSEMBL GENES WITH VERSION 34a (DONE 2003/12/16 markd)
# save current tables, just in case (in hgsql hg16):
rename table ensGene to ensGene_old;
rename table ensGtp to ensGtp_old;
rename table ensPep to ensPep_old;
mkdir /cluster/data/hg16/bed/ensembl34a
cd /cluster/data/hg16/bed/ensembl34a
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than we do, so we
# strip this data. Fortunately this loses only a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
zcat ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
> ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg16 ensGene \
/cluster/data/hg16/bed/ensembl34a/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
gzip ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin
# compare size of old and new tables as a sanity check
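# e.g., a minimal row-count comparison (not in the original log):
hgsql -N -e 'select count(*) from ensGene; select count(*) from ensGene_old;' hg16
# then, in hgsql hg16, drop the saved tables: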
drop table ensGene_old;
drop table ensGtp_old;
drop table ensPep_old;
# Create knownToEnsembl column and knownToSuperfamily column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin
# LOAD ECgene tables (redone with existing data - braney, 2004-01-30)
cd /cluster/data/hg16/bed
rm -f ECgene
mkdir ECgene.2003-12-18
ln -s ECgene.2003-12-18 ECgene
cd ECgene
wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genes.txt.gz"
wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genepep.txt.gz"
gunzip *.gz
ldHgGene -predTab hg16 ECgene ECgene_hg16_v1.1_25oct2003_genes.txt
hgPepPred hg16 tab ECgenePep ECgene_hg16_v1.1_25oct2003_genepep.txt
rm genePred.tab
gzip *
# QA NOTE: [ASZ, 2007-10-01] mytouch to ECGenePep table 200401301000.00
# contents were fine. passed -keys rule.
# MULTIZ HUMAN/MOUSE/RAT/CHIMP (kpollard, 12/16/03)
# chimp added to human/mouse/rat (HUMOR) alignment described above
# for now, human referenced and no new BLASTZ runs
ssh kk
#fix order in human/chimp BLASTZ MAF files
#use Kate's new files in humanBestAxt.2
cd /cluster/data/pt0/bed/blastz-blatHg16
mkdir humanBestAxt.ord
mkdir maf.ord
foreach file (humanBestAxt.2/*.axt)
set root=$file:t:r
echo $root
/cluster/bin/i386/axtSort $file humanBestAxt.ord/${root}.axt
/cluster/bin/i386/axtToMaf humanBestAxt.ord/${root}.axt ../blastz.hg16/S1.len /cluster/data/pt0/scaffold.sizes maf.ord/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=pt0.
/cluster/bin/scripts/fixmaf.pl < maf.ord/${root}.maf.unfixed > maf.ord/${root}.maf
end
#test on chr11 with HMR
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord
cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord
logout # back to kk
/cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hmr/chr11.hmr.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/hmrp/chr11.ord.maf
#looks good, go ahead with HMRP multiz
mkdir -p /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0
mkdir hmrp
# wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz
ssh eieio
# clean up bluearc
rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0pt0
# move MAFS to bluearc
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hp
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr
cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hp
logout
# set up joblist (common denominator set: no chr19_random in hmr)
foreach file (/cluster/bluearc/multiz.hg16mm3rn3pt0/hmr/*.maf)
set root=`echo $file:t:r | sed 's/\.hmr//'`
echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/mz /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/${root}.maf" >> jobList
end
#run MULTIZ
chmod +x jobList
para create jobList
#submit 10 jobs
para try
#keep an eye on them
para check
para finished
para running
#once these are done, submit rest
para push
para check
para time
#ran on cluster: 41 jobs, longest 42 min
#copy over chr19_random.maf from human/chimp
cp /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/chr19_random.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/chr19_random.maf
# clean up bluearc
ssh eieio
rm -r /cluster/bluearc/multiz.hg16mm3rn3pt0
logout
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3Pt0
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/*.maf /gbdb/hg16/multizMm3Rn3Pt0
#load into database
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Pt0
# 5385226 mafs in 42 files
# 0-2594 warnings/file
#NOTE: only added track to hgwdev-kpollard (for now).
# LIFTOVER RNAGENE FROM HG15 (DONE CIRCA 12/27/03 schattner)
# Replaced below by new RNAGENES (2004-03-09)
cd /cluster/data/hg16/bed/bedOver
mkdir rnaGene
cd rnaGene
hgsql -N -e 'select * from rnaGene' hg15 > rnaGeneHg15.bed
liftOver rnaGeneHg15.bed ../over.chain rnaGeneLiftGene.bed \
rnaGeneLiftGeneMiss.bed
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/rnaGene.sql hg16 rnaGene \
/cluster/data/hg16/bed/bedOver/rnaGene/rnaGeneLiftGene.bed
# LOAD RNAGENES (DONE - 2004-03-09 - Hiram)
# http://www.genetics.wustl.edu/eddy
# Sean Eddy, eddy@genetics.wustl.edu
# Dept. of Genetics, Washington University School of Medicine
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/rnaGene
cd /cluster/data/hg16/bed/rnaGene
mkdir rnaGenePrevious
# save previous rnaGene track for reference
hgsqldump -T rnaGenePrevious hg16 rnaGene
wget --timestamping \
ftp://ftp.genetics.wustl.edu/pub/eddy/annotation/human-hg16/*
grep -v "^#" ncrna-hg16-mito.gff | sed -e "s/^NT_999999/chrM/" > mito.gff
grep -v "^#" ncrna-hg16-chrom.gff > chrom.gff
cat chrom.gff mito.gff > all.gff
hgsql -e 'drop table rnaGene;' hg16
hgsql hg16 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg16 all.gff
# rpMm3Rn3 3-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
# Data from: James Taylor james@bx.psu.edu
# Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
ssh eieio
# Right now we are out of space on this /cluster/store4 filesystem,
# so send the data to the bluearc
mkdir /cluster/bluearc/hg16/bed/regPotential3X
ln -s /cluster/bluearc/hg16/bed/regPotential3X \
/cluster/data/hg16/bed/regPotential3X
cd /cluster/data/hg16/bed/regPotential3X
mkdir data
cd data
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.truncated.bz2"
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.bz2"
end
# The truncated files were a test. They want to see the raw data.
ssh eieio
cd /cluster/data/hg16/bed/regPotential3X
mkdir wigRawData
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
bzcat data/chr${c}.hmr.maf.gz_rpscores.txt.bz2 | sort -n | \
wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
-verbose -wibFile=wigRawData/chr${c}_rpMm3Rn3_Data \
-name=${c} stdin > chr${c}.out
echo chr${c} done
end
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential3X/wigRawData
hgLoadWiggle hg16 regPotential3X chr*_rpMm3Rn3_Data.wig
ln -s `pwd`/chr*_rpMm3Rn3_Data.wib /gbdb/hg16/wib
# rpMm4 2-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
# Data from: James Taylor james@bx.psu.edu
# Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
ssh eieio
# Right now we are out of space on this /cluster/store4 filesystem,
# so send the data to the bluearc
mkdir /cluster/bluearc/hg16/bed/regPotential2X
ln -s /cluster/bluearc/hg16/bed/regPotential2X \
/cluster/data/hg16/bed/regPotential2X
cd /cluster/data/hg16/bed/regPotential2X
mkdir data
cd data
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt.truncated"
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt"
end
gzip *.truncated *.txt
# I'll bet you could gzip the .wig files too and zcat them
# into hgLoadWiggle?
# The truncated files were a test; it turns out the full scores
# are what should be displayed.
# The data are at 5-base intervals but don't appear to be in order,
# so sort them before piping into wigAsciiToBinary.
ssh eieio
cd /cluster/data/hg16/bed/regPotential2X
mkdir wigFiles
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
zcat data/chr${c}.axt_rpscores.txt.gz | sort -n | \
wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
-wibFile=wigFiles/chr${c}_rpMm4 stdin > chr${c}.limits
echo chr${c} done
end
# To load the data
# (some day in the future the above wigAsciiToBinary function
# will be folded into hgLoadWiggle and thus one command)
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential2X/wigFiles
hgLoadWiggle hg16 regPotential2X chr*_rpMm4.wig
ln -s `pwd`/chr*_rpMm4.wib /gbdb/hg16/wib
# an optional data load to check a display problem
mkdir wigTrunc
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
zcat data/chr${c}.axt_rpscores.txt.truncated.gz | sort -n | \
wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
-wibFile=wigTrunc/chr${c}_rpMm4t stdin > chr${c}t.limits
end
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential2X/wigTrunc
hgLoadWiggle hg16 regPotential2XTrunc chr*_rpMm4t.wig
ln -s `pwd`/chr*_rpMm4t.wib /gbdb/hg16/wib
# CREATE chimpSimpleDiff TRACK AND TABLE
# Convert chimp quality scores from uncompressed contig to compressed
# supercontig format. This will take half an hour or so.
cd /cluster/data/pt0
zcat contigs.quals.gz | qaToQac stdin stdout | \
chimpSuperQuals assembly.agp stdin scaffolds.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg16/bed
mkdir chimpSimpleDiff
cd chimpSimpleDiff
chimpHiQualDiffs /cluster/data/pt0/bed/blastz-blatHg16/axtBest \
/cluster/data/pt0/scaffolds.qac chimpSimpleDiff.bed
sed 's/simpleNucDiff/chimpSimpleDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > \
chimpSimpleDiff.sql
hgLoadBed -sqlTable=chimpSimpleDiff.sql hg16 chimpSimpleDiff chimpSimpleDiff.bed
### chimpFixedDiff -- panTro1 (Daryl, July 8, 2005)
# Convert chimp quality scores from uncompressed to compressed
# chromosome format. This took 22 minutes on crow.
cd /cluster/data/panTro1
cat */chr*.qa | qaToQac stdin chrom.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg16/bed
mkdir chimpFixedDiff
cd chimpFixedDiff
sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql
# chimpHiQualDiffs was changed to allow different quality
# parameters as command line options
## FIRST ATTEMPT:
set axtDir = cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet
## time chimpFixedDiffs /$axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
# This crashed twice at the same place, but ran successfully when
# each chromosome was run separately.
mkdir chroms; cd chroms
ls -1 /$axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
rmdir chr*random
foreach f (chr*)
echo -n $f " "
ln -s /$axtDir/$f.axt $f/$f.axt
time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
end
cat chr*bed > ../chimpFixedDiffs.bed
## The load (sort) ran out of memory on hgwdev, so I sorted the
## file first on kolossus (3 minutes) and then loaded it on hgwdev
ssh kolossus
hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg16 chimpFixedDiff chimpFixedDiffs.bed
exit
## hgwdev (37 minutes)
hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg16 chimpFixedDiff bed.tab
# TODO: need to filter out polymorphic sites (SNPs)
## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
# Data from Rachel Karchin in the Andrej Sali lab at UCSF
# /cluster/data/hg16/bed/lssnp
hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
mysql> load data local infile "snp-human2-function-predictions.txt" into table lsSnpFunction;
Query OK, 7689 rows affected (0.52 sec)
mysql> load data local infile "snp-human2-structure-predictions.txt" into table lsSnpStructure;
Query OK, 28144 rows affected (2.39 sec)
# gc5Base wiggle TRACK (DONE - 2004-03-12 - Hiram)
# reloaded wib files 2005-05-17 to place them in /gbdb/hg16/wib/gc5Base
# a demonstration wiggle track. Perform a GC count with a 5-base
# window. Also compute a "zoomed" view for display efficiency.
mkdir /cluster/data/hg16/bed/gc5Base
cd /cluster/data/hg16/bed/gc5Base
# in the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just some
# information from hgGcPercent. The awk computes the number
# of bases that hgGcPercent claimed it measured, which is not
# necessarily always 5 if it ran into gaps, and then the division
# by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100]. Two columns come out of the awk print statement:
# <position> and <value> which are fed into wigAsciiToBinary through
# the pipe. It is set at a dataSpan of 5 because each value
# represents the measurement over five bases beginning with
# <position>. The result files end up in ./wigData5.
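# (for illustration only, with made-up numbers: an hgGcPercent line
# like "chr1 1000 1005 GC 600" becomes "1001 60.0" after the awk --
# the 1-based start, and the per-mille GC value scaled to [0-100])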
cat << '_EOF_' > runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 hg16 ../../nib | grep -w GC | \
awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
wigAsciiToBinary \
-dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
-name=${C} stdin 2> dataLimits5/${c}
echo "done"
done
'_EOF_'
chmod +x runGcPercent.sh
# This is going to take perhaps two hours to run. It is a lot of
# data. make sure you do it on the fileserver:
ssh eieio
cd /cluster/data/hg16/bed/gc5Base
./runGcPercent.sh
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/hg16/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base hg16 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/hg16/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/hg16/wib/gc5Base
# to speed up display for whole chromosome views, compute a "zoomed"
# view and load that on top of the existing table. The savings
# comes from the number of data table rows the browser needs to load
# for a full chromosome view. Without the zoomed view there are
# over 43,000 data rows for chrom 1. With the zoomed view there are
# only 222 rows needed for the display. If your original data was
# at 1 value per base the savings would be even greater.
# Pretty much the same data calculation
# situation as above, although this time note the use of the
# 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average
# together the data points coming out of the awk print statement over
# a span of 1000 bases. Thus each <position> coming out of wigZoom
# will represent the measurement of GC in the next 1000 bases. Note
# the use of -dataSpan=1000 on the wigAsciiToBinary to account for
# this type of data. You want your dataSpan here to be an exact
# multiple of your original dataSpan (5*200=1000) and on the order
# of at least 1000, doesn't need to go too high. For data that is
# originally at 1 base per value, a convenient span is: -dataSpan=1024
# A new set of result files ends up in ./wigData5_1K/*.wi[gb]
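# (arithmetic behind the row counts above: each data row covers
# binsize*dataSpan bases, so at dataSpan=5 chr1 (~246 Mb) needs about
# 246e6/(1024*5) ~= 48,000 rows, versus about 246e6/(1024*1000) ~= 240
# rows at dataSpan=1000 -- roughly the 43,000 and 222 quoted above)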
cat << '_EOF_' > runZoom.sh
#!/bin/sh
mkdir -p wigData5_1K
mkdir -p dataLimits5_1K
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 hg16 ../../nib | grep -w GC | \
awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
-dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
-name=${C} stdin 2> dataLimits5_1K/${c}
echo "done"
done
'_EOF_'
chmod +x runZoom.sh
# This is going to take even longer than above, certainly do this
# on the fileserver
ssh eieio
time ./runZoom.sh
# real 232m3.265s
# user 302m37.050s
# sys 16m13.770s
# Then load these .wig files into the same database as above
ssh hgwdev
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base -oldTable hg16 gc5Base \
wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/hg16/wib/gc5Base
ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg16/wib/gc5Base
# KNOWN GENES TRACK (STARTED - 2004-01-15 - with Gene Sorter complete
# 2004-02-17 Hiram)
# you will probably need to make the programs in kent/src/hg/protein
cd ~/kent/src/hg/protein
make
# The scripts run below will check for programs and let you know
# which ones are missing
# obtain new SwissProt database (should be done about once a month)
# the swiss prot data is currently living on store5, first step is
# on the fileserver. This script was used once as it was created,
# it may need to be verified and improved as it is used again. See
# comments at the top of the script.
ssh eieio
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# that obtains the data and unpacks it, second step is on hgwdev
# to create the database
ssh hgwdev
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# Now the proteins database can be created from that. Must be on hgwdev.
# Again, this is a script that has been used only once, upon creation;
# see the comments in it. For example, it currently assumes these two
# scripts have been run on the same day. In this case, 03112
ssh hgwdev
cd /cluster/data/proteins
~/kent/src/hg/protein/mkProteinsDB.sh
# with those two databases existing, we are ready for the actual known genes
# track build. Must be on hgwdev since it is all mostly database
# operations. The {Date} argument is the date stamp created by the
# above two scripts. Something of the form YYMMDD, e.g.: 031112
# Again, a script that has been used only once at creation, see
# comments at top of script.
ssh hgwdev
mkdir /cluster/data/hg16/bed/knownGenes
cd /cluster/data/hg16/bed/knownGenes
DateStamp=040115
~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
# that runs to a point where it prepares data and jobList for a
# cluster run. Continue with a cluster run on kk
ssh kk
cd /cluster/data/hg16/bed/knownGenes/kgBestMrna
para create jobList
para try
para check
para push
# this is a quick cluster job. Less than five minutes. e.g.:
# Completed: 43580 of 43580 jobs
# CPU time in finished jobs: 114636s 1910.60m 31.84h 1.33d 0.004 y
# IO & Wait Time: 111889s 1864.82m 31.08h 1.30d 0.004 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# Continuing back on hgwdev, run the same script again
ssh hgwdev
cd /cluster/data/hg16/bed/knownGenes
DateStamp=040115
~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
# that should run to completion and the known genes track is ready
# Add the proteins link into gdbPdb.hgcentral:
hgsql -e 'INSERT INTO gdbPdb (genomeDb, proteomeDb) \
VALUES ("hg16","proteins040115");' \
-h genome-testdb hgcentraltest
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg16/bed/knownGenes.2004-01-29
mkdir index
cd index
hgKgGetText hg16 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ix /gbdb/hg16/knownGene.ix
ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ixx /gbdb/hg16/knownGene.ixx
# VEGA GENES UPDATE from 2004/01/15 below (2004-02-04 - Hiram)
mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
mkdir /cluster/data/hg16/bed/vegaUpdate
cd /cluster/data/hg16/bed/vegaUpdate
wget --timestamping ftp://ftp.sanger.ac.uk/pub/searle/*.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks. Just
# omit snoRNAs, as there are so few of them
zcat *.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaGene.gtf
zcat *.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaPseudoGene.gtf
ldHgGene -gtf hg16 vegaGeneUpdate vegaGene.gtf
ldHgGene -gtf hg16 vegaPseudoGeneUpdate vegaPseudoGene.gtf
wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16
# LOAD VEGA GENES AND PSEUDOGENES (reloaded 2004/01/15 markd)
# reloaded due to bug in creating bogus CDS
mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
mkdir ~/hg16/bed/vega
cd ~/hg16/bed/vega
wget http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_ncbi34.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks. Just
# omit snoRNAs, as there are so few of them
zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaGene.gtf
zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaPseudoGene.gtf
ldHgGene -gtf hg16 vegaGene vegaGene.gtf
ldHgGene -gtf hg16 vegaPseudoGene vegaPseudoGene.gtf
wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16
# KNOWN GENES UPDATE (DONE - 2004-01-29 - Hiram)
# RELOADED THE cgapBiocDesc AND cgapAlias TABLES TO REMOVE REPLICATED ROWS
# (DONE, 2005-07-26, hartera)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
# update swissProt and proteins databases
# You want to run these two scripts on the same day to keep the
# date stamp consistent. In this case the date stamp is 040115
ssh eieio
cd /cluster/data/swissprot
~kent/src/hg/protein/mkSwissProtDB.sh
# that obtains the data and unpacks it, second step is on hgwdev
# to create the database
ssh hgwdev
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# Now the proteins database can be created from that. Must be on
# hgwdev
ssh hgwdev
cd /cluster/data/proteins
~/kent/src/hg/protein/mkProteinsDb.sh 040115
# prepare all the tables in a temporary database, then move
# into Hg16. Leave a link in hg16/bed so it can be found
mkdir /cluster/data/kgDB/bed/hg16
ln -s /cluster/data/kgDB/bed/hg16 \
/cluster/data/hg16/bed/knownGenes.2004-01-29
cd /cluster/data/kgDB/bed/hg16
~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115
# That runs to a point that prepares a cluster job, continuing on kk
ssh kk
cd /cluster/data/kgDB/bed/hg16/kgBestMrna
para create jobList
para try
para push
... etc ...
# on a busy cluster, takes almost an hour:
# Completed: 46583 of 46583 jobs
# CPU time in finished jobs: 127351s 2122.51m 35.38h 1.47d 0.004 y
# IO & Wait Time: 119182s 1986.37m 33.11h 1.38d 0.004 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 14s 0.23m 0.00h 0.00d
# Submission to last job: 3513s 58.55m 0.98h 0.04d
# Continuing back on hgwdev, run the same script again
ssh hgwdev
cd /cluster/data/kgDB/bed/hg16
~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115
# should continue to completion, all tables are in kgDB and can be
# moved if they check out to be similar to existing tables in hg16
# You can verify table sizes with the script:
~kent/src/hg/protein/checkTbls.pl kgDB
~kent/src/hg/protein/checkTbls.pl hg16 kg
# should have similar row counts in each of these outputs
# This rename can be done more simply with the 'rename' command
# instead of the 'alter table' used here.
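# e.g., a hypothetical one-line equivalent of the alter-table line below:
#   hgsql -e "rename table ${SOURCE}.${T} to ${TARGET}.${T}" mysql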
cat << '_EOF_' > renameTables.sh
#!/bin/sh
SOURCE=kgDB
TARGET=hg16
for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna \
keggMapDesc keggPathway kgAlias kgProtAlias kgXref \
knownGene knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
do
hgsql -e "drop table ${T};" ${TARGET}
hgsql -e "alter table ${SOURCE}.${T} rename ${TARGET}.${T}" mysql
echo "done $T"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x renameTables.sh
./renameTables.sh
# RELOAD THE cgapBiocDesc AND cgapAlias TABLES (hartera, 2005-07-26)
# Reload the cgapBiocDesc and cgapAlias tables as they have replicated
# rows. Need to sort and unique the file before loading into the database.
cd /cluster/data/kgDB/bed/hg16
sort -u cgapBIOCARTAdesc.tab > cgapBIOCARTAdescSorted.tab
# for cgapAlias, the number of rows in the table is different from
# the tab file here, so dump the table first.
# RELOAD cgapAlias AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ (hartera, 2005-10-06)
# hgsql -N -e 'select * from cgapAlias;' hg16 > cgapAliasDump.txt
# above command used to get alias file from hg16 before sorting
sort -n cgapAliasDump.txt | uniq > cgapAliasDumpSorted.tab
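# (toy demonstration of the sort -nu pitfall noted above: under -n the
# comparison key is only the leading numeric value, so -u collapses
# rows that share an ID but differ in later fields)
#   printf '1\tfoo\n1\tbar\n' | sort -nu        # one row survives
#   printf '1\tfoo\n1\tbar\n' | sort -n | uniq  # both rows survive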
hgsql hg16 -e "drop table cgapBiocDesc"
hgsql hg16 -e "drop table cgapAlias"
hgsql hg16 < ~/kent/src/hg/lib/cgapBiocDesc.sql
hgsql hg16 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql hg16 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" \
into table cgapBiocDesc'
hgsql hg16 -e 'load data local infile "cgapAliasDumpSorted.tab" \
into table cgapAlias'
# the following extra process will be included in the next version
# of KGprocess.sh to create the kgProtMap table:
mkdir /cluster/data/kgDB/bed/hg16/kgProtMap
cd /cluster/data/kgDB/bed/hg16/kgProtMap
awk '{print ">" $1;print $2}' ../refMrna.tab > kgMrna.fa
/scratch/blast/formatdb -i kgMrna.fa -p F
echo "`date` creating kgPep.fa"
hgsql -N -e 'select spID,seq from kgXref,knownGenePep where kgID=name' hg16 \
| awk '{print ">" $1;print $2}' >kgPep.fa
rm -fr kgPep
rm -f jobList
mkdir kgPep
faSplit sequence kgPep.fa 5000 kgPep/kgPep
for f in kgPep/*.fa
do
echo ./kgProtBlast.csh $f >> jobList
done
awk '{printf "%s\t%s\n", $3,$2}' ../kgXref.tab > kgProtMrna.pairs
# run a cluster job
ssh kk9
cd /cluster/data/kgDB/bed/hg16/kgProtMap
para create jobList
para try
para push ... etc
# Completed: 4949 of 4949 jobs
# CPU time in finished jobs: 1061454s 17690.90m 294.85h 12.29d 0.034 y
# IO & Wait Time: 13400s 223.33m 3.72h 0.16d 0.000 y
# Average job time: 217s 3.62m 0.06h 0.00d
# Longest job: 996s 16.60m 0.28h 0.01d
# Submission to last job: 12152s 202.53m 3.38h 0.14d
# back to hgwdev
ssh hgwdev
cd /cluster/data/kgDB/bed/hg16/kgProtMap
find ./psl.tmp -name '*.psl.gz' | xargs zcat | \
pslReps -nohead stdin psl.tmp/kgProtMrna.psl /dev/null
cd psl.tmp
(pslMap kgProtMrna.psl ../../tight_mrna.psl stdout | \
sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl) > kgProtMap.out 2>&1
# this table data is ready to load, verify it by comparison with
# existing kgProtMap data, then load:
hgLoadPsl hg16 kgProtMap.psl
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 2/18/04 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz. Scripts expect *.out.spec
# filenames, so set that up:
ssh kkr1u00
cd /cluster/data/hg16
mkdir /iscratch/i/gs.17/build34/linSpecRep.Chicken
foreach f (/scratch/hg/gs.17/build34/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/gs.17/build34/linSpecRep.Chicken/$f:t:r:r.out.spec
end
iSync
# Use these the next time we run human-chicken blastz.
# BLASTZ CHICKEN (GALGAL2) (DONE 2/26/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25
ln -s /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25 \
/cluster/data/hg16/bed/
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store7/hg16/bed/blastz.galGal2.2004-02-25
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
#Completed: 51189 of 51189 jobs
#Average job time: 477s 7.95m 0.13h 0.01d
#Longest job: 2318s 38.63m 0.64h 0.03d
#Submission to last job: 29598s 493.30m 8.22h 0.34d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest job: 21s 0.35m 0.01h 0.00d
#Submission to last job: 150s 2.50m 0.04h 0.00d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/galGal2/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 38s 0.64m 0.01h 0.00d
#Longest job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 147s 2.45m 0.04h 0.00d
# RUN AXTBEST AND GENERATE MAF FOR MULTIZ (DONE 2/26/04 angie)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtBest pslBest
foreach chrdir (lav/chr*)
set chr=$chrdir:t
echo axtBesting $chr
axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/$chr.psl
end
mkdir mafBest
foreach f (axtBest/chr*.axt)
set maf = mafBest/$f:t:r.hg.maf
axtToMaf $f \
/cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \
$maf -tPrefix=hg16. -qPrefix=galGal2.
end
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
cat pslBest/chr*.psl | hgLoadPsl -table=blastzBestGalGal2 hg16 stdin
# CHAIN CHICKEN BLASTZ (DONE 2/26/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains:
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/galGal2/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# axtChrom/chr1{8,9}_random.axt are empty, so the {out line +} checks
# failed:
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#Average job time: 28s 0.46m 0.01h 0.00d
#Longest job: 76s 1.27m 0.02h 0.00d
#Submission to last job: 92s 1.53m 0.03h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg16 ${c}_chainGalGal2 $i
end
# RESCORE CHICKEN BLASTZ (DONE 3/1/04 angie)
# Webb noticed low scores in latest runs with repeats abridged --
# PSU's restore_rpts program rescored alignments with default matrix
# instead of BLASTZ_Q matrix. Rescore them here so the chainer sees
# the higher scores:
ssh kolossus
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# NET HUMAN BLASTZ (DONE 2/26/04 angie)
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netClass noClass.net hg16 galGal2 human.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netGalGal2 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyGalGal2 stdin
# Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to
# human/hg16 trackDb
# MAKE VSGALGAL2 DOWNLOADABLES (DONE 3/1/04 angie)
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
# Webb asked for axtChrom/chr22.axt... since axtChrom is rel. small
# this time, just put it all out there.
zip /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom/chr*.axt
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
cp all.chain chicken.chain
zip /cluster/data/hg16/zip/chicken.chain.zip chicken.chain
rm chicken.chain
cp human.net chicken.net
zip /cluster/data/hg16/zip/chicken.net.zip chicken.net
rm chicken.net
cp humanSyn.net chickenSyn.net
zip /cluster/data/hg16/zip/chickenSyn.net.zip chickenSyn.net
rm chickenSyn.net
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
cd /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
mv /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom.zip
mv /cluster/data/hg16/zip/chicken*.zip .
md5sum *.zip > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 (DONE 3/8/04 angie)
# (galGal2 added to human/mouse/rat alignments described above [HUMOR])
# put the MAFs on bluearc
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2/hg
ssh kki
mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
mkdir hmrg
# Wrapper script required because of stdout redirect:
cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
rm -f jobList
foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr/*.maf)
set root=$file:t:r:r
echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/${root}.maf" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 41 of 41 jobs
#Average job time: 88s 1.47m 0.02h 0.00d
#Longest job: 276s 4.60m 0.08h 0.00d
#Submission to last job: 278s 4.63m 0.08h 0.00d
# clean up bluearc (these are big files!)
rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3GalGal2
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf \
/gbdb/hg16/multizMm3Rn3GalGal2
# load into database
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2
# LOAD SOFTBERRY GENES (DONE - 2004-02-10 - Hiram)
mkdir -p /cluster/data/hg16/bed/softberry
cd /cluster/data/hg16/bed/softberry
set file = Soft_fgenesh_jul03.tar.gz
wget --timestamping ftp://www.softberry.com/pub/SC_HUM_JUL03/$file
tar xzvf $file
ldHgGene hg16 softberryGene fgenesh_jul03/chr*.gff
hgPepPred hg16 softberry fgenesh_jul03/*.protein
hgSoftberryHom hg16 fgenesh_jul03/*.protein
# CHIMP (panTro1) ALIGNMENTS (2004-02-12 kate)
# lift scaffold-based reciprocal best chains to chrom coordinates
ssh eieio
cd /cluster/data/hg16
mkdir -p bed/blastz-blat.panTro1
cd bed/blastz-blat.panTro1
cp /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain \
best.scaffolds.chain
cp /cluster/data/panTro1/jkStuff/scaffolds.lft scaffolds.lft
~kate/bin/i386/liftUp -chainQ best.chain scaffolds.lft \
warn best.scaffolds.chain
#Make a track from Tarjei's chimp deletions file (2/12/04, kpollard)
# 80-12000 bp indels in human/chimp alignments
#make .bed files from Tarjei's .fa files
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/faSimplify indels.human.fa , , temp.fa
/cluster/bin/i386/faSize detailed=on temp.fa > human.start.txt
/cluster/bin/i386/faSimplify indels.human.fa ">" , temp.fa
/cluster/bin/i386/faSize detailed=on temp.fa > human.chr.txt
R
#Commands in R
chr<-read.table("human.chr.txt") #read in chromosome and size
start<-read.table("human.start.txt") #read in start and size
both<-cbind(chr,start) #concatenate: chrN size start size
sum(both[,2]!=both[,4]) #check that the size columns are identical
#0
both[,4]<-both[,2]+both[,3] #add start and size to get stop
both<-both[,c(1,3,4,2)] #reorder columns to get chrN start stop size
both[,4]<-paste("CD",1:length(both[,4]),"_",both[,4],sep="") #make name like CDN_size
write(t(both),"indels.human.bed",ncol=4) #write bed file
q() #quit
#delimit with tabs
cat indels.human.bed | gawk '{print $1"\t"$2"\t"$3"\t"$4}' > indels.human.tab.bed
#load track into browser
mkdir -p /gbdb/hg16/hg_insert
ln -s /cluster/data/panTro1/bed/indels/indels.human.tab.bed /gbdb/hg16/hg_insert
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/hgLoadBed hg16 hg_insert indels.human.tab.bed
#change name to chimpDels
hgsql hg16
rename table hg_insert to chimpDels;
exit
#add description file chimpDels.html
# to ~/kent/src/hg/makeDb/trackDb/human/hg16
#add a track entry to trackDb.ra
# in ~/kent/src/hg/makeDb/trackDb/human/hg16
# FAMILY BROWSER UPDATE (DONE - 2004-02-17 - Hiram)
# to be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg16/bed/famBro.2004-02-17
ln -s /cluster/data/hg16/bed/famBro.2004-02-17 /cluster/data/hg16/bed/famBro
cd /cluster/data/hg16/bed/famBro
hgClusterGenes hg16 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg16/bed/famBro/blastp
cd /cluster/data/hg16/bed/famBro/blastp
pepPredToFa hg16 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg16/blastp
mkdir -p /cluster/bluearc/hg16/blastp
cp -p /cluster/data/hg16/bed/famBro/blastp/known.* /cluster/bluearc/hg16/blastp
# Load up cluster/bluearc with blastp and related files
# if necessary
if (! -e /cluster/bluearc/blast/blastall) then
mkdir -p /cluster/bluearc/blast
cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast
mkdir -p /cluster/bluearc/blast/data
cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data
endif
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg16/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory (this would not work on kk, use kk9 instead)
# Need to check the difference between the blast in /scratch/blast
# and this /cluster/bluearc/blast
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/self
cd /cluster/data/hg16/bed/famBro/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data
export BLASTMAT
/cluster/bluearc/blast/blastall -p blastp \
-d /cluster/bluearc/hg16/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 73213s 1220.22m 20.34h 0.85d 0.002 y
# IO & Wait Time: 20054s 334.23m 5.57h 0.23d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 118s 1.97m 0.03h 0.00d
# Submission to last job: 1117s 18.62m 0.31h 0.01d
# Load into database. This takes about an hour.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/self/run/out
hgLoadBlastTab hg16 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 11376875 rows
cd /cluster/data/hg16/bed/famBro
# Create table that maps between known genes and RefSeq
hgMapToGene hg16 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# row count changed from 32674 to 35416
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg16 \
> refToLl.txt
hgMapToGene hg16 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# row count went from 32845 to 35146
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg16 knownGene name proteinID Pfam knownToPfam
# row count went from 31201 to 32225
# JK Fixed bug that let multiple identical columns happen in knownToPfam
# on April 15, 2004. Row count now 30467
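    # A quick duplicate check after that fix (sketch; assumes the usual
    # knownTo* name/value schema):
    hgsql hg16 -e 'select name,value,count(*) as n from knownToPfam group by name,value having n>1 limit 10'
    # no rows expected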
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg16 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create expression distance table - takes about an hour
# (Regenerated April 16, 2004 in response to knownToGnfAtlas2 update)
hgExpDistance hg16 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg16 affyUcla knownGene knownToU133
# row count went from 34148 to 36818
# Create expression distance table. This will take about an hour.
cd ~/kent/src/hg/near/hgExpDistance
time hgExpDistance hg16 affyUcla affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133
# 42 genes, 42 weights, 26.500000 total wieght
# Got 36818 unique elements in affyUcla
# Made knownExpDistance.tab
# Loaded knownExpDistance
# Made query index
# real 80m50.113s
# user 62m33.290s
# sys 2m15.200s
# This command should be done elsewhere, /tmp or something like that
# It makes a temporary .tab file of almost 1 Gb
# row count went from 34148000 to 36818000
# Create table that maps between known genes and
# the GNF data.
hgMapToGene hg16 affyU95 knownGene knownToU95
cd /tmp
# the hgFixed.gnfHumanU95Exps argument is unused, so the table need not exist
hgExpDistance hg16 hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 11718000 to 17330000
# original makeNear.doc had this as:
# hgExpDistance hg16 affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95
# Make sure that GO database is up to date.
# See README in /cluster/store1/geneOntology.
# I update this GO database very carefully, checking that all
# structures in it remain the same from release to release and
# backing up the current go DB in a backup database. In this case
# the backup is go040107 - when it was loaded for Mm4, and the new
# go database is based on data from Dec 17th 2003 and Feb 2004 according
# to the time stamp on the fetched data. This build was done in
# /cluster/store1/geneOntology/20040217
cd /cluster/data/hg16/bed/famBro
# Create knownToEnsembl column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp column. This is a little complicated by
# having to merge data from the snpTsc and the snpNih tracks.
hgMapToGene hg16 snpTsc knownGene knownToCdsSnp -createOnly -all -cds
hgMapToGene hg16 snpTsc knownGene snp1 -noLoad -all -cds
hgMapToGene hg16 snpNih knownGene snp2 -noLoad -all -cds
sort snp1.tab snp2.tab > knownToCdsSnp.tab
rm snp1.tab snp2.tab
hgsql \
-e 'load data local infile "knownToCdsSnp.tab" into table knownToCdsSnp;' \
hg16
# row count went from 87273 to 106199
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/ce1/blastp should have data
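    # A quick sanity check before launching the run (sketch; assumes the
    # database was built with formatdb, which writes .phr/.pin/.psq files):
    foreach ext (phr pin psq)
        if (! -e /cluster/bluearc/ce1/blastp/wormPep.$ext) echo "missing wormPep.$ext"
    end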
# Create the ceBlastTab (the blastall binary only works on kk9 for now ...)
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/ce1
cd /cluster/data/hg16/bed/famBro/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~10 minutes if the cluster is free.
# Here's the para time results
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 28869s 481.16m 8.02h 0.33d 0.001 y
# IO & Wait Time: 20454s 340.89m 5.68h 0.24d 0.001 y
# Average job time: 6s 0.11m 0.00h 0.00d
# Longest job: 52s 0.87m 0.01h 0.00d
# Submission to last job: 584s 9.73m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/ce1/run/out
hgLoadBlastTab hg16 ceBlastTab -maxPer=1 *.tab
# row count went from 25599 to 26958
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm4.doc for procedure
# the directory: /cluster/bluearc/mm4/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/mm4
cd /cluster/data/hg16/bed/famBro/blastp/mm4
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/mm4/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# (wordLine wouldn't run on kk9:
# wordLine: /lib/i686/libc.so.6: version `GLIBC_2.3' not found
# run this echo statement on hgwdev
# this echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# takes about 15 minutes:
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 54179s 902.98m 15.05h 0.63d 0.002 y
# IO & Wait Time: 20428s 340.47m 5.67h 0.24d 0.001 y
# Average job time: 10s 0.16m 0.00h 0.00d
# Longest job: 76s 1.27m 0.02h 0.00d
# Submission to last job: 2031s 33.85m 0.56h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/mm4/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 35611 rows
# row count went from 33191 to 35611
# REFSEQ HOMOLOGS (DONE 6/18/04 angie)
# Translate mmBlastTab's knownGene acc's into RefSeq where possible,
# since our users frequently ask for help in determining homologs for
# human/mouse RefSeq accs...
ssh hgwdev
hgsql hg16 -e \
'create table mmRefSeqHomolog \
select hg16.knownToRefSeq.value as name, \
mm3.knownToRefSeq.value as homolog, \
mmBlastTab.identity, mmBlastTab.aliLength, mmBlastTab.mismatch, \
mmBlastTab.gapOpen, mmBlastTab.qStart, mmBlastTab.qEnd, \
mmBlastTab.tStart, mmBlastTab.tEnd, mmBlastTab.eValue , \
mmBlastTab.bitScore \
from mmBlastTab, hg16.knownToRefSeq, mm3.knownToRefSeq \
where hg16.knownToRefSeq.name = mmBlastTab.query and \
mm3.knownToRefSeq.name = mmBlastTab.target;'
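    # Example use of the new table (sketch; NM_000546 is the human TP53
    # RefSeq acc, used here only for illustration):
    hgsql hg16 -e "select homolog, bitScore from mmRefSeqHomolog where name='NM_000546' order by bitScore desc limit 3"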
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dr1
cd /cluster/data/hg16/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 40575s 676.24m 11.27h 0.47d 0.001 y
# IO & Wait Time: 19781s 329.69m 5.49h 0.23d 0.001 y
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest job: 95s 1.58m 0.03h 0.00d
# Submission to last job: 2036s 33.93m 0.57h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab hg16 drBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 32204 rows
# row count went from 30339 to 32204
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/sc1
cd /cluster/data/hg16/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 8577s 142.96m 2.38h 0.10d 0.000 y
# IO & Wait Time: 19756s 329.26m 5.49h 0.23d 0.001 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest job: 15s 0.25m 0.00h 0.00d
# Submission to last job: 1172s 19.53m 0.33h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab hg16 scBlastTab -maxPer=1 *.tab
# row count went from 17089 to 17886
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dm1
cd /cluster/data/hg16/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 33371s 556.18m 9.27h 0.39d 0.001 y
# IO & Wait Time: 19546s 325.77m 5.43h 0.23d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1657s 27.62m 0.46h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab hg16 dmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 28645 rows
# row count went from 27173 to 28645
# LOAD SNPS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes snpNih, snpTsc, and snpMap
# ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/snpMap
cd $oo/bed/snp/build$snpBuild/snpMap
processSnpLocations.csh hg16 human 34_2 119 >& log &
# check data:
# wc -l snpTsc.bed; hgsql hg16 -e "select count(*) from snpTsc"
# wc -l snpNih.bed; hgsql hg16 -e "select count(*) from snpNih"
# wc -l snpMap.bed; hgsql hg16 -e "select count(*) from snpMap"
# hgsql hg16 -e "select * from snpNih limit 5; desc snpNih; show indexes from snpNih"
# hgsql hg16 -e "select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc"
# hgsql hg16 -e "select * from snpMap limit 5; desc snpMap; show indexes from snpMap"
# remove temp files
# rm human* *bed.gz
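# A compact form of the count comparison above (sketch; assumes the .bed
# files are still in the current directory):
foreach t (snpTsc snpNih snpMap)
    echo -n "$t bed lines: "; wc -l < $t.bed
    hgsql hg16 -e "select count(*) from $t"
end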
# LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes dbSnpRsHg
# ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/details/Done
mkdir -p $oo/bed/snp/build$snpBuild/details/Observed
cd $oo/bed/snp/build$snpBuild/details
processSnpDetails.csh hg16 human 119 >& log &
# for each output chunk the script runs:
#   load data local infile "$fileBase.out" into table $database.$table
#   gzip $fileBase.out
# check data:
# hgsql hgFixed -e "select count(*) from dbSnpRsHg"
# hgsql hgFixed -e "select * from dbSnpRsHg limit 5; desc dbSnpRsHg; show indexes from dbSnpRsHg"
# remove temp files
# rm dbSnpRs*
# LOAD SNPS ( Daryl Thomas; February ??, 2005)
set db = hg16
set org = human
set build = 122
set dir = /cluster/bluearc/snp/$db/build$build
# ssh to some quiet machine with fast access to the bluearc
# it takes ~4.5 hours to download the data
# (build 124 directly to /cluster/bluearc/... from eieio)
# Check to make sure the chrMT file is included
mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
cd $dir
ln -s /cluster/data/$db/jkStuff/liftAll.lft .
screen
ftp ftp.ncbi.nih.gov
cd snp/$org/XML
prompt
mget ds_ch*.xml.gz
exit # screen
exit # machine
# TODO: check chromStart for each locType
cp -f ${HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
chmod 775 /cluster/bin/scripts/parseDbSnpXML
ssh kk
touch jobList
foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
set out = $file:t:r
echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
end
# para create jobList; para push; para check ...
# CPU time in finished jobs: 28235s 470.58m 7.84h 0.33d 0.001 y
# IO & Wait Time: 1986s 33.10m 0.55h 0.02d 0.000 y
# Average job time: 1119s 18.65m 0.31h 0.01d
# Longest job: 2339s 38.98m 0.65h 0.03d
exit # kk
mv $dir /cluster/data/$db/bed/snp/build$build
set dir = /cluster/data/$db/bed/snp/build$build
cd $dir
ssh eieio # or wherever data is local
# concatenate the details files to make it easier to lift (and load)
time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
# 16.120u 13.070s 1:35.26 30.6% 0+0k 0+0io 86pf+0w (hgwdev)
time gzip $db.build$build.contig.bed
# 102.307u 5.524s 1:48.97 98.9% 0+0k 0+0io 1pf+0w (eieio/store5)
# some of the NT contigs are not in the liftSpec - this is expected as snps that map to
# alternate assemblies (Celera) are in the original files, but we disregard their mappings.
time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
# 190.473u 18.873s 3:52.33 90.1% 0+0k 0+0io 1pf+0w (eieio/store5)
time gzip $db.build$build.bed
# 107.476u 5.286s 1:54.25 98.6% 0+0k 0+0io 0pf+0w
ssh hgwdev # or wherever database is located
# hgLoadBed is the important step - check to make sure there are no warnings
time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
# Loaded 8722437 elements of size 16
# 206.170u 48.370s 35:59.52 11.7% 0+0k 0+0io 82994pf+0w
# basic snp table is now loaded, but exception column needs to be updated
# ~ 3 hours wall clock time from here to end
# run queries from snpException.query against snp table
mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
cd /usr/local/apache/htdocs/qa/test-results/snpException/build$build
time snpException $db 0 ${db}snpException > ${db}snpException.log
chmod o+rx .
chmod o+r *
# 24.590u 34.150s 41:04.48 2.3% 0+0k 0+0io 191pf+0w
# check alignment of flanking sequences
time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
# 4688.790u 172.770s 1:28:45.62 91.2% 0+0k 0+0io 23000pf+0w
# 5205.860u 216.570s 1:55:10.27 78.4% 0+0k 0+0io 72408pf+0w (hgwdev)
### NOTE: the pseudoautosomal snps are reported in the chrX files
### only, which causes problems for snpValid when checking the
### chrY snp mappings. I got around this by confirming that all
### of the 'missing flank' errors (#23) were in pseudoautosomal
### regions and ignoring them. I manually truncated the
### hg17snpException.23.bed file before continuing with the next
### step. This could/should be fixed in the next iteration.
# update snpExceptions table to match the number of exceptions found in the snpValid results
# these numbers come from counting the numbers of lines in the output files without headers
mysql> update snpExceptions set num=60797 where exceptionId=21;
mysql> update snpExceptions set num=5657 where exceptionId=22;
mysql> update snpExceptions set num=284098 where exceptionId=23;
mysql> update snpExceptions set num=173 where exceptionId=24;
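# The counts can be regenerated from the output files (sketch; assumes
# per-exception files named ${db}snpException.<id>.bed with two header
# lines, as implied by the tail +3 below):
foreach e (21 22 23 24)
    echo -n "exceptionId $e: "; tail +3 ${db}snpException.$e.bed | wc -l
end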
# create list of statements to update the snp table and run them
time tail +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
# ~10 seconds
time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
# 36.270u 1.980s 0:38.27 99.9% 0+0k 0+0io 337pf+0w
time hgsql $db < updateExceptionList.sql
# 18.130u 26.680s 58:39.97 1.2% 0+0k 0+0io 413pf+0w build122 (had to optimize table during run)
# 8.420u 10.370s 11:58.44 2.6% 0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process)
# 6.550u 9.370s 14:34.17 1.8% 0+0k 0+0io 413pf+0w build124
# > wc -l build12*/updateExceptionList.sql
# 1110994 build122/updateExceptionList.sql
# 387166 build123/updateExceptionList.sql
# 383759 build124/updateExceptionList.sql
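# updateExceptionList.pl itself is not reproduced here; in outline it
# turns each (name, chromStart, exceptionId) line of exceptionList.txt
# into an update statement, roughly like this (sketch only - the real
# script also has to merge multiple exceptions for the same snp):
# awk '{printf "update snp set exception=\"%s\" where name=\"%s\" and chromStart=%s;\n", $3, $1, $2}' \
#     exceptionList.txt > updateExceptionList.sketch.sql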
# Add Affy SNPs from new submission
#!/bin/csh -fe
set db = hg16
cd /cluster/data/$db/bed/snp/affy/latest
touch affy.txt affy.bed Affy.bed bed.tab
rm -f affy*.txt affy*.bed Affy.bed* bed.tab
# datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com]
tar xfz affyhg16maps.tgz
wc -l affy*txt
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10K.txt > affy10K.bed
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt > affy10Kv2.bed
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n", $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt > affy50K_XbaI.bed
# this is a temporary kluge to fix some bad input data.
cat affy*.bed | sed 's/_par//' > Affy.bed
# the source enum for 'dbSnp' is 2; all of the affy* values are higher.
hgsql $db -e "delete from snp where source > 2 "
hgLoadBed $db snp Affy.bed -oldTable -tab
rm -f affy*.txt affy*.bed bed.tab
gzip Affy.bed
#mysql> select source, count(*) from snp group by source;
#+-----------------+----------+
#| source | count(*) |
#+-----------------+----------+
#| dbSnp | 8722437 |
#| Affy10K | 11464 |
#| Affy10Kv2 | 10128 |
#| Affy50K_HindIII | 56965 |
#| Affy50K_XbaI | 58646 |
#+-----------------+----------+
#5 rows in set (52.96 sec)
# March 7, 2005: fixed pseudoautosomal snps:
#affy10Kv2.txt:chrX_par 1920780 1920781 SNP_A-1606360 0 ? C/T
#affy10Kv2.txt:chrX_par 2047561 2047562 SNP_A-1510197 0 ? G/T
#affy10Kv2.txt:chrX_par 2047486 2047487 SNP_A-1510243 0 ? A/G
#affy10Kv2.txt:chrX_par 2060858 2060859 SNP_A-1606356 0 ? A/G
#affy10Kv2.txt:chrX_par 2163964 2163965 SNP_A-1606329 0 ? C/T
delete from snp where chrom = 'chrY' and name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
update snp set chrom = 'chrX' where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
insert into snp
select bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand,
observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception
from snp
where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
select chrom, count(*) from snp where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329') group by chrom;
### hapmapRecombRate (Daryl; September 19, 2005)
# updated coordinates (Daryl; December 8, 2005)
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115
cd /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115
wget -N http://www.stats.ox.ac.uk/~cfreeman/HapMap_Phase1/genetic_map_HapMap_Phase1_UCSC.tar.gz
tar xvfz genetic_map_HapMap_Phase1_UCSC.tar.gz
tail --lines=+2 -q Gen_map_chr*_COMBINED_UCSC.txt | sed 's/_non_par//;s/_par1//;s/_par2//' | awk '{printf "%s\t%d\t%d\t%0.3f\n",$1,$2,$3,$4}' >! hg16.hapmapRecombRate.bed
liftOver hg16.hapmapRecombRate.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz hg17.hapmapRecombRate.bed hg16ToHg17.unmapped
hgLoadBed -bedGraph=4 hg16 hapmapRecombRate hg16.hapmapRecombRate.bed
hgLoadBed -bedGraph=4 hg17 hapmapRecombRate hg17.hapmapRecombRate.bed
rm -f bed.tab Gen_map_chr*.txt
### hapmapRecombHotspot (Daryl; September 19, 2005; chr X data update October 21, 2005)
wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/Genomewidehots16a.txt
wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_non_par_hotspots.txt
wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_par1_hotspots.txt
# this takes about 3 seconds to run
rm -f hg*.hapmapRecombHotspots.bed
tail +2 Genomewidehots16a.txt | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' > hg16.hapmapRecombHotspots.bed
tail +2 chrX_non_par_hotspots.txt | sed s/_non_par// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed
tail +2 chrX_par1_hotspots.txt | sed s/_par1// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed
liftOver hg16.hapmapRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.hapmapRecombHotspots.bed hg16ToHg17.unmapped
hgLoadBed hg16 hapmapRecombHotspots hg16.hapmapRecombHotspots.bed
hgLoadBed hg17 hapmapRecombHotspots hg17.hapmapRecombHotspots.bed
rm -f bed.tab
### encodeRecombHotspot (Daryl; December 8, 2005)
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots
cd /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots
wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Hotspots16c1.txt
wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Readme_rates_hotspots.txt
tail +2 Hotspots16c1.txt | sed 's/ENm010\.7p15\.2/chr7/;s/ENm013\.7q21\.13/chr7/;s/ENm014\.7q31\.33/chr7/;s/ENr112\.2p16\.3/chr2/;s/ENr113\.4q26/chr4/;s/ENr123\.12q12/chr12/;s/ENr131\.2q37\.1/chr2/;s/ENr213\.18q12\.1/chr18/;s/ENr232\.9q34\.11/chr9/;s/ENr321\.8q24\.11/chr8/' | awk '{printf "%s\t%d\t%d\n", $1, $3, $4}' > hg16.encodeRecombHotspot.bed
liftOver hg16.encodeRecombHotspot.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.encodeRecombHotspot.bed hg16ToHg17.unmapped
hgLoadBed hg16 encodeRecombHotspot hg16.encodeRecombHotspot.bed
hgLoadBed hg17 encodeRecombHotspot hg17.encodeRecombHotspot.bed
rm -f bed.tab *bed *unmapped
### Perlegen Recombination Rates and Hotspots (Daryl; December 9, 2005)
# Home page: http://www.stats.ox.ac.uk/mathgen/Recombination.html
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/hotspots.zip
unzip hotspots.zip
tail +2 hotspots.txt | grep -v 1.51000 | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombHotspots.bed
tail +2 coldspots.txt | grep -v "-" | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombColdspots.bed
liftOver hg16.perlegenRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombHotspots.bed hg16ToHg17.hots.unmapped
liftOver hg16.perlegenRecombColdspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombColdspots.bed hg16ToHg17.cold.unmapped
hgLoadBed hg16 perlegenRecombHotspots hg16.perlegenRecombHotspots.bed
hgLoadBed hg17 perlegenRecombHotspots hg17.perlegenRecombHotspots.bed
hgLoadBed hg16 perlegenRecombColdspots hg16.perlegenRecombColdspots.bed
hgLoadBed hg17 perlegenRecombColdspots hg17.perlegenRecombColdspots.bed
rm -f bed.tab hg1*ed *spots*txt
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
cp ../makeBed.pl .
chmod ug+x makeBed.pl
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/recombination_rates.zip
unzip recombination_rates.zip
rm -f hg16.perlegenRecombRate.bed
time ./makeBed.pl > hg16.perlegenRecombRate.bed
cut -f1 hg16.perlegenRecombRate.bed | sort -u
wc -l hg16.perlegenRecombRate.bed
liftOver hg16.perlegenRecombRate.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombRate.bed hg16ToHg17.rates.unmapped
hgLoadBed hg16 perlegenRecombRate hg16.perlegenRecombRate.bed
hgLoadBed hg17 perlegenRecombRate hg17.perlegenRecombRate.bed
rm -f bed.tab chr*_rates.txt hg1*ed
# HapMap Linkage Disequilibrium (Daryl; January 2006)
mkdir -p /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
cd /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
screen
ftp www.hapmap.org
cd ld_data/2005-10
prompt
mget ld_chr*.txt.gz
# look for consistency in max LD distance
set out = maxDist.txt
rm -f $out
touch $out
foreach f (ld_*.txt.gz)
echo -n "$f " >> $out
zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
end
# most should be 249999
grep -v 249999 maxDist.txt
# look for consistency in line counts
# ssh eieio; screen
set out = wcList.txt
rm -f $out
touch $out
# this takes about 2 hours to run completely on eieio (local disk)
foreach f (*.txt.gz)
echo -n $f:r:r " " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
end
# plot the sizes from wcList.txt by population (lines)
# with chrom on the X axis and size on the Y axis.
# look for anomalies
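# In lieu of the plot, a rough screen for outliers (sketch): report the
# min and max line count per population from wcList.txt
awk '{if (!($2 in min) || $3 < min[$2]) min[$2]=$3; if ($3 > max[$2]) max[$2]=$3} END {for (p in min) print p, min[p], max[p]}' wcList.txt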
mkdir ../bed
cd ../bed
# from the raw LD values, compute colors and encode
cat << 'EOF' > makeLdBed.pl
#!/usr/bin/perl -W
sub min ($$)
{
my $a = shift @_;
my $b = shift @_;
if ($a<$b) {return $a;}
return $b;
}
sub encodeDprime($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
elsif ($val>=0) { $ret = ord('a') + $val*9;}
else { $ret = ord('A') - $val*9;}
return chr($ret);
}
sub encodeRsquared($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
return encodeDprime($val);
}
sub encodeLod($$)
{
my $lod = shift @_;
my $dPrime = shift @_;
$ret = ord('a');
if ($lod>=2) # high LOD
{
if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D' -> pink
else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
}
elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD -> blue
return chr($ret);
}
$inDir = shift||"data";
$outDir = shift||"bed";
$foo = "";
$bar = "";
@rest = ();
@pops = ("CEU", "CHB", "JPT", "YRI");
foreach $pop (@pops)
{
opendir(DIR, $inDir) || die "can't open $inDir";
@hmFiles = grep {/^ld_/ && /_${pop}.txt.gz$/} readdir(DIR); #ld_chr22_CEU.txt.gz
closedir(DIR);
printf "\nPOP:\t$pop\t$#hmFiles\n";
foreach $hmFile (sort @hmFiles)
{
($foo, $chrom, $bar) = split /_/, $hmFile;
$chrom =~ s/chrx/chrX/;
$chrom =~ s/chry/chrY/;
$outfile = "$outDir/${pop}_${chrom}.bed";
if ((-e $outfile)||(-e "$outfile.gz")) { next; }
$tmpFile = "/tmp/${pop}_${chrom}.bed";
printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
$line = <IN>;
chomp($line);
($chromStart, $chromEnd, $pop, $name, $marker2, $dprime, $rsquared, $lod, @rest) = split / /, $line;
$ldCount = 1;
$dprimeList = encodeDprime($dprime);
$rsquaredList = encodeRsquared($rsquared);
$lodList = encodeLod($lod, $dprime);
while (<IN>)
{
chomp();
($chromStartNew, $chromEndNew, $pop, $nameNew, $marker2, $dprime, $rsquared, $lod, @rest) = split / /;
if ($chromStart ne $chromStartNew)
{
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
$chromStart = $chromStartNew;
$chromEnd = $chromEndNew;
$name = $nameNew;
$ldCount = 1;
$dprimeList = encodeDprime($dprime);
$rsquaredList = encodeRsquared($rsquared);
$lodList = encodeLod($lod, $dprime);
}
elsif ($chromEndNew-$chromStartNew<250000)
{
$chromEnd = $chromEndNew;
$ldCount++;
$dprimeList .= encodeDprime($dprime);
$rsquaredList .= encodeRsquared($rsquared);
$lodList .= encodeLod($lod, $dprime);
}
}
close(IN);
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
close(OUT);
system("gzip $tmpFile");
system("mv $tmpFile.gz $outDir");
}
}
EOF
#
chmod ug+x ./makeLdBed.pl
ssh eieio
screen
time ./makeLdBed.pl
# look for consistency in line counts
# ssh eieio
set out = wcList.txt
rm -f $out
touch $out
foreach f (*.bed.gz)
echo -n $f:r:r " " | sed 's/chr//g;s/_/\t/g' >> $out
zcat $f | wc -l >> $out
end
# plot the sizes from wcList.txt by population (lines)
# with chrom on the X axis and size on the Y axis.
# look for anomalies
# load data
sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
# The length of each of the three value vectors (rsquared, dprime,
# and lod) is the same and is stored in the score field.
# 30-40 minutes
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
echo
echo -n loading CEU chr${c}
zcat CEU_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdCeu CEU_chr${c}.bed.gz
echo
echo -n loading CHB chr${c}
zcat CHB_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdChb CHB_chr${c}.bed.gz
echo
echo -n loading JPT chr${c}
zcat JPT_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdJpt JPT_chr${c}.bed.gz
echo
echo -n loading YRI chr${c}
zcat YRI_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdYri YRI_chr${c}.bed.gz
end
rm -f bed.tab
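# Spot-check the vector-length invariant described above (sketch;
# assumes the column names dprime, rsquared and lod from hapmapLd.sql):
hgsql hg16 -e 'select count(*) from hapmapLdCeu where length(dprime) != score or length(rsquared) != score or length(lod) != score'
# 0 expected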
# Tajima's D (DONE -- 2005-06-04 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]
set db=hg16
set dir=/cluster/data/$db/bed/tajdpoly/latest
cd $dir
set chain = "/gbdb/hg17/liftOver/hg17ToHg16.over.chain"
foreach p (AD ED XD)
# lift SNP tracks
set f = $p.SNP.track
set in = /cluster/data/hg17/bed/tajdpoly/latest/$f.bed4
set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
liftOver $in $chain $out.bed4 $out.unmapped
# lift tajd tracks
set f = $p.tajd.track
set in = /cluster/data/hg17/bed/tajdpoly/latest/$f.bedGraph
set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
liftOver $in $chain $out.bedGraph $out.unmapped
# load SNP tracks
set f = $p.SNP.track.hg16
echo `date` $f "=>" $f.bed4
hgLoadBed $db tajdSnp$p $f.bed4
head -3 $f*
hgsql -e "select * from tajdSnp$p limit 3" $db
# load tajd tracks
set f = $p.tajd.track.$db
echo `date` $f "=>" $f.bedGraph
hgLoadBed -bedGraph=4 $db tajd$p $f.bedGraph
head -3 $f*
hgsql -e "select * from tajd$p limit 3" $db
end
# deleting elements that overlap with gaps -- tajd files have overlaps due to the windowing scheme (snps are not found in gaps)
rm -f delete.sql
touch delete.sql
set where = "where t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
foreach p (AD ED XD SnpAD SnpED SnpXD)
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo "select 'tajd$p' as pop, t.chrom, t.chromStart from tajd${p} t, chr${c}_gap g $where " | \
hgsql $db | grep -v pop | \
awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n", $1, $2, $3}' >> delete.sql
end
end
hgsql $db < delete.sql
# cleanup elements that didn't get deleted properly
## create cleanup.pl:
cat << '_EOF_' > cleanup.pl
#!/usr/bin/perl -W
$pop=shift;
while (<>)
{
if (/^(chr..?)\s+(\d+)/)
{ print "delete from tajd$pop where chrom='$1' and chromStart<$2 and chromEnd>$2;\n"; }
}
'_EOF_'
chmod +x cleanup.pl
foreach p (AD ED XD)
featureBits $db tajd$p gap -bed=$p.inGaps.bed
./cleanup.pl $p < $p.inGaps.bed | hgsql $db
featureBits $db tajd$p gap -bed=$p.inGaps.bed ## should be empty now
end
# JAX ORTHOLOG (WORKING hiram 2004-02-20 )
# Add Jackson labs info
cd /cluster/data/hg16/bed
mkdir jaxOrtholog
cd jaxOrtholog
wget --timestamping ftp://ftp.informatics.jax.org/pub/reports/HMD_Human4.rpt
# save a little space
gzip HMD_Human4.rpt
# this is a tricky one to parse. This .rpt file is plain text, no
# tabs, with expected text columns to contain the data. We need to
# convert this. Beware of table changes, you may need to rework
# this each time if they change the data. Here is what we have
# today, an example first line with text columns numbered:
# 1234567 101234567 201234567 301234567 401234567 501234567 601234567 701234567 801234567 90123456 100123456 110123456 120123456 130123456 140123456 150123456 160123456 170123456 180123456 170
# MGI:1918914 71664 0610006F02Rik 10 syntenic D3 196410 MGC17301 12q13.13
# ^ mgiId
# ^ mouse chr
# ^ mouseCm position
# ^ possible Mouse band
# Mouse-Human Symbol ^
# Human Symbol ^
# ^ Human Band(s)
# This awk script picks out the correct columns, removes spaces,
# picks the first of possibly several human band designations,
# and decides if a mouse band has been specified
cat << '_EOF_' > jaxToUCSC.awk
/^MGI:/ {
LAST=NF
PREV=LAST-1
humanSymbol = substr($0,153,26)
gsub(" ","",humanSymbol)
Band = substr($0,179)
gsub(" *$","",Band)
gsub("^ *","",Band)
mgiId = substr($0,1,31)
gsub(" ","",mgiId)
mouseSym = substr($0,63,26)
gsub(" ","",mouseSym)
mouseChr = substr($0,89,13)
gsub(" ","",mouseChr)
mouseCm = substr($0,102,9)
gsub(" ","",mouseCm)
mouseBand = substr($0,111,11)
gsub(" ","",mouseBand)
if (length(mouseBand) < 1) { mouseBand = "N/A" }
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", humanSymbol,Band,
mgiId,mouseSym,mouseChr,mouseCm,mouseBand
}
'_EOF_'
# << this line makes emacs coloring happy
# then using that script to fix it:
zcat HMD_Human4.rpt.gz | awk -f jaxToUCSC.awk > jaxOrtholog.tab
# Drop (just in case), create and load the table:
hgsql -e 'drop table jaxOrtholog;' hg16
hgsql hg16 < ~/kent/src/hg/lib/jaxOrtholog.sql
hgsql -e \
'load data local infile "jaxOrtholog.tab" into table jaxOrtholog;' hg16
# save a little space
gzip jaxOrtholog.tab
# LOAD ACEMBLY (DONE - 2004-03-30 - Hiram)
mkdir -p /cluster/data/hg16/bed/acembly
cd /cluster/data/hg16/bed/acembly
# Data is obtained from:
# Danielle et Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_34.genes.gff.tar.gz
tar xvzf acembly.ncbi_34.genes.proteins.fasta.tar.gz
cd acembly.ncbi_34.genes.gff
# chrom 6.gff is broken, it has a bogus number in the first column
# where a 6 should be. Fix-up until I hear from the authors:
mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
sed -e "s/^28212469/6/" x1.acemblygenes.6.gff.broken > x1.acemblygenes.6.gff
# There are a number of start and end coordinates that are
# in reversed order. Until I hear from the authors, I have
# switched those coords:
cat << '_EOF_' > fixupReversedBlocks
#!/bin/sh
for i in x1*.gff
do
echo -n "$i working ..."
awk -F"\t" '
{
if ($4 > $5) {
printf "%s\t%s\t%s\t%s\t%s", $1, $2, $3, $5, $4
for ( i = 6; i <= NF; ++i ) {
printf "\t%s", $i
}
printf "\n"
} else
print
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x fixupReversedBlocks
./fixupReversedBlocks
# Save just the floating-contig features to different files for lifting
# and lift up the floating-contig features to chr*_random coords:
# NOTE: file prefix (x1) has been added since build 31
foreach f (x1.acemblygenes.*.gff.fixed)
set c=$f:r:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end
# that last grep strips out _random or floating contig lines from the
# normal chrom gff, and add the "chr" prefix
# Three of them end up empty, check for this and remove them
# if necessary
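# sketch: list any empty gff files before removing them
foreach f (chr*.gff)
    if (-z $f) echo "$f is empty"
end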
rm -f chr19_random.gff chr18_random.gff chrUn.gff
# There was one error in a coordinate on chr17_random:
# chr17_random acembly stop_codon -2 0 . + 1 gene_id M17S2; transcript_id M17S2.cDec03;
# This line was removed (shows up as first line) from
# chr17_random.gff before the database load
#- Load into database:
cd ..
ldHgGene -gtf hg16 acembly acembly.ncbi_34.genes.gff/chr*.gff
hgPepPred hg16 generic acemblyPep \
acembly.ncbi_34.genes.proteins.fasta/*.fasta
# check that the track is OK
checkTableCoords hg16 acembly
# should display no errors
# MAKE HUMAN-CHIMP OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/bedOver/hg16toPt0
cd /cluster/data/hg16/bed/bedOver/hg16toPt0
# use the combined blastz-blat best human chain, but assign unique IDs
# so that netChainSubset doesn't die:
chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \
| chainMergeSort stdin \
| chainSplit chain stdin
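    # Sanity check that the re-assigned chain IDs really are unique
    # (sketch; the id is the last (13th) field of each chain header line):
    cat chain/*.chain | awk '$1 == "chain" {print $13}' | sort -n | uniq -d
    # no output expected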
# re-net with the new IDs:
mkdir net
foreach f (chain/*.chain)
echo netting $f
chainNet $f /cluster/data/hg16/chrom.sizes \
/cluster/data/pt0/scaffold.sizes net/$f:t:r.net /dev/null
end
# Now get a single-cov subset as usual:
mkdir subset
foreach f (chain/*.chain)
echo subsetting net/$f:t:r.net, $f to subset/$f:t
netChainSubset net/$f:t:r.net $f subset/$f:t
end
cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16Topt0.chain
# make it available:
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver/
zip -j hg16Topt0.zip /cluster/data/hg16/bed/bedOver/hg16Topt0.chain
# update README.txt
# lift scaffold-based over.chain to chrom-based (2004-07-09 kate)
ssh kksilo
cd /cluster/data/hg16/bed/bedOver
liftUp -chainQ hg16TopanTro1.chain /cluster/data/panTro1/jkStuff/scaffolds.lft warn hg16Topt0.chain
# NOTE: these chains appear to be broken up -- try using all chains,
# instead of reciprocal best
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netChainSubset human.net all.chain over.chain
# load just for ENCODE dev
hgLoadChain hg16 liftOverPanTro1Chain over.chain
# TODO: delete table
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
chainSwap \
/cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/all.newId.chain \
all.newId.swp.chain
chainSplit chain.newId all.newId.swp.chain
mkdir preNet
cd chain.newId
cat > preNet.csh << 'EOF'
foreach i (*.chain)
echo pre-netting $i
chainSort $i stdout | \
chainPreNet stdin /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes ../preNet/$i
end
'EOF'
csh preNet.csh >&! preNet.log &
tail -100f preNet.log
cd ..
# << for emacs
mkdir n1
cd preNet
cat > net.csh << 'EOF'
foreach i (*.chain)
set n = $i:r.net
echo netting $i
chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes ../n1/$n /dev/null
end
'EOF'
csh net.csh >&! net.log &
tail -100f net.log
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# GOT HERE
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netClass hNoClass.net hg16 panTro1 chimp.newId.net
# chain files from the net
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netChainSubset chimp.newId.net all.newId.swp.chain over.newId.chain
cp over.newId.chain \
/cluster/data/hg16/bed/liftOver/hg16ToPanTro1.newId.over.chain
mv hg16TopanTro1.chain hg16TopanTro1.chain.old
cd /cluster/data/hg16/bed/liftOver
ln -s hg16ToPanTro1.newId.over.chain hg16TopanTro1.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1
hgLoadChain hg16 liftOverPanTro1NewIdChain over.newId.chain
# MAKE HUMAN-CHICKEN OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/bedOver/hg16TogalGal2
cd /cluster/data/hg16/bed/bedOver/hg16TogalGal2
set chainDir = /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netSplit $chainDir/human.net net
mkdir subset
foreach f ($chainDir/chain/*.chain)
echo subsetting $f:t:r
netChainSubset net/$f:t:r.net $f subset/$f:t
end
cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16TogalGal2.chain
# HUMAN/MOUSE/RAT/CHICKEN (HMRG) PHYLOHMM CONSERVATION (IN PROGRESS 2004-03-8 kate)
# Set path
set path = ($path /cluster/bin/woody)
# Obtain phylogenetic model (hmrc_rev_dg.mod)
# from Adam (hand-tuned, instead of fit_model)
# then, create New Hampshire tree for data (.nh file)
cat hmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
#-0.891523 0.166770 0.574850 0.149902
#0.223389 -1.146311 0.153784 0.769137
#0.769591 0.153699 -1.147159 0.223869
#0.149605 0.573055 0.166888 -0.889548
#TREE: ((1:0.192598,(2:0.076303,3:0.083043):0.192598):0.47,4:0.47);
/cluster/data/woody/scripts/extract-tree.pl human,mouse,rat,chicken \
hmrc_rev_dg.mod
#((human:0.192598,(mouse:0.076303,rat:0.083043):0.192598):0.47,chicken:0.47);
ssh eieio
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
cd phyloHMM
# now, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
# NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor run
# NOTE: next time add "check out" lines to assure files are created
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
cp /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2/*.maf)
echo "doSplit $file" >> jobs.lst
end
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
para create jobs.lst
# para try, para push, etc.
# now setup and run the cluster job to compute the conservation scores
# NOTE: need to use gensub2 / check out+ facilities to check for
# failures (see the sketch after the jobs2.lst loop below). Will want
# to chunk msa_split output (above) into chr dirs to make the gensub
# template reasonable.
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
echo "doPostProbs $file" >> jobs2.lst
end
wc -l jobs2.lst
para create jobs2.lst
# etc ... (run cluster job)
# Create wiggle (.wib) files and load them into the database
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
mkdir wibLimits
mkdir wib
cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chrom \
-dataSpan=1 -wibFile=wib/${chrom}_hmrg_phyloHMM -name=hmrg \
stdin > wibLimits/${chrom}
end
'EOF'
csh makeWig.csh >&! makeWig.log &
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
hgLoadWiggle hg16 multizMm3Rn3GalGal2_phyloHMM_wig wib/*_hmrg_phyloHMM.wig
ln -s `pwd`/wib/chr*_hmrg_phyloHMM.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
# Add zoom records to table to speed display of large regions (>600Kbp)
# NOTE: this doesn't work -- the rows were dropped
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
mkdir -p wib1K wibLimits1K
cat > wigZoom1K.csh << 'EOF'
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigZoom stdin | wigAsciiToBinary -chrom=$chrom \
-dataSpan=1024 -wibFile=wib1K/${chrom}_hmrg_phyloHMM_1K \
-name=hmrg stdin > wibLimits1K/${chrom}
end
'EOF'
csh wigZoom1K.csh >&! wigZoom1K.log &
tail -100f wigZoom1K.log
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/wib1K
hgLoadWiggle -oldTable hg16 multizMm3Rn3GalGal2_phyloHMM_wig *.wig
# create symlinks for .wib files
ln -s `pwd`/*.wib /gbdb/hg16/wib
# NOTE: this doesn't work -- the rows were dropped
# setup external files for database reference
# reuse mafs loaded in the maf track (just symlink the /gbdb dir before
# loading
ssh hgwdev
ln -s /gbdb/hg16/multizMm3Rn3GalGal2 /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM
# load into database
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2_phyloHMM
# create trackDb entry
# track multizMm3Rn3GalGal2_phyloHMM
# type wigMaf 0.0 1.0
# wiggle multizMm3Rn3GalGal2_phyloHMM_wig
# etc.
# Load pairwise mafs
ssh hgwdev
cd /gbdb/hg16
mkdir -p mouse_hmrg rat_hmrg chicken_hmrg
foreach f (/cluster/data/hg16/bed/humor/maf/*.mm3.maf)
ln -s $f /gbdb/hg16/mouse_hmrg
end
cd /tmp
hgLoadMaf -WARN hg16 mouse_hmrg
foreach f (/cluster/data/hg16/bed/humor/maf/*.rn3.maf)
ln -s $f /gbdb/hg16/rat_hmrg
end
cd /tmp
hgLoadMaf -WARN hg16 rat_hmrg
foreach f (/cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf)
ln -s $f /gbdb/hg16/chicken_hmrg
end
cd /tmp
hgLoadMaf -WARN hg16 chicken_hmrg
# copy files to download area
set dir = /usr/local/apache/htdocs/goldenPath/hg16/multizMm3Rn3GalGal2
mkdir $dir
ln -s $dir multiz
cp -p /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM/*.maf $dir
cd $dir
gzip *
# As the 5-way alignment is imminent, this wasn't completed
# edit downloads page to add links
# add pairwise mafs to downloads page
mkdir $dir/{rn3,mm3}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
# also add human/chicken maf's
# Create upstream files
ssh hgwdev
echo hg16 mm3 rn3 galGal2 > org.txt
foreach i (1000 2000 5000)
featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
mafFrags hg16 multizMm3Rn3GalGal2 up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
# miRNA track (DONE - 2004-05-04 - Hiram)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
ssh hgwdev
mkdir /cluster/data/rn3/bed/miRNA
cd /cluster/data/rn3/bed/miRNA
wget --timestamping \
hgLoadBed rn3 miRNA rn3.bed
# entry in trackDb/trackDb.ra already there
# miRNA track (UPDATED - 2004-05-04 - Hiram)
# (first version done 2004-03-02)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
cd /cluster/data/hg16/bed
mv miRNA miRNA.2004_03_02
mkdir miRNA
cd miRNA
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/hsa_ncbi34.*"
grep -v "^track " hsa_ncbi34.bed | sed -e "s/ /\t/g" > hg16.bed
# check existing track for comparison after update load
# featureBits hg16 miRNA
# 15385 bases of 2865248791 (0.001%) in intersection
hgLoadBed hg16 miRNA hg16.bed
# featureBits hg16 miRNA
# 16923 bases of 2865248791 (0.001%) in intersection
# added an entry to trackDb/trackDb.ra: (good for Mm4 and Ce1 too)
track miRNA
shortLabel miRNA
longLabel MicroRNAs from the miRNA Registry
group genes
priority 63
visibility hide
useScore 1
color 255,64,64
type bed 8
url http://www.sanger.ac.uk/cgi-bin/Rfam/mirna/mirna_entry.pl?id=$$
# Note the useScore item. This colors plus strand items in black
# and minus strand items in gray. A rarely used option.
# This same track is in Rn3, Mm4 and Ce2 too. Added
# findBedPos(query, hgp, "miRNA");
# to lib/hgFind.c to allow searching for these items.
#5-WAY MULTIZ & PHYLO-HMM HUMAN/CHIMP/MOUSE/RAT/CHICKEN (3/19/04, kpollard)
# UPDATE WOODY BINARIES
ssh hgwdev
cd /cluster/data/woody
cvs update -dP
cd src
make
# make sure Makefile has INSTALLDIR = /cluster/bin/woody
make install
#MULTIZ to add chimp, then chicken to HUMOR (see above)
ssh kk
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $fiveDir/hmrp
mkdir -p $fiveDir/hmrpg
cd $fiveDir
#wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz
#CHIMP
# put the MAFs on bluearc
ssh eieio
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $clustDir/hp
mkdir -p $clustDir/hmr
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf $clustDir/hmr
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf $clustDir/hp
logout # back to kk
#set up joblist (common denominator set: no chr19_random in hmr)
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
cd $fiveDir
rm -f jobList
foreach file ($clustDir/hmr/*.maf)
set root=`echo $file:t:r | sed 's/\.hmr//'`
echo "mz $clustDir/hp/${root}.maf $file $fiveDir/hmrp/${root}.maf" >> jobList
end
#run on kk
chmod +x jobList
para create jobList
#para try, para check, para push, etc.
#add chr19_random from hp to hmrp
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr19_random.maf $fiveDir/hmrp
#clean up bluearc
ssh eieio
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
rm -r $clustDir/hp
rm -r $clustDir/hmr
#CHICKEN
# put the MAFs on bluearc
ssh eieio
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $clustDir/hmrp
mkdir -p $clustDir/hg
cp $fiveDir/hmrp/*.maf $clustDir/hmrp
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
    $clustDir/hg
logout # back to kk
logout #move to kki
#set up job list 2
ssh kki
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
cd $fiveDir
rm -f jobList.2
foreach file ($clustDir/hg/*.maf)
set root=`echo $file:t:r | sed 's/\.hg//'`
echo "mz $file $clustDir/hmrp/${root}.maf $fiveDir/hmrpg/${root}.maf" >> jobList.2
end
#run on kki
chmod +x jobList.2
para create jobList.2
#para try, para check, para push, etc.
# clean up bluearc
ssh eieio
rm -r /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
logout
#PHYLO-HMM CONSERVATION
#Set path
set path = ($path /cluster/bin/woody)
#Create "sufficient statistics" (SS) file from maf
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
mkdir phyloHMM
cd phyloHMM
# create script to run msa_view.
cat > makeSS.csh << 'EOF'
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
foreach f (chr*.maf)
set c = $f:r
echo "$c"
msa_view $f -i MAF -o SS -s 1 -r 1 -O hg16,mm3,rn3,panTro1,galGal2 > \
../phyloHMM/$c.ss
end
'EOF'
csh makeSS.csh >&! makeSS.log &
tail -100f makeSS.log
head phyloHMM/chr1.ss
head phyloHMM/chrY.ss
#model hpmrc_rev_dg.mod (from Adam)
set path = ($path /cluster/bin/woody)
cat hpmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
# -0.891523 0.166770 0.574850 0.149902
# 0.223389 -1.146311 0.153784 0.769137
# 0.769591 0.153699 -1.147159 0.223869
# 0.149605 0.573055 0.166888 -0.889548
#TREE: (((1:0.0056,2:0.0057):0.1043,(3:0.076303,4:0.083043):0.2753):0.47,5:0.47);
/cluster/data/woody/scripts/extract-tree.pl human,chimp,mouse,rat,chicken \
hpmrc_rev_dg.mod
#(((human:0.0056,chimp:0.0057):0.1043,(mouse:0.076303,rat:0.083043):0.2753):0.47,chicken:0.47);
#order is human-chimp-mouse-rat-chicken, so fix maf order in next step
#break up the genome-wide MAFs into pieces
# NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor
mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
cp /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,panTro1,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/*.maf)
echo "doSplit $file" >> jobs.lst
end
#run on kki
ssh kki
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
para create jobs.lst
# para try, para check, para push, etc.
logout
#compute the conservation scores
# NOTE: need to use gensub2, check out+ facilities to check for
# failures. Will want to chunk msa_split output (above) into chr dirs.
# to make the gensub template reasonable.
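#   A sketch of the gensub2 setup that note has in mind (not used here;
#   modeled on the doChain/gensub2 runs later in this doc -- the output
#   path is illustrative only):
#     #LOOP
#     doPostProbs {check in exists $(path1)} {check out exists+ POSTPROBS/$(root1).postprob.gz}
#     #ENDLOOP
#   then: gensub2 windowList single template jobList ; para create jobList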
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hpmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
# << this line makes emacs coloring happy
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
echo "doPostProbs $file" >> jobs2.lst
end
wc -l jobs2.lst
para create jobs2.lst
#para try, para check, para push, etc.
#1 problem: chr19_random crashed - due to no alignments in HMR. Leave out.
# Create wiggle (.wib) file and load into database
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
mkdir wibLimits
mkdir wib
cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chrom \
-dataSpan=1 -wibFile=wib/${chrom}_hpmrg_phyloHMM -name=hpmrg \
stdin > wibLimits/${chrom}
end
'EOF'
# << this line makes emacs coloring happy
csh makeWig.csh >&! makeWig.log &
#load tables
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
hgLoadWiggle hg16 mzPt1Mm3Rn3Gg2_pHMM_wig wib/*_hpmrg_phyloHMM.wig
ln -s `pwd`/wib/chr*_hpmrg_phyloHMM.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
mkdir -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
hgLoadMaf hg16 -warn mzPt1Mm3Rn3Gg2_pHMM
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
mkdir -p /gbdb/hg16/chimp_hmrg
ln -s /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf /gbdb/hg16/chimp_hmrg
hgLoadMaf hg16 -warn chimp_hmrg
#cleanup bluearc
ssh eieio
rm -r /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
logout
#Add description file: mzPt1Mm3Rn3Gg2_pHMM.html
#Add track to trackDb.ra: mzPt1Mm3Rn3Gg2_pHMM
#Copy files to download area
cd /gbdb/hg16
set dir = /usr/local/apache/htdocs/goldenPath/hg16/mzPt1Mm3Rn3Gg2
mkdir $dir
ln -s $dir multiz
cp -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM/*.maf $dir
cd $dir
gzip *
# edit downloads page to add links
# add pairwise mafs to downloads page
mkdir $dir/{rn3,mm3,pt1,gg2}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest
cp *.maf $dir/gg2
gzip $dir/gg2/*
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
cp *.maf $dir/pt1
gzip $dir/pt1/*
# EXONIPHY HMR
# (started, acs, 2004-03-23)
# (redone 2004-07-01, with new version of software; have revised
# docs accordingly)
# Warning: some commands here require bash shell
ssh hgwdev
# (make sure /cluster/bin/phast is in path)
mkdir /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
cd /cluster/data/hg16/bed
ln -s /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
ln -s exoniphy.hg16mm3rn3.2004-03-23 exoniphy.hg16mm3rn3
# first, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/humor
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
logout
ssh kk
cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
cat << '_EOF_' > doSplit
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/exoniphy.hg16mm3rn3/WINDOWS
maf=$1
prefix=`basename $maf .hmr.maf`
chr=`echo $prefix | sed 's/chr//g ; s/_random//g'`
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf --in-format MAF --refseq ${FA_SRC}/$prefix.fa --order hg16,mm3,rn3 --windows 50000,2000 --out-root /scratch/msa_split/$prefix --out-format SS --min-informative 1000 --between-blocks 1000 --tuple-size 3
mkdir -p ${WINDOWS}/$chr
cd /scratch/msa_split
for file in `ls | egrep -w ${prefix}` ; do gzip -c $file > ${WINDOWS}/$chr/$file.gz ; rm $file ; done
_EOF_
# << this line makes emacs coloring happy
chmod +x doSplit
mkdir -p WINDOWS
rm -rf WINDOWS/* jobs.lst
for file in /cluster/bluearc/hg16/bed/humor/*.maf ; do echo "doSplit $file" >> jobs.lst ; done
para create jobs.lst
# etc ... (run cluster job)
# now set up cluster job for exoniphy.
cat << '_EOF_' > doExoniphy
#!/bin/bash
zcat $1 | /cluster/bin/phast/exoniphy - ${*:3} > $2
_EOF_
# << this line makes emacs coloring happy
chmod +x doExoniphy
rm -f jobs.lst
for dir in WINDOWS/* ; do
chrNo=`basename $dir`
mkdir -p OUTPUT/$chrNo
for file in $dir/* ; do
base=`basename $file .ss.gz`
chrStr=`echo $base | awk -F\. '{print $1}'`
echo "doExoniphy $file OUTPUT/$chrNo/$base.gff --seqname $chrStr --idpref $base --score --indels --quiet " >> jobs.lst
done
done
#[acs@kk exoniphy.hg16mm3rn3]$ wc jobs.lst
# 59175 591750 7179445 jobs.lst
para create jobs.lst
# etc... (run cluster job)
#Completed: 59175 of 59175 jobs
#CPU time in finished jobs: 49361849s 822697.48m 13711.62h 571.32d 1.565 y
#IO & Wait Time: 258451s 4307.52m 71.79h 2.99d 0.008 y
#Average job time: 839s 13.98m 0.23h 0.01d
#Longest job: 1868s 31.13m 0.52h 0.02d
#Submission to last job: 75584s 1259.73m 21.00h 0.87d
# create track
logout
ssh hgwdev
cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
for dir in OUTPUT/* ; do
chrNo=`basename $dir`
echo $chrNo
find $dir -name "*.gff" | grep -v random > files
if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr$chrNo.gff ; fi
find $dir -name "*.gff" | grep random > files
if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr${chrNo}_random.gff ; fi
done
ldHgGene -gtf -frame hg16 exoniphy chr*.gff
#track exoniphy
#shortLabel Exoniphy
#longLabel Exoniphy: Conserved Exon Predictions (Human/Mouse/Rat)
#group genes
#priority 50.9
#visibility hide
#color 173,17,162
#type genePred
#
# Load tfbsCons track DONE 2004-03-31 braney
#
ssh hgwdev
set humordir=/gbdb/hg16/humorMm3Rn3
set transfacdir=/projects/compbio/data/transfac
set outdir=hg16_tfbsCons
mkdir /cluster/data/hg16/bed/tfbsCons
cd /cluster/data/hg16/bed/tfbsCons
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg15/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
# the following takes days (says Matt)
nice getTfbsConsData.pl `pwd` $humordir $transfacdir ./IDS.txt $outdir -over &
cd $outdir
rm chr*.bed
hgLoadBed -noSort hg16 tfbsCons -sqlTable=$HOME/kent/src/hg/lib/tfbsCons.sql tfbsCons.bed -tab
# Get mapping of ID's from Matt so we can link into the TRANSFAC database
set idmap=/cluster/data/hg16/bed/tfbsCons/tfbsConsMap
hgsql hg16 < ~/kent/src/hg/lib/tfbsConsMap.sql
echo "load data local infile '$idmap' into table tfbsConsMap;" | hgsql hg16
# PREP FOR LIFTOVER CHAINS TO HG16 (2004-04-12 kate)
# split into 3K chunks
ssh eieio
set tempDir = /cluster/bluearc/hg/gs.17/build34/liftOver
cd $tempDir
mkdir lift
cat > split.csh << 'EOF'
set scratch = /iscratch/i/gs.17/build34/liftOver/split
mkdir -p $scratch
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
echo chr$i
faSplit -lift=lift/chr$i.lft size /cluster/data/hg16/$i/chr$i.fa -oneFile 3000 $scratch/chr$i
end
'EOF'
csh split.csh >&! split.log &
tail -100f split.log
/cluster/bin/iSync
# ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31]
# download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores
# ecotigHF - ecores on Human, genome conserved with Fugu, Fr1
# ecotigHT - ecores on Human, genome conserved with Tetraodon (March 2004)
ssh hgwdev
mkdir /cluster/data/hg16/bed/ecores/
# add parse_ecotig.pl to this directory
# FUGU
mkdir /cluster/data/hg16/bed/ecores/fr1
cd /cluster/data/hg16/bed/ecores/fr1/
# download data for ecotigHF to this directory
# parse ecotig files to produce a bed format file
perl ../parse_ecotig.pl < ecotigHF > ecotigHF.bed
# change from upper to lower case for "CHR"
perl -pi.bak -e 's/CHR/chr/g' ecotigHF.bed
hgLoadBed -tab hg16 ecoresFr1 ecotigHF.bed
# clean up
rm *.bak
# TETRAODON
mkdir /cluster/data/hg16/bed/ecores/tetraodon
cd /cluster/data/hg16/bed/ecores/tetraodon/
# download data for ecotigHT to this directory
# parse ecotig files to produce a bed format file
perl ../parse_ecotig.pl < ecotigHT > ecotigHT.bed
# change from upper to lower case for "CHR"
perl -pi.bak -e 's/CHR/chr/g' ecotigHT.bed
hgLoadBed -tab hg16 ecoresTetraodon ecotigHT.bed
# clean up
rm *.bak
# add entries in kent/src/hg/makeDb/trackDb/human/hg16/trackDb.ra
# add html for details pages to this directory:
# ecoresFr1.html and ecoresTetraodon.html
# VNTR MICROSATELLITE REPEATS FROM GEROME BREEN (DONE 4/28/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/vntr
cd /cluster/data/hg16/bed/vntr
# saved email attachment from Gerome Breen <g.breen@iop.kcl.ac.uk>
# as HumJuly2003microsats_finished_for_angieH.txt
# Replace 1-based start coords with 0-based, tweak n/a distance values:
tail +2 HumJuly2003microsats_finished_for_angieH.txt \
| perl -wpe 's/(first|last) in chromosome\/sequence/-1/i' \
| awk '{printf "%s\t%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6, $7, $8, $9, $10;}' \
> vntr.bed
hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/vntr.sql hg16 \
vntr vntr.bed
# WEBB'S PUTATIVE NON-EXONIC CONSERVED REGIONS (DONE 4/6/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/webbNonExonic
cd /cluster/data/hg16/bed/webbNonExonic
wget http://bio.cse.psu.edu/~webb/nonexonic.tar.gz
tar xvzf nonexonic.tar.gz
# Score should really be scaled from the raw 5k..276k range to the
# browser's 200-1000 score range (a sketch follows the load below).
cat chr* \
| awk '{printf "%s\t%d\t%d\t%s:%d-%d\t%d\t%c\n", $2, $3-1, $4, $5, $6, $7, $9, $8;}' \
> webbNonExonic.bed
hgLoadBed hg16 webbNonExonic webbNonExonic.bed
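# A sketch of that rescaling (not run for this track): linearly map raw
# scores in [5000,276000] onto [200,1000], clamping at the ends; the
# columns assume the bed 6 layout created above.
#   awk 'BEGIN {OFS="\t";} \
#     { s = 200 + ($5 - 5000) * 800 / 271000; \
#       if (s < 200) s = 200; if (s > 1000) s = 1000; \
#       print $1, $2, $3, $4, int(s), $6; }' \
#     webbNonExonic.bed > webbNonExonicScaled.bed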
# phylo HMM data quintile calculation
ssh eieio
cat << '_EOF_' > /tmp/allpHMMdata.sh
#!/bin/sh
# there is only an empty file in chr13_random, it causes all
# files following it on the xargs zcat line to be missed.
# Eliminate it from the processing
find ./POSTPROBS -type f | grep -v chr13_random | sort -t\. -k2,2n | \
xargs zcat | awk '{print $2}' > /tmp/pHMM.data
'_EOF_'
chmod +x /tmp/allpHMMdata.sh
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
time /tmp/allpHMMdata.sh
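# For reference, a cutoff like the 0.450 used in the top-5% step below
# could be read off this file roughly as follows (a sketch; assumes the
# full genome's worth of values fits through sort):
#   sort -n /tmp/pHMM.data > /tmp/pHMM.sorted
#   n=`wc -l < /tmp/pHMM.sorted`
#   sed -n "`awk -v n=$n 'BEGIN{printf "%d", n*0.95}'`p" /tmp/pHMM.sorted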
# Create top 5 % set of data for phyloHMMcons.hg16mm3rn3.2003-11-11
# (DONE - 2004-05-15 - Hiram)
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cat << '_EOF_' > top5.sh
#!/bin/sh
#
# Do not work on chr13_random, it has no data
# this for loop should have been:
# ls POSTPROBS/chr* | sort -t\. -k2,2n | while read i
# to get the data in properly sorted order. With this as is,
# we will need to sort the coords later to make any wiggle
# track out of this data
#
mkdir top5_data
for i in POSTPROBS/chr*
do
c=${i/POSTPROBS\//}
echo $i $c
if [ "$c" != "chr13_random" ]; then
if [ ! -f top5_data/$c.ascii.gz ]; then
find ${i} -type f | sort -t\. -k2,2n | while read FN
do
zcat ${FN}
done | awk '{if ($2 > 0.450) print}' > top5_data/$c.ascii
rm -f top5_data/$c.ascii.gz
gzip top5_data/$c.ascii &
else
ls -og top5_data/$c.ascii.gz
fi
fi
done
'_EOF_'
chmod +x top5.sh
# running this script takes several hours, make sure you do it
# on the file server
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
# Then, to make the histogram data:
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/top5_data
cat << '_EOF_' > mkHisto.sh
#!/bin/sh
for f in chr*.ascii.gz
do
zcat $f
done | textHistogram -real -col=2 -binSize=0.001 -maxBinCount=1000 stdin
'_EOF_'
chmod +x mkHisto.sh
./mkHisto.sh > histoGram.data
# BLASTZ FUGU (FR1) (DONE 4/19/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg16/bed/blastz.fr1.2004-04-19
ln -s /cluster/store7/hg16/bed/blastz.fr1.2004-04-19 \
/cluster/data/hg16/bed/
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
# Set L=6000 (more relaxed than chicken) and abridge repeats.
# Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store7/hg16/bed/blastz.fr1.2004-04-19
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash # if a csh/tcsh user
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
#Completed: 11865 of 11865 jobs
#Average job time: 414s 6.90m 0.11h 0.00d
#Longest job: 709s 11.82m 0.20h 0.01d
#Submission to last job: 5678s 94.63m 1.58h 0.07d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs
#Average job time: 4s 0.07m 0.00h 0.00d
#Longest job: 19s 0.32m 0.01h 0.00d
#Submission to last job: 91s 1.52m 0.03h 0.00d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/fr1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 16s 0.26m 0.00h 0.00d
#Longest job: 75s 1.25m 0.02h 0.00d
#Submission to last job: 80s 1.33m 0.02h 0.00d
# CHAIN FUGU BLASTZ (REDONE 10/1/04 angie)
# NOTE: originally done 4/19, but with a buggy axtChain.
# axtChain dir moved aside to axtChain.orig before rebuilding.
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
# Check size>0 for .axt files (empty inputs cause out line+ check to fail):
cp /dev/null input.lst
foreach f (`ls -1S /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChrom/*.axt`)
if (-s $f) then
echo $f >> input.lst
endif
end
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from chicken run.
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/fr1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 41 of 41 jobs
#Average job time: 26s 0.44m 0.01h 0.00d
#Longest job: 121s 2.02m 0.03h 0.00d
#Submission to last job: 121s 2.02m 0.03h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg16 ${c}_chainFr1 $i
end
# NET FUGU BLASTZ (REDONE 10/1/04 angie)
# NOTE: originally done 4/19, but with results of a buggy axtChain.
ssh kksilo
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
netClass noClass.net hg16 fr1 fugu.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn fugu.net > fuguSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
netFilter -minGap=10 fugu.net | hgLoadNet hg16 netFr1 stdin
netFilter -minGap=10 fuguSyn.net | hgLoadNet hg16 netSyntenyFr1 stdin
# LIFTOVER CHAIN TO FUGU FR1 (DONE 2004-09-28 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.fr1/axtChain
time netChainSubset human.net all.chain \
/cluster/data/hg16/bed/liftOver/hg16ToFr1.chain
# RUN AXTBEST (DONE 4/20/04 angie)
# Webb asked for axtBest too...
ssh kolossus
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
mkdir axtBest
foreach f (axtChrom/*.axt)
set chr=$f:t:r
echo axtBesting $chr
axtBest $f $chr axtBest/$chr.axt -minScore=300
end
# H-INVITATIONAL GENE ANNOTATION DATABASE (2004-04-29 kate)
# https://www.jbirc.aist.go.jp/hinv/top.html
# Create knownGene table to reference HINV gene ID's
# for link on knownGenes details page
# Also, create an HINV gene track, just to look at
# (probably not publish, as these are just mRNA alignments
# already visible on browser).
# download CDNA file (release 1.0)
ssh kksilo
mkdir /cluster/data/hinv
cd /cluster/data/hinv
wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.1.0
# set up assembly work area
ssh eieio
cd /cluster/data/hg16
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \
> accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \
> ids.txt
paste accessions.txt ids.txt > queries.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
hgsql hg16 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
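# queries.txt is tab-separated, one mRNA accession paired with its H-INV
# id per line, e.g. (illustrative values only):
#   AK000001        HIT000000001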
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg16/bed/hinv
hgLoadPsl hg16 -table=HInvGeneMrna hinv_mrna.psl
# also make a gene track using the genomic exon coordinates for build34
# in the FCDNA file. NOTE: not all of the genes have these
ssh kksilo
cd /cluster/data/hg16/bed/hinv
/cluster/data/hinv/hinvToGff.pl < /cluster/data/hinv/FCDNA.1.0 > hinv.gff
ssh hgwdev
cd /cluster/data/hg16/bed/hinv
ldHgGene hg16 HInvGene hinv.gff
# Read 40140 transcripts
# TrackDb for this
# track HInvGene
# shortLabel H-INV Gene
# longLabel H-Invitational Genes
# group genes
# priority 37
# visibility hide
# color 0,100,180
# type genePred .
# also make a table with various useful items for each gene
ssh hgwdev
hgsql hg16 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg16/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.0 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg16
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg16/bed/hinv
hgMapToGene hg16 HInvGeneMrna knownGene knownToHInv
# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 5/10/04 angie)
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netSplit human.net net
ssh kolossus
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtNet
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg16/nib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin axtNet/$chr.axt
end
mkdir mafNet
foreach f (axtNet/chr*.axt)
set maf = mafNet/$f:t:r.hg.maf
axtToMaf $f \
/cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \
$maf -tPrefix=hg16. -qPrefix=galGal2.
end
# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 WITH NET MAF FOR ALL (DONE 5/10/04 angie)
# (galGal2 net maf added to human/mouse/rat alignments described above [HUMOR])
# put the MAFs on bluearc
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafNet/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg
ssh kki
mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet
mkdir hmrg
# Wrapper script required because of stdout redirect:
cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
rm -f jobList
foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr/*.maf)
set root=$file:t:r:r
echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/${root}.maf" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time: 84s 1.40m 0.02h 0.00d
#Longest job: 267s 4.45m 0.07h 0.00d
#Submission to last job: 290s 4.83m 0.08h 0.00d
# The crash was due to empty hg/chr18_random.hg.maf -- OK.
# clean up bluearc (these are big files!)
rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet
# put this out there for Glenn Tesler (not a browser track!)
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg
gzip *
ssh hgwdev
mkdir /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet
foreach f (/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/*)
ln -s $f /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet
end
# EPONINE TSS PREDICTION (DONE 5/21/04 angie)
# Eponine runs fine on 2.5Mb contig, but barfs on much larger contig;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
ssh eieio
mkdir /cluster/bluearc/hg16/chunks
cd /cluster/data/hg16
# Note: faSplit seems to ignore the ".chunk_" suffix below:
foreach f (?{,?}/NT_*/NT_??????.fa)
set ctg = $f:t:r
faSplit -minGapSize=10 -lift=/cluster/bluearc/hg16/chunks/$ctg.lft \
gap $f 2500000 /cluster/bluearc/hg16/chunks/$ctg.chunk_
end
mkdir /cluster/data/hg16/bed/eponine
cd /cluster/data/hg16/bed/eponine
wget http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
cat << '_EOF_' > doEpo
#!/bin/csh
set path=(/usr/java/j2re1.4.1_01/bin $path)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doEpo
cp /dev/null jobList
foreach f (/cluster/bluearc/hg16/chunks/NT*.fa)
echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
>> jobList
end
mkdir out
ssh kk9
cd /cluster/data/hg16/bed/eponine
para create jobList
para try, check, push, check, ...
#Completed: 1588 of 1588 jobs
#Average job time: 208s 3.47m 0.06h 0.00d
#Longest job: 447s 7.45m 0.12h 0.01d
#Submission to last job: 3591s 59.85m 1.00h 0.04d
# lift chunks -> contigs
mkdir contigs/
foreach l (/cluster/bluearc/hg16/chunks/*.lft)
set ctg = $l:t:r
liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff ../../jkStuff/liftAll.lft warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
{printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
i = i + 1;}' \
eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg16/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
$HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg16 eponine eponine.bed -tab -sqlTable=eponine.sql
# RELOAD ENSEMBL GENES WITH VERSION 34d (DONE 2004/05/20 baertsch)
# save current tables, just in case.
rename table ensGene to ensGene_old;
rename table ensGtp to ensGtp_old;
rename table ensPep to ensPep_old;
mkdir /cluster/data/hg16/bed/ensembl34d
cd /cluster/data/hg16/bed/ensembl34d
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the ouput. choose gzip compression. hit export.
# Save as ensbuild34d.gff.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
zcat ensbuild34d.gff.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
> ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \
/cluster/data/hg16/bed/ensembl34d/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
gzip ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin
# compare size of old and new tables as a sanity check
drop table ensGene_old;
drop table ensGtp_old;
drop table ensPep_old;
# Create knownToEnsembl column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-05-24 - Fan)
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select gene, transcript, protein,
SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensXref.txt
sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab
hgsql hg16 -e "drop table ensemblXref3"
hgsql hg16 < ~/src/hg/lib/ensemblXref3.sql
hgsql hg16 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
#### REBUILD SUPERFAMILY RELATED TABLES (DONE - 2004-05-21 - Fan)
# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store1/superFamily/040516
cd /cluster/store1/superFamily/040516
# ftp over the following two files:
ass_16-May-2004.tab.gz
supfam_16-May-2004.sql.gz
# This may take about an hour.
hgsql hg16 -e "create database superfam040516"
hgsql superfam040516 < supfam_16-May-2004.sql
# Make sure to add an index on id of the des table of superfam040516.
hgsql superfam040516 < ~/src/hg/lib/sfAssign.sql
hgsql superfam040516 -e 'load data local infile "ass_16-May-2004.tab" into table superfam040516.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg16 < ~/src/hg/lib/sfAssign.sql
cd /cluster/store1/superFamily/040516
hgsql hg16 -e 'load data local infile "ass_16-May-2004.tab" into table hg16.sfAssign;'
# If hg16.sfDes already exists, drop it.
hgsql superfam040516 -e "select * from des" >sfDes.tab
hgsql hg16 < ~/src/hg/lib/sfDes.sql
hgsql hg16 -e 'load data local infile "sfDes.tab" into table hg16.sfDes ignore 1 lines;'
# If hg16.superfamily already exists, drop it.
hgSuperfam hg16 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg16.sfDescription exists, drop it.
hgsql hg16 < ~/src/hg/lib/sfDescription.sql
hgsql hg16 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg16.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg16 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
cat /cluster/store1/superFamily/040516/ass_16-May-2004.tab \
| hgKnownToSuper hg16 hs stdin
# creates 32542 rows in knownToSuper
# seq table acc field is too small; up the max to match new hgLoadSeq
# schema (2004/05/22 markd)
alter table seq modify column `acc` varchar(128) NOT NULL default '';
#### Blat knownGene proteins to determine exons (braney 2004-06-02)
ssh kk
mkdir -p /cluster/data/hg16/bed/blat.hg16KG.2004-05-27
cd /cluster/data/hg16/bed
rm blat.hg16KG
ln -s blat.hg16KG.2004-05-27 blat.hg16KG
cd blat.hg16KG
pepPredToFa hg16 knownGenePep known.fa
hgPepPred hg16 generic blastKGPep00 known.fa
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
chmod +x blatSome
ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 300 kg
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
para create blatSpec
para push
# Completed: 12222 of 12222 jobs
# CPU time in finished jobs: 23286365s 388106.09m 6468.43h 269.52d 0.738 y
# IO & Wait Time: 710342s 11839.03m 197.32h 8.22d 0.023 y
# Average job time: 1963s 32.72m 0.55h 0.02d
# Longest job: 106239s 1770.65m 29.51h 1.23d
# Submission to last job: 106248s 1770.80m 29.51h 1.23d
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
sort -rn cooked.psl | pslUniq stdin hg16KG.psl
pslxToFa hg16KG.psl hg16KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
kgName hg16 hg16KG.psl blastKGRef00
ssh hgwdev
cd /cluster/data/hg16/bed/blat.hg16KG
hgsql hg16 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef00" | hgsql hg16
echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg16
### RUN BLASTZ VS. MACACA MULATTA
#get sequence from trace repository
cd /cluster/bluearc/macaca
for i in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 ; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/macaca_mulatta/fasta.macaca_mulatta.0$i.gz ; done
# distribute contigs to bluearc and /iscratch/i for cluster run
#split the sequence into 1mb chunks (about 13k reads per file)
ssh kksilo
mkdir -p /cluster/bluearc/macaca/split
for i in 001 002 003 004 005 006 007 008 009 010 011 012 013 014 ; do faSplit about macaca_mulatta.$i.fa 10000000 split/$i/mac ; done
find split -name \*.fa > mac.lst
hgsql hg16 -N < chromLen.sql > S1.len
ssh kkr1u00
mkdir -p /iscratch/i/macaca/
df /iscratch/i
cp -r /cluster/bluearc/macaca/split/* /iscratch/i/macaca
#flatten directory structure for Angie's scripts
cd /iscratch/i/macaca
for i in `ls` ; do cd /iscratch/i/macaca/$i ; for j in `ls` ; do mv $j ../$i.$j ; done ; done
/cluster/bin/scripts/iSync
# make DEF file for blastz
ssh kksilo
cd /cluster/bluearc/macaca
# NOTE: need schwartzbin below for utils still not in penn bin
cat << '_EOF_' > DEF
# human vs. macaca mulatta
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_mulatta.q
BLASTZ_ABRIDGE_REPEATS=0
SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs/
SEQ1_RMSK=/scratch/hg/gs.17/build34/rmsk/
SEQ1_SMSK=
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ2_DIR=/iscratch/i/macaca/
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-primate
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/bluearc/macaca
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
DEBUG=0
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg16-rm0.`date -I`
ssh kk
cd /cluster/bluearc/macaca
# source the DEF file to establish environment for following commands
bash
source ./DEF
cp /cluster/data/mm4/jkStuff/BlastZ_run0.sh .
./BlastZ_run0.sh
cd run.1
para try
para check
para push
# Second cluster run to convert the .out's to .lav's
cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh .
ssh kk
cd /cluster/bluearc/macaca
bash
source DEF
./BlastZ_run1.sh
cd run.2
para try
para check
para push
# Prepare third cluster run script to convert lav's to axt's
cd /cluster/bluearc/macaca/
cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh
#!/bin/sh
# prepare third cluster run for blastz processing
# NOTE: should run this on iservers (4G),
# with chr19 and chr1 on kolossus (8G)
M=`uname -n`
if [ "$M" != "kk" ]; then
echo "ERROR: you are on machine: '$M'"
echo -e "\tthis script expects machine kk"
exit 255
fi
source DEF
mkdir axtChrom
mkdir run.2
cd run.2
# usage: blastz-contiglav2axt lav-dir axt-file seq1-dir seq2-file
echo '#LOOP' > gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/macaca/split/'${path2} >> gsub
echo '#ENDLOOP' >> gsub
ls -1S ${BASE}/lav > chrom.list
gensub2 chrom.list ../mac.lst gsub jobList
wc -l jobList
echo "running 'para create'"
para create jobList
echo "Ready for cluster run. para try, check, push, etc ..."
'_EOF_'
chmod +x ../../jkStuff/BlastZ_run2.sh
# Third cluster run to convert lav's to axt's
source DEF
../../jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# NOTE: ran this on kolossus and mini-cluster
# 30 min. to 2 hrs. per chrom
# Wrapper script required because of stdout redirect:
cd /cluster/bluearc/macaca
cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
rm -f jobList
foreach file (/cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/*.maf)
set root=$file:t:r:r
echo "doMultiz /cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/${root}.maf $file /cluster/bluearc/macaca/blastz.hg16/${root}.maf" >> jobList
end
para create jobList
para try, check, push, check
## end of blastz macaca mulatta alignment
# seq table acc field is too small; up the max to match new hgLoadSeq
# schema (2004/05/22 markd)
alter table seq modify column `acc` varchar(128) NOT NULL default '';
#### Blat knownGene proteins to determine exons (braney 2004-06-02)
ssh kk
cd /cluster/data/hg16/bed
mkdir blat.hg16KG.2004-05-27
rm blat.hg16KG
ln -s blat.hg16KG.2004-05-27 blat.hg16KG
pepPredToFa hg16 knownGenePep known.fa
grep ">" known.fa | sed "s/>//" > kgName.lst
kgName hg16 kgName.lst kg.mapNames
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
chmod +x blatSome
ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 300 kg
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
para create blatSpec
para push
# Completed: 12222 of 12222 jobs
# CPU time in finished jobs: 23286365s 388106.09m 6468.43h 269.52d 0.738 y
# IO & Wait Time: 710342s 11839.03m 197.32h 8.22d 0.023 y
# Average job time: 1963s 32.72m 0.55h 0.02d
# Longest job: 106239s 1770.65m 29.51h 1.23d
# Submission to last job: 106248s 1770.80m 29.51h 1.23d
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
# (a pslUniq step producing uniq.psl evidently belongs here, as in the
# earlier pass above: sort -rn cooked.psl | pslUniq stdin uniq.psl)
pslxToFa uniq.psl uniq_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
# LIFTOVER CHAINS TO HG17 (DONE 2004-07-14 kate)
# run alignment
# NOTE: split hg17 to /iscratch/i is doc'ed in makeHg17.doc
ssh kk
cd /cluster/data/hg16
makeLoChain-align hg16 /scratch/hg/gs.17/build34/bothMaskedNibs \
hg17 /iscratch/i/hg17/liftOver/split
# Created parasol job in bed/blat.hg17.2004-07-14/run
cd bed
rm blat.hg17
ln -s blat.hg17.2004-07-14 blat.hg17
cd blat.hg17/run
para try
para check
para push
# lift results
# the lift directory was defined in makeHg17.doc when split was performed
# this expects data in bed/blat.hg17, so symlink must be there
# use kolossus for speed
ssh kolossus
cd /cluster/data/hg16/bed/blat.hg17
makeLoChain-lift hg16 hg17 /cluster/data/hg17/bed/liftOver/liftSplit \
>&! lift.log &
tail -100f lift.log
# 25 minutes
# chain alignments
ssh kk
makeLoChain-chain hg16 /cluster/data/hg16/nib hg17 /cluster/data/hg17/nib
# Created parasol job in /cluster/data/hg16/bed/blat.hg17/chainRun
cd /cluster/data/hg16/bed/blat.hg17/chainRun
para try
# 46 jobs
para check
para push
# make alignment net
ssh kolossus
makeLoChain-net hg16 hg17
# load into database and copy to download directory
ssh hgwdev
makeLoChain-load hg16 hg17
cp /cluster/data/hg16/bed/blat.hg17/over.chain \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.chain
# Finished loading hg16ToHg17.over.chain
# Now, add download link for /usr/local/apache/htdocs/goldenPath/hg16/liftOver/hg16ToHg17.over.chain.gz
# LIFTOVER CHAIN FROM HG17 TO HG16 (IN PROGRESS 2005-01-03 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blat.hg17
mkdir net.hg17
cd chain
chainMergeSort
chainNet stdin /cluster/data/hg16/chrom.sizes \
/cluster/data/hg17/chrom.sizes \
/dev/null ../net.hg17
time chainSwap
netChainSubset net.hg17
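# The commands above were left unfinished; a sketch of how the pieces
# would typically fit together (file names hypothetical, not what was
# actually run):
#   chainMergeSort chain/*.chain | chainSwap stdin stdout \
#     | chainSort stdin hg17ToHg16.all.chain
#   chainNet hg17ToHg16.all.chain /cluster/data/hg17/chrom.sizes \
#     /cluster/data/hg16/chrom.sizes hg17ToHg16.net /dev/null
#   netChainSubset hg17ToHg16.net hg17ToHg16.all.chain hg17ToHg16.over.chain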
# ENCODE Regions (kate)
# NOTE: these instructions are not yet complete (scripts and datafiles
# are currently in ~kate/encode)
mkRegionsBed.pl build34_regions.txt > encodeRegionsHg16.bed
hgLoadBed hg16 encodeRegions encodeRegionsHg16.bed -noBin
mkdir -p /cluster/data/hg16/bed/encodeRegions
cp encodeRegionsHg16.bed /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed
# Create hgFixed table for name+description
hgsql -D hgFixed < ${HOME}/kent/src/hg/lib/encodeRegionInfo.sql
sed -e 's/^/INSERT INTO encodeRegionInfo (name, descr) VALUES (\"/' \
-e 's/|/\",\"/' \
-e 's/$/\");/' < regionInfo.txt | hgsql -D hgFixed
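# regionInfo.txt is name|description, one region per line; a
# (hypothetical) line like
#   ENm001|CFTR region
# becomes
#   INSERT INTO encodeRegionInfo (name, descr) VALUES ("ENm001","CFTR region");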
# create frameset for region display
make
# create sequence downloads
set dir = /usr/local/apache/htdocs/ENCODE/sequences
rm sizes.txt
foreach b (hg12 hg13 hg15 hg16)
encodeSequence.pl regions.$b.txt /cluster/data/$b/nib > $b.fa
cp $b.fa $dir
faCount $b.fa | awk '{print $1, $2}' > $dir/${b}_count.txt
echo $b >> sizes.txt
faSize $b.fa >> sizes.txt
echo "" >> sizes.txt
end
cp sizes.txt $dir
cd $dir
md5sum *.fa > md5sum.txt
# QA
checkEncodeRegions.pl regions.hg12.txt /cluster/data/hg12/nib > hg12.check
cp sizes.txt $dir
# etc.
csh printRegionDiffs.csh > regionDiffs.out
# UN-ANNOTATED (EXCEPT FOR CROSS-SPECIES) REGIONS (DONE 6/8/04 angie)
# Anton Nekrutenko asked for this... easy to do with featureBits!
# NOTE: excluding mRNAs this time because of the controversial
# just-submitted-to-GenBank intronic BV* "mRNA" seqs.
ssh hgwdev
mkdir /cluster/data/hg16/bed/unAnnotated
cd /cluster/data/hg16/bed/unAnnotated
nice featureBits hg16 -minSize=12 \
\!gap \
\!knownGene \!refGene \!mgcGenes \
\!vegaGene \!vegaPseudoGene \!ensGene \!acembly \!ECgene \
\!geneid \!genscan \!twinscan \!slamMouse \!sgpGene \!softberryGene \
\!rnaGene \!superfamily \
\!est \!xenoMrna \!HInvGene \!tigrGeneIndex \
\!uniGene_2 \
\!cpgIsland \!rmsk \!simpleRepeat \
-bed=unAnnotated.bed
#905732944 bases of 2865248791 (31.611%) in intersection
hgLoadBed hg16 unAnnotated unAnnotated.bed
# not much of a drop in coverage with the -minSize:
nice featureBits hg16 unAnnotated
#903585585 bases of 2865248791 (31.536%) in intersection
# ANDY LAW CPG ISLANDS (DONE 6/15/04 angie)
# See notes about this in makeGalGal2.doc.
ssh eieio
mkdir /cluster/data/hg16/bed/cpgIslandGgfAndy
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandAndy.bed
cp /dev/null cpgIslandGgfAndy.bed
foreach f (../../?{,?}/chr*.fa)
set chr = $f:t:r
echo preproc $chr
/cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc
echo running original on $chr
awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \
| /cluster/home/angie/andy-cpg-island.pl \
| perl -wpe '$i=0 if (not defined $i); \
chomp; ($s,$e) = split("\t"); $s--; \
$_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \
>> cpgIslandAndy.bed
echo running modified on $chr
/cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndy.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
# this one is a bed 4:
hgLoadBed hg16 cpgIAndy -tab -noBin cpgIslandAndy.bed
# this one is a cpgIslandExt but with a different table name:
sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
hgLoadBed hg16 cpgIslandGgfAndy -tab -noBin \
-sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
# WOW, even masking out repeat bases from the results, there's a huge
# increase in reported islands!!
featureBits hg16 cpgIsland
#21077002 bases of 2865248791 (0.736%) in intersection
featureBits hg16 cpgIslandGgfAndy
#135249416 bases of 2865248791 (4.720%) in intersection
featureBits hg16 cpgIslandGgfAndy \!rmsk
#68714633 bases of 2865248791 (2.398%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 27596 ../cpgIsland/cpgIsland.bed
# 376478 cpgIslandAndy.bed
# 260761 cpgIslandGgfAndy.bed
# http://www.pnas.org/cgi/content/full/99/6/3740
# Takai D Jones PA
# Comprehensive analysis of CpG islands in human chromosomes 21 and 22
#
# Regions of DNA of greater than 500 bp with a G+C equal to or
# greater than 55% and observed CpG/expected CpG of 0.65 were more
# likely to be associated with the 5' regions of genes and this
# definition excluded most Alu-repetitive elements.
#
# Also, our description reduced the number of CpG islands located
# on these chromosomes from 14,062 to 1,101, which is more
# consistent with the expected number of genes (750) located on
# these two chromosomes.
#
# To exclude "mathematical CpG islands" (for example, a 300-bp
# sequence containing one G, 150 Cs, and only one CpG, which would
# meet the criteria of a CpG island), we added one more condition:
# that there are at least seven CpGs in these 200 bp. This number
# was selected on the basis that there would be 200/16 (i.e.,
# 12.5) CpGs in a random DNA fragment containing no suppression of
# CpG. Because Gardiner-Garden and Frommer's criterion (1) of
# ObsCpG/ExpCpG of 0.6 would accommodate (0.6 × 12.5) CpGs (i.e.,
# 7.5), we selected seven CpGs as being a reasonable cutoff for
# the initial analysis.
#
egrep -w '^chr2[12]' ../cpgIsland/cpgIsland.bed | wc -l
# 1033
egrep -w '^chr2[12]' cpgIslandAndy.bed | wc -l
# 16462
# Hmm, how did I find fewer with looser params?? Better run Takai and
# Jones's script on chr21 and chr22 for comparison...
egrep -w '^chr2[12]' cpgIslandGgfAndy.bed |wc -l
# 10680
# OK, I just have to try again with masked sequence:
ssh eieio
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandMaskedAndy.bed
cp /dev/null cpgIslandMaskedGgfAndy.bed
foreach f (../../?{,?}/chr*.fa.masked.gz)
set chr = $f:t:r:r:r
echo preproc $chr
zcat $f \
| /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
> $chr.masked.preproc
echo running original on $chr
awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \
| /cluster/home/angie/andy-cpg-island.pl \
| perl -wpe '$i=0 if (not defined $i); \
chomp; ($s,$e) = split("\t"); $s--; \
$_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \
>> cpgIslandMaskedAndy.bed
echo running modified on $chr
/cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandMaskedGgfAndy.bed
end
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
hgLoadBed hg16 cpgIAndyMasked -tab -noBin cpgIslandMaskedAndy.bed
# this one is a cpgIslandExt but with a different table name:
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandMaskedGgfAndy.sql
hgLoadBed hg16 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandMaskedGgfAndy.sql cpgIslandMaskedGgfAndy.bed
featureBits hg16 cpgIAndyMasked
#93307698 bases of 2865248791 (3.257%) in intersection
featureBits hg16 cpgIslandGgfAndyMasked
#56180461 bases of 2865248791 (1.961%) in intersection
wc -l *ed
# 376478 cpgIslandAndy.bed
# 260761 cpgIslandGgfAndy.bed
# 125851 cpgIslandMaskedAndy.bed
# 80350 cpgIslandMaskedGgfAndy.bed
# 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's,
# might not be the right thing to do (?). Give it a try with less-masked
# sequence.
ssh eieio
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandGgfAndyOnlyRM.bed
cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed
foreach f (../../?{,?}/chr*.fa)
set chr = $f:t:r
echo preproc, ggf-andy $chr onlyRM
zcat $f.out.gz > /tmp/tmp.fa.out
maskOutFa $f /tmp/tmp.fa.out stdout \
| /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyOnlyRM.bed
echo preproc, ggf-andy $chr onlyRMAlu
head -3 /tmp/tmp.fa.out > /tmp/tmp2.fa.out
awk '$11 == "SINE/Alu" {print;}' /tmp/tmp.fa.out >> /tmp/tmp2.fa.out
maskOutFa $f /tmp/tmp2.fa.out stdout \
| /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyOnlyRMAlu.bed
end
# 80314 cpgIslandGgfAndyOnlyRM.bed
# 110598 cpgIslandGgfAndyOnlyRMAlu.bed
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
hgLoadBed hg16 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \
cpgIslandGgfAndyOnlyRM.bed
sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
hgLoadBed hg16 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \
cpgIslandGgfAndyOnlyRMAlu.bed
featureBits hg16 cpgIslandGgfAndyOnlyRM
#56275308 bases of 2865248791 (1.964%) in intersection
featureBits hg16 cpgIslandGgfAndyOnlyRMAlu
#78743130 bases of 2865248791 (2.748%) in intersection
#### mrnaBlastz track - all mrnas aligned using blastz Robert 2/20/2004
mkdir /cluster/data/hg16/bed/mrnaBlastz
cd /cluster/data/hg16/bed/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg16 -native
faTrimPolyA mrna.fa hg16Mrna.fa
faSize hg16Mrna.fa -detailed=on > S2.len
mkdir /cluster/bluearc/hg/mrnaHg16
faSplit sequence hg16Mrna.fa 100 /cluster/bluearc/hg/mrnaHg16/mrna
ls -1 /cluster/bluearc/hg/mrnaHg16/ > mrna.lst
hgsql hg16 < chromInfo.sql > S1.len
awk '{print $1}' S1.len |grep -v random > S1.lst
cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz
make-joblist
para create spec
para push
~angie/hummus/do.out2lav DEF > j
para create j
para push
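# lav -> psl translation, pasted here as an inline tcsh script: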
#!/bin/tcsh
set base="/cluster/bluearc/hg/gs.17/build34/mrnaBlastz"
cd $base
mkdir -p pslRaw
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=$base/pslRaw/$chr.psl
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| lavToPsl stdin stdout \
| sed -e 's@scratch/hg/gs.17/build34/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out
popd
end
for i in `ls pslRaw/` ; do echo sortIt.sh pslRaw/$i pslSort/$i >> spec.sort ; done
para create spec.sort     # sorts pslRaw to pslSort
for i in `awk '{print $1}' S1.len` ; do echo pslFilterDups pslSort/$i.psl pslFilter/$i.psl >> spec.dup ; done
para create spec.dup      # filters pslSort to pslFilter using pslFilterDups
for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.17/build34/bothMaskedNibs/ -faQ /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa chain/$i.chain >> spec.chain ; done
para create spec.chain    # chains pslFilter to chain
mkdir chainFilter
for i in `awk '{print $1}' S1.len` ; do echo doFilter ../chain/$i.chain ../chainFilter/$i.chain >> spec.filter ; done
para create spec.filter   # filters chain to chainFilter using doFilter
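# sortIt.sh and doFilter themselves are not recorded in this doc; given
# that hgLoadPsl wants tName,tStart order (psl columns 14 and 16),
# sortIt.sh was presumably something like this (a guess):
#   #!/bin/sh
#   sort -k 14,14 -k 16,16n $1 > $2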
mkdir -p preNet
cd chainFilter
foreach i ( *.chain)
chainPreNet $i ../S1.len ../S2.len ../preNet/$i
end
ls /cluster/data/hg16/nib/*.nib > S1.lst
for i in `awk '{print $1}' S1.len`; do chainToPsl ../preNet/$i.chain ../S1.len ../S2.len ../S1.lst /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa ../psl/$i.psl >> spec.chain2psl.new ; echo $i done chainToPsl ; done
ssh kk9-10
para create spec.chain2psl.new
for i in `awk '{print $1}' S1.len`; do hgLoadPsl -noTNameIx hg16 -table=${i}_mrnaBlastz psl/$i.psl ; echo $i done ; done
## end of blastz Mrna track
#### BUILD RETROGENE TRACK ( done Robert 6/15/2004)
cp /cluster/data/genbank/data/aligned/genbank.137.0/hg16/full/mrna.native.rawPsl.gz .
gunzip mrna.native.rawPsl.gz
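# (the substr() below strips the .version suffix from qName, column 10)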
awk '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl > mrnaBlat.psl
hgLoadPsl hg16 mrnaBlat.psl
hgsql hg16 -N -B < refGene.sql > refGene.tab
cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/
netToBed /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseSynNet.net mouseSyn.bed
ssh eieio
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' > blatBlastz.psl
awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' /scratch/blatBlastz.psl > /scratch/x.psl
hgsql hg16 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}' /cluster/data/kgDB/bed/hg16/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
ssh kkr1u00
cd /cluster/data/hg16/bed/pseudo
cp refGene.tab /iscratch/i/hg/gs.17/build34/pseudo
cp /cluster/data/hg16/bed/simpleRepeat.bed /iscratch/i/hg/gs.17/build34/pseudo
cp mrnaHg16.fa /iscratch/i/hg/gs.17/build34/pseudo
cp mouseSyn.bed /iscratch/i/hg/gs.17/build34/pseudo
cp sortedKnownGene.tab /iscratch/i/hg/gs.17/build34/pseudo
pslSplit nohead -chunkSize=121 /iscratch/i/hg/gs.17/build34/pseudo blatBlastz.psl
cd /iscratch/i/hg/gs.17/build34/pseudo
iSync
ssh kk
cd /cluster/data/hg16/bed/pseudo
para create spec.kk
para push
#post process and load track
./buildSort.sh
### PHASTCONS HUMAN/CHIMP/MOUSE/RAT/CHICKEN (6/20/04, acs)
# this is an addendum to Katie's '5-WAY MULTIZ & PHYLO-HMM' (see above)
# just redoing the 'label' step with the new 'phastCons' program
# picking up where it says "compute the conservation scores"
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
# set up wrapper for phastCons
cat << '_EOF_' > doPhastCons
#!/bin/sh
PHAST=/cluster/bin/phast
TMP=/tmp/phastCons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
mkdir -p $TMP PREDICTIONS/$chrom PHASTCONS/$chrom
zcat $file | $PHAST/phastCons - hpmrc_rev_dg.mod --nrates 20 --transitions 0.018,0.002 --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom --quiet > ${TMP}/$root.pp
gzip -c $TMP/$root.pp > PHASTCONS/$chrom/$root.pp.gz
rm $TMP/$root.pp
'_EOF_'
chmod u+x doPhastCons
# the --transitions arguments are approximate maximum likelihood
# estimates obtained by running the program *without* --transitions
# (causes estimation by EM) on five randomly selected 1M bp
# windows. All estimates were in the same ballpark (took a rough average)
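# A sketch of one such estimation run (window file name hypothetical):
# leaving --transitions off causes phastCons to fit those parameters by
# EM on the window it is given; the fitted values were then averaged:
#   zcat WINDOWS/chr7.115000000-116000000.ss.gz \
#     | /cluster/bin/phast/phastCons - hpmrc_rev_dg.mod --nrates 20 --quiet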
# set up cluster job
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cp WINDOWS/*.ss.gz /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/
logout
rm -f jobs.lst
for file in /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/*.ss.gz ; do echo doPhastCons $file >> jobs.lst ; done
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
para create ; para try ; para push ... etc.
# now create tracks
mkdir -p PHASTCONS/wib
for dir in PHASTCONS/chr* ; do \
echo $dir ;\
chr=`basename $dir` ;\
zcat `ls $dir/*.pp.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chr \
-wibFile=PHASTCONS/wib/${chr}_phastCons stdin ;\
done
hgLoadWiggle hg16 phastCons PHASTCONS/wib/chr*_phastCons.wig
mkdir -p /gbdb/hg16/wib
rm -f /gbdb/hg16/wib/chr*phastCons.wib
ln -s `pwd`/PHASTCONS/wib/*.wib /gbdb/hg16/wib
chmod 775 . PHASTCONS PHASTCONS/wib
chmod 664 PHASTCONS/wib/*.wib
# tweak scores and names of predictions
cat PREDICTIONS/*/*.bed | sed 's/id //' | \
awk '{printf "%s\t%s\t%s\tlod=%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \
$1, $2, $3, $5, 147.49 * log($5) - 240.34, $6, $7, $8, $9, \
$10, $11, $12}' > all.bed
hgLoadBed hg16 phastConsElements all.bed
# Scores are transformed as follows, for a reasonable-looking
# "spectrum". Let x_max be the maximum score (here
# x_max = 4490) and let x_med be the median score (here x_med =
# 39). The scores are transformed via the function f(x) = a *
# log x + b, s.t. f(x_med) = 300 and f(x_max) = 1000. Solving
# for a and b, you get b = (300 log x_max - 1000 log x_med) /
# (log x_max - log x_med), a = (1000 - b) / log x_max. Here a =
# 147.49, b = -240.34
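# A quick check of those constants (a minimal sketch; x_med and x_max
# hard-coded from the values above):
awk 'BEGIN { xMax = 4490; xMed = 39;
  b = (300*log(xMax) - 1000*log(xMed)) / (log(xMax) - log(xMed));
  a = (1000 - b) / log(xMax);
  printf "a = %.2f, b = %.2f\n", a, b; }'
# a = 147.49, b = -240.34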
#track phastCons
#shortLabel phastCons
#longLabel phastCons Conservation Score, Human/Chimp/Mouse/Rat/Chicken
#group compGeno
#priority 103
#visibility hide
#color 0,10,100
#maxHeightPixels 40
#type wig 0.0 1.0
#autoScaleDefault off
#track phastConsElements
#shortLabel phastConsElements
#longLabel phastCons Conserved Elements, Human/Chimp/Mouse/Rat/Chicken
#group compGeno
#priority 104
#visibility hide
#spectrum on
#color 0,60,120
#altColor 200,220,255
#exonArrows off
#type bed 12 .
# Ensembl 34d GENE PREDICTIONS (2004-07-13 baertsch)
## reloaded ensGene to add frame info, no change to data
/cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \
/cluster/data/hg16/bed/ensembl34d/ensGene.gtf
# TWINSCAN 1.3 GENE PREDICTIONS (2004-07-13 baertsch)
## reloaded twinscan to add frame info, no change to data
ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
#### AFFYTRANSFRAG AND AFFYTRANSCRIPTION TRACKS - (2004-07-21 sugnet)
# tracks covering about 1/3 of genome with probes
# every 5bp and hybridized to RNA from SK-N-AS cell line.
# Lifted from genome version hg15.
# affyTransfrag track: lift transfrags to hg16
cd /cluster/store6/weber/affy/transfrags/transfragsLabeled/
mkdir hg16
cd hg16
liftOver ../SK_phase2_tfgs_final.biggerThan50bp.tab /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
SK_phase2_tfgs_final.hg16.bed SK_phase2_tfgs_final.err.bed
# check to make sure that most lifted...
wc *.bed
# 12 49 346 SK_phase2_tfgs_final.err.bed
# 170749 853745 6936780 SK_phase2_tfgs_final.hg16.bed
# 170761 853794 6937126 total
hgLoadBed hg16 affyTransfrags SK_phase2_tfgs_final.hg16.bed
# Reading SK_phase2_tfgs_final.hg16.bed
# Loaded 170749 elements of size 5
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg16
# affyTranscription track:
cd /cluster/store6/weber/affy/graph/hg15/gz
gunzip *.gz
mkdir hg16
cd hg16
ln -s ../*.signal ./
# remapGraphs.pl just makes a quick bed file for each signal file with 1bp spans
# and then lifts via liftOver to new genome.
remapGraphs.pl -liftChain /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
-oldGenome hg15 -newGenome hg16 *.signal
# Lifting chr13.hg16.signal.
# Lifting chr13.sk.signal.
# Lifting chr14.sk.signal.
# Lifting chr19.sk.signal.
# Lifting chr20.sk.signal.
# Lifting chr21.sk.signal.
# Lifting chr22.hg16.signal.
# Lifting chr22.sk.signal.
# Lifting chr6.sk.signal.
# Lifting chr7.sk.signal.
# Lifting chrX.sk.signal.
# Lifting chrY.sk.signal.
# runWiggles.sh just calls wigAsciiToBinary for each signal file.
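# in essence, one call per file (a sketch; file naming assumed from the log above):
#   wigAsciiToBinary -chrom=chrN -wibFile=chrN_affyTranscription chrN.hg16.signal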
cat ../runWiggles.sh | sed -e 's/hg15/hg16/g' | sed -e 's/sk/hg16/g' > runWiggles.sh
./runWiggles.sh
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/affyTranscription hg16 affyTranscription *.wig
# Connected to database hg16 for track affyTranscription
# Creating table definition with 13 columns in hg16.affyTranscription
# Saving wiggle.tab
# Loading hg16
cp *.wib /cluster/data/hg16/bed/affyTranscription/wib/
cd /gbdb/hg15/wib/affyTranscription/
ln -s /cluster/data/hg16/bed/affyTranscription/wib/*.wib ./
cd /cluster/data/hg16/bed/affyTranscription/wib/
chmod 664 *.wib
cd /cluster/store6/weber/affy/graph/hg15/gz/hg16
rm *.wib *.wig *.bed
gzip *hg16.signal &
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 2004/08/11 markd)
cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Dog in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
${outfl} -query human -comp dog -comp mouse
end
# Now extract dog (extra column 1), ignore mouse.
cd /cluster/bluearc/scratch/hg/gs.17/build34
mkdir linSpecRep.notInDog
foreach f (rmsk/*.out_dog_mus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInDog/$base.out.spec
end
# Clean up.
rm /cluster/bluearc/scratch/hg/gs.17/build34/rmsk/*.out_dog_mus
# copy to iservers
ssh kkr1u00
cp -r /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInDog /iserver/kkr1u00/i/gs.17/build34/
iSync
# BLASTZ DOG (CANFAM1) (DONE 2004/08/12 markd)
ssh kk
# store4 low on disk space; symlink to store7
mkdir -p /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10
ln -s /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10 /cluster/data/hg16/bed
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.canFam1.2004-08-10
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
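# (SEQ1_CHUNK/SEQ1_LAP: the target is cut into 10 Mb pieces overlapping by
# 10 kb for the cluster run; the query is cut into 10 Mb pieces with no overlap)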
# first cluster run: raw blastz alignments
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
source DEF
mkdir -p $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j 2>log
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
# edit jobList to do chr19 first; hg17 run notes indicated
# this might save around 4 hours
para create jobList
para try, check, push, check, ....
#Completed: 93225 of 93225 jobs
#CPU time in finished jobs: 18459718s 307661.97m 5127.70h 213.65d
#IO & Wait Time: 429193s 7153.21m 119.22h 4.97d
#Average job time: 203s 3.38m 0.06h 0.00d
#Longest job: 18951s 315.85m 5.26h 0.22d
#Submission to last job: 58889s 981.48m 16.36h 0.68d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 3771s 62.85m 1.05h 0.04d 0.000 y
#IO & Wait Time: 6671s 111.18m 1.85h 0.08d 0.000 y
#Average job time: 31s 0.51m 0.01h 0.00d
#Longest job: 334s 5.57m 0.09h 0.00d
#Submission to last job: 1464s 24.40m 0.41h 0.02d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 42 of 42 jobs
#CPU time in finished jobs: 1297s 21.62m 0.36h 0.02d 0.000 y
#IO & Wait Time: 15428s 257.13m 4.29h 0.18d 0.000 y
#Average job time: 398s 6.64m 0.11h 0.00d
#Longest job: 1714s 28.57m 0.48h 0.02d
#Submission to last job: 1723s 28.72m 0.48h 0.02d
# axtChrom/chr19_random.axt is empty, probably ok
# CHAIN DOG BLASTZ (DONE)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
# edit to remove chr19_random
para create jobList
para try, check, push, check...
#Completed: 41 of 41 jobs
#CPU time in finished jobs: 8233s 137.22m 2.29h 0.10d 0.000 y
#IO & Wait Time: 11718s 195.29m 3.25h 0.14d 0.000 y
#Average job time: 487s 8.11m 0.14h 0.01d
#Longest job: 4623s 77.05m 1.28h 0.05d
#Submission to last job: 4971s 82.85m 1.38h 0.06d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# hg17 said:
# Lots of chaff with scores in the 3000's. Many very-high-scoring
# chains. So filter the chain down somewhat...
# didn't bother rechecking, just filtered.
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm chain/*
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainCanFam1 $i
end
# Coverage is significantly higher than mouse:
featureBits hg16 -chrom=chr1 chainCanFam1Link
# 123343602 bases of 221562941 (55.670%) in intersection
# NET DOG BLASTZ (DONE 2004/08/15)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
netClass noClass.net hg16 canFam1 dog.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn dog.net > dogSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
netFilter -minGap=10 dog.net | hgLoadNet hg16 netCanFam1 stdin
netFilter -minGap=10 dogSyn.net | hgLoadNet hg16 syntenyNetCanFam1 stdin
# Add entries for chainCanFam1, netCanFam1 to human/hg16 trackDb
# LIFTOVER CHAIN TO DOG CANFAM1 (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.canFam1/axtChain
time netChainSubset dog.net all.chain \
/cluster/data/hg16/bed/liftOver/hg16ToCanFam1.chain
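# netChainSubset extracts just the chain fragments used in the net, giving
# the single-coverage chain file that liftOver requires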
# LOAD ENSEMBL ESTS (DONE 2004-09-07 braney)
cd /cluster/data/hg16/bed
mkdir ensEst
cd ensEst
# Get the ensembl EST data from http://www.ensembl.org/
# Go to the Martview link
# Choose Homo sapiens as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl ESTs choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output, choose gzip compression and then hit Export.
# Name file ensEst.gff.gz
# Ensembl handles random chromosomes differently than we do: they give the
# contig name. We can lift these up to our chrN_random chromosomes.
gunzip ensEst.gff.gz
sed "/^[0-9XY]*\t/d" ensEst.gff | sed "s/^.*_NT/NT/" > random.gff
liftUp -type=".gff" liftRandom.gff /cluster/data/hg16/jkStuff/liftAll.lft warn random.gff
sed "/_NT_/d" ensEst.gff | sed "s/^/chr/" > unrandom.gff
cat liftRandom.gff unrandom.gff > fixed.gff
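# (random.gff: contig-coordinate lines, lifted to chrN_random coords;
# unrandom.gff: chromosome lines with a "chr" prefix added; fixed.gff is the union)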
ldHgGene hg16 ensESTGene fixed.gff
# Get the ensembl protein data from http://www.ensembl.org/
# Go to the Martview link
# Choose Homo sapiens as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl ESTs choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Sequences" box.
# Page 4) Choose Transcripts/Proteins and Gene sequence Only as the output,
# choose text/fasta and gzip compression and then hit Export. Name file ensEstPep.fasta
gunzip ensEstPep.fasta.gz
sed "s/|.*//" ensEstPep.fasta > fixedPep.fa
hgPepPred hg16 generic ensESTPep fixedPep.fa
# ensGtp associates geneId/transcriptId/proteinId for name searches
# Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format.
# Save file as ensGtp.tsv.gz
gunzip ensGtp.tsv.gz
sed "s/ensGtp/ensESTGtp/" ~/kent/src/hg/lib/ensGtp.sql | hgsql hg16
echo "load data local infile 'ensESTGtp.tsv' into table ensESTGtp ignore 1 lines" | hgsql hg16
# QA Note - table ensGtp was updated on 2004-08-18 to remove a header line that
# was included in the actual table data. This was never pushed out to the rr.
# Table fix (push) done on 2006-01-31 (Jen). Original push on 2004-06. No other
# pushQ entries exist for table change on 2004-08.
# BLASTZ MOUSE MM5 (DONE 2004-09-10 kate)
ssh kk
# use store7 (lots of space)
mkdir -p /cluster/store7/hg16/bed/blastz.mm5.2004-09-10
ln -s /cluster/store7/hg16/bed/blastz.mm5.2004-09-10 \
/cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blastz.mm5.2004-09-10 blastz.mm5
cd blastz.mm5
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.mm5.2004-09-10
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 44060 jobs
para try, check, push, check, ....
# Average job time: 382s 6.37m 0.11h 0.00d
# Longest job: 4510s 75.17m 1.25h 0.05d
# Submission to last job: 26324s 438.73m 7.31h 0.30d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 339 jobs
para try, check, push, etc ...
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 112s 1.87m 0.03h 0.00d
# Submission to last job: 401s 6.68m 0.11h 0.00d
# convert lav files to axt
ssh kki
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
mkdir axtChrom pslChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt -dropSelf stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mus/mm5/softNib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/pslChrom/$(root1).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try, check, push, check,...
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5/pslChrom
foreach f (*.psl)
set c = $f:r
hgLoadPsl -noTNameIx hg16 -table=${c}_blastzMm5 $f
end
# takes 30-60 min
# CHAIN MOUSE MM5 BLASTZ (DONE 2004-09-15 kate)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
# edit to remove chr19_random
para create jobList
# 41 jobs
para try, check, push, check...
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# 5 min -- 230.070u 58.980s 5:07.13 94.1% 0+0k 0+0io 117pf+0w
time chainSplit chain all.chain
# 5 min -- 208.490u 56.360s 4:48.81 91.7% 0+0k 0+0io 125pf+0w
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo $c
hgLoadChain hg16 ${c}_chainMm5 $i
end
# compare with previous mouse, and with this assembly on later human
featureBits hg16 -chrom=chr1 chainMm5
featureBits hg17 -chrom=chr1 chainMm5
featureBits hg16 -chrom=chr1 chainMm3
featureBits hg16 -chrom=chr1 chainMm5Link
# 83288228 bases of 221562941 (37.591%) in intersection
featureBits hg17 -chrom=chr1 chainMm5Link
# 83773012 bases of 222827847 (37.595%) in intersection
featureBits hg16 -chrom=chr1 chainMm3Link
# 82665800 bases of 221562941 (37.310%) in intersection
# NET MOUSE MM5 BLASTZ (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# < 10 minutes
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
time netClass noClass.net hg16 mm5 human.net
# 15 minutes
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netMm5 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyMm5 stdin
# GOT HERE
# Add entries for chainMm5, netMm5, netSyntenyMm5
# human/hg16 trackDb
# LIFTOVER CHAIN TO MOUSE MM5 (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm5/axtChain
time netChainSubset human.net all.chain \
/cluster/data/hg16/bed/liftOver/hg16ToMm5.chain
# 7 mins.
# TIGHT FOR MOUSE MM5 (TBD kate)
# BEST FOR MOUSE MM5 (TBD kate)
# SYNTENIC NET FOR MOUSE MM5 (TBD kate)
# DOWNLOADS FOR MOUSE MM5 (TBD kate)
# BLASTZ FOR ZEBRAFISH DANRER1 (WORKING 2004-09-29 kate)
# Treat all repeats as lineage-specific
ssh kkr1u00
mkdir /iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
foreach f (/iscratch/i/gs.17/build34/rmsk/chr*.fa.out)
cp -p $f \
/iscratch/i/gs.17/build34/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
iSync
ssh kk
# use store7 (lots of space)
mkdir -p /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29
ln -s /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29 \
/cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blastz.danRer1.2004-09-29 blastz.danRer1
cd blastz.danRer1
cat << '_EOF_' > DEF
# human vs zebrafish (danRer1)
# params for zebrafish -- L=6000 (threshold for gapped alignments)
# (same params as used for Fugu)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# Target: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# Query: Zebrafish (danRer1)
SEQ2_DIR=/iscratch/i/danRer1/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.danRer1.2004-09-29
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg16-danRer1.2004-09-29
# prepare first cluster run
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.danRer1
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 57630 jobs
para try, check, push, check, ....
# Average job time: 477s 7.95m 0.13h 0.01d
# Longest job: 12147s 202.45m 3.37h 0.14d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg16/bed/blastz.danRer1
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 339 jobs
para try
para check
para push
# GOT HERE
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.danRer1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/danRer1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.danRer1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
# 42 jobs
# GOT HERE
# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# Make chains with rescored blastz
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
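# (the gap file defines gap-open penalties at the listed gap sizes; axtChain
# interpolates piecewise-linearly between the position entries)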
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check,...
# 29 jobs
# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/genomicSuperDups
cd /cluster/data/hg16/bed/genomicSuperDups
wget http://humanparalogy.gs.washington.edu/segDupDb.tar
# This tar file contains files for both hg16 and hg17. A note
# from Xinwei She about the contents:
#Build34 contains 4 tables: 3 of them are already in the genome browser source code:
#genomicSuperDups, celeraCoverage and celeraDupPositive. A new table, vanillaTrack,
#which displays the Celera assembly overlay in the public assembly build34, is added.
#Their trackDb entries can be found in the file trackDb.add.
#
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
tar xvf segDupDb.tar
cd bd34
# use tail +2 to skip past the header line:
zcat celeraCoverage.tab.gz | tail +2 \
| hgLoadBed -tab hg16 celeraCoverage stdin
zcat celeraDupPositive.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
hg16 celeraDupPositive stdin
zcat genomicSuperDups.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
hg16 genomicSuperDups stdin
# Change the name of "vanillaTrack" to celeraOverlay:
zcat vanillaTrack.mysqldump.gz | sed -e 's/vanillaTrack/celeraOverlay/g' \
| hgsql hg16
# It needs a new index, and it needs a bin field, so dump out its
# contents and load them back in using hgLoadBed and an edited
# SQL definition:
hgsql hg16 -N -e 'select * from celeraOverlay' > celeraOverlay.bed
# Make a ~/kent/src/hg/lib/celeraOverlay.as and run autoSql.
# Add bin and indices to celeraOverlay.sql, and reload with hgLoadBed:
hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraOverlay.sql \
hg16 celeraOverlay celeraOverlay.bed
# clean up
rm celeraOverlay.bed bed.tab
# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir pseudoYale
cd pseudoYale
# Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
ldHgGene hg16 pseudoYale pseudoYale.gtf
# Note - I'm guessing how this goes. Robert left no record. -jk
## refresh vega tracks with vega build30 (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30 -s
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible.sql.gz
gunzip *.gz
##create mysql database
mysql
create database vega30;
use vega30
source homo_sapiens_vega_30_35c_mysql40_compatible.sql
source dropMt.sql
source load.sql
exit
hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt
#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt
#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab
hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B
#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred
ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B
# QA note - table vegaPep dropped during this update. Not dropped from rr at
# time of initial push, creating a -times error in joinerCheck. Table vegaPep
# dropped from hgwbeta and rr/mgc on 2006-01-31.
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
##########################################################################
# CNPs from University of Washington (Done, Heather and Daryl, June/July 2005)
# data from http://humanparalogy.gs.washington.edu/structuralvariation
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir cnp
cd cnp
# Sharp data
cp dupArray.txt cnpSharp.bed.orig
# change CNP type to match Iafrate data (with permission from Andy)
sed -e "s/dup/Gain" cnpSharp.bed.orig > cnpSharp.bed.2
sed -e "s/del/Loss/" cnpSharp.bed.2 > cnpSharp.bed.3
sed -e "s/Both Loss and Gain/Gain and Loss/" cnpSharp.bed.3 > cnpSharp.bed
hgLoadBed hg16 cnpSharp -tab -sqlTable=cnpSharp.sql cnpSharp.bed
# Loaded 160 elements of size 14
# note: 11 names with special characters: CTD-2183E4*, RP11-111A4?, RP11-325E8#, RP11-1000I9*, RP11-159F11*,
# RP11-177L24*, RP11-136P13*, RP11-1151C19*, RP11-1008M3*, RP11-379N11?, CTD-3185D7#
# no apparent problems with these
hgsql hgFixed < cnpSharpCutoff.sql
echo 'load data local infile "sampleCUTOFF.txt" into table cnpSharpCutoff' | hgsql hgFixed
hgsql hg16 < cnpSharpSamples.sql
echo 'load data local infile "andyArraySample.txt" into table cnpSharpSamples' | hgsql hg16
hgsql hg16 < cnpSharpSampleCount.sql
hgsql hg16 < sampleCount.sql
# fosmid discordants
# don't need the id column
cp fosmidDiscordant.txt fosmidDiscordant.bed
hgLoadBed hg16 fosmidDiscordantPrelim -tab -sqlTable=fosmidDiscordantPrelim.sql fosmidDiscordant.bed
hgsql hg16 < fosmidDiscordant.sql
echo 'insert into fosmidDiscordant select bin, chrom, chromStart, chromEnd, name from fosmidDiscordantPrelim' | hgsql hg16
echo 'drop table fosmidDiscordantPrelim' | hgsql hg16
# Iafrate data
cp Iafrate.txt cnpIafrate.bed
hgLoadBed hg16 cnpIafrate -tab -sqlTable=cnpIafrate.sql cnpIafrate.bed
# Sebat data
cp Sebat.txt cnpSebat.bed
hgLoadBed hg16 cnpSebat -tab -sqlTable=cnpSebat.sql cnpSebat.bed
# deletions added May 2006
# From mccarroll@molbio.mgh.harvard.edu
genId.pl < mcCarrolldels.txt > mcCarrolldels.bed
hgLoadBed hg16 -noBin -tab delMccarroll mcCarrolldels.bed
# Hinds data via Andy Sharp
sort -n hindsDels.txt > hindsDels.sort
genId.pl < hindsDels.sort > hindsDels.bed
hgLoadBed hg16 -noBin -tab delHinds hindsDels.bed
# From conrad@uchicago.edu
conrad.pl < conradDels.txt > conradDels.bed
hgLoadBed hg16 -noBin -tab delConrad conradDels.bed
##########################################################################
# sno/miRNA track from Michel Weber (DONE - 2005-06-16 - Hiram)
# received the data file UCSC_snotrack_hg16.txt via email
ssh hgwdev
cd /cluster/data/hg16/bed/wgRna
# As a quick first pass at classification, take a look at the
# items in the hg17.wgRna table and use those as a guide
hgsql -N -e "select * from wgRna;" hg17 > hg17.wgRna.txt
awk '{print $5,$10}' hg17.wgRna.txt > name.type.hg17
# combine this new sno data with the existing miRNA data
hgsql -N -e "select * from miRNA;" hg16 > hg16.miRNA.txt
cat << '_EOF_' > addTypes.pl
#!/usr/bin/env perl
use warnings;
use strict;
my %types; # key is name, value is the type
open (FH, "name.type.hg17") or die "Can not open name.type.hg17";
while (my $line=<FH>)
{
chomp $line;
my ($name, $type) = split('\s+',$line);
$types{$name} = $type;
}
close (FH);
open (FH,"grep ^chr UCSC_snotrack_hg16.txt | sort -k1,1 -k2,2n|") or
die "can not open UCSC_snotrack_hg16.txt";
while (my $line=<FH>)
{
chomp $line;
my $type="unknown";
my ($chrom, $start, $end, $name, $score, $strand) = split('\s+',$line);
if (exists($types{$name})) { $type = $types{$name}; }
else { if ($name =~ m/^HBII/) { $type = "CDBox"; } }
print "$chrom\t$start\t$end\t$name\t$score\t$strand\t0\t0\t$type\n";
}
close (FH);
'_EOF_'
# happy emacs
chmod +x addTypes.pl
awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"}' \
hg16.miRNA.txt > hg16.wgRna.tab
./addTypes.pl >> hg16.wgRna.tab
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/wgRna.sql hg16 wgRna \
hg16.wgRna.tab
# this leaves 16 items classified as unknown; a request was sent to
# Michel Weber for proper classification
################################################################################
# Build hg17Kg table for KG II for hg16, using hg17 KG data (DONE 2005-07-11 Fan).
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir hg17Kg
cd hg17Kg
hgsql hg16 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
hgsql hg16 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
hgsql hg16 -e 'drop table mrnaGp'
hgsql hg16 < ~/src/hg/lib/mrnaGp.sql
hgsql hg16 -e 'load data local infile "all_mrna.gp" into table mrnaGp'
hgsql hg16 -N -e \
'select mrnaGp.* from mrnaGp,hg17.knownGene where mrnaGp.name = knownGene.name and mrnaGp.chrom=knownGene.chrom' \
|sort -u > mrnaGp2.tab
hgsql hg16 -e 'drop table mrnaGp2'
hgsql hg16 < ~/src/hg/lib/mrnaGp2.sql
hgsql hg16 -e 'load data local infile "mrnaGp2.tab" into table mrnaGp2'
# Create hg16Kg table in hg17 to get around the hurdle that we cannot
# join between MySQL DBs
hgsql hg17 -e 'drop table hg16Kg'
hgsql hg17 < ~/src/hg/lib/hg16Kg.sql
hgsql hg16 -N -e 'select * from knownGene' >hg16Kg.tab
hgsql hg17 -e 'load data local infile "hg16Kg.tab" into table hg16Kg'
hgsql hg17 -N -e \
'select hg16Kg.* from hg16Kg, knownGene where hg16Kg.name=knownGene.name and knownGene.name not like "NM_%" and hg16Kg.chrom=knownGene.chrom '\
>j
cut -f 1-10 j >j1
# j1 are mRNA records through old KG process.
# j2 are RefSeq records based on hg17 KG
# mrnaGp2 are mRNA records based on hg17 KG non-Refseq entries and GenBank CDS data (which is incomplete).
hgsql hg16 -N -e \
'select refGene.* from refGene, hg17.knownGene where hg17.knownGene.name=refGene.name' >j2
cat j1 j2 mrnaGp2.tab |sort -u >j.tab
~/kent/src/hg/protein/sortKg.pl j.tab >hg17Kg.tab
wc hg17Kg.tab
hgsql hg16 -e "delete from hg17Kg"
hgsql hg16 -e 'load data local infile "hg17Kg.tab" into table hg17Kg'
####################################################################
# Make mouse ortholog column using blastp on mm6 known genes. (DONE 7/12/05, Fan).
# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already
# This already exists. See makeMm6.doc for procedure
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg16/bed/blastp/mm6
cd /cluster/data/hg16/bed/blastp/mm6
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
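# (-m 8 requests tabular output, -b 1 reports alignments for only the top
# database sequence per query, -e 0.001 is the E-value cutoff)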
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 5812 of 5812 jobs
# CPU time in finished jobs: 96031s 1600.52m 26.68h 1.11d 0.003 y
# IO & Wait Time: 15641s 260.68m 4.34h 0.18d 0.000 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 168s 2.80m 0.05h 0.00d
# Submission to last job: 766s 12.77m 0.21h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/blastp/mm6/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 5812 files
# Loading database with 35707 rows
# Update otherOrg.ra under hg/hgGene/hgGeneData/Human/hg16 to mm6 instead of
# mm4.
##########################################################################
# EVOFOLD - RNA secondary structure predictions lifted from hg17 (Jakob Skou Pedersen)
# Jakob Skou Pedersen, July 12, 2005
ssh -C hgwdev
mkdir -p /cluster/data/hg16/bed/evofold
cd /cluster/data/hg16/bed/evofold
# lifting folds from hg17 to hg16
echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg16.over.chain tmp.bed unmapped.bed
# remove elements which are wrong size after lifting
awk '$3-$2 == $7' tmp.bed > foldsHg16.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg16 evofold foldsHg16.bed
# clean up
rm foldsHg17.bed unmapped.bed tmp.bed
# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]
# lifted down from hg17. See makeHg17.doc for details
# AFFYHUEX1 track (sugnet Wed Oct 5 12:18:18 PDT 2005)
mkdir hg16
cd hg16
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg16
mkdir gff beds annot
cd gff
# download gff design files
cp ../../hg17/gff/parseGff.pl .
# parse gff script...
#!/usr/bin/perl -w
if(scalar(@ARGV) == 0) {
print STDERR "parseGff.pl - Parse out affymetrixes gff annotation
probesets for human all exon design.
usage:
parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
exit(1);
}
sub splitField($) {
my $l = shift(@_);
my @w = split / /, $l;
return $w[1];
}
while($file = shift(@ARGV)) {
if(!($file =~ /(.+)\.gff/)) {
die "$file doesn't have .gff suffix\n";
}
$prefix = $1;
print STDERR "Doing file $file.\n";
open(IN, $file) or die "Can't open $file to read.";
open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
while($line = <IN>) {
# Only want the probeset records.
if($line =~ /\tprobeset\t/) {
$score = 0;
$cds = 0;
$bounded = 0;
chomp($line);
# pop off an microsoft line endings.
$line =~ s/\r$//;
@words = split /\t/, $line;
# This makes the evidence comma-separated.
$words[8] =~ s/\" \"/,/g;
# This gets rid of pesky quotes.
$words[8] =~ s/\"//g;
# Set the score based on the annotation type
if($words[8] =~ /full/) {
$score = 200;
}
elsif($words[8] =~ /extended/) {
$score = 500;
}
elsif($words[8] =~ /core/) {
$score = 900;
}
if($words[8] =~ /bounded/) {
$score -= 200;
}
if($words[8] =~ /cds/) {
$score += 100;
}
if($score <= 0) {
$score = 100;
}
# Print out the annotation fields.
@fields = split /; /,$words[8];
$id = splitField($fields[1]);
$f = shift(@fields);
$f = splitField($f);
print ANNOT "$f";
while($f = shift(@fields)) {
if($f =~ /^bounded/) {
$bounded = 1;
}
if($f =~ /^cds/) {
$cds = 1;
}
if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
$f = splitField($f);
print ANNOT "\t$f";
}
}
print ANNOT "\t$bounded\t$cds";
print ANNOT "\n";
print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
}
}
close(IN);
close(BED);
close(ANNOT);
}
./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
cp ../hg17/affyHuEx1Annot.sql ./
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
numIndependentProbes smallint not null,
probesetId int(11) not null,
exonClustId int(11) not null,
numNonOverlapProbes smallint not null,
probeCount smallint not null,
transcriptClustId int(11) not null,
probesetType smallint not null,
numXHybeProbe smallint not null,
psrId int(11) not null,
level varchar(10) not null,
evidence varchar(255) not null,
bounded smallint not null,
cds smallint not null,
PRIMARY KEY (probesetId)
);
hg16S -A < affyHuEx1Annot.sql
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg16S -A
# end AFFYHUEX1 track
##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
# Submitted by Greg Crawford via web site,
# http://research.nhgri.nih.gov/DNaseHS/May2005/
# In addition, a file containing the 'randoms' was FTP'ed by Greg
# NOTE: bad chr8_random entry removed, as per G. Crawford
# Same display as ENCODE track by Angie...
# Jim asked to add scores for grayscale-coloring:
# clusters of 2 drawn in 50%, clusters of 3 drawn in 75%,
# and clusters of 4 or more drawn in 100% black.
mkdir /cluster/data/hg16/bed/nhgri/lab
cd /cluster/data/hg16/bed/nhgri/lab
foreach c (`cut -f 1 /cluster/data/hg16/chrom.sizes`)
echo $c
wget -nd http://research.nhgri.nih.gov/DNaseHS/May2005/clusters/$c.LynxClusters.bed
end
cd ..
# special handling for ID's on chrM (they are preceded by 'M_')
ls lab/chr*.bed lab/randoms.txt \
| grep -v chrM | xargs cat | grep '^chr' \
| perl -wpe 'if (/500bp_(\d+)_(\d+)/) { \
$id = $1 . "_" . $2; \
$score = ($2 >= 4) ? 1000 : $2 * 250; \
s/500bp.+/$id\t$score/; } else { die "parse"; }' > hs.bed
cat lab/chrM*.bed | grep '^chr' \
| perl -wpe 'if (/500bp_(M_\d+)_(\d+)/) { \
$id = $1 . "_" . $2; \
$score = ($2 >= 4) ? 1000 : $2 * 250; \
s/500bp.*/$id\t$score/; } else { die "parse"; }' >> hs.bed
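# (scoring per the grayscale scheme above: cluster size 2 -> 500,
# size 3 -> 750, size 4 or more -> 1000)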
hgLoadBed hg16 nhgriDnaseHs hs.bed
# Loaded 14224 elements of size 5
checkTableCoords hg16 nhgriDnaseHs
# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch hg16 superfamily 0407141100.00
sudo mytouch hg16 acemblyPep 0406151200.00
sudo mytouch hg16 twinscanPep 0407141200.00
sudo mytouch hg16 ensPep 0407141100.00
sudo mytouch hg16 knownToEnsembl 0407141100.00
sudo mytouch hg16 sfDescription 0407141100.00
sudo mytouch hg16 ensEstGtp 0409081800.00
sudo mytouch hg16 ensEstPep 0409081800.00
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg16/bed/affyHumanExon
liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed \
/gbdb/hg17/liftOver/hg17ToHg16.over.chain.gz affyHuEx1.fixed.bed affyHuEx1.unmapped
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.fixed.bed | sort -k2,2nr | head
#2325773 204918
#2402134 204802
#3645108 60419
#2366900 52086
#3016074 9552
#3641787 8061
#2321649 8054
# So there's 4 of them with problems this time:
egrep -v "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed > alreadyok.bed
egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed \
/cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed > good.hg17.bed
bedToFa /cluster/data/hg17/hg17.2bit good.hg17.bed good.hg17.fa
gfClient blat6 17785 /cluster/data/hg16/nib good.hg17.fa bad.hg16.psl
tail +6 bad.hg16.psl | awk '$11==$13{print}' > good.hg16.psl
pslToBed good.hg16.psl good.hg16.bed
# Scores were lost in the transformations. Put em back in.
egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed
#chr1 24924744 25129662 2325773 500 +
#chr1 24924872 25129674 2402134 900 -
#chr1 168139941 168192027 2366900 1000 +
#chr16 2600606 2661025 3645108 200 +
awk 'BEGIN{OFS="\t"}
$4=="2325773"{score="500";}
$4=="2402134"{score="900";}
$4=="3645108"{score="200";}
{print $1,$2,$3,$4,score,$6}' good.hg16.bed > good.bed
cat alreadyok.bed good.bed > affyHuEx1.fixed.bed
bedSort affyHuEx1.fixed.bed tmp.bed
rm good.* bad.* alreadyok.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.fixed.bed
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg16
################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority) values ('expression', 'Expression', 4.5)" | hgsql hg16
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg16
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
echo hg16 panTro1 mm3 rn3 galGal2 > /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst
# update genbank.conf:
hg16.upstreamGeneTbl = refGene
hg16.upstreamMaf = mzPt1Mm3Rn3Gg2_pHMM /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/mrnaPcr
cd /cluster/data/hg16/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
genePredToBed /cluster/data/hg16/bed/kgHg16C/kgBestMrna/knownGene.tab \
> ucscGenes.bed
hgsql hg16 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
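# (names become kgId__geneSymbol so that PCR results can show the gene
# symbol alongside the UCSC Gene id)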
sequenceForBed -keepName -db=hg16 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg16/bed/kgHg16C/kgBestMrna/knownGene.tab \
| genePredToFakePsl hg16 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
cd /cluster/data/hg16/bed/mrnaPcr
hgLoadPsl hg16 kgTargetAli.psl
mkdir /gbdb/hg16/targetDb
ln -s /cluster/data/hg16/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg16/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg16/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg16Kg", "blat13", 17795, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg16Kg", "UCSC Genes", \
"hg16", "kgTargetAli", "", "", \
"/gbdb/hg16/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-04-24 - Hiram)
    mkdir /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
    cd /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -buildDir=`pwd` -debug hg16 hg19
    # Real run:
    time nice -n +19 \
        $HOME/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
        -buildDir=`pwd` -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        hg16 hg19 > do.log 2>&1 &
    # real 93m11.093s

#############################################################################