src/hg/makeDb/doc/hg19.txt 1.30
1.30 2009/07/13 20:37:38 hiram
danRer6 lastz done, phylo tree 4d site prediction done for 46 way, biocyc data obtained
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.29
retrieving revision 1.30
diff -b -B -U 4 -r1.29 -r1.30
--- src/hg/makeDb/doc/hg19.txt 8 Jul 2009 03:32:16 -0000 1.29
+++ src/hg/makeDb/doc/hg19.txt 13 Jul 2009 20:37:38 -0000 1.30
@@ -5077,8 +5079,63 @@
nice -n +19 featureBits hg18 exoniphy
# 27475705 bases of 2881515245 (0.954%) in intersection
#########################################################################
+# BIOCYCTABLES NEEDED BY hgGene (DONE - 2009-06-22 - Hiram)
+
+# First register with BioCyc to download their HumanCyc database
+# The site will email you the URL for download. Beware, they supply
+# a URL to a directory chock a block full of data, almost 7 Gb,
+# you only need one file
+
+ mkdir /hive/data/outside/bioCyc/090623
+ cd /hive/data/outside/bioCyc/090623
+
+ mkdir download
+ cd download
+ wget --timestamping --no-directories --recursive \
+ "http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.tar.Z"
+ tar xvzf humancyc-flatfiles.tar.Z
+
+ mkdir /hive/data/genomes/hg19/bed/bioCyc
+ cd /hive/data/genomes/hg19/bed/bioCyc
+ # clean the headers from these files
+ grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/genes.col \
+ > genes.tab
+ # this file isn't consistent in its number of columns
+ grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/pathways.col \
+| awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' \
+ > pathways.tab
+
+ hgsql hg19 -e 'create database bioCyc090623'
+
+ hgLoadSqlTab bioCyc090623 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
+ hgLoadSqlTab bioCyc090623 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
+
+# Create bioCycMapDesc.tab
+ hgsql bioCyc090623 -N \
+ -e 'select UNIQUE_ID, NAME from pathways' | sort -u > bioCycMapDesc.tab
+XXX see alternative below
+
+ # this kgBioCyc0 thing needs kgXref and other UCSC gene tables to work
+# Create bioCycPathway.tab
+ kgBioCyc0 bioCyc090623 hg19 hg19
+
+ hgLoadSqlTab hg19 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
+ hgLoadSqlTab hg19 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
+
+XXX maybe instead do this in the gene build procedure
+ # from the UCSC genes build procedure
+# Do BioCyc Pathways build
+ mkdir $dir/bioCyc
+ cd $dir/bioCyc
+ grep -v '^#' $bioCycPathways > pathways.tab
+ grep -v '^#' $bioCycGenes > genes.tab
+ kgBioCyc1 genes.tab pathways.tab $db bioCycPathway.tab bioCycMapDesc.tab
+ hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
+ hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
+
+##############################################################################
nscanGene (2009-06-22 markd)
# nscanGene track from WUSTL
cd /cluster/data/hg19/bed/nscan
wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.updated.gtf
@@ -5098,4 +5155,153 @@
# validate search expression
hgc-sql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l
#########################################################################
+# Phylogenetic tree from 46-way (DONE - 2009-06-25,07-07 - Hiram)
+
+ # Extract 4-fold degenerate sites based on
+ # of RefSeq Reviewed, coding
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
+ cd /hive/data/genomes/hg19/bed/multiz46way/4d
+
+ hgsql hg19 -Ne \
+ "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 \
+ > refSeqReviewed.gp
+ wc -l refSeqReviewed.gp
+ # 14077 refSeqReviewed.gp
+ genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
+ wc -l refSeqReviewedNR.gp
+ # 7951 refSeqReviewedNR.gp
+
+ ssh memk
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/4d/run
+ cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
+ mkdir ../mfa
+
+# whole chrom mafs version, using new version of
+# uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
+ cat << '_EOF_' > 4d.csh
+#!/bin/csh -fe
+set r = "/hive/data/genomes/hg19/bed/multiz46way"
+set c = $1
+set infile = $r/maf/$2
+set outfile = $3
+cd /scratch/tmp
+# 'clean' maf
+perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
+awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
+set PHASTBIN=/cluster/bin/phast.2008-12-18
+$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
+$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/$outfile
+rm -f $c.gp $c.maf $c.ss
+'_EOF_'
+ # << happy emacs
+ chmod +x 4d.csh
+
+ ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/*.maf | \
+ egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
+ > maf.list
+
+ cat << '_EOF_' > template
+#LOOP
+4d.csh $(root1) {check in line+ $(path1)} {check out line+ mfa/$(root1).mfa}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 maf.list single template stdout | tac > jobList
+XXX - ready to go here - 2009-07-06
+ rm -fr /cluster/data/hg19/bed/multiz46way/4d/mfa
+ mkdir /cluster/data/hg19/bed/multiz46way/4d/mfa
+ para create jobList
+ para try
+ para check
+ para push
+
+ # combine mfa files
+ cd ..
+ sed -e "s/ /,/g" ../species.list > species.lst
+ /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
+ sed s/"> "/">"/ > 4d.all.mfa
+
+ sed -e 's/,macEug1.*//' species.lst > placentals.lst
+ # XXX this didn't work
+ /cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
+ sed s/"> "/">"/ > 4d.placentals.mfa
+
+ # use phyloFit to create tree model (output is phyloFit.mod)
+ set PHASTBIN=/cluster/bin/phast.2008-12-18
+ time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \
+ --subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
+ # real 111m23.119s
+ mv phyloFit.mod phyloFit.all.mod
+ grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.46way.nh
+
+ sed -e 's/.*,choHof1,//' species.lst > notPlacentals.list
+
+ $PHASTBIN/tree_doctor \
+ --prune=`cat notPlacentals.list` \
+ tree_4d.46way.nh > tree_4d.46way.placental.nh
+
+#########################################################################
+# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
+ cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
+
+ cat << '_EOF_' > DEF
+# human vs X. zebrafish
+BLASTZ_H=2000
+BLASTZ_Y=3400
+BLASTZ_L=6000
+BLASTZ_K=2200
+BLASTZ_Q=/scratch/data/blastz/HoxD55.q
+
+# TARGET: Human hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Zebrafish danRer6
+SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
+SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=40
+
+BASE=/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 1678m17.827s
+ # failed during the chain step due to encodek cluster problems
+ # finish that manually, then:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -continue=chainMerge > chainMerge.log 2>&1 &
+ # real 167m6.930s
+ cat fb.hg19.chainDanRer6Link.txt
+ # 88391631 bases of 2897316137 (3.051%) in intersection
+
+ # running the swap - DONE - 2009-06-02
+ mkdir /hive/data/genomes/danRer6/bed/blastz.hg19.swap
+ cd /hive/data/genomes/danRer6/bed/blastz.hg19.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08/DEF \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -swap > swap.log 2>&1 &
+ # real 183m21.102s
+ cat fb.danRer6.chainHg19Link.txt
+ # 96424507 bases of 1506896106 (6.399%) in intersection
+
+##############################################################################