src/hg/makeDb/doc/hg19.txt 1.30

1.30 2009/07/13 20:37:38 hiram
danRer6 lastz done, phylo tree 4d site prediction done for 46 way, biocyc data obtained
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.29
retrieving revision 1.30
diff -b -B -U 4 -r1.29 -r1.30
--- src/hg/makeDb/doc/hg19.txt	8 Jul 2009 03:32:16 -0000	1.29
+++ src/hg/makeDb/doc/hg19.txt	13 Jul 2009 20:37:38 -0000	1.30
@@ -5077,8 +5079,63 @@
     nice -n +19 featureBits hg18 exoniphy
     #	27475705 bases of 2881515245 (0.954%) in intersection
 
 #########################################################################
+# BIOCYCTABLES NEEDED BY hgGene (DONE - 2009-06-22 - Hiram)
+
+# First register with BioCyc to download their HumanCyc database
+# The site will email you the URL for download.  Beware, they supply
+#	a URL to a directory chock a block full of data, almost 7 Gb,
+#	you only need one file
+
+    mkdir /hive/data/outside/bioCyc/090623
+    cd /hive/data/outside/bioCyc/090623
+
+    mkdir download
+    cd download
+    wget --timestamping --no-directories --recursive \
+	"http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.tar.Z"
+    tar xvzf humancyc-flatfiles.tar.Z
+
+    mkdir /hive/data/genomes/hg19/bed/bioCyc
+    cd /hive/data/genomes/hg19/bed/bioCyc
+    #	clean the headers from these files
+    grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/genes.col \
+	> genes.tab
+    #	this file isn't consistent in its number of columns
+    grep -E -v "^#|^UNIQUE-ID" /hive/data/outside/bioCyc/090623/pathways.col \
+| awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' \
+	> pathways.tab
+
+    hgsql hg19 -e 'create database bioCyc090623'
+
+    hgLoadSqlTab bioCyc090623 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
+    hgLoadSqlTab bioCyc090623 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
+
+# Create bioCycMapDesc.tab
+    hgsql bioCyc090623 -N \
+	-e 'select UNIQUE_ID, NAME from pathways' | sort -u >  bioCycMapDesc.tab
+XXX see alternative below
+
+    #	this kgBioCyc0 thing needs kgXref and other UCSC gene tables to work
+# Create bioCycPathway.tab
+    kgBioCyc0 bioCyc090623 hg19 hg19
+
+    hgLoadSqlTab hg19 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
+    hgLoadSqlTab hg19 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
+
+XXX maybe instead do this in the gene build procedure
+    # from the UCSC genes build procedure
+# Do BioCyc Pathways build
+    mkdir $dir/bioCyc
+    cd $dir/bioCyc
+    grep -v '^#' $bioCycPathways > pathways.tab
+    grep -v '^#' $bioCycGenes > genes.tab
+    kgBioCyc1 genes.tab pathways.tab $db bioCycPathway.tab bioCycMapDesc.tab
+    hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
+    hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
+
+##############################################################################
 nscanGene (2009-06-22 markd)
    # nscanGene track from WUSTL
    cd /cluster/data/hg19/bed/nscan
    wget http://mblab.wustl.edu/~jeltje/hg19_tracks/hg19.updated.gtf
@@ -5098,4 +5155,153 @@
    # validate search expression
    hgc-sql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l
 
 #########################################################################
+# Phylogenetic tree from 46-way  (DONE - 2009-06-25,07-07 - Hiram)
+
+    # Extract 4-fold degenerate sites based on 
+    # of RefSeq Reviewed, coding
+    mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
+    cd /hive/data/genomes/hg19/bed/multiz46way/4d
+
+    hgsql hg19 -Ne \
+    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 \
+	> refSeqReviewed.gp
+    wc -l refSeqReviewed.gp
+    # 14077 refSeqReviewed.gp
+    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
+    wc -l refSeqReviewedNR.gp
+    # 7951 refSeqReviewedNR.gp
+
+    ssh memk
+    mkdir /hive/data/genomes/hg19/bed/multiz46way/4d/run
+    cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
+    mkdir ../mfa
+
+# whole chrom mafs version, using new version of 
+# uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
+    cat << '_EOF_' > 4d.csh
+#!/bin/csh -fe
+set r = "/hive/data/genomes/hg19/bed/multiz46way"
+set c = $1
+set infile = $r/maf/$2
+set outfile = $3
+cd /scratch/tmp
+# 'clean' maf
+perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
+awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
+set PHASTBIN=/cluster/bin/phast.2008-12-18
+$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
+$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/$outfile
+rm -f $c.gp $c.maf $c.ss
+'_EOF_'
+    # << happy emacs
+    chmod +x 4d.csh
+
+    ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/*.maf | \
+        egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
+	> maf.list
+
+    cat << '_EOF_' > template
+#LOOP
+4d.csh $(root1) {check in line+ $(path1)} {check out line+ mfa/$(root1).mfa}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    gensub2 maf.list single template stdout | tac > jobList
+XXX - ready to go here - 2009-07-06
+    rm -fr /cluster/data/hg19/bed/multiz46way/4d/mfa
+    mkdir /cluster/data/hg19/bed/multiz46way/4d/mfa
+    para create jobList
+    para try
+    para check
+    para push
+
+    # combine mfa files
+    cd ..
+    sed -e "s/ /,/g" ../species.list > species.lst
+    /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
+        sed s/"> "/">"/ > 4d.all.mfa
+
+    sed -e 's/,macEug1.*//' species.lst > placentals.lst
+    #  XXX this didn't work
+    /cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
+        sed s/"> "/">"/ > 4d.placentals.mfa
+
+    # use phyloFit to create tree model (output is phyloFit.mod)
+    set PHASTBIN=/cluster/bin/phast.2008-12-18
+    time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \
+	--subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
+    #	real    111m23.119s
+    mv phyloFit.mod phyloFit.all.mod
+    grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.46way.nh
+
+    sed -e 's/.*,choHof1,//' species.lst > notPlacentals.list
+
+    $PHASTBIN/tree_doctor \
+        --prune=`cat notPlacentals.list` \
+                tree_4d.46way.nh > tree_4d.46way.placental.nh
+
+#########################################################################
+# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
+    mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
+    cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
+
+    cat << '_EOF_' > DEF
+# human vs X. zebrafish
+BLASTZ_H=2000
+BLASTZ_Y=3400
+BLASTZ_L=6000
+BLASTZ_K=2200
+BLASTZ_Q=/scratch/data/blastz/HoxD55.q
+
+# TARGET: Human hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Zebrafish danRer6
+SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
+SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=40
+
+BASE=/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    #	establish a screen to control this job
+    screen
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+	> do.log 2>&1 &
+    #	real    1678m17.827s
+    #	failed during the chain step due to encodek cluster problems
+    #	finish that manually, then:
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	-continue=chainMerge > chainMerge.log 2>&1 &
+    #	real    167m6.930s
+    cat fb.hg19.chainDanRer6Link.txt 
+    #	88391631 bases of 2897316137 (3.051%) in intersection
+
+    #	running the swap - DONE - 2009-06-02
+    mkdir /hive/data/genomes/danRer6/bed/blastz.hg19.swap
+    cd /hive/data/genomes/danRer6/bed/blastz.hg19.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08/DEF \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	-swap > swap.log 2>&1 &
+    #	real    183m21.102s
+    cat fb.danRer6.chainHg19Link.txt 
+    #	96424507 bases of 1506896106 (6.399%) in intersection
+
+##############################################################################