src/hg/makeDb/doc/hg19.txt 1.52
1.52 2009/10/27 21:35:59 hiram
Rerun the 4D site predictions with chrX only and non-ChrX others
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.51
retrieving revision 1.52
diff -b -B -U 4 -r1.51 -r1.52
--- src/hg/makeDb/doc/hg19.txt 26 Oct 2009 17:04:20 -0000 1.51
+++ src/hg/makeDb/doc/hg19.txt 27 Oct 2009 21:35:59 -0000 1.52
@@ -5159,9 +5159,259 @@
# validate search expression
hgc-sql -Ne 'select name from nscanGene' hg19 | egrep -v -e '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |wc -l
#########################################################################
+# Phylogenetic tree from 46-way for chrX (DONE - 2009-10-26 - Hiram)
+# We need two trees, one for chrX only, and a second for all other chroms
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/4dX
+ cd /hive/data/genomes/hg19/bed/multiz46way/4dX
+
+ hgsql hg19 -Ne \
+ "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA' and refGene.chrom='chrX'" \
+ | cut -f 2-20 > refSeqReviewed.gp
+ wc -l refSeqReviewed.gp
+ # 727 refSeqReviewed.gp
+ genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
+ wc -l refSeqReviewedNR.gp
+ # 401 refSeqReviewed.gp
+
+ ssh memk
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/4dX/run
+ cd /hive/data/genomes/hg19/bed/multiz46way/4dX/run
+ mkdir ../mfa
+
+# whole chrom mafs version, using new version of
+# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
+# mjhubisz at gmail.com
+
+ cat << '_EOF_' > 4dX.csh
+#!/bin/csh -fe
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set r = "/hive/data/genomes/hg19/bed/multiz46way"
+set c = $1
+set infile = $r/maf/$2
+set outfile = $3
+cd /scratch/tmp
+# 'clean' maf
+perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
+awk -v C=$c '$2 == C {print}' $r/4dX/refSeqReviewedNR.gp > $c.gp
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
+$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4dX/$outfile
+rm -f $c.gp $c.maf $c.ss
+'_EOF_'
+ # << happy emacs
+ chmod +x 4dX.csh
+
+ ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/chrX.maf | \
+ egrep -E -v "chrM|chrUn|random|_hap" | sed -e "s#.*multiz46way/maf/##" \
+ > maf.list
+
+ cat << '_EOF_' > template
+#LOOP
+4dX.csh $(root1) $(path1) {check out line+ mfa/$(root1).mfa}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 maf.list single template stdout | tac > jobList
+ # run this one job on hgwdev, takes a few minutes:
+ ./4dX.csh chrX chrX.maf mfa/chrX.mfa
+ # not sure what these warnings are about:
+# WARNING: ignoring out-of-range feature
+# chrX genepred CDS 1 -1 . + 2 transcript_id "NM_000475"
+# WARNING: ignoring out-of-range feature
+# chrX genepred CDS 1 -1 . + 2 transcript_id "NM_005365.2"
+
+ # combine mfa files
+ cd ..
+ sed -e "s/ /,/g" ../species.list > species.lst
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate `cat species.lst` mfa/*.mfa | sed s/"> "/">"/ > 4dX.chrX.mfa
+
+ sed -e 's/,macEug1.*//' species.lst > placentals.lst
+ awk '
+BEGIN { good = 1 }
+{
+ if (match($0, "^> macEug1")) { good = 0 }
+ if (good) {print}
+}
+' mfa/*.mfa > placentals.mfa
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate `cat placentals.lst` placentals.mfa | sed s/"> "/">"/ \
+ > 4dX.placentals.mfa
+
+ sed -e 's/,tupBel1.*//' species.lst > primates.lst
+ awk '
+BEGIN { good = 1 }
+{
+ if (match($0, "^> tupBel1")) { good = 0 }
+ if (good) {print}
+}
+' mfa/*.mfa > primates.mfa
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate `cat primates.lst` primates.mfa | sed -e "s/> />/" \
+ > 4dX.primates.mfa
+
+ # use phyloFit to create tree model (output is phyloFit.mod)
+ time /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree ../tree-commas.nh 4dX.chrX.mfa
+ # real 0.54.139s
+ mv phyloFit.mod phyloFit.chrX.mod
+
+ grep TREE phyloFit.chrX.mod | sed 's/TREE\:\ //' > tree_4d.chrX.46way.nh
+
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
+ --no-branchlen --prune-all-but=`cat primates.lst` ../tree-commas.nh \
+ > tree_commas.primates.nh
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
+ --no-branchlen --prune-all-but=`cat placentals.lst` ../tree-commas.nh \
+ > tree_commas.placentals.nh
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree tree_commas.primates.nh 4dX.primates.mfa
+ mv phyloFit.mod phyloFit.chrX.primates.mod
+ grep TREE phyloFit.chrX.primates.mod | sed 's/TREE\:\ //' \
+ > tree_4d.chrX.primates.46way.nh
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree tree_commas.placentals.nh 4dX.placentals.mfa
+ mv phyloFit.mod phyloFit.chrX.placentals.mod
+ grep TREE phyloFit.chrX.placentals.mod | sed 's/TREE\:\ //' \
+ > tree_4d.chrX.placentals.46way.nh
+
+#########################################################################
+# Phylogenetic tree from 46-way for non-chrX (DONE - 2009-10-27 - Hiram)
+# We need two trees, one for chrX only, and a second for all other chroms
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/4dNoX
+ cd /hive/data/genomes/hg19/bed/multiz46way/4dNoX
+
+ hgsql hg19 -Ne \
+ "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" \
+ | cut -f 2-20 | egrep -E -v "chrM|chrUn|random|_hap|chrX" \
+ > refSeqReviewed.gp
+ wc -l refSeqReviewed.gp
+ # 12977 refSeqReviewed.gp
+ genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
+ wc -l refSeqReviewedNR.gp
+ # 7252 refSeqReviewed.gp
+
+ ssh memk
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/4dNoX/run
+ cd /hive/data/genomes/hg19/bed/multiz46way/4dNoX/run
+ mkdir ../mfa
+
+# whole chrom mafs version, using new version of
+# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
+# mjhubisz at gmail.com
+
+ cat << '_EOF_' > 4dNoX.csh
+#!/bin/csh -fe
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set r = "/hive/data/genomes/hg19/bed/multiz46way"
+set c = $1
+set infile = $r/maf/$2
+set outfile = $3
+cd /scratch/tmp
+# 'clean' maf
+perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
+awk -v C=$c '$2 == C {print}' $r/4dNoX/refSeqReviewedNR.gp > $c.gp
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+$PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
+$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4dNoX/$outfile
+rm -f $c.gp $c.maf $c.ss
+'_EOF_'
+ # << happy emacs
+ chmod +x 4dNoX.csh
+
+ ls -1S /hive/data/genomes/hg19/bed/multiz46way/maf/chr*.maf | \
+ egrep -E -v "chrM|chrUn|random|_hap|chrX" \
+ | sed -e "s#.*multiz46way/maf/##" \
+ > maf.list
+
+ cat << '_EOF_' > template
+#LOOP
+4dNoX.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 maf.list single template stdout | tac > jobList
+ para try ... check ... push ... etc
+ para time
+# Completed: 23 of 23 jobs
+# CPU time in finished jobs: 9032s 150.53m 2.51h 0.10d 0.000 y
+# IO & Wait Time: 672s 11.21m 0.19h 0.01d 0.000 y
+# Average job time: 422s 7.03m 0.12h 0.00d
+# Longest finished job: 860s 14.33m 0.24h 0.01d
+# Submission to last job: 1210s 20.17m 0.34h 0.01d
+
+ # combine mfa files
+ cd ..
+ sed -e "s/ /,/g" ../species.list > species.lst
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate `cat species.lst` mfa/*.mfa | sed s/"> "/">"/ \
+ > 4dNoX.all.mfa
+
+ sed -e 's/,macEug1.*//' species.lst > placentals.lst
+ awk '
+BEGIN { good = 1 }
+{
+ if (match($0, "^> macEug1")) { good = 0 }
+ if (good) {print}
+}
+' mfa/*.mfa > placentals.mfa
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate `cat placentals.lst` placentals.mfa | sed s/"> "/">"/ \
+ > 4dNoX.placentals.mfa
+
+ sed -e 's/,tupBel1.*//' species.lst > primates.lst
+ awk '
+BEGIN { good = 1 }
+{
+ if (match($0, "^> tupBel1")) { good = 0 }
+ if (good) {print}
+}
+' mfa/*.mfa > primates.mfa
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate `cat primates.lst` primates.mfa | sed -e "s/> />/" \
+ > 4dNoX.primates.mfa
+
+
+ # use phyloFit to create tree model (output is phyloFit.mod)
+ time /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree ../tree-commas.nh 4dNoX.all.mfa
+XXX - running Tue Oct 27 13:21:49 PDT 2009
+ # about 40 minutes
+ mv phyloFit.mod phyloFit.NoChrX.mod
+
+ grep TREE phyloFit.chrX.mod | sed 's/TREE\:\ //' > tree_4d.chrX.46way.nh
+
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
+ --no-branchlen --prune-all-but=`cat primates.lst` ../tree-commas.nh \
+ > tree_commas.primates.nh
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/tree_doctor \
+ --no-branchlen --prune-all-but=`cat placentals.lst` ../tree-commas.nh \
+ > tree_commas.placentals.nh
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree tree_commas.primates.nh 4dNoX.primates.mfa
+ mv phyloFit.mod phyloFit.NoChrX.primates.mod
+ grep TREE phyloFit.NoChrX.primates.mod | sed 's/TREE\:\ //' \
+ > tree_4d.NoChrX.primates.46way.nh
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree tree_commas.placentals.nh 4dNoX.placentals.mfa
+ mv phyloFit.mod phyloFit.NoChrX.placentals.mod
+ grep TREE phyloFit.NoChrX.placentals.mod | sed 's/TREE\:\ //' \
+ > tree_4d.NoChrX.placentals.46way.nh
+
+#########################################################################
# Phylogenetic tree from 46-way (DONE - 2009-06-25,07-07 - Hiram)
+# This was an early first time experiment. All this was redone
+# above for chrX only and non-chrX trees
# Extract 4-fold degenerate sites based on
# of RefSeq Reviewed, coding
mkdir /hive/data/genomes/hg19/bed/multiz46way/4d
@@ -5181,9 +5431,10 @@
cd /hive/data/genomes/hg19/bed/multiz46way/4d/run
mkdir ../mfa
# whole chrom mafs version, using new version of
-# uses memory-efficient version of phast, from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
+# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
+# mjhubisz at gmail.com
cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set r = "/hive/data/genomes/hg19/bed/multiz46way"
set c = $1
@@ -5924,8 +6175,157 @@
# Longest finished job: 1635s 27.25m 0.45h 0.02d
# Submission to last job: 2965s 49.42m 0.82h 0.03d
+ # run phyloP with score=LRT
+ ssh swarm
+ cd /cluster/data/hg18/bed/multiz44way/consPhyloP
+ mkdir run.phyloP
+ cd run.phyloP
+
+ # Adjust model file base composition background and rate matrix to be
+ # representative of whole-genome
+
+ grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
+ # 0.539
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
+ ../../4d/phyloFit.all.mod 0.539 > ../../4d/46way.all.mod
+
+ # repeat for chrX only tree
+ cd /cluster/data/hg18/bed/multiz46way/4d
+ $PHASTBIN/modFreqs 4d.chrX.mod $gc > 46way.chrX.mod
+ ln -s `pwd`/46way.chrX.mod /usr/local/apache/golenPath/hg18/phastCons46way
+
+cat > doPhyloP.csh << 'EOF'
+ set f = $1
+ set out = $2
+ set c = $f:r:r
+ set n = $f:r:e
+ set tmp = /scratch/tmp/$f
+ rm -fr $tmp
+ mkdir -p $tmp
+ cp -p /cluster/data/hg18/bed/multiz46way/consPhyloP/ss/$c/$n/$f.ss $tmp
+ cp -p tree.mod $tmp
+ pushd $tmp > /dev/null
+ set PHASTBIN = /cluster/bin/phast.2008-12-18
+ $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $c \
+ -i SS tree.mod $f.ss > $f.wig
+ popd > /dev/null
+ mkdir -p $out:h
+ mv $tmp/$f.wig $out
+ rm -fr $tmp
+'EOF'
+
+ # Create list of chunks
+ pushd /cluster/data/hg18/bed/multiz46way/consPhyloP/ss
+ ls chr*/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
+ /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/in.list
+ popd > /dev/null
+
+ # need to fill in chr8, neglected in main run
+ pushd /cluster/data/hg18/bed/multiz46way/consPhyloP/ss
+ ls chr8/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
+ /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/in.chr8.list
+ popd > /dev/null
+
+ # Create template file
+ # file1 == $chr/$chunk/file name without .ss suffix
+ cat > template << 'EOF'
+#LOOP
+csh ../doPhyloP.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig}
+#ENDLOOP
+'EOF'
+ # setup run for all species
+ mkdir all
+ cd all
+ cp ../../../4d/46way.all.mod tree.mod
+ rm -fr wig
+ mkdir wig
+
+ # << happy emacs
+ gensub2 ../in.list single ../template jobList
+ # 2823 jobs
+ para create jobList
+ para try
+ para check
+ para push
+
+ para time
+ #Completed: 2823 of 2823 jobs
+ #CPU time in finished jobs: 4691641s 78194.02m 1303.23h 54.30d 0.149 y
+ #IO & Wait Time: 171343s 2855.71m 47.60h 1.98d 0.005 y
+ #Average job time: 1723s 28.71m 0.48h 0.02d
+ #Longest finished job: 2451s 40.85m 0.68h 0.03d
+ #Submission to last job: 6055s 100.92m 1.68h 0.07d
+
+ ssh hgwdev
+ cd /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP
+# check for clean dir here -- chr* will match garbage if it's there
+cat > listWig.csh << 'EOF'
+ foreach c (`ls -d chr*`)
+ foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
+ ls -1 $d/*.wig | sort -n -t\. -k3
+ end
+ end
+'EOF'
+
+ cd all/wig
+ csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP46wayAll.wig phyloP46wayAll.wib
+ # Reloaded to include chr8 (2008-01-15 kate)
+ #Converted stdin, upper limit 7.13, lower limit -15.41
+ # Load gbdb and database with wiggle.
+ ln -s \
+ /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/all/wig/phyloP46wayAll.wib \
+ /gbdb/hg18/multiz46way/phyloP46wayAll.wib
+ hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz46way hg18 phyloP46wayAll phyloP46wayAll.wig
+
+ # placental-only: exclude all but these:
+ cd /cluster/data/hg18/bed/multiz46way/4d
+ set PHASTBIN = /cluster/bin/phast.2008-12-18
+ $PHASTBIN/tree_doctor 46way.all.mod \
+ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,\
+ micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,\
+ vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,\
+ sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
+ > 46way.placental.mod
+ cd ../consPhyloP/run.phyloP
+ mkdir placental
+ cd placental
+ cp ../../../4d/46way.placental.mod tree.mod
+ mkdir wig
+ gensub2 ../in.list single ../template jobList
+ # 2823 jobs
+ para create jobList
+ para try
+ para check
+ para push
+
+ para time
+ #Completed: 2823 of 2823 jobs
+ #CPU time in finished jobs: 3358003s 55966.71m 932.78h 38.87d 0.106 y
+ #IO & Wait Time: 142664s 2377.74m 39.63h 1.65d 0.005 y
+ #Average job time: 1240s 20.67m 0.34h 0.01d
+ #Longest finished job: 1781s 29.68m 0.49h 0.02d
+ #Submission to last job: 4383s 73.05m 1.22h 0.05d
+
+ # load wiggle
+ ssh hgwdev
+ cd /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/placental/wig
+ csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP46wayPlacMammal.wig phyloP46wayPlacMammal.wib
+ #Converted stdin, upper limit 3.46, lower limit -14.42
+
+ # Load gbdb and database with wiggle.
+ ln -s \
+ /cluster/data/hg18/bed/multiz46way/consPhyloP/run.phyloP/placental/wig/phyloP46wayPlacMammal.wib \
+ /gbdb/hg18/multiz46way/phyloP46wayPlacMammal.wib
+ hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz46way hg18 phyloP46wayPlacMammal phyloP46wayPlacMammal.wig
+
+ cd /cluster/data/hg18/bed/multiz46way/4d
+ set PHASTBIN = /cluster/bin/phast.2008-12-18
+ $PHASTBIN/tree_doctor 46way.all.mod \
+ --prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \
+ > 46way.euarchontoglires.mod
+
#########################################################################
# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08