src/hg/makeDb/doc/felCatV17e.txt 1.10

1.10 2010/05/03 20:51:20 chinhli
maf files done
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 4 -r1.9 -r1.10
--- src/hg/makeDb/doc/felCatV17e.txt	27 Apr 2010 19:20:00 -0000	1.9
+++ src/hg/makeDb/doc/felCatV17e.txt	3 May 2010 20:51:20 -0000	1.10
@@ -549,8 +549,13 @@
 # LASTZ Mouse Swap (DONE - 2010-03-22 - Chin)
     cd /hive/data/genomes/mm9/bed/lastzFelCatV17e.2010-03-22
     cat fb.mm9.chainFelCatV17eLink.txt
     #   637007193 bases of 2620346127 (24.310%) in intersection
+    # make sure the link exists:
+    # cd /hive/data/genomes/mm9/bed
+    # ln -s /hive/data/genomes/mm9/bed/lastzFelCatV17e.2010-03-22 \
+    #      /lastz.felCatV17e
+
     # swap
     mkdir /hive/data/genomes/felCatV17e/bed/blastz.mm9.swap
     cd /hive/data/genomes/felCatV17e/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
@@ -586,8 +591,16 @@
 # LASTZ Panda Swap (DONE - 2010-03-22 - Chin)
     cd /hive/data/genomes/ailMel1/bed/lastzFelCatV17e.2010-03-22
     cat fb.felCatV17e.chainAilMel1Link.txt
     # 1503647735 bases of 1990635005 (75.536%) in intersection
+    # link it
+    #  cd /hive/data/genomes/ailMel1/bed
+    #  ln -s /hive/data/genomes/ailMel1/bed/lastzFelCatV17e.2010-03-22
+    #  lastz.felCatV17e
+    # run rbest 2010-04-30
+    time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
+        felCatV17e ailMel1 > rbest.log 2>&1 &
+    #   real    234m49.556s
 
     #   re-run swap 04-26
     mkdir /hive/data/genomes/felCatV17e/bed/blastz.ailMel1.swap
     cd /hive/data/genomes/felCatV17e/bed/blastz.ailMel1.swap
@@ -602,18 +615,35 @@
     # 1507273252 bases of 2245312831 (67.130%) in intersection
 
     #   Due to the SEQ1 and SEQ2 spec'ed in
     # /hive/data/genomes/ailMel1/bed/lastzFelCatV17e.2010-03-22/DEF
-    # need to copy the swap directory from ailMel1 to felCatv17e
-    #  also need to rename from ailMel1.felCatV17e, or 6way step will have
-    #  problem
-    #  Ping hiram for this 
+    # need to copy the dDirectory from ailMel1 to felCatv17e
+    #  without this, 6way step will have problem
+    #  cd /hive/data/genomes/ailMel1/bed/blastz.felCatV17e.swap]
+    #  cp -pr * /hive/data/genomes/felCatV17e/bed/blastz.ailMel1.swap/ 
+    #  cd /hive/data/genomes/felCatV17e/bed
+    #  ln -s lastzAilMel1.2010-03-22 lastz.ailMel1
+
+
+    # mkdir /hive/data/genomes/felCatV17e/bed/lastzAilMel1.2010-03-22
+    # cd /hive/data/genomes/ailMel1/bed/lastzFelCatV17e.2010-03-22
+    # cp -rp * /hive/data/genomes/felCatV17e/bed/lastzAilMel1.2010-03-22/ 
+    # cd /hive/data/genomes/felCatV17e/bed
+    # ln -s /hive/data/genomes/felCatV17e/bed/lastzAilMel1.2010-03-22 lastz.ailMel1
+    # cd /hive/data/genomes/ailMel1/bed
+    # ln -s  blastz.felCatV17e.swap  lastz.felCatV17e
+
+
+
 
 #####################################################################
 # LASTZ Oppsom  Swap (DONE - 2010-03-22 - Chin)
     cd /hive/data/genomes/monDom5/bed/lastzFelCatV17e.2010-03-22
     cat fb.monDom5.chainFelCatV17eLink.txt
     # 178616721 bases of 3501660299 (5.101%) in intersection
+    # make sure the link exist
+    # cd /hive/data/genomes/monDom5/bed
+    #  ln -s lastzFelCatV17e.2010-03-22 lastz.felCatV17e 
 
     # Swap
     mkdir /hive/data/genomes/felCatV17e/bed/blastz.monDom5.swap
     cd /hive/data/genomes/felCatV17e/bed/blastz.monDom5.swap
@@ -770,10 +800,10 @@
     ln -s blastz.hg19.swap lastz.hg19
     ln -s blastz.canFam2.swap lastz.canFam2
     ln -s blastz.mm9.swap lastz.mm9
     ln -s blastz.monDom5.swap lastz.monDom5
-    # ln -s blastz.ailMel1.swap lastz.ailMel1 (we switched SEQ1 and SEQ2)
-    ln -s /hive/data/genomes/ailMel1/bed/blastz.felCatV17e.swap lastz.ailMel1
+    ln -s /hive/data/genomes/felCatV17e/bed/lastzAilMel1.2010-03-22 /
+            lastz.ailMel1
 
 
     cd /hive/data/genomes/felCatV17e/bed/multiz6way
     #   Use 6way.distances.txt to create the table below
@@ -798,9 +828,9 @@
         `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
     chomp $chainLinkMeasure;
     $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
     $chainLinkMeasure =~ s/\%//;
-    my $swapFile="/hive/data/genomes/${D}/bed/lastz.felCatV17e/fb.${D}.chainCalJac3Link.txt";
+    my $swapFile="/hive/data/genomes/${D}/bed/lastz.felCatV17e/fb.${D}.chainFelCatV17eLink.txt";
     my $swapMeasure = "N/A";
     if ( -s $swapFile ) {
         $swapMeasure =
             `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
@@ -828,16 +858,156 @@
 '_EOF_'
     # << happy emacs
     chmod +x ./sizeStats.pl
     ./sizeStats.pl
-###### 04-27 stop here due to wrong output from sizeStats.pl 
-###### pending move/rename aelMel1/felcatV17e lastz swap results
-# 02  0.1986 - Panda ailMel1    (% 0.000) (N/A)
-# 04  0.2011 - Dog canFam2      (% 73.720) (N/A)
-# 06  0.3567 - Human hg19       (% 60.870) (N/A)
-# 08  0.5692 - Mouse mm9        (% 30.972) (N/A)
-# 10  0.7913 - Opossum monDom5  (% 8.364) (N/A)
+#
+#       If you can fill in all the numbers in this table, you are ready for
+#       the multiple alignment procedure
+#
+#                         featureBits chainLink measures
+#                                  chainFlCatV17eLink
+#    distance                on felCatV17e on other
+# 02  0.1986 - Panda ailMel1    (% 75.536) (% 67.130)
+# 04  0.2011 - Dog canFam2      (% 73.720) (% 62.098)
+# 06  0.3567 - Human hg19       (% 60.870) (% 43.696)
+# 08  0.5692 - Mouse mm9        (% 30.972) (% 24.310)
+# 10  0.7913 - Opossum monDom5  (% 8.364) (% 5.101)
+
+    # create species list and stripped down tree for autoMZ
+    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
+        6way.nh > tmp.nh
+    echo `cat tmp.nh` > tree-commas.nh
+    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
+    sed 's/[()]//g; s/,/ /g' tree.nh > species.list
+
+    #   collect the single whole mafs into one place for splitting:
+    mkdir singleMaf
+    cd singleMafs
+    ln -s ../../lastz.hg19/axtChain/felCatV17e.hg19.synNet.maf.gz .
+    ln -s ../../lastz.mm9/axtChain/felCatV17e.mm9.synNet.maf.gz .
+    ln -s ../../lastz.canFam2/axtChain/felCatV17e.canFam2.synNet.maf.gz .
+    ln -s ../../lastz.monDom5/axtChain/felCatV17e.monDom5.synNet.maf.gz .
+    # N50 for ailMel1 is 1281781 (less than 10 ~ 20 MB) use rbest
+    ln -s ../../lastz.ailMel1/mafRBestNet/felCatV17e.ailMel1.rbest.maf.gz .
+
+
+    # to use rbest or net, use n50.pl chrom.size to tell
+    mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/splitMaf
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/splitMaf
+    for D in ailMel1
+do
+    mkdir ${D}
+    mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \
+        ../singleMafs/felCatV17e.${D}.rbest.maf.gz
+done
+
+for D in hg19 mm9 canFam2 monDom5
+do
+    mkdir ${D}
+    mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \
+        ../singleMafs/felCatV17e.${D}.synNet.maf.gz
+done
+
+
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way
+    mkdir penn
+    cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
+    cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
+    cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
+    cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
+
+    ssh swarm
+    mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/run
+    mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/run/maf
+
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/run
+
+    #  set the db and pairs directories here
+    cat > autoMultiz.csh << '_EOF_'
+#!/bin/csh -ef
+set db = felCatV17e
+set topDir = /hive/data/genomes/$db/bed/multiz6way
+set c = $1
+set result = $2
+set pennBin = $topDir/penn
+set run = `/bin/pwd`
+set tmp = /scratch/tmp/$db/multiz.$c
+set pairs = $topDir/splitMaf
+/bin/rm -fr $tmp
+/bin/mkdir -p $tmp
+/bin/cp -p $topDir/tree.nh $topDir/species.list $tmp
+pushd $tmp > /dev/null
+foreach s (`/bin/sed -e "s/^$db //" species.list`)
+    set in = $pairs/$s/$c.maf
+    set out = $db.$s.sing.maf
+    if (-e $in.gz) then
+        /bin/zcat $in.gz > $out
+        if (! -s $out) then
+            echo "##maf version=1 scoring=autoMZ" > $out
+        endif
+    else if (-e $in) then
+        /bin/ln -s $in $out
+    else
+        echo "##maf version=1 scoring=autoMZ" > $out
+    endif
+end
+set path = ($pennBin $path); rehash
+$pennBin/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
+        > /dev/null
+popd > /dev/null
+/bin/rm -f $result
+/bin/cp -p $tmp/$c.maf $result
+/bin/rm -fr $tmp
+/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
+'_EOF_'
+# << happy emacs
+    chmod +x autoMultiz.csh
+
+    cat  << '_EOF_' > template
+#LOOP
+./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/felCatV17e/bed/multiz6way/run/maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+# << happy emacs
 
+    find ../splitMaf -type f | grep "/[0-9][0-9][0-9].maf" \
+        | xargs -L 1 basename | sort -u > chr.part.list
+    gensub2 chr.part.list single template jobList
+    para -ram=8g create jobList
+
+    #   put the split mafs back together into a single result
+    head -q -n 1 maf/000.maf > felCatV17e.6way.maf
+    for F in maf/*.maf
+do
+    grep -h -v "^#" ${F} >> felCatV17e.6way.maf
+done
+    tail -q -n 1 maf/000.maf >> felCatV17e.6way.maf
+    tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
+
+    #   real    13m32.340s
+
+    # load tables for a look
+    mkdir -p /gbdb/felCatV17e/multiz6way/maf
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/maf
+    ln -s `pwd`/felCatV17e.6way.maf \
+        /gbdb/felCatV17e/multiz6way/maf/multiz6way.maf
+
+    # this generates an immense multiz6way.tab file in the directory
+    #   where it is running.  Best to run this over in scratch.
+    cd /data/tmp
+    time nice -n +19 hgLoadMaf \
+        -pathPrefix=/gbdb/felCatV17e/multiz6way/maf felCatV17e multiz6way
+    #   Loaded 13316945 mafs in 1 files from
+    #   /gbdb/felCatV17e/multiz6way/maf
+    #   real    9m9.365s
+
+    # load summary table
+    time nice -n +19 cat /gbdb/felCatV17e/multiz6way/maf/*.maf \
+        | hgLoadMafSummary felCatV17e -minSize=30000 -verbose=2 \
+                -mergeGap=1500 -maxSize=200000  multiz6waySummary stdin
+# Created 2330531 summary blocks from 99659162 components and
+#       13316945 mafs from stdin
+    #   real    17m54.685s
 
 
 
 ####################################################################