src/hg/makeDb/doc/ce8.txt 1.3

1.3 2009/09/24 20:54:24 hiram
Tried to get the sanger genes up, not complete
Index: src/hg/makeDb/doc/ce8.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce8.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/makeDb/doc/ce8.txt	28 Jul 2009 21:55:26 -0000	1.2
+++ src/hg/makeDb/doc/ce8.txt	24 Sep 2009 20:54:24 -0000	1.3
@@ -369,10 +369,10 @@
 
 ############################################################################
 ## 5-Way multiple alignment (DONE - 2009-07-28 - Hiram)
 
-    mkdir /cluster/data/ce8/bed/multiz5way
-    cd /cluster/data/ce8/bed/multiz5way
+    mkdir /hive/data/genomes/ce8/bed/multiz5way
+    cd /hive/data/genomes/ce8/bed/multiz5way
     #	See notes in ce6.txt for 6-way alignment.  This is the tree from
     #	there.
 
     cat << '_EOF_' > 5way.nh
@@ -399,11 +399,11 @@
 #  2 0.0140  - briggsae_cb3        (% 42.300)  (% 39.763)  1000     loose
 #  3 0.0180  - brenneri_caePb2     (% 40.677)  (% 32.313)  1000     loose
 #  3 0.0270  - japonica_caeJap2    (% 27.192)  (% 20.450)  1000     loose
 
-    cd /cluster/data/ce8/bed/multiz5way
+    cd /hive/data/genomes/ce8/bed/multiz5way
     #	bash shell syntax here ...
-    export H=/cluster/data/ce8/bed
+    export H=/hive/data/genomes/ce8/bed
     mkdir mafLinks
     for G in caeRem3 cb3 caePb2 caeJap2
     do
 	mkdir mafLinks/$G
@@ -480,9 +480,9 @@
 #ENDLOOP
 '_EOF_'
 # << happy emacs
 
-    awk '{print $1}' /cluster/data/ce8/chrom.sizes > chrom.lst
+    awk '{print $1}' /hive/data/genomes/ce8/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     para -maxNode=1 push
     para check ... push ... etc ...
@@ -521,4 +521,71 @@
 
     #	remove the temporary .tab files:
     rm multiz5*.tab
 ############################################################################
+# reset default position, same as ce4 on the ZC101 / unc-52 locus
+    ssh hgwdev
+    hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746"
+	where name="ce8";' hgcentraltest
+
+############################################################################
+## SANGER GENE TRACK (WORKING - 2009-07-29 - Hiram)
+##	There is a tremendous amount of extraneous notation in the gff
+##	files.  Filter it down to a manageable set, change the chrom
+##	names, eliminate duplicates, and select only what will work in
+##	ldHgGene
+    mkdir /hive/data/genomes/ce8/bed/sangerGene
+    cd /hive/data/genomes/ce8/bed/sangerGene
+for C in I II III IV V X
+do
+    echo -n "${C} "
+    cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_${C}.gff | \
+    sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
+        s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
+        s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
+        s/CHROMOSOME_M/chrM/g;" \
+        -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
+        -e 's/CDS "//' -e 's/"//' \
+                > chr${C}.gff
+done
+C=M
+echo -n "${C} "
+cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_MtDNA.gff | \
+sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
+    s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
+    s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
+    s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \
+    -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
+    -e 's/CDS "//' -e 's/"//' \
+            > chr${C}.gff
+for C in I II III IV V X M
+do
+    echo "chr${C}.gff -> filtered.chr${C}.gff"
+    grep -v "^#" chr${C}.gff | awk -F'\t' '
+BEGIN { IGNORECASE=1 }
+{
+    if (match($2,"curated|Coding_transcript")) {
+	if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) {
+	    gsub("coding_exon","CDS",$3)
+            gsub("Transcript ","",$9)
+            gsub(" .*","",$9)
+            gsub("three_prime_UTR","3utr",$3)
+            gsub("five_prime_UTR","5utr",$3)
+            for (i = 1; i < 9; ++i) {
+                printf "%s\t", $i
+            }
+            printf "%s\n", $9
+        }
+    }
+}
+' | sort -u | sort -k4n > filtered.chr${C}.gff
+done
+
+    ssh hgwdev
+    cd /hive/data/genomes/ce8/bed/sangerGene
+    nice -n +19 ldHgGene ce8 sangerGene filtered.*.gff
+    nice -n +19 ldHgGene -out=filteredGenePred.tab ce8 sangerGene filtered.*.gff
+    #	Read 55287 transcripts in 1027094 lines in 7 files
+    #	55287 groups 7 seqs 3 sources 5 feature types
+    #	31064 gene predictions
+
+###############################################################################