src/hg/makeDb/doc/caeJap2.txt 1.1

1.1 2009/07/23 17:46:02 hiram
Initial sequence through masking
Index: src/hg/makeDb/doc/caeJap2.txt
===================================================================
RCS file: src/hg/makeDb/doc/caeJap2.txt
diff -N src/hg/makeDb/doc/caeJap2.txt
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/caeJap2.txt	23 Jul 2009 17:46:02 -0000	1.1
@@ -0,0 +1,163 @@
+# for emacs: -*- mode: sh; -*-
+
+# Caenorhabditis japonica
+#	Washington University School of Medicine GSC and Sanger Institute
+#	WUSTL version 4.0.1 Jan 2009
+
+#  $Id$
+
+########################################################################
+#  Download sequence (DONE - 2009-07-22 - Hiram)
+    mkdir -p /hive/data/genomes/caeJap2/wustl
+    cd /hive/data/genomes/caeJap2/wustl
+
+    #	fetch the assembly description files, then everything in output/
+    wget --timestamping \
+ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/ASSEMBLY
+    wget --timestamping \
+ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/README
+    #	the wildcard URL is quoted so the shell hands it to wget unexpanded
+    #	(an unquoted pattern that matches no local file is an error in csh)
+    wget --timestamping \
+"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/output/*.*"
+
+    cat << '_EOF_' > superToAgp.pl
+#!/usr/bin/env perl
+
+# superToAgp.pl - print to stdout an AGP for a single chrUn sequence built
+#   from the supercontig layout in supercontigs.gz
+#
+#   Files read from the current directory:
+#     faCount.supercontigs.txt - first two columns are taken as supercontig
+#         name and size; supercontigs not listed here are skipped
+#     faCount.contigs.txt - first two columns are taken as contig name
+#         and size; a contig missing from this file is a fatal error
+#     supercontigs.gz - lines beginning "supercontig ", "contig " or
+#         "gap "; any other non-blank line is a fatal error
+#
+#   Contigs become W rows, gaps inside a supercontig become "fragment"
+#   N rows (a negative gap size is replaced by 10, a zero size is fatal),
+#   and successive supercontigs are separated by a 1000 base "contig" N row.
+
+use strict;
+use warnings;
+
+my $superGap = 1000;    # N bases placed between successive supercontigs
+my $negativeGap = 10;   # size used when the layout gives a negative gap
+
+my $chromStart = 1;     # 1-based start of the next row on chrUn
+my $id = 1;             # part number of the next row
+
+# Read a two-column (name, size) table and return a reference to a hash
+#   of name => size.  Lines starting with '#' or 'total' are ignored.
+sub readSizes {
+    my ($file) = @_;
+    my %sizes;
+    open (my $fh, "<", $file) or die "can not read $file";
+    while (my $line = <$fh>) {
+        next if ($line =~ m/^#/);
+        next if ($line =~ m/^total/);
+        my ($name, $size) = split('\s+', $line, 3);
+        $sizes{$name} = $size;
+    }
+    close ($fh);
+    return \%sizes;
+}
+
+# Print one N (gap) row of the given size, gap type and linkage, then
+#   advance the running chrUn position and part number.
+sub nRow {
+    my ($size, $type, $linkage) = @_;
+    my $chromEnd = $chromStart + $size - 1;
+    printf "chrUn\t%d\t%d\t%d\tN\t%d\t%s\t%s\n", $chromStart, $chromEnd,
+        $id++, $size, $type, $linkage;
+    $chromStart = $chromEnd + 1;
+    return;
+}
+
+# Print one W row placing the whole contig (1..size, + strand), then
+#   advance the running chrUn position and part number.
+sub wRow {
+    my ($contig, $size) = @_;
+    my $chromEnd = $chromStart + $size - 1;
+    printf "chrUn\t%d\t%d\t%d\tW\t%s\t1\t%d\t+\n", $chromStart, $chromEnd,
+        $id++, $contig, $size;
+    $chromStart = $chromEnd + 1;
+    return;
+}
+
+my $superSizes = readSizes("faCount.supercontigs.txt");
+my $contigSizes = readSizes("faCount.contigs.txt");
+
+my $skipSuper = 0;      # true while inside a supercontig that is not listed
+
+# list-form pipe open: no shell involved, and a zcat failure is caught
+#   by the checked close below instead of silently truncating the AGP
+open (my $layout, "-|", "zcat", "supercontigs.gz")
+    or die "can not read supercontigs.gz";
+while (my $line = <$layout>) {
+    next if ($line =~ m/^\s*$/);
+    chomp $line;
+    if ($line =~ m/^supercontig /) {
+        my (undef, $superName) = split('\s+', $line);
+        # layout names are Supercontig*, the size table uses Contig*
+        $superName =~ s/Superc/C/;
+        $skipSuper = exists($superSizes->{$superName}) ? 0 : 1;
+        next if ($skipSuper);
+        # separate from the previous supercontig once any row was written;
+        #   the part number is the reliable indicator of that
+        nRow($superGap, "contig", "no") if ($id > 1);
+    } elsif ($line =~ m/^contig /) {
+        next if ($skipSuper);
+        my (undef, $contig) = split('\s+', $line);
+        die "can not find size for $contig"
+                if (!exists($contigSizes->{$contig}));
+        wRow($contig, $contigSizes->{$contig});
+    } elsif ($line =~ m/^gap /) {
+        next if ($skipSuper);
+        my (undef, $gapSize) = split('\s+', $line);
+        $gapSize = $negativeGap if ($gapSize < 0);
+        die "gap size is 0" if ($gapSize == 0);
+        nRow($gapSize, "fragment", "yes");
+    } else {
+        die "do not recognize line: $line";
+    }
+}
+close ($layout) or die "zcat supercontigs.gz failed, exit status: $?";
+'_EOF_'
+    # << happy emacs
+    chmod +x superToAgp.pl
+
+    ./superToAgp.pl > chrUn.agp
+    qaToQac contigs.fa.qual.gz contigs.qac
+    qacAgpLift chrUn.agp contigs.qac chrUn.qual.qac
+
+
+########################################################################
+# initial genome browser build (DONE - 2009-07-23 - Hiram)
+    cd /hive/data/genomes/caeJap2
+    cat << '_EOF_' > caeJap2.config.ra
+# Config parameters for makeGenomeDb.pl:
+db caeJap2
+clade worm
+genomeCladePriority 10
+scientificName Caenorhabditis japonica
+commonName C. japonica
+assemblyDate Jan. 2009
+assemblyLabel Washington University School of Medicine GSC C. japonica 4.0.1
+orderKey 881
+mitoAcc none
+fastaFiles /hive/data/genomes/caeJap2/wustl/contigs.fa.gz
+agpFiles /hive/data/genomes/caeJap2/wustl/chrUn.agp
+qualFiles /hive/data/genomes/caeJap2/wustl/chrUn.qual.qac
+dbDbSpeciesDir worm
+taxId 281687
+'_EOF_'
+    # << happy emacs
+
+    #	verify sequence and AGP specs are OK
+    makeGenomeDb.pl -stop=agp caeJap2.config.ra
+    makeGenomeDb.pl -continue=db caeJap2.config.ra > makeGenomeDb.db.log 2>&1
+
+    ln -s `pwd`/caeJap2.unmasked.2bit /gbdb/caeJap2/caeJap2.2bit
+    #	your personal browser should be functioning now
+
+########################################################################
+# Repeat Masker (DONE - 2009-07-23 - Hiram)
+    #	each masking step runs in its own build directory under bed/, with
+    #	all script output kept in do.log; mkdir -p so a re-run does not fail
+    mkdir -p /hive/data/genomes/caeJap2/bed/repeatMasker
+    cd /hive/data/genomes/caeJap2/bed/repeatMasker
+    doRepeatMasker.pl -buildDir=`pwd` caeJap2 > do.log 2>&1
+
+########################################################################
+# Simple Repeats (DONE - 2009-07-23 - Hiram)
+    mkdir -p /hive/data/genomes/caeJap2/bed/simpleRepeat
+    cd /hive/data/genomes/caeJap2/bed/simpleRepeat
+    doSimpleRepeat.pl -buildDir=`pwd` caeJap2 > do.log 2>&1
+
+########################################################################
+# Window Masker (DONE - 2009-07-23 - Hiram)
+    mkdir -p /hive/data/genomes/caeJap2/bed/windowMasker
+    cd /hive/data/genomes/caeJap2/bed/windowMasker
+    doWindowMasker.pl -buildDir=`pwd` caeJap2 > do.log 2>&1