src/hg/makeDb/doc/danRer6.txt 1.1

1.1 2009/06/23 07:01:08 galt
start zv8
Index: src/hg/makeDb/doc/danRer6.txt
===================================================================
RCS file: src/hg/makeDb/doc/danRer6.txt
diff -N src/hg/makeDb/doc/danRer6.txt
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/danRer6.txt	23 Jun 2009 07:01:08 -0000	1.1
@@ -0,0 +1,118 @@
+##################################################
+#
+#  danRer6 = Danio rerio Zv8   Dec. 2008
+#
+#  by Galt Barber 2009-06-17
+#
+
+# Danio rerio (zebrafish) from Sanger, version Zv8 (released 2008-12-12)
+#  Project website:
+#    http://www.sanger.ac.uk/Projects/D_rerio/
+#  Assembly notes:
+#    http://www.sanger.ac.uk/Projects/D_rerio/
+#    http://www.sanger.ac.uk/Projects/D_rerio/Zv8_assembly_information.shtml
+
+# TODO this comment is copied over and may need removal
+#  NOTE:  this doc may have genePred loads that fail to include
+#  the bin column.  Please correct that for the next build by adding
+#  a bin column when you make any of these tables:
+#
+
+###############################################
+# set up main genome directory
+
+    ssh hgwdev
+    cd /hive/data/genomes
+    mkdir danRer6
+    cd danRer6
+
+
+###########################################################################
+# DOWNLOAD SEQUENCE (DONE, 2009-06-17, galt)
+
+    cd /hive/data/genomes/danRer6
+    mkdir download
+    cd download
+
+    # get sequence from Sanger
+
+    wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv8release'
+
+    # matches ftp site sizes	
+    ls -l --block-size=K
+-rw-rw-r-- 1 galt protein       1K Apr  9 15:56 README
+-rw-rw-r-- 1 galt protein    3999K Dec 12  2008 Zv8_chr.agp
+-rw-rw-r-- 1 galt protein 1553865K Dec 12  2008 Zv8_contigs.fa
+-rw-rw-r-- 1 galt protein    6475K Dec 12  2008 Zv8_scaffold.agp
+-rw-rw-r-- 1 galt protein 1470539K Dec 12  2008 Zv8_scaffolds.fa
+-rw-rw-r-- 1 galt protein 1471257K Apr  9 15:55 Zv8_toplevel.fa
+
+    # Zv8_chr.agp maps contigs (in Zv8_contigs.fa) to chr1-25 
+    # Oddly enough, the contig names are 
+    # all in the form of scaffoldName.1, ... 
+    # so that any original contig name is lost.
+
+    # Zv8_scaffold.agp maps contigs to scaffolds, 
+    #  producing Zv8_scaffolds.fa
+
+    # Not entirely sure yet about Zv8_toplevel.fa
+    #  but it seems to be more scaffolds only.
+    # I think only Zv8_contigs.fa is useful fasta.
+
+    # There is nothing that maps the scaffolds to
+    # the chroms except that they gave the contig names
+    # like scaffold1.1 etc.  This connection via name 
+    # parts is not official agp structure, it's a hack.
+
+    # People want to see the detailed internal gap structure
+    # of the scaffolds even inside their chroms, so that
+    # creates a minor problem.
+
+    # Additionally, we want to include the chroms for sure,
+    # but we want to also include any other scaffolds not
+    # already included in the chroms. I am going to write
+    # a C program to merge the chrom and scaffold agps
+    # so that we don't duplicate scaffolds that are in 
+    # the chroms.  It will have to rely on the name 
+    # hack they provide to connect scaffold to chrom.
+
+    # This agp/fasta structure has existed at least since Zv7
+    # and I don't know if Sanger or others use it more 
+    # generally.
+
+    # I created a new program agpSangerUnfinished in kent/src/hg
+    # because Sanger wants to use unfinished clones but can't
+    # officially report it that way, so they hack their agp file
+    # but they don't change contigs.fa to match.
+    # compile it
+    cd kent/src/hg/agpSangerUnfinished
+    make
+    rehash
+
+    cd /hive/data/genomes/danRer6/download
+
+    # fix unfinished names and coordinates
+    agpSangerUnfinished Zv8_chr.agp Zv8_contigs.fa Zv8_chr.fix.agp
+    agpSangerUnfinished Zv8_scaffold.agp Zv8_contigs.fa Zv8_scaffold.fix.agp
+
+    # I created a new program agpMergeChromScaf in kent/src/hg
+    # as mentioned above for merging scaffold data 
+    # not already included in the chroms.
+    # compile it
+    cd kent/src/hg/agpMergeChromScaf
+    make
+    rehash
+
+    cd /hive/data/genomes/danRer6/download
+    agpMergeChromScaf Zv8_chr.fix.agp Zv8_scaffold.fix.agp danRer6.agp
+
+    # Now make final fasta from fixed/merged agp
+    agpAllToFaFile danRer6.agp Zv8_contigs.fa danRer6.fa
+
+    # check it out!
+    faToTwoBit danRer6.fa danRer6.2bit
+    checkAgpAndFa danRer6.agp danRer6.2bit >& checkAgpAndFa.log
+    tail -1 checkAgpAndFa.log
+#All AGP and FASTA entries agree - both files are valid
+
+