src/hg/makeDb/doc/felCatV17e.txt 1.2

1.2 2010/03/16 19:34:59 chinhli
Complete repeat mask, simple repeat, fix gap using comp.gap
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/makeDb/doc/felCatV17e.txt	5 Mar 2010 18:26:54 -0000	1.1
+++ src/hg/makeDb/doc/felCatV17e.txt	16 Mar 2010 19:34:59 -0000	1.2
@@ -1,15 +1,22 @@
 # for emacs: -*- mode: sh; -*-
 
 #       $Id$
 
-# Marmoset sequence: http://panda.genomics.org.cn/page/panda/download.jsp
-# ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/
-#       Callithrix_jacchus/Callithrix_jacchus-3.2
-#       Callithrix jacchus
+# Felis Catus (domestic cat) --  NHGRI/GTB V17e/felCatV17e (2009-05-25)
+  
+
+# file template copied from calJac3.txt
+
+
+# Felis catus (Project ID: 32759) by NHGRI/Genome Technology Branch [Draft
+#    assembly] sequence: 
+# ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/
+#       Felis_catus/catChrV17e
+#       Felis catus
 
 ##########################################################################
-# Download sequence (DONE - 2010-02-04 - Hiram)
+# Download sequence (DONE - 2010-03-03 Chin)
     mkdir /hive/data/genomes/felCatV17e
     cd /hive/data/genomes/felCatV17e
     mkdir genbank
     cd genbank
@@ -17,11 +24,12 @@
         --no-remove-listing -np \
 "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/catChrV17e/*"
     # FINISHED --09:05:15--
     # Downloaded: 151 files, 1.3G in 7m 42s (2.98 MB/s)
+    # Read ASSEMBLY_INFO 
 
     mkdir ucscChr
-    cd ucscChr
+    # stay at genbank directory
     #   fixup the accession names to become UCSC chrom names
 
 S=Primary_Assembly/assembled_chromosomes
 cut -f1 ${S}/chr2acc  | while read C
@@ -31,8 +39,19 @@
     zcat ${S}/AGP/chr${C}.agp.gz \
         | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz
 done
 
+# use compenet instead 
+S=Primary_Assembly/assembled_chromosomes
+cut -f1 ${S}/chr2acc  | while read C
+do
+    ACC=`grep "${C}" ${S}/chr2acc | cut -f2`
+    echo "${ACC} -> chr${C}"
+    zcat ${S}/AGP/chr${C}.comp.agp.gz \
+        | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.comp.agp.gz
+done
+
+
 S=Primary_Assembly/assembled_chromosomes
 cut -f1 ${S}/chr2acc  | while read C
 do
     ACC=`grep "${C}" ${S}/chr2acc | cut -f2`
@@ -50,13 +69,15 @@
    #        lower) in 19 sequences in 19 files
 
 
    # For unplaced scalfolds, named them as chrUn_xxxxxxxx
+   #   where xxxxxx is the original access id as: chrUn_ACBE01000381.1 
    # and put it into chrUn.* files 
+   # copy all the comment lines (start with #)
 zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp
+   # append the gap records
 zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \
         | sed -e "s/^/chrUn_/" >> ucscChr/chrUn.agp
-
 gzip ucscChr/chrUn.agp &
 
 S=Primary_Assembly/unplaced_scaffolds
 zcat ${S}/FASTA/unplaced.scaf.fa.gz \
@@ -60,8 +81,11 @@
 
 S=Primary_Assembly/unplaced_scaffolds
 zcat ${S}/FASTA/unplaced.scaf.fa.gz \
         | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" | gzip > ucscChr/chrUn.fa.gz
+   # about 104034 sequences in the unplaced
+zcat chrUn.fa.gz | grep "^>" | wc
+   #  104034  104034 2275648
 
    # Check them with faSize 
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz 
    # 287642232 bases (3696852 N's 283945380 real 283945380 upper 0 
@@ -71,9 +95,9 @@
    #        lower) in 104034 sequences in 1 files
 
 
 ##########################################################################
-# Initial genome build (DONE - 2009-12-17 - Hiram)
+# Initial genome build (Done - 2010-03-03 -Chin)
     cd /hive/data/genomes/felCatV17e
 
     cat << '_EOF_' > felCatV17e.config.ra
 # Config parameters for makeGenomeDb.pl:
@@ -101,5 +125,148 @@
 time makeGenomeDb.pl -continue=db -stop=db felCatV17e.config.ra > db.log 2>&1 &
 #real    7m50.591s
 
 time makeGenomeDb.pl -continue=dbDb -stop=dbDb felCatV17e.config.ra > dbDb.log 2>&1 &
+time makeGenomeDb.pl -continue=trackDb felCatV17e.config.ra > trackDb.log 2>&1 &
+
+#### 03/15 fixing the agp using the component one
+#          manually run hgLoadGapGl (as in jkStuff/makeDb.csh) 
+#          without rerun the makeGenomeDb -continue=gap step
+ 
+zcat genbank/ucscChr/chr*.comp.agp.gz | grep -v '^#' \
+    | sort -k1,1 -k2n,2n > felCatV17e.comp.agp
+
+zcat genbank/ucscChr/chrUn.agp.gz  | grep -v '^#' >> felCatV17e.comp.agp
+grep chrM felCatV17e.agp >> felCatV17e.comp.agp
+checkAgpAndFa felCatV17e.comp.agp felCatV17e.unmasked.2bit
+    # loop: chrM, dnaOffset=0, seqSize=17009
+    # chrM 1 17009 230 F NC_001700 1 17009 +
+    # agpFrag->chromStart: 0, agpFrag->chromEnd: 17009, dnaOffset: 0
+    # FASTA sequence entry
+    # valid Fasta file entry
+    # all AGP and FASTA entries agree - both files are valid
+
+  dbSnoop felCatV17e before.txt 
+hgGoldGapGl -noGl felCatV17e felCatV17e.comp.agp
+  dbSnoop felCatV17e after.txt
+  diff before.txt after.txt # to see the change of rows in gold and gap tables
+    # > 35,861,584    gold    506,133 24,249,424      11,612,160
+    # > 28,192,744    gap     500,507 22,068,200      6,124,544
+    # 26,27d27
+    # < 858,728       gold    9,961   539,240 319,488
+    # < 345,208       gap     4,335   236,664 108,544
+
+featureBits -countGaps felCatV17e gap
+    # 1169658726 bases of 3160303948 (37.011%) in intersection
+hgsql -e "select type from gap;" felCatV17e | sort | uniq -c
+    #	381 contig
+    # 500126 fragment
+    #	1 type
+
+
+
+##########################################################################
+# running repeat masker (DONE - 2010-03-05 - Chin)
+    mkdir /hive/data/genomes/felCatV17e/bed/repeatMasker
+    cd /hive/data/genomes/felCatV17e/bed/repeatMasker
+    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
+        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
+        -smallClusterHub=encodek felCatV17e > do.log 2>&1 &
+    # real    530m25.613s
+
+    cat faSize.rmsk.txt
+    # 3160303948 bases (1169668943 N's 1990635005 real 1183631401 
+    # upper 807003604 lower) in 104054 sequences in 1 files
+    # %25.54 masked total, %40.54 masked real
+
+##########################################################################
+# running simple repeat (DONE - 2010-03-16 Chin)
+    mkdir /hive/data/genomes/felCatV17e/bed/simpleRepeat
+    cd /hive/data/genomes/felCatV17e/bed/simpleRepeat
+    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
+        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
+        felCatV17e > do.log 2>&1 &
+    #  reall    149m19.948s
+    cat fb.simpleRepeat
+    #   49107705 bases of 2690950627 (1.825%) in intersection
+
+
+    cd /hive/data/genomes/felCatV17e
+    twoBitMask felCatV17e.rmsk.2bit \
+        -add bed/simpleRepeat/trfMask.bed felCatV17e.2bit
+    #   you can safely ignore the warning about fields >= 13
+
+    twoBitToFa felCatV17e.2bit stdout | faSize stdin > faSize.felCatV17e.2bit.txt
+    cat faSize.felCatV17e.2bit.txt
+    #  3160303948 bases (1169668943 N's 1990635005 real 1182766163 upper
+    #   807868842 lower) in 104054 sequences in 1 files
+    #   %25.56 masked total, %40.58 masked real
+
+    # double check with featureBits
+    featureBits -countGaps felCatV17e gap
+    #  1169668943 bases of 3160303948 (37.011%) in intersection
+
+    rm /gbdb/felCatV17e/felCatV17e.2bit
+    ln -s `pwd`/felCatV17e.2bit /gbdb/felCatV17e/felCatV17e.2bit
+
+
+
+########################################################################
+# Marking *all* gaps - they are not all in the AGP file
+#       (Working - 2010-03-15 - Chin)
+    mkdir /hive/data/genomes/felCatV17e/bed/allGaps
+    cd /hive/data/genomes/felCatV17e/bed/allGaps
+
+    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
+        -strand=+ ../../felCatV17e.rmsk.2bit > findMotif.txt 2>&1
+    #   real    1m45.351s
+
+    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
+    featureBits felCatV17e -not gap -bed=notGap.bed
+    #  1990645222 bases of 1990645222 (100.000%) in intersection
+    featureBits felCatV17e allGaps.bed notGap.bed -bed=new.gaps.bed
+    #  10217 bases of 1990645222 (0.001%) in intersection
+    #    (real time 7+ hours)
+    #   what is the last index in the existing gap table:
+    hgsql -N -e "select ix from gap;" felCatV17e | sort -n | tail -1
+    #   101050
+
+#### here here 03/16
+    cat << '_EOF_' > mkGap.pl
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $ix=`hgsql -N -e "select ix from gap;" felCatV17e | sort -n | tail -1`;
+chomp $ix;
+
+open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
+while (my $line = <FH>) {
+    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
+    ++$ix;
+    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
+        $chromEnd, $ix, $chromEnd-$chromStart;
+}
+close (FH);
+'_EOF_'
+    # << happy emacs
+    chmod +x ./mkGap.pl
+    ./mkGap.pl > other.gap
+    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
+        -noLoad felCatV17e otherGap other.gap
+    #   Reading other.gap
+    #   Loaded 1294 elements of size 8
+    #   Sorted
+    #   Saving bed.tab
+    #   No load option selected, see file: bed.tab
+    wc -l bed.tab
+    #   1294 bed.tab
+    #   starting with this many
+    hgsql -e "select count(*) from gap;" felCatV17e
+    #   500507
+    hgsql felCatV17e -e 'load data local infile "bed.tab" into table gap;'
+    #   result count:
+    hgsql -e "select count(*) from gap;" felCatV17e
+    #   501801
+    # == 500507 + 1294