src/hg/makeDb/doc/felCatV17e.txt 1.2
1.2 2010/03/16 19:34:59 chinhli
Complete repeat mask, simple repeat, fix gap using comp.gap
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/makeDb/doc/felCatV17e.txt 5 Mar 2010 18:26:54 -0000 1.1
+++ src/hg/makeDb/doc/felCatV17e.txt 16 Mar 2010 19:34:59 -0000 1.2
@@ -1,15 +1,22 @@
# for emacs: -*- mode: sh; -*-
# $Id$
-# Marmoset sequence: http://panda.genomics.org.cn/page/panda/download.jsp
-# ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/
-# Callithrix_jacchus/Callithrix_jacchus-3.2
-# Callithrix jacchus
+# Felis Catus (domestic cat) -- NHGRI/GTB V17e/felCatV17e (2009-05-25)
+
+
+# file template copied from calJac3.txt
+
+
+# Felis catus (Project ID: 32759) by NHGRI/Genome Technology Branch [Draft
+# assembly] sequence:
+# ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/
+# Felis_catus/catChrV17e
+# Felis catus
##########################################################################
-# Download sequence (DONE - 2010-02-04 - Hiram)
+# Download sequence (DONE - 2010-03-03 Chin)
mkdir /hive/data/genomes/felCatV17e
cd /hive/data/genomes/felCatV17e
mkdir genbank
cd genbank
@@ -17,11 +24,12 @@
--no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/catChrV17e/*"
# FINISHED --09:05:15--
# Downloaded: 151 files, 1.3G in 7m 42s (2.98 MB/s)
+ # Read ASSEMBLY_INFO
mkdir ucscChr
- cd ucscChr
+ # stay at genbank directory
# fixup the accession names to become UCSC chrom names
S=Primary_Assembly/assembled_chromosomes
cut -f1 ${S}/chr2acc | while read C
@@ -31,8 +39,19 @@
zcat ${S}/AGP/chr${C}.agp.gz \
| sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz
done
+# use compenet instead
+S=Primary_Assembly/assembled_chromosomes
+cut -f1 ${S}/chr2acc | while read C
+do
+ ACC=`grep "${C}" ${S}/chr2acc | cut -f2`
+ echo "${ACC} -> chr${C}"
+ zcat ${S}/AGP/chr${C}.comp.agp.gz \
+ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.comp.agp.gz
+done
+
+
S=Primary_Assembly/assembled_chromosomes
cut -f1 ${S}/chr2acc | while read C
do
ACC=`grep "${C}" ${S}/chr2acc | cut -f2`
@@ -50,13 +69,15 @@
# lower) in 19 sequences in 19 files
# For unplaced scalfolds, named them as chrUn_xxxxxxxx
+ # where xxxxxx is the original access id as: chrUn_ACBE01000381.1
# and put it into chrUn.* files
+ # copy all the comment lines (start with #)
zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp
+ # append the gap records
zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \
| sed -e "s/^/chrUn_/" >> ucscChr/chrUn.agp
-
gzip ucscChr/chrUn.agp &
S=Primary_Assembly/unplaced_scaffolds
zcat ${S}/FASTA/unplaced.scaf.fa.gz \
@@ -60,8 +81,11 @@
S=Primary_Assembly/unplaced_scaffolds
zcat ${S}/FASTA/unplaced.scaf.fa.gz \
| sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" | gzip > ucscChr/chrUn.fa.gz
+ # about 104034 sequences in the unplaced
+zcat chrUn.fa.gz | grep "^>" | wc
+ # 104034 104034 2275648
# Check them with faSize
faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
# 287642232 bases (3696852 N's 283945380 real 283945380 upper 0
@@ -71,9 +95,9 @@
# lower) in 104034 sequences in 1 files
##########################################################################
-# Initial genome build (DONE - 2009-12-17 - Hiram)
+# Initial genome build (Done - 2010-03-03 -Chin)
cd /hive/data/genomes/felCatV17e
cat << '_EOF_' > felCatV17e.config.ra
# Config parameters for makeGenomeDb.pl:
@@ -101,5 +125,148 @@
time makeGenomeDb.pl -continue=db -stop=db felCatV17e.config.ra > db.log 2>&1 &
#real 7m50.591s
time makeGenomeDb.pl -continue=dbDb -stop=dbDb felCatV17e.config.ra > dbDb.log 2>&1 &
+time makeGenomeDb.pl -continue=trackDb felCatV17e.config.ra > trackDb.log 2>&1 &
+
+#### 03/15 fixing the agp using the component one
+# manually run hgLoadGapGl (as in jkStuff/makeDb.csh)
+# without rerun the makeGenomeDb -continue=gap step
+
+zcat genbank/ucscChr/chr*.comp.agp.gz | grep -v '^#' \
+ | sort -k1,1 -k2n,2n > felCatV17e.comp.agp
+
+zcat genbank/ucscChr/chrUn.agp.gz | grep -v '^#' >> felCatV17e.comp.agp
+grep chrM felCatV17e.agp >> felCatV17e.comp.agp
+checkAgpAndFa felCatV17e.comp.agp felCatV17e.unmasked.2bit
+ # loop: chrM, dnaOffset=0, seqSize=17009
+ # chrM 1 17009 230 F NC_001700 1 17009 +
+ # agpFrag->chromStart: 0, agpFrag->chromEnd: 17009, dnaOffset: 0
+ # FASTA sequence entry
+ # valid Fasta file entry
+ # all AGP and FASTA entries agree - both files are valid
+
+ dbSnoop felCatV17e before.txt
+hgGoldGapGl -noGl felCatV17e felCatV17e.comp.agp
+ dbSnoop felCatV17e after.txt
+ diff before.txt after.txt # to see the change of rows in gold and gap tables
+ # > 35,861,584 gold 506,133 24,249,424 11,612,160
+ # > 28,192,744 gap 500,507 22,068,200 6,124,544
+ # 26,27d27
+ # < 858,728 gold 9,961 539,240 319,488
+ # < 345,208 gap 4,335 236,664 108,544
+
+featureBits -countGaps felCatV17e gap
+ # 1169658726 bases of 3160303948 (37.011%) in intersection
+hgsql -e "select type from gap;" felCatV17e | sort | uniq -c
+ # 381 contig
+ # 500126 fragment
+ # 1 type
+
+
+
+##########################################################################
+# running repeat masker (DONE - 2010-03-05 - Chin)
+ mkdir /hive/data/genomes/felCatV17e/bed/repeatMasker
+ cd /hive/data/genomes/felCatV17e/bed/repeatMasker
+ time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
+ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
+ -smallClusterHub=encodek felCatV17e > do.log 2>&1 &
+ # real 530m25.613s
+
+ cat faSize.rmsk.txt
+ # 3160303948 bases (1169668943 N's 1990635005 real 1183631401
+ # upper 807003604 lower) in 104054 sequences in 1 files
+ # %25.54 masked total, %40.54 masked real
+
+##########################################################################
+# running simple repeat (DONE - 2010-03-16 Chin)
+ mkdir /hive/data/genomes/felCatV17e/bed/simpleRepeat
+ cd /hive/data/genomes/felCatV17e/bed/simpleRepeat
+ time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
+ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
+ felCatV17e > do.log 2>&1 &
+ # reall 149m19.948s
+ cat fb.simpleRepeat
+ # 49107705 bases of 2690950627 (1.825%) in intersection
+
+
+ cd /hive/data/genomes/felCatV17e
+ twoBitMask felCatV17e.rmsk.2bit \
+ -add bed/simpleRepeat/trfMask.bed felCatV17e.2bit
+ # you can safely ignore the warning about fields >= 13
+
+ twoBitToFa felCatV17e.2bit stdout | faSize stdin > faSize.felCatV17e.2bit.txt
+ cat faSize.felCatV17e.2bit.txt
+ # 3160303948 bases (1169668943 N's 1990635005 real 1182766163 upper
+ # 807868842 lower) in 104054 sequences in 1 files
+ # %25.56 masked total, %40.58 masked real
+
+ # double check with featureBits
+ featureBits -countGaps felCatV17e gap
+ # 1169668943 bases of 3160303948 (37.011%) in intersection
+
+ rm /gbdb/felCatV17e/felCatV17e.2bit
+ ln -s `pwd`/felCatV17e.2bit /gbdb/felCatV17e/felCatV17e.2bit
+
+
+
+########################################################################
+# Marking *all* gaps - they are not all in the AGP file
+# (Working - 2010-03-15 - Chin)
+ mkdir /hive/data/genomes/felCatV17e/bed/allGaps
+ cd /hive/data/genomes/felCatV17e/bed/allGaps
+
+ time nice -n +19 findMotif -motif=gattaca -verbose=4 \
+ -strand=+ ../../felCatV17e.rmsk.2bit > findMotif.txt 2>&1
+ # real 1m45.351s
+
+ grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
+ featureBits felCatV17e -not gap -bed=notGap.bed
+ # 1990645222 bases of 1990645222 (100.000%) in intersection
+ featureBits felCatV17e allGaps.bed notGap.bed -bed=new.gaps.bed
+ # 10217 bases of 1990645222 (0.001%) in intersection
+ # (real time 7+ hours)
+ # what is the last index in the existing gap table:
+ hgsql -N -e "select ix from gap;" felCatV17e | sort -n | tail -1
+ # 101050
+
+#### here here 03/16
+ cat << '_EOF_' > mkGap.pl
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $ix=`hgsql -N -e "select ix from gap;" felCatV17e | sort -n | tail -1`;
+chomp $ix;
+
+open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
+while (my $line = <FH>) {
+ my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
+ ++$ix;
+ printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
+ $chromEnd, $ix, $chromEnd-$chromStart;
+}
+close (FH);
+'_EOF_'
+ # << happy emacs
+ chmod +x ./mkGap.pl
+ ./mkGap.pl > other.gap
+ hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
+ -noLoad felCatV17e otherGap other.gap
+ # Reading other.gap
+ # Loaded 1294 elements of size 8
+ # Sorted
+ # Saving bed.tab
+ # No load option selected, see file: bed.tab
+ wc -l bed.tab
+ # 1294 bed.tab
+ # starting with this many
+ hgsql -e "select count(*) from gap;" felCatV17e
+ # 500507
+ hgsql felCatV17e -e 'load data local infile "bed.tab" into table gap;'
+ # result count:
+ hgsql -e "select count(*) from gap;" felCatV17e
+ # 501801
+ # == 500507 + 1294