src/hg/makeDb/doc/felCatV17e.txt 1.1
1.1 2010/03/05 18:26:54 chinhli
New support for felCatV17e
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: src/hg/makeDb/doc/felCatV17e.txt
diff -N src/hg/makeDb/doc/felCatV17e.txt
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/felCatV17e.txt 5 Mar 2010 18:26:54 -0000 1.1
@@ -0,0 +1,105 @@
+# for emacs: -*- mode: sh; -*-
+
+# $Id$
+
+# Cat sequence:
+# ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/
+# Felis_catus/catChrV17e
+# Felis catus
+
+##########################################################################
+# Download sequence (DONE - 2010-02-04 - Hiram)
+ # Mirror the GenBank release of the assembly into genbank/.
+ # --cut-dirs=6 strips the six leading remote directories, so the
+ # release's own tree (Primary_Assembly/...) lands directly in genbank/.
+ mkdir /hive/data/genomes/felCatV17e
+ cd /hive/data/genomes/felCatV17e
+ mkdir genbank
+ cd genbank
+wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
+ --no-remove-listing -np \
+"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Felis_catus/catChrV17e/*"
+ # FINISHED --09:05:15--
+ # Downloaded: 151 files, 1.3G in 7m 42s (2.98 MB/s)
+
+ # Output directory for the UCSC-named copies of the AGP/FASTA files.
+ # Stay in genbank/ (no "cd ucscChr"): the commands that follow read
+ # Primary_Assembly/... and write ucscChr/... relative to genbank/.
+ mkdir ucscChr
+ # fixup the accession names to become UCSC chrom names
+
+S=Primary_Assembly/assembled_chromosomes
+cut -f1 ${S}/chr2acc | while read C
+do
+ ACC=`grep "${C}" ${S}/chr2acc | cut -f2`
+ echo "${ACC} -> chr${C}"
+ zcat ${S}/AGP/chr${C}.agp.gz \
+ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz
+done
+
+S=Primary_Assembly/assembled_chromosomes
+cut -f1 ${S}/chr2acc | while read C
+do
+ ACC=`grep "${C}" ${S}/chr2acc | cut -f2`
+ echo "${ACC} -> chr${C}"
+ echo ">chr${C}" > ucscChr/chr${C}.fa
+ zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa
+ gzip ucscChr/chr${C}.fa &
+done
+ # Check them with faSize
+ faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz
+ # 2872644707 bases (1165972091 N's 1706672616 real 1706672616 upper 0
+ # lower) in 19 sequences in 19 files
+ faSize ucscChr/chr*.fa.gz
+ # 2872644707 bases (1165972091 N's 1706672616 real 1706672616 upper 0
+ # lower) in 19 sequences in 19 files
+
+
+ # For unplaced scaffolds, name them chrUn_xxxxxxxx
+ # and put them into chrUn.* files
+ # S must point at the unplaced_scaffolds tree before the AGP is read.
+ # The AGP location is assumed to parallel the FASTA location checked
+ # with faSize at the end of this step -- confirm against the download.
+S=Primary_Assembly/unplaced_scaffolds
+# AGP: keep the '#' header lines unchanged, prefix every data line
+# with chrUn_
+zcat "${S}/AGP/unplaced.scaf.agp.gz" | grep "^#" > ucscChr/chrUn.agp
+zcat "${S}/AGP/unplaced.scaf.agp.gz" | grep -v "^#" \
+ | sed -e "s/^/chrUn_/" >> ucscChr/chrUn.agp
+
+# Compress in the foreground so chrUn.agp.gz is complete before any
+# later step picks it up.
+gzip ucscChr/chrUn.agp
+
+# FASTA: replace everything up to and including "|gb|" in the header
+# with ">chrUn_", then drop the rest of the header from the next '|' on.
+zcat "${S}/FASTA/unplaced.scaf.fa.gz" \
+ | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" | gzip > ucscChr/chrUn.fa.gz
+
+ # Check them with faSize
+ faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
+ # 287642232 bases (3696852 N's 283945380 real 283945380 upper 0
+ # lower) in 104034 sequences in 1 files
+ faSize ucscChr/chrUn.fa.gz
+ # 287642232 bases (3696852 N's 283945380 real 283945380 upper 0
+ # lower) in 104034 sequences in 1 files
+
+
+##########################################################################
+# Initial genome build (DONE - 2009-12-17 - Hiram)
+# NOTE(review): this date is earlier than the download step recorded in
+# this file (2010-02-04) -- confirm the date and name.
+ cd /hive/data/genomes/felCatV17e
+
+ # Write the makeGenomeDb.pl configuration. The delimiter is quoted, so
+ # the text is written literally with no shell expansion; in sh/bash the
+ # closing line must be the bare word _EOF_ (quotes removed), otherwise
+ # the here-document would run on into the commands that follow.
+ cat << '_EOF_' > felCatV17e.config.ra
+# Config parameters for makeGenomeDb.pl:
+db felCatV17e
+clade mammal
+genomeCladePriority 16
+scientificName Felis catus
+commonName Cat
+assemblyDate Dec. 2008
+assemblyLabel NHGRI/Genome Technology Branch (NCBI project 10703, accession ACBE0100000)
+assemblyShortLabel NHGRI/GTB V17e
+orderKey 216
+mitoAcc NC_001700
+fastaFiles /hive/data/genomes/felCatV17e/genbank/ucscChr/chr*.fa.gz
+agpFiles /hive/data/genomes/felCatV17e/genbank/ucscChr/chr*.agp.gz
+# qualFiles none
+dbDbSpeciesDir cat
+taxId 9685
+_EOF_
+
+
+makeGenomeDb.pl -stop seq felCatV17e.config.ra > seq.log 2>&1 &
+time makeGenomeDb.pl -continue=agp -stop=agp felCatV17e.config.ra > agp.log 2>&1 &
+# real 0m50.486s
+time makeGenomeDb.pl -continue=db -stop=db felCatV17e.config.ra > db.log 2>&1 &
+#real 7m50.591s
+
+time makeGenomeDb.pl -continue=dbDb -stop=dbDb felCatV17e.config.ra > dbDb.log 2>&1 &
+