src/hg/makeDb/doc/hg19.txt 1.1
1.1 2009/03/05 19:52:55 hiram
Initial database established
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: src/hg/makeDb/doc/hg19.txt
diff -N src/hg/makeDb/doc/hg19.txt
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/hg19.txt 5 Mar 2009 19:52:55 -0000 1.1
@@ -0,0 +1,142 @@
+# for emacs: -*- mode: sh; -*-
+
+# This file describes how we made the browser database on
+# NCBI build 37 (February 2009 freeze) aka:
+# GRCh37 - Genome Reference Consortium Human Reference 37
+# Assembly Accession: GCA_000001405.1
+
+# "$Id$";
+
+#############################################################################
+# Download sequence (DONE - 2009-02-04 - Hiram)
+ mkdir -p /hive/data/genomes/hg19/download   # working area for everything fetched in this section
+ cd /hive/data/genomes/hg19/download
+ mkdir -p assembled_chromosomes   # mirror target for Primary_Assembly/assembled_chromosomes
+ wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
+ --directory-prefix=assembled_chromosomes \
+ -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
+ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/assembled_chromosomes
+
+ mkdir -p alternate_loci   # mirror target for ALT_REF_LOCI_1 through ALT_REF_LOCI_9
+for N in 1 2 3 4 5 6 7 8 9   # one remote ALT_REF_LOCI_N directory per pass
+do   # --cut-dirs=6 here, not 8: the URL has seven path components and the last one (ALT_REF_LOCI_N) is kept
+wget --cut-dirs=6 --no-parent --timestamping --no-remove-listing -m \
+ --directory-prefix=alternate_loci \
+ -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
+ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/ALT_REF_LOCI_${N}
+done
+
+ mkdir -p unlocalized_scaffolds   # mirror target for Primary_Assembly/unlocalized_scaffolds
+ wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
+ --directory-prefix=unlocalized_scaffolds \
+ -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
+ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unlocalized_scaffolds
+
+ mkdir -p unplaced_scaffolds   # mirror target for Primary_Assembly/unplaced_scaffolds
+ wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
+ --directory-prefix=unplaced_scaffolds \
+ -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
+ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/unplaced_scaffolds
+
+ mkdir -p placed_scaffolds   # mirror target for Primary_Assembly/placed_scaffolds
+ wget --cut-dirs=8 --no-parent --timestamping --no-remove-listing -m \
+ --directory-prefix=placed_scaffolds \
+ -nH --ftp-user=anonymous --ftp-password=yourEmail@your.domain \
+ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/Primary_Assembly/placed_scaffolds
+
+ mkdir ucscChr
+ cd ucscChr   # everything below is written here; inputs are reached through ../
+ for F in ../assembled_chromosomes/FASTA/chr*.fa
+do
+ C=`basename $F`
+ C=${C/.fa}   # file name without .fa; used only for the progress echo on the next line
+ echo -n "${C} "
+ H=`head -1 "${F}"`   # first line of the FASTA file, i.e. its header
+ chrN=`echo $H | sed -e "s/.*Homo sapiens chromosome /chr/; s/, .*//"`   # chr name built from the header text, cut at the first comma
+ A=`echo $H | sed -e "s/. Homo.*//; s/.*gb.//"`   # what sits between the gb field marker and the Homo text; presumably the GenBank accession, verify against a real header
+ echo $chrN $A
+ grep -v "^#" ../assembled_chromosomes/AGP/${chrN}.comp.agp \
+ | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
+ echo ">${chrN}" > ${chrN}.fa   # new header is just the chr name; sequence lines are appended next
+ grep -v "^>" ../assembled_chromosomes/FASTA/${chrN}.fa >> ${chrN}.fa
+done
+
+ rm -f scaffolds.agp   # rebuilt from scratch by the three loops that follow
+ find ../alternate_loci -type f | grep ".agp$" | while read F
+do   # GL accession to UCSC haplotype name; the seven chr6 haplotypes are numbered hap1 through hap7
+ grep "^GL" $F | sed -e \
+"s/^GL000250.1/chr6_apd_hap1/" -e \
+"s/^GL000251.1/chr6_cox_hap2/" -e \
+"s/^GL000252.1/chr6_dbb_hap3/" -e \
+"s/^GL000253.1/chr6_mann_hap4/" -e \
+"s/^GL000254.1/chr6_mcf_hap5/" -e \
+"s/^GL000255.1/chr6_qbl_hap6/" -e \
+"s/^GL000256.1/chr6_ssto_hap7/" -e \
+"s/^GL000257.1/chr4_ctg9_hap1/" -e \
+"s/^GL000258.1/chr17_ctg5_hap1/"
+done > scaffolds.agp
+
+ find ../unlocalized_scaffolds -type f | grep ".agp$" \
+| while read F
+do
+ C=`basename ${F}`
+ C=${C/.unlocalized.scaf.agp}   # chromosome prefix taken from the file name, assuming files are named chrN.unlocalized.scaf.agp
+ grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/${C}_gl\1_random/"
+done >> scaffolds.agp
+
+ find ../unplaced_scaffolds -type f | grep ".agp$" \
+| while read F
+do
+ grep "^GL" ${F} | sed -e "s/^GL\([0-9]*\).1/chrUn_gl\1/"
+done >> scaffolds.agp
+
+ rm -f scaffolds.fa   # rebuilt from scratch; later loops append to it
+ find ../alternate_loci -type f | grep ".fa$" | while read F
+do   # same accession to name table as for scaffolds.agp; FASTA and AGP names must agree
+ sed -e \
+"s/>.*GL000250.*/>chr6_apd_hap1/" -e \
+"s/>.*GL000251.*/>chr6_cox_hap2/" -e \
+"s/>.*GL000252.*/>chr6_dbb_hap3/" -e \
+"s/>.*GL000253.*/>chr6_mann_hap4/" -e \
+"s/>.*GL000254.*/>chr6_mcf_hap5/" -e \
+"s/>.*GL000255.*/>chr6_qbl_hap6/" -e \
+"s/>.*GL000256.*/>chr6_ssto_hap7/" -e \
+"s/>.*GL000257.*/>chr4_ctg9_hap1/" -e \
+"s/>.*GL000258.*/>chr17_ctg5_hap1/" ${F}
+done > scaffolds.fa
+
+ find ../unlocalized_scaffolds -type f | grep ".fa$" | while read F
+do   # header lines become chrN_glDIGITS_random, using the GL digits and the number that follows the word chromosome
+ sed -e \
+"s/^>.*GL\([0-9]*\).* chromosome \([0-9]*\).*/>chr\2_gl\1_random/" ${F}
+done >> scaffolds.fa
+
+ find ../unplaced_scaffolds -type f | grep ".fa$" | while read F
+do   # first rule reduces any line containing GL plus digits to that token; second rule turns it into a chrUn_gl header
+ sed -e "s/.*\(GL[0-9]*\).*/\1/; s/GL/>chrUn_gl/" $F
+done >> scaffolds.fa
+
+############################################################################
+## Create database (DONE - 2009-03-04 - Hiram)
+ cd /hive/data/genomes/hg19   # hg19.config.ra and makeGenomeDb.log are created in this directory
+ cat << '_EOF_' > hg19.config.ra
+# Config parameters for makeGenomeDb.pl:
+db hg19
+scientificName Homo sapiens
+commonName Human
+assemblyDate Feb. 2009
+assemblyLabel GRCh37 Genome Reference Consortium Human Reference 37 (GCA_000001405.1)
+orderKey 14
+mitoAcc NC_001807
+fastaFiles /hive/data/genomes/hg19/download/ucscChr/*.fa
+agpFiles /hive/data/genomes/hg19/download/ucscChr/*.agp
+# qualFiles /dev/null
+dbDbSpeciesDir human
+taxId 9606
+'_EOF_'
+ # << happy emacs
+
+ time makeGenomeDb.pl hg19.config.ra > makeGenomeDb.log 2>&1   # stdout and stderr of the build both go to makeGenomeDb.log
+ # real 14m8.958s
+
+############################################################################