src/hg/makeDb/doc/calJac3.txt 1.1

1.1 2010/02/04 23:49:59 hiram
Downloading new Marmoset assembly
Index: src/hg/makeDb/doc/calJac3.txt
===================================================================
RCS file: src/hg/makeDb/doc/calJac3.txt
diff -N src/hg/makeDb/doc/calJac3.txt
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/calJac3.txt	4 Feb 2010 23:49:59 -0000	1.1
@@ -0,0 +1,62 @@
+# for emacs: -*- mode: sh; -*-
+
+#	$Id$
+
+# Marmoset sequence: http://panda.genomics.org.cn/page/panda/download.jsp
+# ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/
+#	Callithrix_jacchus/Callithrix_jacchus-3.2
+#	Callithrix jacchus
+
+##########################################################################
+# Download sequence (DONE - 2010-02-04 - Hiram)
+    #	Create the assembly build directory and its genbank/ download area
+    #	(parent listed first so it exists when the child is created).
+    mkdir /hive/data/genomes/calJac3 /hive/data/genomes/calJac3/genbank
+    cd /hive/data/genomes/calJac3/genbank
+    #	Recursively mirror the NCBI assembly directory with no depth limit.
+    #	The host name and the six leading remote path components are
+    #	dropped, so the remote tree is recreated directly under genbank/.
+    wget --timestamping --recursive --level=0 --cut-dirs=6 \
+	--no-host-directories --force-directories \
+	--no-remove-listing --no-parent \
+"ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/Callithrix_jacchus/Callithrix_jacchus-3.2/*"
+
+    mkdir ucscChr
+    cd ucscChr
+    #	Fix up the accession names to become UCSC chrom names.
+    #	Unplaced scaffolds: the accession that follows "gb|" in each FASTA
+    #	header, and the accession at the start of each AGP line, becomes
+    #	chrUn_<accession> with the ".1" version suffix dropped.  The dot is
+    #	escaped so that only a literal ".1" is accepted as the suffix.
+zcat ../Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
+    | sed -e "s/^>.*gb|\([A-Z]*[0-9]*\)\.1.*/>chrUn_\1/" > chrUn.fa
+
+zcat ../Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
+    | sed -e "s/^\([A-Z]*[0-9]*\)\.1/chrUn_\1/" > chrUn.agp
+
+# Unlocalized scaffolds, FASTA: each sequence is renamed to
+# <chrom>_<accession>_random, where <chrom> is the file name with the
+# ".unlocalized.scaf.fa.gz" suffix removed and <accession> is the name
+# following "gb|" in the header, minus its ".1" version suffix.
+# Every matching file is processed, so chr_randoms.fa covers the same
+# set of chromosomes as the AGP files that go into chr_randoms.agp.
+find ../Primary_Assembly/unlocalized_scaffolds/FASTA -type f \
+        | grep "unlocalized\.scaf\.fa\.gz$" | while read -r F
+do
+    C=$(basename "${F}")
+    C=${C%.unlocalized.scaf.fa.gz}
+    zcat "${F}" | sed -e "s/^>.*gb|\([A-Z]*[0-9]*\)\.1.*/>${C}_\1_random/"
+done > chr_randoms.fa
+
+# Unlocalized scaffolds, AGP: the accession at the start of each line is
+# renamed to <chrom>_<accession>_random, where <chrom> is the file name
+# with the ".unlocalized.scaf.agp.gz" suffix removed.
+find ../Primary_Assembly/unlocalized_scaffolds/AGP -type f \
+        | grep "\.agp\.gz$" | while read -r F
+do
+    C=$(basename "${F}")
+    # Progress messages go to stderr: stdout of this loop is redirected
+    # into chr_randoms.agp and must contain nothing but AGP lines.
+    printf '%s ' "${C}" 1>&2
+    C=${C%.unlocalized.scaf.agp.gz}
+    printf '%s\n' "${C}" 1>&2
+    zcat "${F}" | sed -e "s/^\([A-Z]*[0-9]*\)\.1/${C}_\1_random/"
+done > chr_randoms.agp
+
+AC="../Primary_Assembly/assembled_chromosomes"
+for F in ${AC}/FASTA/chr*.fa.gz
+do
+    C=`basename $F`
+    C=${C/.fa.gz}
+    echo -n "${C} "
+    H=`zcat "${F}" | head -1`
+    chrN=`echo $H | sed -e "s/.*Callithrix jacchus chromosome /chr/; s/,
+.*//"`
+    A=`echo $H | sed -e "s/. Callithrix.*//; s/.*gb.//"`
+    echo $chrN $A
+    zcat ${AC}/AGP/${chrN}.comp.agp.gz \
+        | sed -e "s/^${A}/${chrN}/" > ${chrN}.agp
+    echo ">${chrN}" > ${chrN}.fa
+    zcat ${AC}/FASTA/${chrN}.fa.gz | grep -v "^>" >> ${chrN}.fa
+done