src/hg/makeDb/doc/hg19.txt 1.54
1.54 2009/11/02 18:24:53 braney
started building hgPal downloads
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.53
retrieving revision 1.54
diff -b -B -U 4 -r1.53 -r1.54
--- src/hg/makeDb/doc/hg19.txt 29 Oct 2009 23:13:02 -0000 1.53
+++ src/hg/makeDb/doc/hg19.txt 2 Nov 2009 18:24:53 -0000 1.54
@@ -7710,4 +7710,157 @@
# remove temporary files.
rm j*
+
+#########################################################################
+# hgPal downloads (working - 2009-11-02 - braney)
+# FASTA from 46way for refGene, knownGene, knownCanonical
+
+ ssh hgwdev
+ screen
+ bash
+ rm -rf /cluster/data/hg19/bed/multiz46way/pal
+ mkdir /cluster/data/hg19/bed/multiz46way/pal
+ cd /cluster/data/hg19/bed/multiz46way/pal
+ for i in `cat ../species.list`; do echo $i; done > order.lst
+
+ mz=multiz46way
+ gp=refGene
+ db=hg19
+ mkdir exonAA exonNuc ppredAA ppredNuc
+ for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
+ do
+ echo "date"
+ echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
+ gzip -c > ppredAA/$j.ppredAA.fa.gz"
+ echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
+ gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
+ echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
+ gzip -c > exonNuc/$j.exonNuc.fa.gz"
+ echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
+ gzip -c > exonAA/$j.exonAA.fa.gz"
+ done > $gp.jobs
+
+ time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
+ sleep 1
+ tail -f $gp.jobs.log
+
+# real 525m57.376s
+# user 25m36.072s
+# sys 7m41.565s
+
+ ssh kolossus
+ mz=multiz46way
+ gp=refGene
+ db=hg19
+ zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
+ zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
+ zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
+ zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
+
+ rm -rf exonAA exonNuc ppredAA ppredNuc
+
+ # we're only distributing exons at the moment
+ mz=multiz46way
+ gp=refGene
+ db=hg19
+ pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
+ mkdir -p $pd
+ ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
+ ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
+
+ mz=multiz46way
+ gp=knownGene
+ db=hg19
+ mkdir exonAA exonNuc ppredAA ppredNuc
+ for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
+ do
+ echo "date"
+ echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
+ gzip -c > ppredAA/$j.ppredAA.fa.gz"
+ echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
+ gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
+ echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
+ gzip -c > exonNuc/$j.exonNuc.fa.gz"
+ echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
+ gzip -c > exonAA/$j.exonAA.fa.gz"
+ done > $gp.$mz.jobs
+
+ time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
+ sleep 1
+ tail -f $gp.$mz.job.log
+
+# real 442m46.735s
+# user 43m3.060s
+# sys 10m45.635s
+
+
+ mz=multiz46way
+ gp=knownGene
+ db=hg19
+
+ zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
+ zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
+ zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
+ zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
+
+ rm -rf exonAA exonNuc ppredAA ppredNuc
+
+ mz=multiz46way
+ gp=knownGene
+ db=hg19
+ pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
+ mkdir -p $pd
+ ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
+ ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
+
+ # now do the canonical set
+ cd /cluster/data/hg19/bed/multiz46way/pal
+ mz=multiz46way
+ gp=knownCanonical
+ db=hg19
+ for j in `awk '{print $1}' /cluster/data/hg19/chrom.sizes`
+ do
+ echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
+ done
+
+ mkdir exonAA exonNuc ppredAA ppredNuc
+ for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
+ do
+ echo "date"
+ echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \
+ gzip -c > ppredAA/$j.ppredAA.fa.gz"
+ echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
+ gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
+ echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
+ gzip -c > exonNuc/$j.exonNuc.fa.gz"
+ echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
+ gzip -c > exonAA/$j.exonAA.fa.gz"
+ done > $gp.$mz.jobs
+
+ time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
+ sleep 1
+ tail -f $gp.$mz.job.log
+
+# real 326m12.849s
+# user 17m40.850s
+# sys 3m59.648s
+
+ rm *.known.bed
+ mz=multiz46way
+ gp=knownCanonical
+ db=hg19
+ zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
+ zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
+ zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
+ zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
+
+ rm -rf exonAA exonNuc ppredAA ppredNuc
+
+ mz=multiz46way
+ gp=knownCanonical
+ db=hg19
+ pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
+ mkdir -p $pd
+ ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
+ ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz