src/hg/makeDb/doc/h1n1.txt 1.5

1.5 2009/05/15 22:46:08 fanhsu
Added the latest h1n1_0514Seq track build process, with good CDS display.
Index: src/hg/makeDb/doc/h1n1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/h1n1.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/makeDb/doc/h1n1.txt	15 May 2009 22:44:54 -0000	1.4
+++ src/hg/makeDb/doc/h1n1.txt	15 May 2009 22:46:08 -0000	1.5
@@ -188,4 +188,95 @@
 
 # load the psl result into humanHA table
     hgLoadPsl h1n1 swineHA.psl
 
+#######################################################################################
+# CREATE h1n1_0514Seq TRACK (DONE.  5/15/09, Fan)
+
+    mkdir -p /hive/data/genomes/h1n1/bed/h1n1_0514Seq
+    cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq
+
+# create .fa files from the downloaded .fasta files (dos2unix converts the DOS line endings)
+
+    mkdir -p /gbdb/h1n1/h1n1_0514Seq
+    cd ../../download/0514
+    
+    dos2unix < epiflu_0514_dna_sequence.fasta > epiflu_0514_dna_sequence.fa 
+    dos2unix < epiflu_0514_protein_sequence.fasta > epiflu_0514_protein_sequence.fa 
+    
+    cp -p ../../download/0514/epiflu_0514_dna_sequence.fa /gbdb/h1n1/h1n1_0514Seq
+
+    cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq
+
+    hgLoadSeq -replace h1n1 /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa 
+
+# BLAT the DNA sequences; make sure the port number is correct.
+
+    gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
+    -out=psl -t=dna -q=dna /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa h1n1_0514Seq.psl
+
+# load the psl result into h1n1_0514Seq table
+    hgLoadPsl h1n1 h1n1_0514Seq.psl
+
+# update h1n1SeqXref table
+
+    fgrep ">" /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa|\
+    sed -e 's/ | /\t/g' |sed -e "s/>//" > h1n1SeqXref.tab
+
+    hgsql h1n1 -e 'delete from h1n1SeqXref'
+    hgsql h1n1 -e 'load data local infile "h1n1SeqXref.tab" into table h1n1SeqXref'
+
+# create protein sequences file with appropriate IDs
+
+    cp -p ~/h1n1/download/0514/epiflu_0514_protein_sequence.fa j1
+
+    ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab
+    cut -f 1 j2.tab >jj1
+    cut -f 2 j2.tab >jseq
+
+    cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj
+    cut -f 1 jj > jDnaId
+    cut -f 2 jj > j2
+    cut -f 1,2 jj|sed -e 's/\t/_/' |\
+    sed -e 's/_PB2//'|\
+    sed -e 's/_HA//'|\
+    sed -e 's/_NS//'|\
+    sed -e 's/_PA//'|\
+    sed -e 's/_NA//'|\
+    sed -e 's/_NP//'|\
+    sed -e 's/-/_/'|\
+    sed -e 's/_PB1//' > jProtId
+
+    paste jDnaId jProtId >h1n1DnaProt.tab
+    hgsql h1n1 -e 'delete from h1n1DnaProt'
+    hgsql h1n1 -e 'load data local infile "h1n1DnaProt.tab" into table h1n1DnaProt'
+
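+# build the sorted, de-duplicated protein sequence .tab file (_F2 sequences are too short; their alignments are filtered out of the psl below)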
+    paste jProtId jseq |sort -u  >epiflu_0514_protein_sequence.tab
+
+    tabToFa epiflu_0514_protein_sequence
+
+    mkdir -p /gbdb/h1n1/h1n1_0514ProtSeq
+    cp epiflu_0514_protein_sequence.fa /gbdb/h1n1/h1n1_0514ProtSeq
+
+# BLAT the protein sequences; make sure the port number is correct.
+
+    gfClient -minScore=50 -minIdentity=60 -nohead hgwdev.cse.ucsc.edu 18891  /gbdb/h1n1/nib \
+      -q=prot -out=psl -t=dnax  /gbdb/h1n1/h1n1_0514ProtSeq/epiflu_0514_protein_sequence.fa testPsl0.psl
+
+    hgsql h1n1 -e 'delete from aaSeq'
+    hgsql h1n1 -e 'load data local infile "epiflu_0514_protein_sequence.tab" into table aaSeq'
+
+    rm j*
+
+    cat testPsl0.psl |grep -v "_NEP"|grep -v "_M2"|grep -v "_F2"|sed -e 's/_M1//' >testPsl.psl
+    hgLoadPsl h1n1 testPsl.psl
+
+# construct CDS info based on corresponding DNA and protein alignments.
+
+    hgsql h1n1 -N -e \
+    'select p.qName,  p.tStart-d.tStart+d.qStart+1, "xxx", d.qStart+(d.tEnd-d.tStart)-(d.tEnd-p.tEnd) from testPsl p, h1n1_0514Seq d, h1n1DnaProt n where dnaId=d.qName and d.qName=p.qName' |\
+    sed -e 's/\txxx\t/\.\./' >h1n1_0514SeqCds.tab
+
+    hgLoadSqlTab h1n1 h1n1_0514SeqCds  ~/hg/lib/cdsSpec.sql h1n1_0514SeqCds.tab
+
+####################################################################################