src/hg/makeDb/doc/h1n1.txt 1.6

1.6 2009/05/18 20:43:42 fanhsu
Added dnaSeq table.
Index: src/hg/makeDb/doc/h1n1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/h1n1.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 1000000 -r1.5 -r1.6
--- src/hg/makeDb/doc/h1n1.txt	15 May 2009 22:46:08 -0000	1.5
+++ src/hg/makeDb/doc/h1n1.txt	18 May 2009 20:43:42 -0000	1.6
@@ -1,282 +1,299 @@
 ##########################################################################
 # CREATE H1N1 DATABASE (STARTED 4/26/09, Fan) 
 
 # for emacs: -*- mode: sh; -*-
 
 # DOWNLOAD SEQUENCE (DONE, Fan, 4/26/09)
 
     ssh hgwdev
     mkdir /hive/data/genomes/h1n1
     cd /hive/data/genomes/h1n1
 
 # Get H1N1 sequences from GISAID/EpiFluDB
     
     mkdir download
     cd download
 
 # downloaded the following sequences:
 
     epiflu_0421_dna_sequence.fasta
     epiflu_0425_dna_sequence.fasta
     epiflu_0421_protein_sequence.fasta
     epiflu_0425_protein_sequence.fasta
 
 # translate to nib
     cd ..
 
     ln -s /hive/data/genomes/h1n1 ~/h1n1
     cd ~/h1n1
     mkdir nib
 
 # use sequence segments in epiflu_0421_dna_sequence.fasta as the base genome
 
     fgrep -v ">" download/epiflu_0421_dna_sequence.fasta >j.1
     echo ">chr1" >j.0
     cat j.0 j.1 >chr1.fa
     rm j.0 j.1
 
     faToNib chr1.fa nib/chr1.nib
 
 # CREATING DATABASE (DONE 4/26/09)
 
     # Create the h1n1 database.
 
     ssh hgwdev
     echo 'create database h1n1' | hgsql ''
 
     # make a semi-permanent read-only alias:
     alias h1n1 "mysql -u hguser -phguserstuff -A h1n1"
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/26/09)
     ssh hgwdev
     echo "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp" \
       | hgsql h1n1
 
     # remove ENCODE groups
     echo "delete from grp where name like 'encode%'" | hgsql h1n1
 
 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 4/26/09)
 
     # Make symbolic links from /gbdb/h1n1/nib to the real nibs.
     ssh hgwdev
     mkdir -p /gbdb/h1n1/nib
     ln -s /hive/data/genomes/h1n1/nib/chr1.nib /gbdb/h1n1/nib/chr1.nib 
 
     # Load /gbdb/h1n1/nib paths into database and save size info.
     hgsql h1n1  < ~/src/hg/lib/chromInfo.sql
 
     hgNibSeq -preMadeNib h1n1 /gbdb/h1n1/nib chr1.fa
     echo "select chrom,size from chromInfo" | hgsql -N h1n1 > chrom.sizes
     
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR H1N1 (DONE 08/02/06)
     echo 'insert into defaultDb values("A/California/04/2009(EPI_ISL_29573)", "h1n1");' \
       | hgsql -h genome-testdb hgcentraltest
 
     echo 'insert into dbDb values("h1n1", "Apr. 2009", \
           "/gbdb/h1n1/nib", "Human case of H1N1 swine influenza", "chr1", 1, 99.5, "A H1N1","Human case of H1N1 swine influenza", "/gbdb/h1n1/html/description.html", 0, 0, "GISAID sequence as of Apr. 21st, 2009", 99999);' \
       | hgsql -h genome-testdb hgcentraltest
 
     echo 'insert into genomeClade values("A H1N1", "other", 100);'\
       | hgsql -h genome-testdb hgcentraltest
 
     # Make trackDb table so browser knows what tracks to expect:
     ssh hgwdev
     cd ~/src/hg/makeDb/trackDb
     cvs up -d -P
 
     # Edit that makefile to add h1n1 in all the right places
 
     vi makefile
 
     make update
     make alpha
     cvs commit makefile
 
 # START BLAT SERVERS (on hgwdev for now, DONE 4/26/09, Fan)
     
     cd /hive/data/genomes/h1n1/nib
     gfServer start hgwdev 18891 -trans -mask -log=gfServer.trans.log chr1.nib &
     gfServer start hgwdev 18892 -stepSize=5  -log=gfServer.log       chr1.nib &
 
     # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR H1N1 
 
     echo 'insert into blatServers values("h1n1", "hgwdev", "18891", "1", "0"); \
           insert into blatServers values("h1n1", "hgwdev", "18892", "0", "0");' \
       | hgsql -h genome-testdb hgcentraltest
 
 # CREATE h1n1Gene TRACK (DONE. 4/26/09, Fan)
 
     mkdir /hive/data/genomes/h1n1/bed/h1n1Gene
     cd    /hive/data/genomes/h1n1/bed/h1n1Gene 
 
     cp ../../download/epiflu_0421_dna_sequence.fasta h1n1Gene.fa
     # manually edit h1n1Gene.fa
     # change line like:
     # >EPI176470 | HA | A/California/04/2009 | EPI_ISL_29573 | 2009712049_seg4 | H1N1
     # into:
     # >HA EPI176470
 
     gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
     -out=psl -t=dna -q=dna h1n1Gene.fa h1n1GenePsl.psl
 
     hgLoadPsl h1n1 h1n1GenePsl.psl
     hgsql h1n1 -N -e 'select tName, tStart, tEnd, qName from h1n1GenePsl' >h1n1Gene.bed
     hgLoadBed h1n1  h1n1Gene h1n1Gene.bed
 
 # CREATE h1n1Seq TRACK (DONE.  4/26/09, Fan)
 
     mkdir –p /hive/data/genomes/h1n1/bed/h1n1Seq
     cd /hive/data/genomes/h1n1/bed/h1n1Seq
 
 # get h1n1Seq sequences
     cat ../../download/*dna*.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >h1n1Seq.fa
 
 # create .fa file
 
     mkdir -p /gbdb/h1n1/h1n1Seq
     cp -p h1n1Seq.fa /gbdb/h1n1/h1n1Seq/h1n1Seq.fa
 
     hgLoadSeq –replace h1n1 /gbdb/h1n1/h1n1Seq/h1n1Seq.fa
 
 # BLAT
     gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
     -out=psl -t=dna -q=dna h1n1Seq.fa h1n1Seq.psl
 
 # load the psl result into h1n1Seq table
     hgLoadPsl h1n1 h1n1Seq.psl
 
 # CREATE humanHA TRACK (DONE.  4/26/09, Fan)
 
     mkdir -p /hive/data/genomes/h1n1/bed/humanHA
     cd /hive/data/genomes/h1n1/bed/humanHA
 
 # get humanHA sequences
     cat ../../download/humanHA_dna.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >humanHA.fa
 
 # create .fa file
 
     mkdir -p /gbdb/h1n1/humanHA
     cp -p humanHA.fa /gbdb/h1n1/humanHA/humanHA.fa
 
     hgLoadSeq -replace h1n1 /gbdb/h1n1/humanHA/humanHA.fa
 
 # BLAT
     gfClient -minScore=500 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
     -out=psl -t=dna -q=dna humanHA.fa humanHA.psl
 
 # load the psl result into humanHA table
     hgLoadPsl h1n1 humanHA.psl
 
 # CREATE swineHA TRACK (DONE.  4/26/09, Fan)
 
     mkdir -p /hive/data/genomes/h1n1/bed/swineHA
     cd /hive/data/genomes/h1n1/bed/swineHA
 
 # get swineHA sequences
     cat ../../download/swineHA_dna.fasta |sed -e 's/ | /_/g' |sed -e 's/_EPI_ISL/ | /' >swineHA.fa
 
 # create .fa file
 
     mkdir -p /gbdb/h1n1/swineHA
     cp -p swineHA.fa /gbdb/h1n1/swineHA/swineHA.fa
 
     hgLoadSeq -replace h1n1 /gbdb/h1n1/swineHA/swineHA.fa
 
 # BLAT
     gfClient -minScore=500 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
     -out=psl -t=dna -q=dna swineHA.fa swineHA.psl
 
 # load the psl result into humanHA table
     hgLoadPsl h1n1 swineHA.psl
 
 #######################################################################################
 # CREATE h1n1_0514Seq TRACK (DONE.  5/15/09, Fan)
 
     mkdir -p /hive/data/genomes/h1n1/bed/h1n1_0514Seq
     cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq
 
 # create .fa files from downloaded .fasta files
 
     mkdir -p /gbdb/h1n1/h1n1_0514Seq
     cd ../../download/0514
     
     dos2unix < epiflu_0514_dna_sequence.fasta > epiflu_0514_dna_sequence.fa 
     dos2unix < epiflu_0514_protein_sequence.fasta > epiflu_0514_protein_sequence.fa 
     
     cp -p ../../download/0514/epiflu_0514_dna_sequence.fa /gbdb/h1n1/h1n1_0514Seq
 
     cd /hive/data/genomes/h1n1/bed/h1n1_0514Seq
 
     hgLoadSeq -replace h1n1 /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa 
 
 # BLAT, make sure the port number is correct.
 
     gfClient -minScore=200 -minIdentity=80 -nohead hgwdev.cse.ucsc.edu 18892  /gbdb/h1n1/nib \
     -out=psl -t=dna -q=dna /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa h1n1_0514Seq.psl
 
 # load the psl result into h1n1_0514Seq table
     hgLoadPsl h1n1 h1n1_0514Seq.psl
 
 # update h1n1SeqXref table
 
     fgrep ">" /gbdb/h1n1/h1n1_0514Seq/epiflu_0514_dna_sequence.fa|\
     sed -e 's/ | /\t/g' |sed -e "s/>//" > h1n1SeqXref.tab
 
     hgsql h1n1 -e 'delete from h1n1SeqXref'
     hgsql h1n1 -e 'load data local infile "h1n1SeqXref.tab" into table h1n1SeqXref'
 
+# create dna sequences table
+
+    cp -p ~/h1n1/download/0514/epiflu_0514_dna_sequence.fa j1
+
+    ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab
+    cut -f 1 j2.tab >jj1
+    cut -f 2 j2.tab >jseq
+
+    cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj
+    cut -f 1 jj > jDnaId
+
+    paste jDnaId jseq >epiflu_0514_dna_sequence.tab
+
+    hgLoadSqlTab h1n1 dnaSeq ~/src/hg/lib/dnaSeq.sql epiflu_0514_dna_sequence.tab
+
+    rm j1 jj jj1 jseq jDnaId j2.tab
+
 # create protein sequences file with appropriate IDs
 
     cp -p ~/h1n1/download/0514/epiflu_0514_protein_sequence.fa j1
 
     ~/src/utils/faToTab/faToTab.pl /dev/null j1 >j2.tab
     cut -f 1 j2.tab >jj1
     cut -f 2 j2.tab >jseq
 
     cat jj1 |sed -e 's/ | /\t/g'|sed -e 's/>//' >jj
     cut -f 1 jj > jDnaId
     cut -f 2 jj > j2
     cut -f 1,2 jj|sed -e 's/\t/_/' |\
     sed -e 's/_PB2//'|\
     sed -e 's/_HA//'|\
     sed -e 's/_NS//'|\
     sed -e 's/_PA//'|\
     sed -e 's/_NA//'|\
     sed -e 's/_NP//'|\
     sed -e 's/-/_/'|\
     sed -e 's/_PB1//' > jProtId
 
     paste jDnaId jProtId >h1n1DnaProt.tab
     hgsql h1n1 -e 'delete from h1n1DnaProt'
     hgsql h1n1 -e 'load data local infile "h1n1DnaProt.tab" into table h1n1DnaProt'
 
 # remove _F2 sequences, they are too short
     paste jProtId jseq |sort -u  >epiflu_0514_protein_sequence.tab
 
     tabToFa epiflu_0514_protein_sequence
 
     mkdir -p /gbdb/h1n1/h1n1_0514ProtSeq
     cp epiflu_0514_protein_sequence.fa /gbdb/h1n1/h1n1_0514ProtSeq
 
 # BLAT the protein sequences, make sure port number is correct.
 
     gfClient -minScore=50 -minIdentity=60 -nohead hgwdev.cse.ucsc.edu 18891  /gbdb/h1n1/nib \
       -q=prot -out=psl -t=dnax  /gbdb/h1n1/h1n1_0514ProtSeq/epiflu_0514_protein_sequence.fa testPsl0.psl
 
     hgsql h1n1 -e 'delete from aaSeq'
     hgsql h1n1 -e 'load data local infile "epiflu_0514_protein_sequence.tab" into table aaSeq'
 
     rm j*
 
     cat testPsl0.psl |grep -v "_NEP"|grep -v "_M2"|grep -v "_F2"|sed -e 's/_M1//' >testPsl.psl
     hgLoadPsl h1n1 testPsl.psl
 
 # construct CDS info based on corresponding DNA and protein alignments.
 
     hgsql h1n1 -N -e \
     'select p.qName,  p.tStart-d.tStart+d.qStart+1, "xxx", d.qStart+(d.tEnd-d.tStart)-(d.tEnd-p.tEnd) from testPsl p, h1n1_0514Seq d, h1n1DnaProt n where dnaId=d.qName and d.qName=p.qName' |\
     sed -e 's/\txxx\t/\.\./' >h1n1_0514SeqCds.tab
 
     hgLoadSqlTab h1n1 h1n1_0514SeqCds  ~/hg/lib/cdsSpec.sql h1n1_0514SeqCds.tab
 
 ####################################################################################