src/hg/makeDb/doc/hivVax003Vax004.txt 1.8

1.8 2009/03/02 21:41:47 fanhsu
Checked in the section of the posSelection tracks build for vax004.
Index: src/hg/makeDb/doc/hivVax003Vax004.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hivVax003Vax004.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/makeDb/doc/hivVax003Vax004.txt	14 Jan 2009 18:10:52 -0000	1.7
+++ src/hg/makeDb/doc/hivVax003Vax004.txt	2 Mar 2009 21:41:47 -0000	1.8
@@ -1,751 +1,927 @@
 # for emacs: -*- mode: sh; -*-
 #########################################################################
 # hivVax003Vax004 DATABASE BUILD (DONE 5/20/08, Fan)
 
     ssh hiv1
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004
     cd /cluster/store12/medical/hiv/hivVax003Vax004
 
 #########################################################################
 # Create hivVax003Vax004 DB
 
     hgsql –e 'create database hivVax003Vax004'
 
 # Ask admin to copy over all tables from hiv1 to hivVax003Vax004
 
 #########################################################################
 # CREATE MAF TRACKS FOR VAX004
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/msa
     cd /cluster/store12/medical/hiv/hivVax003Vax004/msa
 
 # create a script file, doall
 
     hgsql hivVax003Vax004 -N -e \
     'select id from dnaSeq where id like "%U%"'\
     |sed -e 's/ss/do1 ss/g' >doall
 
 # create one line script file, do1, with the following line in it:
 
     hgsql hivVax003Vax004 -N -e  "select id, seq from vax004Msa where id='${1}'"
 
     chmod +x do*
 
 # run the script to get the .tab file with all MSA sequences of VAX004
     doall >Vax003Vax004.tab
 # convert .tab into .fa file
     tabToFa Vax003Vax004
 
 # grab the base alignment sequence
     echo ">hivVax003Vax004" >Vax003Vax004.aln
     hgsql hivVax003Vax004 -N -e 'select seq from vax004Msa where id="HXB2"'  >> Vax003Vax004.aln
 
 # prepare an interium file, jjAll.mfa
     cat Vax003Vax004.aln Vax003Vax004.fa >jjAll.mfa
     echo = >>jjAll.mfa
 
 # Run xmfaToMafVax003Vax004 to create a precursor file for the final .maf
 
     xmfaToMafVax003Vax004Vax004 jjAll.mfa j.out  org1=hivVax003Vax004
     cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
 
     rm jjAll.mfa j.out
 
     cat chr1.tmp |sed -e 's/ss_U/U/g' >chr1.maf
 
 # copy .maf to /gbdb.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax004Maf 
     cp chr1.maf /gbdb/hivVax003Vax004/vax004Maf -p
 
     hgLoadMaf hivVax003Vax004 vax004Maf
 
 # create another copy for protein MAF.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax004AaMaf 
     cp -p chr1.maf /gbdb/hivVax003Vax004/vax004AaMaf
     hgLoadMaf hivVax003Vax004 vax004AaMaf
     
 #########################################################################
 # CREATE CONSERVATION TRACKS FOR VAX003 AE STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/conservation/AE
     cd /cluster/store12/medical/hiv/hivVax003Vax004/conservation/AE
 
 # create the .wig file and .fa file of the consensus sequence.
     gsidMsa hivVax003Vax004 vax003AEMsa HXB2 6228 vax003AECons.wig vax003AEConsensus.fa
 # encode and load the wig file
     wigEncode vax003AECons.wig stdout vax003AECons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003AECons stdin
 
 # copy .wib file to /gbdb
     mkdir -p /gbdb/hivVax003Vax004/wib
     cp vax003AECons.wib /gbdb/hivVax003Vax004/wib
 
 # do the same for protein conservation track
 
     mkdir aa
     cd aa
 
 # create .wig file
     gsidAaMsa2 hivVax003Vax004 vax003AEMsa HXB2 6228 vax003AEAaCons.wig vax003AEAaConsensus.fa
 
 # encode and load the .wib file   
     wigEncode vax003AEAaCons.wig stdout vax003AEAaCons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003AEAaCons stdin
 
     cp vax003AEAaCons.wib /gbdb/hivVax003Vax004/wib
 
 #########################################################################
 # CREATE MAF TRACKS FOR VAX003 AE STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/msa/AE
     cd /cluster/store12/medical/hiv/hivVax003Vax004/msa/AE
 
 # create a script file, doall
 
     hgsql hivVax003Vax004 -N -e \
     'select id from dnaSeq where id like "%T%"'\
     |sed -e 's/ss/do1 ss/g' >doall
 
 # create one line script file, do1, with the following line in it:
 
     hgsql hivVax003Vax004 -N -e  "select id, seq from vax003AEMsa where id='${1}'"
 
     chmod +x do*
 
 # run the script to get the .tab file with all MSA sequences of VAX003 AE
     doall >Vax003Vax004.tab
 # convert .tab into .fa file
     tabToFa Vax003Vax004
 
 # grab the base alignment sequence
     echo ">hivVax003Vax004" >Vax003Vax004.aln
     hgsql hivVax003Vax004 -N -e 'select seq from vax003AEMsa where id="HXB2"'  >> Vax003Vax004.aln
 
 # prepare an interium file, jjAll.mfa
     cat Vax003Vax004.aln Vax003Vax004.fa >jjAll.mfa
     echo = >>jjAll.mfa
 
 # Run xmfaToMafVax003Vax004 to create a precursor file for the final .maf
 
     xmfaToMafVax003Vax004 jjAll.mfa j.out  org1=hivVax003Vax004
     cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
 
 #    rm jjAll.mfa j.out
 
     cat chr1.tmp |sed -e 's/ss_T/T/g' >chr1.maf
 
 # copy .maf to /gbdb.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax004Maf
     cp chr1.maf /gbdb/hivVax003Vax004/vax004Maf -p
 
     hgLoadMaf hivVax003Vax004 vax004Maf
 
 # create another copy for protein MAF.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax004AaMaf 
     cp -p chr1.maf /gbdb/hivVax003Vax004/vax004AaMaf
     hgLoadMaf hivVax003Vax004 vax004AaMaf
 
 #########################################################################
 # COPY OVER MSA TABLES FOR VAX003 B STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/msa/B
     cd /cluster/store12/medical/hiv/hivVax003Vax004/msa/B
 
 # get table definition
     mysqldump -d hivVax003Vax004 vax003BMsa -u medcat -p$HGPSWD|hgsql hivVax003Vax004
 
 # load the table   
     hgsql hivVax003Vax004 -e "insert into vax003BMsa select * from hivVax003Vax004.vax003BMsa"
 
 #########################################################################
 # CREATE CONSERVATION TRACKS FOR VAX003 B STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/conservation/B
     cd /cluster/store12/medical/hiv/hivVax003Vax004/conservation/B
 
 # create the .wig file and .fa file of the consensus sequence.
     gsidMsa hivVax003Vax004 vax003BMsa HXB2 6228 vax003BCons.wig vax003BConsensus.fa
 
 # encode and load the wig file
     wigEncode vax003BCons.wig stdout vax003BCons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003BCons stdin
 
 # copy .wib file to /gbdb
     mkdir -p /gbdb/hivVax003Vax004/wib
     cp vax003BCons.wib /gbdb/hivVax003Vax004/wib
 
 # do the same for protein conservation track
 
     mkdir aa
     cd aa
 
 # create .wig file
     gsidAaMsa2 hivVax003Vax004 vax003BMsa HXB2 6228 vax003BAaCons.wig vax003BAaConsensus.fa
 
 # encode and load the .wib file   
     wigEncode vax003BAaCons.wig stdout vax003BAaCons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003BAaCons stdin
 
     cp vax003BAaCons.wib /gbdb/hivVax003Vax004/wib
 
 #########################################################################
 # CREATE MAF TRACKS FOR VAX003 B STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/msa/B
     cd /cluster/store12/medical/hiv/hivVax003Vax004/msa/B
 
 # create a script file, doall
 
     hgsql hivVax003Vax004 -N -e \
     'select id from dnaSeq where id like "%T%"'\
     |sed -e 's/ss/do1 ss/g' >doall
 
 # create one line script file, do1, with the following line in it:
 
     hgsql hivVax003Vax004 -N -e  "select id, seq from vax003BMsa where id='${1}'"
 
     chmod +x do*
 
 # run the script to get the .tab file with all MSA sequences of VAX004
     doall >Vax003Vax004.tab
 # convert .tab into .fa file
     tabToFa Vax003Vax004
 
 # grab the base alignment sequence
     echo ">hivVax003Vax004" >Vax003Vax004.aln
     hgsql hivVax003Vax004 -N -e 'select seq from vax003BMsa where id="HXB2"'  >> Vax003Vax004.aln
 
 # prepare an interium file, jjAll.mfa
     cat Vax003Vax004.aln Vax003Vax004.fa >jjAll.mfa
     echo = >>jjAll.mfa
 
 # Run xmfaToMafVax003Vax004 to create a precursor file for the final .maf
 
     xmfaToMafVax003Vax004 jjAll.mfa j.out  org1=hivVax003Vax004
     cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
 
     rm jjAll.mfa j.out
 
     cat chr1.tmp |sed -e 's/ss_T/T/g' >chr1.maf
 
 # copy .maf to /gbdb.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax003BMaf
     cp chr1.maf /gbdb/hivVax003Vax004/vax003BMaf -p
     
     hgLoadMaf hivVax003Vax004 vax003BMaf
 
 # create another copy for protein MAF.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax003BAaMaf 
     cp -p chr1.maf /gbdb/hivVax003Vax004/vax003BAaMaf
     hgLoadMaf hivVax003Vax004 vax003BAaMaf
 
 #########################################################################
 # Process, check, correct and load VAX003 clinical tables
 
     mkdir -p /data/home/fanhsu/medical/hiv/hivVax003Vax004/clinical
     cd /data/home/fanhsu/medical/hiv/hivVax003Vax004/clinical
 
 # copy over original raw data files
     
     cp -p /cluster/store12/medical/vaxGen/fromEvie/VAX003/*.txt .
     ls -l *.txt
 
 # shorten the file name and run processRaw to generate .sql def
 
     cp "VAX003 RNACD4 match with sequence ID_20080501_EMZ18Jun.txt" VAX003_RNACD4080501.txt
     processRaw VAX003_RNACD4080501.txt
 
     hgsql hiv1 -e 'drop database  hivVax003Vax004Build'
     hgsql hiv1 -e 'create database  hivVax003Vax004Build'
 
 # create hivVax003Vax004Build DB to be used in this build process
     hgsql hiv1 -e 'create database hivVax003Vax004Build'
 
 # load raw demographic and RNACD3 data
     hgsql hivVax003Vax004Build < GSID_DEMOG_SEQNO_003Raw.sql
     hgsql hivVax003Vax004Build < VAX003_RNACD4080501Raw.sql
 
     hgsql hivVax003Vax004Build -e \
     'load data local infile "GSID_DEMOG_SEQNO_003.txt" into table GSID_DEMOG_SEQNO_003Raw ignore 1 lines'
 
     hgsql hivVax003Vax004Build -e \
     'load data local infile "VAX003_RNACD4080501.txt" into table VAX003_RNACD4080501Raw ignore 1 lines'
 
 # build initial gsidClinicRecTemp table ...
 
     hgsql hivVax003Vax004Build -N -e \
     'select "specId",GSID, MBLabcd, DRNACD4, "rna","cd4" from VAX003_RNACD4080501Raw' \
     >gsidClinicRecTemp.tab
 
     hgsql hivVax003Vax004Build -e 'drop table gsidClinicRecTemp'
     getDbTableDef hiv1 gsidClinicRecTemp >gsidClinicRecTemp.sql
     hgsql hivVax003Vax004Build < gsidClinicRecTemp.sql
 
     hgsql hivVax003Vax004Build -e \
     'load data local infile "gsidClinicRecTemp.tab" into table gsidClinicRecTemp'
 
 # build subjLabcode table ...
 
     hgsql hivVax003Vax004Build -N -e \
     'select GSID, MBLabcd from VAX003_RNACD4080501Raw where MBLabcd!=""' \
     | sort -u > subjLabcode.tab
 
     hgsql hivVax003Vax004Build -e "drop table subjLabcode"
 
     getDbTableDef hiv1 subjLabcode > subjLabcode.sql
     hgsql hivVax003Vax004Build < subjLabcode.sql
     hgsql hivVax003Vax004Build -e \
     'load data local infile "subjLabcode.tab" into table subjLabcode'
 
 # fill in labCode in gsidClinicRecTemp
 
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp t, subjLabcode l set t.labCode=l.labCode where t.subjId=l.subjId'
 
 # fill in specimenId
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp t, GSID_DEMOG_SEQNO_003Raw r set t.specimenId=r.SpecimenNumber where t.subjId=r.subjId and r.SpecimenNumber !=""'
 
 # fill in RNA 
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp t, VAX003_RNACD4080501Raw r set t.hivQuan=r.RNA where t.subjId=r.GSID and t.daysCollection=r.DRNACD4' 
 
 # fill in CD4
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp t, VAX003_RNACD4080501Raw r set t.cd4Count=r.CD4ABS where t.subjId=r.GSID and t.daysCollection=r.DRNACD4'
 
 # change RNA "399" to "200" (which will be displayed as "<400")
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp set hivQuan="200" where hivQuan = "399"'
 
 # update cd4 NULL ...
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp set cd4Count="NULL" where cd4Count="."'
 
 # Echo update daysCollection NULL ...
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecTemp set daysCollection="NULL" where daysCollection="."'
 
 # build gsidClinicRecNew table
 
     hgsql hivVax003Vax004Build -N -e 'select * from gsidClinicRecTemp ' \
     |uniq |sed -e 's/NULL/-1/g'  > gsidClinicRecNew.tab
 
     hgsql hivVax003Vax004Build -e 'drop table gsidClinicRecNew'
     hgsql hivVax003Vax004Build < gsidClinicRecNew.sql
     hgsql hivVax003Vax004Build -e \
     'load data local infile "gsidClinicRecNew.tab" into table gsidClinicRecNew'
 
 # set NULLs
 
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecNew set hivQuan=NULL where hivQuan=-1'
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecNew set cd4Count=NULL where cd4Count=-1'
 
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecNew set daysCollection=NULL where daysCollection=-1'
 
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecNew set specimenId=NULL where specimenId="specId"'
 
 # build gsidClinicRecWithSeqNew table
 
     hgsql hivVax003Vax004Build -N -e \
     'select c.* from GSID_DEMOG_SEQNO_003Raw r,gsidClinicRecNew c where SequenceDataStatus="Sequence data exist" and r.subjId=c.subjId and r.labCode=c.labCode' \
     |sort -u |sed -e 's/NULL/-1/g' >gsidClinicRecWithSeqNew.tmp
 
     hgsql hivVax003Vax004Build -e 'drop table gsidClinicRecWithSeqNew'
     hgsql hivVax003Vax004Build < gsidClinicRecWithSeqNew.sql
     hgsql hivVax003Vax004Build -e \
     'load data local infile "gsidClinicRecWithSeqNew.tmp" into table gsidClinicRecWithSeqNew'
     rm gsidClinicRecWithSeqNew.tmp
 
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecWithSeqNew set hivQuan=NULL where hivQuan=-1'
     hgsql hivVax003Vax004Build -e \
     'update gsidClinicRecWithSeqNew set cd4Count=NULL where cd4Count=-1'
 
 # compare with previous old data
     hgsql hivVax003Vax004Build -N -e 'select * from gsidClinicRecWithSeqNew' |sort -u >j.n
     hgsql hivVax003Vax004 -N -e 'select * from gsidClinicRecWithSeq'|sort -u  >j.o
     diff j.o j.n |grep -v "GSID4" >j.diff
 
 # load the newly build data into hivVax003Vax004 tables
 
     hgsql hivVax003Vax004 -e 'delete from gsidClinicRec where subjId like "GSID3%"'
     hgsql hivVax003Vax004 -e 'delete from gsidClinicRecWithSeq where subjId like "GSID3%"'
 
    hgsql hivVax003Vax004 -e \
     "insert into gsidClinicRec select * from hivVax003Vax004Build.gsidClinicRecNew"
 
     hgsql hivVax003Vax004 -e \
     "insert into gsidClinicRecWithSeq select * from hivVax003Vax004Build.gsidClinicRecWithSeqNew"
 
 
 #########################################################################
 # Build the gsidSubjSeq table (used by Table View).
 
    gsidSubjSeq hivVax003Vax004 dnaSeqId > j.dna
    gsidSubjSeq hivVax003Vax004 aaSeqId > j.aa
 
    cut -f 1 j.dna >j.1
    cut -f 1 j.aa  >j.2
 
    cut -f 2 j.dna  >j.3
    cut -f 2 j.aa   >j.4
 
    paste j.1 j.3 j.4> gsidSubjSeq.tab
 
    hgsql hivVax003Vax004 -e 'delete from gsidSubjSeq'
    hgsql hivVax003Vax004 -e \
    'load data local infile "gsidSubjSeq.tab" into table gsidSubjSeq'
 
    rm j.1 j.2 j.3 j.4 j.dna j.aa
 
 #################################################################################
 # RE-BUILD CONSERVATION AND MAF TRACKS FOR VAX003 AE STRAIN (DONE, 7/10/08, Fan)
 
 # First cut the vax003AEMsa sequences so that they start with VPV and end with REKR
 
 # rename existing vax003AEMsa table as vax003AEMsaOld
     hgsql hivVax003Vax004 –e 'rename table vax003AEMsa to vax003AEMsaOld'
 
 # use BLAT to visually decide what are the appropriate starting and ending positions to cut.
 
     hgsql hivVax003Vax004 -N -e \
     'select id,substring(seq, 124, 1743) from vax003AEMsaOld' >vax003AEMsa.tab
 
     tabToFa vax003AEMsa
 # use resulting vax003AEMsa.fa to check that the cut is correct, and then load the new MSA sequences.
 
     hgsql hivVax003Vax004 < ~/src/hg/lib/vax003AEMsa.sql
     hgsql hivVax003Vax004 -N -e 'load data local infile "vax003AEMsa.tab" into table vax003AEMsa'
 
 # RE-BUILD CONSERVATION TRACKS FOR VAX003 AE STRAIN  
 
     mkdir -p \
     /cluster/store12/medical/hiv/hivVax003Vax004/conservation/AE/rebuild
     cd /cluster/store12/medical/hiv/hivVax003Vax004/conservation/AE/rebuild
 
 # create the .wig file and .fa file of the consensus sequence.
     gsidMsa hivVax003Vax004 vax003AEMsa HXB2 6348 vax003AECons.wig vax003AEConsensus.fa
 # encode and load the wig file
     wigEncode vax003AECons.wig stdout vax003AECons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003AECons stdin
 
 # copy .wib file to /gbdb
     mkdir -p /gbdb/hivVax003Vax004/wib
     cp vax003AECons.wib /gbdb/hivVax003Vax004/wib
 
 # do the same for protein conservation track
 
     mkdir aa
     cd aa
 
 # create .wig file
     gsidAaMsa2 hivVax003Vax004 vax003AEMsa HXB2 6348 vax003AEAaCons.wig vax003AEAaConsensus.fa
 
 # encode and load the .wib file   
     wigEncode vax003AEAaCons.wig stdout vax003AEAaCons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003AEAaCons stdin
 
     cp vax003AEAaCons.wib /gbdb/hivVax003Vax004/wib
 
 # CREATE MAF TRACKS FOR VAX003 AE STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/msa/AE/rebuild
     cd /cluster/store12/medical/hiv/hivVax003Vax004/msa/AE/rebuild
 
 # create a script file, doall
 
     hgsql hivVax003Vax004 -N -e \
     'select id from dnaSeq where id like "%T%"'\
     |sed -e 's/ss/do1 ss/g' >doall
 
 # create one line script file, do1, with the following line in it:
 
     hgsql hivVax003Vax004 -N -e  "select id, seq from vax003AEMsa where id='${1}'"
 
     chmod +x do*
 
 # run the script to get the .tab file with all MSA sequences of VAX003 AE
     doall >Vax003Vax004.tab
 # convert .tab into .fa file
     tabToFa Vax003Vax004
 
 # grab the base alignment sequence
     echo ">hivVax003Vax004" >Vax003Vax004.aln
     hgsql hivVax003Vax004 -N -e 'select seq from vax003AEMsa where id="HXB2"'  >> Vax003Vax004.aln
 
 # prepare an interium file, jjAll.mfa
     cat Vax003Vax004.aln Vax003Vax004.fa >jjAll.mfa
     echo = >>jjAll.mfa
 
 # Run xmfaToMafVax003Vax004 to create a precursor file for the final .maf
 
     xmfaToMafVax003Vax004 jjAll.mfa j.out  org1=hivVax003Vax004
     cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
 
 #    rm jjAll.mfa j.out
 
     cat chr1.tmp |sed -e 's/ss_T/T/g' >chr1.maf
 
 # copy .maf to /gbdb.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax003AEMaf
     cp chr1.maf /gbdb/hivVax003Vax004/vax003AEMaf -p
 
     hgLoadMaf hivVax003Vax004 vax003AEMaf
 
 # create another copy for protein MAF.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax003AEMaf 
     cp -p chr1.maf /gbdb/hivVax003Vax004/vax003AEAaMaf
     hgLoadMaf hivVax003Vax004 vax003AEAaMaf
 
 #################################################################################
 # RE-BUILD CONSERVATION AND MAF TRACKS FOR VAX003 B STRAIN (DONE, 7/10/08, Fan)
 # First cut the vax003BMsa sequences so that they start with VPV and end with REKR
 
 # rename existing vax003BMsa table as vax003BMsaOld
     hgsql hivVax003Vax004 –e 'rename table vax003BMsa to vax003BMsaOld'
 
 # use BLAT to visually decide what are the appropriate starting and ending positions to cut.
 
     hgsql hivVax003Vax004 -N -e \
     'select id,substring(seq, 121, 1620) from vax003BMsaOld' >vax003BMsa.tab
 
     tabToFa vax003BMsa
 # use resulting vax003BMsa.fa to check that the cut is correct, then load the new MSA sequences.
 
     hgsql hivVax003Vax004 -e 'drop table vax003BMsa' 
     hgsql hivVax003Vax004 < ~/src/hg/lib/vax003BMsa.sql
     hgsql hivVax003Vax004 -N -e 'load data local infile "vax003BMsa.tab" into table vax003BMsa'
 
 # RE-BUILD CONSERVATION TRACKS FOR VAX003 B STRAIN  
 
     mkdir -p \
     /cluster/store12/medical/hiv/hivVax003Vax004/conservation/B/rebuild
     cd /cluster/store12/medical/hiv/hivVax003Vax004/conservation/B/rebuild
 
 # create the .wig file and .fa file of the consensus sequence.
     gsidMsa hivVax003Vax004 vax003BMsa HXB2 6348 vax003BCons.wig vax003BConsensus.fa
 # encode and load the wig file
     wigEncode vax003BCons.wig stdout vax003BCons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003BCons stdin
 
 # copy .wib file to /gbdb
     mkdir -p /gbdb/hivVax003Vax004/wib
     cp vax003BCons.wib /gbdb/hivVax003Vax004/wib
 
 # do the same for protein conservation track
 
     mkdir aa
     cd aa
 
 # create .wig file
     gsidAaMsa2 hivVax003Vax004 vax0 '03BMsa HXB2 6348 vax003BAaCons.wig vax003BAaConsensus.fa
 
 # encode and load the .wib file   
     wigEncode vax003BAaCons.wig stdout vax003BAaCons.wib \
     | hgLoadWiggle hivVax003Vax004 vax003BAaCons stdin
 
     cp vax003BAaCons.wib /gbdb/hivVax003Vax004/wib
 
 # CREATE MAF TRACKS FOR VAX003 B STRAIN
 
     mkdir -p /cluster/store12/medical/hiv/hivVax003Vax004/msa/B/rebuild
     cd /cluster/store12/medical/hiv/hivVax003Vax004/msa/B/rebuild
 
 # create a script file, doall
 
     hgsql hivVax003Vax004 -N -e \
     'select id from dnaSeq where id like "%T%"'\
     |sed -e 's/ss/do1 ss/g' >doall
 
 # create one line script file, do1, with the following line in it:
 
     hgsql hivVax003Vax004 -N -e  "select id, seq from vax003BMsa where id='${1}'"
 
     chmod +x do*
 
 # run the script to get the .tab file with all MSA sequences of VAX003 B
     doall >Vax003Vax004.tab
 # convert .tab into .fa file
     tabToFa Vax003Vax004
 
 # grab the base alignment sequence
     echo ">hivVax003Vax004" >Vax003Vax004.aln
     hgsql hivVax003Vax004 -N -e 'select seq from vax003BMsa where id="HXB2"'  >> Vax003Vax004.aln
 
 # prepare an interium file, jjAll.mfa
     cat Vax003Vax004.aln Vax003Vax004.fa >jjAll.mfa
     echo = >>jjAll.mfa
 
 # Run xmfaToMafVax003Vax004 to create a precursor file for the final .maf
 
     xmfaToMafVax003Vax004 jjAll.mfa j.out  org1=hivVax003Vax004
     cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
 
 #    rm jjAll.mfa j.out
 
     cat chr1.tmp |sed -e 's/ss_T/T/g' >chr1.maf
 
 # copy .maf to /gbdb.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax003BMaf
     cp chr1.maf /gbdb/hivVax003Vax004/vax003BMaf -p
 
     hgLoadMaf hivVax003Vax004 vax003BMaf
 
 # create another copy for protein MAF.
 
     mkdir -p  /gbdb/hivVax003Vax004/vax003BMaf 
     cp -p chr1.maf /gbdb/hivVax003Vax004/vax003BAaMaf
     hgLoadMaf hivVax003Vax004 vax003BAaMaf
 
 ########################################################################################
 # REBUILD THE gsidClinicRecWithSeq TABLE (DONE 11/03/08, Fan)
 
 mkdir -p /hive/groups/gsid/medical/hiv/hivVax003Vax004/clinical/novRebuild
 
 cd /hive/groups/gsid/medical/hiv/hivVax003Vax004/clinical/novRebuild
 
 #copy table gsidClinicRecNew into gsidClinicRecNew2
 
 cp ~/hg/lib/gsidClinicRec.sql gsidClinicRecNew2.sql
 vi gsidClinicRecNew2.sql
 hgsql  hivVax003Vax004Build -e 'drop table gsidClinicRecNew2'
 hgsql  hivVax003Vax004Build < gsidClinicRecNew2.sql
 
 hgsql hivVax003Vax004Build -N -e 'select * from gsidClinicRecNew' >gsidClinicRecNew2.tab
 
 hgsql  hivVax003Vax004Build -e \
 'load data local infile "gsidClinicRecNew2.tab" into table  gsidClinicRecNew2'
 
 hgsql hivVax003Vax004Build -N -e 'select * from gsidClinicRecNew2' >j.tab
 diff j.tab gsidClinicRecNew2.tab
 
 # change 200 to 399 so that they are consistent between two table
 
 hgsql hivVax003Vax004Build -e 'update gsidClinicRecNew2 set hivQuan=399 where hivQuan=200'
 hgsql hivVax003Vax004Build -N -e 'select * from gsidClinicRecNew2' >jj.tab
 diff j.tab jj.tab
 
 # rebuild the gsidClinicRecWithSeq table for VAX003 subjects
 
 hgsql hivVax003Vax004Build -N -e \
 'select c.specimenId, c.subjId, c.labCode, c.daysCollection, r.RNA, r.CD4ABS from gsidClinicRecNew2 c, VAX003_RNACD4080501Raw r where c.specimenId=r.SpecimenNo and c.daysCollection=r.DRNACD4 and c.subjId=r.GSID and r.RNA=c.hivQuan and r.CD4ABS=c.cd4Count and r.SpecimenNo != ""' >j.out
 
 cut -f 1-4 j.out >j.1
 
 # revert back from 399 to 200
 cut -f 5-6 j.out  | sed -e 's/399\t/200\t/' >j.2
 
 paste j.1 j.2 >gsidClinicRecWithSeq.vax003.tab
 
 hgsql hivVax003Vax004 -e 'delete from gsidClinicRecWithSeq where subjId like "GSID3%"'
 hgsql hivVax003Vax004 -e 'load data local infile "gsidClinicRecWithSeq.vax003.tab" into table gsidClinicRecWithSeq'
 
 # update the same table for the other 3 genomes
 
 hgsql hivgne8v2 -e 'delete from gsidClinicRecWithSeq where subjId like "GSID3%"'
 hgsql hivgne8v2 -e 'load data local infile "gsidClinicRecWithSeq.vax003.tab" into table gsidClinicRecWithSeq'
 
 hgsql hivmn2 -e 'delete from gsidClinicRecWithSeq where subjId like "GSID3%"'
 hgsql hivmn2 -e 'load data local infile "gsidClinicRecWithSeq.vax003.tab" into table gsidClinicRecWithSeq'
 
 hgsql hiva244 -e 'delete from gsidClinicRecWithSeq where subjId like "GSID3%"'
 hgsql hiva244 -e 'load data local infile "gsidClinicRecWithSeq.vax003.tab" into table gsidClinicRecWithSeq'
 
 ######################################################################################
 
 # Create VAX003 subtype B Positive Selection tracks for hivVax003Vax004
 
 cd /hive/groups/gsid/medical/hiv/hivVax003Vax004
 mkdir posSelection
 cd posSelection
 
 # BLAT /hive/groups/gsid/medical/hiv/hiva244/posSelection/BMsaAaConsensus.fa
 # against hivVax003Vax004 base genome, select psl without header option
 # cut and paste the result into the file BMsa.psl
 
 hgLoadPsl -keep -table=BMsaPsl -nobin hivVax003Vax004 BMsa.psl 
 
 # will get the following error:
 #Processing BMsa.psl
 #Can't start query:
 #LOAD DATA CONCURRENT  INFILE
 '/cluster/hive/groups/gsid/medical/hiv/hivVax003Vax004/posSelection/BMsa.psl'
 INTO TABLE BMsaPsl
 
 #mySQL error 13: Can't get stat of
 '/cluster/hive/groups/gsid/medical/hiv/hivVax003Vax004/posSelection/BMsa.psl'
 (Errcode: 13)
 
 # load manually then
 
 hgsql hivVax003Vax004
 load data local infile "BMsa.psl" into table BMsaPsl;
 quit
 
 # build the positive selection tracks for model 2 and model 8.
 
 gsidPosSelect hivVax003Vax004  BMsaPsl posSelBuild pSelectBModel2
 posSelModel2.bed
 hgLoadBed hivVax003Vax004 posSelModel2 posSelModel2.bed
 
 gsidPosSelect hivVax003Vax004 BMsaPsl posSelBuild pSelectBModel8
 posSelModel8.bed
 hgLoadBed hivVax003Vax004 posSelModel8 posSelModel8.bed
 
 ##########################################################################
 # BUILD THE POSITIVE SELECTION TRACKS FOR VAX003 SUBTYPE AE
 
     ssh hiv1
     mkdir -p /hive/groups/gsid/medical/hiv/posSelection/AE/hivVax003Vax004
     cd /hive/groups/gsid/medical/hiv/posSelection/AE/hivVax003Vax004
 
 # BLAT
 # /cluster/hive/groups/gsid/medical/hiv/posSelection/AE/AEMsaAaConsensus.fa
 # against hivVax003Vax004 base genome, select psl without header option
 # cut and paste the result into the file AEMsa.psl
 
 hgLoadPsl -keep -table=AEMsaPsl -nobin hivVax003Vax004 AEMsa.psl 
 
 # will get the following error:
 #Processing AEMsa.psl
 #Can't start query:
 #LOAD DATA CONCURRENT  INFILE
 #'/cluster/hive/groups/gsid/medical/hiv/posSelection/AE/hivVax003Vax004/AEMsa.ps#l'
 INTO TABLE AEMsaPsl
 
 #mySQL error 13: Can't get stat of
 #'/cluster/hive/groups/gsid/medical/hiv/posSelection/AE/hivVax003Vax004/AEMsa.ps#l'
 (Errcode: 13)
 
 # load manually then
 
 hgsql hivVax003Vax004
 load data local infile "AEMsa.psl" into table AEMsaPsl;
 quit
 
 # build positive selection tracks for model 2 and model 8.
 
 gsidPosSelect hivVax003Vax004  AEMsaPsl posSelBuild pSelectAEModel2
 posSelAEModel2.bed
 hgLoadBed hivVax003Vax004 posSelAEModel2 posSelAEModel2.bed
 
 gsidPosSelect hivVax003Vax004 AEMsaPsl posSelBuild pSelectAEModel8
 posSelAEModel8.bed
 hgLoadBed hivVax003Vax004 posSelAEModel8 posSelAEModel8.bed
 
 ##########################################################################
+# BUILD THE POSITIVE SELECTION TRACKS FOR VAX004 (Done Fan, 3/2/09)
 
+cd /cluster/hive/groups/gsid/medical/hiv/posSelection
+
+mkdir vax004
+cd vax004
+
+# Since there are large number (12) of subclasses and 4 HIV genomes,
+# this has to be automated.  So create the do1, do2, do3 script first. 
+# Please note that the do3 script works on all 4 HIV genomes.
+
+cat << '_EOF_' >do1
+#do1.1
+
+mkdir -p $1
+# start with clean slate
+rm $1/*
+
+cp -p /hive/groups/gsid/medical/vaxGen/fromKeith/posSelection/073008/PAML-outfiles/VAX004-$1-sites.paml $1
+
+cp /hive/groups/gsid/medical/vaxGen/fromKeith/posSelection/073008/data/$1.nex $1
+
+cp /hive/groups/gsid/medical/vaxGen/fromKeith/posSelection/073008/PAML-outfiles/VAX004-$1-sites.paml $1
+
+#do1.2
+
+cd $1
+cat VAX004-$1-sites.paml|grep "+-" >j.tmp
+get1stHalf j.tmp >$1Model2.paml
+cat $1Model2.paml |\
+sed -e 's/+-//g'|\
+sed -e 's/ \* / xxx /g'|\
+sed -e 's/\*//g'|\
+sed -e 's/xxx/\*/g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/ //'|\
+sed -e 's/ /\t/g' > vax004$1Model2.tab
+
+hgLoadSqlTab -notOnServer hgFixed vax004$1Model2 ~/src/hg/lib/posSelectModel.sql vax004$1Model2.tab
+
+get2ndHalf j.tmp >$1Model8.paml
+cat $1Model8.paml |\
+sed -e 's/+-//g'|\
+sed -e 's/ \* / xxx /g'|\
+sed -e 's/\*//g'|\
+sed -e 's/xxx/\*/g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/  / /g'|\
+sed -e 's/ //'|\
+sed -e 's/ /\t/g' > vax004$1Model8.tab
+
+hgLoadSqlTab -notOnServer hgFixed vax004$1Model8 ~/src/hg/lib/posSelectModel.sql vax004$1Model8.tab
+
+rm j.tmp
+
+#do1.3
+
+cat $1.nex|grep 'U\.'|\
+    sed -e 's/  / /g'|\
+    sed -e 's/  / /g'|\
+    sed -e 's/  / /g'|\
+    sed -e 's/  / /g'|\
+    sed -e 's/  / /g'|\
+    sed -e 's/ /\t/g' >vax004$1Msa.tab
+chmod +rx *.tab
+
+hgLoadSqlTab -notOnServer hgFixed vax004$1Msa /hive/groups/gsid/medical/hiv/posSelection/vax004/dnaSeq.sql vax004$1Msa.tab
+cd ..
+
+#do1.4
+
+hgsql -N -e "select concat('do2 ${1} ', id) from hgFixed.vax004${1}Msa limit 1" >doit
+chmod +x doit
+doit
+'_EOF_'
+
+chmod +x do1
+
+cat << '_EOF_' >do2
+gsidAaMsa2 hgFixed vax004$1Msa $2 1 $1/$1Msa.wig $1/$1MsaAaConsensus.fa
+'_EOF_'
+chmod +x do2
+
+cat << '_EOF_' >do3
+# process hivVax003Vax004
+hgsql hgcentralhiv1 -N -e "select concat('blatit ${1} hivVax003Vax004 ', port) from blatServers where db='hivVax003Vax004' and isTrans=1" >doBlat
+chmod +x doBlat
+./doBlat
+
+cd $1
+gsidPosSelect hivVax003Vax004 vax004$1MsaPsl hgFixed vax004$1Model2 posSelVax004$1Model2.bed
+hgLoadBed     hivVax003Vax004 posSelVax004$1Model2                  posSelVax004$1Model2.bed
+
+gsidPosSelect hivVax003Vax004 vax004$1MsaPsl hgFixed vax004$1Model8 posSelVax004$1Model8.bed
+hgLoadBed     hivVax003Vax004 posSelVax004$1Model8                  posSelVax004$1Model8.bed
+cd ..
+
+# process hivmn2
+hgsql hgcentralhiv1 -N -e "select concat('blatit ${1} hivmn2 ', port) from blatServers where db='hivmn2' and isTrans=1" >doBlat
+chmod +x doBlat
+./doBlat
+
+cd $1
+gsidPosSelect hivmn2 vax004$1MsaPsl hgFixed vax004$1Model2 posSelVax004$1Model2.bed
+hgLoadBed     hivmn2 posSelVax004$1Model2                  posSelVax004$1Model2.bed
+
+gsidPosSelect hivmn2 vax004$1MsaPsl hgFixed vax004$1Model8 posSelVax004$1Model8.bed
+hgLoadBed     hivmn2 posSelVax004$1Model8                  posSelVax004$1Model8.bed
+cd ..
+
+# process hivgne8v2
+hgsql hgcentralhiv1 -N -e "select concat('blatit ${1} hivgne8v2 ', port) from blatServers where db='hivgne8v2' and isTrans=1" >doBlat
+chmod +x doBlat
+./doBlat
+
+cd $1
+gsidPosSelect hivgne8v2 vax004$1MsaPsl hgFixed vax004$1Model2 posSelVax004$1Model2.bed
+hgLoadBed     hivgne8v2 posSelVax004$1Model2                  posSelVax004$1Model2.bed
+
+gsidPosSelect hivgne8v2 vax004$1MsaPsl hgFixed vax004$1Model8 posSelVax004$1Model8.bed
+hgLoadBed     hivgne8v2 posSelVax004$1Model8                  posSelVax004$1Model8.bed
+cd ..
+
+# process hiva244
+hgsql hgcentralhiv1 -N -e "select concat('blatit ${1} hiva244 ', port) from blatServers where db='hiva244' and isTrans=1" >doBlat
+chmod +x doBlat
+./doBlat
+
+cd $1
+gsidPosSelect hiva244 vax004$1MsaPsl hgFixed vax004$1Model2 posSelVax004$1Model2.bed
+hgLoadBed     hiva244 posSelVax004$1Model2                  posSelVax004$1Model2.bed
+
+gsidPosSelect hiva244 vax004$1MsaPsl hgFixed vax004$1Model8 posSelVax004$1Model8.bed
+hgLoadBed     hiva244 posSelVax004$1Model8                  posSelVax004$1Model8.bed
+cd ..
+'_EOF_'
+
+chmod +x do3
+
+# Now run the scripts for all subclasses.
+
+do1 Hispanic
+do1 Midwest
+do1 Northeast
+do1 Other
+do1 South
+do1 Southwest
+do1 Westcoast
+do1 White
+do1 Asian
+do1 Black
+do1 pla
+do1 vac
+
+# BTW, do1 calls do2
+
+do3 Hispanic
+do3 Midwest
+do3 Northeast
+do3 Other
+do3 South
+do3 Southwest
+do3 Westcoast
+do3 White
+do3 Asian
+do3 Black
+do3 pla
+do3 vac
+
+##########################################################################