src/hg/makeDb/doc/hivA244.txt 1.4
1.4 2009/03/03 00:07:49 fanhsu
Added posSelection for vax004 section.
Index: src/hg/makeDb/doc/hivA244.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hivA244.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/hivA244.txt 14 Jan 2009 18:24:49 -0000 1.3
+++ src/hg/makeDb/doc/hivA244.txt 3 Mar 2009 00:07:49 -0000 1.4
@@ -1,494 +1,498 @@
##########################################################################
# CREATE HIVA244 DATABASE (STARTED 9/5/08 DONE 9/14/08, Fan)
# for emacs: -*- mode: sh; -*-
ssh hiv1
mkdir /hive/groups/gsid/medical/hiv/hiva244
cd /hive/groups/gsid/medical/hiv/hiva244
# Receive data file, reference.fasta, from Keith Crandall dated 8/29/08,
# keep the A244 part and name it hivA244.fa.
# translate to nib
mkdir nib
cp ”Vp hivA244.fa chr1.fa
faToNib chr1.fa nib/chr1.nib
CREATING DATABASE
# Create the hiva244 database.
hgsql hivA2442 -e 'create database hiva244'
# CREATING GRP TABLE FOR TRACK GROUPING
echo "create table grp (PRIMARY KEY(NAME)) select * from hivA2442.grp" \
| hgsql hiva244
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION
mkdir -p /gbdb/hiva244/nib
cp -p nib/chr1.nib /gbdb/hiva244/nib
# Load /gbdb/hiva244/nib paths into database and save size info.
hgsql hiva244 < ~/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib hiva244 /gbdb/hiva244/nib /gbdb/hiva244/nib/chr1.fa
echo "select chrom,size from chromInfo" | hgsql -N hiva244 > chrom.sizes
# MAKE HGCENTRALHIV1 ENTRY AND TRACKDB TABLE FOR HIVA244
echo 'insert into defaultDb values("HIV A244 (GP120)", "hiva244");' \
| hgsql -h localhost hgcentralhiv1
echo 'insert into dbDb values("hiva244", "Sept. 2008", \
"/gbdb/hiva244/nib", "HIV A244 (GP120)", "chr1", 1, 2050, \
"HIV A244 (GP120)","Human immunodeficiency virus 1", \
"/gbdb/hiva244/html/description.html", 0, 0, " sequence as of Oct., 2008");' \
| hgsql hgcentralhiv1 -h localhost
echo 'insert into genomeClade values("HIV A244 (GP120)", "other", 200);'\
| hgsql hgcentralhiv1 -h localhost
# Edit that makefile to add hiva244 in all the right places
cd ~src/hg/makeDb/trackDb
vi makefile
mkdir ”Vp hiv/hiva244
cp ”Vp hiv/hivmn2/trackDb hiv/hiva244
make update
make alpha update DBS=hiva244
cvs update
cvs commit makefile
# MAKE hgcentralhiv1 blatServers ENTRY FOR hiva244
# Ask admin to start BLAT server processes
echo 'insert into blatServers values("hiva244", "hiv1", "17796", "1", "0"); \
insert into blatServers values("hiva244", "hiv1", "17797", "0", "0");' \
| hgsql hgcentralhiv1 -h localhost
# CREATE TRACKDB TABLE
cd src/hg/makeDb/trackDb/hiv
mkdir hiva244
cvs add hiva244
cd hiva244
cp ../hivmn2/trackDb.ra . ”Vp
cvs add trackDb.ra
cvs commit trackDb.ra
cd ~/src/hg/makeDb/trackDb
make update DBS=hiva244
make alpha update DBS=hiva244
# copy over tables from hivVax003Vax004
cd /hive/groups/gsid/medical/hiv/hiva244
echo 'cp -p /data/mysql/hivVax003Vax004/$1.* /data/mysql/hiva244' >copy1Table
chmod +x copy1Table
copy1Table dnaSeq
copy1Table aaSeq
copy1Table gsidSubjInfo
copy1Table gsidClinicRec
copy1Table gsIdXref
copy1Table vax003AEMsa
copy1Table vax003BMsa
copy1Table vax003BMsa
copy1Table vax004Msa
# CREATE VAX004 TRACK
cd /hive/groups/gsid/medical/hiv/hiva244
mkdir vax004
cd vax004
# get vax004 sequences
hgsql hiva244 -N -e 'select * from dnaSeq where id like "%U%"' >vax004.tab
# create .fa file
tabToFa vax004
mkdir -p /gbdb/hiva244/vax004
cp -p vax004.fa /gbdb/hiva244/vax004/vax004.fa
hgLoadSeq -replace hiva244 /gbdb/hiva244/vax004/vax004.fa
# BLAT
gfClient -minScore=200 -minIdentity=80 -nohead hiv1.cse.ucsc.edu 17797 /gbdb/hiva244/nib \
-out=psl -t=dna -q=dna vax004.fa vax004.psl
# count the result
wc *.psl
cut -f 10 vax004.psl |wc
cut -f 10 vax004.psl |sort -u |wc
# load the psl result into vax004 table
hgLoadPsl hiva244 vax004.psl
# hgLoadPsl has some file permission problem. Finish this by manually load the psl.tab file.
hgsql hivA244 -e 'load data local infile "psl.tab" into table vax004'
# CREATE VAX003 TRACK
cd /hive/groups/gsid/medical/hiv/hiva244
mkdir vax003
cd vax003
# get vax003 sequences
hgsql hiva244 -N -e 'select * from dnaSeq where id like "%T%"' >vax003.tab
# create .fa file
tabToFa vax003
mkdir -p /gbdb/hiva244/vax003
cp -p vax003.fa /gbdb/hiva244/vax003/vax003.fa
hgLoadSeq -replace hiva244 /gbdb/hiva244/vax003/vax003.fa
# BLAT
gfClient -minScore=200 -minIdentity=80 -nohead hiv1.cse.ucsc.edu 17797 /gbdb/hiva244/nib \
-out=psl -t=dna -q=dna vax003.fa vax003.psl
# count the result
wc *.psl
cut -f 10 vax003.psl |wc
cut -f 10 vax003.psl |sort -u |wc
# load the psl result into vax003 table
hgLoadPsl hiva244 vax003.psl
# hgLoadPsl has some file permission problem. Finish this by manually load the psl.tab file.
hgsql hiva244 -e 'load data local infile "psl.tab" into table vax003'
#CREATE HIVGENE TRACK
cd /hive/groups/gsid/medical/hiv/hiva244
mkdir hivGene
cd hivGene
# CREATE hivGene TRACK
# during initial hiv1 build, landMark.txt was created by:
# copy the sequence data from HIV Sequence Database web page:
# http://www.hiv.lanl.gov/content/sequence/HIV/REVIEWS/HXB2.html
# create raw sequence file, landMark.txt
# copy over this file
cp ”Vp /hive/groups/gsid/medical/hiv/hiv1/hivGene/landMark.txt .
# Process landMark.txt into hivGene.fa
faToTab -type=protein landMark.txt landMark.tab
cut -f 1 landMark.tab |toLower stdin j1
cut -f 2 landMark.tab >j2
paste j1 j2 >hivGene.tab
rm j1 j2
# create hivGene.fa
tabToFa hivGene
# BLAT against base genome
gfClient localhost 17796 /gbdb/hiva244/nib -out=psl -t=dnax -q=prot hivGene.fa hivGene.psl
# load into temp table, testPsl
hgLoadPsl hiva244 table=testPsl hivGene.psl
# finish load manually to overcome permission problem
hgsql hiva244 ”Ve 'load data local infile "psl.tab" into table testPsl'
# create initial bed file
hgsql hiva244 -N -e \
'select "chr1", tStart, tEnd, qName from testPsl where (tEnd-tStart)/3/qSize>0.70'\
>hivGene.bed0
# remove tat and rev, upper case ENV, POL, and GAG
cat hivGene.bed0 |grep -v tat |grep -v rev|sed -e 's/env/ENV/' |sed -e 's/pol/POL/'|\
sed -e 's/gag/GAG/' >hivGene.bed1
# The alignment for A244 base genome is just too poor for any of those V* regions
# This track really does not provide much significant info.
# manually add ENV entry
cp hivGene.bed1 hivGene.bed
vi hivGene.bed
# load it into hivGene table
hgLoadBed hiva244 hivGene hivGene.bed
hgsql hiva244 -e 'drop table testPsl'
# CREATE INTERPRO TRACK
cd /hive/groups/gsid/medical/hiv/hiva244
mkdir interPro
cd interPro
# get all HIV-1 domain sequences
# use what we built for hiv1 before
cp /hive/groups/gsid/medical/hiv/hiv1/interPro/interProHiv1.fa interProhiva244.fa
# BLAT against base genome
gfClient localhost 17796 /gbdb/hiva244/nib -out=psl -t=dnax -q=prot interProhiva244.fa interProhiva244.psl
# load it into a temp table
hgLoadPsl hiva244 table=testPsl interProhiva244.psl
# finish above manually.
hgsql hiva244 ”Ve ”„load data local infile "psl.tab" into table testPsl”¦
# create bed file from this temp table
hgsql hiva244 -N -e \
'select "chr1", tStart, tEnd, qName from testPsl where (tEnd-tStart)/3/qSize>0.42' \
> interProhiva244.bed
# load the bed file into the track table
hgLoadBed hiva244 interPro interProhiva244.bed
# drop the temp table.
hgsql hiva244 -e 'drop table testPsl'
# CREATE CONSERVATION TRACKS
mkdir -p /hive/groups/gsid/medical/hiv/hiva244/conservation
cd /hive/groups/gsid/medical/hiv/hiva244/conservation
# create the .wig file and .fa file of the consensus sequence.
gsidMsa hiva244 vax003BMsa A244-gp120 1 vax003BCons.wig vax003BConsensus.fa
gsidMsa hiva244 vax003AEMsa A244-gp120 1 vax003AECons.wig vax003AEConsensus.fa
# encode and load the wig file
wigEncode vax003BCons.wig stdout vax003BCons.wib \
| hgLoadWiggle hiva244 vax003BCons stdin
wigEncode vax003AECons.wig stdout vax003AECons.wib \
| hgLoadWiggle hiva244 vax003AECons stdin
# copy .wib file to /gbdb
mkdir -p /gbdb/hiva244/wib
cp vax003BCons.wib /gbdb/hiva244/wib
cp vax003AECons.wib /gbdb/hiva244/wib
# do the same for protein conservation track
mkdir aa
cd aa
# create .wig file
gsidAaMsa2 hiva244 vax003BMsa A244-gp120 1 vax003BAaCons.wig vax003AaBConsensus.fa
gsidAaMsa2 hiva244 vax003AEMsa A244-gp120 1 vax003AEAaCons.wig vax003AaAEConsensus.fa
# encode and load the .wib file
wigEncode vax003BAaCons.wig stdout vax003BAaCons.wib \
| hgLoadWiggle hiva244 vax003BAaCons stdin
cp vax003BAaCons.wib /gbdb/hiva244/wib
wigEncode vax003AEAaCons.wig stdout vax003AEAaCons.wib \
| hgLoadWiggle hiva244 vax003AEAaCons stdin
cp vax003AEAaCons.wib /gbdb/hiva244/wib
# CREATE MAF TRACKS FOR vax003AE STRAIN
cd /hive/groups/gsid/medical/hiv/hiva244
mkdir -p msa/AE
cd msa/AE
# create a script file, doall
hgsql hiva244 -N -e \
'select id from vax003AEMsa where id like "%T%"'\
|sed -e 's/ss/do1 ss/g' >doall
# create one line script file, do1, with the following line in it:
hgsql hiva244 -N -e "select id, seq from vax003AEMsa where id='${1}'"
chmod +x do*
# run the script to get the .tab file with all MSA sequences of VAX003AE
doall >a244.tab
# convert .tab into .fa file
tabToFa a244
# grab the base alignment sequence
echo ">hiva244" >a244.aln
hgsql hiva244 -N -e 'select seq from vax003AEMsa where id="A244-gp120"' >> a244.aln
# prepare an interium file, jjAll.mfa
cat a244.aln a244.fa >jjAll.mfa
echo = >>jjAll.mfa
# Run xmfaToMafA244AE to create a precursor file for the final .maf
xmfaToMafA244AE jjAll.mfa j.out org1=hiva244
cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
# rm jjAll.mfa j.out
cat chr1.tmp |sed -e 's/ss_T/T/g' >chr1.maf
# copy .maf to /gbdb.
mkdir -p /gbdb/hiva244/vax003AEMaf
cp chr1.maf /gbdb/hiva244/vax003AEMaf -p
echo before load
hgLoadMaf hiva244 vax003AEMaf
# create another copy for protein MAF.
mkdir -p /gbdb/hiva244/vax003AEAaMaf
cp -p chr1.maf /gbdb/hiva244/vax003AEAaMaf
hgLoadMaf hiva244 vax003AEAaMaf
# CREATE MAF TRACKS FOR vax003B STRAIN
cd /hive/groups/gsid/medical/hiv/hiva244
mkdir -p msa/B
cd msa/B
# create a script file, doall
hgsql hiva244 -N -e \
'select id from vax003BMsa where id like "%T%"'\
|sed -e 's/ss/do1 ss/g' >doall
# create one line script file, do1, with the following line in it:
hgsql hiva244 -N -e "select id, seq from vax003BMsa where id='${1}'"
chmod +x do*
# run the script to get the .tab file with all MSA sequences of VAX003B
doall >a244.tab
# convert .tab into .fa file
tabToFa a244
# grab the base alignment sequence
echo ">hiva244" >a244.aln
hgsql hiva244 -N -e 'select seq from vax003BMsa where id="A244-gp120"' >> a244.aln
# prepare an interium file, jjAll.mfa
cat a244.aln a244.fa >jjAll.mfa
echo = >>jjAll.mfa
# Run xmfaToMafA244B to create a precursor file for the final .maf
xmfaToMafA244B jjAll.mfa j.out org1=hiva244
cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp
# rm jjAll.mfa j.out
cat chr1.tmp |sed -e 's/ss_T/T/g' >chr1.maf
# copy .maf to /gbdb.
mkdir -p /gbdb/hiva244/vax003BMaf
cp chr1.maf /gbdb/hiva244/vax003BMaf -p
echo before load
hgLoadMaf hiva244 vax003BMaf
# create another copy for protein MAF.
mkdir -p /gbdb/hiva244/vax003BAaMaf
cp -p chr1.maf /gbdb/hiva244/vax003BAaMaf
hgLoadMaf hiva244 vax003BAaMaf
##########################################################################
# REBUILD THE gsidClinicRecWithSeq TABLE (DONE 11/03/08, Fan)
# See details in hivVax003Vax004.txt.
###########################################################
# BUILD THE POSITIVE SELECTION TRACKS FOR VAX003 SUBTYPE B
ssh hiv1
mkdir -p /hive/groups/gsid/medical/hiv/posSelection/B/hiva244
cd /hive/groups/gsid/medical/hiv/posSelection/B/hiva244
# BLAT /cluster/hive/groups/gsid/medical/hiv/posSelection/B/BMsaAaConsensus.fa
# against hiva244 base genome, select psl without header option
# cut and paste the result into the file BMsa.psl
hgLoadPsl -keep -table=BMsaPsl -nobin hiva244 BMsa.psl
# will get the following error:
#Processing BMsa.psl
#Can't start query:
#LOAD DATA CONCURRENT INFILE
'/cluster/hive/groups/gsid/medical/hiv/hiva244/posSelection/BMsa.psl' INTO
TABLE BMsaPsl
#mySQL error 13: Can't get stat of
'/cluster/hive/groups/gsid/medical/hiv/hiva244/posSelection/BMsa.psl'
(Errcode: 13)
# load manually then
hgsql hiva244
load data local infile "BMsa.psl" into table BMsaPsl;
quit
# build positive selection tracks for model 2 and model 8.
gsidPosSelect hiva244 BMsaPsl posSelBuild pSelectBModel2 posSelBModel2.bed
hgLoadBed hiva244 posSelBModel2 posSelBModel2.bed
gsidPosSelect hiva244 BMsaPsl posSelBuild pSelectBModel8 posSelBModel8.bed
hgLoadBed hiva244 posSelBModel8 posSelBModel8.bed
##########################################################################
# BUILD THE POSITIVE SELECTION TRACKS FOR VAX003 SUBTYPE AE
ssh hiv1
mkdir -p /hive/groups/gsid/medical/hiv/posSelection/AE/hiva244
cd /hive/groups/gsid/medical/hiv/posSelection/AE/hiva244
# BLAT
# /cluster/hive/groups/gsid/medical/hiv/posSelection/AE/AEMsaAaConsensus.fa
# against hiva244 base genome, select psl without header option
# cut and paste the result into the file AEMsa.psl
hgLoadPsl -keep -table=AEMsaPsl -nobin hiva244 AEMsa.psl
# will get the following error:
#Processing AEMsa.psl
#Can't start query:
#LOAD DATA CONCURRENT INFILE
'/cluster/hive/groups/gsid/medical/hiv/hiva244/posSelection/AEMsa.psl' INTO
TABLE AEMsaPsl
#mySQL error 13: Can't get stat of
'/cluster/hive/groups/gsid/medical/hiv/hiva244/posSelection/AEMsa.psl'
(Errcode: 13)
# load manually then
hgsql hiva244
load data local infile "AEMsa.psl" into table AEMsaPsl;
quit
# build positive selection tracks for model 2 and model 8.
gsidPosSelect hiva244 AEMsaPsl posSelBuild pSelectAEModel2
posSelAEModel2.bed
hgLoadBed hiva244 posSelAEModel2 posSelAEModel2.bed
gsidPosSelect hiva244 AEMsaPsl posSelBuild pSelectAEModel8 posSelAEModel8.bed
hgLoadBed hiva244 posSelAEModel8 posSelAEModel8.bed
##########################################################################
+# BUILD THE POSITIVE SELECTION TRACKS FOR VAX004 (Done Fan, 3/2/09)
+
+# Please see the corresponding section in hivVax003Vax004.txt for details.
+##########################################################################