src/hg/makeDb/doc/hg19.txt 1.88
1.88 2010/03/05 17:44:58 chinhli
Add H-InvDB_7.0 support
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.87
retrieving revision 1.88
diff -b -B -U 4 -r1.87 -r1.88
--- src/hg/makeDb/doc/hg19.txt 1 Mar 2010 21:05:26 -0000 1.87
+++ src/hg/makeDb/doc/hg19.txt 5 Mar 2010 17:44:58 -0000 1.88
@@ -8673,4 +8673,90 @@
'INSERT into targetDb values("hg19Kg", "UCSC Genes", \
"hg19", "kgTargetAli", "", "", \
"/gbdb/hg19/targetDb/kgTargetSeq.2bit", 1, now(), "");'
+############################################################################
+# H-INVITATIONAL GENE ANNOTATION DATABASE (Working 2010-02-26, chin)
+ #http://h-invitational.jp/hinv/ahg-db/index.jsp
+ # Create the knownToHInv table mapping knownGene entries to H-Inv IDs,
+ # used for the link on the knownGene details page
+ # Also, create an HINV gene track
+
+ # download CDNA file for H-InvDB_7.0 (Feb 16, 2010; release number taken from the downloads page).
+ # ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/
+ mkdir /cluster/data/hinv/H-InvDB_7.0
+ cd /cluster/data/hinv/H-InvDB_7.0
+ wget --timestamp \
+ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/FCDNA.gz
+
+ # set up assembly work area
+ mkdir /cluster/data/hg19/bed/hinv
+ cd /cluster/data/hg19/bed/hinv
+
+ # extract H-INV ID's and Genbank accessions of mRNAs
+ zcat /cluster/data/hinv/H-InvDB_7.0/FCDNA.gz \
+ | awk '/CDNA_ACCESSION-NO:/ {print $2}' > accessions.txt
+ zcat /cluster/data/hinv/H-InvDB_7.0/FCDNA.gz \
+ | awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' > ids.txt
+ paste accessions.txt ids.txt > queries.txt
+ wc -l ids.txt
+ # XXX need to ask which version of the H-Invitational ID to use
+ # 56419 ids.txt
+
+ # create PSL file from alignments for these mRNA's, extracted from the
+ # table of all aligned mRNA's
+ hgsql hg19 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
+
+ ssh kkstore02
+ cd /cluster/data/hg19/bed/hinv
+ pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
+
+ # (the pslReps command above is used only to generate the PSL file header)
+ pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
+
+ # NEXT TIME, LOAD HInvGeneMrna TABLE AFTER HInv TABLE IS LOADED SO THAT
+ # joinerCheck DOES NOT COMPLAIN.
+ # load track of mrna alignments
+ ssh hgwdev
+ cd /cluster/data/hg19/bed/hinv
+ hgLoadPsl hg19 -table=HInvGeneMrna hinv_mrna.psl
+ hgsql hg19 -s -e \
+ "select distinct(qName) from HInvGeneMrna order by qName" > hg19.mrna
+ hgsql hg17 -s -e \
+ "select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
+ wc -l hg*.mrna
+ # 41023 hg17.mrna
+ # 54974 hg19.mrna
+
+ comm -1 -3 *.mrna > hg19.aligned
+ wc -l hg19.aligned
+ # 14758 (transcripts newly aligned in hg19)
+ comm -2 -3 *.mrna > hg17.aligned
+ wc -l hg17.aligned
+ # 807 (transcripts no longer aligned in hg19)
+ comm -2 -3 ids.txt hg19.mrna > hg19.notaligned
+ wc -l hg19.notaligned
+ # 1445 (transcripts not aligned in hg19 -- checking on why...)
+
+ # also make a table with various useful items for each transcript
+ ssh hgwdev
+ hgsql hg19 < ~/kent/src/hg/lib/HInv.sql
+ cd /cluster/data/hg19/bed/hinv
+ /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > HInv.tab
+ echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg19
+ hgsql hg17 -s -e "select count(*) from HInv"
+ # 41118
+ hgsql hg19 -s -e "select count(*) from HInv"
+ # 56419
+
+ # !!! DO THIS AFTER KG IS BUILT !!!
+ # DONE (4/13/06 Fan).
+ # create table for knownGenes detail page
+ ssh hgwdev
+ cd /cluster/data/hg19/bed/hinv
+ hgMapToGene hg19 HInvGeneMrna knownGene knownToHInv
+
+# QA NOTE (3-6-2006): did a mytouch to update the time for the HInvGeneMrna table
+# (because joinerCheck was complaining during -times check):
+# sudo mytouch hg19 HInvGeneMrna 200602031600.00
+# touch -t 200602031600.00 /var/lib/mysql/hg19/HInvGeneMrna.MYD
+