src/hg/makeDb/doc/hg19.txt 1.88

1.88 2010/03/05 17:44:58 chinhli
Add H-InvDB_7.0 support
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.87
retrieving revision 1.88
diff -b -B -U 4 -r1.87 -r1.88
--- src/hg/makeDb/doc/hg19.txt	1 Mar 2010 21:05:26 -0000	1.87
+++ src/hg/makeDb/doc/hg19.txt	5 Mar 2010 17:44:58 -0000	1.88
@@ -8673,4 +8673,90 @@
       'INSERT into targetDb values("hg19Kg", "UCSC Genes", \
          "hg19", "kgTargetAli", "", "", \
          "/gbdb/hg19/targetDb/kgTargetSeq.2bit", 1, now(), "");'
 
+############################################################################
+# H-INVITATIONAL GENE ANNOTATION DATABASE (WORKING 2010-02-26, chin)
+    #http://h-invitational.jp/hinv/ahg-db/index.jsp
+    # Create knownGene table to reference HINV gene ID's
+    #  for link on knownGenes details page
+    # Also, create an HINV gene track
+
+    # download CDNA file H-InvDB_7.0 (Feb 16, 2010 -- got release # from downloads page).
+    #  ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/
+   mkdir /cluster/data/hinv/H-InvDB_7.0
+   cd /cluster/data/hinv/H-InvDB_7.0
+   wget --timestamp \
+ftp://ftp.ddbj.nig.ac.jp/mirror_database/hinv/jbirc_ff/annotation/FCDNA.gz
+
+    # set up assembly work area
+    mkdir /cluster/data/hg19/bed/hinv
+    cd /cluster/data/hg19/bed/hinv
+
+    # extract H-INV ID's and Genbank accessions of mRNAs
+    zcat /cluster/data/hinv/H-InvDB_7.0/FCDNA.gz \
+      |  awk '/CDNA_ACCESSION-NO:/ {print $2}' > accessions.txt
+    zcat /cluster/data/hinv/H-InvDB_7.0/FCDNA.gz \
+      | awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}'  > ids.txt
+    paste accessions.txt ids.txt > queries.txt
+    wc -l ids.txt
+    # XXX: need to ask which version of the H-Invitational ID to use
+    #   56419 ids.txt
+
+    # create PSL file from alignments for these mRNA's, extracted from the
+    #       table of all aligned mRNA's
+    hgsql hg19 -s -e "SELECT * FROM all_mrna"  | cut -f 2- > all_mrna.tab
+
+    ssh kkstore02
+    cd /cluster/data/hg19/bed/hinv
+    pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
+
+    # (the pslReps call above is used only to generate the PSL file header)
+    pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
+
+    # NEXT TIME, LOAD THE HInvGeneMrna TABLE AFTER THE HInv TABLE IS LOADED TO
+    # KEEP joinerCheck FROM COMPLAINING.
+    # load track of mrna alignments
+    ssh hgwdev
+    cd /cluster/data/hg19/bed/hinv
+    hgLoadPsl hg19 -table=HInvGeneMrna hinv_mrna.psl
+    hgsql hg19 -s -e \
+        "select distinct(qName) from HInvGeneMrna order by qName" > hg19.mrna
+    hgsql hg17 -s -e \
+        "select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
+    wc -l hg*.mrna
+        # 41023 hg17.mrna
+        # 54974 hg19.mrna
+
+    comm -1 -3 *.mrna > hg19.aligned
+    wc -l hg19.aligned
+        # 14758 (transcripts newly aligned in hg19)
+    comm -2 -3 *.mrna > hg17.aligned
+    wc -l hg17.aligned
+        # 807 (transcripts no longer aligned in hg19)
+    comm -2 -3 ids.txt hg19.mrna > hg19.notaligned
+    wc -l hg19.notaligned
+        # 1445 (transcripts not aligned in hg19 -- checking on why...)
+
+    # also make a table with various useful items for each transcript
+    ssh hgwdev
+    hgsql hg19 < ~/kent/src/hg/lib/HInv.sql
+    cd /cluster/data/hg19/bed/hinv
+    /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > HInv.tab
+    echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg19
+    hgsql hg17 -s -e "select count(*) from HInv"
+        # 41118
+    hgsql hg19 -s -e "select count(*) from HInv"
+        # 56419
+
+    # !!! DO THIS AFTER KG IS BUILT !!!
+    # DONE (4/13/06 Fan).
+    # create table for knownGenes detail page
+    ssh hgwdev
+    cd /cluster/data/hg19/bed/hinv
+    hgMapToGene hg19 HInvGeneMrna knownGene knownToHInv
+
+# QA NOTE (3-6-2006): did a mytouch to update the time for the HInvGeneMrna table
+# (because joinerCheck was complaining during -times check):
+# sudo mytouch hg19 HInvGeneMrna 200602031600.00
+# touch -t 200602031600.00 /var/lib/mysql/hg19/HInvGeneMrna.MYD
+