src/hg/makeDb/doc/hg18.txt 1.381
1.381 2009/09/09 18:13:50 hartera
Documented code changes and reloaded vegaPep table with only those proteins whose transcripts are in vegaGtp.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.380
retrieving revision 1.381
diff -b -B -U 4 -r1.380 -r1.381
--- src/hg/makeDb/doc/hg18.txt 4 Sep 2009 16:51:03 -0000 1.380
+++ src/hg/makeDb/doc/hg18.txt 9 Sep 2009 18:13:50 -0000 1.381
@@ -28615,9 +28615,9 @@
# Converted stdin, upper limit 11.63, lower limit -28.64
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 sfrs1ConsensusSites sfrs1ConsensusSites.wig
ln -s $(pwd -P)/sfrs1ConsensusSites.wib /gbdb/hg18/wib/
############################################################################
-# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-08-04 - 2009-09-04, hartera)
+# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-08-04 - 2009-09-09, hartera)
# Needs updating as the current version is build 33.
# Download the human VEGA Genes posted on ftp site on 2009-03-31
# 2009-08-03 (hartera) - Added code to register track handler for
# vegaGeneComposite.
@@ -28628,10 +28628,14 @@
# 2009-08-22 - Finished code for adding Vega report URLs to the details pages.
# Loaded the vegaGtp table.
# 2009-09-01 - 2009-09-02 (hartera). Loaded a vegaPep table for the protein
# sequence link on the details pages.
-# 2009-08-04 Re-load all tables as some reverted to the older version during
+# 2009-09-04 Re-load all tables as some reverted to the older version during
# mySQL 5 upgrade.
+# 2009-09-08 - 2009-09-09 Changed the details page message shown when no
+# protein is available and updated trackDb to make vegaGene items a
+# darker blue colour. Reloaded vegaPep after removing proteins whose
+# transcripts are not in vegaGtp to make all.joiner happy.
mkdir /hive/data/genomes/hg18/bed/vega35
cd /hive/data/genomes/hg18/bed/vega35
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \
@@ -28733,5 +28737,46 @@
hgsql -e 'drop table vegaGtp;' hg18
hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab
hgsql -e 'drop table vegaPep;' hg18
hgPepPred hg18 tab vegaPep vegaPep.hg18.fa.tab
+
+ # 2009-09-08 (hartera). Changed the details page message shown when no
+ # protein sequence is available to be more explanatory: "Non-protein
+ # coding gene or gene fragment, no protein prediction available." Changed
+ # the colouring for the vegaGene subtrack to be darker blue so there is
+ # more of a contrast between vegaGene and vegaPseudoGene subtracks.
+
+ # 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins
+ # that have a transcript ID in vegaGtp.
+ # all.joiner is complaining as there are about 23,000 extra proteins in
+ # vegaPep that do not have transcripts in vegaGtp. Decided to remove these
+ # and e-mailed the HAVANA group to ask about the discrepancy.
+ cd /hive/data/genomes/hg18/bed/vega35
+ awk '{print $2}' vegaGtp.tab | sort | uniq > vegaGtp.tx.ids
+ awk '{print $1}' vegaPep.hg18.fa.tab | sort | uniq > vegaPep.tx.ids
+ wc -l *.tx.ids
+ # 81244 vegaGtp.tx.ids
+ # 60003 vegaPep.tx.ids
+ # Number of transcripts that have a protein ID:
+ hgsql -Ne 'select transcript from vegaGtp where protein like "OTTHUMP%";' \
+ hg18 | sort | uniq > vegaGtpWithProt.tx.ids
+ wc -l vegaGtpWithProt.tx.ids
+ # 36747 vegaGtpWithProt.tx.ids
+
+ # Find the transcript IDs that are common to both vegaGtp and vegaPep.
+ comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids
+ wc -l pepandGtp.tx.ids
+ # 36747 pepandGtp.tx.ids
+ comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l
+ # 36747
+ # Therefore all the vegaGtp transcripts with a protein ID are in the
+ # protein FASTA file.
+ hgsql -Ne 'select * from vegaPep as p, vegaGtp as g where g.protein \
+ like "OTTHUMP%" and p.name = g.transcript;' hg18 \
+ > vegaPepOnlyInGtp.hg18.fa.tab
+ wc -l vegaPepOnlyInGtp.hg18.fa.tab
+ # 36747 vegaPepOnlyInGtp.hg18.fa.tab
+
+ hgsql -e 'drop table vegaPep;' hg18
+ hgPepPred hg18 tab vegaPep vegaPepOnlyInGtp.hg18.fa.tab
+
############################################################################