src/hg/near/makeNear.doc 1.42

1.42 2009/10/16 17:35:33 kent
Adding update procedure for hg19 P2P tracks.
Index: src/hg/near/makeNear.doc
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/near/makeNear.doc,v
retrieving revision 1.41
retrieving revision 1.42
diff -b -B -U 4 -r1.41 -r1.42
--- src/hg/near/makeNear.doc	14 Oct 2009 23:37:40 -0000	1.41
+++ src/hg/near/makeNear.doc	16 Oct 2009 17:35:33 -0000	1.42
@@ -267,9 +267,9 @@
 #vidal 
  
 cat nature04209-s17.xls | gawk '{print $1 "\t" $3 "\t" "1.0"}' > humanVidal.p2p
 
-hgNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+hgLoadNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
 
 #Added to hgNearData/Human/hg18/columnDb.ra
 #-------------
 name vidalP2p
@@ -432,38 +432,52 @@
 
 ---
 #----------------------------------------------------------
 
-## (kg3 hg18 upgrade done galt 2007-03-29) 
-# kg3 hg18 upgrade of Human p2p Protein-to-protein network - P2P columns
-# I used the hgNetDist program to calculate network-distances for all gene pairs from
-# the interaction.p2p files and created *.pathLengths files as output.
-# These were then read by hgLoadNetDist to remap the ids and create the
-# actual mysql tables used.  Note that hgLoadNetDist was split off as a
-# separate program from hgNetDist to make the planned incremental kg3 upgrades
-# much easier and faster, since only the hgLoadNetDist has to be run
-# once the pathLengths files have been created.
+## (kent 2009-10-16) 
+# HPRD p2p update used in hg19
+
+# First go to http://www.hprd.org, follow the download link, fill in the information they
+# request for academic users, and download HPRD_SINGLE_PSIMI_070609.xml.tar.gz into
+# /hive/data/outside/hprd/070609, and then unpack it with
+    cd /hive/data/outside/hprd/070609
+    tar -zxvf HPRD_SINGLE_PSIMI_070609.xml.tar.gz
+
+# Now run the hprdXmlToTab program, which was largely generated by autoDtd/autoXml.
+    hprdXmlToTab HPRD_SINGLE_PSIMI_070609.xml p2p.tab complex.tab
+# interaction count = 40075
+
+# Now use hgNetDist to generate the pathLengths file.  This takes an hour or two.
+    hgNetDist -verbose=2 -weighted -threshold=2 p2p.tab hprd.pathLengths
+
+#----------------------------------------------------------
+
+## (kg3 hg19 upgrade done kent 2009-10-13) 
+# kg3 hg19 creation of Human p2p Protein-to-protein network - P2P columns
+# Note that we can just reuse the pathLengths files calculated in the hg18 build, since
+# these don't depend on an assembly.
+
+#Copy in from hg18 database
+	cp -r /hive/data/genomes/hg18/p2p /hive/data/genomes/hg19
 
 #hprd
-        hgNetDist -verbose=2 -weighted -threshold=2 \
-                /cluster/data/hg18/p2p/hprd/hprd.p2p \
-                /cluster/data/hg18/p2p/hprd/hprd.pathLengths
-        hgLoadNetDist /cluster/data/hg18/p2p/hprd/hprd.pathLengths hg18 humanHprdP2P \
+        hgLoadNetDist /hive/data/outside/hprd/070609/hprd.pathLengths hg19 humanHprdP2P \
                 -sqlRemap="select distinct value, name from knownToHprd"
+	# hgLoadNetDist 86 id-remapping misses, see missing.tab
+
 
 #vidal
-        hgNetDist -verbose=2 -skipFirst -threshold=2 \
-                /cluster/data/hg18/p2p/vidal/humanVidal.p2p \
-                /cluster/data/hg18/p2p/vidal/humanVidal.pathLengths
-        hgLoadNetDist /cluster/data/hg18/p2p/vidal/humanVidal.pathLengths hg18 humanVidalP2P \
+        hgLoadNetDist /hive/data/genomes/hg19/p2p/vidal/humanVidal.pathLengths hg19 humanVidalP2P \
         -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+	# hgLoadNetDist 22 id-remapping misses, see missing.tab
+
 
 #wanker
-        hgNetDist -verbose=2 -skipFirst -threshold=2 \
-                /cluster/data/hg18/p2p/wanker/humanWanker.p2p \
-                /cluster/data/hg18/p2p/wanker/humanWanker.pathLengths
-        hgLoadNetDist /cluster/data/hg18/p2p/wanker/humanWanker.pathLengths hg18 humanWankerP2P \
+        hgLoadNetDist /hive/data/genomes/hg19/p2p/wanker/humanWanker.pathLengths hg19 humanWankerP2P \
         -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+	# hgLoadNetDist 54 id-remapping misses, see missing.tab
+
+
 
 ###############################################################
 # Affy All Exon GeneSorter column. (DONE Andy, 2008-03-17)
 # NOTE - in future doing this in genome database rather than