src/hg/near/makeNear.doc 1.42

1.42 2009/10/16 17:35:33 kent
Adding update procedure for hg19 P2P tracks.
Index: src/hg/near/makeNear.doc
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/near/makeNear.doc,v
retrieving revision 1.41
retrieving revision 1.42
diff -b -B -U 1000000 -r1.41 -r1.42
--- src/hg/near/makeNear.doc	14 Oct 2009 23:37:40 -0000	1.41
+++ src/hg/near/makeNear.doc	16 Oct 2009 17:35:33 -0000	1.42
@@ -1,618 +1,632 @@
 # These are instructions for building the tables used by the Gene Sorter
 # (hgNear) -- these are often also used by hgGene.
 # Don't start these until there is a knownGene track, or a track with the
 # usual genePred fields followed by a proteinID field with SwissProt IDs.
 
 # for emacs: -*- mode: sh; -*-
 
 # Set up db variable (you'll actually need to redo this
 # after each ssh until we work out a better system.)
 ssh hgwdev
 set db = hg19
 
 ########################################################################
 # Cluster together various alt-splicing isoforms.
 hgClusterGenes $db knownGene knownIsoforms knownCanonical
 
 
 ########################################################################
 # Use doHgNearBlastp.pl to run blastp on knownGene proteins vs. self and 
 # vs. other Gene Sorter organisms (most recent db for each organism).
 mkdir -p /cluster/data/$db/bed/hgNearBlastp
 cd /cluster/data/$db/bed/hgNearBlastp
 
 # Get peptide sequences for each db's hgNear geneset:
 pepPredToFa $db knownGenePep $db.known.faa
 pepPredToFa mm8 knownGenePep mm8.known.faa
 pepPredToFa rn4 knownGenePep rn4.known.faa
 pepPredToFa danRer3 ensPep danRer3.ensPep.faa
 pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
 pepPredToFa ce2 sangerPep ce2.sangerPep.faa
 pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
 
 # Configure doHgNearBlastp.pl.  Use recipBest for all organisms more distant 
 # than mammal-mammal.  For mammal-mammal, synBlastp.csh is used instead 
 # after loading the table with doHgNearBlast.pl results -- see below.
 # NOTE: If $db is not a mammal, then recipBest is probably appropriate 
 # for all queryDbs.  
 # NOTE: if $db does not have knownGene, then change targetGenesetPrefix 
 # to the root of whatever geneset hgNear uses for $db (e.g. sanger for 
 # sangerGene, flyBase for flyBaseGene etc.).
 cat << _EOF_ > config.ra
 # Latest human vs. other Gene Sorter orgs:
 # mouse, rat, zebrafish, fly, worm, yeast
 
 targetGenesetPrefix known
 targetDb hg19
 queryDbs mm8 rn4 danRer3 dm3 ce2 sacCer1
 recipBest danRer3 dm3 ce2 sacCer1
 
 hg19Fa    /cluster/data/hg19/bed/hgNearBlastp/hg19.known.faa
 mm8Fa     /cluster/data/hg19/bed/hgNearBlastp/mm8.known.faa
 rn4Fa     /cluster/data/hg19/bed/hgNearBlastp/rn4.known.faa
 danRer3Fa /cluster/data/hg19/bed/hgNearBlastp/danRer3.ensPep.faa
 dm3Fa     /cluster/data/hg19/bed/hgNearBlastp/dm3.flyBasePep.faa
 ce2Fa     /cluster/data/hg19/bed/hgNearBlastp/ce2.sangerPep.faa
 sacCer1Fa /cluster/data/hg19/bed/hgNearBlastp/sacCer1.sgdPep.faa
 
 buildDir /cluster/data/hg19/bed/hgNearBlastp
 scratchDir /san/sanvol1/scratch/hg19HgNearBlastp
 _EOF_
 
 # Run with -noLoad so we can eyeball files, manually load $db tables now,
 # and later overload other databases' hgBlastTab tables.
 doHgNearBlastp.pl -noLoad config.ra >& do.log &
 tail -f do.log
 
 # Run the load scripts for dm3 tables manually as suggested by the
 # output of doHgNearBlastp.pl:
 
 # Load self-blastp (knownBlastp) and $db.??BlastTab immediately:
 # *** -noLoad was specified -- you can run this script manually to load hg19 tables:
     run.hg19.hg19/loadPairwise.csh
 # *** -noLoad was specified -- you can run these scripts manually to load hg19 tables:
     run.hg19.mm8/loadPairwise.csh
     run.hg19.rn4/loadPairwise.csh
     run.hg19.danRer3/loadPairwise.csh
     run.hg19.dm3/loadPairwise.csh
     run.hg19.ce2/loadPairwise.csh
     run.hg19.sacCer1/loadPairwise.csh
 # For mammal-mammal pairs, run synBlastp.csh:
 synBlastp.csh $db mm8
 synBlastp.csh $db rn4
 
 # **When hgNearOk is set for $db on the RR**, load *.hgBlastTab and 
 # make a separate push request for them.
 # *** -noLoad was specified -- you can run these scripts manually to load hgBlastTab in query databases:
     run.mm8.hg19/loadPairwise.csh
     run.rn4.hg19/loadPairwise.csh
     run.danRer3.hg19/loadPairwise.csh
     run.dm3.hg19/loadPairwise.csh
     run.ce2.hg19/loadPairwise.csh
     run.sacCer1.hg19/loadPairwise.csh
 # For mammal-mammal pairs, run synBlastp.csh:
 synBlastp.csh mm8 $db
 synBlastp.csh rn4 $db
 
 
 ########################################################################
 # MAPPINGS TO OTHER SETS OF IDS
 # Create table that maps between known genes and RefSeq
 hgMapToGene $db refGene knownGene knownToRefSeq
 
 # Create table that maps between known genes and LocusLink
 echo "select mrnaAcc,locusLinkId from refLink" | hgsql -N $db > refToLl.txt
 hgMapToGene $db refGene knownGene knownToLocusLink -lookup=refToLl.txt
 
 # Create table that maps between known genes and Pfam domains
 hgMapViaSwissProt $db knownGene name proteinID Pfam knownToPfam
 
 # Create table that maps between known genes and other datasets when
 # they exist, e.g.:
 hgMapToGene $db HInvGeneMrna knownGene knownToHInv
 hgMapToGene $db allenBrainAli -type=psl knownGene knownToAllenBrain 
 hgMapToGene $db ensGene knownGene knownToEnsembl
 hgMapToGene $db snp129 knownGene knownToCdsSnp -all -cds
 
 
 ########################################################################
 # EXPRESSION: MAPPING + DISTANCE
 
 # Create a table that maps between known genes and 
 # the nice affyUcla expression data.
 hgMapToGene "-type=bed 12" $db affyUclaNorm knownGene knownToU133
 
 # Create expression distance table.  This will take about an hour.
 cd ~/src/hg/near/hgExpDistance
 hgExpDistance $db affyUcla affyUclaExp knownExpDistance -weights=affyUcla.weight -lookup=knownToU133
 
 # Format and load the GNF data
 cd /cluster/data/$db/bed
 mkdir affyGnf95
 cd affyGnf95
 affyPslAndAtlasToBed -newType ../affyU95.psl /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 affyGnfU95.tab affyGnfU95Exps.tab -shortOut
 hgsql $db < ~/src/hg/affyGnf/affyGnfU95.sql
 
 # Create table that maps between known genes and 
 # the GNF data.
 hgMapToGene $db affyU95 knownGene knownToU95
 cd ~/src/hg/near/hgExpDistance
 #hgExpDistance $db affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95
 hgExpDistance $db hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95MedianExps gnfU95Distance -lookup=knownToU95
 
 #For worm did instead
 #hgExpDistance ce1 kimLifeCycleMedian kimWormLifeCycleMedian kimExpDistance
 hgExpDistance ce1 hgFixed.kimWormLifeMedianRatio hgFixed.kimWormLifeMedianExps kimExpDistance
 
 #For mouse did instead
 hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
 hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
 hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
 
 ########################################################################
 # Make sure that GO database is up to date.
 See README in /cluster/store1/geneOntology.
 
 
 ########################################################################
 ## (markd did this when??)
 # create Rankprop homology score and PSI-BLAST tables.
 # requires rankprop code from Bill Noble <noble@gs.washington.edu>
 # which includes scripts to build these tables on the cluster.
 
     # run psiblast
 
       ssh hgwdev
       mkdir /cluster/bluearc/markd/rankprop
       cd  /cluster/bluearc/markd/rankprop
      
       # link rankprop to work directory:
       ln -s /cluster/store7/markd/rankprop/rankprop/bin .
 
       mkdir -p blastRun
       cd blastRun
 
       # hs.sw+tr is human swissprot+trembl (on hgwdev)
       ../bin/cluster-blast-setup hs.sw+tr
 
       ssh kk
       cd /cluster/bluearc/markd/rankprop/blastRun/hs.sw+tr
       para create jobs.para
       para try ,...
       # one job crashed on Query=P01722, no idea why; remove it from database
       # and rerun
 
       # finish up run
       cd ..
       ../bin/cluster-blast-finishup hs.sw+tr
 
   # run rankprop
 
       # rankprop run, on rack9 due to memory sies of kk9
       ssh kk9  
       cd /cluster/bluearc/markd/rankprop/rankpRun/
       ../bin/cluster-rankprop-setup -maxHits 1000 hs.sw+tr/max1k 10
       cd hs.sw+tr
       para create jobs.para
       para try, push, blah
       cd ..
       ../bin/cluster-rankprop-finishup  hs.sw+tr/max1k
 
 
       # load database
       cd /cluster/bluearc/markd/rankprop/results
       spLoadRankProp -noKgIdFile=hs.sw+tr/max1k.hg17.nokg.spids hg17 rankProp hs.sw+tr/max1k.rankp.gz >&hs.sw+tr/max1k.hg17.log
       spLoadPsiBlast hg17 spPsiBlast hs.sw+tr.eval.gz
 
 #---------
 
 ## (aamp andy pohl did this when??)
 # Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22
 hgExpDistance hg17 hgFixed.humanNormalRatio hgFixed.humanNormalExps humanNormalDistance -lookup=knownToLocusLink
 
 # Mouse landscape data.
 hgExpDistance mm6 hgFixed.mouseLandscape hgFixed.mouseLandscapeExps mouseLandscapeDistance -lookup=knownToXM
 
 # Remaking some distance tables after the knownGene updates (DONE 01/24/2006 Andy)
 # (current working directory irrelevant... it's all database)
 hgExpDistance mm6 hgFixed.mouseLandscape hgFixed.mouseLandscapeExps mouseLandscapeDistance -lookup=knownToXM
 hgExpDistance hg17 hgFixed.gladHumES hgFixed.gladHumESExps gladHumESDistance -lookup=knownToGnfAtlas2
 
 #----------------------------------------------------------
 
 ## (galt 2005-06-03)
 # p2p Protein-to-protein network - P2P column and sort order
 # I wrote the hgNetDist program to calculate network-distances for all gene pairs from gene-to-gene edges in input data .tab,
 # using the Floyd-Warshall dynamic programming algorithm.
 # These .tab files are from Josh Stuart /cluster/home/jstuart/Data/Interaction/P2P/{Worm,Fly,Yeast}/Compendium/data.tab
 # I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/
 # added entries to hgNearData/$species/{orderDb,columbDb}.ra
 # added hgNearData/$species/p2p.html
 # added hgNearData/p2p.html
 #  
 #yeast: (1.5 hours for about 5000 genes, 24452 edges)
 hgNetDist yeastP2P.tab sacCer1 yeastP2P -threshold=3
  
 #fly: (3 hours for about 6500 genes, 19993 edges)
 # The temporary table bdgpGeneFb2Bdgp was constructed just for fly from this sql command:
 # create table bdgpGeneFb2Bdgp as
 # select flyBaseId, name bdgpName from bdgpGene a, bdgpGeneInfo b
 # where SUBSTRING_INDEX(name, "-R", 1) = bdgpName order by flyBaseId, bdgpName;
 #
 hgNetDist flytest.tab dm1 flyP2P -threshold=2 -sqlRemap="select flyBaseId, bdgpName from bdgpGeneFb2Bdgp"
  
 #worm: (15 minutes for about 2514 genes, 3871 edges - interaction data available is small)
 hgNetDist wormP2P.tab ce2 wormP2P -threshold=2
  
 
 #----------------------------------------------------------
 
 # FLYP2P (DONE 2006-02-06 angie)
     ssh hgwdev
     mkdir /cluster/data/dm2/bed/p2p
     cd /cluster/data/dm2/p2p
     cp /cluster/data/dm1/p2p/flyP2P.tab .
     hgNetDist flyP2P.tab dm2 flyP2P \
       -sqlRemap='select fbgn,name from flyBase2004Xref'
 
 
 
 ## (galt 2006-07-31)
 # Human p2p Protein-to-protein network - P2P column and sort order
 # I used the hgNetDist program to calculate network-distances for all gene pairs from gene-to-gene edges in input data,
 # I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/{vidal,wanker}
 # added entries to hgNearData/Human/{orderDb,columbDb}.ra
 # added hgNearData/Human/{vidal,wanker}P2p.html
 #  
 #vidal 
  
 cat nature04209-s17.xls | gawk '{print $1 "\t" $3 "\t" "1.0"}' > humanVidal.p2p
 
-hgNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+hgLoadNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
 
 #Added to hgNearData/Human/hg18/columnDb.ra
 #-------------
 name vidalP2p
 type distance humanVidalP2P query target distance
 visibility off
 shortLabel Vidal P2P
 longLabel Human Protein-Protein Interaction Network from Marc Vidal
 priority 12
 
 
 #Added to hgNearData/Human/hg18/orderDb.ra
 #-----------
 name vidalP2p
 shortLabel Vidal Protein-to-Protein
 longLabel P2P Network Distance to Selected Gene from Marc Vidal data
 type pair humanVidalP2P query target distance 1
 priority 9
 
 
 #wanker
 cat table_S3.txt | gawk '{print $4 "\t" $7 "\t" "1.0"}' > humanWanker.p2p
 
 hgNetDist wanker/humanWanker.p2p hg18 humanWankerP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
 
 #Did exactly the same for hg17 also.
 
 #Did the same thing to the .ra files all over again for name wankerP2p.
 
 ---
 
 # show amount of overlap between Vidal and Wanker data sets
 #These following queries joining both P2P data were 
 #glacial until I added these better indexes:
 
 create index ninny on humanVidalP2P(query(9),target(9));
 create index ninny on humanWankerP2P(query(9),target(9));
 
 select count(*) from humanVidalP2P v, humanWankerP2P w where v.query=w.query and v.target=w.target; 
 #+----------+
 #| count(*) |
 #+----------+
 #|     1661 |
 #+----------+
 
 
 #----------------------------------------------------------
 
 ## (galt 2006-09-15 thru 2006-11-06) 
 # HPRD p2p
 # Human p2p Protein-to-protein network - P2P column and sort order
 # I used the hgNetDist program to calculate network-distances for all gene pairs from 
 # gene-to-gene edges in input data.
 # I extended hgNetDist to handle two situations better:
 #  One is duplicate pairs.  I have now made it sort the input by distance
 #  so that priority is given to the record for the pair with the shortest distance.
 #  When further records for the same pair occur, they are ignored.
 #  Two is supporting explicit values for self.  Until now, these did
 #  not occur in the data, and an automatic self distance of 0 was put out
 #  simply to make GS slightly easier to use.  Now I can preserve the distance
 #  if any given in the input for self-self interactions.
 # Fan Hsu first downloaded the "single" xml to /cluster/store12/hprd/060906/
 !! TODO !!
 # I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/hprd/working,
 # and I also put the source code for hprdRun there.
 # added entries to hgNearData/Human/{orderDb,columbDb}.ra
 # added hgNearData/Human/hprdP2p.html
 #  
 #hprd
 
 
 #Manually repaired the single.xml file as approved by HPRD
 #to remove one unneeded tag and one invalid interaction.
 #Also had to remove the colon in the names like xmlns:xsi 
 #in the second line
 mv HPRD_SINGLE_PSIMI_060106.xml single-fixed.xml
 
 #autoDtd had a bug in that it thinks "02995" is an integer
 # when it is clearly better handled as an id.  
 # this causes it to truncate leading zeros (atoi)
 # which are lost.  But since knownToHprd has leading zeros,
 # this causes failure of some records. 
 # To compensate, pad leading zeros in p2p ids.
 # Jim said I could fix it so I did and checked it in.
 
 autoDtd single-fixed.xml out.dtd out.stats -tree=out.tree -atree=out.atree
 
 #Counts from stats: interaction 34109
 
 # create parser with autoXml code generator
 autoXml out.dtd hprd
 
 #I wrote hprdRun.c using hprd.c,h
 #and I ran it to create p2p file (distance 1.5 for complex) and a separate complex table
 #as jim requested for later use.
 
 hprdRun single-fixed.xml hprd.p2p hprdComplex.tab
 # participant count=1 which is < 2 for participant id = 66942
 # this is a known problem with their .xml confirmed by HPRD, ignore it.
 
 hgNetDist hprd.p2p hg18 humanHprdP2P -weighted -threshold=2 -sqlRemap="select distinct value, name from knownToHprd"
 
 #inTab=hprd.p2p db=hg18 table=humanHprdP2P
 #reading edges hprd.p2p
 #slCount(edges)=43567 for hprd.p2p
 #beginning processing sqlRemap query [select distinct value, name from knownToHprd]
 #beginning processing data hprd.p2p ...
 #number of nodes=8898
 
 #412 sqlRemap misses!  see missing.tab
 
 #e.g. id 6257 not found in aliasHash!
 # other testing shows that 8486 out of 8898 hprid's match hit in knownToHprd
 #  so that is sufficient coverage. 
 
 #hgsql hg18 -e 'drop table if exists humanHprdP2P; create table humanHprdP2P (query varchar(255), target varchar(255), distance float);'
 #hgsql hg18 -e 'load data local infile "hgNetDist.tmp.tab" into table humanHprdP2P ignore 1 lines;'
 #hgsql hg18 -e 'create index query on humanHprdP2P (query(8));'
 
 
 
 
 #Added to hgNearData/Human/hg18/columnDb.ra
 #-------------
 name hprdP2p
 type distance humanHprdP2P query target distance
 visibility off
 shortLabel HPRD P2P
 longLabel Human Protein-Protein Interaction Network from the Human Reference Protein Database
 priority 12.2
 itemUrl http://www.hprd.org/protein/%s
 itemUrlQuery select value from knownToHprd where name='%s'
 
 #Added to hgNearData/Human/hg18/orderDb.ra
 #-----------
 name hprdP2p
 shortLabel HRPD Protein-to-Protein
 longLabel P2P Network Distance to Selected Gene from the Human Reference Protein Database
 type pair humanHprdP2P query target distance 1
 priority 9.2
 
 #Did exactly the same for hg17 also.
 
 # made hgNear/hgNearData/Human/hprdP2p.html description page
 # added to cvs
 
 #cvs committed hgNear changes requested by HPRD
 # to support linking from the Interaction pair to protein
 # page on their site.  see itemUrlQuery in GS.
 
 # HPRD requested an illustration describing the network distances.
 #  Mike Long helped me do the artwork with illustrator and other
 #  software.  The final .png is checked into browser/images/,
 #  and copied to hgwdev:/usr/local/apache/images/
 #  The image is referred to by hprdP2p.html mentioned above.
 
 # checked in changes to .ra config files
 
 # committed changes
 # added pushQ entry
 
 ---
 #----------------------------------------------------------
 
-## (kg3 hg18 upgrade done galt 2007-03-29) 
-# kg3 hg18 upgrade of Human p2p Protein-to-protein network - P2P columns
-# I used the hgNetDist program to calculate network-distances for all gene pairs from
-# the interaction.p2p files and created *.pathLengths files as output.
-# These were then read by hgLoadNetDist to remap the ids and create the
-# actual mysql tables used.  Note that hgLoadNetDist was split off as a
-# separate program from hgNetDist to make the planned incremental kg3 upgrades
-# much easier and faster, since only the hgLoadNetDist has to be run
-# once the pathLengths files have been created.
+## (kent 2009-10-16) 
+# HPRD p2p update used in hg19
+
+# First go to http://www.hprd.org, follow the download link, fill in the information they
+# request for academic users, and download HPRD_SINGLE_PSIMI_070609.xml.tar.gz into
+# /hive/data/outside/hprd/070609, and then unpack it with
+    cd /hive/data/outside/hprd/070609
+    tar -zxvf HPRD_SINGLE_PSIMI_070609.xml.tar.gz
+
+# Now run the hprdXmlToTab program, which was largely generated by autoDtd/autoXml.
+    hprdXmlToTab HPRD_SINGLE_PSIMI_070609.xml p2p.tab complex.tab
+# interaction count = 40075
+
+# Now use hgNetDist to generate pathLengthrs file.  This takes an hour or two.
+    hgNetDist -verbose=2 -weighted -threshold=2 p2p.tab hprd.pathLengths
+
+#----------------------------------------------------------
+
+## (kg3 hg19 upgrade done kent 2009-10-13) 
+# kg3 hg19 creation of Human p2p Protein-to-protein network - P2P columns
+# Note could just reuse the pathLengths files calculated in the hg18 build, since
+# these don't depend on an assembly.
+
+#Copy in from hg18 database
+	cp /hive/data/genomes/hg18/p2p /hive/data/genomes/hg19
 
 #hprd
-        hgNetDist -verbose=2 -weighted -threshold=2 \
-                /cluster/data/hg18/p2p/hprd/hprd.p2p \
-                /cluster/data/hg18/p2p/hprd/hprd.pathLengths
-        hgLoadNetDist /cluster/data/hg18/p2p/hprd/hprd.pathLengths hg18 humanHprdP2P \
+        hgLoadNetDist /hive/data/outside/hprd/070609/hprd.pathLengths hg19 humanHprdP2P \
                 -sqlRemap="select distinct value, name from knownToHprd"
+	# hgLoadNetDist 86 id-remapping misses, see missing.tab
+
 
 #vidal
-        hgNetDist -verbose=2 -skipFirst -threshold=2 \
-                /cluster/data/hg18/p2p/vidal/humanVidal.p2p \
-                /cluster/data/hg18/p2p/vidal/humanVidal.pathLengths
-        hgLoadNetDist /cluster/data/hg18/p2p/vidal/humanVidal.pathLengths hg18 humanVidalP2P \
+        hgLoadNetDist /hive/data/genomes/hg19/p2p/vidal/humanVidal.pathLengths hg19 humanVidalP2P \
         -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+	# hgLoadNetDist 22 id-remapping misses, see missing.tab
+
 
 #wanker
-        hgNetDist -verbose=2 -skipFirst -threshold=2 \
-                /cluster/data/hg18/p2p/wanker/humanWanker.p2p \
-                /cluster/data/hg18/p2p/wanker/humanWanker.pathLengths
-        hgLoadNetDist /cluster/data/hg18/p2p/wanker/humanWanker.pathLengths hg18 humanWankerP2P \
+        hgLoadNetDist /hive/data/genomes/hg19/p2p/wanker/humanWanker.pathLengths hg19 humanWankerP2P \
         -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+	# hgLoadNetDist 54 id-remapping misses, see missing.tab
+
+
 
 ###############################################################
 # Affy All Exon GeneSorter column. (DONE Andy, 2008-03-17)
 # NOTE - in future doing this in genome database rather than 
 # hgFixed, since it needs to change with each gene build....
 # Doing so as part of UCSC Genes update, Jim Kent, 2008-07-14)
 
 ssh hgwdev
 cd /cluster/data/hg18/bed
 mkdir affyAllExonGsColumn
 cd affyAllExonGsColumn/
 echo "select * from knownGene" | \
    hgsql hg18 | tail -n+2 > knownGene.gp
 overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
    ../affyHumanExon/affyHuEx1.bed knownGene.gp ids.txt
 echo "select * from affyHumanExon" | \
    hgsql hgFixed | tail +2 > expData.txt
 affyAllExonGSColumn expData.txt ids.txt column.txt
 hgLoadSqlTab hgFixed affyHumanExonGs expData.sql column.txt
 hgRatioMicroarray -database=hgFixed -clump=affyHumanExon.ra -minAbsVal=0.01 \
    affyHumanExonGs affyHumanExonGsRatio
 hgMedianMicroarray hgFixed affyHumanExonGs affyHumanExonExps \
    affyHumanExon.ra affyHumanExonGsMedian affyHumanExonMedianExps
 hgMedianMicroarray hgFixed affyHumanExonGsRatio \
    affyHumanExonExps affyHumanExon.ra affyHumanExonGsRatioMedian \
    affyHumanExonMedianExps
 hgExpDistance hgFixed affyHumanExonGsRatioMedian \
    affyHumanExonGsMedianExps affyHumanExonGsRatioMedianDist
 
 ###############################################################
 ## Affy All Exon GeneSorter column human redo, mouse and rat initial
 ## DONE (2009-01-29, Andy)
 ## human first, although mm8 and rn4 have the most concise instructions
 ssh hgwdev
 cd /hive/data/genomes/hg18/bed/affyAllExonGsColumn
 cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
 echo "select * from knownGene" | \
    hgsql hg18 | tail -n+2 > knownGene.gp
 zcat ../affyAllExonProbes/hg18.bed.gz \
    | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
 overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
    probes.bed knownGene.gp ids.txt
 echo "select name,expCount,expScores from affyExonTissues" | \
    hgsql hg18 | tail -n+2 > expData.txt
 affyAllExonGSColumn expData.txt ids.txt column.txt
 hgLoadSqlTab hg18 affyExonTissuesGs expData.sql column.txt
 hgMedianMicroarray hg18 affyExonTissuesGs affyExonTissuesExps \
    affyHumanExon.ra affyHumanExonGsMedian affyHumanExonMedianExps
 grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Human/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
 hgLoadSqlTab hg18 affyExonTissuesGsExps expRecord.sql all.expRecords
 cat << "EOF" > affyExonTissuesGs.ra
 breast breast 0 1 2
 cerebellum cerebellum 3 4 5
 heart heart 6 7 8
 kidney kidney 9 10 11
 liver liver 12 13 14
 muscle muscle 15 16 17
 pancreas pancreas 18 19 20
 prostate prostate 21 22 23
 spleen spleen 24 25 26
 testes testes 27 28 29
 thyroid thyroid 30 31 32
 EOF
 # << emacs;
 hgExpDistance hg18 affyExonTissuesGsMedian \
    affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist
 
 ## mouse
 mkdir /hive/data/genomes/mm9/bed/affyAllExonGsColumn
 cd /hive/data/genomes/mm9/bed/affyAllExonGsColumn
 cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
 echo "select * from knownGene" | \
    hgsql mm9 | tail -n+2 > knownGene.gp
 echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql mm9
 sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt
 echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql mm9
 zcat ../affyAllExonProbes/mm9.bed.gz \
    | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
 overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
    probes.bed knownGene.gp ids.txt
 echo "select name,expCount,expScores from affyExonTissues" | \
    hgsql mm9 | tail -n+2 > expData.txt
 affyAllExonGSColumn expData.txt ids.txt column.txt
 hgLoadSqlTab mm9 affyExonTissuesGs expData.sql column.txt
 grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
   | grep names | sed 's/names //; s/,$//' | tr ',' '\n' \
   | awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \
   > affyExonTissuesGs.ra
 grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
 hgLoadSqlTab mm9 affyExonTissuesGsExps expRecord.sql all.expRecords
 hgMedianMicroarray mm9 affyExonTissuesGs mm9.affyExonTissuesGsExps \
    affyExonTissuesGs.ra affyExonTissuesGsMedian mm9.affyExonTissuesGsMedianExps
 hgExpDistance mm9 affyExonTissuesGsMedian \
    affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist
 
 ## rat
 mkdir /hive/data/genomes/rn4/bed/affyAllExonGsColumn
 cd /hive/data/genomes/rn4/bed/affyAllExonGsColumn
 cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
 echo "select * from knownGene" | \
    hgsql rn4 | tail -n+2 > knownGene.gp
 echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql rn4
 sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt
 echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql rn4
 zcat ../affyAllExonProbes/rn4.bed.gz \
    | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
 overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
    probes.bed knownGene.gp ids.txt
 echo "select name,expCount,expScores from affyExonTissues" | \
    hgsql rn4 | tail -n+2 > expData.txt
 affyAllExonGSColumn expData.txt ids.txt column.txt
 hgLoadSqlTab rn4 affyExonTissuesGs expData.sql column.txt
 grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
   | grep names | sed 's/names //; s/,$//' | tr ',' '\n' \
   | awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \
   > affyExonTissuesGs.ra
 grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
 hgLoadSqlTab rn4 affyExonTissuesGsExps expRecord.sql all.expRecords
 hgMedianMicroarray rn4 affyExonTissuesGs rn4.affyExonTissuesGsExps \
    affyExonTissuesGs.ra affyExonTissuesGsMedian rn4.affyExonTissuesGsMedianExps
 hgExpDistance rn4 affyExonTissuesGsMedian \
    affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist
 
 ## mm8
 
 mkdir /hive/data/genomes/mm8/bed/affyAllExonGsColumn
 cd /hive/data/genomes/mm8/bed/affyAllExonGsColumn
 cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
 echo "select * from knownGene" | \
    hgsql mm8 | tail -n+2 > knownGene.gp
 echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql mm8
 sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt
 echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql mm8
 zcat ../affyAllExonProbes/mm8.bed.gz \
    | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
 overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
    probes.bed knownGene.gp ids.txt
 echo "select name,expCount,expScores from affyExonTissues" | \
    hgsql mm8 | tail -n+2 > expData.txt
 affyAllExonGSColumn expData.txt ids.txt column.txt
 hgLoadSqlTab mm8 affyExonTissuesGs expData.sql column.txt
 grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
   | grep names | sed 's/names //; s/,$//' | tr ',' '\n' \
   | awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \
   > affyExonTissuesGs.ra
 grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
   | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' \
   | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
 hgLoadSqlTab mm8 affyExonTissuesGsExps expRecord.sql all.expRecords
 hgMedianMicroarray mm8 affyExonTissuesGs mm8.affyExonTissuesGsExps \
    affyExonTissuesGs.ra affyExonTissuesGsMedian mm8.affyExonTissuesGsMedianExps
 hgExpDistance mm8 affyExonTissuesGsMedian \
    affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist