src/hg/near/makeNear.doc 1.42
1.42 2009/10/16 17:35:33 kent
Adding update procedure for hg19 P2P tracks.
Index: src/hg/near/makeNear.doc
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/near/makeNear.doc,v
retrieving revision 1.41
retrieving revision 1.42
diff -b -B -U 1000000 -r1.41 -r1.42
--- src/hg/near/makeNear.doc 14 Oct 2009 23:37:40 -0000 1.41
+++ src/hg/near/makeNear.doc 16 Oct 2009 17:35:33 -0000 1.42
@@ -1,618 +1,632 @@
# These are instructions for building the tables used by the Gene Sorter
# (hgNear) -- these are often also used by hgGene.
# Don't start these until there is a knownGene track, or a track with the
# usual genePred fields followed by a proteinID field with SwissProt IDs.
# for emacs: -*- mode: sh; -*-
# Set up db variable (you'll actually need to redo this
# after each ssh until we work out a better system.)
ssh hgwdev
set db = hg19
########################################################################
# Cluster together various alt-splicing isoforms.
hgClusterGenes $db knownGene knownIsoforms knownCanonical
########################################################################
# Use doHgNearBlastp.pl to run blastp on knownGene proteins vs. self and
# vs. other Gene Sorter organisms (most recent db for each organism).
mkdir -p /cluster/data/$db/bed/hgNearBlastp
cd /cluster/data/$db/bed/hgNearBlastp
# Get peptide sequences for each db's hgNear geneset:
pepPredToFa $db knownGenePep $db.known.faa
pepPredToFa mm8 knownGenePep mm8.known.faa
pepPredToFa rn4 knownGenePep rn4.known.faa
pepPredToFa danRer3 ensPep danRer3.ensPep.faa
pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
pepPredToFa ce2 sangerPep ce2.sangerPep.faa
pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
# Configure doHgNearBlastp.pl. Use recipBest for all organisms more distant
# than mammal-mammal. For mammal-mammal, synBlastp.csh is used instead
# after loading the table with doHgNearBlastp.pl results -- see below.
# NOTE: If $db is not a mammal, then recipBest is probably appropriate
# for all queryDbs.
# NOTE: if $db does not have knownGene, then change targetGenesetPrefix
# to the root of whatever geneset hgNear uses for $db (e.g. sanger for
# sangerGene, flyBase for flyBaseGene etc.).
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, fly, worm, yeast
targetGenesetPrefix known
targetDb hg19
queryDbs mm8 rn4 danRer3 dm3 ce2 sacCer1
recipBest danRer3 dm3 ce2 sacCer1
hg19Fa /cluster/data/hg19/bed/hgNearBlastp/hg19.known.faa
mm8Fa /cluster/data/hg19/bed/hgNearBlastp/mm8.known.faa
rn4Fa /cluster/data/hg19/bed/hgNearBlastp/rn4.known.faa
danRer3Fa /cluster/data/hg19/bed/hgNearBlastp/danRer3.ensPep.faa
dm3Fa /cluster/data/hg19/bed/hgNearBlastp/dm3.flyBasePep.faa
ce2Fa /cluster/data/hg19/bed/hgNearBlastp/ce2.sangerPep.faa
sacCer1Fa /cluster/data/hg19/bed/hgNearBlastp/sacCer1.sgdPep.faa
buildDir /cluster/data/hg19/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/hg19HgNearBlastp
_EOF_
# Run with -noLoad so we can eyeball files, manually load $db tables now,
# and later overload other databases' hgBlastTab tables.
doHgNearBlastp.pl -noLoad config.ra >& do.log &
tail -f do.log
# Run the load scripts for $db tables manually as suggested by the
# output of doHgNearBlastp.pl:
# Load self-blastp (knownBlastp) and $db.??BlastTab immediately:
# *** -noLoad was specified -- you can run this script manually to load hg19 tables:
run.hg19.hg19/loadPairwise.csh
# *** -noLoad was specified -- you can run these scripts manually to load hg19 tables:
run.hg19.mm8/loadPairwise.csh
run.hg19.rn4/loadPairwise.csh
run.hg19.danRer3/loadPairwise.csh
run.hg19.dm3/loadPairwise.csh
run.hg19.ce2/loadPairwise.csh
run.hg19.sacCer1/loadPairwise.csh
# For mammal-mammal pairs, run synBlastp.csh:
synBlastp.csh $db mm8
synBlastp.csh $db rn4
# **When hgNearOk is set for $db on the RR**, load *.hgBlastTab and
# make a separate push request for them.
# *** -noLoad was specified -- you can run these scripts manually to load hgBlastTab in query databases:
run.mm8.hg19/loadPairwise.csh
run.rn4.hg19/loadPairwise.csh
run.danRer3.hg19/loadPairwise.csh
run.dm3.hg19/loadPairwise.csh
run.ce2.hg19/loadPairwise.csh
run.sacCer1.hg19/loadPairwise.csh
# For mammal-mammal pairs, run synBlastp.csh:
synBlastp.csh mm8 $db
synBlastp.csh rn4 $db
########################################################################
# MAPPINGS TO OTHER SETS OF IDS
# Create table that maps between known genes and RefSeq
hgMapToGene $db refGene knownGene knownToRefSeq
# Create table that maps between known genes and LocusLink
echo "select mrnaAcc,locusLinkId from refLink" | hgsql -N $db > refToLl.txt
hgMapToGene $db refGene knownGene knownToLocusLink -lookup=refToLl.txt
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt $db knownGene name proteinID Pfam knownToPfam
# Create table that maps between known genes and other datasets when
# they exist, e.g.:
hgMapToGene $db HInvGeneMrna knownGene knownToHInv
hgMapToGene $db allenBrainAli -type=psl knownGene knownToAllenBrain
hgMapToGene $db ensGene knownGene knownToEnsembl
hgMapToGene $db snp129 knownGene knownToCdsSnp -all -cds
########################################################################
# EXPRESSION: MAPPING + DISTANCE
# Create a table that maps between known genes and
# the nice affyUcla expression data.
hgMapToGene "-type=bed 12" $db affyUclaNorm knownGene knownToU133
# Create expression distance table. This will take about an hour.
cd ~/src/hg/near/hgExpDistance
hgExpDistance $db affyUcla affyUclaExp knownExpDistance -weights=affyUcla.weight -lookup=knownToU133
# Format and load the GNF data
cd /cluster/data/$db/bed
mkdir affyGnf95
cd affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 affyGnfU95.tab affyGnfU95Exps.tab -shortOut
hgsql $db < ~/src/hg/affyGnf/affyGnfU95.sql
# Create table that maps between known genes and
# the GNF data.
hgMapToGene $db affyU95 knownGene knownToU95
cd ~/src/hg/near/hgExpDistance
#hgExpDistance $db affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95
hgExpDistance $db hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95MedianExps gnfU95Distance -lookup=knownToU95
#For worm did instead
#hgExpDistance ce1 kimLifeCycleMedian kimWormLifeCycleMedian kimExpDistance
hgExpDistance ce1 hgFixed.kimWormLifeMedianRatio hgFixed.kimWormLifeMedianExps kimExpDistance
#For mouse did instead
hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
########################################################################
# Make sure that GO database is up to date.
See README in /cluster/store1/geneOntology.
########################################################################
## (markd did this when??)
# create Rankprop homology score and PSI-BLAST tables.
# requires rankprop code from Bill Noble <noble@gs.washington.edu>
# which includes scripts to build these tables on the cluster.
# run psiblast
ssh hgwdev
mkdir /cluster/bluearc/markd/rankprop
cd /cluster/bluearc/markd/rankprop
# link rankprop to work directory:
ln -s /cluster/store7/markd/rankprop/rankprop/bin .
mkdir -p blastRun
cd blastRun
# hs.sw+tr is human swissprot+trembl (on hgwdev)
../bin/cluster-blast-setup hs.sw+tr
ssh kk
cd /cluster/bluearc/markd/rankprop/blastRun/hs.sw+tr
para create jobs.para
para try ,...
# one job crashed on Query=P01722, no idea why; remove it from database
# and rerun
# finish up run
cd ..
../bin/cluster-blast-finishup hs.sw+tr
# run rankprop
# rankprop run, on rack9 due to memory size of kk9
ssh kk9
cd /cluster/bluearc/markd/rankprop/rankpRun/
../bin/cluster-rankprop-setup -maxHits 1000 hs.sw+tr/max1k 10
cd hs.sw+tr
para create jobs.para
para try, push, blah
cd ..
../bin/cluster-rankprop-finishup hs.sw+tr/max1k
# load database
cd /cluster/bluearc/markd/rankprop/results
spLoadRankProp -noKgIdFile=hs.sw+tr/max1k.hg17.nokg.spids hg17 rankProp hs.sw+tr/max1k.rankp.gz >&hs.sw+tr/max1k.hg17.log
spLoadPsiBlast hg17 spPsiBlast hs.sw+tr.eval.gz
#---------
## (aamp andy pohl did this when??)
# Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22
hgExpDistance hg17 hgFixed.humanNormalRatio hgFixed.humanNormalExps humanNormalDistance -lookup=knownToLocusLink
# Mouse landscape data.
hgExpDistance mm6 hgFixed.mouseLandscape hgFixed.mouseLandscapeExps mouseLandscapeDistance -lookup=knownToXM
# Remaking some distance tables after the knownGene updates (DONE 01/24/2006 Andy)
# (current working directory irrelevant... it's all database)
hgExpDistance mm6 hgFixed.mouseLandscape hgFixed.mouseLandscapeExps mouseLandscapeDistance -lookup=knownToXM
hgExpDistance hg17 hgFixed.gladHumES hgFixed.gladHumESExps gladHumESDistance -lookup=knownToGnfAtlas2
#----------------------------------------------------------
## (galt 2005-06-03)
# p2p Protein-to-protein network - P2P column and sort order
# I wrote the hgNetDist program to calculate network-distances for all gene pairs from gene-to-gene edges in input data .tab,
# using the Floyd-Warshall dynamic programming algorithm.
# These .tab files are from Josh Stuart /cluster/home/jstuart/Data/Interaction/P2P/{Worm,Fly,Yeast}/Compendium/data.tab
# I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/
# added entries to hgNearData/$species/{orderDb,columnDb}.ra
# added hgNearData/$species/p2p.html
# added hgNearData/p2p.html
#
#yeast: (1.5 hours for about 5000 genes, 24452 edges)
hgNetDist yeastP2P.tab sacCer1 yeastP2P -threshold=3
#fly: (3 hours for about 6500 genes, 19993 edges)
# The temporary table bdgpGeneFb2Bdgp was constructed just for fly from this sql command:
# create table bdgpGeneFb2Bdgp as
# select flyBaseId, name bdgpName from bdgpGene a, bdgpGeneInfo b
# where SUBSTRING_INDEX(name, "-R", 1) = bdgpName order by flyBaseId, bdgpName;
#
hgNetDist flytest.tab dm1 flyP2P -threshold=2 -sqlRemap="select flyBaseId, bdgpName from bdgpGeneFb2Bdgp"
#worm: (15 minutes for about 2514 genes, 3871 edges - interaction data available is small)
hgNetDist wormP2P.tab ce2 wormP2P -threshold=2
#----------------------------------------------------------
# FLYP2P (DONE 2006-02-06 angie)
ssh hgwdev
mkdir /cluster/data/dm2/bed/p2p
cd /cluster/data/dm2/p2p
cp /cluster/data/dm1/p2p/flyP2P.tab .
hgNetDist flyP2P.tab dm2 flyP2P \
-sqlRemap='select fbgn,name from flyBase2004Xref'
## (galt 2006-07-31)
# Human p2p Protein-to-protein network - P2P column and sort order
# I used the hgNetDist program to calculate network-distances for all gene pairs from gene-to-gene edges in input data,
# I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/{vidal,wanker}
# added entries to hgNearData/Human/{orderDb,columnDb}.ra
# added hgNearData/Human/{vidal,wanker}P2p.html
#
#vidal
cat nature04209-s17.xls | gawk '{print $1 "\t" $3 "\t" "1.0"}' > humanVidal.p2p
-hgNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+hgLoadNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
#Added to hgNearData/Human/hg18/columnDb.ra
#-------------
name vidalP2p
type distance humanVidalP2P query target distance
visibility off
shortLabel Vidal P2P
longLabel Human Protein-Protein Interaction Network from Marc Vidal
priority 12
#Added to hgNearData/Human/hg18/orderDb.ra
#-----------
name vidalP2p
shortLabel Vidal Protein-to-Protein
longLabel P2P Network Distance to Selected Gene from Marc Vidal data
type pair humanVidalP2P query target distance 1
priority 9
#wanker
cat table_S3.txt | gawk '{print $4 "\t" $7 "\t" "1.0"}' > humanWanker.p2p
hgNetDist wanker/humanWanker.p2p hg18 humanWankerP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
#Did exactly the same for hg17 also.
#Did the same thing to the .ra files all over again for name wankerP2p.
---
# show amount of overlap between Vidal and Wanker data sets
#These following queries joining both P2P data were
#glacial until I added these better indexes:
create index ninny on humanVidalP2P(query(9),target(9));
create index ninny on humanWankerP2P(query(9),target(9));
select count(*) from humanVidalP2P v, humanWankerP2P w where v.query=w.query and v.target=w.target;
#+----------+
#| count(*) |
#+----------+
#| 1661 |
#+----------+
#----------------------------------------------------------
## (galt 2006-09-15 thru 2006-11-06)
# HPRD p2p
# Human p2p Protein-to-protein network - P2P column and sort order
# I used the hgNetDist program to calculate network-distances for all gene pairs from
# gene-to-gene edges in input data.
# I extended hgNetDist to handle two situations better:
# One is duplicate pairs. I have now made it sort the input by distance
# so that priority is given to the record for the pair with the shortest distance.
# When further records for the same pair occur, they are ignored.
# Two is supporting explicit values for self. Until now, these did
# not occur in the data, and an automatic self distance of 0 was put out
# simply to make GS slightly easier to use. Now I can preserve the distance
# if any given in the input for self-self interactions.
# Fan Hsu first downloaded the "single" xml to /cluster/store12/hprd/060906/
!! TODO !!
# I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/hprd/working,
# and I also put the source code for hprdRun there.
# added entries to hgNearData/Human/{orderDb,columnDb}.ra
# added hgNearData/Human/hprdP2p.html
#
#hprd
#Manually repaired the single.xml file as approved by HPRD
#to remove one unneeded tag and one invalid interaction.
#Also had to remove the colon in the names like xmlns:xsi
#in the second line
mv HPRD_SINGLE_PSIMI_060106.xml single-fixed.xml
#autoDtd had a bug in that it thinks "02995" is an integer
# when it is clearly better handled as an id.
# this causes it to truncate leading zeros (atoi)
# which are lost. But since knownToHprd has leading zeros,
# this causes failure of some records.
# To compensate, pad leading zeros in p2p ids.
# Jim said I could fix it so I did and checked it in.
autoDtd single-fixed.xml out.dtd out.stats -tree=out.tree -atree=out.atree
#Counts from stats: interaction 34109
# create parser with autoXml code generator
autoXml out.dtd hprd
#I wrote hprdRun.c using hprd.c,h
#and I ran it to create p2p file (distance 1.5 for complex) and a separate complex table
#as jim requested for later use.
hprdRun single-fixed.xml hprd.p2p hprdComplex.tab
# participant count=1 which is < 2 for participant id = 66942
# this is a known problem with their .xml confirmed by HPRD, ignore it.
hgNetDist hprd.p2p hg18 humanHprdP2P -weighted -threshold=2 -sqlRemap="select distinct value, name from knownToHprd"
#inTab=hprd.p2p db=hg18 table=humanHprdP2P
#reading edges hprd.p2p
#slCount(edges)=43567 for hprd.p2p
#beginning processing sqlRemap query [select distinct value, name from knownToHprd]
#beginning processing data hprd.p2p ...
#number of nodes=8898
#412 sqlRemap misses! see missing.tab
#e.g. id 6257 not found in aliasHash!
# other testing shows that 8486 out of 8898 HPRD ids have a match in knownToHprd
# so that is sufficient coverage.
#hgsql hg18 -e 'drop table if exists humanHprdP2P; create table humanHprdP2P (query varchar(255), target varchar(255), distance float);'
#hgsql hg18 -e 'load data local infile "hgNetDist.tmp.tab" into table humanHprdP2P ignore 1 lines;'
#hgsql hg18 -e 'create index query on humanHprdP2P (query(8));'
#Added to hgNearData/Human/hg18/columnDb.ra
#-------------
name hprdP2p
type distance humanHprdP2P query target distance
visibility off
shortLabel HPRD P2P
longLabel Human Protein-Protein Interaction Network from the Human Reference Protein Database
priority 12.2
itemUrl http://www.hprd.org/protein/%s
itemUrlQuery select value from knownToHprd where name='%s'
#Added to hgNearData/Human/hg18/orderDb.ra
#-----------
name hprdP2p
shortLabel HRPD Protein-to-Protein
longLabel P2P Network Distance to Selected Gene from the Human Reference Protein Database
type pair humanHprdP2P query target distance 1
priority 9.2
#Did exactly the same for hg17 also.
# made hgNear/hgNearData/Human/hprdP2p.html description page
# added to cvs
#cvs committed hgNear changes requested by HPRD
# to support linking from the Interaction pair to protein
# page on their site. see itemUrlQuery in GS.
# HPRD requested an illustration describing the network distances.
# Mike Long helped me do the artwork with illustrator and other
# software. The final .png is checked into browser/images/,
# and copied to hgwdev:/usr/local/apache/images/
# The image is referred to by hprdP2p.html mentioned above.
# checked in changes to .ra config files
# committed changes
# added pushQ entry
---
#----------------------------------------------------------
-## (kg3 hg18 upgrade done galt 2007-03-29)
-# kg3 hg18 upgrade of Human p2p Protein-to-protein network - P2P columns
-# I used the hgNetDist program to calculate network-distances for all gene pairs from
-# the interaction.p2p files and created *.pathLengths files as output.
-# These were then read by hgLoadNetDist to remap the ids and create the
-# actual mysql tables used. Note that hgLoadNetDist was split off as a
-# separate program from hgNetDist to make the planned incremental kg3 upgrades
-# much easier and faster, since only the hgLoadNetDist has to be run
-# once the pathLengths files have been created.
+## (kent 2009-10-16)
+# HPRD p2p update used in hg19
+
+# First go to http://www.hprd.org, follow the download link, fill in the information they
+# request for academic users, and download HPRD_SINGLE_PSIMI_070609.xml.tar.gz into
+# /hive/data/outside/hprd/070609, and then unpack it with
+ cd /hive/data/outside/hprd/070609
+ tar -zxvf HPRD_SINGLE_PSIMI_070609.xml.tar.gz
+
+# Now run the hprdXmlToTab program, which was largely generated by autoDtd/autoXml.
+ hprdXmlToTab HPRD_SINGLE_PSIMI_070609.xml p2p.tab complex.tab
+# interaction count = 40075
+
+# Now use hgNetDist to generate the pathLengths file. This takes an hour or two.
+ hgNetDist -verbose=2 -weighted -threshold=2 p2p.tab hprd.pathLengths
+
+#----------------------------------------------------------
+
+## (kg3 hg19 upgrade done kent 2009-10-13)
+# kg3 hg19 creation of Human p2p Protein-to-protein network - P2P columns
+# Note could just reuse the pathLengths files calculated in the hg18 build, since
+# these don't depend on an assembly.
+
+#Copy in the p2p directory from the hg18 build (NOTE: p2p is a directory, so cp needs -r)
+ cp /hive/data/genomes/hg18/p2p /hive/data/genomes/hg19
#hprd
- hgNetDist -verbose=2 -weighted -threshold=2 \
- /cluster/data/hg18/p2p/hprd/hprd.p2p \
- /cluster/data/hg18/p2p/hprd/hprd.pathLengths
- hgLoadNetDist /cluster/data/hg18/p2p/hprd/hprd.pathLengths hg18 humanHprdP2P \
+ hgLoadNetDist /hive/data/outside/hprd/070609/hprd.pathLengths hg19 humanHprdP2P \
-sqlRemap="select distinct value, name from knownToHprd"
+ # hgLoadNetDist 86 id-remapping misses, see missing.tab
+
#vidal
- hgNetDist -verbose=2 -skipFirst -threshold=2 \
- /cluster/data/hg18/p2p/vidal/humanVidal.p2p \
- /cluster/data/hg18/p2p/vidal/humanVidal.pathLengths
- hgLoadNetDist /cluster/data/hg18/p2p/vidal/humanVidal.pathLengths hg18 humanVidalP2P \
+ hgLoadNetDist /hive/data/genomes/hg19/p2p/vidal/humanVidal.pathLengths hg19 humanVidalP2P \
-sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+ # hgLoadNetDist 22 id-remapping misses, see missing.tab
+
#wanker
- hgNetDist -verbose=2 -skipFirst -threshold=2 \
- /cluster/data/hg18/p2p/wanker/humanWanker.p2p \
- /cluster/data/hg18/p2p/wanker/humanWanker.pathLengths
- hgLoadNetDist /cluster/data/hg18/p2p/wanker/humanWanker.pathLengths hg18 humanWankerP2P \
+ hgLoadNetDist /hive/data/genomes/hg19/p2p/wanker/humanWanker.pathLengths hg19 humanWankerP2P \
-sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
+ # hgLoadNetDist 54 id-remapping misses, see missing.tab
+
+
###############################################################
# Affy All Exon GeneSorter column. (DONE Andy, 2008-03-17)
# NOTE - in future doing this in genome database rather than
# hgFixed, since it needs to change with each gene build....
# (Doing so as part of UCSC Genes update, Jim Kent, 2008-07-14)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir affyAllExonGsColumn
cd affyAllExonGsColumn/
echo "select * from knownGene" | \
hgsql hg18 | tail -n+2 > knownGene.gp
overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
../affyHumanExon/affyHuEx1.bed knownGene.gp ids.txt
echo "select * from affyHumanExon" | \
hgsql hgFixed | tail +2 > expData.txt
affyAllExonGSColumn expData.txt ids.txt column.txt
hgLoadSqlTab hgFixed affyHumanExonGs expData.sql column.txt
hgRatioMicroarray -database=hgFixed -clump=affyHumanExon.ra -minAbsVal=0.01 \
affyHumanExonGs affyHumanExonGsRatio
hgMedianMicroarray hgFixed affyHumanExonGs affyHumanExonExps \
affyHumanExon.ra affyHumanExonGsMedian affyHumanExonMedianExps
hgMedianMicroarray hgFixed affyHumanExonGsRatio \
affyHumanExonExps affyHumanExon.ra affyHumanExonGsRatioMedian \
affyHumanExonMedianExps
hgExpDistance hgFixed affyHumanExonGsRatioMedian \
affyHumanExonGsMedianExps affyHumanExonGsRatioMedianDist
###############################################################
## Affy All Exon GeneSorter column human redo, mouse and rat initial
## DONE (2009-01-29, Andy)
## human first, although mm8 and rn4 have the most concise instructions
ssh hgwdev
cd /hive/data/genomes/hg18/bed/affyAllExonGsColumn
cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
echo "select * from knownGene" | \
hgsql hg18 | tail -n+2 > knownGene.gp
zcat ../affyAllExonProbes/hg18.bed.gz \
| sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
probes.bed knownGene.gp ids.txt
echo "select name,expCount,expScores from affyExonTissues" | \
hgsql hg18 | tail -n+2 > expData.txt
affyAllExonGSColumn expData.txt ids.txt column.txt
hgLoadSqlTab hg18 affyExonTissuesGs expData.sql column.txt
hgMedianMicroarray hg18 affyExonTissuesGs affyExonTissuesExps \
affyHumanExon.ra affyHumanExonGsMedian affyHumanExonMedianExps
grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Human/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
hgLoadSqlTab hg18 affyExonTissuesGsExps expRecord.sql all.expRecords
cat << "EOF" > affyExonTissuesGs.ra
breast breast 0 1 2
cerebellum cerebellum 3 4 5
heart heart 6 7 8
kidney kidney 9 10 11
liver liver 12 13 14
muscle muscle 15 16 17
pancreas pancreas 18 19 20
prostate prostate 21 22 23
spleen spleen 24 25 26
testes testes 27 28 29
thyroid thyroid 30 31 32
EOF
# << emacs;
hgExpDistance hg18 affyExonTissuesGsMedian \
affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist
## mouse
mkdir /hive/data/genomes/mm9/bed/affyAllExonGsColumn
cd /hive/data/genomes/mm9/bed/affyAllExonGsColumn
cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
echo "select * from knownGene" | \
hgsql mm9 | tail -n+2 > knownGene.gp
echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql mm9
sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt
echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql mm9
zcat ../affyAllExonProbes/mm9.bed.gz \
| sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
probes.bed knownGene.gp ids.txt
echo "select name,expCount,expScores from affyExonTissues" | \
hgsql mm9 | tail -n+2 > expData.txt
affyAllExonGSColumn expData.txt ids.txt column.txt
hgLoadSqlTab mm9 affyExonTissuesGs expData.sql column.txt
grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
| grep names | sed 's/names //; s/,$//' | tr ',' '\n' \
| awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \
> affyExonTissuesGs.ra
grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
hgLoadSqlTab mm9 affyExonTissuesGsExps expRecord.sql all.expRecords
hgMedianMicroarray mm9 affyExonTissuesGs mm9.affyExonTissuesGsExps \
affyExonTissuesGs.ra affyExonTissuesGsMedian mm9.affyExonTissuesGsMedianExps
hgExpDistance mm9 affyExonTissuesGsMedian \
affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist
## rat
mkdir /hive/data/genomes/rn4/bed/affyAllExonGsColumn
cd /hive/data/genomes/rn4/bed/affyAllExonGsColumn
cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
echo "select * from knownGene" | \
hgsql rn4 | tail -n+2 > knownGene.gp
echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql rn4
sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt
echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql rn4
zcat ../affyAllExonProbes/rn4.bed.gz \
| sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
probes.bed knownGene.gp ids.txt
echo "select name,expCount,expScores from affyExonTissues" | \
hgsql rn4 | tail -n+2 > expData.txt
affyAllExonGSColumn expData.txt ids.txt column.txt
hgLoadSqlTab rn4 affyExonTissuesGs expData.sql column.txt
grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
| grep names | sed 's/names //; s/,$//' | tr ',' '\n' \
| awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \
> affyExonTissuesGs.ra
grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
hgLoadSqlTab rn4 affyExonTissuesGsExps expRecord.sql all.expRecords
hgMedianMicroarray rn4 affyExonTissuesGs rn4.affyExonTissuesGsExps \
affyExonTissuesGs.ra affyExonTissuesGsMedian rn4.affyExonTissuesGsMedianExps
hgExpDistance rn4 affyExonTissuesGsMedian \
affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist
## mm8
mkdir /hive/data/genomes/mm8/bed/affyAllExonGsColumn
cd /hive/data/genomes/mm8/bed/affyAllExonGsColumn
cp ~/kent/src/hg/lib/exp{Data,Record}.sql .
echo "select * from knownGene" | \
hgsql mm8 | tail -n+2 > knownGene.gp
echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql mm8
sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt
echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql mm8
zcat ../affyAllExonProbes/mm8.bed.gz \
| sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed
overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \
probes.bed knownGene.gp ids.txt
echo "select name,expCount,expScores from affyExonTissues" | \
hgsql mm8 | tail -n+2 > expData.txt
affyAllExonGSColumn expData.txt ids.txt column.txt
hgLoadSqlTab mm8 affyExonTissuesGs expData.sql column.txt
grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
| grep names | sed 's/names //; s/,$//' | tr ',' '\n' \
| awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \
> affyExonTissuesGs.ra
grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \
| grep names | sed 's/^names //; s/,$//' | tr ',' '\n' \
| awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords
hgLoadSqlTab mm8 affyExonTissuesGsExps expRecord.sql all.expRecords
hgMedianMicroarray mm8 affyExonTissuesGs mm8.affyExonTissuesGsExps \
affyExonTissuesGs.ra affyExonTissuesGsMedian mm8.affyExonTissuesGsMedianExps
hgExpDistance mm8 affyExonTissuesGsMedian \
affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist