8f85cfb621850ffe710909abca6d8d91cd1b8929 kate Thu Aug 25 13:02:46 2016 -0700 Add hg38 tables to Gene Sorter. refs #17288 diff --git src/hg/near/makeNear.doc src/hg/near/makeNear.doc index c323852..a5607ae 100644 --- src/hg/near/makeNear.doc +++ src/hg/near/makeNear.doc @@ -1,670 +1,674 @@ # These are instructions for building the tables used by the Gene Sorter # (hgNear) -- these are often also used by hgGene. # Don't start these until there is a knownGene track, or a track with the # usual genePred fields followed by a proteinID field with SwissProt IDs. # for emacs: -*- mode: sh; -*- # Set up db variable (you'll actually need to redo this # after each ssh until we work out a better system.) ssh hgwdev set db = hg19 ######################################################################## # Cluster together various alt-splicing isoforms. hgClusterGenes $db knownGene knownIsoforms knownCanonical ######################################################################## # Use doHgNearBlastp.pl to run blastp on knownGene proteins vs. self and # vs. other Gene Sorter organisms (most recent db for each organism). mkdir -p /cluster/data/$db/bed/hgNearBlastp cd /cluster/data/$db/bed/hgNearBlastp # Get peptide sequences for each db's hgNear geneset: pepPredToFa $db knownGenePep $db.known.faa pepPredToFa mm8 knownGenePep mm8.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer3 ensPep danRer3.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce2 sangerPep ce2.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa # Configure doHgNearBlastp.pl. Use recipBest for all organisms more distant # than mammal-mammal. For mammal-mammal, synBlastp.csh is used instead # after loading the table with doHgNearBlast.pl results -- see below. # NOTE: If $db is not a mammal, then recipBest is probably appropriate # for all queryDbs. # NOTE: if $db does not have knownGene, then change targetGenesetPrefix # to the root of whatever geneset hgNear uses for $db (e.g. sanger for # sangerGene, flyBase for flyBaseGene etc.). cat << _EOF_ > config.ra # Latest human vs. other Gene Sorter orgs: # mouse, rat, zebrafish, fly, worm, yeast targetGenesetPrefix known targetDb hg19 queryDbs mm8 rn4 danRer3 dm3 ce2 sacCer1 recipBest danRer3 dm3 ce2 sacCer1 hg19Fa /cluster/data/hg19/bed/hgNearBlastp/hg19.known.faa mm8Fa /cluster/data/hg19/bed/hgNearBlastp/mm8.known.faa rn4Fa /cluster/data/hg19/bed/hgNearBlastp/rn4.known.faa danRer3Fa /cluster/data/hg19/bed/hgNearBlastp/danRer3.ensPep.faa dm3Fa /cluster/data/hg19/bed/hgNearBlastp/dm3.flyBasePep.faa ce2Fa /cluster/data/hg19/bed/hgNearBlastp/ce2.sangerPep.faa sacCer1Fa /cluster/data/hg19/bed/hgNearBlastp/sacCer1.sgdPep.faa buildDir /cluster/data/hg19/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/hg19HgNearBlastp _EOF_ # Run with -noLoad so we can eyeball files, manually load $db tables now, # and later overload other databases' hgBlastTab tables. doHgNearBlastp.pl -noLoad config.ra >& do.log & tail -f do.log # Run the load scripts for dm3 tables manually as suggested by the # output of doHgNearBlastp.pl: # Load self-blastp (knownBlastp) and $db.??BlastTab immediately: # *** -noLoad was specified -- you can run this script manually to load hg19 tables: run.hg19.hg19/loadPairwise.csh # *** -noLoad was specified -- you can run these scripts manually to load hg19 tables: run.hg19.mm8/loadPairwise.csh run.hg19.rn4/loadPairwise.csh run.hg19.danRer3/loadPairwise.csh run.hg19.dm3/loadPairwise.csh run.hg19.ce2/loadPairwise.csh run.hg19.sacCer1/loadPairwise.csh # For mammal-mammal pairs, run synBlastp.csh: synBlastp.csh $db mm8 synBlastp.csh $db rn4 # **When hgNearOk is set for $db on the RR**, load *.hgBlastTab and # make a separate push request for them. # *** -noLoad was specified -- you can run these scripts manually to load hgBlastTab in query databases: run.mm8.hg19/loadPairwise.csh run.rn4.hg19/loadPairwise.csh run.danRer3.hg19/loadPairwise.csh run.dm3.hg19/loadPairwise.csh run.ce2.hg19/loadPairwise.csh run.sacCer1.hg19/loadPairwise.csh # For mammal-mammal pairs, run synBlastp.csh: synBlastp.csh mm8 $db synBlastp.csh rn4 $db ######################################################################## # MAPPINGS TO OTHER SETS OF IDS # Create table that maps between known genes and RefSeq hgMapToGene $db refGene knownGene knownToRefSeq # Create table that maps between known genes and LocusLink echo "select mrnaAcc,locusLinkId from refLink" | hgsql -N $db > refToLl.txt hgMapToGene $db refGene knownGene knownToLocusLink -lookup=refToLl.txt # Create table that maps between known genes and Pfam domains hgMapViaSwissProt $db knownGene name proteinID Pfam knownToPfam # Create table that maps between known genes and other datasets when # they exist, e.g.: hgMapToGene $db HInvGeneMrna knownGene knownToHInv hgMapToGene $db allenBrainAli -type=psl knownGene knownToAllenBrain hgMapToGene $db ensGene knownGene knownToEnsembl hgMapToGene $db snp130 knownGene knownToCdsSnp -all -cds -ignoreStrand ######################################################################## # EXPRESSION: MAPPING + DISTANCE # Create a table that maps between known genes and # the nice affyUcla expression data. hgMapToGene "-type=bed 12" $db affyUclaNorm knownGene knownToU133 # Create expression distance table. This will take about an hour. cd ~/src/hg/near/hgExpDistance hgExpDistance $db affyUcla affyUclaExp knownExpDistance -weights=affyUcla.weight -lookup=knownToU133 # Format and load the GNF data cd /cluster/data/$db/bed mkdir affyGnf95 cd affyGnf95 affyPslAndAtlasToBed -newType ../affyU95.psl /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 affyGnfU95.tab affyGnfU95Exps.tab -shortOut hgsql $db < ~/src/hg/affyGnf/affyGnfU95.sql # Create table that maps between known genes and # the GNF data. hgMapToGene $db affyU95 knownGene knownToU95 cd ~/src/hg/near/hgExpDistance #hgExpDistance $db affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95 hgExpDistance $db hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95MedianExps gnfU95Distance -lookup=knownToU95 #For worm did instead #hgExpDistance ce1 kimLifeCycleMedian kimWormLifeCycleMedian kimExpDistance hgExpDistance ce1 hgFixed.kimWormLifeMedianRatio hgFixed.kimWormLifeMedianExps kimExpDistance #For mouse did instead hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 ######################################################################## # Make sure that GO database is up to date. See README in /cluster/store1/geneOntology. ######################################################################## ## (markd did this when??) # create Rankprop homology score and PSI-BLAST tables. # requires rankprop code from Bill Noble <noble@gs.washington.edu> # which includes scripts to build these tables on the cluster. # run psiblast ssh hgwdev mkdir /cluster/bluearc/markd/rankprop cd /cluster/bluearc/markd/rankprop # link rankprop to work directory: ln -s /cluster/store7/markd/rankprop/rankprop/bin . mkdir -p blastRun cd blastRun # hs.sw+tr is human swissprot+trembl (on hgwdev) ../bin/cluster-blast-setup hs.sw+tr ssh kk cd /cluster/bluearc/markd/rankprop/blastRun/hs.sw+tr para create jobs.para para try ,... # one job crashed on Query=P01722, no idea why; remove it from database # and rerun # finish up run cd .. ../bin/cluster-blast-finishup hs.sw+tr # run rankprop # rankprop run, on rack9 due to memory sies of kk9 ssh kk9 cd /cluster/bluearc/markd/rankprop/rankpRun/ ../bin/cluster-rankprop-setup -maxHits 1000 hs.sw+tr/max1k 10 cd hs.sw+tr para create jobs.para para try, push, blah cd .. ../bin/cluster-rankprop-finishup hs.sw+tr/max1k # load database cd /cluster/bluearc/markd/rankprop/results spLoadRankProp -noKgIdFile=hs.sw+tr/max1k.hg17.nokg.spids hg17 rankProp hs.sw+tr/max1k.rankp.gz >&hs.sw+tr/max1k.hg17.log spLoadPsiBlast hg17 spPsiBlast hs.sw+tr.eval.gz #--------- ## (aamp andy pohl did this when??) # Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22 hgExpDistance hg17 hgFixed.humanNormalRatio hgFixed.humanNormalExps humanNormalDistance -lookup=knownToLocusLink # Mouse landscape data. hgExpDistance mm6 hgFixed.mouseLandscape hgFixed.mouseLandscapeExps mouseLandscapeDistance -lookup=knownToXM # Remaking some distance tables after the knownGene updates (DONE 01/24/2006 Andy) # (current working directory irrelevant... it's all database) hgExpDistance mm6 hgFixed.mouseLandscape hgFixed.mouseLandscapeExps mouseLandscapeDistance -lookup=knownToXM hgExpDistance hg17 hgFixed.gladHumES hgFixed.gladHumESExps gladHumESDistance -lookup=knownToGnfAtlas2 #---------------------------------------------------------- ## (galt 2005-06-03) # p2p Protein-to-protein network - P2P column and sort order # I wrote the hgNetDist program to calculate network-distances for all gene pairs from gene-to-gene edges in input data .tab, # using the Floyd-Warshall dynamic programming algorithm. # These .tab files are from Josh Stuart /cluster/home/jstuart/Data/Interaction/P2P/{Worm,Fly,Yeast}/Compendium/data.tab # I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/ # added entries to hgNearData/$species/{orderDb,columbDb}.ra # added hgNearData/$species/p2p.html # added hgNearData/p2p.html # #yeast: (1.5 hours for about 5000 genes, 24452 edges) hgNetDist yeastP2P.tab sacCer1 yeastP2P -threshold=3 #fly: (3 hours for about 6500 genes, 19993 edges) # The temporary table bdgpGeneFb2Bdgp was constructed just for fly from this sql command: # create table bdgpGeneFb2Bdgp as # select flyBaseId, name bdgpName from bdgpGene a, bdgpGeneInfo b # where SUBSTRING_INDEX(name, "-R", 1) = bdgpName order by flyBaseId, bdgpName; # hgNetDist flytest.tab dm1 flyP2P -threshold=2 -sqlRemap="select flyBaseId, bdgpName from bdgpGeneFb2Bdgp" #worm: (15 minutes for about 2514 genes, 3871 edges - interaction data available is small) hgNetDist wormP2P.tab ce2 wormP2P -threshold=2 #---------------------------------------------------------- # FLYP2P (DONE 2006-02-06 angie) ssh hgwdev mkdir /cluster/data/dm2/bed/p2p cd /cluster/data/dm2/p2p cp /cluster/data/dm1/p2p/flyP2P.tab . hgNetDist flyP2P.tab dm2 flyP2P \ -sqlRemap='select fbgn,name from flyBase2004Xref' ## (galt 2006-07-31) # Human p2p Protein-to-protein network - P2P column and sort order # I used the hgNetDist program to calculate network-distances for all gene pairs from gene-to-gene edges in input data, # I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/{vidal,wanker} # added entries to hgNearData/Human/{orderDb,columbDb}.ra # added hgNearData/Human/{vidal,wanker}P2p.html # #vidal cat nature04209-s17.xls | gawk '{print $1 "\t" $3 "\t" "1.0"}' > humanVidal.p2p hgLoadNetDist humanVidal.p2p hg18 humanVidalP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA" #Added to hgNearData/Human/hg18/columnDb.ra #------------- name vidalP2p type distance humanVidalP2P query target distance visibility off shortLabel Vidal P2P longLabel Human Protein-Protein Interaction Network from Marc Vidal priority 12 #Added to hgNearData/Human/hg18/orderDb.ra #----------- name vidalP2p shortLabel Vidal Protein-to-Protein longLabel P2P Network Distance to Selected Gene from Marc Vidal data type pair humanVidalP2P query target distance 1 priority 9 #wanker cat table_S3.txt | gawk '{print $4 "\t" $7 "\t" "1.0"}' > humanWanker.p2p hgNetDist wanker/humanWanker.p2p hg18 humanWankerP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA" #Did exactly the same for hg17 also. #Did the same thing to the .ra files all over again for name wankerP2p. --- # show amount of overlap between Vidal and Wanker data sets #These following queries joining both P2P data were #glacial until I added these better indexes: create index ninny on humanVidalP2P(query(9),target(9)); create index ninny on humanWankerP2P(query(9),target(9)); select count(*) from humanVidalP2P v, humanWankerP2P w where v.query=w.query and v.target=w.target; #+----------+ #| count(*) | #+----------+ #| 1661 | #+----------+ #---------------------------------------------------------- ## (galt 2006-09-15 thru 2006-11-06) # HPRD p2p # Human p2p Protein-to-protein network - P2P column and sort order # I used the hgNetDist program to calculate network-distances for all gene pairs from # gene-to-gene edges in input data. # I extended hgNetDist to handle two situations better: # One is duplicate pairs. I have now made it sort the input by distance # so that priority is given to the record for the pair with the shortest distance. # When further records for the same pair occur, they are ignored. # Two is supporting explicit values for self. Until now, these did # not occur in the data, and an automatic self distance of 0 was put out # simply to make GS slightly easier to use. Now I can preserve the distance # if any given in the input for self-self interactions. # Fan Hsu first downloaded the "single" xml to /cluster/store12/hprd/060906/ !! TODO !! # I have also deposited copies of the .tab files used in /cluster/data/$db/p2p/hprd/working, # and I also put the source code for hprdRun there. # added entries to hgNearData/Human/{orderDb,columbDb}.ra # added hgNearData/Human/hprdP2p.html # #hprd #Manually repaired the single.xml file as approved by HPRD #to remove one unneeded tag and one invalid interaction. #Also had to remove the colon in the names like xmlns:xsi #in the second line mv HPRD_SINGLE_PSIMI_060106.xml single-fixed.xml #autoDtd had a bug in that it thinks "02995" is an integer # when it is clearly better handled as an id. # this causes it to truncate leading zeros (atoi) # which are lost. But since knownToHprd has leading zeros, # this causes failure of some records. # To compensate, pad leading zeros in p2p ids. # Jim said I could fix it so I did and checked it in. autoDtd single-fixed.xml out.dtd out.stats -tree=out.tree -atree=out.atree #Counts from stats: interaction 34109 # create parser with autoXml code generator autoXml out.dtd hprd #I wrote hprdRun.c using hprd.c,h #and I ran it to create p2p file (distance 1.5 for complex) and a separate complex table #as jim requested for later use. hprdRun single-fixed.xml hprd.p2p hprdComplex.tab # participant count=1 which is < 2 for participant id = 66942 # this is a known problem with their .xml confirmed by HPRD, ignore it. hgNetDist hprd.p2p hg18 humanHprdP2P -weighted -threshold=2 -sqlRemap="select distinct value, name from knownToHprd" #inTab=hprd.p2p db=hg18 table=humanHprdP2P #reading edges hprd.p2p #slCount(edges)=43567 for hprd.p2p #beginning processing sqlRemap query [select distinct value, name from knownToHprd] #beginning processing data hprd.p2p ... #number of nodes=8898 #412 sqlRemap misses! see missing.tab #e.g. id 6257 not found in aliasHash! # other testing shows that 8486 out of 8898 hprid's match hit in knownToHprd # so that is sufficient coverage. #hgsql hg18 -e 'drop table if exists humanHprdP2P; create table humanHprdP2P (query varchar(255), target varchar(255), distance float);' #hgsql hg18 -e 'load data local infile "hgNetDist.tmp.tab" into table humanHprdP2P ignore 1 lines;' #hgsql hg18 -e 'create index query on humanHprdP2P (query(8));' #Added to hgNearData/Human/hg18/columnDb.ra #------------- name hprdP2p type distance humanHprdP2P query target distance visibility off shortLabel HPRD P2P longLabel Human Protein-Protein Interaction Network from the Human Reference Protein Database priority 12.2 itemUrl http://www.hprd.org/protein/%s itemUrlQuery select value from knownToHprd where name='%s' #Added to hgNearData/Human/hg18/orderDb.ra #----------- name hprdP2p shortLabel HRPD Protein-to-Protein longLabel P2P Network Distance to Selected Gene from the Human Reference Protein Database type pair humanHprdP2P query target distance 1 priority 9.2 #Did exactly the same for hg17 also. # made hgNear/hgNearData/Human/hprdP2p.html description page # added to cvs #cvs committed hgNear changes requested by HPRD # to support linking from the Interaction pair to protein # page on their site. see itemUrlQuery in GS. # HPRD requested an illustration describing the network distances. # Mike Long helped me do the artwork with illustrator and other # software. The final .png is checked into browser/images/, # and copied to hgwdev:/usr/local/apache/images/ # The image is referred to by hprdP2p.html mentioned above. # checked in changes to .ra config files # committed changes # added pushQ entry --- #---------------------------------------------------------- ## (kent 2009-10-16) # HPRD p2p update used in hg19 # First go to http://www.hprd.org, follow the download link, fill in the information they # request for academic users, and download HPRD_SINGLE_PSIMI_070609.xml.tar.gz into # /hive/data/outside/hprd/070609, and then unpack it with cd /hive/data/outside/hprd/070609 tar -zxvf HPRD_SINGLE_PSIMI_070609.xml.tar.gz # Now run the hprdXmlToTab program, which was largely generated by autoDtd/autoXml. hprdXmlToTab HPRD_SINGLE_PSIMI_070609.xml p2p.tab complex.tab # interaction count = 40075 # Now use hgNetDist to generate pathLengthrs file. This takes an hour or two. hgNetDist -verbose=2 -weighted -threshold=2 p2p.tab hprd.pathLengths #---------------------------------------------------------- ## (kg3 hg19 upgrade done kent 2009-10-13) # kg3 hg19 creation of Human p2p Protein-to-protein network - P2P columns # Note could just reuse the pathLengths files calculated in the hg18 build, since # these don't depend on an assembly. #Copy in from hg18 database cp /hive/data/genomes/hg18/p2p /hive/data/genomes/hg19 #hprd hgLoadNetDist /hive/data/outside/hprd/070609/hprd.pathLengths hg19 humanHprdP2P \ -sqlRemap="select distinct value, name from knownToHprd" # hgLoadNetDist 86 id-remapping misses, see missing.tab #vidal hgLoadNetDist /hive/data/genomes/hg19/p2p/vidal/humanVidal.pathLengths hg19 humanVidalP2P \ -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA" # hgLoadNetDist 22 id-remapping misses, see missing.tab #wanker hgLoadNetDist /hive/data/genomes/hg19/p2p/wanker/humanWanker.pathLengths hg19 humanWankerP2P \ -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA" # hgLoadNetDist 54 id-remapping misses, see missing.tab ############################################################### # Affy All Exon GeneSorter column. (DONE Andy, 2008-03-17) # NOTE - in future doing this in genome database rather than # hgFixed, since it needs to change with each gene build.... # Doing so as part of UCSC Genes update, Jim Kent, 2008-07-14) ssh hgwdev cd /cluster/data/hg18/bed mkdir affyAllExonGsColumn cd affyAllExonGsColumn/ echo "select * from knownGene" | \ hgsql hg18 | tail -n+2 > knownGene.gp overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \ ../affyHumanExon/affyHuEx1.bed knownGene.gp ids.txt echo "select * from affyHumanExon" | \ hgsql hgFixed | tail +2 > expData.txt affyAllExonGSColumn expData.txt ids.txt column.txt hgLoadSqlTab hgFixed affyHumanExonGs expData.sql column.txt hgRatioMicroarray -database=hgFixed -clump=affyHumanExon.ra -minAbsVal=0.01 \ affyHumanExonGs affyHumanExonGsRatio hgMedianMicroarray hgFixed affyHumanExonGs affyHumanExonExps \ affyHumanExon.ra affyHumanExonGsMedian affyHumanExonMedianExps hgMedianMicroarray hgFixed affyHumanExonGsRatio \ affyHumanExonExps affyHumanExon.ra affyHumanExonGsRatioMedian \ affyHumanExonMedianExps hgExpDistance hgFixed affyHumanExonGsRatioMedian \ affyHumanExonGsMedianExps affyHumanExonGsRatioMedianDist ############################################################### ## Affy All Exon GeneSorter column human redo, mouse and rat initial ## DONE (2009-01-29, Andy) ## human first, although mm8 and rn4 have the most concise instructions ssh hgwdev cd /hive/data/genomes/hg18/bed/affyAllExonGsColumn cp ~/kent/src/hg/lib/exp{Data,Record}.sql . echo "select * from knownGene" | \ hgsql hg18 | tail -n+2 > knownGene.gp zcat ../affyAllExonProbes/hg18.bed.gz \ | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \ probes.bed knownGene.gp ids.txt echo "select name,expCount,expScores from affyExonTissues" | \ hgsql hg18 | tail -n+2 > expData.txt affyAllExonGSColumn expData.txt ids.txt column.txt hgLoadSqlTab hg18 affyExonTissuesGs expData.sql column.txt hgMedianMicroarray hg18 affyExonTissuesGs affyExonTissuesExps \ affyHumanExon.ra affyHumanExonGsMedian affyHumanExonMedianExps grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Human/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords hgLoadSqlTab hg18 affyExonTissuesGsExps expRecord.sql all.expRecords cat << "EOF" > affyExonTissuesGs.ra breast breast 0 1 2 cerebellum cerebellum 3 4 5 heart heart 6 7 8 kidney kidney 9 10 11 liver liver 12 13 14 muscle muscle 15 16 17 pancreas pancreas 18 19 20 prostate prostate 21 22 23 spleen spleen 24 25 26 testes testes 27 28 29 thyroid thyroid 30 31 32 EOF # << emacs; hgExpDistance hg18 affyExonTissuesGsMedian \ affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist ## mouse mkdir /hive/data/genomes/mm9/bed/affyAllExonGsColumn cd /hive/data/genomes/mm9/bed/affyAllExonGsColumn cp ~/kent/src/hg/lib/exp{Data,Record}.sql . echo "select * from knownGene" | \ hgsql mm9 | tail -n+2 > knownGene.gp echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql mm9 sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql mm9 zcat ../affyAllExonProbes/mm9.bed.gz \ | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \ probes.bed knownGene.gp ids.txt echo "select name,expCount,expScores from affyExonTissues" | \ hgsql mm9 | tail -n+2 > expData.txt affyAllExonGSColumn expData.txt ids.txt column.txt hgLoadSqlTab mm9 affyExonTissuesGs expData.sql column.txt grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \ | grep names | sed 's/names //; s/,$//' | tr ',' '\n' \ | awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \ > affyExonTissuesGs.ra grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords hgLoadSqlTab mm9 affyExonTissuesGsExps expRecord.sql all.expRecords hgMedianMicroarray mm9 affyExonTissuesGs mm9.affyExonTissuesGsExps \ affyExonTissuesGs.ra affyExonTissuesGsMedian mm9.affyExonTissuesGsMedianExps hgExpDistance mm9 affyExonTissuesGsMedian \ affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist ## rat mkdir /hive/data/genomes/rn4/bed/affyAllExonGsColumn cd /hive/data/genomes/rn4/bed/affyAllExonGsColumn cp ~/kent/src/hg/lib/exp{Data,Record}.sql . echo "select * from knownGene" | \ hgsql rn4 | tail -n+2 > knownGene.gp echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql rn4 sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql rn4 zcat ../affyAllExonProbes/rn4.bed.gz \ | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \ probes.bed knownGene.gp ids.txt echo "select name,expCount,expScores from affyExonTissues" | \ hgsql rn4 | tail -n+2 > expData.txt affyAllExonGSColumn expData.txt ids.txt column.txt hgLoadSqlTab rn4 affyExonTissuesGs expData.sql column.txt grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \ | grep names | sed 's/names //; s/,$//' | tr ',' '\n' \ | awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \ > affyExonTissuesGs.ra grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords hgLoadSqlTab rn4 affyExonTissuesGsExps expRecord.sql all.expRecords hgMedianMicroarray rn4 affyExonTissuesGs rn4.affyExonTissuesGsExps \ affyExonTissuesGs.ra affyExonTissuesGsMedian rn4.affyExonTissuesGsMedianExps hgExpDistance rn4 affyExonTissuesGsMedian \ affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist ## mm8 mkdir /hive/data/genomes/mm8/bed/affyAllExonGsColumn cd /hive/data/genomes/mm8/bed/affyAllExonGsColumn cp ~/kent/src/hg/lib/exp{Data,Record}.sql . echo "select * from knownGene" | \ hgsql mm8 | tail -n+2 > knownGene.gp echo "create table knownToKnown (name varchar(255) not null, value varchar(255) not null, index(name(7)))" | hgsql mm8 sed 's/^\(\<.\S\+\>\).*/\1\t\1/' knownGene.gp > knownToKnown.txt echo load data local infile \'knownToKnown.txt\' into table knownToKnown | hgsql mm8 zcat ../affyAllExonProbes/mm8.bed.gz \ | sed 's/\([[:digit:]]\+\)|[[:alpha:]]\+/\1/' > probes.bed overlapSelect -inFmt=genePred -selectFmt=bed -idOutput \ probes.bed knownGene.gp ids.txt echo "select name,expCount,expScores from affyExonTissues" | \ hgsql mm8 | tail -n+2 > expData.txt affyAllExonGSColumn expData.txt ids.txt column.txt hgLoadSqlTab mm8 affyExonTissuesGs expData.sql column.txt grep -A5 affyExonTissuesGroupByTissueMedian ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \ | grep names | sed 's/names //; s/,$//' | tr ',' '\n' \ | awk 'BEGIN{OFS=" "; ix=0;}{print $1, $1, ix, ix+1, ix+2; ix = ix + 3;}' \ > affyExonTissuesGs.ra grep -A5 affyExonTissuesAll ~/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra \ | grep names | sed 's/^names //; s/,$//' | tr ',' '\n' \ | awk 'BEGIN{OFS="\t"; id=0;}{ print id, $1, $1, "n/a", "n/a", "n/a", "3", "n/a,n/a,"$1","; id = id + 1;}' > all.expRecords hgLoadSqlTab mm8 affyExonTissuesGsExps expRecord.sql all.expRecords hgMedianMicroarray mm8 affyExonTissuesGs mm8.affyExonTissuesGsExps \ affyExonTissuesGs.ra affyExonTissuesGsMedian mm8.affyExonTissuesGsMedianExps hgExpDistance mm8 affyExonTissuesGsMedian \ affyExonTissuesGsMedianExps affyExonTissuesGsMedianDist ############################################################### ## GTEx ## (2016-08-16, kate) # Create a table that maps between known genes and GTEx (GENCODE V19) gene ids -#hgMapToGene "-type=bed 12" hg19 gtexGeneModel knownGene knownToGtex cd /hive/data/genomes/hg19/bed/gtex + #hgMapToGene hg19 -type=genePred gtexGeneModelV6 knownGene knownToGtex hgMapToGene hg19 -noLoad -all -type=genePred gtexGeneModelV6 knownGene knownToGtex.all hgMapToGene hg19 -all -type=genePred gtexGeneModelV6 knownGene knownToGtex #hgMapToGene hg38 -type=genePred gtexGeneModelV6 knownGene knownToGtex hgMapToGene hg38 -all -type=genePred gtexGeneModelV6 knownGene knownToGtex # Create expression distance table from median table. This will take about an hour. #cd ~/src/hg/near/hgExpDistance # TODO: Consider adding weights file hgsql hgFixed -e select 1.0, id, name from gtexTissue > gtex.weights # edit to downweight brain (13 tissues total) hgExpDistance hg19 -verbose=2 -lookup=knownToGtex -weights=gtex.weights \ hgFixed.gtexTissueMedian hgFixed.gtexTissue \ - gtexDistance2 >&! distance2.log & + gtexDistance + +hgExpDistance hg19 -verbose=2 -lookup=knownToGtex -weights=gtex.weights \ + hgFixed.gtexTissueMedian hgFixed.gtexTissue \ + gtexDistance # Create ratio table. # TODO: Consider adding -clump file # Defaults for microarray are too high -- RPKM is significant >2 roughly hgRatioMicroarray -minAbsVal=2 -minMaxVal=5 -database=hgFixed gtexTissueMedian gtexTissueMedianRatio2 # Get max scores for columnDb.ra hgMaxExp hgFixed gtexTissueMedian # 219385.91 hgMaxExp hgFixed gtexTissueMedianRatio # 12.53