src/hg/makeDb/doc/visiGene.txt 1.10
1.10 2009/10/14 15:29:11 kent
Creating knownToHprd column.
Index: src/hg/makeDb/doc/visiGene.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/visiGene.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/makeDb/doc/visiGene.txt 9 Sep 2008 21:59:09 -0000 1.9
+++ src/hg/makeDb/doc/visiGene.txt 14 Oct 2009 15:29:11 -0000 1.10
@@ -1,1450 +1,1451 @@
# CREATE EMPTY DATABASE AND TABLES.
hgsql -e "create database visiGeneNew" mysql
hgsql visiGeneNew < ~/kent/src/hg/visiGene/visiGene.sql
makeTableDescriptions visiGeneNew ~/kent/src/hg/visiGene/visiGene.as
# LOAD PAUL GRAY/MAHONEY LAB DATA.
# Transferred images from Paul Gray's Mac to mine and converted
# his spreadsheet to a tab-separated file, cloning.tab.
cd ~/kent/src/hg/visiGene/vgLoadMahoney
vgLoadMahoney /gbdb/visiGene mm5 cloning.tab clonePcr.bed outDir
cd outDir
visiGeneLoad whole.ra whole.tab /dev/null -database=visiGeneNew
visiGeneLoad slices.ra slices.tab /dev/null -database=visiGeneNew
# LOAD JACKSON LABS DATA.
# First ask Galt to create a local copy of the Jackson labs
# database. I'm not sure how he did it.
cd ~/kent/src/hg/visiGene/vgLoadJax
vgLoadJax /gbdb/visiGene jackson visiGene
./loadNew
# Update the privateUser fields where we don't have permissions by entering
# this at the mysql prompt.
# Mark as private (privateUser=-1) the jax% submission sets whose journal
# is named 'Nature' or has a name starting with 'Nat '.
update submissionSet,journal set submissionSet.privateUser=-1
where (journal.name like 'Nat %' or journal.name = 'Nature')
and submissionSet.journal = journal.id and submissionSet.name like 'jax%';
# LOAD NIBB IMAGES
# Do this after creating the nibbImageProbe.fa file as described
# in makeXenTro1.doc, and after creating the nibbImageProbes table
# in hg17 as described in makeHg17.doc. The image files are
# loaded in /cluster/store11/visiGene/offline/nibbFrog.
ssh kolossus
cd /cluster/store11/visiGene/offline
nibbParseImageDir nibbFrog nibFrog.tab bad.tab
nibbPrepImages nibbFrog nibFrog.tab \
/cluster/store11/visiGene/gbdb/200/inSitu/XenopusLaevis/nibb \
/cluster/store11/visiGene/gbdb/full/inSitu/XenopusLaevis/nibb
# Note the nibbPrepImages step is a 2 day process, next time may
# want to run it on the kki cluster. It does need to be run on a 64
# bit machine because of bugs in the 32 bit image magick convert program.
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadNibb
hgMapToGene hg17 nibbImageProbes knownGene knownToNibbImage
# Now go into the gene sorter on hg17, configure it to just show
# the name, genbank, and NIBB Xenopus columns. Filter on * in the
# NIBB Xenopus column (which will get rid of rows with no data in that
# column). Save the text output to names.raw. Then get rid of names
# that are no more than genbank accessions as so:
awk '$1 != $2 {printf("%s\t%s\n", $1, $3);}' names.raw > names.txt
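# Now create the .tab and .ra files as so:
# (NOTE: the awk step above writes names.txt but vgLoadNibb below reads
# names.tab -- confirm which file name is intended before re-running.)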
vgLoadNibb /cluster/store11/visiGene/offline/nibbFrog \
/cluster/store11/visiGene/offline/nibbFrog.tab \
/cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \
names.tab stage.tab outDir
visiGeneLoad outDir/nibb.ra outDir/nibb.tab /dev/null -database=visiGeneNew
# LOAD GENSAT IMAGES
# This was done with the assistance of Mike Dicuccio at NCBI,
# dicuccio@ncbi.nlm.nih.gov. If updating probably it's best to
# get in touch with him and make sure that the ftp site is up to
# date.
# Download data from NCBI into /cluster/store11/visiGene/offline/gensat
cd /cluster/store11/visiGene/offline
mkdir gensat
cd gensat
mkdir RawData
cd RawData
wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/GENSAT-20051120.xml.gz
wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/NCBI_Gensat-20051120.dtd
# At this point if the dtd has changed you may need to remake
# kent/src/hg/visiGene/gensat/lib/gs.c with autoXml. Once
# this is done then do the download with gensatImageDownload.
# It'll take about 3 days. The results will be in the Institutions dir.
cd /cluster/store11/visiGene/offline/gensat
zcat RawData/GENSAT-20051120.xml.gz | gensatImageDownload . download.log
# Create parasol directory and a list of the jpg files.
ssh kki
cd /cluster/store11/visiGene/offline/gensat
mkdir prepImageRun
# List every jpg with the leading "Institutions/" removed; the sed s command
# needs all three delimiters (pattern "Institutions\/", empty replacement).
find Institutions -name '*.jpg' -print | sed 's/Institutions\///' > prepImageRun/jpg.lst
cd prepImageRun
# Create parasol batch
# gensub2 template for the gensat image prep; the vgPrepImage job line
# between #LOOP and #ENDLOOP must be a single unbroken line.
cat << '_EOF_' > gsub
#LOOP
vgPrepImage /cluster/store11/visiGene/offline/gensat/Institutions /cluster/store11/visiGene/gbdb/200/inSitu/Mouse/gensat /cluster/store11/visiGene/gbdb/full/inSitu/Mouse/gensat $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 jpg.lst single gsub spec
para make spec
# Note the above procedure would take about 3 days. I ended up copying the
# data over to /san/sanvol1, and doing it on the pita cluster. The job
# there just took two hours, with just 100 cpus available. It took
# an hour to copy the data over, and eight hours to copy it back though,
# and some tweaking.
# MAKE FULL TEXT INDEX
cd /cluster/store11/visiGene/gbdb
vgGetText visiGene.text mm7 hg17
ixIxx visiGene.text visiGene.ix visiGene.ixx
# (Galt 2006-02)
# RSYNC'd from /cluster/store11/visiGene to /san/sanvol1/visiGene
# and moved the /gbdb/visiGene symlink to point to the new location.
# I also had to manually run a script to find symlinks pointing from full/ over to
# /cluster/store11/offline and remake them to point correctly to /san/sanvol1/visiGene/offline.
# Allen Brain Atlas jp2 image prep (Galt 2006-02-12)
# Create parasol directory and a list of the jp2 files (stored in jpg.lst).
ssh pk
cd /san/sanvol1/visiGene/offline/allenBrain
mkdir prepImageRun
find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' > prepImageRun/jpg.lst
cd prepImageRun
# Create parasol batch
cat << '_EOF_' > gsub
#LOOP
vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 jpg.lst single gsub spec
para make spec -maxNode=50
[pk:/san/sanvol1/visiGene/offline/allenBrain/prepImageRun> /parasol/bin/para time
11748 jobs in batch
4291 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 11748 of 11748 jobs
CPU time in finished jobs: 474919s 7915.32m 131.92h 5.50d 0.015 y
IO & Wait Time: 5029116s 83818.60m 1396.98h 58.21d 0.159 y
Average job time: 469s 7.81m 0.13h 0.01d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 41811s 696.85m 11.61h 0.48d
Submission to last job: 172301s 2871.68m 47.86h 1.99d
# -maxNode=50 was needed.
# Note that because it opens up to 40 output files at the same time, it overwhelms NFS
# when a lot of nodes are running, and it can bring down the SAN. Because I was nearly
# done when it came back up, I just re-pushed with -maxNode=50 to keep it under control.
# However in the future, something like this should be done to keep the file access local
# as much as possible.
# Here is the proposed new way:
# -----------------------
cat << '_EOF_' > gsub
#LOOP
./vgPrep.csh $(path1) $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# vgPrep.csh <path1> <root1> <file1>   (arguments as passed by the gsub template above)
# Proposed per-job wrapper: creates the 200/ and full/ destination directories,
# copies the source image to /scratch/tmp, runs vgPrepImage with its output
# going to per-job directories under /scratch/tmp, copies the results back to
# the SAN only if vgPrepImage succeeded, always removes the scratch files,
# and exits 1 on failure.
# NOTE(review): vgPrepImage is still given the SAN imageDisk directory and $1,
# so the copy made in /scratch/tmp/$3 does not appear to be read - confirm.
# NOTE(review): mkdir -p is given $1 (the value passed as <path1>); if that is
# a file path this creates a directory named after the file - confirm.
cat << '_EOF_' > vgPrep.csh
#!/bin/tcsh
mkdir -p /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain/$1
mkdir -p /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain/$1
cp /san/sanvol1/visiGene/offline/allenBrain/imageDisk/$1 /scratch/tmp/$3
vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /scratch/tmp/vg200$2 /scratch/tmp/vgfull$2 $1
set err = $status
if (! $err ) then
cp -r /scratch/tmp/vg200$2/* /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain
cp -r /scratch/tmp/vgfull$2/* /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain
endif
rm -f /scratch/tmp/$3
rm -fr /scratch/tmp/vg200$2
rm -fr /scratch/tmp/vgfull$2
if ( $err ) then
exit 1
endif
'_EOF_'
# << this line makes emacs coloring happy
# -----------------------
# ADDED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6:
# Ran /san/sanvol1/offline/level56Run/ cluster job on a list of all files dumped
# from the visiGene.imageFile table so that we made new zoom out levels 5 and 6
# for all pictures. Since it was a special one-time deal, I just used ImageMagick.
# vgPrepImage.c has been modified to do the 2 new zoomout levels so that they
# will be built automatically in future.
# Ran several checks to make sure no files were missing, fixed any errors.
# Found embedded space in some nibb filenames, found a couple of gensat
# images which had previously failed to download and redownloaded them ok.
# Found a few missing things and 0 bytes jpgs and re-ran them.
# It should be pretty clean right now.
# LOAD ALLEN BRAIN DATA
vgLoadAllen \
/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
/san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab \
/cluster/data/mm7/bed/allenBrain/allProbes.fa \
/cluster/data/mm7/bed/allenBrain/allProbes.tab \
output
#backed-up data in case of trouble:
mkdir /san/sanvol1/visiGene/dump/visiGene.20060220
hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20060220
#load into visiGene db
visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null
# Manually added several researchers names to the contributor and submissionContributor tables
# at the request of Susan Sunkin as well as updating the text for contributor, copyright, acknowledgements.
# I manually also updated aba.ra and vgLoadAllen.c to reflect her changes. The manual mods
# to contributor which work great in the visiGene search are not currently automatically
# supported, and would thus be lost if we ever nuke it and start fresh.
# At some point, we will probably add an additional field to the .ra structure
# and have visiGeneLoad support it.
# RE-MAKE FULL TEXT INDEX
# Use the full source-tree path so the cd works from any working directory.
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
# basically does this, but puts it in cgi-bin/visiGeneData/:
#vgGetText visiGene.text mm7 hg17
#ixIxx visiGene.text visiGene.ix visiGene.ixx
# (hgVisiGene cgi v128 now knows about this new location)
############################
-# REBUILD WITH NEW vgProbeTrack PROGRAM - AFTER ADDING ALLEN BRAIN (DONE galt 2006-03-15)
+# REBUILD WITH NEW vgProbeTrack PROGRAM - AFTER ADDING ALLEN BRAIN (DONE galt 2009-10-12)
# (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes
-cd /san/sanvol1/visiGene/dump
+cd /hive/data/inside/visiGene/dump
# (this backup shown is really an example template for the next person who needs to do this)
-mkdir visiGene20060315
-cd visiGene20060315
+mkdir visiGene.20060315
+cd visiGene.20060315
hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6
mkdir mm7; hgsqldump mm7 vgProbes -T mm7
mkdir mm8; hgsqldump mm8 vgProbes -T mm8
+mkdir mm9; hgsqldump mm9 vgProbes -T mm9
mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
#(do any others needed that might not be listed here)
#(document the reason for making the backup)
echo 'vgLoadAllenBrain has been run, so making backup of visiGene db and probe tracks before updating, ' > README
# OK, NOW USE vgProbeTrack TO UPDATE
cd ~/kent/src/hg/visiGene/vgProbeTrack
# -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
# so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
# I happen to know that only AllenBrain was updated since last time, and that is mouse only
# populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
vgProbeTrack POP
# find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
# must specify a specific assembly to use, so just using mm7 since mm8 still in qa.
# this finds any stuff for the mouse taxon
vgProbeTrack SEQ working mm7
# create alignments using either refSeqAli or all_mrna or bacEnds or blat. Took 1.5 hours.
# alignments are individually tracked per assembly here
# alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
# it only looks for things that have not already attempted alignment
# the status goes into visiGene.vgPrbAli with .db="mm7"
# because mm7.vgProbes is a new table, to create it we include the -sqlPath so
# it can find the vgProbes.sql script
vgProbeTrack ALI working mm7 -sqlPath=..
# this finds any seq required for mm7.vgProbes track not already in mm7.seq
# adds the new .fa file in /cluster/data/mm7/bed/visiGene/
# adds a symlink to it in /gbdb/mm7/visiGene/
# and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq
vgProbeTrack EXT working mm7
# mm6.vgProbes was already complete from previous probe track creation,
# it just needed to catch the new Allen Brain probes and align them. About 1.5 hours.
vgProbeTrack ALI working mm6
vgProbeTrack EXT working mm6
# hg17.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse
# this internally uses pslMap against the mm7 to hg17 liftover chain.gz
# Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
# and maintains the list of processed alignments in visiGene.vgPrbAliAll.
vgProbeTrack PSLMAP working hg17 mm7
# updates hg17.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene
# and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq.
# if a sequence has already been loaded it will not be loaded again.
vgProbeTrack EXTALL working hg17
# hg18.vgAllProbes never existed before
vgProbeTrack PSLMAP working hg18 mm7 -sqlPath=..
# because the nibb blatz probe track hg18.nibbImageProbes was never done on hg18
# until just now (see makeHg18.doc), we have to add it for the first time.
# "nibb" is not really a db here, so I manually put in a taxon mapping for it,
# so it appears as Xenopus laevis 8355, see the source code.
vgProbeTrack REMAP working hg18 nibb nibbImageProbes /gbdb/hg18/nibbImageProbes.fa
vgProbeTrack EXTALL working hg18
# mm8 is in qa and so it is basically ready to use now. About 1.5 hours.
vgProbeTrack ALI working mm8 -sqlPath=..
vgProbeTrack EXT working mm8
# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
#knownToVisiGene mm6
#knownToVisiGene mm7
#knownToVisiGene mm8
#knownToVisiGene hg17 -fromProbePsl=vgAllProbes
#knownToVisiGene hg18 -fromProbePsl=vgAllProbes
############################
### JACKSON UPDATE (done 2006-04-01 galt) #############
# updated jackson20060328 db on kkr3u00 (see hg/visiGene/jackson/makeJackson.doc)
# Dropped old visiGeneOld db, asked Heather to clone visiGene db to visiGeneOld db,
# and then ran this query to remove the old previous JAX info:
# MULTI-TABLE DELETE:
delete submissionSource, submissionSet, submissionContributor, image,
imageProbe, expressionLevel, imageFile from
submissionSource so,
submissionSet ss,
submissionContributor sc,
image i,
imageProbe ip,
expressionLevel el,
imageFile f
where so.id = 2
and ss.submissionSource = so.id
and sc.submissionSet = ss.id
and i.submissionSet = ss.id
and ip.image = i.id
and el.imageProbe = ip.id
and f.submissionSet = ss.id;
#delete query (get rid of all submissionSource.id=2)
#Query OK, 164717 rows affected (48 min 16.07 sec)
# Workaround for uniProt access from kkr3u00
ssh hgwdev
setenv jdb jackson20060328
cd ~/kent/src/hg/visiGene/vgLoadJax
hgsqldump uniProt taxon commonName -T .
ssh kkr3u00
setenv jdb jackson20060328
cd ~/kent/src/hg/visiGene/vgLoadJax
hgsql mysql -e "create database uniProt"
hgsql uniProt < taxon.sql
hgsql uniProt < commonName.sql
# hgsql uniProt -e 'show tables'
hgsql uniProt -e "load data local infile 'taxon.txt' into table taxon"
hgsql uniProt -e "load data local infile 'commonName.txt' into table commonName"
# hgsql uniProt -e 'show table status\G'
# cleanup
rm taxon.*
rm commonName.*
#update vgLoadJax.c to update the date given in .ra acknowledgements
#recompile vgLoadJax on dev
#run vgLoadJax to create .ra .tab .txt for each submissionSet
ssh kkr3u00
setenv jdb jackson20060328
cd ~/kent/src/hg/visiGene/vgLoadJax
#remove any old data dir
rm -fr visiGene/
# visiGene in line below is just an output dir for the .ra/.tab/.txt files
~/bin/i386/vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene
#ref 32185: missing title from BIB_Refs, ref skipped
#Calculating age from postnatal
#ref 67768: missing title from BIB_Refs, ref skipped
#Calculating age from postnatal month 3
#Calculating age from postnatal
#Calculating age from postnatal
#Calculating age from postnatal month 4
#Calculating age from postnatal month 4
#Calculating age from Not Specified 12.5
#refCount=2970
#ran loadAll to load the updated jax .ra .tab .txt into visiGene db
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax
loadAll
#loadAll.output has 1112 lines like
#visiGene/100423.ra
# ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
# output:
#vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17
#probe has 19276 rows
#gene has 15173 rows
#imageProbe has 115500 rows
# recompiled hgVisiGene
############################
# REBUILD PROBETRACK (DONE galt 2006-04-04)
# WITH vgProbeTrack PROGRAM - AFTER DOING JAX UPDATE 20060328
# (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes
cd /san/sanvol1/visiGene/dump
# (this backup shown is really an example template for the next person who needs to do this)
mkdir visiGene.20060404
cd visiGene.20060404
hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6
mkdir mm7; hgsqldump mm7 vgProbes -T mm7
mkdir mm8; hgsqldump mm8 vgProbes -T mm8
mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
#(do any others needed that might not be listed here)
#(document the reason for making the backup)
echo 'vgLoadJax jackson20060328 has been run, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README
# OK, NOW USE vgProbeTrack TO UPDATE
cd ~/kent/src/hg/visiGene/vgProbeTrack
# Make sure vgProbeTrack program is up to date
make
# -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
# so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
# I happen to know that only JAX was updated since last time, and that is mouse only
# populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
vgProbeTrack POP
#new probe records found = 1285, # new vgPrb records added = 1285
# most of these are old, but we updated JAX by dropping completely and re-adding
# so these probes find their way back via sequence identity of probes in vgPrb.sequence
# find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
# must specify a specific assembly to use, so mm7 is ready to use now, mm8 still in qa.
# this finds any stuff for the mouse taxon
vgProbeTrack SEQ working mm7
#rc = 0 = count of primers for mrna search for taxon 10090
#rc = 0 = count of primers for genome search for taxon 10090
#bac list read done.
#found seq for 0 bacEndPairs
#rc = 549 = count of refSeq mrna for mm7
#rc = 18 = count of genRef mrna for mm7
#rc = 33 = count of genbank mrna for mm7
#rc = 428 = count of flatRef mrna for mm7
#rc = 0 = count of flatAll mrna for mm7
#rc = 1 = count of linkRef mrna for mm7
#rc = 0 = count of linkAll mrna for mm7
#rc = 1 = count of kgAlRef mrna for mm7
#rc = 37 = count of kgAlAll mrna for mm7
# create alignments using either refSeqAli or all_mrna or bacEnds or blat. Took 1.5 hours.
# alignments are individually tracked per assembly here
# alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
# it only looks for things that have not already attempted alignment
# the status goes into visiGene.vgPrbAli with .db="mm7"
vgProbeTrack ALI working mm7
# this finds any seq required for mm7.vgProbes track not already in mm7.seq
# adds the new .fa file in /cluster/data/mm7/bed/visiGene/
# adds a symlink to it in /gbdb/mm7/visiGene/
# and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq
vgProbeTrack EXT working mm7
# mm6.vgProbes was already complete from previous probe track creation,
# it just needed to catch the new JAX probes and align them. About 1.5 hours.
vgProbeTrack ALI working mm6
vgProbeTrack EXT working mm6
# hg17.vgAllProbes was pre-existing with all probes, just need to add the new JAX mouse probes
# this internally uses pslMap against the mm7 to hg17 liftover chain.gz
# Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
# and maintains the list of processed alignments in visiGene.vgPrbAliAll.
vgProbeTrack PSLMAP working hg17 mm7
# updates hg17.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene
# and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq.
# if a sequence has already been loaded it will not be loaded again.
vgProbeTrack EXTALL working hg17
# hg18.vgAllProbes existed before
vgProbeTrack PSLMAP working hg18 mm7
vgProbeTrack EXTALL working hg18
# mm8 is in qa and so it is basically ready to use now.
vgProbeTrack ALI working mm8
vgProbeTrack EXT working mm8
# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
knownToVisiGene mm6
knownToVisiGene mm7
knownToVisiGene mm8
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
# update text/index for visiGene
# The previous cd left us in ~/kent/src/hg/visiGene/vgProbeTrack, so use the
# full source-tree path rather than a relative one.
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
#vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17
#probe has 19276 rows
#gene has 15173 rows
#imageProbe has 115500 rows
############################
#
# Patch contributors so we can search MGI submission sets
# by specifying JAX or MGI in the search box.
#
select id from submissionSource where name = 'MGI';
+----+
| id |
+----+
| 6 |
+----+
# note: we have to double the search word or else the search doesn't work
insert into contributor set name = 'JAX JAX';
insert into contributor set name = 'MGI MGI';
mysql> select * from contributor where name in ('JAX JAX','MGI MGI');
+------+---------+
| id | name |
+------+---------+
| 3981 | JAX JAX |
| 3982 | MGI MGI |
+------+---------+
insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 6;
insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 6;
##### ADD SUPPORT FOR ABURL (DONE 2006-04-19 galt)
# I manually updated this, currently needed only by JAX,
# adding antibodySource table that maps abSubmitId to antibody and submissionSource
# and adding field abUrl to submissionSource table.
# The code for vgLoadJax and visiGeneLoad were also updated to support this new
# link from antibody probe to submissionSource website for further details.
# Since this will be automatically maintained in future, no point in belaboring the makefile.
# This also involved an update to hgVisiGene including passing submissionSource id on
# the url to the primers page so that the external link can be made when it is an antibody.
##### REPLICATED submissionSet.privateUser SETTINGS TO NEW VISIGENE DB (DONE 2006-04-24 galt)
# This was an oversight caused by full removal of all old jax submissionSets
# when we did the jax 2006-03-28 update. Since we had lost the privateUser settings,
# I just replicated it from visiGeneOld with a simple query.
update visiGene.submissionSet n, visiGeneOld.submissionSet o set n.privateUser=-1
where o.privateUser=-1 and o.name = n.name;
# currently this is just jax submissionSets for which we have not received permissions to use.
##### ADDED IMAGEFILE-FORWARDING TO COMBINE MAHONEY AND JAX-MAHONEY ANNOTATIONS (DONE 2006-04-26 galt)
# The idea here is that JAX has some useful annotations, but including them made a lot of
# unnecessary duplication in the system. Although it wasn't easy, we have come up with
# a method to map the imageFiles from Mahoney to the ones in JAX. We have made imageFileFwd table
# to store that mapping information, and added code to hgVisiGene to use it. Wholemount steps
# are manual, while slices steps use hg/visiGene/vgLoadJax/forwardSlices.c I wrote to map them Mah->JAX.
# Additional complications are that JAX combined several slices together into one image
# following a certain pattern. Luckily for the wholemounts, the original images were not modified by JAX.
# This means that we can get a perfect match Mah->JAX for the wholemounts using md5sum (produced unique values).
# Because both we and JAX imported the Mahoney data/spreadsheet into auto-incremented primary keyfield
# tables, the original order is preserved and allows a surprisingly good mah->jax many-to-one slices mapping.
# I also extended vgLoadJax to be able to find the primers in the PRB_Notes which was useful both
# for mapping the slices, and because we end up using JAX annotations for the fullCaption() page,
# so that we don't lose primer info. For the remaining fraction where Mahoney never supplied primers
# (actually they have since updated the info, but neither JAX nor we have gotten that yet),
# something over 20%, we have managed to instead just map on gene. This worked surprisingly well,
# and made either correct or very close matches.
# Since previously, vgLoadJax looked for the mahoney set in jax and excluded it, we need to
# make and import it into visiGene. I have removed the skipping of mahoney set from the
# vgLoadJax code (so that next time we update jax, the mahoney set will not be excluded)
# and added a commandline option to do just a single submission set. I happen to know that
# the mahoney set in jax is jax92242. THIS STEP WON'T BE NEEDED IN FUTURE.
# the latest jax sybase db conversion is on kkr3u00 because it had space and little use.
ssh kkr3u00
cd ~/kent/src/hg/visiGene/vgLoadJax
# clean out any old subdirectory
rm -fr visiJaxMahoney
# process just the jaxMahoney submissionSet
${HOME}/bin/i386/vgLoadJax -oneSubmissionSet=92242 /san/sanvol1/visiGene/gbdb jackson20060328 visiJaxMahoney
# load it into visiGene db
ssh hgwdev
visiGeneLoad visiJaxMahoney/92242.ra visiJaxMahoney/92242.tab visiJaxMahoney/92242.txt
# we are going to treat the jax version of Mahoney as "privateUser"
# in order to suppress it and reduce the duplication of Mahoney images
hgsql visiGene -e 'update submissionSet set privateUser=-1 where name like "jax92242"'
submissionSets: (for reference)
name id
-----------------------------------
mahoneyWhole = 1
mahoneySlices01 = 2
jax92242 = 1820
ssh hgwdev
cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/wholeMount
md5sum *.jpg | sed -e 's/ /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/mahoneyWholeMount.md5
#(quick - 1 or 2 minutes only)
#Find the jax-Mahoney images that are WholeMount
ssh hgwdev
cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax
# WARNING: change database name and submissionSet id constants in text below if needed!!!
# jaxMahoney = 1820, bodyPart.id = 1 for name="whole"
hgsql visiGene -BN -e 'select distinct imageFile.fileName from imageFile, image, specimen, bodyPart \
where imageFile.submissionSet=1820 and bodyPart=1 \
and image.imageFile=imageFile.id and image.specimen=specimen.id' \
| xargs md5sum | sed -e 's/ /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneyWholeMount.md5
cd ~/kent/src/hg/visiGene/vgLoadJax
# verify that they are unique by md5sum:
wc -l *.md5
1833 jaxMahoneyWholeMount.md5
1843 mahoneyWholeMount.md5
sort -k 1,1 -u jaxMahoneyWholeMount.md5 | wc -l
1833
sort -k 1,1 -u mahoneyWholeMount.md5 | wc -l
1843
hgsql visiGene
create table mahoneyWholeMountMd5 (
md5 char(32) not null, # md5 sum of .jpg
fileName varchar(10) not null, # .jpg fileName
INDEX(md5),
INDEX(fileName)
);
load data local infile 'mahoneyWholeMount.md5' into table mahoneyWholeMountMd5;
analyze table mahoneyWholeMountMd5;
create table jaxMahoneyWholeMountMd5 (
md5 char(32) not null, # md5 sum of .jpg
fileName varchar(10) not null, # .jpg fileName
INDEX(md5),
INDEX(fileName)
);
load data local infile 'jaxMahoneyWholeMount.md5' into table jaxMahoneyWholeMountMd5;
analyze table jaxMahoneyWholeMountMd5;
# verify that they match uniquely and completely:
select count(*) from mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j where m.md5 = j.md5;
+----------+
| count(*) |
+----------+
| 1833 |
+----------+
# make forwarding table (NO NEED TO DO IN FUTURE, IS IN visiGene.as,.sql)
CREATE TABLE imageFileFwd (
fromIf int not null, # From imageFile
toIf int not null, # To imageFile
#Indices
INDEX(fromIf),
INDEX(toIf)
);
# WARNING: change submissionSet ids!!!
# find how the mahoney matches to the jaxMahoney
# (I verified that all filenames are unique in all 3 submissionSets: jaxM, mWhole, mSlices)
insert into imageFileFwd
select mi.id, ji.id from imageFile mi, imageFile ji, mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j
where m.md5 = j.md5 and mi.fileName=m.fileName and ji.fileName=j.fileName
and mi.submissionSet=1 and ji.submissionSet=1820;
# Records: 1828
# the wholemounts are now done, so let's do the slices next!
# Cluster Run to do OCR on jaxMahoneySlices (Galt 2006-04-28)
# if program ocrad is not in /cluster/bin/i386, download and compile it (very easy)
# ocrad is a gnu program
# Create parasol directory and a list of the jpg files.
ssh hgwdev
cd /san/sanvol1/visiGene/offline/jax
mkdir ocrJaxMahoneyRun
cd ocrJaxMahoneyRun
mkdir output
# make list of jaxMahoneySlice .jpgs
# WARNING: change database and submissionSet ids!!!
hgsql visiGene -BN -e 'select distinct imageFile.fileName from \
imageFile, image, specimen \
where imageFile.submissionSet=1820 and bodyPart<>1 \
and image.imageFile=imageFile.id and image.specimen=specimen.id' \
> jaxMahoneySlices.list
# Create parasol batch
cat << '_EOF_' > gsub
#LOOP
./ocrSlices.csh $(file1) $(root1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# ocrSlices.csh <jpgFile> <rootName>   (called from the gsub template as $(file1) $(root1))
# Writes output/<rootName>.map with one "<jpgFile>\t<accession>" line per
# accession found by OCR, or a single "<jpgFile>\tNO_TEXT" line if none is found.
# Steps: remove any stale output/<rootName>.map and .ocr; convert the jax jpg
# to output/<rootName>.pgm; append ocrad output for the default threshold and
# for thresholds .4 through .9 to the .ocr file; translate the characters
# l,O,o,i,I to 1,0,0,1,1 and delete periods; extract the unique accessions
# matching T plus 8 digits (followed by "aa", except for root names 7996-8060).
# For 7996-8060 an "aa" or "00" suffix is appended when a path with that name
# exists under the mahoney slices directory.
# The intermediate output/<rootName>.temp is removed; the .pgm and .ocr files are kept.
# NOTE(review): "@ x = $2" presumably relies on <rootName> being numeric - confirm.
cat << '_EOF_' > ocrSlices.csh
#!/bin/tcsh -ef
if ( -e output/$2.map ) then
rm output/$2.map
endif
if ( -e output/$2.ocr ) then
rm output/$2.ocr
endif
convert /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/$1 output/$2.pgm
# There wasn't a single threshold value that worked, so do entire series
/cluster/bin/i386/ocrad --threshold=.4 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.5 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.6 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.7 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.8 --charset=ascii output/$2.pgm >> output/$2.ocr
/cluster/bin/i386/ocrad --threshold=.9 --charset=ascii output/$2.pgm >> output/$2.ocr
@ x = $2
# special handling for 7996.jpg thru 8060.jpg, the Accession does not end in "aa" for these.
if ( ($x >= 7996) && ($x <= 8060) ) then
cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8})/gs )' | sort -u > output/$2.temp
else
cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8}aa)/gs )' | sort -u > output/$2.temp
endif
set tempTs = ( `cat output/$2.temp` )
if ( $#tempTs > 0 ) then
foreach t ( $tempTs )
if ( ($x >= 7996) && ($x <= 8060) ) then # special handling for these
if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}aa) then
set t = "${t}aa"
endif
if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}00) then
set t = "${t}00"
endif
endif
echo "$1\t$t" >> output/$2.map
end
else
echo "$1\tNO_TEXT" > output/$2.map
endif
rm output/$2.temp
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x ocrSlices.csh
ssh pk
cd /san/sanvol1/visiGene/offline/jax/ocrJaxMahoneyRun
gensub2 jaxMahoneySlices.list single gsub spec
para create spec
para try
para push
para check
para time
#2095 jobs in batch
#292661 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 2095 of 2095 jobs
#CPU time in finished jobs: 1059s 17.65m 0.29h 0.01d 0.000 y
#IO & Wait Time: 5687s 94.79m 1.58h 0.07d 0.000 y
#Average job time: 3s 0.05m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 7s 0.12m 0.00h 0.00d
#Submission to last job: 134s 2.23m 0.04h 0.00d
cat output/*.map | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneySlices.map
--------------------------
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax
cat *.map | wc -l
4066
cat *.map | grep NO_TEXT | wc -l
1 (turns out to be a jax annotation caption error - missing leading zero in mtf#, ignoring)
hgsql visiGene
create table jaxMahoneySlicesMap (
jFileName varchar(10) not null, # jaxMahoney .jpg fileName
mFileName varchar(20) not null # mahoney .jpg fileName
);
load data local infile 'jaxMahoneySlices.map' into table jaxMahoneySlicesMap;
update jaxMahoneySlicesMap set mFileName = concat(mFileName,".jpg") where mFileName <> "NO_TEXT";
create index jFileName on jaxMahoneySlicesMap(jFileName);
create index mFileName on jaxMahoneySlicesMap(mFileName);
analyze table jaxMahoneySlicesMap;
# WARNING: change submissionSet ids!!!
insert into imageFileFwd
select mi.id, ji.id from imageFile mi, imageFile ji, jaxMahoneySlicesMap map
where ji.fileName=map.jFileName and mi.fileName=map.mFileName
and mi.submissionSet=2 and ji.submissionSet=1820;
# Records: 3896
# It's looking good.
# clean up
drop table mahoneyWholeMountMd5;
drop table jaxMahoneyWholeMountMd5;
drop table jaxMahoneySlicesMap;
#############################################################
#
# Patch contributors so we can search submission sets
# by specifying Mahoney in the search box.
#
select id from submissionSource where name like 'Mahoney%';
+----+
| id |
+----+
| 1 |
+----+
# note: we have to double the search word or else the search doesn't work
insert into contributor set name = 'Mahoney mahoney';
mysql> select * from contributor where name in ('Mahoney mahoney');
+------+-----------------+
| id | name |
+------+-----------------+
| 3987 | Mahoney mahoney |
+------+-----------------+
insert into submissionContributor select id, '3987' from submissionSet where submissionSource = 1;
############################
### JACKSON UPDATE (re-done to fix expression data 2006-06-05 galt) #############
# The expression data was not correctly matching subpanels,
# and the bodyPart was incorrectly displaying just "floor plate"
# instead of the full part-tree-lineage available in field printName.
#
# We found the solution in vgLoadJax was to NOT use the GXD_Expression
# table at all - apparently it is not necessary as the data is in other tables.
# This meant that we are using GXD_Strength values instead of the old 1/0 for level.
# We made the vgLoadJax code treat these correctly, and tweaked hgVisiGene too.
# And then we also decided to add the expression pattern while we were at it
# since JAX db had it - so added it to vgLoadJax and hgVisiGene.
# previously updated: jackson20060328 db on kkr3u00
# (see above, and see hg/visiGene/jackson/makeJackson.doc)
# save imageFileFwd data in new form for easy restore:
create table iffKeepThis as
select a.fileName "fromFN", b.fileName "toFN" from imageFileFwd iff, imageFile a, imageFile b
where iff.fromIf = a.id and iff.toIf = b.id;
create index fromFN on iffKeepThis(fromFn(10));
create index toFN on iffKeepThis(toFn(10));
# Asked Heather to clone visiGene db to visiGeneBadExpr db,
# and then ran this query to remove the old previous JAX info:
# MULTI-TABLE DELETE:
# CRITICAL: make sure that analyze table has been run on all tables involved,
# otherwise this will run forever. Don't assume that the cardinality is defined.
# Running analyze table is super quick.
analyze table submissionSource;
analyze table submissionSet;
analyze table submissionContributor;
analyze table image;
analyze table imageFile;
analyze table imageProbe;
analyze table expressionLevel;
delete from submissionSource where name = 'MGI';
# 1 rows
delete submissionSet from submissionSet ss left join submissionSource so on ss.submissionSource=so.id where so.id is null;
# 1113 rows
delete submissionContributor from submissionContributor sc left join submissionSet ss on sc.submissionSet=ss.id where ss.id is null;
# 7926 rows
delete image from image i left join submissionSet ss on i.submissionSet=ss.id where ss.id is null;
# 33816 rows
delete imageFile from imageFile imf left join submissionSet ss on imf.submissionSet=ss.id where ss.id is null;
# 13854 rows
delete imageProbe from imageProbe ip left join image i on ip.image=i.id where i.id is null;
# 35395 rows
delete expressionLevel from expressionLevel el left join imageProbe ip on el.imageProbe=ip.id where ip.id is null;
# 102293 rows
delete from imageFileFwd;
# 5724 rows
delete antibodySource from antibodySource abs left join submissionSource so on abs.submissionSource=so.id where so.id is null;
# 745 rows
#recompile vgLoadJax on dev
#run vgLoadJax to create .ra .tab .txt for each submissionSet
ssh kkr3u00
cd ~/kent/src/hg/visiGene/vgLoadJax
#remove any old data dir
rm -fr visiGene/
# visiGene in line below is just an output dir for the .ra/.tab/.txt files
vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene
#refCount=2971
#ran loadAll to load the updated jax .ra .tab .txt into visiGene db
ssh hgwdev
cd ~/kent/src/hg/visiGene/vgLoadJax
loadAll
# deal with parallel Mahoney-in-Jax data
select * from submissionSet where name='jax92242' \G
*************************** 1. row ***************************
id: 2848
name: jax92242
publication: Mouse Brain Organization Revealed Through Direct Genome-Scale TF Expression Analysis
select id,name from submissionSet where name like 'mahoney%';
+----+-----------------+
| id | name |
+----+-----------------+
| 2 | mahoneySlices01 |
| 1 | mahoneyWhole01 |
+----+-----------------+
# restore the imageFileFwd data from iffKeepThis, matching on file names to pick up the new ids:
insert into imageFileFwd
select a.id, b.id from iffKeepThis iff, imageFile a, imageFile b
where iff.fromFN = a.fileName and iff.toFN = b.fileName
and a.submissionSet in (1,2) and b.submissionSet in (2848);
drop table iffKeepThis;
# Since we had lost the privateUser settings,
# I just replicated it from visiGeneBadExpr backup with a simple query.
update visiGene.submissionSet n, visiGeneBadExpr.submissionSet o set n.privateUser=-1
where o.privateUser=-1 and o.name = n.name;
# currently this is just jax submissionSets for which we have not received permissions to use,
# and the mahoney-in-jax that is suppressed.
# ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db
cd ~/kent/src/hg/visiGene/vgGetText
make alpha
# recompiled hgVisiGene earlier to support new expression level scale, and pattern
# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
knownToVisiGene mm6
knownToVisiGene mm7
knownToVisiGene mm8
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
# Patch contributors so we can search MGI submission sets
# by specifying JAX or MGI in the search box.
#
select id from submissionSource where name = 'MGI';
+----+
| id |
+----+
| 7 |
+----+
# note: we have to double the search word or else the search doesn't work
# skip adding these two which are already there:
# insert into contributor set name = 'JAX JAX';
# insert into contributor set name = 'MGI MGI';
mysql> select * from contributor where name in ('JAX JAX','MGI MGI');
+------+---------+
| id | name |
+------+---------+
| 3981 | JAX JAX |
| 3982 | MGI MGI |
+------+---------+
insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 7;
insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 7;
#######################################################
#
# Received a major update from Susan Sunkin at ABA
# consisting of 6000 new images (we had 12000 already)
#
#
# Allen Brain Atlas jp2 image prep (Galt 2006-12-12)
# Create parasol directory and a list of the jp2 files (kept in jpg.lst).
ssh pk
cd /san/sanvol1/visiGene/offline/allenBrain
rm -fr prepImageRun
mkdir prepImageRun
find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' | grep May_06 > prepImageRun/jpg.lst
cd prepImageRun
# Create parasol batch
cat << '_EOF_' > gsub
#LOOP
vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 jpg.lst single gsub spec
para make spec -maxNode=50
[pk:prepImageRun> /parasol/bin/para time
6317 jobs in batch
266106 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 6317 of 6317 jobs
CPU time in finished jobs: 267986s 4466.44m 74.44h 3.10d 0.008 y
IO & Wait Time: 368981s 6149.68m 102.49h 4.27d 0.012 y
Average job time: 101s 1.68m 0.03h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 1471s 24.52m 0.41h 0.02d
Submission to last job: 43292s 721.53m 12.03h 0.50d
# -maxNode=50 was needed because it opens many output files at the same time - do not overwhelm NFS
# -----------------------
# Allen Brain Atlas update (Galt 2007-02-08)
# see mm6.txt for prep running allenCleanup and allenCollectSeq
# LOAD ALLEN BRAIN DATA
# note mm6,mm7,mm8 all have the same thing since it is for mouse generally
# note make sure the contributors list in vgLoadAllen.c is correct
vgLoadAllen \
/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
/san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab \
/cluster/data/mm6/bed/allenBrain/allProbes.fa \
/cluster/data/mm6/bed/allenBrain/allProbes.tab \
output
#Got 17913 images
#Got 17913 named probes
#Got 17913 probe sequences
# Did not do this:
# (instead, I asked Heather to clone entire visiGene db to visiGeneOld)
#backed-up data in case of trouble:
#mkdir /san/sanvol1/visiGene/dump/visiGene.20061220
#hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20061220
#restore fileLocation to point to dev
update fileLocation set name =
concat('http://hgwdev.cse.ucsc.edu',substring(name,INSTR(name,'/visiGene/')));
# 14 rows
# clean out the old ABA records before we do a full load
delete from submissionSource where name = 'Allen Brain Atlas (ABA)';
# 1 row
delete submissionSet from submissionSet ss left join submissionSource so on
ss.submissionSource=so.id where so.id is null;
# 1 row
delete submissionContributor from submissionContributor sc left join
submissionSet ss on sc.submissionSet=ss.id where ss.id is null;
# 13 rows
delete image from image i left join submissionSet ss on i.submissionSet=ss.id
where ss.id is null;
# 11736 rows
delete imageFile from imageFile imf left join submissionSet ss on
imf.submissionSet=ss.id where ss.id is null;
# 11736 rows
delete imageProbe from imageProbe ip left join image i on ip.image=i.id where
i.id is null;
# 11737 rows
delete expressionLevel from expressionLevel el left join imageProbe ip on
el.imageProbe=ip.id where ip.id is null;
# 0 rows
delete antibodySource from antibodySource abs left join submissionSource so on
abs.submissionSource=so.id where so.id is null;
# 0 rows
#load into visiGene db
visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null
# RE-MAKE FULL TEXT INDEX
cd hg/visiGene/vgGetText
make alpha
# basically does this, and puts it in cgi-bin/visiGeneData/:
#vgGetText visiGene.text mm8 hg18
#ixIxx visiGene.text visiGene.ix visiGene.ixx
############################
# REBUILD PROBETRACK (DONE galt 2007-02-15)
# WITH vgProbeTrack PROGRAM - AFTER DOING Allen Brain Atlas update 2007-02-08
# (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes)
cd /san/sanvol1/visiGene/dump
# (this backup shown is really an example template for the next person who needs to do this)
mkdir visiGene.20070215
cd visiGene.20070215
hgsqldump visiGene -T .
mkdir mm6; hgsqldump mm6 vgProbes -T mm6
mkdir mm7; hgsqldump mm7 vgProbes -T mm7
mkdir mm8; hgsqldump mm8 vgProbes -T mm8
mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
#(do any others needed that might not be listed here)
#(document the reason for making the backup)
echo 'vgLoadAllen has been run on ABA update 2007-02-08, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README
# OK, NOW USE vgProbeTrack TO UPDATE
cd ~/kent/src/hg/visiGene/vgProbeTrack
# Make sure vgProbeTrack program is up to date
make
# -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
# so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
# I happen to know that only JAX was updated since last time, and that is mouse only
# populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
vgProbeTrack POP
# new probe records found = 7335, # new vgPrb records added = 7314
# most of these are old, but we updated ABA by dropping completely and re-adding
# so these probes find their way back via sequence identity of probes in vgPrb.sequence
# find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
# must specify a specific assembly to use, so mm8 is ready to use now
# this finds any stuff for the mouse taxon
vgProbeTrack SEQ working mm8
rc = 17 = count of primers for genome search for taxon 10090
rc = 141 = count of primers for mrna search for taxon 10090
bac list read done.
found seq for 0 bacEndPairs
rc = 93 = count of refSeq mrna for mm8
rc = 1 = count of genRef mrna for mm8
rc = 4 = count of genbank mrna for mm8
rc = 19 = count of flatRef mrna for mm8
rc = 0 = count of flatAll mrna for mm8
rc = 0 = count of linkRef mrna for mm8
rc = 0 = count of linkAll mrna for mm8
rc = 1 = count of kgAlRef mrna for mm8
rc = 4 = count of kgAlAll mrna for mm8
# create alignments using either refSeqAli or all_mrna or bacEnds or blat. Took 1.5 hours.
# alignments are individually tracked per assembly here
# alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
# it only looks for things that have not already attempted alignment
# the status goes into visiGene.vgPrbAli with .db="mm8"
vgProbeTrack ALI working mm8
# this finds any seq required for mm8.vgProbes track not already in mm8.seq
# adds the new .fa file in /cluster/data/mm8/bed/visiGene/
# adds a symlink to it in /gbdb/mm8/visiGene/
# and runs hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_??????.fa to add it to mm8.seq
vgProbeTrack EXT working mm8
# mm7.vgProbes was already complete from previous probe track creation,
# it just needed to catch the new Allen Brain probes and align them. About 1.5 hours.
vgProbeTrack ALI working mm7
vgProbeTrack EXT working mm7
# mm6.vgProbes was already complete from previous probe track creation,
# it just needed to catch the new Allen Brain probes and align them. About 1.5 hours.
vgProbeTrack ALI working mm6
vgProbeTrack EXT working mm6
# hg18.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse
# this internally uses pslMap against the mm8 to hg18 liftover chain.gz
# Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
# and maintains the list of processed alignments in visiGene.vgPrbAliAll.
vgProbeTrack PSLMAP working hg18 mm8
# updates hg18.seq/extFile similarly to the EXT command, but for All probes.
# just like with EXT, EXTALL puts .fa in /cluster/data/hg18/visiGene
# and symlink in /gbdb/hg18/visiGene and updates using hgLoadSeq.
# if a sequence has already been loaded it will not be loaded again.
vgProbeTrack EXTALL working hg18
# hg17.vgAllProbes existed before
vgProbeTrack PSLMAP working hg17 mm7
vgProbeTrack EXTALL working hg17
# RE-MAKE knownToVisiGene tables (see respective makedocs for these)
knownToVisiGene mm6
knownToVisiGene mm7
knownToVisiGene mm8
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
# update text/index for visiGene
cd hg/visiGene/vgGetText
make alpha
#vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 mm8 hg17 hg18
#probe has 26611 rows
#gene has 20413 rows
#imageProbe has 125765 rows
################### (galt 2007-04-20 done)
# FIXED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6:
# Ran /san/sanvol1/visiGene/offline/level56RunJax/ cluster job on a list of all files needed.
# Somehow 13000 pix were missing from the list when we made zoom out levels 5 and 6
# originally
cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/
find . -type d > dlist
vi dlist
#remove anything starting with "foo" or "ztest" or "goo" plus "."
#That should leave just valid directories.
cat dlist | sed -e 's/\.\///' > dlist2
[hgwdev:jax> cat level6missing.csh
#!/bin/tcsh
# Reads names from stdin, one per line, and echoes each name for which no
# file matching <name>/*_6_000.jpg exists.
# nonomatch: leave an unmatched glob pattern as literal text instead of
# aborting with "No match", so the -e test below simply fails.
set nonomatch
while (1)
# $< substitutes one line read from stdin
set i=$<
# an empty line stops the loop
# (presumably this is also what $< yields at end of input -- confirm)
if ("$i" == "") then
break
endif
# empty then-branch on purpose: only names lacking the file are printed
if ( -e $i/*_6_000.jpg) then
else
echo "$i"
endif
end
cat dlist2 | level6missing.csh > dlist3
cd /san/sanvol1/visiGene/offline
mkdir level56RunJax
cd level56RunJax
cp ../level56Run/level56.csh .
cp ../level56Run/gsub .
cat gsub
[hgwdev:level56RunJax> cat gsub
#LOOP
./level56.csh $(path1)
#ENDLOOP
# Turn each name listed in dlist3 into <jax dir>/<name>.jpg, one per line.
# The awk program must stay on a single line: a newline right after "print"
# ends the statement, so only the bare input line would be printed.
cat /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/dlist3 | gawk '{print "/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/" $1 ".jpg"}' > jpg.lst
pk
cd /san/sanvol1/visiGene/offline/level56RunJax
gensub2 jpg.lst single gsub spec
para create spec
para try
para push
para time
#Completed: 13235 of 13235 jobs
#CPU time in finished jobs: 3819s 63.65m 1.06h 0.04d 0.000 y
#IO & Wait Time: 45674s 761.23m 12.69h 0.53d 0.001 y
#Average job time: 4s 0.06m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 21s 0.35m 0.01h 0.00d
#Submission to last job: 560s 9.33m 0.16h 0.01d
# Followup to show that it worked:
#[hgwdev:jax> cat dlist2 | level6missing.csh > dlist3X
#[hgwdev:jax> ll dlist*
#-rw-rw-r-- 1 galt protein 117205 Apr 20 13:13 dlist
#-rw-rw-r-- 1 galt protein 85191 Apr 20 13:15 dlist2
#-rw-rw-r-- 1 galt protein 71495 Apr 20 13:33 dlist3
#-rw-rw-r-- 1 galt protein 0 Apr 20 14:25 dlist3X
#
#This shows that all completed (because dlist3X is empty)
#-----------------------------------------------
# Rsync request
#please rsync from /san to hgnfs1:
rsync hgwdev:/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/
hgnfs1:/export/gbdb2/full/inSitu/Mouse/jax/
#
################### (galt 2007-05-09 done)
# Adding support to vgProbeTrack and knownToVisiGene for
# the BLATZ'd frog probes to mm8 which Jim did recently.
# knownToVisiGene no longer uses -fromProbePsl option,
# instead it automatically detects vgProbes and vgAllProbes
# and uses them in that order if no symbolic matches were found.
# Added a SELFMAP command to vgProbeTrack to migrate any missing
# self alignments in vgProbes to vgAllProbes
# Made a backup of visiGene.vg* first:
ssh hgwdev
cd /san/sanvol1/visiGene/dump
mkdir visiGene.20070509
cd visiGene.20070509
hgsqldump visiGene -T .
cd ~/kent/src/hg/visiGene/vgProbeTrack
vgProbeTrack -sqlPath=.. REMAP working mm8 nibb nibbImageProbes /gbdb/mm8/nibbImageProbes.fa
#FYI: Table mm8.vgAllProbes does not exist
#hgPepPred visiGene generic vgRemapTemp /gbdb/mm8/nibbImageProbes.fa
#Processing /gbdb/mm8/nibbImageProbes.fa
#Count of Psls found for reMap: 1379
#cat vgPrbReMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 >vgAllProbesNew.psl
#hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes
#Processing vgAllProbesNew.psl
#rm vgPrbReMap.psl vgAllProbes.psl vgAllProbesNew.psl
vgProbeTrack SELFMAP working mm8
#Count of nonBac Psls found for pslMap: 24615
#Count of bac Psls found for pslMap: 0
#cat bac.psl nonBac.psl > vgPrbSelfMap.psl
#cat vgPrbSelfMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 >
#vgAllProbesNew.psl
#hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes
#Processing vgAllProbesNew.psl
#rm vgPrbSelfMap.psl vgAllProbes.psl vgAllProbesNew.psl
vgProbeTrack EXTALL working mm8
#rc = 981 = count of sequences for vgPrbExt.fa, to use with mm8 trackvgAllProbes
#cp vgPrbExt.fa /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa
#ln -s /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa
#hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa
#981 sequences
#Updating seq table
knownToVisiGene mm8
####################################################
################### (galt 2008-04-04 done)
# Slight name change for NIBB (affected visiGene)
# removed the word "Japanese " from NIBB name in visiGene.submissionSource
# removed same thing from vgLoadNibb.c source code.
# requested push of table hgwdev.visiGene.submissionSource.
################### (galt 2008-08-18 done)
# make downloads for visiGene
ssh hgwdev
cvs co browser # if you haven't already done it
change browser module, downloads.html to add links to visiGene download
cvs commit browser/downloads.html
# updating the visiGene downloads
cd /usr/local/apache/htdocs/goldenPath
mkdir visiGene
cd visiGene
mkdir database
cd database
vi README
---------
This directory contains the downloadable tables in the UCSC visiGene
database. This database is shared by the program VisiGene
http://genome.ucsc.edu/cgi-bin/hgVisiGene
and tracks that incorporate visiGene data, such as the Known Genes tracks.
To see descriptions of the tables in visiGene, visit the Table Browser:
http://genome.ucsc.edu/cgi-bin/hgTables
select "All Tables" as the group, select visiGene as the database,
and select a table. Then click the "describe table schema" button.
---------
hgsqldump visiGene -T .
rm vgPrbAli.*
rm vgPrbAliAll.*
sed -i -e 's/hgwdev[.]cse/genome/' fileLocation.txt
gzip *.txt
Do a push-request:
------------
please rsync (with appropriate flags)
hgwdev:/usr/local/apache/htdocs/goldenPath/visiGene/
to
hgdownload:/usr/local/apache2/htdocs/goldenPath/visiGene/
Reason:
Now users will have an easier time of downloading visiGene database.
------------
also, first time only, do
update browser sandbox with links on downloads.html,
then do a push-request:
Please push downloads.html from dev to hgdownload:
hgwdev:/usr/local/apache/htdocs/downloads.html
to
hgdownload:/usr/local/apache2/htdocs/downloads.html
Reason:
added the page links for visiGene database download.
################### (galt 2008-09-08 done)
# move visiGene data to hive
ssh hgwdev
mv /san/sanvol1/visiGene /hive/data/inside/visiGene
ln -s /hive/data/inside/visiGene /gbdb/visiGene
# note /usr/local/apache/htdocs/visiGene is still a symlink to /gbdb/visiGene