src/hg/makeDb/doc/visiGene.txt 1.10

1.10 2009/10/14 15:29:11 kent
Creating knownToHprd column.
Index: src/hg/makeDb/doc/visiGene.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/visiGene.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/makeDb/doc/visiGene.txt	9 Sep 2008 21:59:09 -0000	1.9
+++ src/hg/makeDb/doc/visiGene.txt	14 Oct 2009 15:29:11 -0000	1.10
@@ -1,1450 +1,1451 @@
 # CREATE EMPTY DATABASE AND TABLES.
 hgsql -e "create database visiGeneNew" mysql
 hgsql visiGeneNew < ~/kent/src/hg/visiGene/visiGene.sql
 makeTableDescriptions visiGeneNew ~/kent/src/hg/visiGene/visiGene.as
 
 # LOAD PAUL GRAY/MAHONEY LAB DATA.
 # Transferred images from Paul Gray's Mac to mine and converted
 # his spreadsheet to a tab-separated file, cloning.tab.
 cd ~/kent/src/hg/visiGene/vgLoadMahoney
 vgLoadMahoney /gbdb/visiGene mm5 cloning.tab clonePcr.bed outDir
 cd outDir
 visiGeneLoad whole.ra whole.tab /dev/null -database=visiGeneNew
 visiGeneLoad slices.ra slices.tab /dev/null -database=visiGeneNew
 
 # LOAD JACKSON LABS DATA.
 # First ask Galt to create a local copy of the Jackson labs
 # database.  I'm not sure how he did it.
 cd ~/kent/src/hg/visiGene/vgLoadJax
 vgLoadJax /gbdb/visiGene jackson visiGene
 ./loadNew
 
 # Update the privateUser fields where we don't have permissions by entering
 # this at the mysql prompt.
 update submissionSet,journal set submissionSet.privateUser=-1 
    where (journal.name like 'Nat %' or journal.name = 'Nature')  
    and submissionSet.journal = journal.id and submissionSet.name like 'jax%'
 
 # LOAD NIBB IMAGES
 # Do this after creating the nibbImageProbe.fa file as described
 # in makeXenTro1.doc, and after creating the nibbImageProbes table
 # in hg17 as described in makeHg17.doc.  The image files are
 # loaded in /cluster/store11/visiGene/offline/nibbFrog.
 ssh kolossus
 cd /cluster/store11/visiGene/offline
 nibbParseImageDir nibbFrog nibFrog.tab bad.tab
 nibbPrepImages nibbFrog nibFrog.tab \
 	/cluster/store11/visiGene/gbdb/200/inSitu/XenopusLaevis/nibb \
 	/cluster/store11/visiGene/gbdb/full/inSitu/XenopusLaevis/nibb
 # Note the nibbPrepImages step is a 2 day process, next time may
 # want to run it on the kki cluster.  It does need to be run on a 64
 # bit machine because of bugs in the 32 bit image magick convert program.
 
 ssh hgwdev
 cd ~/kent/src/hg/visiGene/vgLoadNibb
 hgMapToGene hg17 nibbImageProbes knownGene knownToNibbImage
 
 # Now go into the gene sorter on hg17, configure it to just show
 # the name, genbank, and NIBB Xenopus columns.  Filter on * in the
 # NIBB Xenopus column (which will get rid of rows with no data in that
 # column).  Save the text output to names.raw.  Then get rid of names
 # that are no more than genbank accessions as so:
 awk '$1 != $2 {printf("%s\t%s\n", $1, $3);}' names.raw > names.txt
 
 # Now create the .tab and .ra files as so:
 vgLoadNibb /cluster/store11/visiGene/offline/nibbFrog \
 	/cluster/store11/visiGene/offline/nibbFrog.tab \
 	/cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \
 	names.tab stage.tab outDir
 visiGeneLoad outDir/nibb.ra outDir/nibb.tab /dev/null -database=visiGeneNew
 
 
 # LOAD GENSAT IMAGES
 # This was done with the assistance of Mike Dicuccio at NCBI, 
 # dicuccio@ncbi.nlm.nih.gov.  If updating, it's probably best to
 # get in touch with him and make sure that the ftp site is up to
 # date.   
 
 # Download data from NCBI into /cluster/store11/visiGene/offline/gensat
 cd /cluster/store11/visiGene/offline
 mkdir gensat
 cd gensat
 mkdir RawData
 cd RawData
 wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/GENSAT-20051120.xml.gz
 wget --timeStamping ftp://ftp.ncbi.nih.gov/pub/gensat/RawData/NCBI_Gensat-20051120.dtd
 
 # At this point if the dtd has changed you may need to remake 
 # kent/src/hg/visiGene/gensat/lib/gs.c with autoXml.  Once
 # this is done then do the download with gensatImageDownload.
 # It'll take about 3 days. The results will be in the Institutions dir.
 cd /cluster/store11/visiGene/offline/gensat
 zcat RawData/GENSAT-20051120.xml.gz | gensatImageDownload . download.log
 
 # Create parasol directory and a list of the jpg files, one per line,
 # with paths relative to the Institutions directory.
 ssh kki
 cd /cluster/store11/visiGene/offline/gensat
 mkdir prepImageRun
 # Strip the leading "Institutions/" from each path; vgPrepImage is given the
 # Institutions directory separately in the batch template.  (The sed s command
 # needs all three delimiters: pattern, empty replacement, and terminator.)
 find Institutions -name '*.jpg' -print | sed 's/Institutions\///' > prepImageRun/jpg.lst
 cd prepImageRun
 
 # Create parasol batch.
 # gsub is the gensub2 template: the command between #LOOP and #ENDLOOP is
 # emitted once per line of jpg.lst, with $(path1) replaced by that line.
 # The vgPrepImage command must stay on ONE line in the template; it is given
 # the source directory, the 200/ and full/ gbdb output directories, and the
 # image path relative to the source directory.
 cat << '_EOF_' > gsub
 #LOOP
 vgPrepImage /cluster/store11/visiGene/offline/gensat/Institutions /cluster/store11/visiGene/gbdb/200/inSitu/Mouse/gensat /cluster/store11/visiGene/gbdb/full/inSitu/Mouse/gensat $(path1)
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 gensub2 jpg.lst single gsub spec
 para make spec
 
 # Note the above procedure would take about 3 days.  I ended up copying the
 # data over to /san/sanvol1, and doing it on the pita cluster.  The job
 # there just took two hours, with just 100 cpus available.  It took
 # an hour to copy the data over, and eight hours to copy it back though,
 # and some tweaking.
 
 # MAKE FULL TEXT INDEX
 cd /cluster/store11/visiGene/gbdb
 vgGetText visiGene.text mm7 hg17
 ixIxx visiGene.text visiGene.ix visiGene.ixx
 
 
 # (Galt 2006-02)
 # RSYNC'd from /cluster/store11/visiGene to /san/sanvol1/visiGene
 # and moved the /gbdb/visiGene symlink to point to the new location.
 # I also had to manually run a script to find symlinks pointing from full/ over to 
 # /cluster/store11/offline and remake them to point correctly to /san/sanvol1/visiGene/offline.
 
 # Allen Brain Atlas jp2 image prep (Galt 2006-02-12)
 # Create parasol directory and a list of the jp2 files.
 ssh pk
 cd /san/sanvol1/visiGene/offline/allenBrain
 mkdir prepImageRun
 find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' > prepImageRun/jpg.lst
 cd prepImageRun
 # Create parasol batch.
 # gsub is the gensub2 template: the vgPrepImage command between #LOOP and
 # #ENDLOOP is emitted once per line of jpg.lst (which here lists .jp2 files),
 # with $(path1) replaced by that line.
 cat << '_EOF_' > gsub
 #LOOP
 vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1)
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 gensub2 jpg.lst single gsub spec
 para make spec -maxNode=50
 
 [pk:/san/sanvol1/visiGene/offline/allenBrain/prepImageRun> /parasol/bin/para time
 11748 jobs in batch
 4291 jobs (including everybody's) in Parasol queue.
 Checking finished jobs
 Completed: 11748 of 11748 jobs
 CPU time in finished jobs:     474919s    7915.32m   131.92h    5.50d  0.015 y
 IO & Wait Time:               5029116s   83818.60m  1396.98h   58.21d  0.159 y
 Average job time:                 469s       7.81m     0.13h    0.01d
 Longest running job:                0s       0.00m     0.00h    0.00d
 Longest finished job:           41811s     696.85m    11.61h    0.48d
 Submission to last job:        172301s    2871.68m    47.86h    1.99d
 
 
 # -maxNode=50 was needed. 
 # Note that because it opens up to 40 output files at the same time, it overwhelms NFS
 # when a lot of nodes are running, and it can bring down the SAN.  Because I was nearly
 # done when it came back up, I just re-pushed with -maxNode=50 to keep it under control.
 # However in the future, something like this should be done to keep the file access local
 # as much as possible.
 # Here is the proposed new way:
 # -----------------------
 # gensub2 template for the proposed batch: one vgPrep.csh job per list entry,
 # passing the entry's path, root name and file name as $1, $2 and $3.
 cat << '_EOF_' > gsub
 #LOOP
 ./vgPrep.csh $(path1) $(root1) $(file1)
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 
 cat << '_EOF_' > vgPrep.csh
 #!/bin/tcsh
 # Proposed per-image wrapper: run vgPrepImage with its output directories on
 # /scratch/tmp, then copy the results into the gbdb 200/ and full/ trees.
 # Arguments as passed by the gsub template:
 #   $1 = $(path1), $2 = $(root1), $3 = $(file1)
 mkdir -p /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain/$1
 mkdir -p /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain/$1
 # NOTE(review): vgPrepImage below is still given the SAN imageDisk directory as
 # its source, so this local copy does not appear to be read -- confirm.
 cp /san/sanvol1/visiGene/offline/allenBrain/imageDisk/$1 /scratch/tmp/$3
 vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /scratch/tmp/vg200$2 /scratch/tmp/vgfull$2 $1
 # Save the exit status now; the cleanup commands below would overwrite $status.
 set err = $status
 # Publish results only when vgPrepImage succeeded.
 if (! $err ) then
     cp -r /scratch/tmp/vg200$2/* /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain
     cp -r /scratch/tmp/vgfull$2/* /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain
 endif
 # Always remove the scratch files, whether or not the job succeeded.
 rm -f  /scratch/tmp/$3
 rm -fr /scratch/tmp/vg200$2
 rm -fr /scratch/tmp/vgfull$2
 # Exit non-zero when vgPrepImage failed.
 if ( $err ) then
     exit 1
 endif
 '_EOF_'
 # << this line makes emacs coloring happy
 
 # -----------------------
 
 # ADDED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6:
 # Ran /san/sanvol1/offline/level56Run/ cluster job on a list of all files dumped
 #  from the visiGene.imageFile table so that we made new zoom out levels 5 and 6
 #  for all pictures.  Since it was a special one-time deal, I just used ImageMagick.
 # vgPrepImage.c has been modified to do the 2 new zoomout levels so that they
 #  will be built automatically in future.
 
 # Ran several checks to make sure no files were missing, fixed any errors.
 # Found embedded space in some nibb filenames, found a couple of gensat 
 # images which had previously failed to download and redownloaded them ok.
 # Found a few missing things and 0 bytes jpgs and re-ran them.  
 # It should be pretty clean right now.
 
 # LOAD ALLEN BRAIN DATA
 vgLoadAllen \
  /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
  /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab \
  /cluster/data/mm7/bed/allenBrain/allProbes.fa \
  /cluster/data/mm7/bed/allenBrain/allProbes.tab \
  output
 #backed-up data in case of trouble:
 mkdir /san/sanvol1/visiGene/dump/visiGene.20060220
 hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20060220
 #load into visiGene db
 visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null
 
 # Manually added several researchers' names to the contributor and submissionContributor tables
 # at the request of Susan Sunkin as well as updating the text for contributor, copyright, acknowledgements.
 # I manually also updated aba.ra and vgLoadAllen.c to reflect her changes.  The manual mods
 # to contributor which work great in the visiGene search are not currently automatically
 # supported, and would thus be lost if we ever nuke it and start fresh.
 # At some point, we will probably add an additional field to the .ra structure
 # and have visiGeneLoad support it.
 
 # RE-MAKE FULL TEXT INDEX
 cd hg/visiGene/vgGetText
 make alpha
 # basically does this, but puts it in cgi-bin/visiGeneData/:
 #vgGetText visiGene.text mm7 hg17
 #ixIxx visiGene.text visiGene.ix visiGene.ixx
 # (hgVisiGene cgi v128 now knows about this new location)
 
 ############################
 
-# REBUILD WITH NEW vgProbeTrack PROGRAM - AFTER ADDING ALLEN BRAIN (DONE galt 2006-03-15)
+# REBUILD WITH NEW vgProbeTrack PROGRAM - AFTER ADDING ALLEN BRAIN (DONE galt 2009-10-12)
 
 # (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes)
-cd /san/sanvol1/visiGene/dump
+cd /hive/data/inside/visiGene/dump
 # (this backup shown is really an example template for the next person who needs to do this)
-mkdir visiGene20060315
-cd visiGene20060315
+mkdir visiGene.20060315
+cd visiGene.20060315
 hgsqldump visiGene -T .
 mkdir mm6; hgsqldump mm6 vgProbes -T mm6
 mkdir mm7; hgsqldump mm7 vgProbes -T mm7
 mkdir mm8; hgsqldump mm8 vgProbes -T mm8
+mkdir mm9; hgsqldump mm9 vgProbes -T mm9
 mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
 mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
 #(do any others needed that might not be listed here)
 #(document the reason for making the backup)
 echo 'vgLoadAllenBrain has been run, so making backup of visiGene db and probe tracks before updating, ' > README
 
 # OK, NOW USE vgProbeTrack TO UPDATE
 
 cd ~/kent/src/hg/visiGene/vgProbeTrack
 
 # -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
 # so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
 # I happen to know that only AllenBrain was updated since last time, and that is mouse only
 
 # populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
 vgProbeTrack POP
 
 # find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
 #  must specify a specific assembly to use, so just using mm7 since mm8 still in qa.
 #  this finds any stuff for the mouse taxon
 vgProbeTrack SEQ working mm7  
 
 # create alignments using either refSeqAli or all_mrna or bacEnds or blat.  Took 1.5 hours.  
 # alignments are individually tracked per assembly here
 # alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
 # it only looks for things that have not already attempted alignment
 # the status goes into visiGene.vgPrbAli with .db="mm7"
 # because mm7.vgProbes is a new table, to create it we include the -sqlPath so
 # it can find the vgProbes.sql script
 vgProbeTrack ALI working mm7 -sqlPath=..
 
 # this finds any seq required for mm7.vgProbes track not already in mm7.seq 
 # adds the new .fa file in /cluster/data/mm7/bed/visiGene/
 # adds a symlink to it in /gbdb/mm7/visiGene/
 # and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq
 vgProbeTrack EXT working mm7
 
 # mm6.vgProbes was already complete from previous probe track creation, 
 #  it just needed to catch the new Allen Brain probes and align them.  About 1.5 hours.
 vgProbeTrack ALI working mm6
 vgProbeTrack EXT working mm6
 
 # hg17.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse
 # this internally uses pslMap against the mm7 to hg17 liftover chain.gz
 # Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
 # and maintains the list of processed alignments in visiGene.vgPrbAliAll.
 vgProbeTrack PSLMAP working hg17 mm7  
 # updates hg17.seq/extFile similarly to the EXT command, but for All probes.
 # just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene
 # and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq.
 # if a sequence has already been loaded it will not be loaded again.
 vgProbeTrack EXTALL working hg17
 
 # hg18.vgAllProbes never existed before
 vgProbeTrack PSLMAP working hg18 mm7  -sqlPath=..
 # because the nibb blatz probe track hg18.nibbImageProbes was never done on hg18 
 # until just now (see makeHg18.doc), we have to add it for the first time.
 # "nibb" is not really a db here, so I manually put in a taxon mapping for it, 
 # so it appears as Xenopus laevis 8355, see the source code.
 vgProbeTrack REMAP working hg18 nibb nibbImageProbes /gbdb/hg18/nibbImageProbes.fa  
 vgProbeTrack EXTALL working hg18
 
 # mm8 is in qa and so it is basically ready to use now.  About 1.5 hours.
 vgProbeTrack ALI working mm8  -sqlPath=..
 vgProbeTrack EXT working mm8
 
 
 # RE-MAKE knownToVisiGene tables (see respective makedocs for these)
 #knownToVisiGene mm6
 #knownToVisiGene mm7
 #knownToVisiGene mm8
 #knownToVisiGene hg17 -fromProbePsl=vgAllProbes
 #knownToVisiGene hg18 -fromProbePsl=vgAllProbes
 
 ############################
 
 ###  JACKSON UPDATE (done 2006-04-01 galt)  #############
 
 # updated jackson20060328 db on kkr3u00 (see hg/visiGene/jackson/makeJackson.doc)
 
 # Dropped old visiGeneOld db, asked Heather to clone visiGene db to visiGeneOld db,
 # and then ran this query to remove the old previous JAX info:
 # MULTI-TABLE DELETE:
 delete submissionSource, submissionSet, submissionContributor, image,
 imageProbe, expressionLevel, imageFile from
 submissionSource so,
 submissionSet ss,
 submissionContributor sc,
 image i,
 imageProbe ip,
 expressionLevel el,
 imageFile f
 where so.id = 2
 and ss.submissionSource = so.id
 and sc.submissionSet = ss.id
 and i.submissionSet = ss.id
 and ip.image = i.id
 and el.imageProbe = ip.id
 and f.submissionSet = ss.id;
 
 #delete query (get rid of all submissionSource.id=2)
 #Query OK, 164717 rows affected (48 min 16.07 sec)
 
 # Workaround for uniProt access from kkr3u00
 ssh hgwdev
 setenv jdb jackson20060328
 cd ~/kent/src/hg/visiGene/vgLoadJax
 hgsqldump uniProt taxon commonName -T .
 ssh kkr3u00
 setenv jdb jackson20060328
 cd ~/kent/src/hg/visiGene/vgLoadJax
 hgsql mysql -e "create database uniProt"
 hgsql uniProt < taxon.sql
 hgsql uniProt < commonName.sql
 # hgsql uniProt -e 'show tables'
 hgsql uniProt -e "load data local infile 'taxon.txt' into table taxon"
 hgsql uniProt -e "load data local infile 'commonName.txt' into table commonName"
 # hgsql uniProt -e 'show table status\G'
 # cleanup
 rm taxon.*
 rm commonName.*
 
 #update vgLoadJax.c to update the date given in .ra acknowledgements
 #recompile vgLoadJax on dev
 #run vgLoadJax to create .ra .tab .txt for each submissionSet
 ssh kkr3u00
 setenv jdb jackson20060328
 cd ~/kent/src/hg/visiGene/vgLoadJax
 #remove any old data dir
 rm -fr visiGene/
 # visiGene in line below is just an output dir for the .ra/.tab/.txt files
 ~/bin/i386/vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene
 #ref 32185: missing title from BIB_Refs, ref skipped
 #Calculating age from postnatal
 #ref 67768: missing title from BIB_Refs, ref skipped
 #Calculating age from postnatal month 3
 #Calculating age from postnatal
 #Calculating age from postnatal
 #Calculating age from postnatal month 4
 #Calculating age from postnatal month 4
 #Calculating age from Not Specified 12.5
 #refCount=2970
 
 
 #ran loadAll to load the updated jax .ra .tab .txt into visiGene db
 ssh hgwdev
 cd ~/kent/src/hg/visiGene/vgLoadJax
 loadAll
 #loadAll.output has 1112 lines like
 #visiGene/100423.ra
 
 # ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db
 cd ~/kent/src/hg/visiGene/vgGetText
 make alpha
 # output:
 #vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17
 #probe has 19276 rows
 #gene has 15173 rows
 #imageProbe has 115500 rows
 
 # recompiled hgVisiGene 
 
 
 ############################
 
 # REBUILD PROBETRACK   (DONE galt 2006-04-04)
 #    WITH vgProbeTrack PROGRAM - AFTER DOING JAX UPDATE 20060328 
 
 # (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes)
 cd /san/sanvol1/visiGene/dump
 # (this backup shown is really an example template for the next person who needs to do this)
 mkdir visiGene.20060404
 cd visiGene.20060404
 hgsqldump visiGene -T .
 mkdir mm6; hgsqldump mm6 vgProbes -T mm6
 mkdir mm7; hgsqldump mm7 vgProbes -T mm7
 mkdir mm8; hgsqldump mm8 vgProbes -T mm8
 mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
 mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
 #(do any others needed that might not be listed here)
 #(document the reason for making the backup)
 echo 'vgLoadJax jackson20060328 has been run, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README
 
 # OK, NOW USE vgProbeTrack TO UPDATE
 
 cd ~/kent/src/hg/visiGene/vgProbeTrack
 
 # Make sure vgProbeTrack program is up to date
 make
 
 # -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
 # so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
 # I happen to know that only JAX was updated since last time, and that is mouse only
 
 # populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
 vgProbeTrack POP
 #new probe records found = 1285, # new vgPrb records added = 1285
 #   most of these are old, but we updated JAX by dropping completely and re-adding
 #   so these probes find their way back via sequence identity of probes in vgPrb.sequence
 
 # find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
 #  must specify a specific assembly to use, so mm7 is ready to use now, mm8 still in qa.
 #  this finds any stuff for the mouse taxon
 vgProbeTrack SEQ working mm7
 
 #rc = 0 = count of primers for mrna search for taxon 10090
 #rc = 0 = count of primers for genome search for taxon 10090
 #bac list read done.
 #found seq for 0 bacEndPairs
 #rc = 549 = count of refSeq mrna for mm7
 #rc = 18 = count of genRef mrna for mm7
 #rc = 33 = count of genbank mrna for mm7
 #rc = 428 = count of flatRef mrna for mm7
 #rc = 0 = count of flatAll mrna for mm7
 #rc = 1 = count of linkRef mrna for mm7
 #rc = 0 = count of linkAll mrna for mm7
 #rc = 1 = count of kgAlRef mrna for mm7
 #rc = 37 = count of kgAlAll mrna for mm7
 
 
 # create alignments using either refSeqAli or all_mrna or bacEnds or blat.  Took 1.5 hours.  
 # alignments are individually tracked per assembly here
 # alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
 # it only looks for things that have not already attempted alignment
 # the status goes into visiGene.vgPrbAli with .db="mm7"
 vgProbeTrack ALI working mm7
 
 # this finds any seq required for mm7.vgProbes track not already in mm7.seq 
 # adds the new .fa file in /cluster/data/mm7/bed/visiGene/
 # adds a symlink to it in /gbdb/mm7/visiGene/
 # and runs hgLoadSeq mm7 /gbdb/mm7/visiGene/vgPrbExt_??????.fa to add it to mm7.seq
 vgProbeTrack EXT working mm7
 
 # mm6.vgProbes was already complete from previous probe track creation, 
 #  it just needed to catch the new JAX probes and align them.  About 1.5 hours.
 vgProbeTrack ALI working mm6
 vgProbeTrack EXT working mm6
 
 # hg17.vgAllProbes was pre-existing with all probes, just need to add the new JAX mouse probes
 # this internally uses pslMap against the mm7 to hg17 liftover chain.gz
 # Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
 # and maintains the list of processed alignments in visiGene.vgPrbAliAll.
 vgProbeTrack PSLMAP working hg17 mm7  
 # updates hg17.seq/extFile similarly to the EXT command, but for All probes.
 # just like with EXT, EXTALL puts .fa in /cluster/data/hg17/visiGene
 # and symlink in /gbdb/hg17/visiGene and updates using hgLoadSeq.
 # if a sequence has already been loaded it will not be loaded again.
 vgProbeTrack EXTALL working hg17
 
 # hg18.vgAllProbes existed before
 vgProbeTrack PSLMAP working hg18 mm7 
 vgProbeTrack EXTALL working hg18
 
 # mm8 is in qa and so it is basically ready to use now.
 vgProbeTrack ALI working mm8
 vgProbeTrack EXT working mm8
 
 
 # RE-MAKE knownToVisiGene tables (see respective makedocs for these)
 knownToVisiGene mm6
 knownToVisiGene mm7
 knownToVisiGene mm8
 knownToVisiGene hg17 -fromProbePsl=vgAllProbes
 knownToVisiGene hg18 -fromProbePsl=vgAllProbes
 
 # update text/index for visiGene
 cd hg/visiGene/vgGetText
 make alpha
 #vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 hg17
 #probe has 19276 rows
 #gene has 15173 rows
 #imageProbe has 115500 rows
 
 
 ############################
 #
 # Patch contributors so we can search MGI submission sets 
 #  by specifying JAX or MGI in the search box.
 #
 
 select id from submissionSource where name = 'MGI';
 +----+
 | id |
 +----+
 | 6  |
 +----+
 
 # note: we have to double the search word or else the search doesn't work
 insert into contributor set name = 'JAX JAX';
 insert into contributor set name = 'MGI MGI';
 mysql> select * from contributor where name in ('JAX JAX','MGI MGI');
 +------+---------+
 | id   | name    |
 +------+---------+
 | 3981 | JAX JAX |
 | 3982 | MGI MGI |
 +------+---------+
 
 insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 6;
 insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 6;
 
 ##### ADD SUPPORT FOR ABURL (DONE 2006-04-19 galt)
 # I manually updated this, currently needed only by JAX,
 # adding antibodySource table that maps abSubmitId to antibody and submissionSource
 # and adding field abUrl to submissionSource table.
 # The code for vgLoadJax and visiGeneLoad were also updated to support this new 
 # link from antibody probe to submissionSource website for further details.
 # Since this will be automatically maintained in future, no point in belaboring the makefile.
 # This also involved an update to hgVisiGene including passing submissionSource id on
 # the url to the primers page so that the external link can be made when it is an antibody.
 
 ##### REPLICATED submissionSet.privateUser SETTINGS TO NEW VISIGENE DB (DONE 2006-04-24 galt)
 # This was an oversight caused by full removal of all old jax submissionSets 
 # when we did the jax 2006-03-28 update.  Since we had lost the privateUser settings,
 # I just replicated it from visiGeneOld with a simple query.
 update visiGene.submissionSet n, visiGeneOld.submissionSet o set n.privateUser=-1
 where o.privateUser=-1 and o.name = n.name;
 # currently this is just jax submissionSets for which we have not received permissions to use.
 
 
 ##### ADDED IMAGEFILE-FORWARDING TO COMBINE MAHONEY AND JAX-MAHONEY ANNOTATIONS (DONE 2006-04-26 galt)
 # The idea here is that JAX has some useful annotations, but including them made a lot of 
 # unnecessary duplication in the system.  Although it wasn't easy, we have come up with
 # a method to map the imageFiles from Mahoney to the ones in JAX.  We have made imageFileFwd table
 # to store that mapping information, and added code to hgVisiGene to use it.  Wholemount steps
 # are manual, while slices steps use hg/visiGene/vgLoadJax/forwardSlices.c I wrote to map them Mah->JAX.
 # Additional complications are that JAX combined several slices together into one image
 # following a certain pattern.  Luckily for the wholemounts, the original images were not modified by JAX.
 # This means that we can get a perfect match Mah->JAX for the wholemounts using md5sum (produced unique values).
 # Because both we and JAX imported the Mahoney data/spreadsheet into auto-incremented primary keyfield
 # tables, the original order is preserved and allows a surprisingly good mah->jax many-to-one slices mapping.
 # I also extended vgLoadJax to be able to find the primers in the PRB_Notes which was useful both 
 # for mapping the slices, and because we end up using JAX annotations for the fullCaption() page,
 # so that we don't lose primer info. For the remaining fraction where Mahoney never supplied primers
 # (actually they have since updated the info, but neither JAX nor we have gotten that yet),
 # something over 20%, we have managed to instead just map on gene. This worked surprisingly well,
 # and made either correct or very close matches.
 
 # Since previously, vgLoadJax looked for the mahoney set in jax and excluded it, we need to 
 # make and import it into visiGene.  I have removed the skipping of mahoney set from the 
 # vgLoadJax code (so that next time we update jax, the mahoney set will not be excluded)
 # and added a commandline option to do just a single submission set.  I happen to know that
 # the mahoney set in jax is jax92242.  THIS STEP WON'T BE NEEDED IN FUTURE.
 
 # the latest jax sybase db conversion is on kkr3u00 because it had space and little use.
 ssh kkr3u00
 cd ~/kent/src/hg/visiGene/vgLoadJax
 # clean out any old subdirectory
 rm -fr visiJaxMahoney
 # process just the jaxMahoney submissionSet
 ${HOME}/bin/i386/vgLoadJax -oneSubmissionSet=92242 /san/sanvol1/visiGene/gbdb jackson20060328 visiJaxMahoney
 
 # load it into visiGene db
 ssh hgwdev
 visiGeneLoad visiJaxMahoney/92242.ra  visiJaxMahoney/92242.tab  visiJaxMahoney/92242.txt
 
 # we are going to treat the jax version of Mahoney as "privateUser" 
 # in order to suppress it and reduce the duplication of Mahoney images
 hgsql visiGene -e 'update submissionSet set privateUser=-1 where name like "jax92242"'
 
 submissionSets:  (for reference)
 name       id
 -----------------------------------
 mahoneyWhole    = 1
 mahoneySlices01 = 2
 jax92242        = 1820
 
 			  
 ssh hgwdev
 cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/wholeMount
 md5sum *.jpg | sed -e 's/  /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/mahoneyWholeMount.md5
 #(quick - 1 or 2 minutes only)
 
 #Find the jax-Mahoney images that are WholeMount
 ssh hgwdev
 cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax
 
 # WARNING: change database name and submissionSet id constants in text below if needed!!!
 #  jaxMahoney = 1820, bodyPart.id = 1 for name="whole"
 hgsql visiGene -BN -e 'select distinct imageFile.fileName from imageFile, image, specimen, bodyPart \
 where imageFile.submissionSet=1820 and bodyPart=1 \
 and image.imageFile=imageFile.id and image.specimen=specimen.id' \
  | xargs md5sum | sed -e 's/  /\t/' | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneyWholeMount.md5
 
 cd ~/kent/src/hg/visiGene/vgLoadJax
 
 # verify that they are unique by md5sum:
 wc -l *.md5
    1833 jaxMahoneyWholeMount.md5
    1843 mahoneyWholeMount.md5
 sort -k 1,1 -u jaxMahoneyWholeMount.md5 | wc -l
    1833
 sort -k 1,1 -u mahoneyWholeMount.md5 | wc -l
    1843
 
 hgsql visiGene
 
 create table mahoneyWholeMountMd5 (
     md5 char(32) not null,         # md5 sum of .jpg
     fileName varchar(10) not null, # .jpg fileName
     INDEX(md5),
     INDEX(fileName)
 );
 load data local infile 'mahoneyWholeMount.md5' into table mahoneyWholeMountMd5;
 analyze table mahoneyWholeMountMd5;
 
 create table jaxMahoneyWholeMountMd5 (
     md5 char(32) not null,         # md5 sum of .jpg
     fileName varchar(10) not null, # .jpg fileName
     INDEX(md5),
     INDEX(fileName)
 );
 load data local infile 'jaxMahoneyWholeMount.md5' into table jaxMahoneyWholeMountMd5;
 analyze table jaxMahoneyWholeMountMd5;
 
 # verify that they match uniquely and completely:
 select count(*) from mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j where m.md5 = j.md5;
 +----------+
 | count(*) |
 +----------+
 |     1833 |
 +----------+
 
 # make forwarding table (NO NEED TO DO IN FUTURE, IS IN visiGene.as,.sql)
 CREATE TABLE imageFileFwd (
     fromIf int not null,      # From imageFile
     toIf   int not null,      #   To imageFile
         #Indices
     INDEX(fromIf),
     INDEX(toIf)
 );
 
 # WARNING: change submissionSet ids!!!
 # find how the mahoney matches to the jaxMahoney
 #    (I verified that all filenames are unique in all 3 submissionSets: jaxM, mWhole, mSlices)
 insert into imageFileFwd
 select mi.id, ji.id from imageFile mi, imageFile ji, mahoneyWholeMountMd5 m, jaxMahoneyWholeMountMd5 j
 where m.md5 = j.md5 and mi.fileName=m.fileName and ji.fileName=j.fileName
  and mi.submissionSet=1 and ji.submissionSet=1820;
 # Records: 1828
 
 # the wholemounts are now done, so let's do the slices next!
 
 
 
 # Cluster Run to do OCR on jaxMahoneySlices (Galt 2006-04-28)
 #  if program ocrad is not in /cluster/bin/i386, download and compile it (very easy)
 #  ocrad is a gnu program
 
 # Create parasol directory and a list of the jpg files.
 ssh hgwdev
 cd /san/sanvol1/visiGene/offline/jax
 mkdir ocrJaxMahoneyRun
 cd ocrJaxMahoneyRun
 mkdir output
 
 # make list of jaxMahoneySlice .jpgs
 # WARNING: change database and submissionSet ids!!!
 hgsql visiGene -BN -e 'select distinct imageFile.fileName from \
 imageFile, image, specimen \
 where imageFile.submissionSet=1820 and bodyPart<>1 \
 and image.imageFile=imageFile.id and image.specimen=specimen.id' \
  > jaxMahoneySlices.list
 
 # Create parasol batch.
 # gsub is the gensub2 template: one ocrSlices.csh job per list entry, passing
 # the entry's file name and root name as $1 and $2.
 cat << '_EOF_' > gsub
 #LOOP
 ./ocrSlices.csh $(file1) $(root1)
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 
 cat << '_EOF_' > ocrSlices.csh
 #!/bin/tcsh -ef
 # OCR one jaxMahoney slice image and record the accession-like labels
 # (T followed by 8 digits) found in its text.
 #   $1 = image file name, $2 = its root name (see the gsub template)
 # Writes output/$2.map with one "<file>\t<label>" line per label found,
 # or a single "<file>\tNO_TEXT" line when none was found.
 # Remove results of any earlier run; the .ocr file is appended to below.
 if ( -e output/$2.map ) then
     rm output/$2.map
 endif
 if ( -e output/$2.ocr ) then
     rm output/$2.ocr
 endif
 # Convert the jpg to a .pgm file, which is what ocrad is run on.
 convert /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/$1 output/$2.pgm
 # There wasn't a single threshold value that worked, so do entire series
 /cluster/bin/i386/ocrad --threshold=.4 --charset=ascii output/$2.pgm >> output/$2.ocr
 /cluster/bin/i386/ocrad --threshold=.5 --charset=ascii output/$2.pgm >> output/$2.ocr
 /cluster/bin/i386/ocrad --charset=ascii output/$2.pgm >> output/$2.ocr
 /cluster/bin/i386/ocrad --threshold=.6 --charset=ascii output/$2.pgm >> output/$2.ocr
 /cluster/bin/i386/ocrad --threshold=.7 --charset=ascii output/$2.pgm >> output/$2.ocr
 /cluster/bin/i386/ocrad --threshold=.8 --charset=ascii output/$2.pgm >> output/$2.ocr
 /cluster/bin/i386/ocrad --threshold=.9 --charset=ascii output/$2.pgm >> output/$2.ocr
 
 # Numeric copy of the root name for the range tests below
 # (assumes the root name is a number, e.g. 7996 for 7996.jpg).
 @ x = $2
 # Both branches translate look-alike characters to digits (l,i,I -> 1 and
 # O,o -> 0), delete periods, and keep each distinct matching label once.
 # special handling for 7996.jpg thru 8060.jpg, the Accession does not end in "aa" for these.
 if ( ($x >= 7996) && ($x <= 8060) ) then
     cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8})/gs )' | sort -u > output/$2.temp
 else
     cat output/$2.ocr | tr lOoiI 10011 | tr -d . | perl -0ne 'print "$1\n" while ( /(T\d{8}aa)/gs )' | sort -u > output/$2.temp
 endif
 
 set tempTs = ( `cat output/$2.temp` )
 if ( $#tempTs > 0 ) then
     foreach t ( $tempTs )
         # In the special range no suffix was matched, so append "aa" or "00"
         # when a path with that suffix exists under mahoney/slices.
         if ( ($x >= 7996) && ($x <= 8060) ) then  # special handling for these
             if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}aa) then
                 set t = "${t}aa"
             endif
             if (-e /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/mahoney/slices/${t}00) then
                 set t = "${t}00"
             endif
         endif
         echo "$1\t$t" >> output/$2.map
     end
 else
     echo "$1\tNO_TEXT" > output/$2.map
 endif
 rm output/$2.temp
 
 '_EOF_'
 # << this line makes emacs coloring happy
 
 chmod a+x ocrSlices.csh
 
 ssh pk
 cd /san/sanvol1/visiGene/offline/jax/ocrJaxMahoneyRun
 
 gensub2 jaxMahoneySlices.list single gsub spec
 para create spec
 para try
 para push
 para check
 para time
 #2095 jobs in batch
 #292661 jobs (including everybody's) in Parasol queue.
 #Checking finished jobs
 #Completed: 2095 of 2095 jobs
 #CPU time in finished jobs:       1059s      17.65m     0.29h    0.01d  0.000 y
 #IO & Wait Time:                  5687s      94.79m     1.58h    0.07d  0.000 y
 #Average job time:                   3s       0.05m     0.00h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:               7s       0.12m     0.00h    0.00d
 #Submission to last job:           134s       2.23m     0.04h    0.00d
 
 cat output/*.map | sort > ~/kent/src/hg/visiGene/vgLoadJax/jaxMahoneySlices.map
 
 --------------------------
 
 ssh hgwdev
 cd ~/kent/src/hg/visiGene/vgLoadJax
 
 cat *.map | wc -l
    4066
 cat *.map | grep NO_TEXT | wc -l
     1  (turns out to be a jax annotation caption error - missing leading zero in mtf#, ignoring)
 
 hgsql visiGene
 
 create table jaxMahoneySlicesMap (
     jFileName varchar(10) not null, # jaxMahoney .jpg fileName
     mFileName varchar(20) not null  #    mahoney .jpg fileName
 );
 load data local infile 'jaxMahoneySlices.map' into table jaxMahoneySlicesMap;
 update jaxMahoneySlicesMap set mFileName = concat(mFileName,".jpg") where mFileName <> "NO_TEXT";
 create index jFileName on jaxMahoneySlicesMap(jFileName);
 create index mFileName on jaxMahoneySlicesMap(mFileName);
 analyze table jaxMahoneySlicesMap;
 
 
 # WARNING: change submissionSet ids!!!
 insert into imageFileFwd
 select mi.id, ji.id from imageFile mi, imageFile ji, jaxMahoneySlicesMap map
 where ji.fileName=map.jFileName and mi.fileName=map.mFileName
  and mi.submissionSet=2 and ji.submissionSet=1820;
 # Records: 3896
 
 # It's looking good.
 
 # clean up
 drop table mahoneyWholeMountMd5;
 drop table jaxMahoneyWholeMountMd5;
 drop table jaxMahoneySlicesMap;
 
 #############################################################
 #
 # Patch contributors so we can search submission sets
 #  by specifying Mahoney in the search box.
 #
 
 select id from submissionSource where name like 'Mahoney%';
 +----+
 | id |
 +----+
 | 1  |
 +----+
 
 # note: we have to double the search word or else the search doesn't work
 insert into contributor set name = 'Mahoney mahoney';
 mysql> select * from contributor where name in ('Mahoney mahoney');
 +------+-----------------+
 | id   | name            |
 +------+-----------------+
 | 3987 | Mahoney mahoney |
 +------+-----------------+
 
 insert into submissionContributor select id, '3987' from submissionSet where submissionSource = 1;
 
 ############################
 
 ###  JACKSON UPDATE (re-done to fix expression data 2006-06-05 galt)  #############
 
 # The expression data was not correctly matching subpanels,
 # and the bodyPart was incorrectly displaying just "floor plate" 
 # instead of the full part-tree-lineage available in field printName.
 #
 # We found the solution in vgLoadJax was to NOT use the GXD_Expression
 # table at all - apparently it is not necessary as the data is in other tables.
 # This meant that we are using GXD_Strength values instead of the old 1/0 for level.
 # We made the vgLoadJax code treat these correctly, and tweaked hgVisiGene too.
 # And then we also decided to add the expression pattern while we were at it
 # since JAX db had it - so added it to vgLoadJax and hgVisiGene.
 
 # previously updated: jackson20060328 db on kkr3u00 
 # (see above, and see hg/visiGene/jackson/makeJackson.doc)
 
 # save imageFileFwd data in new form for easy restore:
 create table iffKeepThis as 
 select a.fileName "fromFN", b.fileName "toFN" from imageFileFwd iff, imageFile a, imageFile b 
 where iff.fromIf = a.id and iff.toIf = b.id;
 
 create index fromFN on iffKeepThis(fromFn(10));
 create index toFN on iffKeepThis(toFn(10));
 
 
 # Asked Heather to clone visiGene db to visiGeneBadExpr db,
 # and then ran this query to remove the old previous JAX info:
 # MULTI-TABLE DELETE:
 
 # CRITICAL: make sure that analyze table has been run on all tables involved,
 # otherwise this will run forever.  Don't assume that the cardinality is defined.
 # Running analyze table is super quick.
 
 analyze table submissionSource;
 analyze table submissionSet;
 analyze table submissionContributor;
 analyze table image;
 analyze table imageFile;
 analyze table imageProbe;
 analyze table expressionLevel;
 
 delete from submissionSource where name = 'MGI';
 # 1 rows
 
 delete submissionSet from submissionSet ss left join submissionSource so on ss.submissionSource=so.id where so.id is null;
 # 1113 rows
 
 delete submissionContributor from submissionContributor sc left join submissionSet ss on sc.submissionSet=ss.id where ss.id is null;
 # 7926 rows
 
 delete image from image i left join submissionSet ss on i.submissionSet=ss.id where ss.id is null;
 # 33816 rows 
 
 delete imageFile from imageFile imf left join submissionSet ss on imf.submissionSet=ss.id where ss.id is null;
 # 13854 rows
 
 delete imageProbe from imageProbe ip left join image i on ip.image=i.id where i.id is null;
 # 35395 rows
 
 delete expressionLevel from expressionLevel el left join imageProbe ip on el.imageProbe=ip.id where ip.id is null;
 # 102293 rows
 
 delete from imageFileFwd;
 # 5724 rows 
 
 delete antibodySource from antibodySource abs left join submissionSource so on abs.submissionSource=so.id where so.id is null;
 # 745 rows
 
 
 
 
 #recompile vgLoadJax on dev
 #run vgLoadJax to create .ra .tab .txt for each submissionSet
 ssh kkr3u00
 cd ~/kent/src/hg/visiGene/vgLoadJax
 #remove any old data dir
 rm -fr visiGene/
 # visiGene in line below is just an output dir for the .ra/.tab/.txt files
 vgLoadJax /san/sanvol1/visiGene/gbdb jackson20060328 visiGene
 #refCount=2971
 
 #ran loadAll to load the updated jax .ra .tab .txt into visiGene db
 ssh hgwdev
 cd ~/kent/src/hg/visiGene/vgLoadJax
 loadAll
 
 
 # deal with parallel Mahoney-in-Jax data
 
 select * from submissionSet where name='jax92242' \G
 *************************** 1. row ***************************
               id: 2848
             name: jax92242
      publication: Mouse Brain Organization Revealed Through Direct Genome-Scale TF Expression Analysis
 
 select id,name from submissionSet where name like 'mahoney%';
 +----+-----------------+
 | id | name            |
 +----+-----------------+
 |  2 | mahoneySlices01 |
 |  1 | mahoneyWhole01  |
 +----+-----------------+
 
 # restore the imageFileFwd data saved in iffKeepThis, using the new jax submissionSet id:
 insert into imageFileFwd 
 select a.id, b.id from iffKeepThis iff, imageFile a, imageFile b 
 where iff.fromFN = a.fileName and iff.toFN = b.fileName
 and a.submissionSet in (1,2) and b.submissionSet in (2848);
 
 drop table iffKeepThis;
 
 # Since we had lost the privateUser settings,
 # I just replicated it from visiGeneBadExpr backup with a simple query.
 update visiGene.submissionSet n, visiGeneBadExpr.submissionSet o set n.privateUser=-1
 where o.privateUser=-1 and o.name = n.name;
 # currently this is just jax submissionSets for which we have not received permissions to use,
 # and the mahoney-in-jax that is suppressed.
 
 
 # ran vgGetText to update cgi-bin-galt/visiGeneData/ using visiGene db
 cd ~/kent/src/hg/visiGene/vgGetText
 make alpha
 
 # recompiled hgVisiGene earlier to support new expression level scale, and pattern
 
 
 # RE-MAKE knownToVisiGene tables (see respective makedocs for these)
 knownToVisiGene mm6
 knownToVisiGene mm7
 knownToVisiGene mm8
 knownToVisiGene hg17 -fromProbePsl=vgAllProbes
 knownToVisiGene hg18 -fromProbePsl=vgAllProbes
 
 # Patch contributors so we can search MGI submission sets 
 #  by specifying JAX or MGI in the search box.
 #
 
 select id from submissionSource where name = 'MGI';
 +----+
 | id |
 +----+
 | 7  |
 +----+
 
 # note: we have to double the search word or else the search doesn't work
 # skip adding these two which are already there:
 #   insert into contributor set name = 'JAX JAX';
 #   insert into contributor set name = 'MGI MGI';
 mysql> select * from contributor where name in ('JAX JAX','MGI MGI');
 +------+---------+
 | id   | name    |
 +------+---------+
 | 3981 | JAX JAX |
 | 3982 | MGI MGI |
 +------+---------+
 
 insert into submissionContributor select id, '3981' from submissionSet where submissionSource = 7;
 insert into submissionContributor select id, '3982' from submissionSet where submissionSource = 7;
 
 #######################################################
 #
 #  Received a major update from Susan Sunkin at ABA
 #  consisting of 6000 new images (we had 12000 already)
 #
 #
 
 # Allen Brain Atlas jp2 image prep (Galt 2006-12-12)
 # Create parasol directory and a list of the jp2 files (stored in jpg.lst).
 ssh pk
 cd /san/sanvol1/visiGene/offline/allenBrain
 rm -fr prepImageRun
 mkdir prepImageRun
 find imageDisk -name '*.jp2' -print | sed 's/imageDisk\///' | grep May_06 > prepImageRun/jpg.lst
 
 cd prepImageRun
 # Create parasol batch
 cat << '_EOF_' > gsub
 #LOOP
 vgPrepImage /san/sanvol1/visiGene/offline/allenBrain/imageDisk /san/sanvol1/visiGene/gbdb/200/inSitu/Mouse/allenBrain /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain $(path1)
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 gensub2 jpg.lst single gsub spec
 para make spec -maxNode=50
 
 [pk:prepImageRun> /parasol/bin/para time
 6317 jobs in batch
 266106 jobs (including everybody's) in Parasol queue.
 Checking finished jobs
 Completed: 6317 of 6317 jobs
 CPU time in finished jobs:     267986s    4466.44m    74.44h    3.10d  0.008 y
 IO & Wait Time:                368981s    6149.68m   102.49h    4.27d  0.012 y
 Average job time:                 101s       1.68m     0.03h    0.00d
 Longest running job:                0s       0.00m     0.00h    0.00d
 Longest finished job:            1471s      24.52m     0.41h    0.02d
 Submission to last job:         43292s     721.53m    12.03h    0.50d
 
 
 # -maxNode=50 was needed because it opens many output files at the same time - do not overwhelm NFS
 
 # -----------------------
 
 # Allen Brain Atlas update (Galt 2007-02-08)
 
 # see mm6.txt for prep running allenCleanup and allenCollectSeq
 
 # LOAD ALLEN BRAIN DATA
 # note mm6,mm7,mm8 all have the same thing since it is for mouse generally
 # note make sure the contributors list in vgLoadAllen.c is correct
 vgLoadAllen \
  /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/allenBrain \
  /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20061204.tab \
  /cluster/data/mm6/bed/allenBrain/allProbes.fa \
  /cluster/data/mm6/bed/allenBrain/allProbes.tab \
  output
 #Got 17913 images
 #Got 17913 named probes
 #Got 17913 probe sequences
 
 # Did not do this: 
 # (instead, I asked Heather to clone entire visiGene db to visiGeneOld)
 #backed-up data in case of trouble:
 #mkdir /san/sanvol1/visiGene/dump/visiGene.20061220
 #hgsqldump visiGene -T /san/sanvol1/visiGene/dump/visiGene.20061220
 
 #restore fileLocation to point to dev
 update fileLocation set name =
 concat('http://hgwdev.cse.ucsc.edu',substring(name,INSTR(name,'/visiGene/')));
 # 14 rows
 
 # clean out the old ABA records before we do a full load
 delete from submissionSource where name = 'Allen Brain Atlas (ABA)';
 # 1 row
 
 delete submissionSet from submissionSet ss left join submissionSource so on
 ss.submissionSource=so.id where so.id is null;
 # 1 row
 
 delete submissionContributor from submissionContributor sc left join
 submissionSet ss on sc.submissionSet=ss.id where ss.id is null;
 # 13 rows
 
 delete image from image i left join submissionSet ss on i.submissionSet=ss.id
 where ss.id is null;
 # 11736 rows
 
 delete imageFile from imageFile imf left join submissionSet ss on
 imf.submissionSet=ss.id where ss.id is null;
 # 11736 rows
 
 delete imageProbe from imageProbe ip left join image i on ip.image=i.id where
 i.id is null;
 # 11737 rows
 
 delete expressionLevel from expressionLevel el left join imageProbe ip on
 el.imageProbe=ip.id where ip.id is null;
 # 0 rows
 
 delete antibodySource from antibodySource abs left join submissionSource so on
 abs.submissionSource=so.id where so.id is null;
 # 0 rows
 
 
 
 #load into visiGene db
 visiGeneLoad -database=visiGene output/aba.ra output/aba.tab /dev/null
 
 
 # RE-MAKE FULL TEXT INDEX
 cd hg/visiGene/vgGetText
 make alpha
 # basically does this, and puts it in cgi-bin/visiGeneData/:
 #vgGetText visiGene.text mm8 hg18
 #ixIxx visiGene.text visiGene.ix visiGene.ixx
 
 ############################
 
 # REBUILD PROBETRACK   (DONE galt 2007-02-15)
 #    WITH vgProbeTrack PROGRAM - AFTER DOING Allen Brain Atlas update 2007-02-08
 
 # (make a backup of visiGene db and these tables: {mm6,mm7,...}.vgProbes and {hg17,hg18,...}.vgAllProbes)
 cd /san/sanvol1/visiGene/dump
 # (this backup shown is really an example template for the next person who needs to do this)
 mkdir visiGene.20070215
 cd visiGene.20070215
 hgsqldump visiGene -T .
 mkdir mm6; hgsqldump mm6 vgProbes -T mm6
 mkdir mm7; hgsqldump mm7 vgProbes -T mm7
 mkdir mm8; hgsqldump mm8 vgProbes -T mm8
 mkdir hg17; hgsqldump hg17 vgAllProbes -T hg17
 mkdir hg18; hgsqldump hg18 vgAllProbes -T hg18
 #(do any others needed that might not be listed here)
 #(document the reason for making the backup)
 echo 'vgLoadAllen has been run on ABA update 2007-02-08, so making backup of visiGene db and probe tracks before updating probeTracks, ' > README
 
 # OK, NOW USE vgProbeTrack TO UPDATE
 
 cd ~/kent/src/hg/visiGene/vgProbeTrack
 
 # Make sure vgProbeTrack program is up to date
 make
 
 # -sqlPath must be included whenever the vgProbes or vgAllProbes track tables do not yet exist for the db
 # so it can find the .sql script to create vgProbes or vgAllProbes tables as needed.
 # I happen to know that only JAX was updated since last time, and that is mouse only
 
 # populate vgPrb with any new stuff in visiGene.probe (works for all taxons at once).
 vgProbeTrack POP
 # new probe records found = 7335, # new vgPrb records added = 7314
 #   most of these are old, but we updated ABA by dropping completely and re-adding
 #   so these probes find their way back via sequence identity of probes in vgPrb.sequence
 
 # find sequence using various methods - given probe seq, primers, bacs, refseq, etc.
 #  must specify a specific assembly to use, so mm8 is ready to use now
 #  this finds any stuff for the mouse taxon
 vgProbeTrack SEQ working mm8
 
 rc = 17 = count of primers for genome search for taxon 10090
 rc = 141 = count of primers for mrna search for taxon 10090
 bac list read done.
 found seq for 0 bacEndPairs
 rc = 93 = count of refSeq mrna for mm8
 rc = 1 = count of genRef mrna for mm8
 rc = 4 = count of genbank mrna for mm8
 rc = 19 = count of flatRef mrna for mm8
 rc = 0 = count of flatAll mrna for mm8
 rc = 0 = count of linkRef mrna for mm8
 rc = 0 = count of linkAll mrna for mm8
 rc = 1 = count of kgAlRef mrna for mm8
 rc = 4 = count of kgAlAll mrna for mm8
 
 # create alignments using either refSeqAli or all_mrna or bacEnds or blat.  Took 1.5 hours.  
 # alignments are individually tracked per assembly here
 # alignment successes go in $db.vgProbes psl track, and whether succeeded or failed,
 # it only looks for things that have not already attempted alignment
 # the status goes into visiGene.vgPrbAli with .db="mm8"
 vgProbeTrack ALI working mm8
 
 # this finds any seq required for mm8.vgProbes track not already in mm8.seq 
 # adds the new .fa file in /cluster/data/mm8/bed/visiGene/
 # adds a symlink to it in /gbdb/mm8/visiGene/
 # and runs hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_??????.fa to add it to mm8.seq
 vgProbeTrack EXT working mm8
 
 # mm7.vgProbes was already complete from previous probe track creation, 
 #  it just needed to catch the new Allen Brain probes and align them.  About 1.5 hours.
 vgProbeTrack ALI working mm7
 vgProbeTrack EXT working mm7
 
 # mm6.vgProbes was already complete from previous probe track creation, 
 #  it just needed to catch the new Allen Brain probes and align them.  About 1.5 hours.
 vgProbeTrack ALI working mm6
 vgProbeTrack EXT working mm6
 
 # hg18.vgAllProbes was pre-existing with all probes, just need to add new allenBrain mouse
 # this internally uses pslMap against the mm8 to hg18 liftover chain.gz
 # Because it is "Xeno" (from mouse to human), it creates track vgAllProbes,
 # and maintains the list of processed alignments in visiGene.vgPrbAliAll.
 vgProbeTrack PSLMAP working hg18 mm8  
 # updates hg18.seq/extFile similarly to the EXT command, but for All probes.
 # just like with EXT, EXTALL puts .fa in /cluster/data/hg18/visiGene
 # and symlink in /gbdb/hg18/visiGene and updates using hgLoadSeq.
 # if a sequence has already been loaded it will not be loaded again.
 vgProbeTrack EXTALL working hg18
 
 # hg17.vgAllProbes existed before
 vgProbeTrack PSLMAP working hg17 mm7 
 vgProbeTrack EXTALL working hg17
 
 
 # RE-MAKE knownToVisiGene tables (see respective makedocs for these)
 knownToVisiGene mm6
 knownToVisiGene mm7
 knownToVisiGene mm8
 knownToVisiGene hg17 -fromProbePsl=vgAllProbes
 knownToVisiGene hg18 -fromProbePsl=vgAllProbes
 
 # update text/index for visiGene
 cd hg/visiGene/vgGetText
 make alpha
 #vgGetText /usr/local/apache/cgi-bin/visiGeneData/visiGene.text mm7 mm8 hg17 hg18
 #probe has 26611 rows
 #gene has 20413 rows
 #imageProbe has 125765 rows
 
 ################### (galt 2007-04-20 done)
 # FIXED TWO ADDITIONAL ZOOM-OUT LEVELS 5 AND 6:
 # Ran /san/sanvol1/visiGene/offline/level56RunJax/ cluster job on a list of all files needed.
 # Somehow 13000 pix were missing from the list when we made zoom out levels 5 and 6
 # originally
 
 cd /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/
 
 find . -type d > dlist
 
 vi dlist
 
 #remove anything starting with "foo" or "ztest" or "goo" plus "."
 #That should leave just valid directories.
 
 cat dlist | sed -e 's/\.\///' > dlist2
 
 [hgwdev:jax> cat level6missing.csh
 #!/bin/tcsh
 # Reads names from stdin, one per line, and echoes each name that has
 # no <name>/*_6_000.jpg file.  The loop ends on the first empty read
 # (a blank line; presumably also end of input - TODO confirm).
 # nonomatch is set so the glob in the -e test below can fail to match
 # without aborting - presumably it is then passed through literally.
 set nonomatch
 while (1)
     set i=$<
     if ("$i" == "") then
         break
     endif
     # The then-branch is deliberately empty: only names WITHOUT a match are printed.
     # NOTE(review): assumes the glob matches at most one file per directory - confirm.
     if ( -e $i/*_6_000.jpg) then
     else
         echo "$i"
     endif
 end
 
 cat dlist2 | level6missing.csh > dlist3
 
 cd /san/sanvol1/visiGene/offline
 mkdir level56RunJax
 cd level56RunJax
 cp ../level56Run/level56.csh .
 cp ../level56Run/gsub .
 cat gsub
 [hgwdev:level56RunJax> cat gsub
 #LOOP
 ./level56.csh $(path1)
 #ENDLOOP
 
 cat /san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/dlist3 | gawk '{print "/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/" $1 ".jpg"}' > jpg.lst
 
 ssh pk
 cd /san/sanvol1/visiGene/offline/level56RunJax
 gensub2 jpg.lst single gsub spec
 para create spec
 para try
 para push
 para time
 
 #Completed: 13235 of 13235 jobs
 #CPU time in finished jobs:       3819s      63.65m     1.06h    0.04d  0.000 y
 #IO & Wait Time:                 45674s     761.23m    12.69h    0.53d  0.001 y
 #Average job time:                   4s       0.06m     0.00h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:              21s       0.35m     0.01h    0.00d
 #Submission to last job:           560s       9.33m     0.16h    0.01d
 
 # Followup to show that it worked:
 #[hgwdev:jax> cat dlist2 | level6missing.csh > dlist3X
 #[hgwdev:jax> ll dlist*
 #-rw-rw-r--  1 galt protein 117205 Apr 20 13:13 dlist
 #-rw-rw-r--  1 galt protein  85191 Apr 20 13:15 dlist2
 #-rw-rw-r--  1 galt protein  71495 Apr 20 13:33 dlist3
 #-rw-rw-r--  1 galt protein      0 Apr 20 14:25 dlist3X
 #
 #This shows that all completed (because dlist3X is empty)
 
 #-----------------------------------------------
 
 # Rsync request
 #please rsync from /san to hgnfs1:
 
 rsync hgwdev:/san/sanvol1/visiGene/gbdb/full/inSitu/Mouse/jax/
 hgnfs1:/hgnfs1:/export/gbdb2/full/inSitu/Mouse/jax/
 
 #
 ################### (galt 2007-05-09 done)
 # Adding support to vgProbeTrack and knownToVisiGene for
 #  the BLATZ'd frog probes to mm8 which Jim did recently.
 # knownToVisiGene no longer uses -fromProbePsl option,
 #  instead it automatically detects vgProbes and vgAllProbes
 #  and uses them in that order if no symbolic matches were found.
 # Added a SELFMAP command to vgProbeTrack to migrate any missing
 #  self alignments in vgProbes to vgAllProbes 
 # Made a backup of visiGene.vg* first:
 ssh hgwdev
 cd /san/sanvol1/visiGene/dump
 mkdir visiGene.20070509
 cd visiGene.20070509
 hgsqldump visiGene -T .
 
 cd ~/kent/src/hg/visiGene/vgProbeTrack
 
 vgProbeTrack -sqlPath=.. REMAP working mm8 nibb nibbImageProbes /gbdb/mm8/nibbImageProbes.fa
 #FYI: Table mm8.vgAllProbes does not exist
 #hgPepPred visiGene generic vgRemapTemp /gbdb/mm8/nibbImageProbes.fa
 #Processing /gbdb/mm8/nibbImageProbes.fa
 #Count of Psls found for reMap: 1379
 #cat vgPrbReMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 >vgAllProbesNew.psl
 #hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes
 #Processing vgAllProbesNew.psl
 #rm vgPrbReMap.psl vgAllProbes.psl vgAllProbesNew.psl
 
 vgProbeTrack SELFMAP working mm8
 #Count of nonBac Psls found for pslMap: 24615
 #Count of bac Psls found for pslMap: 0
 #cat bac.psl nonBac.psl > vgPrbSelfMap.psl
 #cat vgPrbSelfMap.psl vgAllProbes.psl | sort -u | sort -k 10,10 > vgAllProbesNew.psl
 #hgLoadPsl mm8 vgAllProbesNew.psl -table=vgAllProbes
 #Processing vgAllProbesNew.psl
 #rm vgPrbSelfMap.psl vgAllProbes.psl vgAllProbesNew.psl
 
 vgProbeTrack EXTALL working mm8
 #rc = 981 = count of sequences for vgPrbExt.fa, to use with mm8 trackvgAllProbes
 #cp vgPrbExt.fa /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa
 #ln -s /cluster/data/mm8/bed/visiGene/vgPrbExt_YDGSWH.fa /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa
 #hgLoadSeq mm8 /gbdb/mm8/visiGene/vgPrbExt_YDGSWH.fa
 #981 sequences
 #Updating seq table
 
 knownToVisiGene mm8
 
 ####################################################
 
 ################### (galt 2008-04-04 done)
 # Slight name change for NIBB (affected visiGene)
 # removed the word "Japanese " from NIBB name in visiGene.submissionSource
 # removed same thing from vgLoadNibb.c source code.
 # requested push of table hgwdev.visiGene.submissionSource.
 
 ################### (galt 2008-08-18 done)
 # make downloads for visiGene
 
 ssh hgwdev
 co browser   # if you haven't already done it
 
 change browser module, downloads.html to add links to visiGene download
 
 cvs commit browser/downloads.html
 
 # updating the visiGene downloads
 
 cd /usr/local/apache/htdocs/goldenPath
 mkdir visiGene
 cd visiGene
 mkdir database
 cd database
 
 
 vi README
 ---------
 This directory contains the downloadable tables in the UCSC visiGene
 database. This database is shared by the program VisiGene
  http://genome.ucsc.edu/cgi-bin/hgVisiGene
 and tracks that incorporate visiGene data, such as the Known Genes tracks.
 
 To see descriptions of the tables in visiGene, visit the Table Browser:
   http://genome.ucsc.edu/cgi-bin/hgTables
 select "All Tables" as the group, select visiGene as the database,
 and select a table.  Then click the "describe table schema" button.
 ---------
 
 hgsqldump visiGene -T .
 
 rm vgPrbAli.*
 rm vgPrbAliAll.*
 
 sed -i -e 's/hgwdev[.]cse/genome/' fileLocation.txt
 
 gzip *.txt
 
 Do a push-request:
 ------------
 please rsync (with appropriate flags)
 
 hgwdev:/usr/local/apache/htdocs/goldenPath/visiGene/
 
 to
 
 hgdownload:/usr/local/apache2/htdocs/goldenPath/visiGene/
 
 Reason:
  Now users will have an easier time of downloading visiGene database.
 
 ------------
 
 also, first time only, do 
 
 update browser sandbox with links on downloads.html,
 then do a push-request:
 
 Please push downloads.html from dev to hgdownload:
 
 hgwdev:/usr/local/apache/htdocs/downloads.html
 
 to
 
 hgdownload:/usr/local/apache2/htdocs/downloads.html
 
 Reason:
  added the page links for visiGene database download.
 
 
 ################### (galt 2008-09-08 done)
 # move visiGene data to hive
 ssh hgwdev
 mv /san/SanVol1/visiGene /hive/data/inside/visiGene
 ln -s /hive/data/inside/visiGene /gbdb/visiGene
 # note /usr/local/apache/htdocs/visiGene is still a symlink to /gbdb/visiGene