52e47105152b7e5dd2621b9e8439446e0d77336f
braney
  Mon Oct 23 12:01:11 2023 -0700
build Jaspar 2024 motif table

diff --git src/hg/makeDb/doc/hgFixed.txt src/hg/makeDb/doc/hgFixed.txt
index 54518c8..b226141 100644
--- src/hg/makeDb/doc/hgFixed.txt
+++ src/hg/makeDb/doc/hgFixed.txt
@@ -1,1093 +1,1104 @@
 #This describes how at least some of the tables in
 #hgFixed were created.  This is a database containing
 #primarily expression data.  There are two main formats:
 #  expRecord.as - This describes the mRNA sources for
 #     a series of microarray experiments 
 #  expData.as - This describes the measured value 
 #     in either absolute or relative ratio terms of
 #     each gene/probe/target in a series of microarray
 #     experiments.  Each expData is associated with
 #     an expRecord, though expDatas sometimes share
 #     the same expRecord.
 
 #The Human Affy GNF Expression Atlas 2003 Version:
 # Create the main expRecord table and the expData table for
 # the absolute measurements as so:
 hgGnfMicroarray gnfHumanU95AllExps gnfHumanU95All /projects/compbio/data/microarray/affyGnfHuman/data_public_U95
 # Convert these to ratios using the median of medians of non-cancerous
 # cell types as the denominator as so:
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray gnfHumanU95All gnfHumanU95AllRatio -clump=gnfClump.ra
 # Take the median value over multiple replicants and put in this table:
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed gnfHumanU95AllRatio gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95MedianRatio gnfHumanU95MedianExps -minExps=1
 # Also make a median version of the absolute measurements
 hgMedianMicroarray hgFixed gnfHumanU95All gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95Median gnfHumanU95MedianExps -minExps=1
 
 # The Mouse Affy GNF Expression Atlas:
 # Create the expRecord tables for U74 a/b/c and the expData table for
 # the absolute measurements:
 hgGnfMicroarray gnfMouseU74aAllExps gnfMouseU74aAll /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74
 hgGnfMicroarray gnfMouseU74bAllExps gnfMouseU74bAll /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt
 hgGnfMicroarray gnfMouseU74cAllExps gnfMouseU74cAll /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt
 # Convert these to ratios using the median of medians of 
 # cell types as the denominator as so:
 
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray gnfMouseU74aAll gnfMouseU74aAllRatio -clump=gnfMouseU74aClump.ra
 hgRatioMicroarray gnfMouseU74bAll gnfMouseU74bAllRatio -clump=gnfMouseU74bClump.ra
 hgRatioMicroarray gnfMouseU74cAll gnfMouseU74cAllRatio -clump=gnfMouseU74cClump.ra
 # Take the median value over multiple replicants and put in this table:
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed gnfMouseU74aAllRatio gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedianRatio gnfMouseU74aMedianExps -minExps=1
 hgMedianMicroarray hgFixed gnfMouseU74bAllRatio gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedianRatio gnfMouseU74bMedianExps -minExps=1
 hgMedianMicroarray hgFixed gnfMouseU74cAllRatio gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedianRatio gnfMouseU74cMedianExps -minExps=1
 # Also make a median version of the absolute measurements
 hgMedianMicroarray hgFixed gnfMouseU74aAll gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedian gnfMouseU74aMedianExps -minExps=1
 hgMedianMicroarray hgFixed gnfMouseU74bAll gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedian gnfMouseU74bMedianExps -minExps=1
 hgMedianMicroarray hgFixed gnfMouseU74cAll gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedian gnfMouseU74cMedianExps -minExps=1
 
 
 #The Human GNF Expression Atlas 2 (2004)
 #
 # Create the main expRecord table and the expData table for
 # the absolute measurements as so:
 hgGnfMicroarray gnfHumanAtlas2AllExps gnfHumanAtlas2All /projects/compbio/data/microarray/geneAtlas2/human/U133A+GNF1B_101402.AD.txt -chip=U133A+GNF1B
 # Convert these to ratios using the median of medians of non-cancerous
 # cell types as the denominator as so:
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray gnfHumanAtlas2All gnfHumanAtlas2AllRatio -clump=gnfHumanAtlas2Clumps.ra
 # Take the median value over multiple replicants and put in this table:
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed gnfHumanAtlas2AllRatio gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2MedianRatio gnfHumanAtlas2MedianExps -minExps=1
 # Also make a median version of the absolute measurements
 hgMedianMicroarray hgFixed gnfHumanAtlas2All gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2Median gnfHumanAtlas2MedianExps -minExps=1
 
 #The Mouse GNF Expression Atlas 2 (2004)
 # Create the main expRecord table and the expData table for
 # the absolute measurements as so:
 hgGnfMicroarray gnfMouseAtlas2AllExps gnfMouseAtlas2All /projects/compbio/data/microarray/geneAtlas2/mouse/GNF1M_20030403.AD.txt -chip=GNF1M
 # Convert these to ratios using the median of medians of non-cancerous
 # cell types as the denominator as so:
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra
 # Take the median value over multiple replicants and put in this table:
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1
 # Also make a median version of the absolute measurements
 hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1
 
 #The Rat GNF Expression Atlas 2 (2004)
 # Create the main expRecord table and the expData table for
 # the absolute measurements as so:
 hgGnfMicroarray gnfRatAtlas2AllExps gnfRatAtlas2All /projects/compbio/data/microarray/geneAtlas2/rat/PivotNoApwithTissues.txt -chip=RG-U34A -ref=http://expression.gnf.org/ratlas
 # Convert these to ratios using the median of medians of non-cancerous
 # tissues or cell types (in this case, this is all the tissues) as the 
 # denominator as so:
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray gnfRatAtlas2All gnfRatAtlas2AllRatio -clump=gnfRatAtlas2Clumps.ra
 # Take the median value over multiple replicants and put in this table.
 # Use Clumps.ra file renamed as gnfRatAtlas2.ra as this contains all the 
 # tissues since there are no cancer tissues in this expression data set:
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed gnfRatAtlas2AllRatio gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2MedianRatio gnfRatAtlas2MedianExps -minExps=1
 # Also make a median version of the absolute measurements
 hgMedianMicroarray hgFixed gnfRatAtlas2All gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2Median gnfRatAtlas2MedianExps -minExps=1
 
 # C. elegans life cycle data from the Kim Lab via the Stanford Microarray Database.
 cd ~/kent/src/hg/makeDb/hgStanfordMicroarray
 hgStanfordMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps /projects/compbio/data/microarray/wormLifeCycle/spots -swap '-trimName=(green)' -suppress=green '-trimTissue=(repeat #?)'
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps kimMed.ra kimWormLifeMedianRatio kimWormLifeMedianExps
 
 # D. melanogaster life cycle data from Arbeitman et al 2002 
 # via the Stanford Microarray Database.
 cd ~/kent/src/hg/makeDb/hgStanfordMicroarray
 # absolute:
 hgStanfordMicroarray -geneField="Systematic name" -dataField=CH2I_MEDIAN \
   hgFixed arbFlyLifeAll arbFlyLifeAllExps \
   /projects/compbio/data/microarray/flyLifeCycle/spots
 # ratios:
 hgStanfordMicroarray -geneField="Systematic name" \
   hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps \
   /projects/compbio/data/microarray/flyLifeCycle/spots
 cd ../hgMedianMicroarray
 echo "select name,id from arbFlyLifeAllExps" | hgsql -N hgFixed  \
   | sort > arbMed.ra
 # edit arbMed.ra to collapse the N=1, N=2 lines.
 # median absolute:
 hgMedianMicroarray hgFixed arbFlyLifeAll arbFlyLifeAllExps arbMed.ra \
   arbFlyLifeMedian arbFlyLifeMedianExps
 # median ratios:
 hgMedianMicroarray hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps arbMed.ra \
   arbFlyLifeMedianRatio arbFlyLifeMedianExps
 # cvs add and check in arbMed.ra
 
 ###########################
 # REGENERATING WORM LIFE-CYCLE TABLES. (DONE 5/12/2006 ANDY)
 hgsql hgFixed -e "rename table kimWormLifeAllRatio to kimWormLifeAllRatio_old"  
 hgsql hgFixed -e "rename table kimWormLifeMedianExps to kimWormLifeMedianExps_old"
 hgsql hgFixed -e "rename table kimWormLifeMedianRatio to kimWormLifeMedianRatio_old"
 
 # The scopDes table, which is used by the SuperFamily column in hgNear.
 mkdir /cluster/store1/scop
 cd /cluster/store1/scop
 wget http://scop.mrc-lmb.cam.ac.uk/scop/parse/dir.des.scop.txt_1.63
 grep -v '^#' dir.des.scop.txt* > scopDes.txt
 hgsql hgFixed < ~/kent/src/hg/lib/scopDes.sql
 echo "load data local infile 'scopDes.txt' into table scopDes;" | hgsql hgFixed
 
 # The Yeast Cell Cycle Time Course from Cho RJ et al 1998
 cd /cluster/data/sacCer1/download/systematic_results/expression_data
 hgGnfMicroarray yeastChoCellCycleExps yeastChoCellCycle  \
 	Cho_et_al_full_data.txt -chip=affyYeast \
 	-chopName=/ \
 	-url=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html \
 	-ref=http://www.pnas.org/cgi/content/abstract/95/7/3752 \
 	-credit=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray yeastChoCellCycle yeastChoCellCycleRatio
 
 # Mouse expression data by sex on Affy MOE430A arrays from
 # John Rinn (john.rinn@yale.edu) et al.
 cd /projects/compbio/data/microarray/rinnEtAl
 hgGnfMicroarray mouseRinnSexExps mouseRinnSex rinnEtAlSpots.txt \
     -chip=MOE430A \
     -url=n/a \
     -ref=n/a \
     -credit=n/a
 cd ~/kent/src/hg/makeDb/hgRatioMicroarray 
 hgRatioMicroarray mouseRinnSex mouseRinnSexRatio
 cd ~/kent/src/hg/makeDb/hgMedianMicroarray 
 hgMedianMicroarray hgFixed mouseRinnSex mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedian mouseRinnSexMedianExps
 hgMedianMicroarray hgFixed mouseRinnSexRatio mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedianRatio mouseRinnSexMedianExps
 
 # D. melanogaster full euchromatic expression profile (FEEP) -- 
 # Stolc et al. 2004.  
 # Loaded up absolute tables directly from files downloaded from 
 # http://genome.med.yale.edu/FEEP/FEEP.html --
 # see /projects/compbio/data/microarray/flyFEEP/README .
 # Extract ratio from absolute:
 hgRatioMicroarray flyFeepAll flyFeepAllRatio
 cd ~/kent/src/hg/makeDb/hgMedianMicroarray
 echo "select description,id from flyFeepAllExps" | hgsql -N hgFixed  \
   | sort > flyFeepMed.ra
 # edit flyFeepMed.ra to collapse lines with the same initial character.
 # median absolute:
 hgMedianMicroarray hgFixed flyFeepAll flyFeepAllExps flyFeepMed.ra \
   flyFeepMedian flyFeepMedianExps
 # median ratios:
 hgMedianMicroarray hgFixed flyFeepAllRatio flyFeepAllExps flyFeepMed.ra \
   flyFeepMedianRatio flyFeepMedianExps
 # cvs add and check in flyFeepMed.ra
 
 # Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22
 mkdir -p /projects/compbio/data/microarray/shyamsundarEtAl
 cd /projects/compbio/data/microarray/shyamsundarEtAl
 wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptsetno_3130.tar.gz
 wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptset_3130.meta
 tar xfz exptsetno_3130.tar.gz
 rm exptsetno_3130.tar.gz
 mkdir spots
 cat << _EOF_ > cleanXls.awk
 { 
   if (/^!/) 
      {
      line = \$0
      gsub(/\"|,/, "", line)
      print line
      }
    else
      print
 }
 _EOF_
 for file in *.xls; do
    awk -f cleanXls.awk $file > spots/$file
 done
 cd ~/kent/src/hg/makeDb/hgMedianMicroarray
 # The hgFixed.history doesn't have the errata column
 echo alter table history add column errata varchar(255) | hgsql hgFixed
 hgStanfordMicroarray -dataField="Normalized Ch2 Intensity (Median)" \
   hgFixed humanNormal humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots
 hgStanfordMicroarray -dataField="Log(base2) of R/G Normalized Ratio (Mean)" \
   hgFixed humanNormalRatio humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots
 echo "select name from humanNormalExps" | hgsql -N hgFixed | awk "{print \"\'\"\$0\"\'\"}" > col1
 echo "select id from humanNormalExps" | hgsql -N hgFixed > col2
 n=`wc -l < col1`
 for i in `seq 1 $n`; do echo "n/a" >> col1.5; done
 paste col1 col1.5 col2 | sort | tr '\t' ' ' > humanNormal.ra
 rm col1 col1.5 col2
 # EDIT humanNormal.ra by hand and combine the like tissues
 hgMedianMicroarray -minExps=1 hgFixed humanNormal humanNormalExps humanNormal.ra \
   humanNormalMedian humanNormalMedianExps
 hgMedianMicroarray -minExps=1 hgFixed humanNormalRatio humanNormalExps humanNormal.ra \
   humanNormalMedianRatio humanNormalMedianExps
 #### HUMAN NORMAL DATA FIXING (10/5/2006 Andy)
 ssh hgwdev
 cd /projects/compbio/data/microarray/shyamsundarEtAl
 mv spots/13729.xls .
 tail +23 13729.xls | cut -f8,63 > data.txt
 echo 13729 > arrays.txt
 for array in spots/*; do 
     echo $array >> arrays.txt
     tail +23 $array | cut -f63 > newCol.txt
     paste data.txt newCol.txt > tmp.txt
     mv tmp.txt data.txt 
 done
 sed '/^[[:space:]]/d' data.txt > tmp.txt
 mv tmp.txt data.txt
 sed 's/spots\///;s/\.xls.*$//' arrays.txt > tmp.txt
 mv tmp.txt arrays.txt
 for id in `cat arrays.txt`; do grep $id -B1 exptset_3130.meta | grep Name | sed 's/.*=//;s/\"//g' >> names.txt; done
 paste arrays.txt names.txt | sort -k2,2 > tmp.txt
 mv tmp.txt arrays.txt
 rm names.txt
 # I changed my mind 
 
 echo "" | cat - names.txt | tr '\n' '\t' > oneLine.txt
 cat oneLine.txt data.txt > tmp.txt
 mv tmp.txt data.txt
 
 # (copy/paste this into columnDb.ra)
 
 # Mouse data from Zhang et al., "The functional landscape of mouse gene expression", J Biol.
 # http://hugheslab.med.utoronto.ca/Zhang/
 mkdir -p /cluster/store2/microarray
 ln -s /cluster/store2/microarray /cluster/data/microarray
 mkdir -p /cluster/data/microarray/zhangEtAl
 cd /cluster/data/microarray/zhangEtAl
 wget http://hugheslab.med.utoronto.ca/Zhang/expression_39309_normalized.txt
 sed 's/\(XM_[0-9]\+\)\.1/\1/' expression_39309_normalized.txt > arrays.txt
 hgGenericMicroarray hgFixed mouseLandscape arrays.txt
 wget http://hugheslab.med.utoronto.ca/Zhang/mouse_XM_mRNA_NCBI_2.fa
 sed 's/^>.*|\(XM.*\)\.1|.*$/>\1/' mouse_XM_mRNA_NCBI_2.fa > xm.fa
 ssh kk9
 cd /santest/scratch
 mkdir andy
 cd andy/
 cp /cluster/data/microarray/zhangEtAl/xm.fa .
 ls -1 /panasas/store/mm6/nib/* | grep -v random > chroms.lst
 cat << _EOF_ > gsub
 #LOOP
 blat -ooc=/scratch/hg/h/mouse11.ooc -fine -q=rna -noHead \$(path1) xm.fa xm.\$(root1).psl
 #ENDLOOP
 _EOF_
 gensub2 chroms.lst single gsub spec
 para create spec
 para push
 para time
 #Completed: 22 of 22 jobs
 #CPU time in finished jobs:      36298s     604.96m    10.08h    0.42d  0.001 y
 #IO & Wait Time:                    91s       1.52m     0.03h    0.00d  0.000 y
 #Average job time:                1654s      27.57m     0.46h    0.02d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:            2955s      49.25m     0.82h    0.03d
 #Submission to last job:          2957s      49.28m     0.82h    0.03d
 cat *.psl > xm.psl
 ssh hgwdev
 cd /cluster/data/microarray/zhangEtAl
 cp /santest/scratch/andy/xm.psl .
 hgLoadPsl -table=xmMrna mm6 xm.psl
 hgMapToGene -type=psl -cds mm6 xmMrna knownGene knownToXM
 echo drop table xmMrna | hgsql mm6 
 
 # REBASE 505 (4-28-2005)  (Done 5/18/2005 Andy)
    ssh hgwdev
    # download files
    curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg
    curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt
    # References file
    tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2
    tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1
    paste c1 c2 | sed '/^$/d' > rebaseRefs.txt
    rm c1 c2
    # Load the cutters table.
    hgCutters hgFixed rebase.gcg
    # Load the other table.
    hgsql hgFixed -e "delete from rebaseRefs"
    hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"
    
 # REBASE 603 (3-1-2006)  (Done 3-2-2006 Andy)
    ssh hgwdev
    # download files
    curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg
    curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt
    # References file
    tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2
    tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1
    paste c1 c2 | sed '/^$/d' > rebaseRefs.txt
    rm c1 c2
    # Load the cutters table.
    hgCutters hgFixed rebase.gcg
    # Load the other table.
    hgsql hgFixed -e "delete from rebaseRefs"
    hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"   
 
 # REBASE 902 (2009) (DONE 2009-02-09, Andy)
    ssh hgwdev
    mkdir /hive/data/outside/rebase
    cd /hive/data/outside/rebase
    tail -n+15 rebaseRefs.txt | sed '/^$/d; s/^\s\+\([[:digit:]]\+\)\.\s\+\</\1\t/' > tmp
    mv tmp rebaseRefs.txt
    hgCutters hgFixed rebase.gcg
    hgsql hgFixed -e "delete from rebaseRefs"
    hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"   
    mkdir rebase902
    mv rebase.gcg rebaseRefs.txt rebase902/
 
 # REBASE 201 (Dec 31, 2011) (DONE 2012-01-24, Andy)
    ssh hgwdev
    cd /hive/data/outside/rebase
    mkdir rebase201
    cd rebase201/
    # go to http://rebase.neb.com/rebase/rebase.f3.html
    # save the GET FILE files, then scp them to this dir
    ln -s link_gcgenz.txt rebase.gcg
    hgCutters hgFixed rebase.gcg
    tail -n +15 link_gcgref.txt | sed '/^$/d; s/^\s*\([[:digit:]]\+\)\.\s\+\</\1\t/' > rebaseRefs.txt
    hgsqldump -d --compact hgFixed rebaseRefs | grep -v "^SET" > rebaseRefs.sql
    hgLoadSqlTab hgFixed rebaseRefs{,.sql,.txt} 
 
 # GLADSTONE hESC Novartis microarray data.
    # 1. Download http://www.genmapp.org/temp/humansimpleESC.zip
    # 2. Manually convert using MS access on Bob's laptop to a tab-delimited text file.
    # 3. Add column names to that file manually.
    ssh hgwdev
    mkdir /projects/compbio/data/microarray/gladstone
    cp hESC.txt /projects/compbio/data/microarray/gladstone
    cd /projects/compbio/data/microarray/gladstone
    cut -f1,4,6 hESC.txt | tail +2 | sort -k3,3 -k1,1 > data.1
    for tiss in `cut -f3 data.1 | sort | uniq`; do 
       grep $tiss data.1 | cut -f1,2 | sort -k1,1 | cut -f2 > data.${tiss}.1
       echo $tiss | cat - data.${tiss}.1 > data.${tiss}.2
    done
    paste data.*.2 > data.2
    grep Lung data.1 | cut -f1 | sort > names
    echo Probe | cat - names | paste - data.2 > data.3
    cat << _EOF_ > fixGladstone.sed
 s/_/ /;
 s/Embryonicstemcell/Embryonic Stem Cell/;
 s/Smoothmuscle/Smooth Muscle/;
 s/Salivarygland/Salivary Gland/;
 s/Lymphnode/Lymph Node/;
 s/Bonemarrow/Bone Marrow/;
 s/Spinalcord/Spinal Cord/;
 s/Wholebrain/Whole Brain/;
 s/blood/Blood/;
 _EOF_
    head -n1 data.3 | sed -f fixGladstone.sed > header
    tail +2 data.3 | cat header - > data.4
    mv data.4 generic.hESC.txt
    rm data.* names header
    hgGenericMicroarray hgFixed gladHumES generic.hESC.txt
    hgRatioMicroarray gladHumES gladHumESRatio
 
 # GLADSTONE 
    ssh hgwdev
    cd /projects/compbio/data/microarray/gladstone
    awk '{if ($3 == $4) print}' hESC.txt > bestQ.hESC.txt
    cat << _EOF_ | hgsql hgFixed
 CREATE TABLE gladHumESOtherData (
     name varchar(255) not null,    # Name of item
     tissueQ varchar(255) not null,  # Name of Q-associated tissue
     qVal float not null, # Q value
     hVal float not null, # H value
       #Indices
     INDEX(name(8)),
     INDEX(tissueQ(10))
 );
 _EOF_
    cut -f1,2,5,6 hESC.txt | tail +2 | sort -k1,1 -k3,3n \
      | awk '{printf("%s\t%s\t%s\t%s\n", $4, $3, $2, $1)}' \
      | uniq -f3 \
      | awk '{printf("%s\t%s\t%s\t%s\n", $4, $1, $2, $3)}' \
      > gladOther.txt
    # Fix up the tissue column
    cut -f2 gladOther.txt > tmp.tiss.1
    sed -f fixGladstone.sed tmp.tiss.1 > tmp.tiss.2
    cut -f1 gladOther.txt > tmp.names
    cut -f3- gladOther.txt | paste tmp.names tmp.tiss.2 - \
      > tmp.glad
    mv tmp.glad gladOther.txt
    rm tmp.*
    echo "load data local infile 'gladOther.txt' into table gladHumESOtherData" | hgsql hgFixed
 
 # PRINCETON STEM CELL ARRAYS
    ssh hgwdev
    mkdir /projects/compbio/data/microarray/princetonESC
    cd /projects/compbio/data/microarray/princetonESC
    for num in i ii iii iv v vi vii; do
       wget http://stemcell.princeton.edu/affy_cluster_${num}.html
       grep "td bgcolor=\"#FFFFAA\" align=center class=ssb" affy_cluster_${num}.html | sed 's/.*<p>\(.*\)<\/td>/\1/' > names
       grep "<td class=fixed align=right>" affy_cluster_${num}.html | sed 's/.*right>\(.*\)&nbsp;<\/td>.*$/\1/' | colify 9 /dev/stdin > data
       paste names data >> tmp.txt
       rm names data affy_cluster_${num}.html
    done
    echo "~Bone Marrow RhoLo~Bone Marrow RhoHi~Bone Marrow Sca-~Bone Marrow Lin+~Fetal Liver Sca+~Fetal Liver Sca-~Fetal Liver Lin+~Neural Stem Cells~Embryonic Stem Cells" | tr '~' '\t' | cat - tmp.txt > princeton.txt
    rm tmp.txt
 
 # QA push cghNci60Exps on 2006-02-07 to rr. Table/data previously missing (Jen)
 # QA re-push rosChr22Dat on 2006-02-08 to fix table formatting/timestamps (Jen)
 
 # AFFY ALL EXON HUMAN ARRAYS (INCLUDES TABLES ON HG17 AND HG18) (Done 3/15/2006, Andy)
      # Chuck put them in tab-delimited file in ~sugnet
    ssh hgwdev
    cd /projects/compbio/data/microarray
    mkdir affyHumanExon
    cd affyHumanExon/
    cp ~sugnet/plier-gcbg-sketch.summary.txt .
    sed -e "s/huex_wta_//g" -e "s/\.CEL//g" plier-gcbg-sketch.summary.txt > data.txt
    hgGenericMicroarray hgFixed affyHumanExon data.txt
      # Chuck put probe data into two tables in hg17.
      # Grab the bed first.  Change the original name because a lot got started
      # without keeping Chuck's naming convention in mind.  oh well.
    hgsql hg17 -e "rename table affyHuEx1 to affyHumanExonProbes"
    hgsql hg17 -e "rename table affyHuEx1Annot to affyHumanExonProbeAnnot"
    hgsql hg17 -e "select * from affyHuEx1" | tail +2 | cut -f2-7 > hg17.probes.bed
      # Lift to hg18
    liftOver hg17.probes.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain hg18.probes.bed hg18.unMapped
      # How many didn't get lifted (out of 1.4 million)? 
    wc -l hg18.unMapped
 #    276 hg18.unMapped
      # That's not bad at all. 99.99% of them lifted fine.
      # Load the hg18 probe bed.  Change the name of the hg17 one.
    hgLoadBed hg18 affyHumanExonProbes hg18.probes.bed
      # Deal with that extra annotation table of Chuck's.  I made a new autosql 
      # which almost matches it except for the name/probeSet fields.
      # First copy it out of hg17 and into a file with the new column order.
    hgsql hg17 -e "select probesetId,numIndependentProbes,exonClustId,numNonOverlapProbes,probeCount,transcriptClustId,probesetType,numXHybeProbe,psrId,level,evidence,bounded,cds from affyHumanExonProbeAnnot" \
    | tail +2 > annot.tab
      # Load that into hgFixed and change the name.
    hgLoadSqlTab hgFixed affyAllExonProbe ~/kent/src/hg/lib/affyAllExonProbe.sql annot.tab
    hgsql hgFixed -e "rename table affyAllExonProbe to affyHumanExonProbeAnnot"
      # Make ratio table for the microarray
    hgRatioMicroarray affyHumanExon affyHumanExonRatio
      # Merge probe beds with array data and load those beds.
    bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed
    bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed
    hgLoadBed hg17 affyHumanExon hg17.bed
    hgLoadBed hg18 affyHumanExon hg18.bed
      # Create human-level trackDb entry and affyHumanExon.html
      # and check into cvs.
 
 ###### AFFY HUMAN EXONS (COMPLETE DATA) (DONE 7-21-2006, Andy)
     ssh hgwdev
     cd /projects/compbio/data/microarray/affyHumanExon/
     mkdir moreData
     cd moreData/
     ssh bark
     cd /scratch
     cp forAndy/* /projects/compbio/data/microarray/affyHumanExon/moreData
     exit
     sed -e "s/huex_wta_//g" -e "s/\.CEL//g" exonData.vs.tab > data.txt
     hgGenericMicroarray hgFixed affyHumanExon data.txt
     hgsql hgFixed -e "select * from affyHumanExonExps" | sed "/^\+/d" | tail +2 | sed "s/_.,/,/" > newExps.tab
     hgsql hgFixed -e "delete from affyHumanExonExps"
     hgsql hgFixed -e "load data local infile 'newExps.tab' into table affyHumanExonExps"
     cd ~/kent/src/hg/makeDb/hgRatioMicroarray/
     # Make file affyHumanExon.ra in the medSpec style.
     hgRatioMicroarray -minAbsVal=0 -clump=affyHumanExon.ra affyHumanExon affyHumanExonRatio    
     bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed
     bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed
     hgLoadBed hg17 affyHumanExon hg17.bed
     hgLoadBed hg18 affyHumanExon hg18.bed
 # Copied affyHumanExon to hg16 (DONE 10-12-2006, Andy)
     cd /cluster/data/hg16/bed/
     mkdir affyHumanExon
     cd affyHumanExon/
     echo "select name,expCount,expScores from affyHumanExon" | hgsql hg17 | tail +2 > expdata.tab
     cp ~/kent/src/hg/lib/expData.sql .
     hgLoadSqlTab hgFixed expData expData.sql expdata.tab
     bedMergeExpData hgFixed.expData hg16.affyHuEx1 hg16.bed
     hgLoadBed hg16 affyHumanExon hg16.bed 
     hgsql -e 'drop table expData' hgFixed
 
 # QA push new cutters and rebaseRefs tables (04-06-2006: ASZ).
 
 ### load ncbi taxonomy tables (04-11-2006: Robert).
 
 mkdir /cluster/store5/taxonomy
 cd /cluster/store5/taxonomy
 ln /cluster/store5/taxonomy /cluster/data/taxonomy -s
 wget ftp://ftp.taxon.nih.gov/pub/taxonomy/taxdump.tar.gz
 tar xvfz taxdump.tar.gz 
 sed -e 's/\t|\t/~/g' names.dmp |sed -e 's/\t|//g' |awk -F~ 'length($3)<2{OFS="\t";print $2,$1,$4}length($3)>=2{OFS="\t";print $3,$1,$4}' > taxonName.txt 
 sed -e 's/\t|\t/~/g' division.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonDivision.txt 
 sed -e 's/\t|\t/~/g' gencode.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonGencode.txt 
 sed -e 's/\t|\t/~/g' nodes.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > ncbiNode.txt
 pushd ~/kent/src/hg/lib
 autoSql taxonNode.as taxonNode -dbLink
 autoSql taxonXref.as taxonXref -dbLink
 autoSql taxonName.as taxonName -dbLink
 autoSql taxonGeneticCode.as taxonGeneticCode -dbLink
 autoSql taxonDivision.as taxonDivision -dbLink
 mv taxon*.h ../inc
 make
 #edit .sql files to add indexes
 
 hgsql hgFixed < taxonName.sql
 hgsql hgFixed < taxonNode.sql
 hgsql hgFixed < taxonDivision.sql
 hgsql hgFixed < taxonGeneticCode.sql
 popd
 
 hgsql hgFixed -e "load data local infile 'taxonName.txt' into table taxonName;"
 hgsql hgFixed -e "load data local infile 'taxonNode.txt' into table taxonNode" 
 hgsql hgFixed -e "load data local infile 'taxonDivision.txt' into table taxonDivision;"
 hgsql hgFixed -e "load data local infile 'taxonGencode.txt' into table taxonGeneticCode;"
 
 echo "select o.name, n.taxon as ncbi_taxon, n.name , toGenus from sp060115.taxon t, hgFixed.taxonName n, organism o where o.name = n.name and n.taxon = t.id order by toGenus;" | hgsql hg17 -N -B > taxonXref.txt
 hgsql hgFixed -e "load data local infile 'taxonXref.txt' into table taxonXref;"
 
 #--**************************************************************************
 #--  This is the NCBI genetic code table
 #--  Initial base data set from Andrzej Elzanowski while at PIR International
 #--  Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI
 #--  Base 1-3 of each codon have been added as comments to facilitate
 #--    readability at the suggestion of Peter Rice, EMBL
 #--  Later additions by Taxonomy Group staff at NCBI
 #--
 #--  Version 3.9
 #--     Code 14 differs from code 9 only by translating UAA to Tyr rather than
 #--     STOP.  A recent study (Telford et al, 2000) has found no evidence that
 #--     the codon UAA codes for Tyr in the flatworms, but other opinions exist.
 #--     There are very few GenBank records that are translated with code 14,
 #--     but a test translation shows that retranslating these records with code
 #--     9 can cause premature terminations.  Therefore, GenBank will maintain
 #--     code 14 until further information becomes available.
 #--
 #--  Version 3.8
 #--     Added GTG start to Echinoderm mitochondrial code, code 9
 #--
 #--  Version 3.7
 #--     Added code 23 Thraustochytrium mitochondrial code
 #--        formerly OGMP code 93
 #--        submitted by Gertraude Berger, Ph.D.
 #--
 #--  Version 3.6
 #--     Added code 22 TAG-Leu, TCA-stop
 #--        found in mitochondrial DNA of Scenedesmus obliquus
 #--        submitted by Gertraude Berger, Ph.D.
 #--        Organelle Genome Megasequencing Program, Univ Montreal
 #--
 #--  Version 3.5
 #--     Added code 21, Trematode Mitochondrial
 #--       (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)
 #--     Added code 16, Chlorophycean Mitochondrial
 #--       (TAG can be translated to Leucine instead of STOP in chlorophyceans
 #--        and fungi)
 #--
 #--  Version 3.4
 #--     Added CTG,TTG as allowed alternate start codons in Standard code.
 #--        Prats et al. 1989, Hann et al. 1992
 #--
 #--  Version 3.3 - 10/13/95
 #--     Added alternate initiation codon ATC to code 5
 #--        based on complete mitochondrial genome of honeybee
 #--        Crozier and Crozier (1993)
 #--
 #--  Version 3.2 - 6/24/95
 #--  Code       Comments
 #--   10        Alternative Ciliate Macronuclear renamed to Euplotid Macro...
 #--   15        Blepharisma Macro.. code added
 #--    5        Invertebrate Mito.. GTG allowed as alternate initiator
 #--   11        Eubacterial renamed to Bacterial as most alternate starts
 #--               have been found in Archaea
 #--
 #--
 #--  Version 3.1 - 1995
 #--  Updated as per Andrzej Elzanowski at NCBI
 #--     Complete documentation in NCBI toolkit documentation
 #--  Note: 2 genetic codes have been deleted
 #--
 #--   Old id   Use id     - Notes
 #--
 #--   id 7      id 4      - Kinetoplast code now merged in code id 4
 #--   id 8      id 1      - all plant chloroplast differences due to RNA edit
 #--
 #--*************************************************************************
 #
 #Genetic-code-table ::= {
 # {
 #  name "Standard" ,
 #  name "SGC0" ,
 #  id 1 ,
 #  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "---M---------------M---------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Vertebrate Mitochondrial" ,
 #  name "SGC1" ,
 #  id 2 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
 #  sncbieaa "--------------------------------MMMM---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Yeast Mitochondrial" ,
 #  name "SGC2" ,
 #  id 3 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "----------------------------------MM----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #    name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
 # Mitochondrial; Mycoplasma; Spiroplasma" ,
 #  name "SGC3" ,
 #  id 4 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "--MM---------------M------------MMMM---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Invertebrate Mitochondrial" ,
 #  name "SGC4" ,
 #  id 5 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
 #  sncbieaa "---M----------------------------MMMM---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" ,
 #  name "SGC5" ,
 #  id 6 ,
 #  ncbieaa  "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Echinoderm Mitochondrial; Flatworm Mitochondrial" ,
 #  name "SGC8" ,
 #  id 9 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Euplotid Nuclear" ,
 #  name "SGC9" ,
 #  id 10 ,
 #  ncbieaa  "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Bacterial and Plant Plastid" ,
 #  id 11 ,
 #  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "---M---------------M------------MMMM---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Alternative Yeast Nuclear" ,
 #  id 12 ,
 #  ncbieaa  "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "-------------------M---------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Ascidian Mitochondrial" ,
 #  id 13 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
 #  sncbieaa "---M------------------------------MM---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # },
 # {
 #  name "Alternative Flatworm Mitochondrial" ,
 #  id 14 ,
 #  ncbieaa  "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # } ,
 # {
 #  name "Blepharisma Macronuclear" ,
 #  id 15 ,
 #  ncbieaa  "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # } ,
 # {
 #  name "Chlorophycean Mitochondrial" ,
 #  id 16 ,
 #  ncbieaa  "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # } ,
 # {
 #  name "Trematode Mitochondrial" ,
 #  id 21 ,
 #  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # } ,
 # {
 #  name "Scenedesmus obliquus Mitochondrial" ,
 #  id 22 ,
 #  ncbieaa  "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "-----------------------------------M----------------------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # } ,
 # {
 #  name "Thraustochytrium Mitochondrial" ,
 #  id 23 ,
 #  ncbieaa  "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
 #  sncbieaa "--------------------------------M--M---------------M------------"
 #  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
 #  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
 #  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
 # }
 #}
 
 ##########################################################################
 # Added Zebrafish microarray data (DONE, 2006-06-10, hartera)
 # From Leonard Zon's group at the Children's Hospital, Boston
 # Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
 # Data is normalized and log2 transformed, then centered on mean of 0.
 # Changed table names and reloaded MedianExps table so that the extras has
 # the strain plus time point for the name otherwise the average is taken
 # over all time points for a strain for the track display 
 # when Tissue Averages is selected. (DONE, 2006-07-30, hartera) 
 # Changed so that the extras column for the MedianExps table has the 
 # developmental stage so that an average is taken across all strains for 
 # each stage when Tissue Averages is selected.
 # (Jim recommended displaying it this way and then it also fits in with the
 # current framework for this type of track).
 # Also added the strain name and stage to the extra column for the 
 # Experiments tables (AllExps and MedianExps) so that when Chip ID is 
 # selected then all of these are shown. (DONE, 2006-08-11, hartera)
 # Added absolute data (before logs were taken). (DONE, 2006-09-19, hartera)
 # The absolute value data was centered on a mean of 0. The log data was
 # the log2 transformed normalized data, centered on a mean of 0.
 # This section now OBSOLETE so removed. See section below on UPDATE of
 # zebrafish microarray data. 
 
 ##########################################################################
 # UPDATE the Zebrafish microarray data (DONE, 2006-06-16 - 2006-10-18, hartera)
 # From Leonard Zon's group at the Children's Hospital, Boston
 # Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
 # Data is Loess normalized absolute values. Then use microarray processing
 # programs to create ratio tables.
 # The new data set was obtained so that the ratios could be calculated 
 # directly from the normalized absolute data. The ratios are calculated as 
 # the value for a probeset in one array to the median value across all arrays
 # for that probeset and then a log2 is taken. This allows comparison 
 # between arrays that may differ due to technical or biological differences.
 # RE-CREATE tables. Data was log2 already so antilog the values to get 
 # absolute values and then pass through the microarray processing programs.
 # (DONE, 2007-01-05 - 2007-01-08, hartera)
    ssh hgwdev
    mkdir /projects/compbio/data/microarray/zebrafishWT
    cd /projects/compbio/data/microarray/zebrafishWT
    # copy the data here received by e-mail and unzip
    unzip wt34.loessNorm.absval.2006-10-12.zip 
    mv wt34.loessNorm.absval.2006-10-12.txt wtAffyNormLog2.txt
    dos2unix wtAffyNormLog2.txt 
 cat << 'EOF' > format.pl
 #!/usr/bin/perl -w
 use strict;
 
 # Rewrite the column-heading row of the table read from STDIN. A heading of
 # the form strain.somites.hpf.N.N becomes "strain-<somites>-somites" or
 # "strain-<hpf>-hpf", with Tu changed to TU and the experiment name dropped.
 # Every other row is echoed unchanged.
 while (my $row = <STDIN>)
 {
     # Any row containing "at" is passed straight through; only a row
     # without it is treated as the heading row.
     if ($row =~ /at/)
     {
         print $row;
         next;
     }
     for my $heading (split(/\t/, $row))
     {
         # Headings that do not look like an experiment name print nothing.
         next unless $heading =~ /^([A-Za-z]+)\.([0-9]+)\.([0-9]+)\.[0-9]+\.[0-9]+/;
         my ($strain, $somites, $hpf) = ($1, $2, $3);
         $strain =~ s/Tu/TU/;
         # The somite count wins when it is non-zero; otherwise use hpf.
         # If both are zero the label stays empty.
         my $label = "";
         if ($somites > 0)
         {
             $label = $strain . "-" . $somites . "-somites";
         }
         elsif ($hpf > 0)
         {
             $label = $strain . "-" . $hpf . "-hpf";
         }
         print "\t$label";
     }
     print "\n";
 }
 'EOF'
    # << emacs
 chmod +x format.pl
 perl format.pl < wtAffyNormLog2.txt > zebrafishWTNormLog2.txt
 # antilog the values, log is base 2
 cat << 'EOF' > cnvToAntilog
 #!/usr/bin/awk -f
 # Copy the first tab-separated column unchanged and replace every other
 # column value v with 2^v.
 BEGIN {
     FS = "\t"
     RS = "\n"
     # ORS is emptied so print adds nothing; tabs and the final newline
     # are written explicitly below.
     ORS=""
 }
 {
     print $1 "\t"
     x=2
     while (x < NF) {
         print 2^$x "\t"
         x++
     }
     # last column ends the output line
     print 2^$NF "\n"
 }
 'EOF'
 chmod +x cnvToAntilog
 # run script and skip header line in file
 # (the script name must match the file created above: cnvToAntilog;
 # "tail -n +2" is the portable spelling of the obsolete "tail +2")
 tail -n +2 zebrafishWTNormLog2.txt | cnvToAntilog > tmp.txt
 # add back header line:
 head -n 1 zebrafishWTNormLog2.txt > header
 cat header tmp.txt > zebrafishWTNormAbs.txt
 
 # Then load the data into hgFixed using hgGnfMicroarray and use options
 # to set the url, ref, and credit to "n/a" and chip to Zebrafish.
 # Need to use this program to get 3 extras needed for hgMedianMicroarray
 # No need to round the values this time as they are larger and have 
 # a larger range.
 # Create the main expRecord table and the expData table for the 
 # absolute measurements
 hgGnfMicroarray zebrafishZonWTAllExps zebrafishZonWTAll \
      zebrafishWTNormAbs.txt -chip=Zebrafish -url=n/a -ref=n/a -credit=n/a 
 
 # Changed the Exps table so that the extras column for the MedianExps table 
 # has the strain and developmental stage in the second field so that an 
 # average is taken across all strains for each stage when Tissue Averages 
 # is selected.
 # (Jim recommended displaying it this way and then it also fits in with the
 # current framework for this type of track).
 
 hgsql -N -e 'select name, extras from zebrafishZonWTAllExps;' hgFixed \
       > zfishWTExps.extras
 
 cat << 'EOF' > cnvExtras.pl
 #!/usr/bin/perl -w
 use strict;
 
 # For each input line, replace the first "n/a" with the third
 # comma-separated field of that same line, then print the line.
 while (my $row = <STDIN>) {
     my @fields = split(/,/, $row);
     $row =~ s/n\/a/$fields[2]/;
     print $row;
 }
 'EOF'
 chmod +x cnvExtras.pl
 cnvExtras.pl < zfishWTExps.extras > zfishWTExps.extras.new 
 # create set of mySQL statements from this to update the AllExps table
 # to include the name in the second field of extras - same as in the third
 # field. This is used for display when the "Arrays Grouped By Replicate
 # Medians" (or Means) is selected from the track controls on the 
 # description page.
 
 # Input columns are name <tab> extras. Each line becomes
 #   update zebrafishZonWTAllExps set extras = "<extras>" where name = "<name>";
 # The awk program is kept on one line: a string literal cannot span a newline.
 awk 'BEGIN {FS = "\t"} {printf "update zebrafishZonWTAllExps set extras = \"%s\" where name = \"%s\";\n", $2, $1}' \
     zfishWTExps.extras.new > zfishWTExpsNewExtras.sql
 hgsql hgFixed < zfishWTExpsNewExtras.sql
 
 # Convert these to ratios using the median of the absolute values 
 # across all experiments to be the denominator for each probeset.
 # minAbsVal is 0 here as no value in this dataset is less than 1 and the
 # default for this parameter is 20.
 hgRatioMicroarray -minAbsVal=0 zebrafishZonWTAll zebrafishZonWTAllRatio 
 
 # Create the .ra file for the Median tables
 hgsql -N -e 'select extras, id from zebrafishZonWTAllExps;' hgFixed \
       > zfishWTExps
 # remove extra information and leave experiment name
 perl -pi.bak -e 's/Zebrafish,[A-Za-z]+\-[0-9]+\-[a-z]+,//' zfishWTExps
 perl -pi.bak -e 's/,//' zfishWTExps
 
 # alter script so that name for each experiment in column 2 is not just the 
 # strain but the strain plus time point (same as first column). This goes into
 # the extras column for zebrafishWTMedianExps and is used for Tissue Averages
 # display for the array data track. Otherwise an average is taken for the 
 # strain (hartera, 2006-07-30).
 # change so that column 2 is the time point so that an average of time points
 # is taken for the "Tissue Averages" Display (hartera, 2006-08-11)
 cat << 'EOF' > cnvToMedian
 #!/usr/bin/awk -f
 
 # Group the second tab-separated column by the first. For each distinct
 # first-column value, print that value, the 2nd and 3rd "-"-separated parts
 # of it joined together, and the space-separated list of grouped values.
 BEGIN {
     FS = OFS = "\t"
 }
 
 {
     ids[$1] = ids[$1] " " $2
 }
 
 END {
     for (name in ids) {
         split(name, part, "\\-")
         # substr(..., 2) drops the leading space left by the first append
         print name, part[2] part[3], substr(ids[name], 2)
     }
 }
 'EOF'
    # << emacs
 chmod +x cnvToMedian
 cnvToMedian zfishWTExps > zfishZonWTMedian.ra
 # re-order the *.ra file as this determines the order of display
 sort zfishZonWTMedian.ra | grep "14somites" > tmp.ra
 sort zfishZonWTMedian.ra | grep "15somites" >> tmp.ra
 sort zfishZonWTMedian.ra | grep "hpf" >> tmp.ra
 mv tmp.ra zfishZonWTMedian.ra
 cp zfishZonWTMedian.ra ~/kent/src/hg/makeDb/hgMedianMicroarray
 cd ~/kent/src/hg/makeDb/hgMedianMicroarray
 
 # Take the median value over multiple replicants and put in this table:
 hgMedianMicroarray hgFixed zebrafishZonWTAllRatio zebrafishZonWTAllExps \
   zfishZonWTMedian.ra zebrafishZonWTMedianRatio  \
   zebrafishZonWTMedianExps -minExps=1
 
 # Make a median version of the absolute experiments:
 hgMedianMicroarray hgFixed zebrafishZonWTAll zebrafishZonWTAllExps \
   zfishZonWTMedian.ra zebrafishZonWTMedian zebrafishZonWTMedianExps -minExps=1
 
 # get distribution of MedianRatio scores:
 hgsql -N -e 'select * from zebrafishZonWTMedianRatio;' hgFixed > medRatioData
 awk '{print $3}' medRatioData > medRatioData2
 perl -pi.bak -e 's/,/\n/g' medRatioData2
 textHistogram -real -binSize=0.2 -maxBinCount=1100 -minVal=-200 \
     medRatioData2 > histMedRatio.out
 
 # from this histogram, see that most values fall between -2 to +2 so set the
 # trackDb for the Affy Zon Wild Type Array track to have expScale of 2.0
 # and expStep to 0.2 for the log scale to display the ratios in this track.
  
 ##########################################################################
 
 #The Mouse GNF Expression Atlas 2 (2004)
 ##########################################################################
 # Updated gv* tables for the Locus Variants tracks 
 # (Belinda Giardine Sept 2006)
 # This track is now available for hg17 and hg18, only the gvPos table needs to
 # be redone for each build unless new mutations are added.  This load changes
 # the schema (strand, label for gvPos) and adds a new LSDB (BTKbase) and more
 # sanity checks on all the data causing some mismapped variants to be
 # discarded.
 
 ##########################################################################
 # mgcMBLabValid - Load of Genbank accession that are in the Brent lab clone
 # validation database.  This contains both human and mouse clones.  Since
 # the Brent lab is no longer doing MGC validations, this set is fixed
 # and shared by all mouse and human assemblies. (2006-10-26 markd)
     mkdir -p /cluster/data/genbank/data/download/mgcMBLab
     cd /cluster/data/genbank/data/download/mgcMBLab
     # save list of 41805 accessions received from brent lab as
     # mgcMBLabValid.2006-10-25.acc
     hgLoadSqlTab hgFixed mgcMBLabValid ~/compbio/genbank/kent/src/hg/lib/mgcMBLabValid.sql mgcMBLabValid.2006-10-25.acc
     gzip mgcMBLabValid.2006-10-25.acc
 
 ##########################################################################
 # ZEBRAFISH DEVELOPMENTAL ARRAYS FROM GENOME INSTITUTE OF SINGAPORE (GIS) 
 # Data from Article:
 # Transcriptome Analysis of Zebrafish Embryogenesis Using Microarrays Mathavan
 # S, Lee SGP, Mak A, Miller LD, Murthy KRK, et al. PLoS Genetics Vol. 1, No. 2,
 # e29, pages 260-276 doi:10.1371/journal.pgen.0010029
 # Contact: Sinnakaruppan Mathavan <mathavans@gis.a-star.edu.sg>
 # Downloaded expression data from
 # http://giscompute.gis.a-star.edu.sg/~govind/zebrafish/data_download.html
 # after clicking on link to download largest dataset (12.9 MB):
 # Gene expression data showing the expression profile during different stages
 # of zebrafish embryonic development for the genes selected from the array are
 # presented (Compugen array). Each value represents an average performance of
 # 2-4 replicates. GenBank id of the selected gene is given as the identifier.
 # Total RNA from different stages of embryonic development, adult male and
 # female were pooled in equal concentrations and used as reference RNA. The
 # genes were annotated using Zebrafish Chip Annotation Database.
 
      ssh hgwdev
      mkdir -p /projects/compbio/data/microarray/zebrafishGISDev
      # Downloaded data and saved in Excel as a tab-separated text file:
      # PLOSGISData.txt
      # This file contains Genbank accessions and the expression values
      # which are log2 based. 
 
 ##########################################################################
 # Belinda Giardine April 2007
 # gv* tables: 
 #	reload tables, additions and corrections, details in hg18 doc
 
 #############################################################################
 ## Add CTD data	
 
     mkdir /hive/data/outside/ctdbase
     cd /hive/data/outside/ctdbase
 
 #   Download the chemical-gene interactions report (CTD_chem_gene_ixns.tsv.gz)
 #   from the CTD site; the downloads page is http://ctd.mdibl.org/downloads/.
     wget "http://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz"
     gunzip CTD_chem_gene_ixns.tsv.gz
 
 
 #   Create a separate database (ctdBraney) for the raw report; hg18 is only
 #   the database hgsql connects to while issuing the create statement.
     hgsql hg18 -e 'create database ctdBraney'
     hgsql ctdBraney < ~/kent/src/hg/lib/chem_gene_ixns.sql
 
     hgsql ctdBraney -e 'load data local infile "CTD_chem_gene_ixns.tsv" into table chem_gene_ixns'
 
 # create sorted data: join hg19 kgXref gene symbols to the CTD rows and, for
 # each (geneSymbol, ChemicalId) pair, count the distinct interactions.
 # Output columns: geneSymbol, ChemicalId, count, ChemicalName; sort keys are
 # column 1, then column 3 numerically descending, then column 4.
 # NOTE(review): sort runs with its default blank-delimited fields (no -t tab);
 # confirm that is acceptable if any column can contain spaces.
 
     hgsql hg19 -N -e \
     'select x.geneSymbol, ChemicalId, count(distinct Interaction), ChemicalName from kgXref x, ctdBraney.chem_gene_ixns c where x.geneSymbol=c.GeneSymbol group by x.geneSymbol, ChemicalId'|\
     sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab
 
 #   Create the ctdSorted table in hgFixed from its .sql file and load the tab file.
     hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql
     hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted'
 
 #####
 # Jaspar 2022 PFM  (DONE 2021/12/08 braney)
 #####
 mkdir -p /hive/data/outside/jaspar/2022/all
 cd  /hive/data/outside/jaspar/2022/all
 wget "https://jaspar.genereg.net/download/data/2022/CORE/JASPAR2022_CORE_non-redundant_pfms_jaspar.zip"
 unzip JASPAR2022_CORE_non-redundant_pfms_jaspar.zip
 # For every *.jaspar file, write one line to jasparMotif.tab:
 #   f       = first word of the file's first line with ">" removed (motif name)
 #   /tmp/1  = one row of per-position totals (fields 3..NF-1 summed over all
 #             lines), followed by the count rows from line 2 onward
 #   /tmp/1 is then transposed, each count is divided by its position total,
 #   and the result is transposed back and printed as:
 #   name <tab> number of positions <tab> one comma-separated list per count row
 # NOTE(review): the third "tail -n +1 ... | awk" stage pipes into an awk that
 # is given /tmp/1 as a file operand, so that stage's output appears unused.
 # NOTE(review): mat[NR-1][ii] is array-of-arrays syntax - presumably needs gawk.
 for i in *.jaspar; do f=`awk '{print $1; exit}' $i | tr -d '>'`;  tail -n +1  $i | awk '{count=NF - 3;  for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}'  > /tmp/1;  tail -n +2   $i | awk '{count=NF - 3;  for (ii=3; ii <= NF - 1; ii++) printf "%d\t", $ii; printf "\n"}' >> /tmp/1; tail -n +1 $i | awk '{count=NF - 3;  for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' | awk '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%d\t",mat[ii][jj]; printf "\n";}}'  /tmp/1 | awk '{for (ii=2; ii <= NF; ii++) printf "%f\t",$ii/$1; printf "\n";}' | awk -v name=$f '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {printf "%s\t%d\t",name,NR;for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%g,",mat[ii][jj]; printf "\t";}} END {printf "\n"}'  ; done > jasparMotif.tab
 
 # Load the generated rows into hgFixed.jasparCore2022 using the dnaMotif schema.
 hgLoadSqlTab hgFixed jasparCore2022 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab
+
+#####
+# Jaspar 2024 PFM  (DONE 2023/10/23 braney)
+#####
+mkdir -p /hive/data/outside/jaspar/2024/all
+cd  /hive/data/outside/jaspar/2024/all
+wget "https://testjaspar.uio.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.zip"
+unzip JASPAR2024_CORE_non-redundant_pfms_jaspar.zip
+for i in *.jaspar; do f=`awk '{print $1; exit}' $i | tr -d '>'`;  tail -n +1  $i | awk '{count=NF - 3;  for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}'  > /tmp/1;  tail -n +2   $i | awk '{count=NF - 3;  for (ii=3; ii <= NF - 1; ii++) printf "%d\t", $ii; printf "\n"}' >> /tmp/1; tail -n +1 $i | awk '{count=NF - 3;  for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' | awk '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%d\t",mat[ii][jj]; printf "\n";}}'  /tmp/1 | awk '{for (ii=2; ii <= NF; ii++) printf "%f\t",$ii/$1; printf "\n";}' | awk -v name=$f '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {printf "%s\t%d\t",name,NR;for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%g,",mat[ii][jj]; printf "\t";}} END {printf "\n"}'  ; done > jasparMotif.tab
+
+hgLoadSqlTab hgFixed jasparCore2024 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab