851385903f257839da60df1ba89e525360742b4a
braney
Wed Dec 8 14:23:24 2021 -0800
add JASPAR 2022 core non-redundant PWMs to hgFixed
diff --git src/hg/makeDb/doc/hgFixed.txt src/hg/makeDb/doc/hgFixed.txt
index 99ca174..54518c8 100644
--- src/hg/makeDb/doc/hgFixed.txt
+++ src/hg/makeDb/doc/hgFixed.txt
@@ -1,1083 +1,1093 @@
#This describes how at least some of the tables in
#hgFixed were created. This is a database containing
#primarily expression data. There are two main formats:
# expRecord.as - This describes the mRNA sources for
# a series of microarray experiments
# expData.as - This describes the measured value
# in either absolute or relative ratio terms of
# each gene/probe/target in a series of microarray
# experiments. Each expData is associated with
# an expRecord, though expDatas sometimes share
# the same expRecord.
#The Human Affy GNF Expression Atlas 2003 Version:
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfHumanU95AllExps gnfHumanU95All /projects/compbio/data/microarray/affyGnfHuman/data_public_U95
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfHumanU95All gnfHumanU95AllRatio -clump=gnfClump.ra
# Take the median value over multiple replicants and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfHumanU95AllRatio gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95MedianRatio gnfHumanU95MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfHumanU95All gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95Median gnfHumanU95MedianExps -minExps=1
# The Mouse Affy GNF Expression Atlas:
# Create the expRecord tables for U74 a/b/c and the expData table for
# the absolute measurements:
hgGnfMicroarray gnfMouseU74aAllExps gnfMouseU74aAll /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74
hgGnfMicroarray gnfMouseU74bAllExps gnfMouseU74bAll /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt
hgGnfMicroarray gnfMouseU74cAllExps gnfMouseU74cAll /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt
# Convert these to ratios using the median of medians of
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfMouseU74aAll gnfMouseU74aAllRatio -clump=gnfMouseU74aClump.ra
hgRatioMicroarray gnfMouseU74bAll gnfMouseU74bAllRatio -clump=gnfMouseU74bClump.ra
hgRatioMicroarray gnfMouseU74cAll gnfMouseU74cAllRatio -clump=gnfMouseU74cClump.ra
# Take the median value over multiple replicants and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseU74aAllRatio gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedianRatio gnfMouseU74aMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74bAllRatio gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedianRatio gnfMouseU74bMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74cAllRatio gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedianRatio gnfMouseU74cMedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseU74aAll gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedian gnfMouseU74aMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74bAll gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedian gnfMouseU74bMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74cAll gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedian gnfMouseU74cMedianExps -minExps=1
#The Human GNF Expression Atlas 2 (2004)
#
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfHumanAtlas2AllExps gnfHumanAtlas2All /projects/compbio/data/microarray/geneAtlas2/human/U133A+GNF1B_101402.AD.txt -chip=U133A+GNF1B
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfHumanAtlas2All gnfHumanAtlas2AllRatio -clump=gnfHumanAtlas2Clumps.ra
# Take the median value over multiple replicants and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfHumanAtlas2AllRatio gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2MedianRatio gnfHumanAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfHumanAtlas2All gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2Median gnfHumanAtlas2MedianExps -minExps=1
#The Mouse GNF Expression Atlas 2 (2004)
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfMouseAtlas2AllExps gnfMouseAtlas2All /projects/compbio/data/microarray/geneAtlas2/mouse/GNF1M_20030403.AD.txt -chip=GNF1M
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra
# Take the median value over multiple replicants and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1
#The Rat GNF Expression Atlas 2 (2004)
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfRatAtlas2AllExps gnfRatAtlas2All /projects/compbio/data/microarray/geneAtlas2/rat/PivotNoApwithTissues.txt -chip=RG-U34A -ref=http://expression.gnf.org/ratlas
# Convert these to ratios using the median of medians of non-cancerous
# tissues or cell types (in this case, this is all the tissues) as the
# denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfRatAtlas2All gnfRatAtlas2AllRatio -clump=gnfRatAtlas2Clumps.ra
# Take the median value over multiple replicants and put in this table.
# Use Clumps.ra file renamed as gnfRatAtlas2.ra as this contains all the
# tissues since there are no cancer tissues in this expression data set:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfRatAtlas2AllRatio gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2MedianRatio gnfRatAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfRatAtlas2All gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2Median gnfRatAtlas2MedianExps -minExps=1
# C. elegans life cycle data from the Kim Lab via the Stanford Microarray Database.
cd ~/kent/src/hg/makeDb/hgStanfordMicroarray
hgStanfordMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps /projects/compbio/data/microarray/wormLifeCycle/spots -swap '-trimName=(green)' -suppress=green '-trimTissue=(repeat #?)'
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps kimMed.ra kimWormLifeMedianRatio kimWormLifeMedianExps
# D. melanogaster life cycle data from Arbeitman et al 2002
# via the Stanford Microarray Database.
cd ~/kent/src/hg/makeDb/hgStanfordMicroarray
# absolute:
hgStanfordMicroarray -geneField="Systematic name" -dataField=CH2I_MEDIAN \
hgFixed arbFlyLifeAll arbFlyLifeAllExps \
/projects/compbio/data/microarray/flyLifeCycle/spots
# ratios:
hgStanfordMicroarray -geneField="Systematic name" \
hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps \
/projects/compbio/data/microarray/flyLifeCycle/spots
cd ../hgMedianMicroarray
echo "select name,id from arbFlyLifeAllExps" | hgsql -N hgFixed \
| sort > arbMed.ra
# edit arbMed.ra to collapse the N=1, N=2 lines.
# median absolute:
hgMedianMicroarray hgFixed arbFlyLifeAll arbFlyLifeAllExps arbMed.ra \
arbFlyLifeMedian arbFlyLifeMedianExps
# median ratios:
hgMedianMicroarray hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps arbMed.ra \
arbFlyLifeMedianRatio arbFlyLifeMedianExps
# cvs add and check in arbMed.ra
###########################
# REGENERATING FLY LIFE-CYCLE TABLES. (DONE 5/12/2006 ANDY)
hgsql hgFixed -e "rename table kimWormLifeAllRatio to kimWormLifeAllRatio_old"
hgsql hgFixed -e "rename table kimWormLifeMedianExps to kimWormLifeMedianExps_old"
hgsql hgFixed -e "rename table kimWormLifeMedianRatio to kimWormLifeMedianRatio_old"
# The scopDes table, which is used by the SuperFamily column in hgNear.
mkdir /cluster/store1/scop
cd /cluster/store1/scop
wget http://scop.mrc-lmb.cam.ac.uk/scop/parse/dir.des.scop.txt_1.63
grep -v '^#' dir.des.scop.txt* > scopDes.txt
hgsql hgFixed < ~/kent/src/hg/lib/scopDes.sql
echo "load data local infile 'scopDes.txt' into table scopDes;" | hgsql hgFixed
# The Yeast Cell Cycle Time Course from Cho RJ et al 1998
cd /cluster/data/sacCer1/download/systematic_results/expression_data
hgGnfMicroarray yeastChoCellCycleExps yeastChoCellCycle \
Cho_et_al_full_data.txt -chip=affyYeast \
-chopName=/ \
-url=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html \
-ref=http://www.pnas.org/cgi/content/abstract/95/7/3752 \
-credit=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray yeastChoCellCycle yeastChoCellCycleRatio
# Mouse expression data by sex on Affy MOE430A arrays from
# John Rinn (john.rinn@yale.edu) et al.
cd /projects/compbio/data/microarray/rinnEtAl
hgGnfMicroarray mouseRinnSexExps mouseRinnSex rinnEtAlSpots.txt \
-chip=MOE430A \
-url=n/a \
-ref=n/a \
-credit=n/a
cd ~/kent/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray mouseRinnSex mouseRinnSexRatio
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
hgMedianMicroarray hgFixed mouseRinnSex mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedian mouseRinnSexMedianExps
hgMedianMicroarray hgFixed mouseRinnSexRatio mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedianRatio mouseRinnSexMedianExps
# D. melanogaster full euchromatic expression profile (FEEP) --
# Stolc et al. 2004.
# Loaded up absolute tables directly from files downloaded from
# http://genome.med.yale.edu/FEEP/FEEP.html --
# see /projects/compbio/data/microarray/flyFEEP/README .
# Extract ratio from absolute:
hgRatioMicroarray flyFeepAll flyFeepAllRatio
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
echo "select description,id from flyFeepAllExps" | hgsql -N hgFixed \
| sort > flyFeepMed.ra
# edit flyFeepMed.ra to collapse lines with the same initial character.
# median absolute:
hgMedianMicroarray hgFixed flyFeepAll flyFeepAllExps flyFeepMed.ra \
flyFeepMedian flyFeepMedianExps
# median ratios:
hgMedianMicroarray hgFixed flyFeepAllRatio flyFeepAllExps flyFeepMed.ra \
flyFeepMedianRatio flyFeepMedianExps
# cvs add and check in flyFeepMed.ra
# Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22
mkdir -p /projects/compbio/data/microarray/shyamsundarEtAl
cd /projects/compbio/data/microarray/shyamsundarEtAl
wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptsetno_3130.tar.gz
wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptset_3130.meta
tar xfz exptsetno_3130.tar.gz
rm exptsetno_3130.tar.gz
mkdir spots
cat << _EOF_ > cleanXls.awk
{
if (/^!/)
{
line = \$0
gsub(/\"|,/, "", line)
print line
}
else
print
}
_EOF_
for file in *.xls; do
awk -f cleanXls.awk $file > spots/$file
done
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
# The hgFixed.history table doesn't have the errata column, so add it.
# The statement is quoted because bare parentheses are a shell syntax error.
echo "alter table history add column errata varchar(255)" | hgsql hgFixed
hgStanfordMicroarray -dataField="Normalized Ch2 Intensity (Median)" \
hgFixed humanNormal humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots
hgStanfordMicroarray -dataField="Log(base2) of R/G Normalized Ratio (Mean)" \
hgFixed humanNormalRatio humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots
echo "select name from humanNormalExps" | hgsql -N hgFixed | awk "{print \"\'\"\$0\"\'\"}" > col1
echo "select id from humanNormalExps" | hgsql -N hgFixed > col2
n=`wc -l < col1`
for i in `seq 1 $n`; do echo "n/a" >> col1.5; done
paste col1 col1.5 col2 | sort | tr '\t' ' ' > humanNormal.ra
rm col1 col1.5 col2
# EDIT humanNormal.ra by hand and combine the like tissues
hgMedianMicroarray -minExps=1 hgFixed humanNormal humanNormalExps humanNormal.ra \
humanNormalMedian humanNormalMedianExps
hgMedianMicroarray -minExps=1 hgFixed humanNormalRatio humanNormalExps humanNormal.ra \
humanNormalMedianRatio humanNormalMedianExps
#### HUMAN NORMAL DATA FIXING (10/5/2006 Andy)
# Rebuild a single tab-separated matrix (data.txt) from the per-array spot
# files: fields 8 and 63 of array 13729, then field 63 of every other array,
# each read from line 23 onward.
ssh hgwdev
cd /projects/compbio/data/microarray/shyamsundarEtAl
# Array 13729 is moved out of spots/ and seeds the first two columns.
mv spots/13729.xls .
tail -n +23 13729.xls | cut -f8,63 > data.txt
echo 13729 > arrays.txt
for array in spots/*; do
    echo "$array" >> arrays.txt
    tail -n +23 "$array" | cut -f63 > newCol.txt
    paste data.txt newCol.txt > tmp.txt
    mv tmp.txt data.txt
done
# Drop rows that begin with whitespace.
sed '/^[[:space:]]/d' data.txt > tmp.txt
mv tmp.txt data.txt
# Reduce the recorded paths to bare array ids (strip "spots/" and ".xls").
sed 's/spots\///;s/\.xls.*$//' arrays.txt > tmp.txt
mv tmp.txt arrays.txt
# Look up each array's Name entry in the experiment-set metadata.
while IFS= read -r id; do
    grep -B1 -- "$id" exptset_3130.meta | grep Name | sed 's/.*=//;s/\"//g' >> names.txt
done < arrays.txt
paste arrays.txt names.txt | sort -k2,2 > tmp.txt
mv tmp.txt arrays.txt
# Build the header row while names.txt still exists: an empty first cell
# followed by the array names in data.txt column order.  paste -s joins the
# lines with tabs and keeps the final newline, so the header stays on its own
# line instead of running into the first data row.
echo "" | cat - names.txt | paste -s - > oneLine.txt
cat oneLine.txt data.txt > tmp.txt
mv tmp.txt data.txt
rm names.txt
# (copy/paste this into columnDb.ra)
# Mouse data from Zhang et al., "The functional landscape of mouse gene expression", J Biol.
# http://hugheslab.med.utoronto.ca/Zhang/
mkdir -p /cluster/store2/microarray
ln -s /cluster/store2/microarray /cluster/data/microarray
mkdir -p /cluster/data/microarray/zhangEtAl
cd /cluster/data/microarray/zhangEtAl
wget http://hugheslab.med.utoronto.ca/Zhang/expression_39309_normalized.txt
sed 's/\(XM_[0-9]\+\)\.1/\1/' expression_39309_normalized.txt > arrays.txt
hgGenericMicroarray hgFixed mouseLandscape arrays.txt
wget http://hugheslab.med.utoronto.ca/Zhang/mouse_XM_mRNA_NCBI_2.fa
sed 's/^>.*|\(XM.*\)\.1|.*$/>\1/' mouse_XM_mRNA_NCBI_2.fa > xm.fa
ssh kk9
cd /santest/scratch
mkdir andy
cd andy/
cp /cluster/data/microarray/zhangEtAl/xm.fa .
ls -1 /panasas/store/mm6/nib/* | grep -v random > chroms.lst
cat << _EOF_ > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc -fine -q=rna -noHead \$(path1) xm.fa xm.\$(root1).psl
#ENDLOOP
_EOF_
gensub2 chroms.lst single gsub spec
para create spec
para push
para time
#Completed: 22 of 22 jobs
#CPU time in finished jobs: 36298s 604.96m 10.08h 0.42d 0.001 y
#IO & Wait Time: 91s 1.52m 0.03h 0.00d 0.000 y
#Average job time: 1654s 27.57m 0.46h 0.02d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 2955s 49.25m 0.82h 0.03d
#Submission to last job: 2957s 49.28m 0.82h 0.03d
cat *.psl > xm.psl
ssh hgwdev
cd /cluster/data/microarray/zhangEtAl
cp /santest/scratch/andy/xm.psl .
hgLoadPsl -table=xmMrna mm6 xm.psl
hgMapToGene -type=psl -cds mm6 xmMrna knownGene knownToXM
echo drop table xmMrna | hgsql mm6
# REBASE 505 (4-28-2005) (Done 5/18/2005 Andy)
ssh hgwdev
# download files
curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg
curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt
# References file
tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2
tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1
paste c1 c2 | sed '/^$/d' > rebaseRefs.txt
rm c1 c2
# Load the cutters table.
hgCutters hgFixed rebase.gcg
# Load the other table.
# Empty rebaseRefs first so the load below replaces the old rows rather than
# adding to them.  The -e argument is sent to the database as SQL, so it must
# be a plain "delete" statement.
hgsql hgFixed -e "delete from rebaseRefs"
hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"
# REBASE 603 (3-1-2006) (Done 3-2-2006 Andy)
ssh hgwdev
# download files
curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg
curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt
# References file
tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2
tail +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1
paste c1 c2 | sed '/^$/d' > rebaseRefs.txt
rm c1 c2
# Load the cutters table.
hgCutters hgFixed rebase.gcg
# Load the other table.
# Empty rebaseRefs first so the load below replaces the old rows rather than
# adding to them.  The -e argument is sent to the database as SQL, so it must
# be a plain "delete" statement.
hgsql hgFixed -e "delete from rebaseRefs"
hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"
# REBASE 902 (2009) (DONE 2009-02-09, Andy)
ssh hgwdev
mkdir /hive/data/outside/rebase
cd /hive/data/outside/rebase
# Starting at line 15, drop blank lines and turn each leading "  <number>.  "
# into "<number><TAB>".  The s command needs a "/" between the pattern and
# the replacement; without it sed rejects the expression as unterminated.
tail -n +15 rebaseRefs.txt | sed '/^$/d; s/^\s\+\([[:digit:]]\+\)\.\s\+/\1\t/' > tmp
mv tmp rebaseRefs.txt
hgCutters hgFixed rebase.gcg
hgsql hgFixed -e "delete from rebaseRefs"
hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"
mkdir rebase902
mv rebase.gcg rebaseRefs.txt rebase902/
# REBASE 201 (Dec 31, 2011) (DONE 2012-01-24, Andy)
ssh hgwdev
cd /hive/data/outside/rebase
mkdir rebase201
cd rebase201/
# go to http://rebase.neb.com/rebase/rebase.f3.html
# save the GET FILE files, then scp them to this dir
ln -s link_gcgenz.txt rebase.gcg
hgCutters hgFixed rebase.gcg
# Starting at line 15, drop blank lines and turn each leading "<number>.  "
# into "<number><TAB>".  The s command needs a "/" between the pattern and
# the replacement; without it sed rejects the expression as unterminated.
tail -n +15 link_gcgref.txt | sed '/^$/d; s/^\s*\([[:digit:]]\+\)\.\s\+/\1\t/' > rebaseRefs.txt
hgsqldump -d --compact hgFixed rebaseRefs | grep -v "^SET" > rebaseRefs.sql
hgLoadSqlTab hgFixed rebaseRefs{,.sql,.txt}
# GLADSTONE hESC Novartis microarray data.
# 1. Download http://www.genmapp.org/temp/humansimpleESC.zip
# 2. Manually convert using MS access on Bob's laptop to a tab-delimited text file.
# 3. Add column names to that file manually.
ssh hgwdev
mkdir /projects/compbio/data/microarray/gladstone
cp hESC.txt /projects/compbio/data/microarray/gladstone
cd /projects/compbio/data/microarray/gladstone
cut -f1,4,6 hESC.txt | tail +2 | sort -k3,3 -k1,1 > data.1
for tiss in `cut -f3 data.1 | sort | uniq`; do
grep $tiss data.1 | cut -f1,2 | sort -k1,1 | cut -f2 > data.${tiss}.1
echo $tiss | cat - data.${tiss}.1 > data.${tiss}.2
done
paste data.*.2 > data.2
grep Lung data.1 | cut -f1 | sort > names
echo Probe | cat - names | paste - data.2 > data.3
cat << _EOF_ > fixGladstone.sed
s/_/ /;
s/Embryonicstemcell/Embryonic Stem Cell/;
s/Smoothmuscle/Smooth Muscle/;
s/Salivarygland/Salivary Gland/;
s/Lymphnode/Lymph Node/;
s/Bonemarrow/Bone Marrow/;
s/Spinalcord/Spinal Cord/;
s/Wholebrain/Whole Brain/;
s/blood/Blood/;
_EOF_
head -n1 data.3 | sed -f fixGladstone.sed > header
tail +2 data.3 | cat header - > data.4
mv data.4 generic.hESC.txt
rm data.* names header
hgGenericMicroarray hgFixed gladHumES generic.hESC.txt
hgRatioMicroarray gladHumES gladHumESRatio
# GLADSTONE
ssh hgwdev
cd /projects/compbio/data/microarray/gladstone
awk '{if ($3 == $4) print}' hESC.txt > bestQ.hESC.txt
cat << _EOF_ | hgsql hgFixed
CREATE TABLE gladHumESOtherData (
name varchar(255) not null, # Name of item
tissueQ varchar(255) not null, # Name of Q-associated tissue
qVal float not null, # Q value
hVal float not null, # H value
#Indices
INDEX(name(8)),
INDEX(tissueQ(10))
);
_EOF_
cut -f1,2,5,6 hESC.txt | tail +2 | sort -k1,1 -k3,3n \
| awk '{printf("%s\t%s\t%s\t%s\n", $4, $3, $2, $1)}' \
| uniq -f3 \
| awk '{printf("%s\t%s\t%s\t%s\n", $4, $1, $2, $3)}' \
> gladOther.txt
# Fix up the tissue column
cut -f2 gladOther.txt > tmp.tiss.1
sed -f fixGladstone.sed tmp.tiss.1 > tmp.tiss.2
cut -f1 gladOther.txt > tmp.names
cut -f3- gladOther.txt | paste tmp.names tmp.tiss.2 - \
> tmp.glad
mv tmp.glad gladOther.txt
rm tmp.*
echo "load data local infile 'gladOther.txt' into table gladHumESOtherData" | hgsql hgFixed
# PRINCETON STEM CELL ARRAYS
ssh hgwdev
mkdir /projects/compbio/data/microarray/princetonESC
cd /projects/compbio/data/microarray/princetonESC
for num in i ii iii iv v vi vii; do
wget http://stemcell.princeton.edu/affy_cluster_${num}.html
grep "td bgcolor=\"#FFFFAA\" align=center class=ssb" affy_cluster_${num}.html | sed 's/.*
\(.*\)<\/td>/\1/' > names
grep "
" affy_cluster_${num}.html | sed 's/.*right>\(.*\) <\/td>.*$/\1/' | colify 9 /dev/stdin > data
paste names data >> tmp.txt
rm names data affy_cluster_${num}.html
done
echo "~Bone Marrow RhoLo~Bone Marrow RhoHi~Bone Marrow Sca-~Bone Marrow Lin+~Fetal Liver Sca+~Fetal Liver Sca-~Fetal Liver Lin+~Neural Stem Cells~Embryonic Stem Cells" | tr '~' '\t' | cat - tmp.txt > princeton.txt
rm tmp.txt
# QA push cghNci60Exps on 2006-02-07 to rr. Table/data previously missing (Jen)
# QA re-push rosChr22Dat on 2006-02-08 to fix table formatting/timestamps (Jen)
# AFFY ALL EXON HUMAN ARRAYS (INCLUDES TABLES ON HG17 AND HG18) (Done 3/15/2006, Andy)
# Chuck put them in tab-delimited file in ~sugnet
ssh hgwdev
cd /projects/compbio/data/microarray
mkdir affyHumanExon
cd affyHumanExon/
cp ~sugnet/plier-gcbg-sketch.summary.txt .
sed -e "s/huex_wta_//g" -e "s/\.CEL//g" plier-gcbg-sketch.summary.txt > data.txt
hgGenericMicroarray hgFixed affyHumanExon data.txt
# Chuck put probe data into two tables in hg17.
# Grab the bed first, while the table still has its original name: skip the
# column-header row and keep fields 2-7.  (A trailing "|" before the
# redirect would run an empty command and leave hg17.probes.bed empty.)
hgsql hg17 -e "select * from affyHuEx1" | tail -n +2 | cut -f2-7 > hg17.probes.bed
# Change the original name because a lot got started
# without keeping Chuck's naming convention in mind. oh well.
hgsql hg17 -e "rename table affyHuEx1 to affyHumanExonProbes"
hgsql hg17 -e "rename table affyHuEx1Annot to affyHumanExonProbeAnnot"
# Lift to hg18
liftOver hg17.probes.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain hg18.probes.bed hg18.unMapped
# How many didn't get lifted (out of 1.4 million)?
wc -l hg18.unMapped
# 276 hg18.unMapped
# That's not bad at all. 99.99% of them lifted fine.
# Load the hg18 probe bed. Change the name of the hg17 one.
hgLoadBed hg18 affyHumanExonProbes hg18.probes.bed
# Deal with that extra annotation table of Chuck's. I made a new autosql
# which almost matches it except for the name/probeSet fields.
# First copy it out of hg17 and into a file with the new column order.
hgsql hg17 -e "select probesetId,numIndependentProbes,exonClustId,numNonOverlapProbes,probeCount,transcriptClustId,probesetType,numXHybeProbe,psrId,level,evidence,bounded,cds from affyHumanExonProbeAnnot" \
| tail +2 > annot.tab
# Load that into hgFixed and change the name.
hgLoadSqlTab hgFixed affyAllExonProbe ~/kent/src/hg/lib/affyAllExonProbe.sql annot.tab
hgsql hgFixed -e "rename table affyAllExonProbe to affyHumanExonProbeAnnot"
# Make ratio table for the microarray
hgRatioMicroarray affyHumanExon affyHumanExonRatio
# Merge probe beds with array data and load those beds.
bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed
bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed
hgLoadBed hg17 affyHumanExon hg17.bed
hgLoadBed hg18 affyHumanExon hg18.bed
# Create human-level trackDb entry and affyHumanExon.html
# and check into cvs.
###### AFFY HUMAN EXONS (COMPLETE DATA) (DONE 7-21-2006, Andy)
ssh hgwdev
cd /projects/compbio/data/microarray/affyHumanExon/
mkdir moreData
cd moreData/
ssh bark
cd /scratch
cp forAndy/* /projects/compbio/data/microarray/affyHumanExon/moreData
exit
sed -e "s/huex_wta_//g" -e "s/\.CEL//g" exonData.vs.tab > data.txt
hgGenericMicroarray hgFixed affyHumanExon data.txt
hgsql hgFixed -e "select * from affyHumanExonExps" | sed "/^\+/d" | tail +2 | sed "s/_.,/,/" > newExps.tab
hgsql hgFixed -e "delete from affyHumanExonExps"
hgsql hgFixed -e "load data local infile 'newExps.tab' into table affyHumanExonExps"
cd ~/kent/src/hg/makeDb/hgRatioMicroarray/
# Make file affyHumanExon.ra in the medSpec style.
hgRatioMicroarray -minAbsVal=0 -clump=affyHumanExon.ra affyHumanExon affyHumanExonRatio
bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed
bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed
hgLoadBed hg17 affyHumanExon hg17.bed
hgLoadBed hg18 affyHumanExon hg18.bed
# Copied affyHumanExon to hg16 (DONE 10-12-2006, Andy)
cd /cluster/data/hg16/bed/
mkdir affyHumanExon
cd affyHumanExon/
echo "select name,expCount,expScores from affyHumanExon" | hgsql hg17 | tail +2 > expdata.tab
cp ~/kent/src/hg/lib/expData.sql .
hgLoadSqlTab hgFixed expData expData.sql expdata.tab
bedMergeExpData hgFixed.expData hg16.affyHuEx1 hg16.bed
hgLoadBed hg16 affyHumanExon hg16.bed
hgsql -e 'drop table expData' hgFixed
# QA push new cutters and rebaseRefs tables (04-06-2006: ASZ).
### load ncbi taxonomy tables (04-11-2006: Robert).
mkdir /cluster/store5/taxonomy
cd /cluster/store5/taxonomy
ln /cluster/store5/taxonomy /cluster/data/taxonomy -s
wget ftp://ftp.taxon.nih.gov/pub/taxonomy/taxdump.tar.gz
tar xvfz taxdump.tar.gz
sed -e 's/\t|\t/~/g' names.dmp |sed -e 's/\t|//g' |awk -F~ 'length($3)<2{OFS="\t";print $2,$1,$4}length($3)>=2{OFS="\t";print $3,$1,$4}' > taxonName.txt
sed -e 's/\t|\t/~/g' division.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonDivision.txt
sed -e 's/\t|\t/~/g' gencode.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonGencode.txt
# Written as taxonNode.txt because that is the file loaded into the taxonNode table below.
sed -e 's/\t|\t/~/g' nodes.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonNode.txt
pushd ~/kent/src/hg/lib
autoSql taxonNode.as taxonNode -dbLink
autoSql taxonXref.as taxonXref -dbLink
autoSql taxonName.as taxonName -dbLink
autoSql taxonGeneticCode.as taxonGeneticCode -dbLink
autoSql taxonDivision.as taxonDivision -dbLink
mv taxon*.h ../inc
make
#edit .sql files to add indexes
hgsql hgFixed < taxonName.sql
hgsql hgFixed < taxonNode.sql
hgsql hgFixed < taxonDivision.sql
hgsql hgFixed < taxonGeneticCode.sql
popd
hgsql hgFixed -e "load data local infile 'taxonName.txt' into table taxonName;"
hgsql hgFixed -e "load data local infile 'taxonNode.txt' into table taxonNode"
hgsql hgFixed -e "load data local infile 'taxonDivision.txt' into table taxonDivision;"
hgsql hgFixed -e "load data local infile 'taxonGencode.txt' into table taxonGeneticCode;"
echo "select o.name, n.taxon as ncbi_taxon, n.name , toGenus from sp060115.taxon t, hgFixed.taxonName n, organism o where o.name = n.name and n.taxon = t.id order by toGenus;" | hgsql hg17 -N -B > taxonXref.txt
hgsql hgFixed -e "load data local infile 'taxonXref.txt' into table taxonXref;"
#--**************************************************************************
#-- This is the NCBI genetic code table
#-- Initial base data set from Andrzej Elzanowski while at PIR International
#-- Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI
#-- Base 1-3 of each codon have been added as comments to facilitate
#-- readability at the suggestion of Peter Rice, EMBL
#-- Later additions by Taxonomy Group staff at NCBI
#--
#-- Version 3.9
#-- Code 14 differs from code 9 only by translating UAA to Tyr rather than
#-- STOP. A recent study (Telford et al, 2000) has found no evidence that
#-- the codon UAA codes for Tyr in the flatworms, but other opinions exist.
#-- There are very few GenBank records that are translated with code 14,
#-- but a test translation shows that retranslating these records with code
#-- 9 can cause premature terminations. Therefore, GenBank will maintain
#-- code 14 until further information becomes available.
#--
#-- Version 3.8
#-- Added GTG start to Echinoderm mitochondrial code, code 9
#--
#-- Version 3.7
#-- Added code 23 Thraustochytrium mitochondrial code
#-- formerly OGMP code 93
#-- submitted by Gertraude Berger, Ph.D.
#--
#-- Version 3.6
#-- Added code 22 TAG-Leu, TCA-stop
#-- found in mitochondrial DNA of Scenedesmus obliquus
#-- submitted by Gertraude Berger, Ph.D.
#-- Organelle Genome Megasequencing Program, Univ Montreal
#--
#-- Version 3.5
#-- Added code 21, Trematode Mitochondrial
#-- (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)
#-- Added code 16, Chlorophycean Mitochondrial
#-- (TAG can translated to Leucine instaed to STOP in chlorophyceans
#-- and fungi)
#--
#-- Version 3.4
#-- Added CTG,TTG as allowed alternate start codons in Standard code.
#-- Prats et al. 1989, Hann et al. 1992
#--
#-- Version 3.3 - 10/13/95
#-- Added alternate intiation codon ATC to code 5
#-- based on complete mitochondrial genome of honeybee
#-- Crozier and Crozier (1993)
#--
#-- Version 3.2 - 6/24/95
#-- Code Comments
#-- 10 Alternative Ciliate Macronuclear renamed to Euplotid Macro...
#-- 15 Bleharisma Macro.. code added
#-- 5 Invertebrate Mito.. GTG allowed as alternate initiator
#-- 11 Eubacterial renamed to Bacterial as most alternate starts
#-- have been found in Achea
#--
#--
#-- Version 3.1 - 1995
#-- Updated as per Andrzej Elzanowski at NCBI
#-- Complete documentation in NCBI toolkit documentation
#-- Note: 2 genetic codes have been deleted
#--
#-- Old id Use id - Notes
#--
#-- id 7 id 4 - Kinetoplast code now merged in code id 4
#-- id 8 id 1 - all plant chloroplast differences due to RNA edit
#--
#--*************************************************************************
#
#Genetic-code-table ::= {
# {
# name "Standard" ,
# name "SGC0" ,
# id 1 ,
# ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "---M---------------M---------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Vertebrate Mitochondrial" ,
# name "SGC1" ,
# id 2 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
# sncbieaa "--------------------------------MMMM---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Yeast Mitochondrial" ,
# name "SGC2" ,
# id 3 ,
# ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "----------------------------------MM----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
# Mitochondrial; Mycoplasma; Spiroplasma" ,
# name "SGC3" ,
# id 4 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "--MM---------------M------------MMMM---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Invertebrate Mitochondrial" ,
# name "SGC4" ,
# id 5 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
# sncbieaa "---M----------------------------MMMM---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" ,
# name "SGC5" ,
# id 6 ,
# ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Echinoderm Mitochondrial; Flatworm Mitochondrial" ,
# name "SGC8" ,
# id 9 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Euplotid Nuclear" ,
# name "SGC9" ,
# id 10 ,
# ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Bacterial and Plant Plastid" ,
# id 11 ,
# ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "---M---------------M------------MMMM---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Alternative Yeast Nuclear" ,
# id 12 ,
# ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "-------------------M---------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Ascidian Mitochondrial" ,
# id 13 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
# sncbieaa "---M------------------------------MM---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
# name "Alternative Flatworm Mitochondrial" ,
# id 14 ,
# ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
# name "Blepharisma Macronuclear" ,
# id 15 ,
# ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
# name "Chlorophycean Mitochondrial" ,
# id 16 ,
# ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
# name "Trematode Mitochondrial" ,
# id 21 ,
# ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
# name "Scenedesmus obliquus Mitochondrial" ,
# id 22 ,
# ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "-----------------------------------M----------------------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
# name "Thraustochytrium Mitochondrial" ,
# id 23 ,
# ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
# sncbieaa "--------------------------------M--M---------------M------------"
# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# }
#}
##########################################################################
# Added Zebrafish microarray data (DONE, 2006-06-10, hartera)
# From Leonard Zon's group at the Children's Hospital, Boston
# Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
# Data is normalized and log2 transformed, then centered on mean of 0.
# Changed table names and reloaded MedianExps table so that the extras has
# the strain plus time point for the name otherwise the average is taken
# over all time points for a strain for the track display
# when Tissue Averages is selected. (DONE, 2006-07-30, hartera)
# Changed so that the extras column for the MedianExps table has the
# developmental stage so that an average is taken across all strains for
# each stage when Tissue Averages is selected.
# (Jim recommended displaying it this way and then it also fits in with the
# current framework for this type of track).
# Also added the strain name and stage to the extra column for the
# Experiments tables (AllExps and MedianExps) so that when Chip ID is
# selected then all of these are shown. (DONE, 2006-08-11, hartera)
# Added absolute data (before logs were taken). (DONE, 2006-09-19, hartera)
# The absolute value data was centered on a mean of 0. The log data was
# the log2 transformed normalized data, centered on a mean of 0.
# This section now OBSOLETE so removed. See section below on UPDATE of
# zebrafish microarray data.
##########################################################################
# UPDATE the Zebrafish microarray data (DONE, 2006-06-16 - 2006-10-18, hartera)
# From Leonard Zon's group at the Children's Hospital, Boston
# Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
# Data is Loess normalized absolute values. Then use microarray processing
# programs to create ratio tables.
# The new data set was obtained so that the ratios could be calculated
# directly from the normalized absolute data. The ratios are calculated as
# the value for a probeset in one array to the median value across all arrays
# for that probeset and then a log2 is taken. This allows comparison
# between arrays that may differ due to technical or biological differences.
# RE-CREATE tables. Data was log2 already so antilog the values to get
# absolute values and then pass through the microarray processing programs.
# (DONE, 2007-01-05 - 2007-01-08, hartera)
# Stage the raw array file in its own directory under the shared microarray
# data area on hgwdev.
ssh hgwdev
mkdir /projects/compbio/data/microarray/zebrafishWT
cd /projects/compbio/data/microarray/zebrafishWT
# copy the data here received by e-mail and unzip
unzip wt34.loessNorm.absval.2006-10-12.zip
# Rename the extracted file. Despite "absval" in the delivered name, the
# notes above say the values were already log2, hence "Log2" here.
mv wt34.loessNorm.absval.2006-10-12.txt wtAffyNormLog2.txt
dos2unix wtAffyNormLog2.txt
# format.pl filters the expression table read from stdin.  Lines that do not
# match /at/ are treated as the header (presumably every probeset ID line
# contains "at" -- confirm against the data): each column name of the form
# <strain>.<somites>.<hpf>.<n>.<n> is rewritten to <strain>-<N>-somites or
# <strain>-<N>-hpf.  All other lines are passed through unchanged.
# The here-document terminator repeats the quoted word ('EOF'), as tcsh
# requires.
cat << 'EOF' > format.pl
#!/usr/bin/perl -w
use strict;
# Read line by line from stdin (or from files named on the command line).
while (<>)
{
    # reformat file. change Tu to TU and remove experiment name from the
    # column headings and translate the name to something human readable.
    my ($f, @a, $n, $strain, $somites, $hpf, $fullName);
    $f = $_;
    if ($f !~ /at/)
    {
        @a = split(/\t/, $f);
        foreach $n (@a)
        {
            $fullName = "";
            $somites = 0;
            $hpf = 0;
            # columns that do not look like an experiment name (e.g. the
            # first, probeset ID, heading) print nothing
            if ($n =~ /^([A-Za-z]+)\.([0-9]+)\.([0-9]+)\.[0-9]+\.[0-9]+/)
            {
                $strain = $1;
                $somites = $2;
                $hpf = $3;
                $strain =~ s/Tu/TU/;
                # a non-zero somite count takes precedence over hpf
                if ($somites > 0)
                {
                    $fullName = $strain . "-" . $somites . "-somites";
                }
                elsif ($hpf > 0)
                {
                    $fullName = $strain . "-" . $hpf . "-hpf";
                }
                print "\t$fullName";
            }
        }
        print "\n";
    }
    else
    {
        print $f;
    }
}
'EOF'
# << emacs
chmod +x format.pl
perl format.pl < wtAffyNormLog2.txt > zebrafishWTNormLog2.txt
# antilog the values, log is base 2
# cnvToAntilog copies column 1 unchanged and replaces every other
# tab-separated column value x with 2^x.
cat << 'EOF' > cnvToAntilog
#!/usr/bin/awk -f
BEGIN {
FS = "\t"
RS = "\n"
ORS=""
}
{
print $1 "\t"
x=2
while (x < NF) {
print 2^$x "\t"
x++
}
print 2^$NF "\n"
}
'EOF'
chmod +x cnvToAntilog
# run script and skip header line in file
# (the script name must match the file created above: cnvToAntilog)
tail -n +2 zebrafishWTNormLog2.txt | cnvToAntilog > tmp.txt
# add back header line:
head -n 1 zebrafishWTNormLog2.txt > header
cat header tmp.txt > zebrafishWTNormAbs.txt
# Then load the data into hgFixed using hgGnfMicroarray and use options
# to set the url, ref, and credit to "n/a" and chip to Zebrafish.
# Need to use this program to get 3 extras needed for hgMedianMicroarray
# No need to round the values this time as they are larger and have
# a larger range.
# Create the main expRecord table and the expData table for the
# absolute measurements
hgGnfMicroarray zebrafishZonWTAllExps zebrafishZonWTAll \
zebrafishWTNormAbs.txt -chip=Zebrafish -url=n/a -ref=n/a -credit=n/a
# Changed the Exps table so that the extras column for the MedianExps table
# has the strain and developmental stage in the second field so that an
# average is taken across all strains for each stage when Tissue Averages
# is selected.
# (Jim recommended displaying it this way and then it also fits in with the
# current framework for this type of track).
hgsql -N -e 'select name, extras from zebrafishZonWTAllExps;' hgFixed \
> zfishWTExps.extras
# cnvExtras.pl replaces the first "n/a" on each input line with the third
# comma-separated field of that line.
cat << 'EOF' > cnvExtras.pl
#!/usr/bin/perl -w
use strict;
# Read line by line from stdin (or from files named on the command line).
while (<>) {
    my ($line, @extras);
    $line = $_;
    @extras = split(/,/, $line);
    $line =~ s/n\/a/$extras[2]/;
    print $line;
}
'EOF'
chmod +x cnvExtras.pl
cnvExtras.pl < zfishWTExps.extras > zfishWTExps.extras.new
# create set of mySQL statements from this to update the AllExps table
# to include the name in the second field of extras - same as in the third
# field. This is used for display when the "Arrays Grouped By Replicate
# Medians" (or Means) is selected from the track controls on the
# description page.
# One statement per input line (column 1 = name, column 2 = new extras):
#   update zebrafishZonWTAllExps set extras = "<extras>" where name = "<name>";
# The awk program is kept on a single line: an awk string constant cannot
# contain a raw newline.
awk 'BEGIN {FS = "\t"} {printf "update zebrafishZonWTAllExps set extras = \"%s\" where name = \"%s\";\n", $2, $1}' \
zfishWTExps.extras.new > zfishWTExpsNewExtras.sql
hgsql hgFixed < zfishWTExpsNewExtras.sql
# Convert these to ratios using the median of the absolute values
# across all experiments to be the denominator for each probeset.
# minAbsVal is 0 here as no value in this dataset is less than 1 and the
# default for this parameter is 20.
hgRatioMicroarray -minAbsVal=0 zebrafishZonWTAll zebrafishZonWTAllRatio
# Create the .ra file for the Median tables
hgsql -N -e 'select extras, id from zebrafishZonWTAllExps;' hgFixed \
> zfishWTExps
# remove extra information and leave experiment name
# NOTE(review): both in-place edits use the same .bak suffix, so the second
# one overwrites the backup of the original query output.
perl -pi.bak -e 's/Zebrafish,[A-Za-z]+\-[0-9]+\-[a-z]+,//' zfishWTExps
perl -pi.bak -e 's/,//' zfishWTExps
# alter script so that name for each experiment in column 2 is not just the
# strain but the strain plus time point (same as first column). This goes into
# the extras column for zebrafishWTMedianExps and is used for Tissue Averages
# display for the array data track. Otherwise an average is taken for the
# strain (hartera, 2006-07-30).
# change so that column 2 is the time point so that an average of time points
# is taken for the "Tissue Averages" Display (hartera, 2006-08-11)
# cnvToMedian groups the values of input column 2 by the name in column 1 and
# prints one tab-separated line per name: the name, the 2nd and 3rd
# "-"-separated parts of the name joined together (e.g. "14somites"), and the
# space-separated list of grouped values.
cat << 'EOF' > cnvToMedian
#!/usr/bin/awk -f
BEGIN {
FS = "\t";
OFS = "\t";
}
{
data[$1] = data[$1] " " $2;
}
END {
for (id in data) {
split(id, a, "\\-");
print id, a[2]a[3], substr(data[id], 2);
}
}
'EOF'
# << emacs
chmod +x cnvToMedian
cnvToMedian zfishWTExps > zfishZonWTMedian.ra
# re-order the *.ra file as this determines the order of display
# (sorted 14somites lines first, then 15somites lines, then hpf lines)
sort zfishZonWTMedian.ra | grep "14somites" > tmp.ra
sort zfishZonWTMedian.ra | grep "15somites" >> tmp.ra
sort zfishZonWTMedian.ra | grep "hpf" >> tmp.ra
mv tmp.ra zfishZonWTMedian.ra
# keep a copy of the .ra file in the source tree and run from there
cp zfishZonWTMedian.ra ~/kent/src/hg/makeDb/hgMedianMicroarray
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
# Take the median value over multiple replicants and put in this table:
hgMedianMicroarray hgFixed zebrafishZonWTAllRatio zebrafishZonWTAllExps \
zfishZonWTMedian.ra zebrafishZonWTMedianRatio \
zebrafishZonWTMedianExps -minExps=1
# Make a median version of the absolute experiments:
hgMedianMicroarray hgFixed zebrafishZonWTAll zebrafishZonWTAllExps \
zfishZonWTMedian.ra zebrafishZonWTMedian zebrafishZonWTMedianExps -minExps=1
# get distribution of MedianRatio scores:
hgsql -N -e 'select * from zebrafishZonWTMedianRatio;' hgFixed > medRatioData
# keep only the third column (presumably the comma-separated score list --
# confirm against the table schema) and put each value on its own line
awk '{print $3}' medRatioData > medRatioData2
perl -pi.bak -e 's/,/\n/g' medRatioData2
textHistogram -real -binSize=0.2 -maxBinCount=1100 -minVal=-200 \
medRatioData2 > histMedRatio.out
# from this histogram, see that most values fall between -2 to +2 so set the
# trackDb for the Affy Zon Wild Type Array track to have expScale of 2.0
# and expStep to 0.2 for the log scale to display the ratios in this track.
##########################################################################
#The Mouse GNF Expression Atlas 2 (2004)
##########################################################################
# Updated gv* tables for the Locus Variants tracks
# (Belinda Giardine Sept 2006)
# This track is now available for hg17 and hg18, only the gvPos table needs to
# be redone for each build unless new mutations are added. This load changes
# the schema (strand, label for gvPos) and adds a new LSDB (BTKbase) and more
# sanity checks on all the data causing some mismapped variants to be
# discarded.
##########################################################################
# mgcMBLabValid - Load of Genbank accession that are in the Brent lab clone
# validation database. This contains both human and mouse clones. Since
# the Brent lab is no longer doing MGC validations, this set is fixed
# and shared by all mouse and human assemblies. (2006-10-26 markd)
mkdir -p /cluster/data/genbank/data/download/mgcMBLab
cd /cluster/data/genbank/data/download/mgcMBLab
# save list of 41805 accessions received from brent lab as
# mgcMBLabValid.2006-10-25.acc
# load the accession list into hgFixed.mgcMBLabValid using the table
# definition in mgcMBLabValid.sql
hgLoadSqlTab hgFixed mgcMBLabValid ~/compbio/genbank/kent/src/hg/lib/mgcMBLabValid.sql mgcMBLabValid.2006-10-25.acc
# compress the accession list once it is loaded
gzip mgcMBLabValid.2006-10-25.acc
##########################################################################
# ZEBRAFISH DEVELOPMENTAL ARRAYS FROM GENOME INSTITUTE OF SINGAPORE (GIS)
# Data from Article:
# Transcriptome Analysis of Zebrafish Embryogenesis Using Microarrays Mathavan
# S, Lee SGP, Mak A, Miller LD, Murthy KRK, et al. PLoS Genetics Vol. 1, No. 2,
# e29, pages 260-276 doi:10.1371/journal.pgen.0010029
# Contact: Sinnakaruppan Mathavan
# Downloaded expression data from
# http://giscompute.gis.a-star.edu.sg/~govind/zebrafish/data_download.html
# after clicking on link to download largest dataset (12.9 MB):
# Gene expression data showing the expression profile during different stages
# of zebrafish embryonic development for the genes selected from the array are
# presented (Compugen array). Each value represents an average performance of
# 2-4 replicates. GenBank id of the selected gene is given as the identifier.
# Total RNA from different stages of embryonic development, adult male and
# female were pooled in equal concentrations and used as reference RNA. The
# genes were annotated using Zebrafish Chip Annotation Database.
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/zebrafishGISDev
# Downloaded data and saved in Excel as a tab-separated text file:
# PLOSGISData.txt
# This file contains Genbank accessions and the expression values
# which are log2 based.
##########################################################################
# Belinda Giardine April 2007
# gv* tables:
# reload tables, additions and corrections, details in hg18 doc
#############################################################################
## Add CTD data
mkdir /hive/data/outside/ctdbase
cd /hive/data/outside/ctdbase
# Download chem_gene_ixns.tsv from CTD site, http://ctd.mdibl.org/downloads/.
# NOTE(review): the wget below fetches from ctdbase.org, not the URL named
# in the line above -- confirm which location is current.
wget "http://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz"
gunzip CTD_chem_gene_ixns.tsv.gz
# Create a scratch database and load the raw interaction table into it.
# NOTE(review): the database is created through an hg18 connection while
# the join below runs against hg19's kgXref.
hgsql hg18 -e 'create database ctdBraney'
hgsql ctdBraney < ~/kent/src/hg/lib/chem_gene_ixns.sql
hgsql ctdBraney -e 'load data local infile "CTD_chem_gene_ixns.tsv" into table chem_gene_ixns'
# create sorted data
# One row per (gene symbol, chemical) with the number of distinct
# interactions; sorted by gene symbol, then interaction count descending,
# then the fourth field.
hgsql hg19 -N -e \
'select x.geneSymbol, ChemicalId, count(distinct Interaction), ChemicalName from kgXref x, ctdBraney.chem_gene_ixns c where x.geneSymbol=c.GeneSymbol group by x.geneSymbol, ChemicalId'|\
sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab
hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql
hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted'
+#####
+# Jaspar 2022 PFM (DONE 2021/12/08 braney)
+#####
+mkdir -p /hive/data/outside/jaspar/2022/all
+cd /hive/data/outside/jaspar/2022/all
+wget "https://jaspar.genereg.net/download/data/2022/CORE/JASPAR2022_CORE_non-redundant_pfms_jaspar.zip"
+unzip JASPAR2022_CORE_non-redundant_pfms_jaspar.zip
+# Convert every .jaspar count matrix into one tab-separated line of
+# jasparMotif.tab: motif id, number of positions, then one comma-separated
+# list of per-position frequencies for each matrix row, in file order
+# (presumably A, C, G, T -- confirm against the JASPAR files).
+# Requires gawk: the transposes use arrays of arrays (mat[i][j]).
+# A mktemp scratch file replaces the fixed /tmp/1, and the transpose reads
+# that file directly rather than sitting behind a pipeline whose output it
+# ignored.
+tmpCounts=$(mktemp)
+for i in *.jaspar; do
+    # motif id: first word of the header line without the leading '>'
+    f=$(awk '{print $1; exit}' "$i" | tr -d '>')
+    # scratch line 1: per-position totals summed over all matrix rows
+    awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' "$i" > "$tmpCounts"
+    # remaining scratch lines: the raw counts, one line per matrix row
+    tail -n +2 "$i" | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) printf "%d\t", $ii; printf "\n"}' >> "$tmpCounts"
+    # transpose (one line per position: total, then the counts), divide each
+    # count by its position total, then transpose back into one output line
+    awk '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%d\t",mat[ii][jj]; printf "\n";}}' "$tmpCounts" \
+        | awk '{for (ii=2; ii <= NF; ii++) printf "%f\t",$ii/$1; printf "\n";}' \
+        | awk -v name="$f" '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {printf "%s\t%d\t",name,NR;for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%g,",mat[ii][jj]; printf "\t";}} END {printf "\n"}'
+done > jasparMotif.tab
+rm -f -- "$tmpCounts"
+
+hgLoadSqlTab hgFixed jasparCore2022 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab
|