d87e5f63bc000d560c21bca638b6d5f3dee0173d
kate
Wed Apr 3 12:06:45 2019 -0700
Add scripts and make docs for ENCODE 3 TF binding site tracks. refs #21139
diff --git src/hg/makeDb/doc/encodeRegHg19.txt src/hg/makeDb/doc/encodeRegHg19.txt
index 1196a19..d1a59a2 100644
--- src/hg/makeDb/doc/encodeRegHg19.txt
+++ src/hg/makeDb/doc/encodeRegHg19.txt
@@ -1,1753 +1,1754 @@
#######################################################################
# ENCODE Regulation track (In progress March 2010, very rough instructions
# as am still defining how to build track...). JK
# make root dir and link in our bad.bed file for filtering out
# blacklisted and multiple mapping regions.
mkdir -p /cluster/data/hg19/bed/wgEncodeReg
cd /cluster/data/hg19/bed/wgEncodeReg
ln -s /hive/users/kent/regulate/companion/mapping/bad.bed badRegions.bed
# Create the DNAse peak clusters subtrack.
# Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq
# linked into directory /hive/users/kent/regulate/dnase/peaks
mkdir dnase
cd dnase
mkdir peaks
ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwDnase/*.narrowPeak.gz peaks
# Process these into clusters in a bed file and load clusters into
# table.
/bin/ls -1 peaks/*.narrowPeak.gz > peak.lst
# NOTE(review): a bare 'uw02' is not a command; this line is likely missing the
# program name, i.e. 'regClusterMakeTableOfTables uw02 peak.lst peak.table'
# (compare the ans01/awgDnase01 invocations later in this doc) -- confirm.
uw02 peak.lst peak.table
regCluster peak.table /dev/null peak.bed
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed
hgLoadBed hg19 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed
# Make wgEncodeRegDnaseClusteredInput table. Start with mdbQuery, and
# then do some massaging since not completely in sync with file list.
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg19 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab
cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst
weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \
wgEncodeRegDnaseClusteredInputs.tab
# Make transcription factor binding site clusters. Start with Anshul's
# nice uniform peak calls, downloaded from
# ftp://ftp-private.ebi.ac.uk/byDataType/peaks/jan2011/spp/optimal
# using the user name and password in the ENCODE wiki at:
# http://genomewiki.ucsc.edu/EncodeDCC/index.php/Locations_of_ENCODE_Data
# Put all of the ones that actually are based on two replicates into
# /cluster/data/hg19/bed/wgEncodeReg/tfbs/peaks
# Once that is done do a little work towards making the cluster input.
cd /cluster/data/hg19/bed/wgEncodeReg/tfbs
/bin/ls peaks/*.narrowPeak.gz > peak.lst
regClusterMakeTableOfTables ans01 peak.lst partial.table
# Unfortunately at this point only about 90% of the peak files actually
# have metaDb data associated with them in metaDb, so run C program that
# takes it from metaDb if possible, otherwise tries to get it from file
# names.
regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table
# One more complication - there are often several cases where there is
# more than one data set on the same cell+treatment+antibody combination.
# For good reasons hgBedToBedExps only wants one.
# This next bit filters them out based on a picks file that was hand
# created - picksFromDupes. This was based on the criteria that
# the Broad CTCF were used, and that Snyd was used in preference
# to OpenChromatin, which in turn was used in preference over Haib.
# The rationale for this is the Broad CTCF set has been in use for a
# while with no complaints, and when examining the Pol2 signals, which
# were with CTCF the main component of the Dupes, the Haib were the
# weakest and the Snyd the strongest.
cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab .
wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table
# Make three column file: fileName, cell+treatment, antibody
awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab
# Create config file out for clustering job, using a table assigning
# cell lines+treatments to single letter codes edited by hand. In
# the future may be able to automate this by having fixed upper case codes for
# tier1&2 cell lines, and then just use first letter lower case codes
# for rest. That is the pattern cellLetter2.tab follows anyway.
cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab .
regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter2.tab
# Do the actual clustering
hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps
# Load database (reloaded 3/26/2012 to fix a relatively rare bug)
hgLoadBed hg19 wgEncodeRegTfbsClustered peak.bed
hgLoadSqlTab hg19 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps
# Create inputTrackTable - six columns:
awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table | sed 's/AlnRep/PkRep/'> wgEncodeRegTfbsClusteredInputs.tab
hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab
# Handle the RNA seq, starting by pooling it and filtering out multiple
# mapping and blacklisted regions.
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqGm12878R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqH1hescR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHct116R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHsmmR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHuvecR2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqK562R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqMcf7R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqNhekR2x75Il200SigPooled.bedGraph
# NOTE(review): the next two merges repeat the Hepg2 and Helas3 merges already
# run above (same inputs, same outputs); rerunning them just regenerates the
# same pooled bedGraphs.
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \
/gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \
| bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph
# Now have to normalize. This takes two passes. The first just creates
# a tab-separated file with normalization values. A batch file is made
# by running awk on the tab file, and this batch file executes the
# second half, transforming the bedGraphs to normalized values. The
# normalization standardizes all files as if the total signal integrated to 10
# billion over the genome.
rm -f ../norm.tab
foreach i (*SigPooled.bedGraph)
echo processing $i
echo -n $i " " >> ../norm.tab
awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
end
awk '{printf("colTransform 4 %s 0 %g ../normalized/%s\n", $1, $2, $1);}' ../norm.tab > ../norm.csh
chmod a+x ../norm.csh
mkdir ../normalized
../norm.csh
# Make bigWigs
cd ../normalized
foreach i (*.bedGraph)
echo processing $i
bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
# Link into /gbdb
ln -s `pwd`/*.bw /gbdb/hg19/bbi
# Tell database about it
foreach i (*.bw)
hgBbiDbLink hg19 $i:r /gbdb/hg19/bbi/$i
end
# Clean up
cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn
rm pooled/*.bedGraph normalized/*.bedGraph
# Make metadata: (forgive the long line)
# JK TODO - add Hepg2 and Helas3 from here on down
mdbQuery out=ra "select * from hg19 where \
obj='wgEncodeCaltechRnaSeqNhlfR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2' or \
obj='wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2'" \
| sed 's/SigRep1V2/SigPooled/g' | sed 's/wgEncodeCal/wgEncodeRegTxnCal/' \
| grep -v ^replicate > mdb.ra
# Now patch mdb.ra into ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha/wgEncodeReg.ra
# At this point actually are done with this track, except for
# all the trackDb work on the overlaid histone mod tracks.
# The rest is for some coloring stuff that is not yet actually used.
# Do work to figure out how to color cells. This is based on
# hierarchical clustering of wgEncodeUwAffyAllExon.
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/cellColors
cd /hive/data/genomes/hg19/bed/wgEncodeReg
# Start with some alas time consuming data cleanup to get rid of
# overlaps - losing about 10% of the data in the process which is
# totally ok.
mkdir allExon
cd allExon
mkdir inLinks noOverlap onlyCommon
ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwAffyExonArray/*.gz inLinks
cd inLinks
foreach i (*.gz)
bedRemoveOverlap $i ../noOverlap/$i:r
end
cd ..
# Now make up a file of regions present in all inputs for further data
# cleanup
bedCommonRegions noOverlap/*.* > commonExons.bed
cd noOverlap
foreach i (*.broadPeak)
echo processing $i
bedRestrictToPositions $i ../commonExons.bed ../onlyCommon/$i
end
rm -r noOverlap
# Now pool replicates where possible
mdbQuery "select fileName,cell,replicate from hg19 where composite='wgEncodeUwAffyExonArray' and view='SimpleSignal'" -out=ra \
| sed 's/\.gz//g' > replicate.ra
encodeMergeReplicatesBatch replicate.ra onlyCommon merge.sh merged.ra merged
mkdir merged
chmod a+x merge.sh
merge.sh
# Resort merged output - somehow gets slightly messed by
# encodeMergeReplicates.
cd merged
foreach i (*.broadPeak)
echo sorting $i
sort -k 1,1 -k 2,2n $i > tmp
mv tmp $i
end
# Make file that converts fileName to cell
raToTab merged.ra -cols=fileName,cell stdout \
| sed 's/wgEncode/merged\/wgEncode/' > fileToCell.tab
# Now build the tree. Took 2.5 minutes
ls -1 merged/*.broadPeak > clusterInput.lst
regClusterTreeCells rename=fileToCell.tab clusterInput.lst cluster.tree cluster.distances
#########################################################################
# 2012 ENCODE Regulation supertrack updates: #7526 (kate)
#########################################################################
# DNase Clusters: #9954 (2013-01-11)
#
# Create the DNAse peak clusters track using AWG uniformly processed peaks (Jan 2011 freeze, Sep 2012 analysis pubs).
# See doc/encodeAwgHg19.txt
# There are 125 cell type+treatments represented -- UW, Duke, and Uw+Duke for 14 cell types
# Previous track had 74
# Redoing filtering (2014-06-24) V3 version, to include single-cell clusters
cd /hive/data/genomes/hg19/bed/wgEncodeReg
mkdir -p dnaseAwg
cd dnaseAwg
set dir = `pwd`
cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak
ls *.narrowPeak > $dir/peak.lst
# generate a normalization factor
# output format:
# 0 1 2
# wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 .
regClusterMakeTableOfTables awgDnase01 $dir/peak.lst $dir/peak.table -verbose=3 >&! $dir/regTable.out &
regClusterMakeTableOfTables awgDnase01Hg19 $dir/peak.lst $dir/peak.meta.table -verbose=3 >&! $dir/regTable.meta.out &
# Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
regCluster $dir/peak.table /dev/null $dir/peak.bed
# Read 125 sources from peak.table
# 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes
regCluster -bedSources $dir/peak.meta.table /dev/null $dir/peak.meta.bed
# filter out low scoring, single peaks in cluster and load in database
cd $dir
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
wc -l wgEncodeRegDnaseClusteredV2.bed
# 1281988 wgEncodeRegDnaseClusteredV2.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
# V3 of track -- don't filter out single celltype peaks!
# August 2014, KRR
# filter out low scoring, but retain single cell type clusters
awk '$5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV3a.bed
# 1867665 wgEncodeRegDnaseClusteredV3a.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV3a wgEncodeRegDnaseClusteredV3a.bed
# do same with metadata version, as we will actually load track with
# this additional info, for VAI integration
awk '$5 >= 100' peak.meta.bed > peak.meta.filtered.bed
wc -l peak.meta.filtered.bed
# 1867665 peak.meta.filtered.bed
# format to BED5+floatscore+sources for hgBedSources
# which will extract, uniquify, and assign ID's to sources
awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, 0, $7;}' peak.meta.filtered.bed > peak.meta.bed6
hgBedSources peak.meta.bed6 regDnase
mv regDnaseSources.tab wgEncodeRegDnaseClusteredSources.tab
# Load sources table
autoSql $HOME/kent/src/hg/lib/idName.as idName
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredSources idName.sql wgEncodeRegDnaseClusteredSources.tab
# merge files and format to BED5+sourceCount+sourceIds+sourceVals
awk '{print $8}' peak.meta.filtered.bed > peak.vals
awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, $7, $8;}' regDnase.bed | paste - peak.vals > wgEncodeRegDnaseClusteredV3.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV3 -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \
-as=$HOME/kent/src/hg/lib/bed5SourceVals.as wgEncodeRegDnaseClusteredV3.bed
# NOTE: V1 had ~1M clusters, V2 has 1.2M, V3 has 1.9M
# It would be nice if hgc sorted datasets by signal or cellType
# Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
# was: objName, cellType, treatment, replicate, lab, dateUnrestricted
# For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also,
# this data has all passed timestamp restriction, so don't need to show
# objName, cellType, treatment, lab(s)
# DISPLAY: Consider item name change to % assays represented instead of count.
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific
# (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)
# Make wgEncodeRegDnaseClusteredInputV2 table.
mdbQuery out=tab "select obj,cell,treatment,lab,dccAccession from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" > wgEncodeRegDnaseClusteredInputsV2.tab
hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputsV2 \
~/kent/src/hg/lib/clusterInputAwgDnase.sql wgEncodeRegDnaseClusteredInputsV2.tab
# Set up downloads
cd $dir
mkdir downloads
gzip -c wgEncodeRegDnaseClusteredV2.bed > downloads/wgEncodeRegDnaseClusteredV2.bed.gz
gzip -c wgEncodeRegDnaseClusteredInputsV2.tab > downloads/wgEncodeRegDnaseClusteredInputsV2.tab.gz
gzip -c wgEncodeRegDnaseClusteredV3.bed > downloads/wgEncodeRegDnaseClusteredV3.bed.gz
cp wgEncodeRegDnaseClusteredSources.tab downloads
# use (existing) ENCODE DCC downloads dir
set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered
cd $dload
# call existing stuff release1 (use ENCODE-style downloads scheme)
mkdir release1
ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz release1
cp README.txt md5sum.txt files.txt release1
# new release dir
mkdir release2
ln -s release2 beta
ln -s release2 releaseLatest
cd $dir/downloads
ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz $dload
cd $dload
ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz releaseLatest
# include older files as well
ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz releaseLatest
cp README.txt releaseLatest
cd releaseLatest
md5sum *.gz > md5sum.txt
# vim README.txt
cp README.txt md5sum.txt ..
mkdir release3
rm beta releaseLatest
ln -s release3 beta
ln -s release3 releaseLatest
cd $dir/downloads
ln wgEncodeRegDnaseClusteredV3.bed.gz wgEncodeRegDnaseClusteredSources.tab $dload
cd $dload
ln wgEncodeRegDnaseClusteredV3.bed.gz wgEncodeRegDnaseClusteredSources.tab releaseLatest
cp README.txt releaseLatest
cd releaseLatest
md5sum *.gz > md5sum.txt
# vim README.txt
cp README.txt md5sum.txt ..
# add minimal metaObject to metaDb to associate new table with (pseudo-)composite
# wgEncodeRegDnaseClustered -- this allows hgTrackUi to locate downloads
cd ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha
cat > wgEncodeRegDnaseClustered.ra << 'EOF'
metaObject wgEncodeRegDnaseClusteredV2
objType table
composite wgEncodeRegDnaseClustered
fileName wgEncodeRegDnaseClusteredV2.bed.gz
tableName wgEncodeRegDnaseClusteredV2
project wgEncode
EOF
# NOTE(review): this second 'cat >' overwrites the V2 metaObject stanza written
# just above; use 'cat >>' if the .ra file should contain both V2 and V3.
cat > wgEncodeRegDnaseClustered.ra << 'EOF'
metaObject wgEncodeRegDnaseClusteredV3
objType table
composite wgEncodeRegDnaseClustered
fileName wgEncodeRegDnaseClusteredV3.bed.gz
tableName wgEncodeRegDnaseClusteredV3
project wgEncode
EOF
#########################################################################
# Preliminary/Experimental/Older work below
#########################################################################
# ENCODE Regulation track V2: Track #7526 (In progress 2012-06-20 kate)
# This version will have:
# * Replace Tx with CSHL stranded data, pooled (JK advising, also AWG use)
# http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformRNASignal.txt
# These are from DCC wgEncodeCshlLongRnaSeq downloads (Long polyA+, whole cell),
# AWG did not pool reps or plus/minus for their hub (but we will)
# AWG has filtered for bad regions so no longer necessary
# * Possibly a 10th cell type (HMEC) for Tx and Histone
# Note: single replicate for HMEC Tx (others have 2)
# * Two additional cell lines for Histones (Hela, HepG2)
# - Note Hela & HepG2 already added to Tx track (July2012 release)
# * Histones from uniform processing: H3K4me1, H3K4me3, H3K27ac
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/)
# wgEncodeBroadHistone{cell}{histone}StdAln_?Reps.norm5.rawsignal.bw
# * Add latest TF's, and use AWG uniform peak calls
# * Add latest DNase calls (possibly use AWG uniform)
# (http://www.uwencode.org/public/rthurman/encode_integrated_open_chrom/data/dataHub)
# AWG trackDb: http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
# - can use subGroups line to determine cell type, but need to take care
# with treatments, which are in filename but not subGroups
# - NOTE: AWG includes Duke data processed like UW. Where both
# labs had data, they are merged. View is .01FDR peaks (narrow, bigbed 6+)
# Previous track: 74 cell types (2 reps), all UW. This version: 125 cell types, pooled replicates, merged UW+Duke
# Also considering adding new data types from AWG hub:
# - Short RNA's
# - AWG uniform RNA contigs
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/rna_contigs/)
# - Segmentations (chromHMM, segWay, or combined)
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/segmentations)
# - FAIRE (cell-specific, union over 25 cell types)
# (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/faire/bigBed/)
# NOTE: following Jim's methods from initial version
# make root dir and link in our bad.bed file for filtering out
# blacklisted and multiple mapping regions.
cd /hive/data/genomes/hg19/bed
mkdir wgEncodeRegV2
cd wgEncodeRegV2
ln -s /hive/data/genomes/hg19/bed/wgEncodeReg/badRegions.bed .
#(was /hive/users/kent/regulate/companion/mapping/bad.bed)
######
# Small RNA's
# For signal, pool strands and replicates
######
cd /hive/data/genomes/hg19/bed/
mkdir -p wgEncodeRegV2/shortRna/pooled
cd wgEncodeRegV2/shortRna/pooled
cat > list.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hmec Hsmm Huvec K562 Nhek Nhlf)
echo "processing ${cell}"
ls /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotal*.bigWig
end
EOF
csh list.csh >&! list.out &
# There are 6 of these overlapping with histones and long RNAs. 4 other tier2's.
cat > pool.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 K562 Nhek A549 Cd20 Imr90 Mcf7)
echo "processing ${cell}"
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep2.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep2.bigWig \
wgEncodeRegTxnCshlShortRnaSeq${cell}CellTotalRawSigPooled.bedGraph
end
EOF
csh pool.csh >&! pool.out &
# got 8 cells, two failed -- CD20 and IMR90, may be rescuable, but continuing
# for now with others to see if its worth it.
cat > big.csh << 'EOF'
foreach i (*.bedGraph)
echo processing $i
bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
EOF
csh big.csh >&! big.out &
# add to database
set dir = /gbdb/hg19/bbi/encodeRegV2
mkdir $dir
ln -s `pwd`/*.bw $dir
foreach i (*.bw)
hgBbiDbLink hg19 $i:r $dir/$i
end
######
# Handle the RNA seq, starting by pooling it and filtering out multiple
# mapping and blacklisted regions.
######
cd /hive/data/genomes/hg19/bed/
mkdir -p wgEncodeRegV2/txn/pooled
cd wgEncodeRegV2/txn/pooled
cat > poolTx.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf)
echo "processing ${cell}"
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep2.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep2.bigWig \
wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
end
EOF
csh poolTx.csh >&! poolTx.out &
# ~8 minutes per cell type
# NOTE: 25 chroms in Hepg2, H1hesc, Huvec, Nhlf (male or unknown sex)
# Add in HMEC, which has single replicates
cat > poolHmec.csh << 'EOF'
foreach cell (Hmec)
echo "processing ${cell}"
bigWigMerge -adjust=-1 \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \
/gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \
wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph
end
EOF
csh poolHmec.csh >&! poolHmec.out &
# Normalize. This takes two passes. The first creates
# a tab-separated file with normalization values. A batch file is made
# by running awk on the tab file, and this batch file executes the
# second half, transforming the bedGraphs to normalized values. The
# normalization standardizes all files as if the total signal integrated to 10
# billion over the genome.
rm -f ../norm.tab
cat > makeNorm.csh << 'EOF'
foreach i (*SigPooled.bedGraph)
echo processing $i
echo -n $i " " >> ../norm.tab
awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab
end
awk '{printf("echo %s; colTransform 4 %s 0 %g ../normalized/%s\n", $1, $1, $2, $1);}' ../norm.tab > ../norm.csh
EOF
csh makeNorm.csh >&! makeNorm.out &
# ~2 mins per cell type
mkdir ../normalized
csh ../norm.csh >&! ../norm.out &
#### STOPPED HERE
cd ../normalized
# make bigWigs
cat > big.csh << 'EOF'
foreach i (*.bedGraph)
echo processing $i
bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw
end
EOF
csh big.csh >&! big.out &
# add to database
set dir = /gbdb/hg19/bbi/encodeRegV2
mkdir $dir
ln -s `pwd`/*.bw $dir
foreach i (*.bw)
hgBbiDbLink hg19 $i:r $dir/$i
end
######
# Possible new track: FAIRE Peaks Union across 25 cells, from AWG
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p faire/peaks
cd faire/peaks
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformFaire.txt
sed -n 's/bigDataUrl /wget /p' uniformFaire.txt | grep Union > wget.csh
csh wget.csh >&! wget.out &
mv *Union*.bb encodeAwgFairePeaks25CellUnion.bb
bigBedToBed encodeAwgFairePeaks25CellUnion.bb encodeAwgFairePeaks25CellUnion.bed
# head -1 *.bed
# chr1 10033 10376
# bed 3
hgLoadBed hg19 encodeAwgFairePeaks25CellUnion encodeAwgFairePeaks25CellUnion.bed
# Read 2369615 elements of size 3 from encodeAwgFairePeaks25CellUnion.bed
######
# Create the DNAse peak clusters subtrack.
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p dnase/peaks
cd dnase
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt
sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh
csh wget.csh >&! wget.out &
# 125 files (.bb), dated Nov 22, 2011
# Sample case had ~150K peaks (by bigBedInfo)
# These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats
#$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head
# chr1 10180 10330 . 0 . 26.000000 -1 -1 -1
# NOTE: Steve Wilder is planning to add scores and make formats of these files more uniform
# rename to UCSC style and convert to bed: wgEncodeAwgDnasePeak
ls *.fdr01peaks.hg19.bb | \
perl -wpe 's/(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)/bigBedToBed $1$2$3$4$5 $1Awg$3\u\L$2\E\u\L$4\EPeak.bed/' > unbig.csh
# convert to bed and rename
csh unbig.csh >&! unbig.out
# udc couldn't read 4 bytes from wgEncodeUWDukeDnaseHepG2.fdr01peaks.hg19.bb, did read 0
# This dataset was truncated
# Repeated wget and bigBedToBed for HepG2 (8/23)
ls -1 *AwgDnase*.bed > peak.lst
# Datasets with treated cells (hints from long label)
grep '(' uniformDnase.txt | grep longLabel > treated.txt
wc -l treated.txt
# 7 datasets
# Tamoxifen -> Tam10030 (Duke) or 40HAM20nM72hr (UW), Estradiol -> Est10nm30m, Ifna4h, Andro, Hypox
# TODO: Edit to rename
# generate a normalization factor to
# convert signalValue to score (0-1000). Also note score field; narrowPeak is 7 (default)
# output format:
# 0 1 2
# wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 .
regClusterMakeTableOfTables awgDnase01 peak.lst peak.table -verbose=3 >&! regTable.out &
# Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster)
regCluster peak.table /dev/null peak.bed
#Read 125 sources from peak.table
# 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes
# filter out low scoring, single peaks in cluster and load in database
awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed
wc -l wgEncodeRegDnaseClusteredV2.bed
hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed
# Read 1281988 elements of size 5 from wgEncodeRegDnaseClusteredV2.bed
# load individual peak files as a track (needed by hgc)
# TODO: generate labels out of metadata
cat > loadDnase.csh << 'EOF'
set b = wgEncodeAwgDnase
set t = ${b}Peak
foreach f (${b}*)
set table = $f:r
hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f
set cell = `echo $f | perl -wpe 's/(wgEncodeAwgDnase[A-Z][a-z]+)(.*)Peak.bed/$2/'`
echo " track $table" >> $t.ra
echo " type narrowPeak" >> $t.ra
echo " parent $t" >> $t.ra
echo " shortLabel $cell DNase" >> $t.ra
echo " longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t.ra
echo " subGroups cellType=$cell treatment=None" >> $t.ra
echo "" >> $t.ra
end
'EOF'
# NOTE: V1 had ~1M clusters.
# Other data tables: Need to load the peak files as they are used by hgc
# These should probably also be subtracks.
# It would be nice if hgc sorted datasets by signal or cellType
# Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting)
# was: objName, cellType, treatment, replicate, lab, dateUnrestricted
# For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also,
# this data has all passed timestamp restriction, so don't need to show
# objName, cellType, treatment, lab(s)
# DISPLAY: Consider item name change to % assays represented instead of count.
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific
# (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?)
######
# Uniform histone signal for overlay wiggles
# Note: there are UW signal files as well -- omitting these for consistency,
# Check if merging would be a better option
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p histone/signal
cd histone/signal
cat > getHistones.csh << 'EOF'
foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf Hmec)
foreach histone (H3k4me1 H3k4me3 H3k27ac)
set var = "wgEncodeBroadHistone${cell}${histone}"
echo $var
curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/${var}StdAln_\[1-3\]Reps.norm5.rawsignal.bw
end
end
'EOF'
csh getHistones.csh >&! getHistones.out &
# save real files (>1k size) as curl creates info files when match used
mkdir sav
find . -name '*.bw' -a -size +1k -exec mv '{}' sav \;
rm *.bw
mv sav/* .
rmdir sav
# note: missing Hela and HepG2 H3K4me1 - these have different filename pattern (check with Anshul why)
# use curl to retrieve by pattern match as we don't have read perms on directory. Then remove non-useful files
curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHelas3H3k4me1Std_\[1-3]Reps.norm5.rawsignal.bw
# good one is '*Std_1Rep', remove 2Rep and 3Rep
rm *Hela*Std_[2-3]*
wget http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHepg2H3k4me1Std_1Reps.norm5.rawsignal.bw
######
# TFBS from Jan2011 freeze
######
# The TFBS track is a specialized type: factorSource. Requires 3 tables:
# track table: wgEncodeRegTfbsClustered (bed15)
# source table: wgEncodeRegTfbsCells
# Maps cell type (including treatment) to single letter
# Currently, there are fixed unique assignments for Tier1 and 2.
# There are all upper case.
# Tier 3 just use first char of cell name, lower-cased. Not unique at all!
# Looks like only rows 2 and 3 of the table are actually used/informative
# Ex. row 0 G GM12878 n/a n/a n/a 3 n/a,n/a/,GM12878,
# inputTrack (metadata) table: wgEncodeRegTfbsClusteredInputs
# DISPLAY: Better with target protein instead of antibody ?
# DISPLAY: Consider item showing letter for all experiments, with name colored (brown ?) if this peak
# found it the cell type
# DISPLAY: fix ordering of cell letters -- they should be sorted
# (in cellLetter.tab first, so Exps files has correct id's)
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p tfbs/peaks
cd tfbs/peaks
wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformTfbs_nocontrols.txt
# this file has metadata
# there are 3 'method' values: SPP, PeakSeq, wiggler
# there are 2 'quality' values: Good, Caution
# There are 144 antibodies (for 122 factors), 75 cell types, 95 cell types plus treatments, 457 datasets
# Version 1 of this track had 148 antibodies, 67 cell types, 95 cell type+treatments, 486 datasets
# -> basically the same data, but cleaned up
sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep Tfbs > wget.csh
sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep -v Tfbs > wget2.csh
csh wget.csh >&! wget.out &
# 416 files
csh wget2.csh >&! wget2.out &
# 41 (have Histone in filename)
# sample filename: spp.optimal.wgEncodeUwTfbsHcfaaCtcfStdAlnRep1_VS_wgEncodeUwTfbsHcfaaInputStdAlnRep1.bb
# from hub trackDb, metadata is in subGroups line:
# subGroups tier=3 cellType=HCFaa factor=CTCF antibody=CTCF lab=UW treatment=None protocol=Std quality=Good method=SPP
# Note 'quality'. Think about dropping those with quality='Caution'. Unfortunately some
# Tier1 , esp. Pol2 are only available with this quality
# Q: What does Rep1 mean ? Most are 'Rep0' (Answer from Anshul -- Rep0 are pooled, Rep1 were single-rep but adequate qual)
# (Jim's methods excluded those not based on 2 replicates)
# Q: Why are some datasets 'StdAln' and some are 'Aln' ? (Answer from Anshul -- names are from UCSC)
# Q: How is quality assigned ?
# rename to UCSC style and convert to bed: wgEncodeAwgTfbsPeak
cd ..
ls peaks/spp.optimal.*.bb | \
perl -wpe 's^peaks/(spp.optimal.)(wgEncode)(\S+)(Tfbs)(\S+)(StdAln|Aln)(Rep\d_VS\S+)^bigBedToBed peaks/$1$2$3$4$5$6$7 peaks/$2Awg$4$3$5Peak.bed^' > unbig.csh
# convert to bed and rename
csh unbig.csh >&! unbig.out &
# NOTE: ~10% (49 of 457 total) are dupes from different labs
# Factors are: CFOS, CMYC, CTCF, E2F6, EBF1, FOXA1, GATA2, JUND, MAFK
# MAX, POL2, RAD21, SRF, YY1
ls -1 peaks/*.bed > peak.lst
######
# TFBS from July 2012 freeze
# Uniform processing with slightly more relaxed threshold (2% FDR vs 1% based on
# input Anshul received from users. And there are more data sets (690 after removing
# quality failures vs. 495 in previous freeze)
# Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering
# (as in previous track).
######
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2
mkdir -p tfbs2012
cd tfbs2012
set dir = `pwd`
mkdir logs
# use high-quality uniform peaks used to build Uniform TFBS track
# (see doc/encodeAwgHg19.txt)
# This will require much less work than previous build, as much of the processing
# was done to make this predecessor track (uniform naming & normalized scoring, extracting metadata)
ln -s /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads peaks
ls peaks/*.narrowPeak.gz > peak.lst
wc -l peak.lst
# 690 peak.lst
# extract metadata for files, to format:
# filename cell[+treatment] antibody
cat > getMeta.csh << 'EOF'
foreach p (`cat $1`)
set obj = $p:t:r:r
echo -n "$p\t"
# note: must strip off treatment=None
mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,antibody stdout | \
perl -wne '($cell, $treat, $ab) = split; $cellTreat = $cell . "+" . $treat; \
$cellTreat =~ s/\+None//; print $cellTreat, "\t", $ab, "\n";'
end
'EOF'
csh getMeta.csh peak.lst > fileCellAb.withDupes.tab
# alternate extraction of metadata - include lab, so we can be inclusive of max data
# filename cell[+treatment]+lab antibody
cat > getMetaBig.csh << 'EOF'
foreach p (`cat $1`)
set obj = $p:t:r:r
echo -n "$p\t"
# note: must strip off treatment=None
mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody stdout | \
perl -wne '($cell, $treat, $lab, $ab) = split; $cellTreat = $cell . "+" . $treat . "+" . $lab; \
$cellTreat =~ s/\+None//; print $cellTreat, "\t", $ab, "\n";'
end
'EOF'
csh getMetaBig.csh peak.lst > fileCellAb.tab
# assign upper case codes to cell types in tiers 1, 2, and 2.5
# (tier 3's will all be assigned lower case first char)
# note: treated cells have same code as untreated
cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab cellCodesTop.txt
# hand-edit in codes, promoting Tier 2.5 to caps (A549, MCF-7, SK-N-SH, IMR90)
# create config file for clustering
regClusterBedExpCfg -noNormalize -tabList fileCellAb.withDupes.tab \
-cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg
# format is:
# antibody cell[+treatment] cell_char file score_column normalization filename
# CTCF H1-hESC 1 file 4 1 peaks/wgEncodeAwgTfbsBroadH1hescCtcfUniPk.narrowPeak.gz
regClusterBedExpCfg -noNormalize -tabList fileCellAb.tab \
-cellLetter=cellCodesTop.txt -noLetterOk clusters.all.cfg
# format is:
# antibody cell[+treatment]+lab cell_char file score_column normalization filename
# CTCF H1-hESC 1 file 4 1 peaks/wgEncodeAwgTfbsBroadH1hescCtcfUniPk.narrowPeak.gz
# do clustering
hgBedsToBedExps -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! logs/clusters.out &
hgBedsToBedExps -dupeLetterOk clusters.all.cfg clusters.bed clusters.exps >&! logs/clusters.all.out & # 4 dupes (HAIB A549+Etoh/USF1, HepG2/NRSF, SK-N-SH/NRSF -- these are V04* vs. Pcr*)
# and SYDH K562/Pol2 -- Iggrab control vs none indicated
# hand-edit fileCellAb.tab to add V04 and Iggrab to cell field to uniquify
regClusterBedExpCfg -noNormalize -tabList fileCellAb.uniq.tab \
-cellLetter=cellCodesTop.txt -noLetterOk clusters.uniq.cfg
# cluster again, using uniquified config file
hgBedsToBedExps -dupeLetterOk clusters.uniq.cfg clusters.bed clusters.exps >&! logs/clusters.uniq.out &
cat logs/clusters.uniq.out
#Loaded 690 records from clusters.uniq.cfg
#Got 194 sources
#Got 189 factors
## Success!
wc -l clusters.bed
# 4910269 clusters.bed
# Load database, first clusters
hgLoadBed hg19 wgEncodeRegTfbsClusteredV4 clusters.bed
# Read 4910269 elements of size 15 from clusters.bed
# was 4735242 in V3
# was 2750490 in V2
# Load sourceTable
hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV4 ~/kent/src/hg/lib/expRecord.sql clusters.exps
# Create inputTrackTable - six columns:
cat > getInputs.csh << 'EOF'
foreach p (`cat $1`)
set obj = $p:t:r:r
echo -n "$obj\t"
# note: strip off treatment=None
mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody stdout | \
perl -wne '($cell, $treat, $lab, $ab) = split; \
$source = $cell . "+" . $treat . "+" . $lab; \
$source =~ s/\+None//; \
print $source, "\t", $ab, "\t", $cell, "\t", $treat, "\t", $lab, "\n";'
end
'EOF'
csh getInputs.csh peak.lst > wgEncodeRegTfbsClusteredInputsV4.tab
hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV4 ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputsV4.tab
# Clustering by target
cat > getMetaHuge.csh << 'EOF'
foreach p (`cat $1`)
set obj = $p:t:r:r
echo -n "$p\t"
# note: must strip off treatment=None
mdbPrint hg19 -obj=$obj | \
raToTab stdin -cols=cell,treatment,lab,antibody,target stdout | \
perl -wne '($cell, $treat, $lab, $ab, $target) = split; \
$source = $cell . "+" . $treat . "+" . $lab; \
$source =~ s/\+None//; print $source, "\t", $ab, "\t", $target, "\n";'
end
'EOF'
csh getMetaHuge.csh peak.lst > fileCellAbTarget.tab
regClusterBedExpCfg -noNormalize -useTarget -tabList fileCellAbTarget.tab \
-cellLetter=cellCodesTop.txt -noLetterOk clusters.target.cfg
# format is:
# target cell[+treatment]+lab+ab cell_char file score_column normalization filename
hgBedsToBedExps -dupeLetterOk clusters.target.cfg clusters.target.bed clusters.target.exps >&! \
logs/clusters.target.out &
# 4 dupes: 3 HudsonAlpha (2 protocols), 1 Stanford (2 controls)
# hand-edit fileCellAbTarget.tab to add V04 and Iggrab to cell field to uniquify
# create cluster config file
regClusterBedExpCfg -noNormalize -useTarget -tabList fileCellAbTarget.uniq.tab \
-cellLetter=cellCodesTop.txt -noLetterOk clusters.target.cfg
# next step is the actual clustering, and takes some time (minutes)
hgBedsToBedExps -dupeLetterOk clusters.target.cfg clusters.target.bed clusters.target.exps >&! \
logs/clusters.target.out &
#Loaded 690 records from clusters.target.cfg
#Got 690 sources
#Got 161 factors
# Load the target-based clusters and the source (cell) table for V6,
# keeping .tab symlinks back to the build files
hgLoadBed hg19 wgEncodeRegTfbsClusteredV6 clusters.target.bed
ln -s clusters.target.bed wgEncodeRegTfbsClusteredV6.tab
# V5 with separate HA-E2F1 factor: Read 4385242 elements of size 15 from clusters.target.bed
# Read 4380444 elements of size 15 from clusters.target.bed
hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV6 ~/kent/src/hg/lib/expRecord.sql clusters.target.exps
# fixed typo: hgBedsToBedExps wrote clusters.target.exps (singular);
# 'clusters.targets.exps' would have created a dangling symlink
ln -s clusters.target.exps wgEncodeRegTfbsCellsV6.tab
# Create inputTrackTable - six columns:
# For target-based, include antibody in source field
cat > getTargetInputs.csh << 'EOF'
foreach p (`cat $1`)
set obj = $p:t:r:r
echo -n "$obj\t"
# note: strip off treatment=None
mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody,target stdout | \
perl -wne '($cell, $treat, $lab, $ab, $target) = split; \
$source = $cell . "+" . $treat . "+" . $lab . "+" . $ab; \
$source =~ s/\+None//; \
print $source, "\t", $target, "\t", $ab, "\t", \
$cell, "\t", $treat, "\t", $lab, "\n";'
end
'EOF'
csh getTargetInputs.csh peak.lst > wgEncodeRegTfbsClusteredInputsV6.tab
hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV6 \
~/kent/src/hg/lib/clusterInputTrackTable5.sql wgEncodeRegTfbsClusteredInputsV6.tab
# compare coverage of V2-V6 (V5 is same as 6, except clusters HA-E2F1 as separate target)
# V2 - current version. June 2011 TFBS (425 inputs)
featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV2
# wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV2 7.797%, both 4.706%, cover 35.14%, enrich 4.51x
featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV2 wgEncodeRegDnaseClusteredV2
# wgEncodeRegTfbsClusteredV2 7.797%, wgEncodeRegDnaseClusteredV2 13.390%, both 4.706%, cover 60.36%, enrich 4.51x
# 'V3' - clustered by antibody. Mar 2012 TFBS, datasets thinned to uniquify (e.g. no multi-lab)
# (635 inputs)
featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV3
# wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV3 12.855%, both 6.578%, cover 49.13%, enrich 3.82x
# 'V4' - clustered by antibody. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690)
featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV4
# wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV4 13.156%, both 6.690%, cover 49.96%, enrich 3.80
# 'V5' - clustered by target. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690)
featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV5
# wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV5 13.156%, both 6.690%, cover 49.96%, enrich 3.80x
# 'V6' - clustered by target. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690)
featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV6
# wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV6 13.156%, both 6.690%, cover 49.96%, enrich 3.80x
featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV5 wgEncodeRegDnaseClusteredV2
# wgEncodeRegTfbsClusteredV5 13.156%, wgEncodeRegDnaseClusteredV2 13.390%, both 6.690%, cover 50.86%, enrich 3.80x
# V6 is release target -- rename the antibody clustering to V3a, and rename V6 to V3 (use symlinks in build dir)
# Set up downloads
set dir = /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012
cd $dir
mkdir downloads
gzip -c wgEncodeRegTfbsClusteredV3.bed > downloads/wgEncodeRegTfbsClusteredV3.bed.gz
# disallow name truncation
gzip -c -n wgEncodeRegTfbsClusteredInputsV3.tab > downloads/wgEncodeRegTfbsClusteredInputsV3.tab.gz
gzip -c wgEncodeRegTfbsCellsV3.tab > downloads/wgEncodeRegTfbsCellsV3.tab.gz
# use (existing) ENCODE DCC downloads dir
set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered
cd $dload
# new release dir
mkdir release3
ln -s release3 beta
ln -s release3 releaseLatest
cd $dir/downloads
ln wgEncodeRegTfbsClusteredV3.bed.gz wgEncodeRegTfbsClusteredInputsV3.tab.gz wgEncodeRegTfbsCellsV3.tab.gz $dload
cd $dload
ln wgEncodeRegTfbsClusteredV3.bed.gz wgEncodeRegTfbsClusteredInputsV3.tab.gz wgEncodeRegTfbsCellsV3.tab.gz releaseLatest
# include older files as well
cp wgEncodeRegTfbsClusteredV2.bed.gz wgEncodeRegTfbsClusteredInputs.tab.gz wgEncodeRegTfbsCells.tab.gz releaseLatest
cp README.txt releaseLatest
# clean up bogus names embedded in zips
# gunzip wgEncodeRegTfbsClusteredV2.bed.gz wgEncodeRegTfbsClusteredInputs.tab.gz wgEncodeRegTfbsCells.tab.gz
md5sum *.gz > md5sum.txt
# vim README.txt
# use wgEncodeRegDnaseClustered for basics
cp README.txt md5sum.txt ..
# add minimal metaObject to metaDb to associate new table with (pseudo-)composite
# wgEncodeRegTfbsClustered -- this allows hgTrackUi to locate downloads
cd $dir
cat > wgEncodeRegTfbsClustered.ra << 'EOF'
metaObject wgEncodeRegTfbsClusteredV3
objType table
composite wgEncodeRegTfbsClustered
fileName wgEncodeRegTfbsClusteredV3.bed.gz
tableName wgEncodeRegTfbsClusteredV3
project wgEncode
'EOF'
# also, add aux tables, then mdbUpdate, mdbPrint
# This script creates a BED5+ download file with clusters and cell info
# fixed typo: the inputs file is wgEncodeRegTfbsClusteredInputsV3.tab
# (with 'Reg', as gzipped above) -- it was missing the 'Reg' here
clusterAddSources.pl wgEncodeRegTfbsClusteredV3.bed wgEncodeRegTfbsClusteredInputsV3.tab > \
        wgEncodeRegTfbsClusteredWithCellsV3.bed
gzip -c -n wgEncodeRegTfbsClusteredWithCellsV3.bed > downloads/wgEncodeRegTfbsClusteredWithCellsV3.bed.gz
cd downloads
# hard-link into the DCC downloads dir and its releaseLatest subdir
ln wgEncodeRegTfbsClusteredWithCellsV3.bed.gz $dload
cd $dload
ln wgEncodeRegTfbsClusteredWithCellsV3.bed.gz releaseLatest
md5sum *.gz > md5sum.txt
# vim README.txt
cp README.txt md5sum.txt ..
############
# Fixes to Inputs table (2014-05-19 kate)
# Matches manual changes to uniqify Cells table
hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="A549+EtOH_0.02pct+HudsonAlpha+V04+USF-1" where tableName="wgEncodeAwgTfbsHaibA549Usf1V0422111Etoh02UniPk"'
hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="HepG2+HudsonAlpha+V04+NRSF" where tableName="wgEncodeAwgTfbsHaibHepg2NrsfV0416101UniPk"'
hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="SK-N-SH+HudsonAlpha+V04+NRSF" where tableName="wgEncodeAwgTfbsHaibSknshNrsfV0416101UniPk"'
hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="K562+Stanford+Iggrab+Pol2(phosphoS2)" where tableName="wgEncodeAwgTfbsSydhK562Pol2s2IggrabUniPk"'
#######################################################################
# performance eval
# try changing expIds and expScores to varchar 2048 from blob
# create table
alter table kateTemp disable keys;
insert into kateTemp (select * from wgEncodeTfbsClusteredV3)
# try bigBed
set dir = /cluster/data/hg19/bed/wgEncodeRegV2/tfbs2012
cd $dir
set track = wgEncodeRegTfbsClusteredV3
bedSort clusters.target.bed clusters.target.sorted.bed
/cluster/bin/x86_64/bedToBigBed -type=bed15 -tab clusters.target.sorted.bed \
/hive/data/genomes/hg19/chrom.sizes $track.bb
# Error line 1 of xx.bed: expecting 690 elements in expIds list (bed field 14)
# Might be a bogus error -- looks OK on inspection
# GOT HERE
ln -s `pwd`/$track.bb /gbdb/hg19/wgEncodeReg
# fixed typo: path previously read 'wg/gbdb/...'; the symlink created on the
# line above lives under /gbdb, so hgBbiDbLink must point there
hgBbiDbLink hg19 $track /gbdb/hg19/wgEncodeReg/$track.bb
# for now, adding maxWindowToDraw 1M setting to trackDb to limit performance hit
# shrink table format to BED5+, with columns 6 and 7 non-sparse comma-sep lists
# of exp id's and scores
clusterShrinkExps.pl clusters.target.sorted.bed > clusters.shrunk.bed
# NOTE: moving this tool to makeDb/hgBedsToBedExps/bedExpsToFactorSource.pl
# (removed stray '+' diff-paste marker that made the line above a broken command)
hgLoadBed -sqlTable=~/kent/src/hg/lib/factorSource.sql -renameSqlTable \
hg19 wgEncodeRegTfbsClusteredShrunkMini test.shrink.bed
hgLoadBed -sqlTable=~/kent/src/hg/lib/factorSource.sql -renameSqlTable \
hg19 wgEncodeRegTfbsClusteredShrunk clusters.shrunk.bed
# rename Shrunk to V3
# copy to downloads
cp clusters.shrunk.bed downloads/wgEncodeRegTfbsClusteredV3.bed
cd downloads
gzip wgEncodeRegTfbsClusteredV3.bed
cd /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered/releaseLatest
# NOTE(review): releaseLatest holds the gzipped file, so this md5sum target is
# probably wgEncodeRegTfbsClusteredV3.bed.gz -- confirm
md5sum wgEncodeRegTfbsClusteredV3.bed >> md5sum.txt
# and remove old entry
# (see Wrangler HowTo in ENCODE2 wiki for these steps)
# replace md5sum in md5sum.txt, metaDb
# create files.txt
encodeMkFilesList hg19
#######################################################################
# EXPERIMENTAL: TF motif tracks, from EMBL/Aus (MEME toolkit)
# These are generated using FIMO tool:
# Charles E. Grant, Timothy L. Bailey, and William Stafford Noble, "FIMO: Scanning for occurrences of a given motif", Bioinformatics 27(7):1017-1018, 2011
# http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full
# http://meme.ebi.edu.au/meme/cgi-bin/fimo.cgi
mkdir /cluster/data/hg19/bed/tfbsFimo
cd /cluster/data/hg19/bed/tfbsFimo
hgLoadBed hg19 tfbsFimoJaspar TfMotifsFIMO_Jaspar.hg19.bed
# Read 1386302 elements of size 6 from TfMotifsFIMO_Jaspar.hg19.bed
hgLoadBed hg19 tfbsFimoTransfac TfMotifsFIMO_Transfac.hg19.bed
# Read 3665198 elements of size 6 from TfMotifsFIMO_Transfac.hg19.bed
#######################################################################
# more motif exploratory work -- give Larry M's transRegCode a ride
cd /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012
mkdir motifs
cd motifs
# lift motif localizations from hg18 (where did they come from originally ?)
hgsql hg18 -Ne "select * from wgEncodeRegTfbsClusteredMotifs" | \
cut --complement -f 1 | \
liftOver -bedPlus=4 stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz \
motifs.bed motifs.unmapped
hgLoadBed hg19 -as=~/kent/src/hg/lib/bed6FloatScore.as wgEncodeRegTfbsClusteredMotifs motifs.bed
# Read 1750827 elements of size 6 from motifs.bed
#######################################################################
# Table cleanup/renaming (more to come):
TfbsClusteredV3a -> TfbsClusteredV3a_old
TfbsClusteredV4 -> TfbsClusteredV3a
TfbsClusteredV5 -> TfbsClusteredV3b
#######################################################################
# Factorbook motifs (Jun 2013, and then Dec 2013-Feb 2014, kate)
# Data consists of motif locations, positions weight matrices,
# and curation (which motifs are canonical for which factors).
# Basis of the track is supplementary data from Wang et al. Sep2012 Genome Res paper,
# plus newer data provided by Bong Hyun Kim (nghyun.kim@umassmed.edu)
# of Factorbook team at Zhiping Weng's lab (Zlab).
# From Bong-Hyun:
#These are discovered motifs from 2012 freeze, methods in GR 2012:22(9), Wang et al.
#Basically the motif were discovered from 500 strongest signal peaks by MEME, and then they are tested if they are enriched compared to the flanking regions of the peaks or GC content matching regions. And then the motifs are condensed by manual curation. Then the peak regions are searched by the condensed motifs using FIMO. Note that UA (unannotated) motifs are new motif described in the genome research paper, but UAK's are potentially unannotated motifs found newer datasets and not yet carefully curated.
cd /cluster/data/hg19/bed
mkdir factorbook
cd factorbook
#################
# Motif locations within ENCODE peaks
# Each line is the genomic location of a motif whose name is the 5th column.
# The 6, 7 and 8th columns are q-value of the motif match, peak number (or bed file line number)
# in the original uniform processed peak call, and peak summit, respectively.
#
# Load these as a BED table of merged data from all experiments, with q-value replaced with
# -log10 score. Duplicate items are removed, with the item scored as maximum found.
# (NOTE: this table could be expanded to included the sources as in factorSource tables -- this
# would allow it to be used to show motifs in the individual experiment narrowPeak files)
wget http://zlab.umassmed.edu/zlab/publications/TFBS.tar.gz
# data from Sep2012 Genome Research paper (through Jan 2011 ENCODE freeze)
wget http://zlab.umassmed.edu/~kimb/encode_tfbs.tar.gz
# newer data (Jan 2011 - Mar 2012 ENCODE freeze)
tar xvfz encode_tfbs.tar.gz
cd encode_tfbs
cat *.bed | wc -l
# 6316631
# ls *.bed | wc -l
# 294
# why so few ? -- asking Bong-hyun
# NOTE: some beds are for controls!
# NOTE: there are motifs for different factors in these peaks (not just target of assay)
# Around 6M localizations in these files, some appearing multiply reflecting detection
# in peaks from different experiments. These will be merged in the final table.
# convert:
# chrom, start, end, strand, name, qvalue ->
# chrom, start, end, name, float_score, strand (bed6FloatScore)
# NOTE(review): working directory was encode_tfbs after the cd above; these
# relative paths assume a 'cd ..' back to the factorbook dir -- confirm
# $2-1: input starts appear to be 1-based; shifted to 0-based BED -- TODO confirm
cat encode_tfbs/*.bed pub_tfbs/*.summary | \
awk '{OFS="\t"; print $1, $2-1, $3, $5, $6, $4}' > motifs.withdupes.bed
wc -l motifs.withdupes.bed
# 12115922
bedSort motifs.withdupes.bed stdout | uniq > motifs.bed
wc -l motifs.bed
# 12022426 motifs.bed
hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \
wgEncodeRegTfbsClusteredMotifs_factorBook motifs.bed
# Read 12022426 elements of size 6 from motifs.bed
# TODO: remove multiple hits at same position, remove q-value (we will rescore later)
# NOTE(review): awk's default OFS is a space, so this rewrites the tab-separated
# bed with space separators; hgLoadBed tolerates whitespace but the file is no
# longer strict TSV
awk '{$5=0; print}' motifs.bed | uniq > motifs.uniq.bed
wc -l motifs.uniq.bed
# 2631434 motifs.uniq.bed
# 2.6M, down from 12M
hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \
wgEncodeAwgTfbsMotifs motifs.uniq.bed
# Check range of qvalue:
>select min(score) from wgEncodeAwgTfbsMotifs; | 0.000000002479999983151515 |
>select max(score) from wgEncodeAwgTfbsMotifs; | 0.6370000243186951 |
>select count(*) from wgEncodeAwgTfbsMotifs where score > .1 | 1065928 |
# BongHyun says these are bad quality -- should drop
# NOTE(review): 12022426 is the row count of the pre-dedup motifs.bed load;
# these queries were presumably run before the motifs.uniq reload (2631434 rows)
>select count(*) from wgEncodeAwgTfbsMotifs; | 12022426 |
# Reload table with -log10 scoring and motif name fixes (Jan 21, 2014 KRR)
# (Two new canonicals were found that have Jaspar naming convention V_ prefix:)
#NFIC NFIC (V_NF1_Q6)
#NR4A1 NR4A1 (V_TR4_03)
# Also, both v-Jun and v-JUN are used
# write perl script that dedups motif hits and converts q-value to -log10 score
# fixed shebang: '#!/usr/bin/env perl' (was 'env/perl', not a valid interpreter
# path; harmless only because the script is invoked below via 'perl dedupAndScore.pl')
cat > dedupAndScore.pl << 'EOF'
#!/usr/bin/env perl
# retain lowest scored item (q-value) at a location, converting score to -log10
# expects file to be pre-sorted with: sort -k1,4 -k6,6 -k5,5
sub minusLog10 {
my $n = shift;
return -log($n)/log(10);
}
$chrN = "";
while (<>) {
chomp;
($chr, $start, $end, $name, $score, $strand) = split;
if ($chr ne $chrN || $start ne $startN || $end ne $endN || $name ne $nameN || $strand ne $strandN) {
$logScore = minusLog10($score);
printf("%s\t%d\t%d\t%s\t%.2f\t%s\n", $chr, $start, $end, $name, $logScore, $strand);
$chrN = $chr; $startN = $start; $endN = $end; $nameN = $name; $strandN = $strand;
}
}
'EOF'
sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.sorted.bed
# fix motif naming inconsistencies (v-Jun vs v-JUN casing, Jaspar-style V_ names)
# NOTE: comment moved above the pipeline -- a comment line following a trailing
# backslash splits the continued command
perl dedupAndScore.pl < motifs.sorted.bed | \
sed -e 's/v-Jun/v-JUN/' -e 's/V_NF1_Q6/NFIC/' -e 's/V_TR4_03/NR4A1/' > motifs.load.bed
wc -l motifs.load.bed
#2366151 motifs.load.bed
hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \
factorbookMotifPos motifs.load.bed
#################
# Position weight matrices for the motifs
# These are in MEME format
wget http://zlab.umassmed.edu/~kimb/gr_paper_extended.meme
# MOTIF A-Box
#...
# A 0.25000 C 0.25000 G 0.25000 T 0.25000
#...
# letter-probability matrix: alength= 4 w= 21 nsites= 116 E= 5.4e-213
# 0.137931 0.189655 0.017241 0.655172
# 0.620690 0.051724 0.077586 0.250000
# ...
# use w for column count. 1 probability row per column. order: acgt
grep MOTIF gr_paper*meme | wc -l
# 132
# total motifs
# NOTE(review): this parser only flushes a motif when a later non-matrix line
# reaches the final else-branch; it relies on MEME separator lines (blank/URL)
# between motifs and can drop the LAST motif if the file ends right after its
# matrix -- verify output count against 'grep -c MOTIF' (132 matched here)
cat > convertMotifs.pl << 'EOF'
#!/usr/bin/env perl
# convertMotifs - create load file for UCSC transRegCodeMotif table from MEME output
## MEME format:
#MOTIF
#...
#letter-probability matrix: alength= 4 w=
#Aprob Cprob Gprob Tprob
#...
## TransRegCodeMotif format:
#name columnCount aProb cProb gProb
# where *Prob fields are comma-sep lists of length columnCount
use warnings;
use strict;
my $name;
my $columnCount;
my $cols = 0;
my $aProb, my $cProb, my $gProb, my $tProb;
my @aProbs = (), my @cProbs = (), my @gProbs = (), my @tProbs = ();
sub printProbs {
# print comma-sep list of probabilities
my ($probs) = @_;
print "\t";
foreach my $prob (@$probs) {
print "$prob,";
}
}
while (<>) {
if (/^MOTIF (\S+)/) {
$name = $1;
} elsif (defined($name) && /^letter-probability matrix: alength= \d+ w= (\d+)/) {
$columnCount = $1;
} elsif (defined($columnCount)) {
# working on probabilities
if ($cols < $columnCount) {
# slurp in probabilities
$cols++;
($aProb, $cProb, $gProb, $tProb) = split;
push(@aProbs, $aProb);
push(@cProbs, $cProb);
push(@gProbs, $gProb);
push(@tProbs, $tProb);
} else {
# finished with this motif -- print it out and set up for next
print "$name\t$columnCount";
printProbs(\@aProbs);
printProbs(\@cProbs);
printProbs(\@gProbs);
printProbs(\@tProbs);
print "\n";
undef $name;
undef $columnCount;
$cols = 0;
@aProbs = (); @cProbs = (); @gProbs = (); @tProbs = ();
}
}
}
'EOF'
convertMotifs.pl gr_paper_extended.meme > transRegCodeMotif.tab
# wc -l transRegCodeMotifs.tab
# 132 transRegCodeMotifs.tab
# (note: commented wc above used plural 'transRegCodeMotifs.tab'; the file
# actually written is transRegCodeMotif.tab)
hgLoadSqlTab hg19 transRegCodeMotifs_factorBook ~/kent/src/hg/lib/transRegCodeMotif.sql \
transRegCodeMotif.tab
# mySQL error 1062: Duplicate entry 'v-JUN' for key 1
# There are both v-JUN and v-Jun entries
# For now, prune out v-Jun (and inquire at zlab) and reload
# Again, motif name consistency
# NOTE(review): transRegCodeMotif.fixed.tab is the hand-edited copy (v-Jun pruned)
sed -e 's/V_NF1_Q6/NFIC/' -e 's/V_TR4_03/NR4A1/' transRegCodeMotif.fixed.tab > pwms.tab
hgLoadSqlTab hg19 factorbookMotifPwm ~/kent/src/hg/lib/transRegCodeMotif.sql pwms.tab
##########
# Create alias table to map factor names at UCSC (HGNC, in ENCODE controlled vocab), to Factorbook
# names (not always HGNC), so linkouts work. For use with trackDb url setting.
# create alias table SQL
cat > geneAlias.sql << 'EOF'
CREATE TABLE factorbookGeneAlias (
name VARCHAR(255) NOT NULL,
value VARCHAR(255) NOT NULL,
PRIMARY KEY (name)
);
'EOF'
# (added missing 'EOF' terminator above -- without it the geneAlias.sql
# here-doc would swallow the testLinks.csh commands that follow)
cat > testLinks.csh << 'EOF'
foreach f (`awk '{print $1}' allMotifTargets.load.tab`)
wget -O /dev/null -q http://www.factorbook.org/mediawiki/index.php/$f
set ret = $status
echo "$f : $ret"
end
'EOF'
csh testLinks.csh > testLinks.out
# results in 'noPage.txt' and 'misnamedPage.txt'
# various tweaking to generate factorbookGeneAlias.tab (sorry, log is lost, but should be easy to recreate)
hgsql hg19 < geneAlias.sql
hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookGeneAlias.tab" INTO TABLE factorbookGeneAlias'
#1/31/14 - patch factorbookGeneAlias
update factorbookGeneAlias set value='ERalpha a' where name='ESR1';
#3/13/14 - remove genes not in TF clusters (bad quality dataaset)
delete from factorbookGeneAlias where name='NR4A1'
delete from factorbookGeneAlias where name='SREBP2'
##########
# Create table to map factor names to their canonical motifs (12/4/13 Kate)
# Many twists and turns here -- the goal is to assure factor names match UCSC TF CV names, and
# that motif names match BED of motif positions. Also, that only factors having DNA-binding domains
# that will bind directly are listed (not those with 'tethered-only' binding mode).
# Approach was to use Table S1 + communications from Zlab for newer factors, removing those
# listed in Table S3 as tethered only. Zlabbers have offered to provide us with a full
# file for reloading this table.
# Get targets for each dataset
csh listMotifs.csh > targets.txt
cp targets.txt targets.txt.fixed
# Remove low quality datasets (see details in AwgTfbsUniform)
# These were left out of TFBS Clusters V3
#wgEncodeSydhTfbsGm12878Spt20StdAlnRep0
#wgEncodeSydhTfbsGm12878Gcn5StdAlnRep0
#wgEncodeHaibTfbsPfsk1Pol24h8V0416101AlnRep0
#wgEncodeHaibTfbsA549Sp1V0422111Etoh02AlnRep0
#wgEncodeSydhTfbsHepg2CebpzIggrabAlnRep0
#wgEncodeSydhTfbsHelas3Gcn5StdAlnRep0
#wgEncodeSydhTfbsGm12878Irf3IggmusAlnRep0
#wgEncodeSydhTfbsK562Xrcc4StdAlnRep0
#wgEncodeSydhTfbsHepg2Srebp2PravastStdAlnRep0
#wgEncodeSydhTfbsHepg2Srebp1PravastStdAlnRep0
#wgEncodeSydhTfbsHepg2Pol2PravastStdAlnRep0
# 78 left unmatched
# List motifs in datasets
csh listMotifs2.csh > motifs2.txt
cp motifs2.txt motifs2.txt.fixed
# remove bad datasets above
# merge files
paste targets.txt.fixed motifs2.txt.fixed > target_motifs.txt
# remove bad Uchicago datasets. All but:
#wgEncodeUchicagoTfbsK562Ehdac8ControlAlnRep0
#wgEncodeUchicagoTfbsK562EjundControlAlnRep0
#wgEncodeUchicagoTfbsK562EjunbControlAlnRep0
#wgEncodeUchicagoTfbsK562Egata2ControlAlnRep0
#wgEncodeUchicagoTfbsK562EfosControlAlnRep0
# inventory file
awk -F'\t' 'OFS="\t" {print $1, $2, $4}' < target_motifs.txt > target_motifs_out.txt
# Create map table
# Write the schema for the target->motif map table.
# FIX: terminator must be a literal unquoted EOF line ('EOF' never closes the here-doc).
cat > motifTarget.sql << 'EOF'
CREATE TABLE motifTarget (
target VARCHAR(255) NOT NULL,
motif VARCHAR(255) NOT NULL,
PRIMARY KEY (target)
);
EOF
# Load the schema file just written (the log originally said factorbookMotifTarget.sql,
# but the file created above is motifTarget.sql; the table is renamed after loading)
hgsql hg19 < motifTarget.sql
# Get mappings from GR paper (Wang 2012)
wget http://genome.cshlp.org/content/suppl/2012/08/22/22.9.1798.DC1/TableS1.xls
# extract tab-sep file: to FactorbookTableS1.txt
# columns needed for map table: 2: HGNC ID, 4: canonical motif
# NOTE: a few have two canonicals (;-separated) -- keep it simple for now, just use the first
# NOTE: some have no canonical -- col 4 is empty
# Extract (HGNC id, canonical motif) pairs from Table S1, skipping the header;
# keep only the first motif where several are ;-separated.
# FIX: a space is required between -F'\t' and the awk program -- in the original
# the two words ran together, leaving awk with no program text at all.
tail -n +2 FactorbookTableS1.txt | \
awk -F'\t' '{print $2, $4}' | sort | uniq | sed 's/;.*//' > pubMotifs.txt
# Need same for newer data
# Merge and load in table
#awk '{print $1, "\t", $2}' pubMotifs.txt newMotifs.guess.txt | sort > allMotifs.tab
#hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookTargetToMotif.tab" INTO TABLE factorbookTargetToMotif'
# Replace with mappings from Bong-Hyun, sent in 12/19 email
# NOTE(review): 's/ */ /' substitutes only the first (possibly empty) run of
# spaces per line -- presumably the source columns were multi-space separated;
# verify the separator handling if this is ever rerun.
# FIX: terminator must be a literal unquoted EOF line ('EOF' never closes the here-doc).
cat << 'EOF' | sed 's/ */ /' | grep -v '^#' > newMotifs.txt
#The TFs not mentioned in my list do not have specific binding motifs, e.g. histone acetylases or other non-specific DNA binding proteins.
#Common.Name canonical.motif
ATF1 CREB
ATF2 CREB
BACH1 NFE2
CREB1 CREB
ELK1 ELF1
FOXM1 FOXA
FOXP2 FOXA
MAZ UAK42
NFIC NFIC
NR4A1 NR4A1
RUNX3 RUNX1
SP4 SP1
SREBF2 SREBF1
STAT5A STAT1
TCF3 TCF3
TEAD4 TEAD4
ZKSCAN1 UAK54
ZNF217 UAK59
EOF
# NOTE: NR4A1 and SREBF2(SREBP2) not in Anshul's uniform peaks (failed qual filter I think)
# Drop these from target list
# Merge published (Table S1) and new (Bong-Hyun) mappings; sed converts the
# first space on each line to a tab for the 2-column load format.
sort pubMotifs.txt newMotifs.txt | sed 's/ /\t/' > allMotifTargets.tab
# Create the motifTarget table, rename it, then bulk-load the merged mappings.
hgsql hg19 < motifTarget.sql
hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTarget'
hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargets.tab" INTO TABLE factorbookMotifTarget'
# Check for identifier mismatches
# 1. Clusters table with Motif targets table
hgsql hg19 -e "select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name" > clusterTargets.txt
hgsql hg19 -e "select distinct(target) from factorbookMotifTarget order by target" > motifTargets.txt
diff clusterTargets.txt motifTargets.txt > targets.diff
# These are targets in Factorbook, not in UCSC targets list
grep '>' targets.diff
> target
> NFKB1
# -> RELA in UCSC CV (target gene for this antibody - NFKB p65)
> NR4A1
# not in UCSC clusters track (low qual data ? It's Chicago)
> POLR3A
# -> RPC155 in UCSC CV (old-school, not current HGNC)
> SREBF1
# -> SREBP1 in UCSC CV (old-school, not current HGNC)
> SREBF2
# -> SREBP2 in UCSC CV (old-school, not current HGNC)
# Update targets table to use UCSC CV targets
sed -e 's/^NFKB1/RELA/' -e 's/^POLR3A/RPC155/' -e 's/^SREBF1/SREBP1/' -e 's/^SREBF2/SREBP2/' \
allMotifTargets.tab > allMotifTargets.load.tab
hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargets.load.tab" INTO TABLE factorbookMotifTarget'
# TODO: remove targets having no canonical motif (25 of total 137)
# Also need to rename two new canonicals found that have Jaspar naming convention:
#NFIC NFIC (V_NF1_Q6)
#NR4A1 NR4A1 (V_TR4_03)
# Also, spot-checking new canonical list above, I noted that ATF2 motif file lacked CREB -- inquiring with BongHyun as to why.
#wgEncodeHaibTfbsGm12878Atf2sc81188V0422111AlnRep0.tfbs.bed : AP1 UAK16 v-JUN
#wgEncodeHaibTfbsH1hescAtf2sc81188V0422111AlnRep0.tfbs.bed : v-JUN
#wgEncodeHaibTfbsH1hescAtf3V0416102AlnRep0.tfbs.bed : CREB UAK18 USF ZNF143-ext
#wgEncodeHaibTfbsHepg2Atf3V0416101AlnRep0.tfbs.bed : CREB-ext UAK18 USF
#wgEncodeHaibTfbsK562Atf3V0416101AlnRep0.tfbs.bed : AP1 UAK17
#wgEncodeSydhTfbsK562Atf106325StdAlnRep0.tfbs.bed : NFY SP1 ZNF143-ext v-JUN
# NOTE: some factors have two canonical motifs listed (; separated in Table S1)
# For now, using just the first, but looking at data, both seem important.
# Need code changes to pick these up.
$ grep ';' *S1.txt | awk '{print $2, $4}' | sort | uniq
ATF3 CREB;CREB-ext
CTCF CTCF;CTCF-ext
CTCFL CTCF;CTCF-ext
GATA1 GATA1;GATA1-ext
GATA2 GATA1;GATA1-ext
JUN AP-1;v-JUN
JUNB AP-1;v-JUN
JUND AP-1;v-JUN
RAD21 CTCF;CTCF-ext
SMC3 CTCF;CTCF-ext
THAP1 UA4;UA5
ZNF143 ZNF143;ZNF143-ext
ZNF274 UA10;UA11
# Check for identifier mismatches
# 1. Clusters table with Motif targets table
hgsql hg19 -e "select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name" > clusterTargets.txt
hgsql hg19 -e "select distinct(target) from factorbookMotifTarget order by target" > motifTargets.txt
diff clusterTargets.txt motifTargets.txt > targets.diff
# These are targets in Factorbook, not in UCSC targets list
grep '>' targets.diff
> target
> NFKB1
# -> RELA in UCSC CV (target gene for this antibody - NFKB p65)
> NR4A1
# not in UCSC clusters track (low qual data ? It's Chicago)
> POLR3A
# -> RPC155 in UCSC CV (old-school, not current HGNC)
> SREBF1
# -> SREBP1 in UCSC CV (old-school, not current HGNC)
> SREBF2
# -> SREBP2 in UCSC CV (old-school, not current HGNC)
# Update targets table to use UCSC CV targets
grep '<' targets.diff > ucsc.diff
# Notes here about targets at UCSC not in target table. FYI.
# Check motif name consistency across tables
awk '{print $4}' motifs.load.bed | sort | uniq > bedMotifs.txt
awk '{print $2}' allMotifTargets.load.tab | sort | uniq > targetMotifs.txt
awk '{print $1}' pwms.tab | sort | uniq > pwmMotifs.txt
diff pwmMotifs.txt targetMotifs.txt | grep '>'
>
> AP-1
# AP1
> AP-2
# AP2
> NF-Y
# NFY, NFY-UA2 ?
> PU.1
# PU1
> bHLH
# BHLHE40
# Check pwms vs. bed files
diff pwmMotifs.txt bedMotifs.txt
67c67
< UA12
# no UA12 in bed file
---
> UA14
# no UA14 in pwms
71d70
< UA5
77a77,79
> UAK18
> UAK19
> UAK20
130a133
> v-Jun
# should be v-JUN for consistency
# Update BED table to correct v-Jun
# Copy aside first, then read from the copy and write the corrected version
# back over the original name (avoids sed reading and writing the same file).
cp motifs.load.bed motifs.fix.bed
sed 's/v-Jun/v-JUN/' motifs.fix.bed >motifs.load.bed
hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \
factorbookMotifPos motifs.load.bed
# Update targets table to match bed & pwm tables
cp allMotifTargets.load.tab allMotifTargets.fix.tab
# FIX: escape the dot in PU.1 so it matches only the literal name
# (unescaped '.' matched any character, e.g. would also hit "PU11").
sed -e 's/AP-1$/AP1/' -e 's/AP-2$/AP2/' -e 's/NF-Y/NFY/' -e 's/PU\.1/PU1/' -e 's/bHLH/BHLHE40/' \
allMotifTargets.fix.tab > allMotifTargets.load.tab
hgsql hg19 -e 'DELETE FROM factorbookMotifTarget; LOAD DATA LOCAL INFILE "allMotifTargets.load.tab" INTO TABLE factorbookMotifTarget'
#Add secondary canonicals:
#CREB-ext, CTCF-ext, GATA1-ext, v-JUN (to AP-1), UA5 (to UA4), ZNF143-ext, UA11 (to UA10)
sed \
-e 's/CREB$/CREB,CREB-ext/g' \
-e 's/CTCF$/CTCF,CTCF-ext/g' \
-e 's/GATA1$/GATA1,GATA1-ext/g' \
-e 's/ZNF143$/ZNF143,ZNF143-ext/g' \
-e 's/UA4$/UA4,UA5/g' \
-e 's/UA10$/UA10,UA11/g' \
-e 's/AP-1$/AP-1,v-JUN/g' \
allMotifTargets.load.tab > allMotifTargetsV2.load.tab
hgsql hg19 < motifTarget.sql
hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTargetV2'
hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargetsV2.load.tab" INTO TABLE factorbookMotifTargetV2'
# Remove motifs for 'indirect' binding TFs (those showing only 'tethered' mode in Table S3
# FIX: set the field separator with -F so it applies to every record; the original
# put FS="\t" in the pattern position, where it takes effect only after the first
# line has already been split on whitespace.
awk -F'\t' '{printf("%s\t%s\t%s\n", $2, $3, $12)}' FactorbookTableS3.txt | sort | uniq > indirects.tsv
# Tethered only are:
BATF AP-1
BCL11A UA9
BCL3 NFKB1
ETS1 ETS1
FOS AP-1
GATA2 GATA1;GATA1-ext
GATA3 GATA3
IRF1 PRDM1
MXI1 MAX
NR3C1 NR3C1
PAX5 PAX5
POLR2A TBP
PPARGC1A ESRRA
RAD21 CTCF;CTCF-ext
RFX5 RFX5
SMC3 CTCF;CTCF-ext
SREBF1 SREBF1
SRF SRF
STAT3 STAT1
TAF1 TBP
# Hand edit motifs for factors above out of file (allMotifTargetsV2.load.tab -> V3)
# List cluster factors lacking a motif mapping.
hgsql hg19 -e 'select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name' > allFactors.txt
# FIX: order by target -- factorbookMotifTarget has no 'name' column (columns: target, motif)
hgsql hg19 -e 'select distinct(target) from factorbookMotifTarget order by target' > allMotifFactors.txt
diff allFactors.txt allMotifFactors.txt | grep '<' | sed -e 's/< //' > newNoMotifFactors.txt
wc -l newNoMotifFactors.txt
26 newNoMotifFactors.txt
# NOTE: NR4A1 and SREBF2(SREBP2) not in Anshul's uniform peaks (failed qual filter I think)
# Drop these from allMotifTargetsV4.load.tab
# Append a trailing tab (empty motif column) to each no-motif factor, dropping the two above.
# NOTE(review): newNoMotifFactors.txt holds UCSC CV names (e.g. SREBP2), so the
# /SREBF2/ delete pattern may not match anything -- verify against the file.
sed -e 's/$/\t/' -e '/NR4A1/d' -e '/SREBF2/d' newNoMotifFactors.txt > foo
mv foo newNoMotifFactors.txt
# V4 = V3 mappings plus motif-less factors, loaded via the usual create/rename pattern.
cat allMotifTargetsV3.load.tab newNoMotifFactors.txt | sort > allMotifTargetsV4.load.tab
hgsql hg19 < motifTarget.sql
hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTargetV4'
hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargetsV4.load.tab" INTO TABLE factorbookMotifTargetV4'
# Generate comma-sep list of factors for trackDb filter, 'factorFilterList.txt'
# Get citations for track description References section
getTrackReferences 22955990 23203885 22955619 > refs.txt
# Requested corrections target table using new gene vs. canonical motif table from Jiali at Zlab (2014-03-06 kate)
# They returned files with new motif calls & annotations. Perhaps for another version of the track!
wget http://zlab.umassmed.edu/~zhuangj/Direct_binding_motifs.tar.gz
wget http://zlab.umassmed.edu/~kimb/motif_centric_encode_tfbs.sorted.bed.bz2
bunzip2 !$
# In email, the following TF's 4 were specifically mentioned as direct binders
# missing canonical motif:
# NFIC->NFIC (already in table)
# NR4A1->NR4A1 (not in our TF clusters, rejected due to low quality, so not relevant to table)
# and these two, which will be added to the table!
# NR3C1->NR3C1
# FOS->AP1
sed -e 's/NR3C1\t$/NR3C1\tNR3C1/' -e 's/FOS\t$/FOS\tAP1/' allMotifTargetsV4.load.tab > factorbookMotifCanonical.load.tab
hgsql hg19 < motifTarget.sql
hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifCanonical'
hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookMotifCanonical.load.tab" INTO TABLE factorbookMotifCanonical'