d87e5f63bc000d560c21bca638b6d5f3dee0173d
kate
Wed Apr 3 12:06:45 2019 -0700
Add scripts and make docs for ENCODE 3 TF binding site tracks. refs #21139
diff --git src/hg/makeDb/doc/encode3/tfbs.txt src/hg/makeDb/doc/encode3/tfbs.txt
new file mode 100644
index 0000000..3bd8e8f
--- /dev/null
+++ src/hg/makeDb/doc/encode3/tfbs.txt
@@ -0,0 +1,447 @@
+# ENCODE TFBS from DAC
+
+# (2018-12-04 in progress) kate
+
+# NOTE: ENCODE2 track is described in doc/encodeRegHg19.txt
+
+# from Henry Pratt, student in Zhiping's lab:
+
+Hi Kate,
+
+
+Here are links to tarballs for all the narrowPeak files we have used for motif discovery in GRCh38, hg19, and mm10. Each one has a metadata.tsv with file ID, experiment ID, antibody ID, factor name, donor ID, cell type, and lab for each narrowPeak. Does this look good? Let me know if I need to tweak anything.
+
+
+http://users.wenglab.org/pratth/tf.GRCh38.tar.gz
+
+http://users.wenglab.org/pratth/tf.hg19.tar.gz
+
+http://users.wenglab.org/pratth/tf.mm10.tar.gz
+
+
+Henry
+
+###############
+# Download files and metadata
+
+mkdir hg38 hg19 mm10
+
+wget -nd -P hg38 http://users.wenglab.org/pratth/tf.GRCh38.tar.gz
+wget -nd -P hg19 http://users.wenglab.org/pratth/tf.hg19.tar.gz
+wget -nd -P mm10 http://users.wenglab.org/pratth/tf.mm10.tar.gz
+
+# retrieve and unroll narrowpeak files
+
+cd hg19
+gunzip *.gz
+tar xvfz tf.hg19.tar
+ls *.bed.gz | wc -l
+# 1400
+
+ls *.bed.gz | head -1
+ENCFF002ROJ.bed.gz
+
+grep ENCFF108RWG ../portal/ENCODE_TF_optimal.hg19.tsv
+# found --> these are optimal (not conservative) peaks (confirm w/ Henry)
+
+# peek at a file
+
+gunzip -c ENCFF108RWG.* | head -4
+#chr5 14664291 14664587 . 719 . 5.86423 -1.00000 0.00437 148
+#chr2 86668055 86668351 . 1000 . 5.86602 -1.00000 0.00413 148
+
+# it's an unsorted narrowPeak. pValue not used. peak always at 148 (fixed size peaks: 296)
+
+# stash files
+mkdir peaks
+mv *.bed.gz peaks
+
+# peek at metadata
+
+head -2 metadata.tsv
+#file_id experiment_id antibody_id factor_name donor_id biosample_name lab_name
+#ENCFF125NZH ENCSR925QAW ENCAB697XQW 3xFLAG-HMG20B ENCDO000AAC HepG2 richard-myers
+
+# NOTE: need treatment (request from Henry, 12/4)
+
+# To uniquify, need donor (for normal tissues)
+
+# count factors
+awk -F"\t" '{print $3}' metadata.tsv | sort | uniq | wc -l
+# 481
+
+###############
+# Run configuration tool
+# (2019-03-20)
+# Redone to remove 3xFLAG and eGFP prefixes on factor names
+# (2019-03-25)
+
+cd hg19
+
+# create input files for config tool
+
+# re-use cell codes from previous run (doc is ~kent/src/hg/makeDb/doc/encodeRegHg19.txt)
+set prev = /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012/
+cp $prev/cellCodesTop.txt .
+
+# create file list with metadata, format is 4 tab-sep fields:
+# file cell+treatment+lab antibody target
+
+# per JK -- remove modified cell assays (3xFLAG, eGFP).
+# These will be in separate, per-cell-line tracks
+grep -v eGFP metadata.tsv| grep -v 3xFLAG > metadata.pruned.tsv
+perl ../makeFilelist.pl < metadata.pruned.tsv > fileCellAbTarget.tab
+wc -l fileCellAbTarget.tab
+# 1264 fileCellAbTarget.tab
+
+# run config tool (this takes a few minutes). Tools are in ~/kent/src/hg/regulate/
+
+regClusterBedExpCfg -useTarget -tabList \
+ fileCellAbTarget.tab -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg
+
+# NOTE: this tool is pretty useless w/o normalization. Consider updating next tool in
+# pipeline to take saner input
+
+###############
+# Run cluster tool
+# This takes some time (a few hours ?)
+
+hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! clusters.log &
+# Got 1264 sources
+# Got 338 factors
+
+wc -l clusters.bed
+# 10560472 clusters.bed
+
+# previous run (with modded cells)
+# 13214618 clusters.bed
+# ~13M
+
+hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! clusters.log &
+
+cat clusters.log
+### kent source version 378 ###
+#Loaded 1264 records from clusters.cfg
+#Got 1264 sources
+#Got 338 factors
+
+# Previous run (with modified cell types)
+#Got 1400 sources
+#Got 454 factors
+# NOTE: normalized track has much better dynamic range of scores -- similar to ENCODE 2 track
+
+# compare to previous -- 2x sources, 3x factors -> 3x cluster count. Seems reasonable
+wc -l $prev/clusters.target.bed
+# 4380444 clusters.target.bed
+# Got 690 sources
+# Got 161 factors
+# 2x sources, 3x factors -> 3x cluster count. Seems reasonable
+
+hgLoadSqlTab hg19 encode3RegTfbsExp ~/kent/src/hg/lib/expRecord.sql clusters.exps
+
+# Create inputTrackTable with columns to match trackDb setting 'inputTableFieldDisplay'.
+# e.g. cell factor treatment lab
+- 7 columns:
+
+perl ../makeInputs.pl < fileCellAbTarget.tab > clusters.inputs.tab
+hgLoadSqlTab hg19 encode3RegTfbsClusterInput \
+ ~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab
+
+# Compact to factorSource format
+set tools = ~/kent/src/hg/makeDb/hgBedsToBedExps
+(date; $tools/bedExpsToFactorSource.pl clusters.bed > clusters.factorSource.bed; date) >& makeFactorSource.log &
+# Mon Apr 1 15:01:59 PDT 2019
+# Mon Apr 1 16:19:20 PDT 2019
+
+# check for max score issue (RM #13224)
+$tools/factorSourceCheckScore.pl < clusters.factorSource.bed
+# Errors: 0 in 10560472 lines
+
+hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/factorSource.sql -renameSqlTable \
+ hg19 encode3RegTfbsCluster clusters.factorSource.bed
+
+# Read 10560472 elements of size 8 from clusters.factorSource.bed
+
+# compare coverage
+# TODO
+featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV3 encode3RegTfbsCluster
+#wgEncodeRegTfbsClusteredV3 12.740%, encode3RegTfbsCluster 27.457%, both 11.716%, cover 91.96%, enrich 3.35x
+
+# list cells
+awk '{print $2}' fileCellAbTarget.tab | sed 's/+.*//' | sort | uniq > cells.txt
+wc -l cells.txt
+#130 cells.txt
+
+# list factors for trackDb filterBy
+
+awk '{print $4}' fileCellAbTarget.tab | sort | uniq | sed 's/$/,\\/' > factors.trackDb
+wc -l factors.trackDb
+# 338 factors.trackDb
+
+cat > trackDb.ra << 'EOF'
+ track wgEncodeRegTfbsClusteredV3
+ shortLabel Txn Factor ChIP
+ longLabel Transcription Factor ChIP-seq (161 factors) from ENCODE with Factorbook Motifs
+ type factorSource
+ superTrack wgEncodeReg dense
+ sourceTable wgEncodeRegTfbsCellsV3
+ inputTrackTable wgEncodeRegTfbsClusteredInputsV3
+ inputTableFieldDisplay cellType factor antibody treatment lab
+ motifTable factorbookMotifPos
+ motifPwmTable factorbookMotifPwm
+ motifMapTable factorbookMotifCanonical
+ motifMaxWindow 50000
+ motifDrawDefault on
+ urlLabel Factorbook Link:
+ url http://www.factorbook.org/mediawiki/index.php/$$
+ idInUrlSql select value from factorbookGeneAlias where name='%s'
+ controlledVocabulary encode/cv.ra cellType=cell treatment=treatment lab=lab
+ visibility dense
+ useScore 1
+ priority 1.71
+ maxWindowToDraw 10000000
+ dataVersion ENCODE Mar 2012 Freeze
+ filterBy name:factor=\
+'EOF'
+
+###############
+# hg38
+
+# (2019-03-25 kate)
+
+cd ../hg38
+mkdir peaks
+mv tf.GRCh38.tar peaks
+cd peaks
+tar xvf tf.GRCh38.tar
+cd ..
+mv peaks/metadata.tsv .
+
+# config
+
+cp ../hg19/cellCodesTop.txt .
+
+#regClusterBedExpCfg -useTarget -tabList \
+ #fileCellAbTarget.tab -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg
+
+#wc -l clusters.cfg
+# 1391 clusters.cfg
+
+# cluster
+
+(date; hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps; date) >&! clusters.log &
+# ERR: couldn't open peaks/ENCFF520CZS.bed.gz , No such file or directory
+# NOTE: ENCFF520CZS not at ENCODE portal
+
+# remove from metadata file
+# also, per JK -- remove modified cell assays (3xFLAG, eGFP)
+grep -v ENCFF520CZS metadata.tsv | grep -v eGFP | grep -v 3xFLAG > metadata.pruned.tsv
+wc -l metadata.pruned.tsv
+# 1257
+perl ../makeFilelist.pl < metadata.pruned.tsv > fileCellAbTarget.tab
+wc -l fileCellAbTarget.tab
+# 1256 (tool removes header line)
+
+regClusterBedExpCfg -useTarget -tabList \
+ fileCellAbTarget.tab -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg
+
+(date; hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps; date) >&! clusters.log &
+
+#Mon Mar 25 16:31:45 PDT 2019
+### kent source version 378 ###
+#Loaded 1256 records from clusters.cfg
+#Got 1256 sources
+#Got 340 factors
+#
+#Mon Mar 25 17:20:24 PDT 2019
+
+# Previous run - included 3xflag and eGFP
+# 2019-03-20
+#Got 1390 sources
+#Got 453 factors
+# Files moved to withMods/ directory
+
+wc -l clusters.bed
+# 10565630 clusters.bed
+
+# compare to hg19
+# TODO
+
+# Compact to factorSource format
+set tools = ~/kent/src/hg/makeDb/hgBedsToBedExps
+(date; $tools/bedExpsToFactorSource.pl clusters.bed > clusters.factorSource.bed; date) >& makeFactorSource.log &
+
+cat makeFactorSource.log
+#Mon Apr 1 13:39:39 PDT 2019
+#Mon Apr 1 14:56:53 PDT 2019
+
+# Elapsed 1:17
+
+# check for max score issue (RM #13224)
+$tools/factorSourceCheckScore.pl < clusters.factorSource.bed
+# Errors: 0 in 10565630 lines
+
+hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/factorSource.sql -renameSqlTable \
+ hg38 encode3RegTfbsCluster clusters.factorSource.bed
+# Read 10565630 elements of size 8 from clusters.factorSource.bed
+
+# load experiments tables
+hgLoadSqlTab hg38 encode3RegTfbsExp ~/kent/src/hg/lib/expRecord.sql clusters.exps
+
+# Create inputTrackTable with columns to match trackDb setting 'inputTableFieldDisplay'.
+# e.g. cell factor treatment lab
+- 7 columns:
+
+perl ../makeInputs.pl < fileCellAbTarget.tab > clusters.inputs.tab
+hgLoadSqlTab hg38 encode3RegTfbsClusterInput \
+ ~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab
+
+# list factors for trackDb filterBy
+awk '{print $4}' fileCellAbTarget.tab | sort | uniq | sed 's/$/,\\/' > factors.trackDb
+wc -l factors.trackDb
+# 340 factors.trackDb
+
+# add to trackDb filterBy setting
+
+# list cells
+awk '{print $2}' fileCellAbTarget.tab | sed 's/+.*//' | sort | uniq > cells.txt
+wc -l cells.txt
+#129 cells.txt
+
+################
+# Load per-factor tables (needed for clusters track details
+# Consider also creating a track for these (composite with 1200 subtracks)
+
+cat > scorePeaks.csh << 'EOF'
+ set db = $1
+ set f = $2
+ set t = $f:r:r
+ set table = encode3TfbsPk$t
+ echo $table
+ zcat $f > $table.bed
+ bedScore -col=7 -verbose=2 -method=reg -uniform $f $table.scored.bed
+ hgLoadBed -noNameIx -trimSqlTable \
+ -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \
+ -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as $db $table $table.scored.bed
+ gzip -c $table.scored.bed > ../scoredPeaks/$table.bed.gz
+ rm $table.bed $table.scored.bed
+ end
+'EOF'
+
+cd hg19
+mkdir scoredPeaks
+cd peaks
+
+awk '{print $1}' ../fileCellAbTarget.tab | sed 's/peaks\///' | \
+ xargs -L 1 ../../scorePeaks.csh hg19 >&! ../scorePeaks.log &
+
+perl ../makeTrackDb.pl < clusters.inputs.tab > trackDb.ra
+
+cd ../hg38
+mkdir scoredPeaks
+cd peaks
+awk '{print $1}' ../fileCellAbTarget.tab | sed 's/peaks\///' | \
+ xargs -L 1 ../../scorePeaks.csh hg38 >&! ../scorePeaks.log &
+
+perl ../makeTrackDb.pl < clusters.inputs.tab > trackDb.ra
+
+paste cells.txt cells.txt > cellGroup.txt
+# edit for subgroup format
+
+####################
+# Motifs (hg38) from Henry Pratt at Zlab
+# 2019-03-13
+
+# Notes from Henry:
+
+#The structure is mostly the same as the existing tables: canonical.tsv contains lists
+# of canonical motifs for each factor, pwms.tsv contains the PWM list for the motifs,
+# and fimo.tsv contains a large list of occurrences for each motif. We have expanded
+# the number of canonical motifs in many cases to more than two, including some novel
+# motifs MEME discovered which aren't annotated in the databases I searched.
+# That's a primary area where we'll probably look to filter and/or merge PWMs to
+# reduce the size a little bit more."
+
+cd /hive/data/outside/encode3/tfbs/dac
+
+wget https://users.wenglab.org/pratth/hg38.motifs.tar.gz
+tar xvfz hg38.motifs.tar.gz
+cd hg38.motifs
+wc -l *
+# 470 canonical.tsv
+# 72022536 fimo.tsv
+# 1204 pwms.tsv
+
+# Hmm, 72M localizations... (was 12M deduped to 2M in previous track)
+
+# Format of localizations file?
+
+# from MEME doc (http://meme-suite.org/doc/fimo-output-format.html#tsv_results)
+#motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence
+#MA0060.1 NFYA chr2 60221163 60221178 - 18.75 3.36e-09 0.00195 CTCGGCCAATCAGAGC
+
+# So Henry's looks like:
+# chr5 138465927 138465955 ZNF263 0 + 17.3537 1.6e-07 4.69e-05
+# Probably this (will confirm)
+# chr start end factor 0 strand score p-value q-value
+
+# convert to chrom, start, end, name, float_Score, strand (bed6FloatScore)
+
+cd ..
+awk '{OFS="\t"; print $1, $2-1, $3, $4, $9, $6}' hg38.motifs/fimo.tsv > motifs.bed
+
+# try filtering out dupes
+
+awk '{$5=0; print}' motifs.bed | uniq > motifs.uniq.bed
+wc -l
+# 68223131
+# 68M -- not much help!
+
+# See if we can load
+
+sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.sorted.bed
+hgLoadBed hg38 -noSort -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \
+ encode3RegTfbsMotifs motifs.sorted.bed
+
+# check qvalue score range
+> select min(score) from encode3RegTfbsMotifs;
+| 0.000000000000002019999937447319 |
+>select max(score) from encode3RegTfbsMotifs;
+| 1 |
+select count(*) from encode3RegTfbsMotifs where score > .1
+| 9487460 |
+
+# verify above are OK (previous track they were dropped)
+
+
+# score and de-dup, as in earlier version of track
+
+cp /hive/data/genomes/hg19/bed/factorbook/dedupAndScore.pl .
+sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.resorted.bed
+perl dedupAndScore.pl < motifs.resorted.bed > motifs.load.bed
+hgLoadBed hg38 -noSort -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \
+ encode3FactorbookMotifPos motifs.load.bed
+
+# create load file for UCSC transRegCodeMotif
+# pwms.tsv is similar -- just lacks initial count field
+# NOTE: wc -l pwms.tsv
+1204 pwms.tsv
+
+vs. wc -l transRegCodeMotif.tab
+132 transRegCodeMotif.tab
+
+#10x as many pwms...
+
+# Investigate constructs -- do some of them also have antibodies (duped experiments) ?
+# (2019-03-20)
+#
+awk '{print $2, $4}' fileCellAbTarget.tab | sed 's/\+.* / /' | sed -e 's/3xFLAG-//' -e 's/eGFP-//' | sort | uniq | wc -l
+#1057
+awk '{print $2, $4}' fileCellAbTarget.tab | sed 's/\+.* / /' | sort | uniq | wc -l
+#1065
+
+# Yes, but only 8 out of 1000 experiments
+
+
+
+
| |