d87e5f63bc000d560c21bca638b6d5f3dee0173d kate Wed Apr 3 12:06:45 2019 -0700 Add scripts and make docs for ENCODE 3 TF binding site tracks. refs #21139 diff --git src/hg/makeDb/doc/encode3/tfbs.txt src/hg/makeDb/doc/encode3/tfbs.txt new file mode 100644 index 0000000..3bd8e8f --- /dev/null +++ src/hg/makeDb/doc/encode3/tfbs.txt @@ -0,0 +1,447 @@ +# ENCODE TFBS from DAC + +# (2018-12-04 in progress) kate + +# NOTE: ENCODE2 track is described in doc/encodeRegHg19.txt + +# from Henry Pratt, student in Zhiping's lab: + +Hi Kate, + + +Here are links to tarballs for all the narrowPeak files we have used for motif discovery in GRCh38, hg19, and mm10. Each one has a metadata.tsv with file ID, experiment ID, antibody ID, factor name, donor ID, cell type, and lab for each narrowPeak. Does this look good? Let me know if I need to tweak anything. + + +http://users.wenglab.org/pratth/tf.GRCh38.tar.gz + +http://users.wenglab.org/pratth/tf.hg19.tar.gz + +http://users.wenglab.org/pratth/tf.mm10.tar.gz + + +Henry + +############### +# Download files and metadata + +mkdir hg38 hg19 mm10 + +wget -nd -P hg38 http://users.wenglab.org/pratth/tf.GRCh38.tar.gz +wget -nd -P hg19 http://users.wenglab.org/pratth/tf.hg19.tar.gz +wget -nd -P mm10 http://users.wenglab.org/pratth/tf.mm10.tar.gz + +# retrieve and unroll narrowpeak files + +cd hg19 +gunzip *.gz +tar xvfz tf.hg19.tar +ls *.bed.gz | wc -l +# 1400 + +ls *.bed.gz | head -1 +ENCFF002ROJ.bed.gz + +grep ENCFF108RWG ../portal/ENCODE_TF_optimal.hg19.tsv +# found --> these are optimal (not conservative) peaks (confirm w/ Henry) + +# peek at a file + +gunzip -c ENCFF108RWG.* | head -4 +#chr5 14664291 14664587 . 719 . 5.86423 -1.00000 0.00437 148 +#chr2 86668055 86668351 . 1000 . 5.86602 -1.00000 0.00413 148 + +# it's an unsorted narrowPeak. pValue not used. peak always at 148 (fixed size peaks: 296) + +# stash files +mkdir peaks +mv *.bed.gz peaks + +# peek at metadata + +head -2 metadata.tsv +#file_id experiment_id antibody_id factor_name donor_id biosample_name lab_name +#ENCFF125NZH ENCSR925QAW ENCAB697XQW 3xFLAG-HMG20B ENCDO000AAC HepG2 richard-myers + +# NOTE: need treatment (request from Henry, 12/4) + +# To uniquify, need donor (for normal tissues) + +# count factors +awk -F"\t" '{print $3}' metadata.tsv | sort | uniq | wc -l +# 481 + +############### +# Run configuration tool +# (2019-03-20) +# Redone to remove 3xFLAG and eGFP prefixes on factor names +# (2019-03-25) + +cd hg19 + +# create input files for config tool + +# re-use cell codes from previous run (doc is ~kent/src/hg/makeDb/doc/encodeRegHg19.txt) +set prev = /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012/ +cp $prev/cellCodesTop.txt . + +# create file list with metadata, format is 4 tab-sep fields: +# file cell+treatment+lab antibody target + +# per JK -- remove modified cell assays (3xFLAG, eGFP). +# These will be in separate, per-cell-line tracks +grep -v eGFP metadata.tsv| grep -v 3xFLAG > metadata.pruned.tsv +perl ../makeFilelist.pl < metadata.pruned.tsv > fileCellAbTarget.tab +wc -l fileCellAbTarget.tab +# 1264 fileCellAbTarget.tab + +# run config tool (this takes a few minutes). Tools are in ~/kent/src/hg/regulate/ + +regClusterBedExpCfg -useTarget -tabList \ + fileCellAbTarget.tab -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg + +# NOTE: this tool is pretty useless w/o normalization. Consider updating next tool in +# pipeline to take saner input + +############### +# Run cluster tool +# This takes some time (a few hours ?) + +hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! clusters.log & +# Got 1264 sources +# Got 338 factors + +wc -l clusters.bed +# 10560472 clusters.bed + +# previous run (with modded cells) +# 13214618 clusters.bed +# ~13M + +hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! clusters.log & + +cat clusters.log +### kent source version 378 ### +#Loaded 1264 records from clusters.cfg +#Got 1264 sources +#Got 338 factors + +# Previous run (with modified cell types) +#Got 1400 sources +#Got 454 factors +# NOTE: normalized track has much better dynamic range of scores -- similar to ENCODE 2 track + +# compare to previous -- 2x sources, 3x factors -> 3x cluster count. Seems reasonable +wc -l $prev/clusters.target.bed +# 4380444 clusters.target.bed +# Got 690 sources +# Got 161 factors +# 2x sources, 3x factors -> 3x cluster count. Seems reasonable + +hgLoadSqlTab hg19 encode3RegTfbsExp ~/kent/src/hg/lib/expRecord.sql clusters.exps + +# Create inputTrackTable with columns to match trackDb setting 'inputTableFieldDisplay'. +# e.g. cell factor treatment lab +- 7 columns: + +perl ../makeInputs.pl < fileCellAbTarget.tab > clusters.inputs.tab +hgLoadSqlTab hg19 encode3RegTfbsClusterInput \ + ~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab + +# Compact to factorSource format +set tools = ~/kent/src/hg/makeDb/hgBedsToBedExps +(date; $tools/bedExpsToFactorSource.pl clusters.bed > clusters.factorSource.bed; date) >& makeFactorSource.log & +# Mon Apr 1 15:01:59 PDT 2019 +# Mon Apr 1 16:19:20 PDT 2019 + +# check for max score issue (RM #13224) +$tools/factorSourceCheckScore.pl < clusters.factorSource.bed +# Errors: 0 in 10560472 lines + +hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/factorSource.sql -renameSqlTable \ + hg19 encode3RegTfbsCluster clusters.factorSource.bed + +# Read 10560472 elements of size 8 from clusters.factorSource.bed + +# compare coverage +# TODO +featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV3 encode3RegTfbsCluster +#wgEncodeRegTfbsClusteredV3 12.740%, encode3RegTfbsCluster 27.457%, both 11.716%, cover 91.96%, enrich 3.35x + +# list cells +awk '{print $2}' fileCellAbTarget.tab | sed 's/+.*//' | sort | uniq > cells.txt +wc -l cells.txt +#130 cells.txt + +# list factors for trackDb filterBy + +awk '{print $4}' fileCellAbTarget.tab | sort | uniq | sed 's/$/,\\/' > factors.trackDb +wc -l factors.trackDb +# 338 factors.trackDb + +cat > trackDb.ra << 'EOF' + track wgEncodeRegTfbsClusteredV3 + shortLabel Txn Factor ChIP + longLabel Transcription Factor ChIP-seq (161 factors) from ENCODE with Factorbook Motifs + type factorSource + superTrack wgEncodeReg dense + sourceTable wgEncodeRegTfbsCellsV3 + inputTrackTable wgEncodeRegTfbsClusteredInputsV3 + inputTableFieldDisplay cellType factor antibody treatment lab + motifTable factorbookMotifPos + motifPwmTable factorbookMotifPwm + motifMapTable factorbookMotifCanonical + motifMaxWindow 50000 + motifDrawDefault on + urlLabel Factorbook Link: + url http://www.factorbook.org/mediawiki/index.php/$$ + idInUrlSql select value from factorbookGeneAlias where name='%s' + controlledVocabulary encode/cv.ra cellType=cell treatment=treatment lab=lab + visibility dense + useScore 1 + priority 1.71 + maxWindowToDraw 10000000 + dataVersion ENCODE Mar 2012 Freeze + filterBy name:factor=\ +'EOF' + +############### +# hg38 + +# (2019-03-25 kate) + +cd ../hg38 +mkdir peaks +mv tf.GRCh38.tar peaks +cd peaks +tar xvf tf.GRCh38.tar +cd .. +mv peaks/metadata.tsv . + +# config + +cp ../hg19/cellCodesTop.txt . + +#regClusterBedExpCfg -useTarget -tabList \ + #fileCellAbTarget.tab -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg + +#wc -l clusters.cfg +# 1391 clusters.cfg + +# cluster + +(date; hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps; date) >&! clusters.log & +# ERR: couldn't open peaks/ENCFF520CZS.bed.gz , No such file or directory +# NOTE: ENCFF520CZS not at ENCODE portal + +# remove from metadata file +# also, per JK -- remove modified cell assays (3xFLAG, eGFP) +grep -v ENCFF520CZS metadata.tsv | grep -v eGFP | grep -v 3xFLAG > metadata.pruned.tsv +wc -l metadata.pruned.tsv +# 1257 +perl ../makeFilelist.pl < metadata.pruned.tsv > fileCellAbTarget.tab +wc -l fileCellAbTarget.tab +# 1256 (tool removes header line) + +regClusterBedExpCfg -useTarget -tabList \ + fileCellAbTarget.tab -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg + +(date; hgBedsToBedExps -verbose=2 -dupeLetterOk clusters.cfg clusters.bed clusters.exps; date) >&! clusters.log & + +#Mon Mar 25 16:31:45 PDT 2019 +### kent source version 378 ### +#Loaded 1256 records from clusters.cfg +#Got 1256 sources +#Got 340 factors +# +#Mon Mar 25 17:20:24 PDT 2019 + +# Previous run - included 3xflag and eGFP +# 2019-03-20 +#Got 1390 sources +#Got 453 factors +# Files moved to withMods/ directory + +wc -l clusters.bed +# 10565630 clusters.bed + +# compare to hg19 +# TODO + +# Compact to factorSource format +set tools = ~/kent/src/hg/makeDb/hgBedsToBedExps +(date; $tools/bedExpsToFactorSource.pl clusters.bed > clusters.factorSource.bed; date) >& makeFactorSource.log & + +cat makeFactorSource.log +#Mon Apr 1 13:39:39 PDT 2019 +#Mon Apr 1 14:56:53 PDT 2019 + +# Elapsed 1:17 + +# check for max score issue (RM #13224) +$tools/factorSourceCheckScore.pl < clusters.factorSource.bed +# Errors: 0 in 10565630 lines + +hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/factorSource.sql -renameSqlTable \ + hg38 encode3RegTfbsCluster clusters.factorSource.bed +# Read 10565630 elements of size 8 from clusters.factorSource.bed + +# load experiments tables +hgLoadSqlTab hg38 encode3RegTfbsExp ~/kent/src/hg/lib/expRecord.sql clusters.exps + +# Create inputTrackTable with columns to match trackDb setting 'inputTableFieldDisplay'. +# e.g. cell factor treatment lab +- 7 columns:
+ +perl ../makeInputs.pl < fileCellAbTarget.tab > clusters.inputs.tab +hgLoadSqlTab hg38 encode3RegTfbsClusterInput \ + ~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab + +# list factors for trackDb filterBy +awk '{print $4}' fileCellAbTarget.tab | sort | uniq | sed 's/$/,\\/' > factors.trackDb +wc -l factors.trackDb +# 340 factors.trackDb + +# add to trackDb filterBy setting + +# list cells +awk '{print $2}' fileCellAbTarget.tab | sed 's/+.*//' | sort | uniq > cells.txt +wc -l cells.txt +#129 cells.txt + +################ +# Load per-factor tables (needed for clusters track details +# Consider also creating a track for these (composite with 1200 subtracks) + +cat > scorePeaks.csh << 'EOF' + set db = $1 + set f = $2 + set t = $f:r:r + set table = encode3TfbsPk$t + echo $table + zcat $f > $table.bed + bedScore -col=7 -verbose=2 -method=reg -uniform $f $table.scored.bed + hgLoadBed -noNameIx -trimSqlTable \ + -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ + -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as $db $table $table.scored.bed + gzip -c $table.scored.bed > ../scoredPeaks/$table.bed.gz + rm $table.bed $table.scored.bed + end +'EOF' + +cd hg19 +mkdir scoredPeaks +cd peaks + +awk '{print $1}' ../fileCellAbTarget.tab | sed 's/peaks\///' | \ + xargs -L 1 ../../scorePeaks.csh hg19 >&! ../scorePeaks.log & + +perl ../makeTrackDb.pl < clusters.inputs.tab > trackDb.ra + +cd ../hg38 +mkdir scoredPeaks +cd peaks +awk '{print $1}' ../fileCellAbTarget.tab | sed 's/peaks\///' | \ + xargs -L 1 ../../scorePeaks.csh hg38 >&! ../scorePeaks.log & + +perl ../makeTrackDb.pl < clusters.inputs.tab > trackDb.ra + +paste cells.txt cells.txt > cellGroup.txt +# edit for subgroup format + +#################### +# Motifs (hg38) from Henry Pratt at Zlab +# 2019-03-13 + +# Notes from Henry: + +#The structure is mostly the same as the existing tables: canonical.tsv contains lists +# of canonical motifs for each factor, pwms.tsv contains the PWM list for the motifs, +# and fimo.tsv contains a large list of occurrences for each motif. We have expanded +# the number of canonical motifs in many cases to more than two, including some novel +# motifs MEME discovered which aren't annotated in the databases I searched. +# That's a primary area where we'll probably look to filter and/or merge PWMs to +# reduce the size a little bit more." + +cd /hive/data/outside/encode3/tfbs/dac + +wget https://users.wenglab.org/pratth/hg38.motifs.tar.gz +tar xvfz hg38.motifs.tar.gz +cd hg38.motifs +wc -l * +# 470 canonical.tsv +# 72022536 fimo.tsv +# 1204 pwms.tsv + +# Hmm, 72M localizations... (was 12M deduped to 2M in previous track) + +# Format of localizations file? + +# from MEME doc (http://meme-suite.org/doc/fimo-output-format.html#tsv_results) +#motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence +#MA0060.1 NFYA chr2 60221163 60221178 - 18.75 3.36e-09 0.00195 CTCGGCCAATCAGAGC + +# So Henry's looks like: +# chr5 138465927 138465955 ZNF263 0 + 17.3537 1.6e-07 4.69e-05 +# Probably this (will confirm) +# chr start end factor 0 strand score p-value q-value + +# convert to chrom, start, end, name, float_Score, strand (bed6FloatScore) + +cd .. +awk '{OFS="\t"; print $1, $2-1, $3, $4, $9, $6}' hg38.motifs/fimo.tsv > motifs.bed + +# try filtering out dupes + +awk '{$5=0; print}' motifs.bed | uniq > motifs.uniq.bed +wc -l +# 68223131 +# 68M -- not much help! + +# See if we can load + +sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.sorted.bed +hgLoadBed hg38 -noSort -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ + encode3RegTfbsMotifs motifs.sorted.bed + +# check qvalue score range +> select min(score) from encode3RegTfbsMotifs; +| 0.000000000000002019999937447319 | +>select max(score) from encode3RegTfbsMotifs; +| 1 | +select count(*) from encode3RegTfbsMotifs where score > .1 +| 9487460 | + +# verify above are OK (previous track they were dropped) + + +# score and de-dup, as in earlier version of track + +cp /hive/data/genomes/hg19/bed/factorbook/dedupAndScore.pl . +sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.resorted.bed +perl dedupAndScore.pl < motifs.resorted.bed > motifs.load.bed +hgLoadBed hg38 -noSort -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ + encode3FactorbookMotifPos motifs.load.bed + +# create load file for UCSC transRegCodeMotif +# pwms.tsv is similar -- just lacks initial count field +# NOTE: wc -l pwms.tsv +1204 pwms.tsv + +vs. wc -l transRegCodeMotif.tab +132 transRegCodeMotif.tab + +#10x as many pwms... + +# Investigate constructs -- do some of them also have antibodies (duped experiments) ? +# (2019-03-20) +# +awk '{print $2, $4}' fileCellAbTarget.tab | sed 's/\+.* / /' | sed -e 's/3xFLAG-//' -e 's/eGFP-//' | sort | uniq | wc -l +#1057 +awk '{print $2, $4}' fileCellAbTarget.tab | sed 's/\+.* / /' | sort | uniq | wc -l +#1065 + +# Yes, but only 8 out of 1000 experiments + + + +