d87e5f63bc000d560c21bca638b6d5f3dee0173d kate Wed Apr 3 12:06:45 2019 -0700 Add scripts and make docs for ENCODE 3 TF binding site tracks. refs #21139 diff --git src/hg/makeDb/doc/encodeRegHg19.txt src/hg/makeDb/doc/encodeRegHg19.txt index 1196a19..d1a59a2 100644 --- src/hg/makeDb/doc/encodeRegHg19.txt +++ src/hg/makeDb/doc/encodeRegHg19.txt @@ -1,1753 +1,1754 @@ ####################################################################### # ENCODE Regulation track (In progress March 2010, very rough instructions # as am still defining how to build track...). JK # make root dir and link in our bad.bed file for filtering out # blacklisted and multiple mapping regions. mkdir -p /cluster/data/hg19/bed/wgEncodeReg cd /cluster/data/hg19/bed/wgEncodeReg ln -s /hive/users/kent/regulate/companion/mapping/bad.bed badRegions.bed # Create the DNAse peak clusters subtrack. # Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq # linked into directory /hive/users/kent/regulate/dnase/peaks mkdir dnase cd dnase mkdir peaks ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwDnase/*.narrowPeak.gz peaks # Process these into clusters in a bed file and load clusters into # table. /bin/ls -1 peaks/*.narrowPeak.gz > peak.lst uw02 peak.lst peak.table regCluster peak.table /dev/null peak.bed awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed hgLoadBed hg19 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed # Make wgEncodeRegDnaseClusteredInput table. Start with mdbQuery, and # then do some massaging since not completely in sync with file list. 
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg19 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \ wgEncodeRegDnaseClusteredInputs.tab # Make transcription factor binding site clusters. Start with Anshul's # nice uniform peak calls, downloaded from # ftp://ftp-private.ebi.ac.uk/byDataType/peaks/jan2011/spp/optimal # using the user name and password in the ENCODE wiki at: # http://genomewiki.ucsc.edu/EncodeDCC/index.php/Locations_of_ENCODE_Data # Put all of the ones that actually are based on two replicates into # /cluster/data/hg19/bed/wgEncodeReg/tfbs/peaks # Once that is done do a little work towards making the cluster input. cd /cluster/data/hg19/bed/wgEncodeReg/tfbs /bin/ls peaks/*.narrowPeak.gz > peak.lst regClusterMakeTableOfTables ans01 peak.lst partial.table # Unfortunately at this point only about 90% of the peak files actually # have metaDb data associated with them in metaDb, so run C program that # takes it from metaDb if possible, otherwise tries to get it from file # names. regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table # One more complication - there are often several cases where there is # more than one data set on the same cell+treatment+antibody combination. # For good reasons hgBedToBedExps only wants one. # This next bit filters them out based on a picks file that was hand # created - picksFromDupes. This was based on the criteria that # the Broad CTCF were used, and that Snyd was used in preference # to OpenChromatin, which in turn was used in preference over Haib. 
# The rationale for this is the Broad CTCF set has been in use for a # while with no complaints, and when examining the Pol2 signals, which # were with CTCF the main component of the Dupes, the Haib were the # weakest and the Snyd the strongest. cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab . wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table # Make three column file: fileName, cell+treatment, antibody awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab # Create config file out for clustering job, using a table assigning # cell lines+treatments to single letter codes edited by hand. In # the future may be able to automate this by having fixed upper case codes for # tier1&2 cell lines, and then just use first letter lower case codes # for rest. That is the pattern cellLetter2.tab follows anyway. cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab . regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter2.tab # Do the actual clustering hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps # Load database (reloaded 3/26/2012 to fix a relatively rare bug) hgLoadBed hg19 wgEncodeRegTfbsClustered peak.bed hgLoadSqlTab hg19 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab> awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table | sed 's/AlnRep/PkRep/'> wgEncodeRegTfbsClusteredInputs.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab # Handle the RNA seq, starting by pooling it and filtering out multiple # mapping and blacklisted regions. 
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqGm12878R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqH1hescR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHct116R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHsmmR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep2V2.bigWig stdout \ | 
bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHuvecR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqK562R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqMcf7R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqNhekR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph # Now have to normalize. This takes two passes. The first just creates # a tab-separated file with normalization values. A batch file is made # by running awk on the tab file, and this batch file executes the # second half, transforming the bedGraphs to normalized values. The # normalization standardizes all files as if the total signal integrated to 10 # billion over the genome. 
rm -f ../norm.tab foreach i (*SigPooled.bedGraph) echo processing $i echo -n $i " " >> ../norm.tab awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab end awk '{printf("colTransform 4 %s 0 %g ../normalized/%s\n", $1, $2, $1);}' ../norm.tab > ../norm.csh chmod a+x ../norm.csh mkdir ../normalized ../norm.csh # Make bigWigs cd ../normalized foreach i (*.bedGraph) echo processing $i bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw end # Link into /gbdb ln -s `pwd`/*.bw /gbdb/hg19/bbi # Tell database about it foreach i (*.bw) hgBbiDbLink hg19 $i:r /gbdb/hg19/bbi/$i end # Clean up cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn rm pooled/*.bedGraph normalized/*.bedGraph # Make metadata: (forgive the long line) # JK TODO - add Hepg2 and Helas3 from here on down mdbQuery out=ra "select * from hg19 where \ obj='wgEncodeCaltechRnaSeqNhlfR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2'" \ | sed 's/SigRep1V2/SigPooled/g' | sed 's/wgEncodeCal/wgEncodeRegTxnCal/' \ | grep -v ^replicate > mdb.ra # Now patch mdb.ra into ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha/wgEncodeReg.ra # At this point actually are done with this track, except for # all the trackDb work on the overlayed histone mod tracks. # The rest is for some coloring stuff that is not yet actually used. # Do work to figure out how to color cells. This is based on # hierarchical clustering of wgEncodeUwAffyAllExon. 
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/cellColors cd /hive/data/genomes/hg19/bed/wgEncodeReg # Start with some alas time consuming data cleanup to get rid of # overlaps - losing about 10% of the data in the process which is # totally ok. mkdir allExon cd allExon mkdir inLinks noOverlap onlyCommon ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwAffyExonArray/*.gz inLinks cd inLinks foreach i (*.gz) bedRemoveOverlap $i ../noOverlap/$i:r end cd .. # Now make up a file of regions present in all inputs for further data # cleanup bedCommonRegions noOverlap/*.* > commonExons.bed cd noOverlap foreach i (*.broadPeak) echo processing $i bedRestrictToPositions $i ../commonExons.bed ../onlyCommon/$i end rm -r noOverlap # Now pool replicates where possible mdbQuery "select fileName,cell,replicate from hg19 where composite='wgEncodeUwAffyExonArray' and view='SimpleSignal'" -out=ra \ | sed 's/\.gz//g' > replicate.ra encodeMergeReplicatesBatch replicate.ra onlyCommon merge.sh merged.ra merged mkdir merged chmod a+x merge.sh merge.sh # Resort merged output - somehow gets slightly messed by # encodeMergeReplicates. cd merged foreach i (*.broadPeak) echo sorting $i sort -k 1,1 -k 2,2n $i > tmp mv tmp $i end # Make file that converts fileName to cell raToTab merged.ra -cols=fileName,cell stdout \ | sed 's/wgEncode/merged\/wgEncode/' > fileToCell.tab # Now build the tree. Took 2.5 minutes ls -1 merged/*.broadPeak > clusterInput.lst regClusterTreeCells rename=fileToCell.tab clusterInput.lst cluster.tree cluster.distances ######################################################################### # 2012 ENCODE Regulation supertrack updates: #7526 (kate) ######################################################################### # DNase Clusters: #9954 (2013-01-11) # # Create the DNAse peak clusters track using AWG uniformly processed peaks (Jan 2011 freeze, Sep 2012 analysis pubs). 
# See doc/encodeAwgHg19.txt # There are 125 cell type+treatments represented -- UW, Duke, and Uw+Duke for 14 cell types # Previous track had 74 # Redoing filtering (2014-06-24) V3 version, to include single-cell clusters cd /hive/data/genomes/hg19/bed/wgEncodeReg mkdir -p dnaseAwg cd dnaseAwg set dir = `pwd` cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak ls *.narrowPeak > $dir/peak.lst # generate a normalization factor # output format: # <filename> 0 1 2 <scoreIx> <scoreFactor> # wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 . regClusterMakeTableOfTables awgDnase01 $dir/peak.lst $dir/peak.table -verbose=3 >&! $dir/regTable.out & regClusterMakeTableOfTables awgDnase01Hg19 $dir/peak.lst $dir/peak.meta.table -verbose=3 >&! $dir/regTable.meta.out & # Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster) regCluster $dir/peak.table /dev/null $dir/peak.bed # Read 125 sources from peak.table # 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes regCluster -bedSources $dir/peak.meta.table /dev/null $dir/peak.meta.bed # filter out low scoring, single peaks in cluster and load in database cd $dir awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed wc -l wgEncodeRegDnaseClusteredV2.bed # 1281988 wgEncodeRegDnaseClusteredV2.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed # V3 of track -- don't filter out single celltype peaks! 
# August 2014, KRR # filter out low scoring, but retain single cell type clusters awk '$5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV3a.bed # 1867665 wgEncodeRegDnaseClusteredV3a.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV3a wgEncodeRegDnaseClusteredV3a.bed # do same with metadata version, as we will actually load track with # this additional info, for VAI integration awk '$5 >= 100' peak.meta.bed > peak.meta.filtered.bed wc -l peak.meta.filtered.bed 1867665 peak.meta.filtered.bed # format to BED5+floatscore+sources for hgBedSources # which will extract, uniquify, and assign ID's to sources awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, 0, $7;}' peak.meta.filtered.bed > peak.meta.bed6 hgBedSources peak.meta.bed6 regDnase mv regDnaseSources.tab wgEncodeRegDnaseClusteredSources.tab # Load sources table autoSql $HOME/kent/src/hg/lib/idName.as idName hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredSources idName.sql wgEncodeRegDnaseClusteredSources.tab # merge files and format to BED5+sourceCount+sourceIds+sourceVals awk '{print $8}' peak.meta.filtered.bed > peak.vals awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, $7, $8;}' regDnase.bed | paste - peak.vals > wgEncodeRegDnaseClusteredV3.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV3 -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/bed5SourceVals.as wgEncodeRegDnaseClusteredV3.bed # NOTE: V1 had ~1M clusters, V2 has 1.2M, V3 has 1.9M # It would be nice if hgc sorted datasets by signal or cellType # Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting) # was: objName, cellType, treatment, replicate, lab, dateUnrestricted # For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also, # this data has all passed timestamp restriction, so don't need to show # objName, cellType, treatment, lab(s) # DISPLAY: Consider item name change to % assays represented instead of count. # Also, could color item name label to highlight ubiquitous vs. 
cell-type-specific # (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?) # Make wgEncodeRegDnaseClusteredInputV2 table. mdbQuery out=tab "select obj,cell,treatment,lab,dccAccession from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" > wgEncodeRegDnaseClusteredInputsV2.tab hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputsV2 \ ~/kent/src/hg/lib/clusterInputAwgDnase.sql wgEncodeRegDnaseClusteredInputsV2.tab # Set up downloads cd $dir mkdir downloads gzip -c wgEncodeRegDnaseClusteredV2.bed > downloads/wgEncodeRegDnaseClusteredV2.bed.gz gzip -c wgEncodeRegDnaseClusteredInputsV2.tab > downloads/wgEncodeRegDnaseClusteredInputsV2.tab.gz gzip -c wgEncodeRegDnaseClusteredV3.bed > downloads/wgEncodeRegDnaseClusteredV3.bed.gz cp wgEncodeRegDnaseClusteredSources.tab downloads # use (existing) ENCODE DCC downloads dir set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered cd $dload # call existing stuff release1 (use ENCODE-style downloads scheme) mkdir release1 ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz release1 cp README.txt md5sum.txt files.txt release1 # new release dir mkdir release2 ln -s release2 beta ln -s release2 releaseLatest cd $dir/downloads ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz $dload cd $dload ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz releaseLatest # include older files as well ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz releaseLatest cp README.txt releaseLatest cd releaseLatest md5sum *.gz > md5sum.txt # vim README.txt cp README.txt md5sum.txt .. 
mkdir release3 rm beta releaseLatest ln -s release3 beta ln -s release3 releaseLatest cd $dir/downloads ln wgEncodeRegDnaseClusteredV3.bed.gz wgEncodeRegDnaseClusteredSources.tab $dload cd $dload ln wgEncodeRegDnaseClusteredV3.bed.gz wgEncodeRegDnaseClusteredSources.tab releaseLatest cp README.txt releaseLatest cd releaseLatest md5sum *.gz > md5sum.txt # vim README.txt cp README.txt md5sum.txt .. # add minimal metaObject to metaDb to associate new table with (pseudo-)composite # wgEncodeRegDnaseClustered -- this allows hgTrackUi to locate downloads cd ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha cat > wgEncodeRegDnaseClustered.ra << 'EOF' metaObject wgEncodeRegDnaseClusteredV2 objType table composite wgEncodeRegDnaseClustered fileName wgEncodeRegDnaseClusteredV2.bed.gz tableName wgEncodeRegDnaseClusteredV2 project wgEncode 'EOF' cat > wgEncodeRegDnaseClustered.ra << 'EOF' metaObject wgEncodeRegDnaseClusteredV3 objType table composite wgEncodeRegDnaseClustered fileName wgEncodeRegDnaseClusteredV3.bed.gz tableName wgEncodeRegDnaseClusteredV3 project wgEncode 'EOF' ######################################################################### # Preliminary/Experimental/Older work below ######################################################################### # ENCODE Regulation track V2: Track #7526 (In progress 2012-06-20 kate) # This version will have: # * Replace Tx with CSHL stranded data, pooled (JK advising, also AWG use) # http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformRNASignal.txt # These are from DCC wgEncodeCshlLongRnaSeq downloads (Long polyA+, whole cell), # AWG did not pool reps or plus/minus for their hub (but we will) # AWG has filtered for bad regions so no longer necessary # * Possibly a 10th cell type (HMEC) for Tx and Histone # Note: single replicate for HMEC Tx (others have 2) # * Two additional cell lines for Histones (Hela, HepG2) # - Note Hela & HepG2 already added to Tx track (July2012 release) # * Histones from uniform processing: 
H3K4me1, H3K4me3, H3K27ac # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/) # wgEncodeBroadHistone{cell}{histone}StdAln_?Reps.norm5.rawsignal.bw # * Add latest TF's, and use AWG uniform peak calls # * Add latest DNase calls (possibly use AWG uniform) # (http://www.uwencode.org/public/rthurman/encode_integrated_open_chrom/data/dataHub) # AWG trackDb: http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt # - can use subGroups line to determine cell type, but need to take care # with treatments, which are in filename but not subGroups # - NOTE: AWG includes Duke data processed like UW. Where both # labs had data, they are merged. View is .01FDR peaks (narrow, bigbed 6+) # Previous track: 74 cell types (2 reps), all UW. This version: 125 cell types, pooled replicates, merged UW+Duke # Also considering adding new data types from AWG hub: # - Short RNA's # - AWG uniform RNA contigs # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/rna_contigs/) # - Segmentations (chromHMM, segWay, or combined) # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/segmentations) # - FAIRE (cell-specific, union over 25 cell types) # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/faire/bigBed/) # NOTE: following Jim's methods from initial version # make root dir and link in our bad.bed file for filtering out # blacklisted and multiple mapping regions. cd /hive/data/genomes/hg19/bed mkdir wgEncodeRegV2 cd wgEncodeRegV2 ln -s /hive/data/genomes/hg19/bed/wgEncodeReg/badRegions.bed . 
#(was /hive/users/kent/regulate/companion/mapping/bad.bed) ###### # Small RNA's # For signal, pool strands and replicates ###### cd /hive/data/genomes/hg19/bed/ mkdir -p wgEncodeRegV2/shortRna/pooled cd wgEncodeRegV2/shortRna/pooled cat > list.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hmec Hsmm Huvec K562 Nhek Nhlf) echo "processing ${cell}" ls /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotal*.bigWig 'EOF' csh list.csh >&! list.out & # There are 6 overlapping with histones and long RNAs. 4 other tier2's. cat > pool.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 K562 Nhek A549 Cd20 Imr90 Mcf7) echo "processing ${cell}" bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep2.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep2.bigWig \ wgEncodeRegTxnCshlShortRnaSeq${cell}CellTotalRawSigPooled.bedGraph end 'EOF' csh pool.csh >&! pool.out & # got 8 cells, two failed -- CD20 and IMR90, may be rescuable, but continuing # for now with others to see if it's worth it. cat > big.csh << 'EOF' foreach i (*.bedGraph) echo processing $i bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw end 'EOF' csh big.csh >&! big.out & # add to database set dir = /gbdb/hg19/bbi/encodeRegV2 mkdir $dir ln -s `pwd`/*.bw $dir foreach i (*.bw) hgBbiDbLink hg19 $i:r $dir/$i end ###### # Handle the RNA seq, starting by pooling it and filtering out multiple # mapping and blacklisted regions. 
###### cd /hive/data/genomes/hg19/bed/ mkdir -p wgEncodeRegV2/txn/pooled cd wgEncodeRegV2/txn/pooled cat > poolTx.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf) echo "processing ${cell}" bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep2.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep2.bigWig \ wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph end 'EOF' csh poolTx.csh >&! poolTx.out & # ~8 minutes per cell type # NOTE: 25 chroms in Hepg2, H1hesc, Huvec, Nhlf (male or unknown sex) # Add in HMEC, which has single replicates cat > poolHmec.csh << 'EOF' foreach cell (Hmec) echo "processing ${cell}" bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \ wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph end 'EOF' csh poolHmec.csh >&! poolHmec.out & # Normalize. This takes two passes. The first creates # a tab-separated file with normalization values. A batch file is made # by running awk on the tab file, and this batch file executes the # second half, transforming the bedGraphs to normalized values. The # normalization standardizes all files as if the total signal integrated to 10 # billion over the genome. rm -f ../norm.tab cat > makeNorm.csh << 'EOF' foreach i (*SigPooled.bedGraph) echo processing $i echo -n $i " " >> ../norm.tab awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab end awk '{printf("echo %s; colTransform 4 %s 0 %g ../normalized/%s\n", $1, $1, $2, $1);}' ../norm.tab > ../norm.csh 'EOF' csh makeNorm.csh >&! makeNorm.out & # ~2 mins per cell type mkdir ../normalized csh ../norm.csh >&! 
../norm.out & #### STOPPED HERE cd ../normalized # make bigWigs cat > big.csh << 'EOF' foreach i (*.bedGraph) echo processing $i bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw end 'EOF' csh big.csh >&! big.out & # add to database set dir = /gbdb/hg19/bbi/encodeRegV2 mkdir $dir ln -s `pwd`/*.bw $dir foreach i (*.bw) hgBbiDbLink hg19 $i:r $dir/$i end ###### # Possible new track: FAIRE Peaks Union across 25 cells, from AWG ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p faire/peaks cd faire/peaks wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformFaire.txt sed -n 's/bigDataUrl /wget /p' uniformFaire.txt | grep Union > wget.csh csh wget.csh >&! wget.out & mv *Union*.bb encodeAwgFairePeaks25CellUnion.bb bigBedToBed encodeAwgFairePeaks25CellUnion.bb encodeAwgFairePeaks25CellUnion.bed # head -1 *.bed chr1 10033 10376 # bed 3 hgLoadBed hg19 encodeAwgFairePeaks25CellUnion encodeAwgFairePeaks25CellUnion.bed # Read 2369615 elements of size 3 from encodeAwgFairePeaks25CellUnion.bed ###### # Create the DNAse peak clusters subtrack. ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p dnase/peaks cd dnase wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh csh wget.csh >&! wget.out & # 125 files (.bb), dated Nov 22, 2011 # Sample case had ~150K peaks (by bigBedInfo) # These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats #$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head chr1 10180 10330 . 0 . 
26.000000 -1 -1 -1 # NOTE: Steve Wilder is planning to add scores and make formats of these files more uniform # rename to UCSC style and convert to bed: wgEncodeAwgDnase<lab><cell>Peak ls *.fdr01peaks.hg19.bb | \ perl -wpe 's/(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)/bigBedToBed $1$2$3$4$5 $1Awg$3\u\L$2\E\u\L$4\EPeak.bed/' > unbig.csh # convert to bed and rename csh unbig.csh >&! unbig.out # udc couldn't read 4 bytes from wgEncodeUWDukeDnaseHepG2.fdr01peaks.hg19.bb, did read 0 # This dataset was truncated # Repeated wget and bigBedToBed for HepG2 (8/23) ls -1 *AwgDnase*.bed > peak.lst # Datasets with treated cells (hints from long label) grep '(' uniformDnase.txt | grep longLabel > treated.txt wc -l treated.txt # 7 datasets # Tamoxifen -> Tam10030 (Duke) or 40HAM20nM72hr (UW), Estradiol -> Est10nm30m, Ifna4h, Andro, Hypox # TODO: Edit to rename # generate a normalization factor to # convert signalValue to score (0-1000). Also note score field; narrowPeak is 7 (default) # output format: # <filename> 0 1 2 <scoreIx> <scoreFactor> # wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 . regClusterMakeTableOfTables awgDnase01 peak.lst peak.table -verbose=3 >&! 
regTable.out & # Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster) regCluster peak.table /dev/null peak.bed #Read 125 sources from peak.table # 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes # filter out low scoring, single peaks in cluster and load in database awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed wc -l wgEncodeRegDnaseClusteredV2.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed # Read 1281988 elements of size 5 from wgEncodeRegDnaseClusteredV2.bed # load individual peak files as a track (needed by hgc) # TODO: generate labels out of metadata cat > loadDnase.csh << 'EOF' set b = wgEncodeAwgDnase set t = ${b}Peak foreach f (${b}*) set table = $f:r hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f set cell = `echo $f | perl -wpe 's/(wgEncodeAwgDnase[A-Z][a-z]+)(.*)Peak.bed/$2/'` echo " track $table" >> $t.ra echo " type narrowPeak" >> $t.ra echo " parent $t" >> $t.ra echo " shortLabel $cell DNase" >> $t.ra echo " longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t.ra echo " subGroups cellType=$cell treatment=None" >> $t.ra echo "" >> $t.ra end 'EOF' # NOTE: V1 had ~1M clusters. # Other data tables: Need to load the peak files as they are used by hgc # These should probably also be subtracks. # It would be nice if hgc sorted datasets by signal or cellType # Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting) # was: objName, cellType, treatment, replicate, lab, dateUnrestricted # For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also, # this data has all passed timestamp restriction, so don't need to show # objName, cellType, treatment, lab(s) # DISPLAY: Consider item name change to % assays represented instead of count. 
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific # (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?) ###### # Uniform histone signal for overlay wiggles # Note: there are UW signal files as well -- omitting these for consistency, # Check if merging would be a better option ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p histone/signal cd histone/signal cat > getHistones.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf Hmec) foreach histone (H3k4me1 H3k4me3 H3k27ac) set var = "wgEncodeBroadHistone${cell}${histone}" echo $var curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/${var}StdAln_\[1-3\]Reps.norm5.rawsignal.bw end end 'EOF' csh getHistones.csh >&! getHistones.out & # save real files (>1k size) as curl creates info files when match used mkdir sav find . -name '*.bw' -a -size +1k -exec mv '{}' sav \; rm *.bw mv sav/* . rmdir sav # note: missing Hela and HepG2 H3K4me1 - these have different filename pattern (check with Anshul why) # use curl to retrieve by pattern match as we don't have read perms on directory. Then remove non-useful files curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHelas3H3k4me1Std_\[1-3]Reps.norm5.rawsignal.bw # good one is '*Std_1Rep', remove 2Rep and 3Rep rm *Hela*Std_[2-3]* wget http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHepg2H3k4me1Std_1Reps.norm5.rawsignal.bw ###### # TFBS from Jan2011 freeze ###### # The TFBS track is a specialized type: factorSource. Requires 3 tables: # track table: wgEncodeRegTfbsClustered (bed15) # source table: wgEncodeRegTfbsCells # Maps cell type (including treatment) to single letter # Currently, there are fixed unique assignments for Tier1 and 2. # There are all upper case. 
# Tier 3 just use first char of cell name, lower-cased. Not unique at all! # Looks like only rows 2 and 3 of the table are actually used/informative # Ex. row 0 G GM12878 n/a n/a n/a 3 n/a,n/a/,GM12878, # inputTrack (metadata) table: wgEncodeRegTfbsClusteredInputs # DISPLAY: Better with target protein instead of antibody ? # DISPLAY: Consider item showing letter for all experiments, with name colored (brown ?) if this peak # found in the cell type # DISPLAY: fix ordering of cell letters -- they should be sorted # (in cellLetter.tab first, so Exps files has correct id's) cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p tfbs/peaks cd tfbs/peaks wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformTfbs_nocontrols.txt # this file has metadata # there are 3 'method' values: SPP, PeakSeq, wiggler # there are 2 'quality' values: Good, Caution # There are 144 antibodies (for 122 factors), 75 cell types, 95 cell types plus treatments, 457 datasets # Version 1 of this track had 148 antibodies, 67 cell types, 95 cell type+treatments, 486 datasets # -> basically the same data, but cleaned up sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep Tfbs > wget.csh sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep -v Tfbs > wget2.csh csh wget.csh >&! wget.out & # 416 files csh wget2.csh >&! wget2.out & # 41 (have Histone in filename) # sample filename: spp.optimal.wgEncodeUwTfbsHcfaaCtcfStdAlnRep1_VS_wgEncodeUwTfbsHcfaaInputStdAlnRep1.bb # from hub trackDb, metadata is in subGroups line: # subGroups tier=3 cellType=HCFaa factor=CTCF antibody=CTCF lab=UW treatment=None protocol=Std quality=Good method=SPP # Note 'quality'. Think about dropping those with quality='Caution'. Unfortunately some # Tier1 , esp. Pol2 are only available with this quality # Q: What does Rep1 mean ? 
Most are 'Rep0' (Answer from Anshul -- Rep0 are pooled, Rep1 were single-rep but adequate qual) # (Jim's methods excluded those not based on 2 replicates) # Q: Why are some datasets 'StdAln' and some are 'Aln' ? (Answer from Anshul -- names are from UCSC) # Q: How is quality assigned ? # rename to UCSC style and convert to bed: wgEncodeAwgTfbs<lab><cell>Peak cd .. ls peaks/spp.optimal.*.bb | \ perl -wpe 's^peaks/(spp.optimal.)(wgEncode)(\S+)(Tfbs)(\S+)(StdAln|Aln)(Rep\d_VS\S+)^bigBedToBed peaks/$1$2$3$4$5$6$7 peaks/$2Awg$4$3$5Peak.bed^' > unbig.csh # convert to bed and rename csh unbig.csh >&! unbig.out & # NOTE: ~10% (49 of 457 total) are dupes from different labs # Factors are: CFOS, CMYC, CTCF, E2F6, EBF1, FOXA1, GATA2, JUND, MAFK # MAX, POL2, RAD21, SRF, YY1 ls -1 peaks/*.bed > peak.lst ###### # TFBS from July 2012 freeze # Uniform processing with slightly more relaxed threshold (2% FDR vs 1%) based on # input Anshul received from users. And there are more data sets (690 after removing # quality failures vs. 495 in previous freeze) # Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering # (as in previous track). 
###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p tfbs2012 cd tfbs2012 set dir = `pwd` mkdir logs # use high-quality uniform peaks used to build Uniform TFBS track # (see doc/encodeAwgHg19.txt) # This will require much less work than previous build, as much of the processing # was done to make this predecessor track (uniform naming & normalized scoring, extracting metadata) ln -s /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads peaks ls peaks/*.narrowPeak.gz > peak.lst wc -l peak.lst # 690 peak.lst # extract metadata for files, to format: # filename cell[+treatment] antibody cat > getMeta.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$p\t" # note: must strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,antibody stdout | \ perl -wne '($cell, $treat, $ab) = split; $cellTreat = $cell . "+" . $treat; \ $cellTreat =~ s/\+None//; print $cellTreat, "\t", $ab, "\n";' end 'EOF' csh getMeta.csh peak.lst > fileCellAb.withDupes.tab # alternate extraction of metadata - include lab, so we can be inclusive of max data # filename cell[+treatment]+lab antibody cat > getMetaBig.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$p\t" # note: must strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody stdout | \ perl -wne '($cell, $treat, $lab, $ab) = split; $cellTreat = $cell . "+" . $treat . "+" . 
$lab; \ $cellTreat =~ s/\+None//; print $cellTreat, "\t", $ab, "\n";' end 'EOF' csh getMetaBig.csh peak.lst > fileCellAb.tab # assign upper case codes to cell types in tiers 1, 2, and 2.5 # (tier 3's will all be assigned lower case first char) # note: treated cells have same code as untreated cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab cellCodesTop.txt # hand-edit in codes, promoting Tier 2.5 to caps (A549, MCF-7, SK-N-SH, IMR90) # create config file for clustering regClusterBedExpCfg -noNormalize -tabList fileCellAb.withDupes.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg # format is: # antibody cell[+treatment] cell_char file score_column normalization filename # CTCF H1-hESC 1 file 4 1 peaks/wgEncodeAwgTfbsBroadH1hescCtcfUniPk.narrowPeak.gz regClusterBedExpCfg -noNormalize -tabList fileCellAb.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.all.cfg # format is: # antibody cell[+treatment]+lab cell_char file score_column normalization filename # CTCF H1-hESC 1 file 4 1 peaks/wgEncodeAwgTfbsBroadH1hescCtcfUniPk.narrowPeak.gz # do clustering hgBedsToBedExps -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! logs/clusters.out & hgBedsToBedExps -dupeLetterOk clusters.all.cfg clusters.bed clusters.exps >&! logs/clusters.all.out & # 4 dupes (HAIB A549+Etoh/USF1, HepG2/NRSF, SK-N-SH/NRSF -- these are V04* vs. Pcr*) # and SYDH K562/Pol2 -- Iggrab control vs none indicated # hand-edit fileCellAb.tab to add V04 and Iggrab to cell field to uniquify regClusterBedExpCfg -noNormalize -tabList fileCellAb.uniq.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.uniq.cfg # cluster again, using uniquified config file hgBedsToBedExps -dupeLetterOk clusters.uniq.cfg clusters.bed clusters.exps >&! logs/clusters.uniq.out & cat logs/clusters.uniq.out #Loaded 690 records from clusters.uniq.cfg #Got 194 sources #Got 189 factors ## Success! 
wc -l clusters.bed # 4910269 clusters.bed # Load database, first clusters hgLoadBed hg19 wgEncodeRegTfbsClusteredV4 clusters.bed # Read 4910269 elements of size 15 from clusters.bed # was 4735242 in V3 # was 2750490 in V2 # Load sourceTable hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV4 ~/kent/src/hg/lib/expRecord.sql clusters.exps # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab> cat > getInputs.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$obj\t" # note: strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody stdout | \ perl -wne '($cell, $treat, $lab, $ab) = split; \ $source = $cell . "+" . $treat . "+" . $lab; \ $source =~ s/\+None//; \ print $source, "\t", $ab, "\t", $cell, "\t", $treat, "\t", $lab, "\n";' end 'EOF' csh getInputs.csh peak.lst > wgEncodeRegTfbsClusteredInputsV4.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV4 ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputsV4.tab # Clustering by target cat > getMetaHuge.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$p\t" # note: must strip off treatment=None mdbPrint hg19 -obj=$obj | \ raToTab stdin -cols=cell,treatment,lab,antibody,target stdout | \ perl -wne '($cell, $treat, $lab, $ab, $target) = split; \ $source = $cell . "+" . $treat . "+" . $lab; \ $source =~ s/\+None//; print $source, "\t", $ab, "\t", $target, "\n";' end 'EOF' csh getMetaHuge.csh peak.lst > fileCellAbTarget.tab regClusterBedExpCfg -noNormalize -useTarget -tabList fileCellAbTarget.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.target.cfg # format is: # target cell[+treatment]+lab+ab cell_char file score_column normalization filename hgBedsToBedExps -dupeLetterOk clusters.target.cfg clusters.target.bed clusters.target.exps >&! 
\ logs/clusters.target.out & # 4 dupes: 3 HudsonAlpha (2 protocols), 1 Stanford (2 controls) # hand-edit fileCellAbTarget.tab to add V04 and Iggrab to cell field to uniquify # create cluster config file regClusterBedExpCfg -noNormalize -useTarget -tabList fileCellAbTarget.uniq.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.target.cfg # next step is the actual clustering, and takes some time (minutes) hgBedsToBedExps -dupeLetterOk clusters.target.cfg clusters.target.bed clusters.target.exps >&! \ logs/clusters.target.out & #Loaded 690 records from clusters.target.cfg #Got 690 sources #Got 161 factors hgLoadBed hg19 wgEncodeRegTfbsClusteredV6 clusters.target.bed ln -s clusters.target.bed wgEncodeRegTfbsClusteredV6.tab # V5 with separate HA-E2F1 factor: Read 4385242 elements of size 15 from clusters.target.bed # Read 4380444 elements of size 15 from clusters.target.bed hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV6 ~/kent/src/hg/lib/expRecord.sql clusters.target.exps ln -s clusters.targets.exps wgEncodeRegTfbsCellsV6.tab # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab> # For target-based, include antibody in source field cat > getTargetInputs.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$obj\t" # note: strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody,target stdout | \ perl -wne '($cell, $treat, $lab, $ab, $target) = split; \ $source = $cell . "+" . $treat . "+" . $lab . "+" . $ab; \ $source =~ s/\+None//; \ print $source, "\t", $target, "\t", $ab, "\t", \ $cell, "\t", $treat, "\t", $lab, "\n";' end 'EOF' csh getTargetInputs.csh peak.lst > wgEncodeRegTfbsClusteredInputsV6.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV6 \ ~/kent/src/hg/lib/clusterInputTrackTable5.sql wgEncodeRegTfbsClusteredInputsV6.tab # compare coverage of V2-V6 (V5 is same as 6, except clusters HA-E2F1 as separate target) # V2 - current version. 
June 2011 TFBS (425 inputs) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV2 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV2 7.797%, both 4.706%, cover 35.14%, enrich 4.51x featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV2 wgEncodeRegDnaseClusteredV2 # wgEncodeRegTfbsClusteredV2 7.797%, wgEncodeRegDnaseClusteredV2 13.390%, both 4.706%, cover 60.36%, enrich 4.51x # 'V3' - clustered by antibody. Mar 2012 TFBS, datasets thinned to uniquify (e.g. no multi-lab) # (635 inputs) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV3 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV3 12.855%, both 6.578%, cover 49.13%, enrich 3.82x # 'V4' - clustered by antibody. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV4 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV4 13.156%, both 6.690%, cover 49.96%, enrich 3.80 # 'V5' - clustered by target. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV5 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV5 13.156%, both 6.690%, cover 49.96%, enrich 3.80x # 'V6' - clustered by target. 
Mar 2012 TFBS, all datasets passing Anshul's qual filter (690) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV6 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV6 13.156%, both 6.690%, cover 49.96%, enrich 3.80x featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV5 wgEncodeRegDnaseClusteredV2 # wgEncodeRegTfbsClusteredV5 13.156%, wgEncodeRegDnaseClusteredV2 13.390%, both 6.690%, cover 50.86%, enrich 3.80x # V6 is release target -- rename the antibody clustering to V3a, and rename V6 to V3 (use symlinks in build dir) # Set up downloads set dir = /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012 cd $dir mkdir downloads gzip -c wgEncodeRegTfbsClusteredV3.bed > downloads/wgEncodeRegTfbsClusteredV3.bed.gz # disallow name truncation gzip -c -n wgEncodeRegTfbsClusteredInputsV3.tab > downloads/wgEncodeRegTfbsClusteredInputsV3.tab.gz gzip -c wgEncodeRegTfbsCellsV3.tab > downloads/wgEncodeRegTfbsCellsV3.tab.gz # use (existing) ENCODE DCC downloads dir set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered cd $dload # new release dir mkdir release3 ln -s release3 beta ln -s release3 releaseLatest cd $dir/downloads ln wgEncodeRegTfbsClusteredV3.bed.gz wgEncodeRegTfbsClusteredInputsV3.tab.gz wgEncodeRegTfbsCellsV3.tab.gz $dload cd $dload ln wgEncodeRegTfbsClusteredV3.bed.gz wgEncodeRegTfbsClusteredInputsV3.tab.gz wgEncodeRegTfbsCellsV3.tab.gz releaseLatest # include older files as well cp wgEncodeRegTfbsClusteredV2.bed.gz wgEncodeRegTfbsClusteredInputs.tab.gz wgEncodeRegTfbsCells.tab.gz releaseLatest cp README.txt releaseLatest # clean up bogus names embedded in zips) # gunzip wgEncodeRegTfbsClusteredV2.bed.gz wgEncodeRegTfbsClusteredInputs.tab.gz wgEncodeRegTfbsCells.tab.gz md5sum *.gz > md5sum.txt # vim README.txt # use wgEncodeRegDnaseClustered for basics cp README.txt md5sum.txt .. 
# add minimal metaObject to metaDb to associate new table with (pseudo-)composite # wgEncodeRegTfbsClustered -- this allows hgTrackUi to locate downloads cd $dir cat > wgEncodeRegTfbsClustered.ra << 'EOF' metaObject wgEncodeRegTfbsClusteredV3 objType table composite wgEncodeRegTfbsClustered fileName wgEncodeRegTfbsClusteredV3.bed.gz tableName wgEncodeRegTfbsClusteredV3 project wgEncode 'EOF' # also, add aux tables, then mdbUpdate, mdbPrint # This script creates a BED5+ download file with clusters and cell info clusterAddSources.pl wgEncodeRegTfbsClusteredV3.bed wgEncodeTfbsClusteredInputsV3.tab > \ wgEncodeRegTfbsClusteredWithCellsV3.bed gzip -c -n wgEncodeRegTfbsClusteredWithCellsV3.bed > downloads/wgEncodeRegTfbsClusteredWithCellsV3.bed.gz cd downloads ln wgEncodeRegTfbsClusteredWithCellsV3.bed.gz $dload cd $dload ln wgEncodeRegTfbsClusteredWithCellsV3.bed.gz releaseLatest md5sum *.gz > md5sum.txt # vim README.txt cp README.txt md5sum.txt .. ############ # Fixes to Inputs table (2014-05-19 kate) # Matches manual changes to uniqify Cells table hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="A549+EtOH_0.02pct+HudsonAlpha+V04+USF-1" where tableName="wgEncodeAwgTfbsHaibA549Usf1V0422111Etoh02UniPk"' hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="HepG2+HudsonAlpha+V04+NRSF" where tableName="wgEncodeAwgTfbsHaibHepg2NrsfV0416101UniPk"' hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="SK-N-SH+HudsonAlpha+V04+NRSF" where tableName="wgEncodeAwgTfbsHaibSknshNrsfV0416101UniPk"' hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="K562+Stanford+Iggrab+Pol2(phosphoS2)" where tableName="wgEncodeAwgTfbsSydhK562Pol2s2IggrabUniPk"' ####################################################################### # performance eval # try changing expIds and expScores to varchar 2048 from blob # create table alter table kateTemp disable keys; insert into kateTemp (select * from wgEncodeTfbsClusteredV3) # try bigBed set dir 
= /cluster/data/hg19/bed/wgEncodeRegV2/tfbs2012 cd $dir set track = wgEncodeRegTfbsClusteredV3 bedSort clusters.target.bed clusters.target.sorted.bed /cluster/bin/x86_64/bedToBigBed -type=bed15 -tab clusters.target.sorted.bed \ /hive/data/genomes/hg19/chrom.sizes $track.bb # Error line 1 of xx.bed: expecting 690 elements in expIds list (bed field 14) # Might be a bogus error -- looks OK on inspection # GOT HERE ln -s `pwd`/$track.bb /gbdb/hg19/wgEncodeReg hgBbiDbLink hg19 $track wg/gbdb/hg19/wgEncodeReg/$track.bb # for now, adding maxWindowToDraw 1M setting to trackDb to limit performance hit # shrink table format to BED5+, with columns 6 and 7 non-sparse comma-sep lists # of exp id's and scores clusterShrinkExps.pl clusters.target.sorted.bed > clusters.shrunk.bed +# NOTE: moving this tool to makeDb/hgBedsToBedExps/bedExpsToFactorSource.pl hgLoadBed -sqlTable=~/kent/src/hg/lib/factorSource.sql -renameSqlTable \ hg19 wgEncodeRegTfbsClusteredShrunkMini test.shrink.bed hgLoadBed -sqlTable=~/kent/src/hg/lib/factorSource.sql -renameSqlTable \ hg19 wgEncodeRegTfbsClusteredShrunk clusters.shrunk.bed # rename Shrunk to V3 # copy to downloads cp clusters.shrunk.bed downloads/wgEncodeRegTfbsClusteredV3.bed cd downloads gzip wgEncodeRegTfbsClusteredV3.bed cd /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered/releaseLatest md5sum wgEncodeRegTfbsClusteredV3.bed >> md5sum.txt # and remove old entry # (see Wrangler HowTo in ENCODE2 wiki for these steps) # replace md5sum in md5sum.txt, metaDb # create files.txt encodeMkFilesList hg19 ####################################################################### # EXPERIMENTAL: TF motif tracks, from EMBL/Aus (MEME toolkit) # These are generated using FIMO tool: # Charles E. Grant, Timothy L. 
Bailey, and William Stafford Noble, "FIMO: Scanning for occurrences of a given motif", Bioinformatics 27(7):1017-1018, 2011 # http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full # http://meme.ebi.edu.au/meme/cgi-bin/fimo.cgi mkdir /cluster/data/hg19/bed/tfbsFimo cd /cluster/data/hg19/bed/tfbsFimo hgLoadBed hg19 tfbsFimoJaspar TfMotifsFIMO_Jaspar.hg19.bed # Read 1386302 elements of size 6 from TfMotifsFIMO_Jaspar.hg19.bed hgLoadBed hg19 tfbsFimoTransfac TfMotifsFIMO_Transfac.hg19.bed # Read 3665198 elements of size 6 from TfMotifsFIMO_Transfac.hg19.bed ####################################################################### # more motif exploratory work -- give Larry M's transRegCode a ride cd /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012 mkdir motifs cd motifs # lift motif localizations from hg18 (where did they come from originally ?) hgsql hg18 -Ne "select * from wgEncodeRegTfbsClusteredMotifs" | \ cut --complement -f 1 | \ liftOver -bedPlus=4 stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz \ motifs.bed motifs.unmapped hgLoadBed hg19 -as=~/kent/src/hg/lib/bed6FloatScore.as wgEncodeRegTfbsClusteredMotifs motifs.bed # Read 1750827 elements of size 6 from motifs.bed ####################################################################### # Table cleanup/renaming (more to come): TfbsClusteredV3a -> TfbsClusteredV3a_old TfbsClusteredV4 -> TfbsClusteredV3a TfbsClusteredV5 -> TfbsClusteredV3b ####################################################################### # Factorbook motifs (Jun 2013, and then Dec 2013-Feb 2014, kate) # Data consists of motif locations, position weight matrices, # and curation (which motifs are canonical for which factors). # Basis of the track is supplementary data from Wang et al. Sep2012 Genome Res paper, # plus newer data provided by Bong Hyun Kim (nghyun.kim nghyun.kim@umassmed.edu) # of Factorbook team at Zhiping Weng's lab (Zlab). 
# From Bong-Hyun: #These are discovered motifs from 2012 freeze, methods in GR 2012:22(9), Wang et al. #Basically the motif were discovered from 500 strongest signal peaks by MEME, and then they are tested if they are enriched compared to the flanking regions of the peaks or GC content matching regions. And then the motifs are condensed by manual curation. Then the peak regions are searched by the condensed motifs using FIMO. Note that UA (unannotated) motifs are new motif described in the genome research paper, but UAK's are potentially unannotated motifs found newer datasets and not yet carefully curated. cd /cluster/data/hg19/bed mkdir factorbook cd factorbook ################# # Motif locations within ENCODE peaks # Each line is the genomic location of a motif whose name is the 5th column. # The 6, 7 and 8th columns are q-value of the motif match, peak number (or bed file line number) # in the original uniform processed peak call, and peak summit, respectively. # # Load these as a BED table of merged data from all experiments, with q-value replaced with # -log10 score. Duplicate items are removed, with the item scored as maximum found. # (NOTE: this table could be expanded to include the sources as in factorSource tables -- this # would allow it to be used to show motifs in the individual experiment narrowPeak files) wget http://zlab.umassmed.edu/zlab/publications/TFBS.tar.gz # data from Sep2012 Genome Research paper (through Jan 2011 ENCODE freeze) wget http://zlab.umassmed.edu/~kimb/encode_tfbs.tar.gz # newer data (Jan 2011 - Mar 2012 ENCODE freeze) tar xvfz encode_tfbs.tar.gz cd encode_tfbs cat *.bed | wc -l # 6316631 # ls *.bed | wc -l # 294 # why so few ? -- asking Bong-hyun # NOTE: some beds are for controls! # NOTE: there are motifs for different factors in these peaks (not just target of assay) # Around 6M localizations in these files, some appearing multiply reflecting detection # in peaks from different experiments. 
These will be merged in the final table. # convert: # chrom, start, end, strand, name, qvalue -> # chrom, start, end, name, float_score, strand (bed6FLoatScore) cat encode_tfbs/*.bed pub_tfbs/*.summary | \ awk '{OFS="\t"; print $1, $2-1, $3, $5, $6, $4}' > motifs.withdupes.bed wc -l motifs.withdupes.bed # 12115922 bedSort motifs.withdupes.bed stdout | uniq > motifs.bed wc -l motifs.bed # 12022426 motifs.bed hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ wgEncodeRegTfbsClusteredMotifs_factorBook motifs.bed # Read 12022426 elements of size 6 from motifs.bed # TODO: remove multiple hits at same position, remove q-value (we will rescore later) awk '{$5=0; print}' motifs.bed | uniq > motifs.uniq.bed wc -l motifs.uniq.bed # 2631434 motifs.uniq.bed # 2.6M, down from 12M hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ wgEncodeAwgTfbsMotifs motifs.uniq.bed # Check range of qvalue: >select min(score) from wgEncodeAwgTfbsMotifs; | 0.000000002479999983151515 | >select max(score) from wgEncodeAwgTfbsMotifs; | 0.6370000243186951 | >select count(*) from wgEncodeAwgTfbsMotifs where score > .1 | 1065928 | # BongHyun says these are bad quality -- should drop >select count(*) from wgEncodeAwgTfbsMotifs; | 12022426 | # Reload table with -log10 scoring and motif name fixes (Jan 21, 2014 KRR) # (Two new canonicals were found that have Jaspar naming convention V_ prefix:) #NFIC NFIC (V_NF1_Q6) #NR4A1 NR4A1 (V_TR4_03) # Also, both v-Jun and v-JUN are used cat > dedupAndScore.pl << 'EOF' #!/usr/bin/env/perl # retain lowest scored item (q-value) at a location, converting score to -log10 # expects file to be pre-sorted with: sort -k1,4 -k6,6 -k5,5 sub minusLog10 { my $n = shift; return -log($n)/log(10); } $chrN = ""; while (<>) { chomp; ($chr, $start, $end, $name, $score, $strand) = split; if ($chr ne $chrN || $start ne $startN || $end ne $endN || $name ne $nameN || $strand ne $strandN) { $logScore = minusLog10($score); 
printf("%s\t%d\t%d\t%s\t%.2f\t%s\n", $chr, $start, $end, $name, $logScore, $strand); $chrN = $chr; $startN = $start; $endN = $end; $nameN = $name; $strandN = $strand; } } 'EOF' sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.sorted.bed perl dedupAndScore.pl < motifs.sorted.bed | \ # fix motif naming inconsistencies (V_ prefix) sed -e 's/v-Jun/v-JUN/' -e 's/V_NF1_Q6/NFIC/' -e 's/V_TR4_03/NR4A1/' > motifs.load.bed wc -l motifs.load.bed #2366151 motifs.load.bed hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ factorbookMotifPos motifs.load.bed ################# # Position weight matrices for the motifs # These are in MEME format wget http://zlab.umassmed.edu/~kimb/gr_paper_extended.meme # MOTIF A-Box #... # A 0.25000 C 0.25000 G 0.25000 T 0.25000 #... # letter-probability matrix: alength= 4 w= 21 nsites= 116 E= 5.4e-213 # 0.137931 0.189655 0.017241 0.655172 # 0.620690 0.051724 0.077586 0.250000 # ... # use w for column count. 1 probability row per column. order: acgt grep MOTIF gr_paper*meme | wc -l # 132 # total motifs cat > convertMotifs.pl << 'EOF' #!/usr/bin/env perl # convertMotifs - create load file for UCSC transRegCodeMotif table from MEME output ## MEME format: #MOTIF <name> #... #letter-probability matrix: alength= 4 w=<columns> #Aprob Cprob Gprob Tprob #... 
## TransRegCodeMotif format: #name columnCount aProb cProb gProb # where *Prob fields are comma-sep lists of length columnCount use warnings; use strict; my $name; my $columnCount; my $cols = 0; my $aProb, my $cProb, my $gProb, my $tProb; my @aProbs = (), my @cProbs = (), my @gProbs = (), my @tProbs = (); sub printProbs { # print comma-sep list of probabilities my ($probs) = @_; print "\t"; foreach my $prob (@$probs) { print "$prob,"; } } while (<>) { if (/^MOTIF (\S+)/) { $name = $1; } elsif (defined($name) && /^letter-probability matrix: alength= \d+ w= (\d+)/) { $columnCount = $1; } elsif (defined($columnCount)) { # working on probabilities if ($cols < $columnCount) { # slurp in probabilities $cols++; ($aProb, $cProb, $gProb, $tProb) = split; push(@aProbs, $aProb); push(@cProbs, $cProb); push(@gProbs, $gProb); push(@tProbs, $tProb); } else { # finished with this motif -- print it out and set up for next print "$name\t$columnCount"; printProbs(\@aProbs); printProbs(\@cProbs); printProbs(\@gProbs); printProbs(\@tProbs); print "\n"; undef $name; undef $columnCount; $cols = 0; @aProbs = (); @cProbs = (); @gProbs = (); @tProbs = (); } } } 'EOF' convertMotifs.pl gr_paper_extended.meme > transRegCodeMotif.tab # wc -l transRegCodeMotifs.tab # 132 transRegCodeMotifs.tab hgLoadSqlTab hg19 transRegCodeMotifs_factorBook ~/kent/src/hg/lib/transRegCodeMotif.sql \ transRegCodeMotif.tab # mySQL error 1062: Duplicate entry 'v-JUN' for key 1 # There are both v-JUN and v-Jun entries # For now, prune out v-Jun (and inquire at zlab) amd reload # Again, motif name consistency sed -e 's/V_NF1_Q6/NFIC/' -e 's/V_TR4_03/NR4A1/' transRegCodeMotif.fixed.tab > pwms.tab hgLoadSqlTab hg19 factorbookMotifPwm ~/kent/src/hg/lib/transRegCodeMotif.sql pwms.tab ########## # Create alias table to map factor names at UCSC (HGNC, in ENCODE controlled vocab), to Factorbook # names (not always HGNC), so linkouts work. For use with trackDb url setting. 
cat > geneAlias.sql << 'EOF' CREATE TABLE factorbookGeneAlias ( name VARCHAR(255) NOT NULL, value VARCHAR(255) NOT NULL, PRIMARY KEY (name) ); cat > testLinks.csh << 'EOF' foreach f (`awk '{print $1}' allMotifTargets.load.tab`) wget -O /dev/null -q http://www.factorbook.org/mediawiki/index.php/$f set ret = $status echo "$f : $ret" end 'EOF' csh testLinks.csh > testLinks.out # results in 'noPage.txt' and 'misnamedPage.txt' # various tweaking to generate factorbookGeneAlias.tab (sorry, log is lost, but should be easy to recreate) hgsql hg19 < geneAlias.sql hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookGeneAlias.tab" INTO TABLE factorbookGeneAlias' #1/31/14 - patch factorbookGeneAlias update factorbookGeneAlias set value='ERalpha a' where name='ESR1'; #3/13/14 - remove genes not in TF clusters (bad quality dataset) delete from factorbookGeneAlias where name='NR4A1' delete from factorbookGeneAlias where name='SREBP2' ########## # Create table to map factor names to their canonical motifs (12/4/13 Kate) # Many twists and turns here -- the goal is to assure factor names match UCSC TF CV names, and # that motif names match BED of motif positions. Also, that only factors having DNA-binding domains # that will bind directly are listed (not those with 'tethered-only' binding mode). # Approach was to use Table S1 + communications from Zlab for newer factors, removing those # listed in Table S3 as tethered only. Zlabbers have offered to provide us with a full # file for reloading this table. 
# Get targets for each dataset csh listMotifs.csh > targets.txt cp targets.txt targets.txt.fixed # Remove low quality datasets (see details in AwgTfbsUniform) # These were left out of TFBS Clusters V3 #wgEncodeSydhTfbsGm12878Spt20StdAlnRep0 #wgEncodeSydhTfbsGm12878Gcn5StdAlnRep0 #wgEncodeHaibTfbsPfsk1Pol24h8V0416101AlnRep0 #wgEncodeHaibTfbsA549Sp1V0422111Etoh02AlnRep0 #wgEncodeSydhTfbsHepg2CebpzIggrabAlnRep0 #wgEncodeSydhTfbsHelas3Gcn5StdAlnRep0 #wgEncodeSydhTfbsGm12878Irf3IggmusAlnRep0 #wgEncodeSydhTfbsK562Xrcc4StdAlnRep0 #wgEncodeSydhTfbsHepg2Srebp2PravastStdAlnRep0 #wgEncodeSydhTfbsHepg2Srebp1PravastStdAlnRep0 #wgEncodeSydhTfbsHepg2Pol2PravastStdAlnRep0 # 78 left unmatched # List motifs in datasets csh listMotifs2.csh > motifs2.txt cp motifs2.txt motifs2.txt.fixed # remove bad datasets above # merge files paste targets.txt.fixed motifs2.txt.fixed > target_motifs.txt # remove bad Uchicago datasets. All but: #wgEncodeUchicagoTfbsK562Ehdac8ControlAlnRep0 #wgEncodeUchicagoTfbsK562EjundControlAlnRep0 #wgEncodeUchicagoTfbsK562EjunbControlAlnRep0 #wgEncodeUchicagoTfbsK562Egata2ControlAlnRep0 #wgEncodeUchicagoTfbsK562EfosControlAlnRep0 # inventory file awk -F'\t' 'OFS="\t" {print $1, $2, $4}' < target_motifs.txt > target_motifs_out.txt # Create map table cat > motifTarget.sql << 'EOF' CREATE TABLE motifTarget ( target VARCHAR(255) NOT NULL, motif VARCHAR(255) NOT NULL, PRIMARY KEY (target) ); 'EOF' hgsql hg19 < factorbookMotifTarget.sql # Get mappings from GR paper (Wang 2012) wget http://genome.cshlp.org/content/suppl/2012/08/22/22.9.1798.DC1/TableS1.xls # extract tab-sep file: to FactorbookTableS1.txt # columns needed for map table: 2: HGNC ID, 4: canonical motif # NOTE: a few have two canonical( ; sep) -- keep it simple for now, just use first # NOTE: some have no canonical -- col 4 is empty tail -n +2 FactorbookTableS1.txt | \ awk -F'\t''{print $2, $4}' | sort | uniq | sed 's/;.*//' > pubMotifs.txt # Need same for newer data # Merge and load in table #awk '{print 
$1, "\t", $2}' pubMotifs.txt newMotifs.guess.txt | sort > allMotifs.tab #hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookTargetToMotif.tab" INTO TABLE factorbookTargetToMotif' # Replace with mappings from Bong-Hyun, sent in 12/19 email cat << 'EOF' | sed 's/ */ /' | grep -v '^#' > newMotifs.txt #The TFs not mentioned in my list do not have specific binding motifs, e.g. histone acetylases or other non-specific DNA binding proteins. #Common.Name canonical.motif ATF1 CREB ATF2 CREB BACH1 NFE2 CREB1 CREB ELK1 ELF1 FOXM1 FOXA FOXP2 FOXA MAZ UAK42 NFIC NFIC NR4A1 NR4A1 RUNX3 RUNX1 SP4 SP1 SREBF2 SREBF1 STAT5A STAT1 TCF3 TCF3 TEAD4 TEAD4 ZKSCAN1 UAK54 ZNF217 UAK59 'EOF' # NOTE: NR4A1 and SREBF2(SREBP2) not in Anshul's uniform peaks (failed qual filter I think) # Drop these from target list sort pubMotifs.txt newMotifs.txt | sed 's/ /\t/' > allMotifTargets.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTarget' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargets.tab" INTO TABLE factorbookMotifTarget' # Check for identifier mismatches # 1. Clusters table with Motif targets table hgsql hg19 -e "select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name" > clusterTargets.txt hgsql hg19 -e "select distinct(target) from factorbookMotifTarget order by target" > motifTargets.txt diff clusterTargets.txt motifTargets.txt > targets.diff # These are targets in Factorbook, not in UCSC targets list grep '>' targets.diff > target > NFKB1 # -> RELA in UCSC CV (target gene for this antibody - NFKB p65) > NR4A1 # not in UCSC clusters track (low qual data ? 
It's Chicago) > POLR3A # -> RPC155 in UCSC CV (old-school, not current HGNC) > SREBF1 # -> SREBP1 in UCSC CV (old-school, not current HGNC) > SREBF2 # -> SREBP2 in UCSC CV (old-school, not current HGNC) # Update targets table to use UCSC CV targets sed -e 's/^NFKB1/RELA/' -e 's/^POLR3A/RPC155/' -e 's/^SREBF1/SREBP1/' -e 's/^SREBF2/SREBP2/' \ allMotifTargets.tab > allMotifTargets.load.tab hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargets.load.tab" INTO TABLE factorbookMotifTarget' # TODO: remove targets having no canonical motif (25 of total 137) # Also need to rename two new canonicals found that have Jaspar naming convention: #NFIC NFIC (V_NF1_Q6) #NR4A1 NR4A1 (V_TR4_03) # Also, spot-checking new canonical list above, I noted that ATF2 motif file lacked CREB -- inquiring with BongHyun as to why. #wgEncodeHaibTfbsGm12878Atf2sc81188V0422111AlnRep0.tfbs.bed : AP1 UAK16 v-JUN #wgEncodeHaibTfbsH1hescAtf2sc81188V0422111AlnRep0.tfbs.bed : v-JUN #wgEncodeHaibTfbsH1hescAtf3V0416102AlnRep0.tfbs.bed : CREB UAK18 USF ZNF143-ext #wgEncodeHaibTfbsHepg2Atf3V0416101AlnRep0.tfbs.bed : CREB-ext UAK18 USF #wgEncodeHaibTfbsK562Atf3V0416101AlnRep0.tfbs.bed : AP1 UAK17 #wgEncodeSydhTfbsK562Atf106325StdAlnRep0.tfbs.bed : NFY SP1 ZNF143-ext v-JUN # NOTE: some factors have two canonical motifs listed (; separated in Table S1) # For now, using just the first, but looking at data, both seem important. # Need code changes to pick these up. $ grep ';' *S1.txt | awk '{print $2, $4}' | sort | uniq ATF3 CREB;CREB-ext CTCF CTCF;CTCF-ext CTCFL CTCF;CTCF-ext GATA1 GATA1;GATA1-ext GATA2 GATA1;GATA1-ext JUN AP-1;v-JUN JUNB AP-1;v-JUN JUND AP-1;v-JUN RAD21 CTCF;CTCF-ext SMC3 CTCF;CTCF-ext THAP1 UA4;UA5 ZNF143 ZNF143;ZNF143-ext ZNF274 UA10;UA11 # Check for identifier mismatches # 1. 
Clusters table with Motif targets table hgsql hg19 -e "select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name" > clusterTargets.txt hgsql hg19 -e "select distinct(target) from factorbookMotifTarget order by target" > motifTargets.txt diff clusterTargets.txt motifTargets.txt > targets.diff # These are targets in Factorbook, not in UCSC targets list grep '>' targets.diff > target > NFKB1 # -> RELA in UCSC CV (target gene for this antibody - NFKB p65) > NR4A1 # not in UCSC clusters track (low qual data ? It's Chicago) > POLR3A # -> RPC155 in UCSC CV (old-school, not current HGNC) > SREBF1 # -> SREBP1 in UCSC CV (old-school, not current HGNC) > SREBF2 # -> SREBP2 in UCSC CV (old-school, not current HGNC) # Update targets table to use UCSC CV targets grep '<' targets.diff > ucsc.diff # Notes here about targets at UCSC not in target table. FYI. # Check motif name consistency across tables awk '{print $4}' motifs.load.bed | sort | uniq > bedMotifs.txt awk '{print $2}' allMotifTargets.load.tab | sort | uniq > targetMotifs.txt awk '{print $1}' pwms.tab | sort | uniq > pwmMotifs.txt diff pwmMotifs.txt targetMotifs.txt | grep '>' > > AP-1 # AP1 > AP-2 # AP2 > NF-Y # NFY, NFY-UA2 ? > PU.1 # PU1 > bHLH # BHLHE40 # Check pwms vs. 
bed files diff pwmMotifs.txt bedMotifs.txt 67c67 < UA12 # no UA12 in bed file --- > UA14 # no UA14 in pwms 71d70 < UA5 77a77,79 > UAK18 > UAK19 > UAK20 130a133 > v-Jun # should be v-JUN for consistency # Update BED table to correct v-Jun cp motifs.load.bed motifs.fix.bed sed 's/v-Jun/v-JUN/' motifs.fix.bed >motifs.load.bed hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ factorbookMotifPos motifs.load.bed # Update targets table to match bed & pwm tables cp allMotifTargets.load.tab allMotifTargets.fix.tab sed -e 's/AP-1$/AP1/' -e 's/AP-2$/AP2/' -e 's/NF-Y/NFY/' -e 's/PU.1/PU1/' -e 's/bHLH/BHLHE40/' \ allMotifTargets.fix.tab > allMotifTargets.load.tab hgsql hg19 -e 'DELETE FROM factorbookMotifTarget; LOAD DATA LOCAL INFILE "allMotifTargets.load.tab" INTO TABLE factorbookMotifTarget' #Add secondary canonicals: #CREB-ext, CTCF-ext, GATA1-ext, v-JUN (to AP-1), UA5 (to UA4), ZNF143-ext, UA11 (to UA10) sed \ -e 's/CREB$/CREB,CREB-ext/g' \ -e 's/CTCF$/CTCF,CTCF-ext/g' \ -e 's/GATA1$/GATA1,GATA1-ext/g' \ -e 's/ZNF143$/ZNF143,ZNF143-ext/g' \ -e 's/UA4$/UA4,UA5/g' \ -e 's/UA10$/UA10,UA11/g' \ -e 's/AP-1$/AP-1,v-JUN/g' \ allMotifTargets.load.tab > allMotifTargetsV2.load.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTargetV2' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargetsV2.load.tab" INTO TABLE factorbookMotifTargetV2' # Remove motifs for 'indirect' binding TFs (those showing only 'tethered' mode in Table S3) awk 'FS="\t" {printf("%s\t%s\t%s\n", $2, $3, $12)}' FactorbookTableS3.txt | sort | uniq > indirects.tsv # Tethered only are: BATF AP-1 BCL11A UA9 BCL3 NFKB1 ETS1 ETS1 FOS AP-1 GATA2 GATA1;GATA1-ext GATA3 GATA3 IRF1 PRDM1 MXI1 MAX NR3C1 NR3C1 PAX5 PAX5 POLR2A TBP PPARGC1A ESRRA RAD21 CTCF;CTCF-ext RFX5 RFX5 SMC3 CTCF;CTCF-ext SREBF1 SREBF1 SRF SRF STAT3 STAT1 TAF1 TBP # Hand edit motifs for factors above out of file (allMotifTargetsV2.load.tab -> V3) hgsql hg19 -e 'select
distinct(name) from wgEncodeRegTfbsClusteredV3 order by name' > allFactors.txt hgsql hg19 -e 'select distinct(target) from factorbookMotifTarget order by target' > allMotifFactors.txt diff allFactors.txt allMotifFactors.txt | grep '<' | sed -e 's/< //' > newNoMotifFactors.txt wc -l newNoMotifFactors.txt 26 newNoMotifFactors.txt # NOTE: NR4A1 and SREBF2(SREBP2) not in Anshul's uniform peaks (failed qual filter I think) # Drop these from allMotifTargetsV4.load.tab sed -e 's/$/\t/' -e '/NR4A1/d' -e '/SREBF2/d' newNoMotifFactors.txt > foo mv foo newNoMotifFactors.txt cat allMotifTargetsV3.load.tab newNoMotifFactors.txt | sort > allMotifTargetsV4.load.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTargetV4' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargetsV4.load.tab" INTO TABLE factorbookMotifTargetV4' # Generate comma-sep list of factors for trackDb filter, 'factorFilterList.txt' # Get citations for track description References section getTrackReferences 22955990 23203885 22955619 > refs.txt # Requested corrections target table using new gene vs. canonical motif table from Jiali at Zlab (2014-03-06 kate) # They returned files with new motif calls & annotations. Perhaps for another version of the track! wget http://zlab.umassmed.edu/~zhuangj/Direct_binding_motifs.tar.gz wget http://zlab.umassmed.edu/~kimb/motif_centric_encode_tfbs.sorted.bed.bz2 bunzip2 !$ # In email, the following 4 TFs were specifically mentioned as direct binders # missing canonical motif: # NFIC->NFIC (already in table) # NR4A1->NR4A1 (not in our TF clusters, rejected due to low quality, so not relevant to table) # and these two, which will be added to the table!
# NR3C1->NR3C1 # FOS->AP1 sed -e 's/NR3C1\t$/NR3C1\tNR3C1/' -e 's/FOS\t$/FOS\tAP1/' allMotifTargetsV4.load.tab > factorbookMotifCanonical.load.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifCanonical' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookMotifCanonical.load.tab" INTO TABLE factorbookMotifCanonical'