d87e5f63bc000d560c21bca638b6d5f3dee0173d kate Wed Apr 3 12:06:45 2019 -0700 Add scripts and make docs for ENCODE 3 TF binding site tracks. refs #21139 diff --git src/hg/makeDb/doc/encodeRegHg19.txt src/hg/makeDb/doc/encodeRegHg19.txt index 1196a19..d1a59a2 100644 --- src/hg/makeDb/doc/encodeRegHg19.txt +++ src/hg/makeDb/doc/encodeRegHg19.txt @@ -1,1753 +1,1754 @@ ####################################################################### # ENCODE Regulation track (In progress March 2010, very rough instructions # as am still defining how to build track...). JK # make root dir and link in our bad.bed file for filtering out # blacklisted and multiple mapping regions. mkdir -p /cluster/data/hg19/bed/wgEncodeReg cd /cluster/data/hg19/bed/wgEncodeReg ln -s /hive/users/kent/regulate/companion/mapping/bad.bed badRegions.bed # Create the DNAse peak clusters subtrack. # Get all of the narrowPeak format files for the wgEncodeUwDnaseSeq # linked into directory /hive/users/kent/regulate/dnase/peaks mkdir dnase cd dnase mkdir peaks ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwDnase/*.narrowPeak.gz peaks # Process these into clusters in a bed file and load clusters into # table. /bin/ls -1 peaks/*.narrowPeak.gz > peak.lst uw02 peak.lst peak.table regCluster peak.table /dev/null peak.bed awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClustered.bed hgLoadBed hg19 wgEncodeRegDnaseClustered wgEncodeRegDnaseClustered.bed # Make wgEncodeRegDnaseClusteredInput table. Start with mdbQuery, and # then do some massaging since not completely in sync with file list. 
mdbQuery out=tab "select obj,cell,treatment,replicate,lab,dateUnrestricted from hg19 where obj like 'wgEncodeUwDnase%' and view='Peaks'" | sed 's/n\/a/None/' > inputMdb.tab cut -f 1 peak.table | sed 's/\.narrowPeak\.gz//' | sed 's/peaks\///' > inputs.lst weedLines inputs.lst inputMdb.tab wgEncodeRegDnaseClusteredInputs.tab -invert hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputs ~/kent/src/hg/lib/clusterInputDnase.sql \ wgEncodeRegDnaseClusteredInputs.tab # Make transcription factor binding site clusters. Start with Anshul's # nice uniform peak calls, downloaded from # ftp://ftp-private.ebi.ac.uk/byDataType/peaks/jan2011/spp/optimal # using the user name and password in the ENCODE wiki at: # http://genomewiki.ucsc.edu/EncodeDCC/index.php/Locations_of_ENCODE_Data # Put all of the ones that actually are based on two replicates into # /cluster/data/hg19/bed/wgEncodeReg/tfbs/peaks # Once that is done do a little work towards making the cluster input. cd /cluster/data/hg19/bed/wgEncodeReg/tfbs /bin/ls peaks/*.narrowPeak.gz > peak.lst regClusterMakeTableOfTables ans01 peak.lst partial.table # Unfortunately at this point only about 90% of the peak files actually # have metaDb data associated with them in metaDb, so run C program that # takes it from metaDb if possible, otherwise tries to get it from file # names. regClusterAttachMetadataToTableOfTables hg19 partial.table withDupes.table # One more complication - there are often several cases where there is # more than one data set on the same cell+treatment+antibody combination. # For good reasons hgBedToBedExps only wants one. # This next bit filters them out based on a picks file that was hand # created - picksFromDupes. This was based on the criteria that # the Broad CTCF were used, and that Snyd was used in preference # to OpenChromatin, which in turn was used in preference over Haib. 
# The rationale for this is the Broad CTCF set has been in use for a # while with no complaints, and when examining the Pol2 signals, which # were with CTCF the main component of the Dupes, the Haib were the # weakest and the Snyd the strongest. cp ~/src/hg/regulate/wgEncodeReg/picksFromDupes.tab . wgEncodeRegTfbsDedupeHelp withDupes.table picksFromDupes.tab peak.table # Make three column file: fileName, cell+treatment, antibody awk -F '\t' '{printf ( "%s\t%s\t%s\n", $1, ($10 == "None" ? $8 : $8 "+" $10), $9 ) ; }' peak.table > fileCellAb.tab # Create config file out for clustering job, using a table assigning # cell lines+treatments to single letter codes edited by hand. In # the future may be able to automate this by having fixed upper case codes for # tier1&2 cell lines, and then just use first letter lower case codes # for rest. That is the pattern cellLetter2.tab follows anyway. cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab . regClusterBedExpCfg -tabList fileCellAb.tab cluster.cfg -cellLetter=cellLetter2.tab # Do the actual clustering hgBedsToBedExps -dupeLetterOk cluster.cfg peak.bed peak.exps # Load database (reloaded 3/26/2012 to fix a relatively rare bug) hgLoadBed hg19 wgEncodeRegTfbsClustered peak.bed hgLoadSqlTab hg19 wgEncodeRegTfbsCells ~/kent/src/hg/lib/expRecord.sql peak.exps # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab> awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $7, ($10 == "None" ? $8 : $8 "+" $10), $9, $8, $10, $11);}' peak.table | sed 's/AlnRep/PkRep/'> wgEncodeRegTfbsClusteredInputs.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputs ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputs.tab # Handle the RNA seq, starting by pooling it and filtering out multiple # mapping and blacklisted regions. 
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn/pooled bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqGm12878R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqH1hescR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHct116R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHct116R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHsmmR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep2V2.bigWig stdout \ | 
bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHuvecR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqK562R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqK562R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqMcf7R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqMcf7R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqNhekR2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHepg2R2x75Il200SigPooled.bedGraph bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2.bigWig \ /gbdb/hg19/bbi/wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep2V2.bigWig stdout \ | bedWeedOverlapping ../../badRegions.bed stdin wgEncodeRegTxnCaltechRnaSeqHelas3R2x75Il200SigPooled.bedGraph # Now have to normalize. This takes two passes. The first just creates # a tab-separated file with normalization values. A batch file is made # by running awk on the tab file, and this batch file executes the # second half, transforming the bedGraphs to normalized values. The # normalization standardizes all files as if the total signal integrated to 10 # billion over the genome. 
rm -f ../norm.tab foreach i (*SigPooled.bedGraph) echo processing $i echo -n $i " " >> ../norm.tab awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab end awk '{printf("colTransform 4 %s 0 %g ../normalized/%s\n", $1, $2, $1);}' ../norm.tab > ../norm.csh chmod a+x ../norm.csh mkdir ../normalized ../norm.csh # Make bigWigs cd ../normalized foreach i (*.bedGraph) echo processing $i bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw end # Link into /gbdb ln -s `pwd`/*.bw /gbdb/hg19/bbi # Tell database about it foreach i (*.bw) hgBbiDbLink hg19 $i:r /gbdb/hg19/bbi/$i end # Clean up cd /hive/data/genomes/hg19/bed/wgEncodeReg/txn rm pooled/*.bedGraph normalized/*.bedGraph # Make metadata: (forgive the long line) # JK TODO - add Hepg2 and Helas3 from here on down mdbQuery out=ra "select * from hg19 where \ obj='wgEncodeCaltechRnaSeqNhlfR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqNhekR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqH1hescR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHelas3R2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHepg2R2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHuvecR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqHsmmR2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqK562R2x75Il200SigRep1V2' or \ obj='wgEncodeCaltechRnaSeqGm12878R2x75Il200SigRep1V2'" \ | sed 's/SigRep1V2/SigPooled/g' | sed 's/wgEncodeCal/wgEncodeRegTxnCal/' \ | grep -v ^replicate > mdb.ra # Now patch mdb.ra into ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha/wgEncodeReg.ra # At this point actually are done with this track, except for # all the trackDb work on the overlayed histone mod tracks. # The rest is for some coloring stuff that is not yet actually used. # Do work to figure out how to color cells. This is based on # hierarchical clustering of wgEncodeUwAffyAllExon. 
mkdir -p /hive/data/genomes/hg19/bed/wgEncodeReg/cellColors cd /hive/data/genomes/hg19/bed/wgEncodeReg # Start with some alas time consuming data cleanup to get rid of # overlaps - losing about 10% of the data in the process which is # totally ok. mkdir allExon cd allExon mkdir inLinks noOverlap onlyCommon ln -s /hive/groups/encode/dcc/analysis/ftp/pipeline/hg19/wgEncodeUwAffyExonArray/*.gz inLinks cd inLinks foreach i (*.gz) bedRemoveOverlap $i ../noOverlap/$i:r end cd .. # Now make up a file of regions present in all inputs for further data # cleanup bedCommonRegions noOverlap/*.* > commonExons.bed cd noOverlap foreach i (*.broadPeak) echo processing $i bedRestrictToPositions $i ../commonExons.bed ../onlyCommon/$i end rm -r noOverlap # Now pool replicates where possible mdbQuery "select fileName,cell,replicate from hg19 where composite='wgEncodeUwAffyExonArray' and view='SimpleSignal'" -out=ra \ | sed 's/\.gz//g' > replicate.ra encodeMergeReplicatesBatch replicate.ra onlyCommon merge.sh merged.ra merged mkdir merged chmod a+x merge.sh merge.sh # Resort merged output - somehow gets slightly messed by # encodeMergeReplicates. cd merged foreach i (*.broadPeak) echo sorting $i sort -k 1,1 -k 2,2n $i > tmp mv tmp $i end # Make file that converts fileName to cell raToTab merged.ra -cols=fileName,cell stdout \ | sed 's/wgEncode/merged\/wgEncode/' > fileToCell.tab # Now build the tree. Took 2.5 minutes ls -1 merged/*.broadPeak > clusterInput.lst regClusterTreeCells rename=fileToCell.tab clusterInput.lst cluster.tree cluster.distances ######################################################################### # 2012 ENCODE Regulation supertrack updates: #7526 (kate) ######################################################################### # DNase Clusters: #9954 (2013-01-11) # # Create the DNAse peak clusters track using AWG uniformly processed peaks (Jan 2011 freeze, Sep 2012 analysis pubs). 
# See doc/encodeAwgHg19.txt # There are 125 cell type+treatments represented -- UW, Duke, and Uw+Duke for 14 cell types # Previous track had 74 # Redoing filtering (2014-06-24) V3 version, to include single-cell clusters cd /hive/data/genomes/hg19/bed/wgEncodeReg mkdir -p dnaseAwg cd dnaseAwg set dir = `pwd` cd /hive/data/genomes/hg19/bed/wgEncodeAwgDnaseUniform/narrowPeak ls *.narrowPeak > $dir/peak.lst # generate a normalization factor # output format: # <filename> 0 1 2 <scoreIx> <scoreFactor> # wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 . regClusterMakeTableOfTables awgDnase01 $dir/peak.lst $dir/peak.table -verbose=3 >&! $dir/regTable.out & regClusterMakeTableOfTables awgDnase01Hg19 $dir/peak.lst $dir/peak.meta.table -verbose=3 >&! $dir/regTable.meta.out & # Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster) regCluster $dir/peak.table /dev/null $dir/peak.bed # Read 125 sources from peak.table # 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes regCluster -bedSources $dir/peak.meta.table /dev/null $dir/peak.meta.bed # filter out low scoring, single peaks in cluster and load in database cd $dir awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed wc -l wgEncodeRegDnaseClusteredV2.bed # 1281988 wgEncodeRegDnaseClusteredV2.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed # V3 of track -- don't filter out single celltype peaks! 
# August 2014, KRR # filter out low scoring, but retain single cell type clusters awk '$5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV3a.bed # 1867665 wgEncodeRegDnaseClusteredV3a.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV3a wgEncodeRegDnaseClusteredV3a.bed # do same with metadata version, as we will actually load track with # this additional info, for VAI integration awk '$5 >= 100' peak.meta.bed > peak.meta.filtered.bed wc -l peak.meta.filtered.bed 1867665 peak.meta.filtered.bed # format to BED5+floatscore+sources for hgBedSources # which will extract, uniquify, and assign ID's to sources awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, 0, $7;}' peak.meta.filtered.bed > peak.meta.bed6 hgBedSources peak.meta.bed6 regDnase mv regDnaseSources.tab wgEncodeRegDnaseClusteredSources.tab # Load sources table autoSql $HOME/kent/src/hg/lib/idName.as idName hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredSources idName.sql wgEncodeRegDnaseClusteredSources.tab # merge files and format to BED5+sourceCount+sourceIds+sourceVals awk '{print $8}' peak.meta.filtered.bed > peak.vals awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, $7, $8;}' regDnase.bed | paste - peak.vals > wgEncodeRegDnaseClusteredV3.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV3 -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/bed5SourceVals.as wgEncodeRegDnaseClusteredV3.bed # NOTE: V1 had ~1M clusters, V2 has 1.2M, V3 has 1.9M # It would be nice if hgc sorted datasets by signal or cellType # Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting) # was: objName, cellType, treatment, replicate, lab, dateUnrestricted # For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also, # this data has all passed timestamp restriction, so don't need to show # objName, cellType, treatment, lab(s) # DISPLAY: Consider item name change to % assays represented instead of count. # Also, could color item name label to highlight ubiquitous vs. 
cell-type-specific # (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?) # Make wgEncodeRegDnaseClusteredInputV2 table. mdbQuery out=tab "select obj,cell,treatment,lab,dccAccession from hg19 where obj like 'wgEncodeAwgDnase%UniPk' and view='Peaks'" > wgEncodeRegDnaseClusteredInputsV2.tab hgLoadSqlTab hg19 wgEncodeRegDnaseClusteredInputsV2 \ ~/kent/src/hg/lib/clusterInputAwgDnase.sql wgEncodeRegDnaseClusteredInputsV2.tab # Set up downloads cd $dir mkdir downloads gzip -c wgEncodeRegDnaseClusteredV2.bed > downloads/wgEncodeRegDnaseClusteredV2.bed.gz gzip -c wgEncodeRegDnaseClusteredInputsV2.tab > downloads/wgEncodeRegDnaseClusteredInputsV2.tab.gz gzip -c wgEncodeRegDnaseClusteredV3.bed > downloads/wgEncodeRegDnaseClusteredV3.bed.gz cp wgEncodeRegDnaseClusteredSources.tab downloads # use (existing) ENCODE DCC downloads dir set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegDnaseClustered cd $dload # call existing stuff release1 (use ENCODE-style downloads scheme) mkdir release1 ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz release1 cp README.txt md5sum.txt files.txt release1 # new release dir mkdir release2 ln -s release2 beta ln -s release2 releaseLatest cd $dir/downloads ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz $dload cd $dload ln wgEncodeRegDnaseClusteredV2.bed.gz wgEncodeRegDnaseClusteredInputsV2.tab.gz releaseLatest # include older files as well ln wgEncodeRegDnaseClustered.bed.gz wgEncodeRegDnaseClusteredInputs.tab.gz releaseLatest cp README.txt releaseLatest cd releaseLatest md5sum *.gz > md5sum.txt # vim README.txt cp README.txt md5sum.txt .. 
mkdir release3 rm beta releaseLatest ln -s release3 beta ln -s release3 releaseLatest cd $dir/downloads ln wgEncodeRegDnaseClusteredV3.bed.gz wgEncodeRegDnaseClusteredSources.tab $dload cd $dload ln wgEncodeRegDnaseClusteredV3.bed.gz wgEncodeRegDnaseClusteredSources.tab releaseLatest cp README.txt releaseLatest cd releaseLatest md5sum *.gz > md5sum.txt # vim README.txt cp README.txt md5sum.txt .. # add minimal metaObject to metaDb to associate new table with (pseudo-)composite # wgEncodeRegDnaseClustered -- this allows hgTrackUi to locate downloads cd ~/kent/src/hg/makeDb/trackDb/human/hg19/metaDb/alpha cat > wgEncodeRegDnaseClustered.ra << 'EOF' metaObject wgEncodeRegDnaseClusteredV2 objType table composite wgEncodeRegDnaseClustered fileName wgEncodeRegDnaseClusteredV2.bed.gz tableName wgEncodeRegDnaseClusteredV2 project wgEncode 'EOF' cat > wgEncodeRegDnaseClustered.ra << 'EOF' metaObject wgEncodeRegDnaseClusteredV3 objType table composite wgEncodeRegDnaseClustered fileName wgEncodeRegDnaseClusteredV3.bed.gz tableName wgEncodeRegDnaseClusteredV3 project wgEncode 'EOF' ######################################################################### # Preliminary/Experimental/Older work below ######################################################################### # ENCODE Regulation track V2: Track #7526 (In progress 2012-06-20 kate) # This version will have: # * Replace Tx with CSHL stranded data, pooled (JK advising, also AWG use) # http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformRNASignal.txt # These are from DCC wgEncodeCshlLongRnaSeq downloads (Long polyA+, whole cell), # AWG did not pool reps or plus/minus for their hub (but we will) # AWG has filtered for bad regions so no longer necessary # * Possibly a 10th cell type (HMEC) for Tx and Histone # Note: single replicate for HMEC Tx (others have 2) # * Two additional cell lines for Histones (Hela, HepG2) # - Note Hela & HepG2 already added to Tx track (July2012 release) # * Histones from uniform processing: 
H3K4me1, H3K4me3, H3K27ac # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/) # wgEncodeBroadHistone{cell}{histone}StdAln_?Reps.norm5.rawsignal.bw # * Add latest TF's, and use AWG uniform peak calls # * Add latest DNase calls (possibly use AWG uniform) # (http://www.uwencode.org/public/rthurman/encode_integrated_open_chrom/data/dataHub) # AWG trackDb: http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt # - can use subGroups line to determine cell type, but need to take care # with treatments, which are in filename but not subGroups # - NOTE: AWG includes Duke data processed like UW. Where both # labs had data, they are merged. View is .01FDR peaks (narrow, bigbed 6+) # Previous track: 74 cell types (2 reps), all UW. This version: 125 cell types, pooled replicates, merged UW+Duke # Also considering adding new data types from AWG hub: # - Short RNA's # - AWG uniform RNA contigs # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/rna_contigs/) # - Segmentations (chromHMM, segWay, or combined) # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/segmentations) # - FAIRE (cell-specific, union over 25 cell types) # (http://www.ebi.ac.uk/~anshul/public/encodeRawData/dcc/hub/awgHub/faire/bigBed/) # NOTE: following Jim's methods from initial version # make root dir and link in our bad.bed file for filtering out # blacklisted and multiple mapping regions. cd /hive/data/genomes/hg19/bed mkdir wgEncodeRegV2 cd wgEncodeRegV2 ln -s /hive/data/genomes/hg19/bed/wgEncodeReg/badRegions.bed . 
#(was /hive/users/kent/regulate/companion/mapping/bad.bed) ###### # Small RNA's # For signal, pool strands and replicates ###### cd /hive/data/genomes/hg19/bed/ mkdir -p wgEncodeRegV2/shortRna/pooled cd wgEncodeRegV2/shortRna/pooled cat > list.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hmec Hsmm Huvec K562 Nhek Nhlf) echo "processing ${cell}" ls /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotal*.bigWig 'EOF' csh list.csh >&! list.out & # There are 6 overlapping with histones and long RNAs. 4 other tier2's. cat > pool.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 K562 Nhek A549 Cd20 Imr90 Mcf7) echo "processing ${cell}" bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapMinusRawRep2.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlShortRnaSeq${cell}CellShorttotalTapPlusRawRep2.bigWig \ wgEncodeRegTxnCshlShortRnaSeq${cell}CellTotalRawSigPooled.bedGraph end 'EOF' csh pool.csh >&! pool.out & # got 8 cells, two failed -- CD20 and IMR90, may be rescuable, but continuing # for now with others to see if it's worth it. cat > big.csh << 'EOF' foreach i (*.bedGraph) echo processing $i bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw end 'EOF' csh big.csh >&! big.out & # add to database set dir = /gbdb/hg19/bbi/encodeRegV2 mkdir $dir ln -s `pwd`/*.bw $dir foreach i (*.bw) hgBbiDbLink hg19 $i:r $dir/$i end ###### # Handle the RNA seq, starting by pooling it and filtering out multiple # mapping and blacklisted regions. 
###### cd /hive/data/genomes/hg19/bed/ mkdir -p wgEncodeRegV2/txn/pooled cd wgEncodeRegV2/txn/pooled cat > poolTx.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf) echo "processing ${cell}" bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep2.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep2.bigWig \ wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph end 'EOF' csh poolTx.csh >&! poolTx.out & # ~8 minutes per cell type # NOTE: 25 chroms in Hepg2, H1hesc, Huvec, Nhlf (male or unknown sex) # Add in HMEC, which has single replicates cat > poolHmec.csh << 'EOF' foreach cell (Hmec) echo "processing ${cell}" bigWigMerge -adjust=-1 \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapMinusRawSigRep1.bigWig \ /gbdb/hg19/bbi/wgEncodeCshlLongRnaSeq${cell}CellPapPlusRawSigRep1.bigWig \ wgEncodeRegTxnCshlLongRnaSeq${cell}CellPapRawSigPooled.bedGraph end 'EOF' csh poolHmec.csh >&! poolHmec.out & # Normalize. This takes two passes. The first creates # a tab-separated file with normalization values. A batch file is made # by running awk on the tab file, and this batch file executes the # second half, transforming the bedGraphs to normalized values. The # normalization standardizes all files as if the total signal integrated to 10 # billion over the genome. rm -f ../norm.tab cat > makeNorm.csh << 'EOF' foreach i (*SigPooled.bedGraph) echo processing $i echo -n $i " " >> ../norm.tab awk '{sum += $4*($3-$2);} END {print 10000000000/sum}' $i >> ../norm.tab end awk '{printf("echo %s; colTransform 4 %s 0 %g ../normalized/%s\n", $1, $1, $2, $1);}' ../norm.tab > ../norm.csh 'EOF' csh makeNorm.csh >&! makeNorm.out & # ~2 mins per cell type mkdir ../normalized csh ../norm.csh >&! 
../norm.out & #### STOPPED HERE cd ../normalized # make bigWigs cat > big.csh << 'EOF' foreach i (*.bedGraph) echo processing $i bedGraphToBigWig $i /cluster/data/hg19/chrom.sizes $i:r.bw end 'EOF' csh big.csh >&! big.out & # add to database set dir = /gbdb/hg19/bbi/encodeRegV2 mkdir $dir ln -s `pwd`/*.bw $dir foreach i (*.bw) hgBbiDbLink hg19 $i:r $dir/$i end ###### # Possible new track: FAIRE Peaks Union across 25 cells, from AWG ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p faire/peaks cd faire/peaks wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformFaire.txt sed -n 's/bigDataUrl /wget /p' uniformFaire.txt | grep Union > wget.csh csh wget.csh >&! wget.out & mv *Union*.bb encodeAwgFairePeaks25CellUnion.bb bigBedToBed encodeAwgFairePeaks25CellUnion.bb encodeAwgFairePeaks25CellUnion.bed # head -1 *.bed chr1 10033 10376 # bed 3 hgLoadBed hg19 encodeAwgFairePeaks25CellUnion encodeAwgFairePeaks25CellUnion.bed # Read 2369615 elements of size 3 from encodeAwgFairePeaks25CellUnion.bed ###### # Create the DNAse peak clusters subtrack. ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p dnase/peaks cd dnase wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.html wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformDnase.txt sed -n 's/bigDataUrl /wget /p' uniformDnase.txt > wget.csh csh wget.csh >&! wget.out & # 125 files (.bb), dated Nov 22, 2011 # Sample case had ~150K peaks (by bigBedInfo) # These are narrowPeak (by bigBedSummary) w/ position & signalVal, no name, score or stats #$ bigBedToBed wgEncodeUwDnaseSKNMC.fdr01peaks.hg19.bb stdout | head chr1 10180 10330 . 0 . 
26.000000 -1 -1 -1 # NOTE: Steve Wilder is planning to add scores and make formats of these files more uniform # rename to UCSC style and convert to bed: wgEncodeAwgDnase<lab><cell>Peak ls *.fdr01peaks.hg19.bb | \ perl -wpe 's/(wgEncode)(\S+)(Dnase)(\S+)(.fdr01peaks.hg19.bb)/bigBedToBed $1$2$3$4$5 $1Awg$3\u\L$2\E\u\L$4\EPeak.bed/' > unbig.csh # convert to bed and rename csh unbig.csh >&! unbig.out # udc couldn't read 4 bytes from wgEncodeUWDukeDnaseHepG2.fdr01peaks.hg19.bb, did read 0 # This dataset was truncated # Repeated wget and bigBedToBed for HepG2 (8/23) ls -1 *AwgDnase*.bed > peak.lst # Datasets with treated cells (hints from long label) grep '(' uniformDnase.txt | grep longLabel > treated.txt wc -l treated.txt # 7 datasets # Tamoxifen -> Tam10030 (Duke) or 40HAM20nM72hr (UW), Estradiol -> Est10nm30m, Ifna4h, Andro, Hypox # TODO: Edit to rename # generate a normalization factor to # convert signalValue to score (0-1000). Also note score field; narrowPeak is 7 (default) # output format: # <filename> 0 1 2 <scoreIx> <scoreFactor> # wgEncodeAwgDnaseDuke8988tPeak.bed 0 1 2 6 10.7234 . regClusterMakeTableOfTables awgDnase01 peak.lst peak.table -verbose=3 >&! 
regTable.out & # Make cluster table: wgEncodeRegDnaseClusteredV2 (bed5, with name=#datasets represented in cluster) regCluster peak.table /dev/null peak.bed #Read 125 sources from peak.table # 2233542 singly-linked clusters, 2343745 clusters in 24 chromosomes # filter out low scoring, single peaks in cluster and load in database awk '$4 > 1 && $5 >= 100' peak.bed > wgEncodeRegDnaseClusteredV2.bed wc -l wgEncodeRegDnaseClusteredV2.bed hgLoadBed hg19 wgEncodeRegDnaseClusteredV2 wgEncodeRegDnaseClusteredV2.bed # Read 1281988 elements of size 5 from wgEncodeRegDnaseClusteredV2.bed # load individual peak files as a track (needed by hgc) # TODO: generate labels out of metadata cat > loadDnase.csh << 'EOF' set b = wgEncodeAwgDnase set t = ${b}Peak foreach f (${b}*) set table = $f:r hgLoadBed -noNameIx -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg19 $table $f set cell = `echo $f | perl -wpe 's/(wgEncodeAwgDnase[A-Z][a-z]+)(.*)Peak.bed/$2/'` echo " track $table" >> $t.ra echo " type narrowPeak" >> $t.ra echo " parent $t" >> $t.ra echo " shortLabel $cell DNase" >> $t.ra echo " longLabel $cell DNaseI HS Uniform Peaks from ENCODE/Analysis" >> $t.ra echo " subGroups cellType=$cell treatment=None" >> $t.ra echo "" >> $t.ra end 'EOF' # NOTE: V1 had ~1M clusters. # Other data tables: Need to load the peak files as they are used by hgc # These should probably also be subtracks. # It would be nice if hgc sorted datasets by signal or cellType # Create metadata table: wgEncodeRegV2DnaseClusteredInputs (inputTrackTable setting) # was: objName, cellType, treatment, replicate, lab, dateUnrestricted # For V2, AWG is providing pooled replicates, and sometimes pooled labs. Also, # this data has all passed timestamp restriction, so don't need to show # objName, cellType, treatment, lab(s) # DISPLAY: Consider item name change to % assays represented instead of count. 
# Also, could color item name label to highlight ubiquitous vs. cell-type-specific # (top quartile -> color indicating high expression (red/orange) ?, bottom green/blue ?) ###### # Uniform histone signal for overlay wiggles # Note: there are UW signal files as well -- omitting these for consistency, # Check if merging would be a better option ###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p histone/signal cd histone/signal cat > getHistones.csh << 'EOF' foreach cell (Gm12878 H1hesc Helas3 Hepg2 Hsmm Huvec K562 Nhek Nhlf Hmec) foreach histone (H3k4me1 H3k4me3 H3k27ac) set var = "wgEncodeBroadHistone${cell}${histone}" echo $var curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/${var}StdAln_\[1-3\]Reps.norm5.rawsignal.bw end end 'EOF' csh getHistones.csh >&! getHistones.out & # save real files (>1k size) as curl creates info files when match used mkdir sav find . -name '*.bw' -a -size +1k -exec mv '{}' sav \; rm *.bw mv sav/* . rmdir sav # note: missing Hela and HepG2 H3K4me1 - these have different filename pattern (check with Anshul why) # use curl to retrieve by pattern match as we don't have read perms on directory. Then remove non-useful files curl -O http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHelas3H3k4me1Std_\[1-3]Reps.norm5.rawsignal.bw # good one is '*Std_1Rep', remove 2Rep and 3Rep rm *Hela*Std_[2-3]* wget http://www.ebi.ac.uk/~anshul/public/encodeRawData/rawsignal/jan2011/pooledReps/bigwig/wgEncodeBroadHistoneHepg2H3k4me1Std_1Reps.norm5.rawsignal.bw ###### # TFBS from Jan2011 freeze ###### # The TFBS track is a specialized type: factorSource. Requires 3 tables: # track table: wgEncodeRegTfbsClustered (bed15) # source table: wgEncodeRegTfbsCells # Maps cell type (including treatment) to single letter # Currently, there are fixed unique assignments for Tier1 and 2. # There are all upper case. 
# Tier 3 just use first char of cell name, lower-cased. Not unique at all! # Looks like only rows 2 and 3 of the table are actually used/informative # Ex. row 0 G GM12878 n/a n/a n/a 3 n/a,n/a/,GM12878, # inputTrack (metadata) table: wgEncodeRegTfbsClusteredInputs # DISPLAY: Better with target protein instead of antibody ? # DISPLAY: Consider item showing letter for all experiments, with name colored (brown ?) if this peak # found in the cell type # DISPLAY: fix ordering of cell letters -- they should be sorted # (in cellLetter.tab first, so Exps files has correct id's) cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p tfbs/peaks cd tfbs/peaks wget http://www.ebi.ac.uk/~swilder/ENCODE/awgHub/hg19/uniformTfbs_nocontrols.txt # this file has metadata # there are 3 'method' values: SPP, PeakSeq, wiggler # there are 2 'quality' values: Good, Caution # There are 144 antibodies (for 122 factors), 75 cell types, 95 cell types plus treatments, 457 datasets # Version 1 of this track had 148 antibodies, 67 cell types, 95 cell type+treatments, 486 datasets # -> basically the same data, but cleaned up sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep Tfbs > wget.csh sed -n 's/bigDataUrl /wget /p' uniformTfbs_nocontrols.txt | grep -v Tfbs > wget2.csh csh wget.csh >&! wget.out & # 416 files csh wget2.csh >&! wget2.out & # 41 (have Histone in filename) # sample filename: spp.optimal.wgEncodeUwTfbsHcfaaCtcfStdAlnRep1_VS_wgEncodeUwTfbsHcfaaInputStdAlnRep1.bb # from hub trackDb, metadata is in subGroups line: # subGroups tier=3 cellType=HCFaa factor=CTCF antibody=CTCF lab=UW treatment=None protocol=Std quality=Good method=SPP # Note 'quality'. Think about dropping those with quality='Caution'. Unfortunately some # Tier1 , esp. Pol2 are only available with this quality # Q: What does Rep1 mean ? 
Most are 'Rep0' (Answer from Anshul -- Rep0 are pooled, Rep1 were single-rep but adequate qual) # (Jim's methods excluded those not based on 2 replicates) # Q: Why are some datasets 'StdAln' and some are 'Aln' ? (Answer from Anshul -- names are from UCSC) # Q: How is quality assigned ? # rename to UCSC style and convert to bed: wgEncodeAwgTfbs<lab><cell>Peak cd .. ls peaks/spp.optimal.*.bb | \ perl -wpe 's^peaks/(spp.optimal.)(wgEncode)(\S+)(Tfbs)(\S+)(StdAln|Aln)(Rep\d_VS\S+)^bigBedToBed peaks/$1$2$3$4$5$6$7 peaks/$2Awg$4$3$5Peak.bed^' > unbig.csh # convert to bed and rename csh unbig.csh >&! unbig.out & # NOTE: ~10% (49 of 457 total) are dupes from different labs # Factors are: CFOS, CMYC, CTCF, E2F6, EBF1, FOXA1, GATA2, JUND, MAFK # MAX, POL2, RAD21, SRF, YY1 ls -1 peaks/*.bed > peak.lst ###### # TFBS from July 2012 freeze # Uniform processing with slightly more relaxed threshold (2% FDR vs 1%) based on # input Anshul received from users. And there are more data sets (690 after removing # quality failures vs. 495 in previous freeze) # Based on Anshul's recommendations, using 'optimal' SPP peaks, with black list filtering # (as in previous track). 
###### cd /hive/data/genomes/hg19/bed/wgEncodeRegV2 mkdir -p tfbs2012 cd tfbs2012 set dir = `pwd` mkdir logs # use high-quality uniform peaks used to build Uniform TFBS track # (see doc/encodeAwgHg19.txt) # This will require much less work than previous build, as much of the processing # was done to make this predecessor track (uniform naming & normalized scoring, extracting metadata) ln -s /hive/data/genomes/hg19/bed/wgEncodeAwgTfbsUniform/downloads peaks ls peaks/*.narrowPeak.gz > peak.lst wc -l peak.lst # 690 peak.lst # extract metadata for files, to format: # filename cell[+treatment] antibody cat > getMeta.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$p\t" # note: must strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,antibody stdout | \ perl -wne '($cell, $treat, $ab) = split; $cellTreat = $cell . "+" . $treat; \ $cellTreat =~ s/\+None//; print $cellTreat, "\t", $ab, "\n";' end 'EOF' csh getMeta.csh peak.lst > fileCellAb.withDupes.tab # alternate extraction of metadata - include lab, so we can be inclusive of max data # filename cell[+treatment]+lab antibody cat > getMetaBig.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$p\t" # note: must strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody stdout | \ perl -wne '($cell, $treat, $lab, $ab) = split; $cellTreat = $cell . "+" . $treat . "+" . 
$lab; \ $cellTreat =~ s/\+None//; print $cellTreat, "\t", $ab, "\n";' end 'EOF' csh getMetaBig.csh peak.lst > fileCellAb.tab # assign upper case codes to cell types in tiers 1, 2, and 2.5 # (tier 3's will all be assigned lower case first char) # note: treated cells have same code as untreated cp ~/kent/src/hg/regulate/regClusterBedExpCfg/cellLetter2.tab cellCodesTop.txt # hand-edit in codes, promoting Tier 2.5 to caps (A549, MCF-7, SK-N-SH, IMR90) # create config file for clustering regClusterBedExpCfg -noNormalize -tabList fileCellAb.withDupes.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.cfg # format is: # antibody cell[+treatment] cell_char file score_column normalization filename # CTCF H1-hESC 1 file 4 1 peaks/wgEncodeAwgTfbsBroadH1hescCtcfUniPk.narrowPeak.gz regClusterBedExpCfg -noNormalize -tabList fileCellAb.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.all.cfg # format is: # antibody cell[+treatment]+lab cell_char file score_column normalization filename # CTCF H1-hESC 1 file 4 1 peaks/wgEncodeAwgTfbsBroadH1hescCtcfUniPk.narrowPeak.gz # do clustering hgBedsToBedExps -dupeLetterOk clusters.cfg clusters.bed clusters.exps >&! logs/clusters.out & hgBedsToBedExps -dupeLetterOk clusters.all.cfg clusters.bed clusters.exps >&! logs/clusters.all.out & # 4 dupes (HAIB A549+Etoh/USF1, HepG2/NRSF, SK-N-SH/NRSF -- these are V04* vs. Pcr*) # and SYDH K562/Pol2 -- Iggrab control vs none indicated # hand-edit fileCellAb.tab to add V04 and Iggrab to cell field to uniquify regClusterBedExpCfg -noNormalize -tabList fileCellAb.uniq.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.uniq.cfg # cluster again, using uniquified config file hgBedsToBedExps -dupeLetterOk clusters.uniq.cfg clusters.bed clusters.exps >&! logs/clusters.uniq.out & cat logs/clusters.uniq.out #Loaded 690 records from clusters.uniq.cfg #Got 194 sources #Got 189 factors ## Success! 
wc -l clusters.bed # 4910269 clusters.bed # Load database, first clusters hgLoadBed hg19 wgEncodeRegTfbsClusteredV4 clusters.bed # Read 4910269 elements of size 15 from clusters.bed # was 4735242 in V3 # was 2750490 in V2 # Load sourceTable hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV4 ~/kent/src/hg/lib/expRecord.sql clusters.exps # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab> cat > getInputs.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$obj\t" # note: strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody stdout | \ perl -wne '($cell, $treat, $lab, $ab) = split; \ $source = $cell . "+" . $treat . "+" . $lab; \ $source =~ s/\+None//; \ print $source, "\t", $ab, "\t", $cell, "\t", $treat, "\t", $lab, "\n";' end 'EOF' csh getInputs.csh peak.lst > wgEncodeRegTfbsClusteredInputsV4.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV4 ~/kent/src/hg/lib/clusterInputTrackTable4.sql wgEncodeRegTfbsClusteredInputsV4.tab # Clustering by target cat > getMetaHuge.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$p\t" # note: must strip off treatment=None mdbPrint hg19 -obj=$obj | \ raToTab stdin -cols=cell,treatment,lab,antibody,target stdout | \ perl -wne '($cell, $treat, $lab, $ab, $target) = split; \ $source = $cell . "+" . $treat . "+" . $lab; \ $source =~ s/\+None//; print $source, "\t", $ab, "\t", $target, "\n";' end 'EOF' csh getMetaHuge.csh peak.lst > fileCellAbTarget.tab regClusterBedExpCfg -noNormalize -useTarget -tabList fileCellAbTarget.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.target.cfg # format is: # target cell[+treatment]+lab+ab cell_char file score_column normalization filename hgBedsToBedExps -dupeLetterOk clusters.target.cfg clusters.target.bed clusters.target.exps >&! 
\ logs/clusters.target.out & # 4 dupes: 3 HudsonAlpha (2 protocols), 1 Stanford (2 controls) # hand-edit fileCellAbTarget.tab to add V04 and Iggrab to cell field to uniquify # create cluster config file regClusterBedExpCfg -noNormalize -useTarget -tabList fileCellAbTarget.uniq.tab \ -cellLetter=cellCodesTop.txt -noLetterOk clusters.target.cfg # next step is the actual clustering, and takes some time (minutes) hgBedsToBedExps -dupeLetterOk clusters.target.cfg clusters.target.bed clusters.target.exps >&! \ logs/clusters.target.out & #Loaded 690 records from clusters.target.cfg #Got 690 sources #Got 161 factors hgLoadBed hg19 wgEncodeRegTfbsClusteredV6 clusters.target.bed ln -s clusters.target.bed wgEncodeRegTfbsClusteredV6.tab # V5 with separate HA-E2F1 factor: Read 4385242 elements of size 15 from clusters.target.bed # Read 4380444 elements of size 15 from clusters.target.bed hgLoadSqlTab hg19 wgEncodeRegTfbsCellsV6 ~/kent/src/hg/lib/expRecord.sql clusters.target.exps ln -s clusters.targets.exps wgEncodeRegTfbsCellsV6.tab # Create inputTrackTable - six columns: <table> <source> <factor> <cell> <treatment> <lab> # For target-based, include antibody in source field cat > getTargetInputs.csh << 'EOF' foreach p (`cat $1`) set obj = $p:t:r:r echo -n "$obj\t" # note: strip off treatment=None mdbPrint hg19 -obj=$obj | raToTab stdin -cols=cell,treatment,lab,antibody,target stdout | \ perl -wne '($cell, $treat, $lab, $ab, $target) = split; \ $source = $cell . "+" . $treat . "+" . $lab . "+" . $ab; \ $source =~ s/\+None//; \ print $source, "\t", $target, "\t", $ab, "\t", \ $cell, "\t", $treat, "\t", $lab, "\n";' end 'EOF' csh getTargetInputs.csh peak.lst > wgEncodeRegTfbsClusteredInputsV6.tab hgLoadSqlTab hg19 wgEncodeRegTfbsClusteredInputsV6 \ ~/kent/src/hg/lib/clusterInputTrackTable5.sql wgEncodeRegTfbsClusteredInputsV6.tab # compare coverage of V2-V6 (V5 is same as 6, except clusters HA-E2F1 as separate target) # V2 - current version. 
June 2011 TFBS (425 inputs) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV2 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV2 7.797%, both 4.706%, cover 35.14%, enrich 4.51x featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV2 wgEncodeRegDnaseClusteredV2 # wgEncodeRegTfbsClusteredV2 7.797%, wgEncodeRegDnaseClusteredV2 13.390%, both 4.706%, cover 60.36%, enrich 4.51x # 'V3' - clustered by antibody. Mar 2012 TFBS, datasets thinned to uniquify (e.g. no multi-lab) # (635 inputs) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV3 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV3 12.855%, both 6.578%, cover 49.13%, enrich 3.82x # 'V4' - clustered by antibody. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV4 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV4 13.156%, both 6.690%, cover 49.96%, enrich 3.80 # 'V5' - clustered by target. Mar 2012 TFBS, all datasets passing Anshul's qual filter (690) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV5 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV5 13.156%, both 6.690%, cover 49.96%, enrich 3.80x # 'V6' - clustered by target. 
Mar 2012 TFBS, all datasets passing Anshul's qual filter (690) featureBits hg19 -noRandom -enrichment wgEncodeRegDnaseClusteredV2 wgEncodeRegTfbsClusteredV6 # wgEncodeRegDnaseClusteredV2 13.390%, wgEncodeRegTfbsClusteredV6 13.156%, both 6.690%, cover 49.96%, enrich 3.80x featureBits hg19 -noRandom -enrichment wgEncodeRegTfbsClusteredV5 wgEncodeRegDnaseClusteredV2 # wgEncodeRegTfbsClusteredV5 13.156%, wgEncodeRegDnaseClusteredV2 13.390%, both 6.690%, cover 50.86%, enrich 3.80x # V6 is release target -- rename the antibody clustering to V3a, and rename V6 to V3 (use symlinks in build dir) # Set up downloads set dir = /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012 cd $dir mkdir downloads gzip -c wgEncodeRegTfbsClusteredV3.bed > downloads/wgEncodeRegTfbsClusteredV3.bed.gz # disallow name truncation gzip -c -n wgEncodeRegTfbsClusteredInputsV3.tab > downloads/wgEncodeRegTfbsClusteredInputsV3.tab.gz gzip -c wgEncodeRegTfbsCellsV3.tab > downloads/wgEncodeRegTfbsCellsV3.tab.gz # use (existing) ENCODE DCC downloads dir set dload = /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered cd $dload # new release dir mkdir release3 ln -s release3 beta ln -s release3 releaseLatest cd $dir/downloads ln wgEncodeRegTfbsClusteredV3.bed.gz wgEncodeRegTfbsClusteredInputsV3.tab.gz wgEncodeRegTfbsCellsV3.tab.gz $dload cd $dload ln wgEncodeRegTfbsClusteredV3.bed.gz wgEncodeRegTfbsClusteredInputsV3.tab.gz wgEncodeRegTfbsCellsV3.tab.gz releaseLatest # include older files as well cp wgEncodeRegTfbsClusteredV2.bed.gz wgEncodeRegTfbsClusteredInputs.tab.gz wgEncodeRegTfbsCells.tab.gz releaseLatest cp README.txt releaseLatest # clean up bogus names embedded in zips) # gunzip wgEncodeRegTfbsClusteredV2.bed.gz wgEncodeRegTfbsClusteredInputs.tab.gz wgEncodeRegTfbsCells.tab.gz md5sum *.gz > md5sum.txt # vim README.txt # use wgEncodeRegDnaseClustered for basics cp README.txt md5sum.txt .. 
# add minimal metaObject to metaDb to associate new table with (pseudo-)composite # wgEncodeRegTfbsClustered -- this allows hgTrackUi to locate downloads cd $dir cat > wgEncodeRegTfbsClustered.ra << 'EOF' metaObject wgEncodeRegTfbsClusteredV3 objType table composite wgEncodeRegTfbsClustered fileName wgEncodeRegTfbsClusteredV3.bed.gz tableName wgEncodeRegTfbsClusteredV3 project wgEncode 'EOF' # also, add aux tables, then mdbUpdate, mdbPrint # This script creates a BED5+ download file with clusters and cell info clusterAddSources.pl wgEncodeRegTfbsClusteredV3.bed wgEncodeTfbsClusteredInputsV3.tab > \ wgEncodeRegTfbsClusteredWithCellsV3.bed gzip -c -n wgEncodeRegTfbsClusteredWithCellsV3.bed > downloads/wgEncodeRegTfbsClusteredWithCellsV3.bed.gz cd downloads ln wgEncodeRegTfbsClusteredWithCellsV3.bed.gz $dload cd $dload ln wgEncodeRegTfbsClusteredWithCellsV3.bed.gz releaseLatest md5sum *.gz > md5sum.txt # vim README.txt cp README.txt md5sum.txt .. ############ # Fixes to Inputs table (2014-05-19 kate) # Matches manual changes to uniqify Cells table hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="A549+EtOH_0.02pct+HudsonAlpha+V04+USF-1" where tableName="wgEncodeAwgTfbsHaibA549Usf1V0422111Etoh02UniPk"' hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="HepG2+HudsonAlpha+V04+NRSF" where tableName="wgEncodeAwgTfbsHaibHepg2NrsfV0416101UniPk"' hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="SK-N-SH+HudsonAlpha+V04+NRSF" where tableName="wgEncodeAwgTfbsHaibSknshNrsfV0416101UniPk"' hgsql hg19 -e 'update wgEncodeRegTfbsClusteredInputsV3 set source="K562+Stanford+Iggrab+Pol2(phosphoS2)" where tableName="wgEncodeAwgTfbsSydhK562Pol2s2IggrabUniPk"' ####################################################################### # performance eval # try changing expIds and expScores to varchar 2048 from blob # create table alter table kateTemp disable keys; insert into kateTemp (select * from wgEncodeTfbsClusteredV3) # try bigBed set dir 
= /cluster/data/hg19/bed/wgEncodeRegV2/tfbs2012 cd $dir set track = wgEncodeRegTfbsClusteredV3 bedSort clusters.target.bed clusters.target.sorted.bed /cluster/bin/x86_64/bedToBigBed -type=bed15 -tab clusters.target.sorted.bed \ /hive/data/genomes/hg19/chrom.sizes $track.bb # Error line 1 of xx.bed: expecting 690 elements in expIds list (bed field 14) # Might be a bogus error -- looks OK on inspection # GOT HERE ln -s `pwd`/$track.bb /gbdb/hg19/wgEncodeReg hgBbiDbLink hg19 $track wg/gbdb/hg19/wgEncodeReg/$track.bb # for now, adding maxWindowToDraw 1M setting to trackDb to limit performance hit # shrink table format to BED5+, with columns 6 and 7 non-sparse comma-sep lists # of exp id's and scores clusterShrinkExps.pl clusters.target.sorted.bed > clusters.shrunk.bed +# NOTE: moving this tool to makeDb/hgBedsToBedExps/bedExpsToFactorSource.pl hgLoadBed -sqlTable=~/kent/src/hg/lib/factorSource.sql -renameSqlTable \ hg19 wgEncodeRegTfbsClusteredShrunkMini test.shrink.bed hgLoadBed -sqlTable=~/kent/src/hg/lib/factorSource.sql -renameSqlTable \ hg19 wgEncodeRegTfbsClusteredShrunk clusters.shrunk.bed # rename Shrunk to V3 # copy to downloads cp clusters.shrunk.bed downloads/wgEncodeRegTfbsClusteredV3.bed cd downloads gzip wgEncodeRegTfbsClusteredV3.bed cd /hive/groups/encode/dcc/pipeline/downloads/hg19/wgEncodeRegTfbsClustered/releaseLatest md5sum wgEncodeRegTfbsClusteredV3.bed >> md5sum.txt # and remove old entry # (see Wrangler HowTo in ENCODE2 wiki for these steps) # replace md5sum in md5sum.txt, metaDb # create files.txt encodeMkFilesList hg19 ####################################################################### # EXPERIMENTAL: TF motif tracks, from EMBL/Aus (MEME toolkit) # These are generated using FIMO tool: # Charles E. Grant, Timothy L. 
Bailey, and William Stafford Noble, "FIMO: Scanning for occurrences of a given motif", Bioinformatics 27(7):1017-1018, 2011 # http://bioinformatics.oxfordjournals.org/content/early/2011/02/16/bioinformatics.btr064.full # http://meme.ebi.edu.au/meme/cgi-bin/fimo.cgi mkdir /cluster/data/hg19/bed/tfbsFimo cd /cluster/data/hg19/bed/tfbsFimo hgLoadBed hg19 tfbsFimoJaspar TfMotifsFIMO_Jaspar.hg19.bed # Read 1386302 elements of size 6 from TfMotifsFIMO_Jaspar.hg19.bed hgLoadBed hg19 tfbsFimoTransfac TfMotifsFIMO_Transfac.hg19.bed # Read 3665198 elements of size 6 from TfMotifsFIMO_Transfac.hg19.bed ####################################################################### # more motif exploratory work -- give Larry M's transRegCode a ride cd /hive/data/genomes/hg19/bed/wgEncodeRegV2/tfbs2012 mkdir motifs cd motifs # lift motif localizations from hg18 (where did they come from originally ?) hgsql hg18 -Ne "select * from wgEncodeRegTfbsClusteredMotifs" | \ cut --complement -f 1 | \ liftOver -bedPlus=4 stdin /gbdb/hg18/liftOver/hg18ToHg19.over.chain.gz \ motifs.bed motifs.unmapped hgLoadBed hg19 -as=~/kent/src/hg/lib/bed6FloatScore.as wgEncodeRegTfbsClusteredMotifs motifs.bed # Read 1750827 elements of size 6 from motifs.bed ####################################################################### # Table cleanup/renaming (more to come): TfbsClusteredV3a -> TfbsClusteredV3a_old TfbsClusteredV4 -> TfbsClusteredV3a TfbsClusteredV5 -> TfbsClusteredV3b ####################################################################### # Factorbook motifs (Jun 2013, and then Dec 2013-Feb 2014, kate) # Data consists of motif locations, position weight matrices, # and curation (which motifs are canonical for which factors). # Basis of the track is supplementary data from Wang et al. Sep2012 Genome Res paper, # plus newer data provided by Bong Hyun Kim (nghyun.kim nghyun.kim@umassmed.edu) # of Factorbook team at Zhiping Weng's lab (Zlab). 
# From Bong-Hyun: #These are discovered motifs from 2012 freeze, methods in GR 2012:22(9), Wang et al. #Basically the motif were discovered from 500 strongest signal peaks by MEME, and then they are tested if they are enriched compared to the flanking regions of the peaks or GC content matching regions. And then the motifs are condensed by manual curation. Then the peak regions are searched by the condensed motifs using FIMO. Note that UA (unannotated) motifs are new motif described in the genome research paper, but UAK's are potentially unannotated motifs found newer datasets and not yet carefully curated. cd /cluster/data/hg19/bed mkdir factorbook cd factorbook ################# # Motif locations within ENCODE peaks # Each line is the genomic location of a motif whose name is the 5th column. # The 6, 7 and 8th columns are q-value of the motif match, peak number (or bed file line number) # in the original uniform processed peak call, and peak summit, respectively. # # Load these as a BED table of merged data from all experiments, with q-value replaced with # -log10 score. Duplicate items are removed, with the item scored as maximum found. # (NOTE: this table could be expanded to include the sources as in factorSource tables -- this # would allow it to be used to show motifs in the individual experiment narrowPeak files) wget http://zlab.umassmed.edu/zlab/publications/TFBS.tar.gz # data from Sep2012 Genome Research paper (through Jan 2011 ENCODE freeze) wget http://zlab.umassmed.edu/~kimb/encode_tfbs.tar.gz # newer data (Jan 2011 - Mar 2012 ENCODE freeze) tar xvfz encode_tfbs.tar.gz cd encode_tfbs cat *.bed | wc -l # 6316631 # ls *.bed | wc -l # 294 # why so few ? -- asking Bong-hyun # NOTE: some beds are for controls! # NOTE: there are motifs for different factors in these peaks (not just target of assay) # Around 6M localizations in these files, some appearing multiply reflecting detection # in peaks from different experiments. 
These will be merged in the final table. # convert: # chrom, start, end, strand, name, qvalue -> # chrom, start, end, name, float_score, strand (bed6FLoatScore) cat encode_tfbs/*.bed pub_tfbs/*.summary | \ awk '{OFS="\t"; print $1, $2-1, $3, $5, $6, $4}' > motifs.withdupes.bed wc -l motifs.withdupes.bed # 12115922 bedSort motifs.withdupes.bed stdout | uniq > motifs.bed wc -l motifs.bed # 12022426 motifs.bed hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ wgEncodeRegTfbsClusteredMotifs_factorBook motifs.bed # Read 12022426 elements of size 6 from motifs.bed # TODO: remove multiple hits at same position, remove q-value (we will rescore later) awk '{$5=0; print}' motifs.bed | uniq > motifs.uniq.bed wc -l motifs.uniq.bed # 2631434 motifs.uniq.bed # 2.6M, down from 12M hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ wgEncodeAwgTfbsMotifs motifs.uniq.bed # Check range of qvalue: >select min(score) from wgEncodeAwgTfbsMotifs; | 0.000000002479999983151515 | >select max(score) from wgEncodeAwgTfbsMotifs; | 0.6370000243186951 | >select count(*) from wgEncodeAwgTfbsMotifs where score > .1 | 1065928 | # BongHyun says these are bad quality -- should drop >select count(*) from wgEncodeAwgTfbsMotifs; | 12022426 | # Reload table with -log10 scoring and motif name fixes (Jan 21, 2014 KRR) # (Two new canonicals were found that have Jaspar naming convention V_ prefix:) #NFIC NFIC (V_NF1_Q6) #NR4A1 NR4A1 (V_TR4_03) # Also, both v-Jun and v-JUN are used cat > dedupAndScore.pl << 'EOF' #!/usr/bin/env/perl # retain lowest scored item (q-value) at a location, converting score to -log10 # expects file to be pre-sorted with: sort -k1,4 -k6,6 -k5,5 sub minusLog10 { my $n = shift; return -log($n)/log(10); } $chrN = ""; while (<>) { chomp; ($chr, $start, $end, $name, $score, $strand) = split; if ($chr ne $chrN || $start ne $startN || $end ne $endN || $name ne $nameN || $strand ne $strandN) { $logScore = minusLog10($score); 
printf("%s\t%d\t%d\t%s\t%.2f\t%s\n", $chr, $start, $end, $name, $logScore, $strand); $chrN = $chr; $startN = $start; $endN = $end; $nameN = $name; $strandN = $strand; } } 'EOF' sort -k1,4 -k6,6 -k5,5 motifs.bed > motifs.sorted.bed perl dedupAndScore.pl < motifs.sorted.bed | \ # fix motif naming inconsistencies (V_ prefix) sed -e 's/v-Jun/v-JUN/' -e 's/V_NF1_Q6/NFIC/' -e 's/V_TR4_03/NR4A1/' > motifs.load.bed wc -l motifs.load.bed #2366151 motifs.load.bed hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ factorbookMotifPos motifs.load.bed ################# # Position weight matrices for the motifs # These are in MEME format wget http://zlab.umassmed.edu/~kimb/gr_paper_extended.meme # MOTIF A-Box #... # A 0.25000 C 0.25000 G 0.25000 T 0.25000 #... # letter-probability matrix: alength= 4 w= 21 nsites= 116 E= 5.4e-213 # 0.137931 0.189655 0.017241 0.655172 # 0.620690 0.051724 0.077586 0.250000 # ... # use w for column count. 1 probability row per column. order: acgt grep MOTIF gr_paper*meme | wc -l # 132 # total motifs cat > convertMotifs.pl << 'EOF' #!/usr/bin/env perl # convertMotifs - create load file for UCSC transRegCodeMotif table from MEME output ## MEME format: #MOTIF <name> #... #letter-probability matrix: alength= 4 w=<columns> #Aprob Cprob Gprob Tprob #... 
## TransRegCodeMotif format: #name columnCount aProb cProb gProb # where *Prob fields are comma-sep lists of length columnCount use warnings; use strict; my $name; my $columnCount; my $cols = 0; my $aProb, my $cProb, my $gProb, my $tProb; my @aProbs = (), my @cProbs = (), my @gProbs = (), my @tProbs = (); sub printProbs { # print comma-sep list of probabilities my ($probs) = @_; print "\t"; foreach my $prob (@$probs) { print "$prob,"; } } while (<>) { if (/^MOTIF (\S+)/) { $name = $1; } elsif (defined($name) && /^letter-probability matrix: alength= \d+ w= (\d+)/) { $columnCount = $1; } elsif (defined($columnCount)) { # working on probabilities if ($cols < $columnCount) { # slurp in probabilities $cols++; ($aProb, $cProb, $gProb, $tProb) = split; push(@aProbs, $aProb); push(@cProbs, $cProb); push(@gProbs, $gProb); push(@tProbs, $tProb); } else { # finished with this motif -- print it out and set up for next print "$name\t$columnCount"; printProbs(\@aProbs); printProbs(\@cProbs); printProbs(\@gProbs); printProbs(\@tProbs); print "\n"; undef $name; undef $columnCount; $cols = 0; @aProbs = (); @cProbs = (); @gProbs = (); @tProbs = (); } } } 'EOF' convertMotifs.pl gr_paper_extended.meme > transRegCodeMotif.tab # wc -l transRegCodeMotifs.tab # 132 transRegCodeMotifs.tab hgLoadSqlTab hg19 transRegCodeMotifs_factorBook ~/kent/src/hg/lib/transRegCodeMotif.sql \ transRegCodeMotif.tab # mySQL error 1062: Duplicate entry 'v-JUN' for key 1 # There are both v-JUN and v-Jun entries # For now, prune out v-Jun (and inquire at zlab) amd reload # Again, motif name consistency sed -e 's/V_NF1_Q6/NFIC/' -e 's/V_TR4_03/NR4A1/' transRegCodeMotif.fixed.tab > pwms.tab hgLoadSqlTab hg19 factorbookMotifPwm ~/kent/src/hg/lib/transRegCodeMotif.sql pwms.tab ########## # Create alias table to map factor names at UCSC (HGNC, in ENCODE controlled vocab), to Factorbook # names (not always HGNC), so linkouts work. For use with trackDb url setting. 
cat > geneAlias.sql << 'EOF' CREATE TABLE factorbookGeneAlias ( name VARCHAR(255) NOT NULL, value VARCHAR(255) NOT NULL, PRIMARY KEY (name) ); cat > testLinks.csh << 'EOF' foreach f (`awk '{print $1}' allMotifTargets.load.tab`) wget -O /dev/null -q http://www.factorbook.org/mediawiki/index.php/$f set ret = $status echo "$f : $ret" end 'EOF' csh testLinks.csh > testLinks.out # results in 'noPage.txt' and 'misnamedPage.txt' # various tweaking to generate factorbookGeneAlias.tab (sorry, log is lost, but should be easy to recreate) hgsql hg19 < geneAlias.sql hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookGeneAlias.tab" INTO TABLE factorbookGeneAlias' #1/31/14 - patch factorbookGeneAlias update factorbookGeneAlias set value='ERalpha a' where name='ESR1'; #3/13/14 - remove genes not in TF clusters (bad quality dataset) delete from factorbookGeneAlias where name='NR4A1' delete from factorbookGeneAlias where name='SREBP2' ########## # Create table to map factor names to their canonical motifs (12/4/13 Kate) # Many twists and turns here -- the goal is to assure factor names match UCSC TF CV names, and # that motif names match BED of motif positions. Also, that only factors having DNA-binding domains # that will bind directly are listed (not those with 'tethered-only' binding mode). # Approach was to use Table S1 + communications from Zlab for newer factors, removing those # listed in Table S3 as tethered only. Zlabbers have offered to provide us with a full # file for reloading this table. 
# Get targets for each dataset csh listMotifs.csh > targets.txt cp targets.txt targets.txt.fixed # Remove low quality datasets (see details in AwgTfbsUniform) # These were left out of TFBS Clusters V3 #wgEncodeSydhTfbsGm12878Spt20StdAlnRep0 #wgEncodeSydhTfbsGm12878Gcn5StdAlnRep0 #wgEncodeHaibTfbsPfsk1Pol24h8V0416101AlnRep0 #wgEncodeHaibTfbsA549Sp1V0422111Etoh02AlnRep0 #wgEncodeSydhTfbsHepg2CebpzIggrabAlnRep0 #wgEncodeSydhTfbsHelas3Gcn5StdAlnRep0 #wgEncodeSydhTfbsGm12878Irf3IggmusAlnRep0 #wgEncodeSydhTfbsK562Xrcc4StdAlnRep0 #wgEncodeSydhTfbsHepg2Srebp2PravastStdAlnRep0 #wgEncodeSydhTfbsHepg2Srebp1PravastStdAlnRep0 #wgEncodeSydhTfbsHepg2Pol2PravastStdAlnRep0 # 78 left unmatched # List motifs in datasets csh listMotifs2.csh > motifs2.txt cp motifs2.txt motifs2.txt.fixed # remove bad datasets above # merge files paste targets.txt.fixed motifs2.txt.fixed > target_motifs.txt # remove bad Uchicago datasets. All but: #wgEncodeUchicagoTfbsK562Ehdac8ControlAlnRep0 #wgEncodeUchicagoTfbsK562EjundControlAlnRep0 #wgEncodeUchicagoTfbsK562EjunbControlAlnRep0 #wgEncodeUchicagoTfbsK562Egata2ControlAlnRep0 #wgEncodeUchicagoTfbsK562EfosControlAlnRep0 # inventory file awk -F'\t' 'OFS="\t" {print $1, $2, $4}' < target_motifs.txt > target_motifs_out.txt # Create map table cat > motifTarget.sql << 'EOF' CREATE TABLE motifTarget ( target VARCHAR(255) NOT NULL, motif VARCHAR(255) NOT NULL, PRIMARY KEY (target) ); 'EOF' hgsql hg19 < factorbookMotifTarget.sql # Get mappings from GR paper (Wang 2012) wget http://genome.cshlp.org/content/suppl/2012/08/22/22.9.1798.DC1/TableS1.xls # extract tab-sep file: to FactorbookTableS1.txt # columns needed for map table: 2: HGNC ID, 4: canonical motif # NOTE: a few have two canonical( ; sep) -- keep it simple for now, just use first # NOTE: some have no canonical -- col 4 is empty tail -n +2 FactorbookTableS1.txt | \ awk -F'\t''{print $2, $4}' | sort | uniq | sed 's/;.*//' > pubMotifs.txt # Need same for newer data # Merge and load in table #awk '{print 
$1, "\t", $2}' pubMotifs.txt newMotifs.guess.txt | sort > allMotifs.tab #hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookTargetToMotif.tab" INTO TABLE factorbookTargetToMotif' # Replace with mappings from Bong-Hyun, sent in 12/19 email cat << 'EOF' | sed 's/ */ /' | grep -v '^#' > newMotifs.txt #The TFs not mentioned in my list do not have specific binding motifs, e.g. histone acetylases or other non-specific DNA binding proteins. #Common.Name canonical.motif ATF1 CREB ATF2 CREB BACH1 NFE2 CREB1 CREB ELK1 ELF1 FOXM1 FOXA FOXP2 FOXA MAZ UAK42 NFIC NFIC NR4A1 NR4A1 RUNX3 RUNX1 SP4 SP1 SREBF2 SREBF1 STAT5A STAT1 TCF3 TCF3 TEAD4 TEAD4 ZKSCAN1 UAK54 ZNF217 UAK59 'EOF' # NOTE: NR4A1 and SREBF2(SREBP2) not in Anshul's uniform peaks (failed qual filter I think) # Drop these from target list sort pubMotifs.txt newMotifs.txt | sed 's/ /\t/' > allMotifTargets.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTarget' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargets.tab" INTO TABLE factorbookMotifTarget' # Check for identifier mismatches # 1. Clusters table with Motif targets table hgsql hg19 -e "select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name" > clusterTargets.txt hgsql hg19 -e "select distinct(target) from factorbookMotifTarget order by target" > motifTargets.txt diff clusterTargets.txt motifTargets.txt > targets.diff # These are targets in Factorbook, not in UCSC targets list grep '>' targets.diff > target > NFKB1 # -> RELA in UCSC CV (target gene for this antibody - NFKB p65) > NR4A1 # not in UCSC clusters track (low qual data ? 
It's Chicago) > POLR3A # -> RPC155 in UCSC CV (old-school, not current HGNC) > SREBF1 # -> SREBP1 in UCSC CV (old-school, not current HGNC) > SREBF2 # -> SREBP2 in UCSC CV (old-school, not current HGNC) # Update targets table to use UCSC CV targets sed -e 's/^NFKB1/RELA/' -e 's/^POLR3A/RPC155/' -e 's/^SREBF1/SREBP1/' -e 's/^SREBF2/SREBP2/' \ allMotifTargets.tab > allMotifTargets.load.tab hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargets.load.tab" INTO TABLE factorbookMotifTarget' # TODO: remove targets having no canonical motif (25 of total 137) # Also need to rename two new canonicals found that have Jaspar naming convention: #NFIC NFIC (V_NF1_Q6) #NR4A1 NR4A1 (V_TR4_03) # Also, spot-checking new canonical list above, I noted that ATF2 motif file lacked CREB -- inquiring with BongHyun as to why. #wgEncodeHaibTfbsGm12878Atf2sc81188V0422111AlnRep0.tfbs.bed : AP1 UAK16 v-JUN #wgEncodeHaibTfbsH1hescAtf2sc81188V0422111AlnRep0.tfbs.bed : v-JUN #wgEncodeHaibTfbsH1hescAtf3V0416102AlnRep0.tfbs.bed : CREB UAK18 USF ZNF143-ext #wgEncodeHaibTfbsHepg2Atf3V0416101AlnRep0.tfbs.bed : CREB-ext UAK18 USF #wgEncodeHaibTfbsK562Atf3V0416101AlnRep0.tfbs.bed : AP1 UAK17 #wgEncodeSydhTfbsK562Atf106325StdAlnRep0.tfbs.bed : NFY SP1 ZNF143-ext v-JUN # NOTE: some factors have two canonical motifs listed (; separated in Table S1) # For now, using just the first, but looking at data, both seem important. # Need code changes to pick these up. $ grep ';' *S1.txt | awk '{print $2, $4}' | sort | uniq ATF3 CREB;CREB-ext CTCF CTCF;CTCF-ext CTCFL CTCF;CTCF-ext GATA1 GATA1;GATA1-ext GATA2 GATA1;GATA1-ext JUN AP-1;v-JUN JUNB AP-1;v-JUN JUND AP-1;v-JUN RAD21 CTCF;CTCF-ext SMC3 CTCF;CTCF-ext THAP1 UA4;UA5 ZNF143 ZNF143;ZNF143-ext ZNF274 UA10;UA11 # Check for identifier mismatches # 1. 
Clusters table with Motif targets table hgsql hg19 -e "select distinct(name) from wgEncodeRegTfbsClusteredV3 order by name" > clusterTargets.txt hgsql hg19 -e "select distinct(target) from factorbookMotifTarget order by target" > motifTargets.txt diff clusterTargets.txt motifTargets.txt > targets.diff # These are targets in Factorbook, not in UCSC targets list grep '>' targets.diff > target > NFKB1 # -> RELA in UCSC CV (target gene for this antibody - NFKB p65) > NR4A1 # not in UCSC clusters track (low qual data ? It's Chicago) > POLR3A # -> RPC155 in UCSC CV (old-school, not current HGNC) > SREBF1 # -> SREBP1 in UCSC CV (old-school, not current HGNC) > SREBF2 # -> SREBP2 in UCSC CV (old-school, not current HGNC) # Update targets table to use UCSC CV targets grep '<' targets.diff > ucsc.diff # Notes here about targets at UCSC not in target table. FYI. # Check motif name consistency across tables awk '{print $4}' motifs.load.bed | sort | uniq > bedMotifs.txt awk '{print $2}' allMotifTargets.load.tab | sort | uniq > targetMotifs.txt awk '{print $1}' pwms.tab | sort | uniq > pwmMotifs.txt diff pwmMotifs.txt targetMotifs.txt | grep '>' > > AP-1 # AP1 > AP-2 # AP2 > NF-Y # NFY, NFY-UA2 ? > PU.1 # PU1 > bHLH # BHLHE40 # Check pwms vs. 
bed files diff pwmMotifs.txt bedMotifs.txt 67c67 < UA12 # no UA12 in bed file --- > UA14 # no UA14 in pwms 71d70 < UA5 77a77,79 > UAK18 > UAK19 > UAK20 130a133 > v-Jun # should be v-JUN for consistency # Update BED table to correct v-Jun cp motifs.load.bed motifs.fix.bed sed 's/v-Jun/v-JUN/' motifs.fix.bed >motifs.load.bed hgLoadBed hg19 -sqlTable=$HOME/kent/src/hg/lib/bed6FloatScore.sql -renameSqlTable \ factorbookMotifPos motifs.load.bed # Update targets table to match bed & pwm tables cp allMotifTargets.load.tab allMotifTargets.fix.tab sed -e 's/AP-1$/AP1/' -e 's/AP-2$/AP2/' -e 's/NF-Y/NFY/' -e 's/PU.1/PU1/' -e 's/bHLH/BHLHE40/' \ allMotifTargets.fix.tab > allMotifTargets.load.tab hgsql hg19 -e 'DELETE FROM factorbookMotifTarget; LOAD DATA LOCAL INFILE "allMotifTargets.load.tab" INTO TABLE factorbookMotifTarget' #Add secondary canonicals: #CREB-ext, CTCF-ext, GATA1-ext, v-JUN (to AP-1), UA5 (to UA4), ZNF143-ext, UA11 (to UA10) sed \ -e 's/CREB$/CREB,CREB-ext/g' \ -e 's/CTCF$/CTCF,CTCF-ext/g' \ -e 's/GATA1$/GATA1,GATA1-ext/g' \ -e 's/ZNF143$/ZNF143,ZNF143-ext/g' \ -e 's/UA4$/UA4,UA5/g' \ -e 's/UA10$/UA10,UA11/g' \ -e 's/AP-1$/AP-1,v-JUN/g' \ allMotifTargets.load.tab > allMotifTargetsV2.load.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTargetV2' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargetsV2.load.tab" INTO TABLE factorbookMotifTargetV2' # Remove motifs for 'indirect' binding TFs (those showing only 'tethered' mode in Table S3) awk 'FS="\t" {printf("%s\t%s\t%s\n", $2, $3, $12)}' FactorbookTableS3.txt | sort | uniq > indirects.tsv # Tethered only are: BATF AP-1 BCL11A UA9 BCL3 NFKB1 ETS1 ETS1 FOS AP-1 GATA2 GATA1;GATA1-ext GATA3 GATA3 IRF1 PRDM1 MXI1 MAX NR3C1 NR3C1 PAX5 PAX5 POLR2A TBP PPARGC1A ESRRA RAD21 CTCF;CTCF-ext RFX5 RFX5 SMC3 CTCF;CTCF-ext SREBF1 SREBF1 SRF SRF STAT3 STAT1 TAF1 TBP # Hand edit motifs for factors above out of file (allMotifTargetsV2.load.tab -> V3) hgsql hg19 -e 'select
distinct(name) from wgEncodeRegTfbsClusteredV3 order by name' > allFactors.txt hgsql hg19 -e 'select distinct(target) from factorbookMotifTarget order by target' > allMotifFactors.txt diff allFactors.txt allMotifFactors.txt | grep '<' | sed -e 's/< //' > newNoMotifFactors.txt wc -l newNoMotifFactors.txt 26 newNoMotifFactors.txt # NOTE: NR4A1 and SREBF2(SREBP2) not in Anshul's uniform peaks (failed qual filter I think) # Drop these from allMotifTargetsV4.load.tab sed -e 's/$/\t/' -e '/NR4A1/d' -e '/SREBF2/d' newNoMotifFactors.txt > foo mv foo newNoMotifFactors.txt cat allMotifTargetsV3.load.tab newNoMotifFactors.txt | sort > allMotifTargetsV4.load.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifTargetV4' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "allMotifTargetsV4.load.tab" INTO TABLE factorbookMotifTargetV4' # Generate comma-sep list of factors for trackDb filter, 'factorFilterList.txt' # Get citations for track description References section getTrackReferences 22955990 23203885 22955619 > refs.txt # Requested corrections target table using new gene vs. canonical motif table from Jiali at Zlab (2014-03-06 kate) # They returned files with new motif calls & annotations. Perhaps for another version of the track! wget http://zlab.umassmed.edu/~zhuangj/Direct_binding_motifs.tar.gz wget http://zlab.umassmed.edu/~kimb/motif_centric_encode_tfbs.sorted.bed.bz2 bunzip2 !$ # In email, the following 4 TFs were specifically mentioned as direct binders # missing canonical motif: # NFIC->NFIC (already in table) # NR4A1->NR4A1 (not in our TF clusters, rejected due to low quality, so not relevant to table) # and these two, which will be added to the table!
# NR3C1->NR3C1 # FOS->AP1 sed -e 's/NR3C1\t$/NR3C1\tNR3C1/' -e 's/FOS\t$/FOS\tAP1/' allMotifTargetsV4.load.tab > factorbookMotifCanonical.load.tab hgsql hg19 < motifTarget.sql hgsql hg19 -e 'alter table motifTarget rename to factorbookMotifCanonical' hgsql hg19 -e 'LOAD DATA LOCAL INFILE "factorbookMotifCanonical.load.tab" INTO TABLE factorbookMotifCanonical'