aba55984f35dacd02061eafb2ca9a9784efb8bae kate Thu May 9 16:31:31 2019 -0700 Add/polish track labels and descriptions for ENCODE 3 TF tracks. refs #21139 diff --git src/hg/makeDb/doc/encode3/tfbs.txt src/hg/makeDb/doc/encode3/tfbs.txt index 3bd8e8f..9b2a5fb 100644 --- src/hg/makeDb/doc/encode3/tfbs.txt +++ src/hg/makeDb/doc/encode3/tfbs.txt @@ -192,30 +192,37 @@ motifMapTable factorbookMotifCanonical motifMaxWindow 50000 motifDrawDefault on urlLabel Factorbook Link: url http://www.factorbook.org/mediawiki/index.php/$$ idInUrlSql select value from factorbookGeneAlias where name='%s' controlledVocabulary encode/cv.ra cellType=cell treatment=treatment lab=lab visibility dense useScore 1 priority 1.71 maxWindowToDraw 10000000 dataVersion ENCODE Mar 2012 Freeze filterBy name:factor=\ 'EOF' +# rename tables for consistency with earlier regulatory supertrack tracks (but distinguish from +# ENCODE 2 by prefix) + +hgsql hg19 -e "alter table encode3RegTfbsCluster rename to encRegTfbsClustered" +hgsql hg19 -e "alter table encode3RegTfbsClusterInput rename to encRegTfbsClusteredInputs" +hgsql hg19 -e "alter table encode3RegTfbsExp rename to encRegTfbsClusteredSources" + ############### # hg38 # (2019-03-25 kate) cd ../hg38 mkdir peaks mv tf.GRCh38.tar peaks cd peaks tar xvf tf.GRCh38.tar cd .. mv peaks/metadata.tsv . 
# config @@ -296,70 +303,135 @@ hgLoadSqlTab hg38 encode3RegTfbsClusterInput \ ~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab # list factors for trackDb filterBy awk '{print $4}' fileCellAbTarget.tab | sort | uniq | sed 's/$/,\\/' > factors.trackDb wc -l factors.trackDb # 340 factors.trackDb # add to trackDb filterBy setting # list cells awk '{print $2}' fileCellAbTarget.tab | sed 's/+.*//' | sort | uniq > cells.txt wc -l cells.txt # 129 cells.txt +# rename tables for consistency with earlier regulatory supertrack tracks (but distinguish from +# ENCODE 2 by prefix) + +hgsql hg38 -e "alter table encode3RegTfbsCluster rename to encRegTfbsClustered" +hgsql hg38 -e "alter table encode3RegTfbsClusterInput rename to encRegTfbsClusteredInputs" +hgsql hg38 -e "alter table encode3RegTfbsExp rename to encRegTfbsClusteredSources" + ################ # Load per-factor tables (needed for clusters track details) # Consider also creating a track for these (composite with 1200 subtracks) cat > scorePeaks.csh << 'EOF' set db = $1 set f = $2 set t = $f:r:r set table = encode3TfbsPk$t echo $table zcat $f > $table.bed bedScore -col=7 -verbose=2 -method=reg -uniform $f $table.scored.bed hgLoadBed -noNameIx -trimSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable \ -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as $db $table $table.scored.bed gzip -c $table.scored.bed > ../scoredPeaks/$table.bed.gz rm $table.bed $table.scored.bed end 'EOF' cd hg19 mkdir scoredPeaks cd peaks awk '{print $1}' ../fileCellAbTarget.tab | sed 's/peaks\///' | \ xargs -L 1 ../../scorePeaks.csh hg19 >&! ../scorePeaks.log & - +cd .. perl ../makeTrackDb.pl < clusters.inputs.tab > trackDb.ra +# trim shortLabels: +grep shortLabel trackDb.ra | sed 's/shortLabel //' | sort > shortLabels.orig.txt +sort | awk -F'\n' '{print $1, "\t", $1}' shortLabels.orig.txt > \ + shortLabels.twocol.txt +# 2. import to google sheet +# 3. trim 2nd column to 17 chars +# 4. 
export as tab-sep + +tdbRename trackDb.ra shortLabel encode3TfChipShortLabels.hg19.txt trackDb.new.ra + +# Fix up subGroup members w/ punctuation and initial numbers: +# Peyer's_patch -> Peyers_patch +# NT2/D1 -> NT2_D1 +# 22Rv1 -> X22Rv1 +# MM.1S -> MM_1S + +# rename tables (encode3Tfbs -> encTfChipPk) + +hgsql hg19 -e 'show tables like "encode3TfbsPk%"' > tables.old.txt +sed -e 's/^/alter table /' -e 's/$/ rename to /' tables.old.txt > rename.1.sql +sed -e 's/encode3TfbsPk/encTfChipPk/' tables.old.txt | paste rename.1.sql - | \ + sed 's/$/;/' > rename.sql +hgsql hg19 < rename.sql + +# reload cluster input table +hgLoadSqlTab hg19 encode3RegTfbsClusterInput \ + ~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab + +############### +# hg38 + cd ../hg38 mkdir scoredPeaks cd peaks awk '{print $1}' ../fileCellAbTarget.tab | sed 's/peaks\///' | \ xargs -L 1 ../../scorePeaks.csh hg38 >&! ../scorePeaks.log & +cd .. perl ../makeTrackDb.pl < clusters.inputs.tab > trackDb.ra paste cells.txt cells.txt > cellGroup.txt # edit for subgroup format +grep shortLabel trackDb.ra | sed 's/shortLabel //' | sort > shortLabels.orig.txt + +comm -2 -3 shortLabels.orig.txt ../hg19/shortLabels.orig.txt > shortLabels.hg38.only.txt + +# add to google spreadsheet for trimming to 17 chars (input to tdbRename) + +tdbRename trackDb.ra shortLabel encode3TfShortLabels.uniq.tsv trackDb.new.ra + +# Fix up subGroup members w/ punctuation and initial numbers: +# Peyer's_patch -> Peyers_patch +# NT2/D1 -> NT2_D1 +# 22Rv1 -> X22Rv1 +# MM.1S -> MM_1S + +# rename tables (encode3Tfbs -> encTfChipPk) + +hgsql hg38 -e 'show tables like "encode3TfbsPk%"' > tables.old.txt +sed -e 's/^/alter table /' -e 's/$/ rename to /' tables.old.txt > rename.1.sql +sed -e 's/encode3TfbsPk/encTfChipPk/' tables.old.txt | paste rename.1.sql - | \ + sed 's/$/;/' > rename.sql +hgsql hg38 < rename.sql + +# reload cluster input table +hgLoadSqlTab hg38 encode3RegTfbsClusterInput \ + 
~/kent/src/hg/lib/clusterInputTrackTable5.sql clusters.inputs.tab + #################### # Motifs (hg38) from Henry Pratt at Zlab # 2019-03-13 # Notes from Henry: # "The structure is mostly the same as the existing tables: canonical.tsv contains lists # of canonical motifs for each factor, pwms.tsv contains the PWM list for the motifs, # and fimo.tsv contains a large list of occurrences for each motif. We have expanded # the number of canonical motifs in many cases to more than two, including some novel # motifs MEME discovered which aren't annotated in the databases I searched. # That's a primary area where we'll probably look to filter and/or merge PWMs to # reduce the size a little bit more." cd /hive/data/outside/encode3/tfbs/dac