9c3a278e681d47532c5e404d43f7019239c60602
kate
  Mon Oct 14 12:48:13 2019 -0700
Updates to track description, add make doc. refs #23881

diff --git src/hg/makeDb/doc/encode3/mouse.txt src/hg/makeDb/doc/encode3/mouse.txt
index ee70c3d..c71b3ea 100644
--- src/hg/makeDb/doc/encode3/mouse.txt
+++ src/hg/makeDb/doc/encode3/mouse.txt
@@ -113,15 +113,167 @@
 # oops, they have decimal points in score.  Strip these.
 
 // biggify
 set sizes = /hive/data/genomes/mm10/chrom.sizes
 foreach f (*.bed)
     set d = $f:r
     sed 's/\..*//' < $d.bed > $d.fixed.bed
     bedToBigBed -type=bed5 $d.fixed.bed $sizes $d.bb
 end
 
 # link to /gbdb
 rm /hive/data/gbdb/mm10/encode3/atac/*.bb
 csh rename2.bb.csh
 
 
+######################################################################################
+# Interact tracks from Ren Lab enhancer/gene map, with EPDnew promoters (JK rec)
+
+# (kate July 2019)
+
+#################
+# Download spreadsheet from Dropbox copy provided by Dave Gorkin
+# https://www.dropbox.com/s/ksxt9k2dh46k2ya/Gorkin_Ren_tableS5-11.xlsx?dl=0
+
+# Gorkin_Ren_EnhancerGene_Rep1.txt
+
+dos2unix Gorkin_Ren_EnhancerGene_Rep1.txt
+tr '\r' '\n' < Gorkin* > enhancerGene.rep1.txt
+
+# edit out title line
+head -1 enhancerGene.rep1.txt
+# chrom   start   end     ensembl symbol  SCC     Z       p-value (z)     p-value (empirical)
+# chr1    4426300 4428300 ENSMUSG00000025902.9    Sox17   6.16E-01        2.07E+00        0.018999025     0.016378526
+
+# The SCC field will be basis of score.  It ranges from .25 to 1.0
+
+# strip trailing empty lines (from bad XLS export)
+
+#################
+# Download promoters from EPDnew (rec from JK)
+# https://epd.epfl.ch/EPDnew_database.php
+
+wget ftp://ccg.epfl.ch/epdnew/README
+wget ftp://ccg.epfl.ch/epdnew/M_musculus/003/cross_references.txt .
+
+# corrupted file ?  First line:
+
+wget ftp://ccg.epfl.ch/epdnew/M_musculus/current/Mm_EPDnew.bed
+
+# This is version 3, file dated 6/4/18
+
+# NOTE: file contains 1 or more promoters per gene.
+# Based on conversations with provider (Philip Bucher), we will merge into a single promoter region
+
+wc -l Mm_EPDnew.bed
+# 25111
+
+sed 's/_.*900 / 900 /' MM_EPDnew.bed > promoters.temp.bed
+bedtools groupby -g 1,4,5,6 -c 2,3 -o min,max < promoters.temp.bed | \
+        awk '{OFS="\t"; print $1, $5, $6, $2, $3, $4}' | \
+                bedSort stdin promoters.bed
+
+wc -l promoters.bed
+# 20549
+# Reduced by 4562 (~4500) after merging multiple promoters per gene
+
+#################
+# Create interact file from interactions and promoters files
+
+perl makeInteract.pl enhancerGene.rep1.txt MM_EPDnew.bed cross_references.txt >&! errors.txt
+
+# strip first two lines from enhancerGene file to make map.rep1.txt
+#perl makeInteract.pl map.rep1.txt promoters.bed cross_references.txt > enhancerGeneInteract.bed
+#Found 30793 interactions with promoters, 1171 with missing promoters
+
+# NOTE: Missing 374 promoters (will need to get these from GB annotation)
+
+# NOTE: Problem with cross-references file. At least one instance of error 
+# Here ENSMUSG00000063524 should be Eno1, not Eno1b (according to GENCODE V20)
+ENSMUSG00000059040      Eno1b   NM_001025388    Mus musculus enolase 1B, retrotransposed (Eno1b), mRNA.
+ENSMUSG00000063524      Eno1b   NM_023119       Mus musculus enolase 1B, retrotransposed (Eno1b), mRNA.
+
+# NOTE: Problem with duplicated interactions in enhancerGene file -- 2 ENSG's mapped to two
+# gene names (so interactions appear twice):
+
+chr17   13666000        13671200        ENSMUSG00000038347.10   Tcte2   8.69E-01        1.85E+00        0.03221258      0.002229654
+chr17   13666000        13671200        ENSMUSG00000038347.10   2700054A10Rik   8.69E-01        1.85E+00        0.03221258      0.002229654
+
+chr12     57514900        57516900        ENSMUSG00000046782.10   4921506M07Rik   4.82E-01        1.75E+00        0.039890209     0.04302926
+chr12     57514900        57516900        ENSMUSG00000046782.10   Ttc6    4.82E-01        1.75E+00        0.039890209     0.04302926
+
+
+bedSort enhancerGene.rep1.txt.out encode3EnhancerPromoterInteract.bed
+
+# Biggify interactions
+mkdir gbdb
+set sizes = /hive/data/genomes/mm10/chrom.sizes
+bedToBigBed -type=bed5+13 -as=enhancerPromoterInteract.as encode3EnhancerPromoterInteract.bed \
+                  $sizes gbdb/encode3EnhancerPromoterInteract.bb
+cd gbdb
+ln -s `pwd`/encode3EnhancerPromoterInteract.bb /gbdb/mm10/bbi
+cd ..
+
+# Biggify promoters file
+bedSort MM_EPDnew.bed.out epdPromoters3.bed
+
+# TODO: add more fields ?
+bedToBigBed -type=bed9 epdPromoters3.bed \
+                $sizes gbdb/epdPromoters3.bb
+cd gbdb
+ln -s `pwd`/epdPromoters3.bb /gbdb/mm10/bbi
+cd ..
+
+
+# old
+#hgLoadBed mm10 -noSort -noBin -type=bed5+13 \
+        #-sqlTable=/cluster/home/kate/kent/src/hg/lib/interact.sql -renameSqlTable \
+            #encode3EnhancerPromoter enhancerGeneInteract.bed
+            #encode3RenEnhancerGeneInteract enhancerGeneInteract.bed
+
+
+# merge replicates, stripping extra columns, adding a column for count.  
+
+# export from spreadsheet.  Trim empty lines.  dos2unix.  tr \r's (^M) to \n's.
+wc -l map.*
+  31964 map.rep1.txt
+  33301 map.rep2.txt
+  21142 map.replicated.txt
+
+perl mergeReps.pl map.rep1.txt map.rep2.txt | bedSort stdin map.merged.txt
+
+mkdir out
+perl makeInteract.pl map.merged.txt MM_EPDnew.bed cross_references.txt out/ >&! makeInteract.log
+bedSort out/enhancers.all.bed out/enhancers.all.sorted.bed
+bedSort out/enhancers.rep.bed out/enhancers.rep.sorted.bed
+
+# why needed ???
+bedSort out/interactions.all.bed out/interactions.all.sorted.bed
+bedSort out/interactions.rep.bed out/interactions.rep.sorted.bed
+
+# biggify
+set sizes = /hive/data/genomes/mm10/chrom.sizes
+bedToBigBed -type=bed5+14 -as=enhancerPromoterInteract.as out/interactions.all.sorted.bed \
+                  $sizes gbdb/encode3EnhancerPromoterInteractAll.bb
+bedToBigBed -type=bed5+14 -as=enhancerPromoterInteract.as out/interactions.rep.sorted.bed \
+                  $sizes gbdb/encode3EnhancerPromoterInteractRep.bb
+
+# TODO: add more fields
+bedToBigBed -type=bed9 out/promoters.all.bed \
+                $sizes gbdb/epdPromoterAll.bb
+bedToBigBed -type=bed9 out/promoters.rep.bed \
+                $sizes gbdb/epdPromoterRep.bb
+
+bedToBigBed -type=bed4 out/enhancers.all.sorted.bed \
+                $sizes gbdb/encode3EnhancerAll.bb
+bedToBigBed -type=bed4 out/enhancers.rep.sorted.bed \
+                $sizes gbdb/encode3EnhancerRep.bb
+cd gbdb
+foreach f (epdPromoterAll epdPromoterRep encode3EnhancerAll encode3EnhancerRep encode3EnhancerPromoterInteractAll encode3EnhancerPromoterInteractRep)
+    ln -s `pwd`/$f.bb /gbdb/mm10/bbi
+end
+
+cd gbdb
+ln -s `pwd`/epdPromoters3.bb /gbdb/mm10/bbi
+
+# NOTE: Renamed .bb files to prefix w/ encode3Ren/encode3RenInteract
+