8f288a664a2b8c73a3a5cbf0507905f89ff17e05
kate
  Wed Nov 6 11:05:03 2019 -0800
Cleanup make doc. refs #23881

diff --git src/hg/makeDb/doc/encode3/mouse.txt src/hg/makeDb/doc/encode3/mouse.txt
index 4d76eed..5d7826d 100644
--- src/hg/makeDb/doc/encode3/mouse.txt
+++ src/hg/makeDb/doc/encode3/mouse.txt
@@ -181,30 +181,31 @@
 # https://www.dropbox.com/s/ksxt9k2dh46k2ya/Gorkin_Ren_tableS5-11.xlsx?dl=0
 
 # Gorkin_Ren_EnhancerGene_Rep1.txt
 
 dos2unix Gorkin_Ren_EnhancerGene_Rep1.txt
 tr '\r' '\n' < Gorkin* > enhancerGene.rep1.txt
 
 # edit out title line
 head -1 enhancerGene.rep1.txt
 # chrom   start   end     ensembl symbol  SCC     Z       p-value (z)     p-value (empirical)
 # chr1    4426300 4428300 ENSMUSG00000025902.9    Sox17   6.16E-01        2.07E+00        0.018999025     0.016378526
 
 # The SCC field will be basis of score.  It ranges from .25 to 1.0
 
 # strip trailing empty lines (from bad XLS export)
+# strip first two lines from enhancerGene file to make map.rep1.txt
 
 #################
 # Download promoters from EPDnew (rec from JK)
 # https://epd.epfl.ch/EPDnew_database.php
 
 wget ftp://ccg.epfl.ch/epdnew/README
 wget ftp://ccg.epfl.ch/epdnew/M_musculus/003/cross_references.txt .
 
 # corrupted file ?  First line:
 
 wget ftp://ccg.epfl.ch/epdnew/M_musculus/current/Mm_EPDnew.bed
 
 # This is version 3, file dated 6/4/18
 
 # NOTE: file contains 1 or more promoters per gene.
@@ -213,88 +214,60 @@
 wc -l Mm_EPDnew.bed
 # 25111
 
 sed 's/_.*900 / 900 /' MM_EPDnew.bed > promoters.temp.bed
 bedtools groupby -g 1,4,5,6 -c 2,3 -o min,max < promoters.temp.bed | \
         awk '{OFS="\t"; print $1, $5, $6, $2, $3, $4}' | \
                 bedSort stdin promoters.bed
 
 wc -l promoters.bed
 # 20549
 # Reduced by 5000
 
 #################
 # Create interact file from interactions and promoters files
 
-ln -s /cluster/home/kate/kent/src/hg/makeDb/outside/encode3/mouse/makeInteract.pl .
-perl makeInteract.pl enhancerGene.rep1.txt MM_EPDnew.bed cross_references.txt >&! errors.txt
-
-# strip first two lines from enhancerGene file to make map.rep1.txt
-#perl makeInteract.pl map.rep1.txt promoters.bed cross_references.txt > enhancerGeneInteract.bed
-#Found 30793 interactions with promoters, 1171 with missing promoters
-
-# NOTE: Missing 374 promoters (will need to get these from GB annotation)
-
-# NOTE: Problem with cross-references file. At least one instance of error 
-# Here the ENSG 65324 should be Eno1, not Eno1b (acc to GENCODE V20)
-ENSMUSG00000059040      Eno1b   NM_001025388    Mus musculus enolase 1B, retrotransposed (Eno1b), mRNA.
-ENSMUSG00000063524      Eno1b   NM_023119       Mus musculus enolase 1B, retrotransposed (Eno1b), mRNA.
-
-# NOTE: Problem with duplicated interactions in enhancerGene file -- 2 ENSG's mapped to two
-# gene names (so interactions appear twice):
-
-chr17   13666000        13671200        ENSMUSG00000038347.10   Tcte2   8.69E-01        1.85E+00        0.03221258      0.002229654
-chr17   13666000        13671200        ENSMUSG00000038347.10   2700054A10Rik   8.69E-01        1.85E+00        0.03221258      0.002229654
-
-chr12     57514900        57516900        ENSMUSG00000046782.10   4921506M07Rik   4.82E-01        1.75E+00        0.039890209     0.04302926
-chr12     57514900        57516900        ENSMUSG00000046782.10   Ttc6    4.82E-01        1.75E+00        0.039890209     0.04302926
-
-
-bedSort enhancerGene.rep1.txt.out encode3EnhancerPromoterInteract.bed
-
-# Biggify interactions
-mkdir gbdb
-set sizes = /hive/data/genomes/mm10/chrom.sizes
-bedToBigBed -type=bed5+13 -as=enhancerPromoterInteract.as encode3EnhancerPromoterInteract.bed \
-                  $sizes gbdb/encode3EnhancerPromoterInteract.bb
-cd /gbdb
-ln -s `pwd`/encode3EnhancerPromoterInteract.bb /gbdb/mm10/bbi
-cd ..
-
-# Biggify promoters file
-bedSort MM_EPDnew.bed.out epdPromoters3.bed
-
-# TODO: add more fields ?
-bedToBigBed -type=bed9 epdPromoters3.bed \
-                $sizes gbdb/epdProomoters3.bb
-cd gbdb
-ln -s `pwd`/epdPromoters3.bb /gbdb/mm10/bbi
-cd ..
-
 # merge replicates, stripping extra columns, adding a column for count.  
 
 # export from spreadsheet.  Trim empty lines.  dos2unix.  tr \m's to \n's.
 wc -l map.*
   31964 map.rep1.txt
   33301 map.rep2.txt
   21142 map.replicated.txt
 
 ln -s /cluster/home/kate/kent/src/hg/makeDb/outside/encode3/mouse/mergeInteractReps.pl mergeReps.pl
 perl mergeReps.pl map.rep1.txt map.rep2.txt | bedSort stdin map.merged.txt
 
+ln -s /cluster/home/kate/kent/src/hg/makeDb/outside/encode3/mouse/makeInteract.pl .
 mkdir out
 perl makeInteract.pl map.merged.txt MM_EPDnew.bed cross_references.txt out/ >&! makeInteract.log
+
+head makeInteract.log
+#Creating output files in directory out//
+#ERROR: Rejecting promoter for Apoo ENSMUSG00000049233
+                                    #on chr13 (chrX|94367074|94367134|+|237,177,32)
+#Found 41941 interactions with promoters, 1649 with missing promoters
+#RESCUES: 295
+#ENSMUSG00000036097 Slf2
+#ENSMUSG00000098306 Kiss1
+#...
+#MISSING PROMOTERS: 328
+#Gpbar1
+#Cacng6
+#Fev
+#...
+
 bedSort out/enhancers.all.bed out/enhancers.all.sorted.bed
 bedSort out/enhancers.rep.bed out/enhancers.rep.sorted.bed
 
 # why needed ???
 bedSort out/interactions.all.bed out/interactions.all.sorted.bed
 bedSort out/interactions.rep.bed out/interactions.rep.sorted.bed
 
 # biggify
 set sizes = /hive/data/genomes/mm10/chrom.sizes
 bedToBigBed -type=bed5+14 -as=enhancerPromoterInteract.as out/interactions.all.sorted.bed \
                   $sizes gbdb/encode3EnhancerPromoterInteractAll.bb
 bedToBigBed -type=bed5+14 -as=enhancerPromoterInteract.as out/interactions.rep.sorted.bed \
                   $sizes gbdb/encode3EnhancerPromoterInteractRep.bb
 
 # TODO: add more fields
@@ -305,15 +278,61 @@
 
 bedToBigBed -type=bed4 out/enhancers.all.sorted.bed \
                 $sizes gbdb/encode3EnhancerAll.bb
 bedToBigBed -type=bed4 out/enhancers.rep.sorted.bed \
                 $sizes gbdb/encode3EnhancerRep.bb
 cd gbdb
 foreach f (epdPromoterAll epdPromoterRep encode3EnhancerAll encode3EnhancerRep encode3EnhancerPromoterInteractAll encode3EnhancerPromoterInteractRep)
     ln -s `pwd`/$f.bb /gbdb/mm10/bbi
 end
 
 cd gbdb
 ln -s `pwd`/epdPromoters3.bb /gbdb/mm10/bbi
 
 # NOTE: Renamed .bb files to prefix w/ encode3Ren/encode3RenInteract
 
+#########################
+# old stuff
+
+perl makeInteract.pl enhancerGene.rep1.txt MM_EPDnew.bed cross_references.txt >&! errors.txt
+
+# strip first two lines from enhancerGene file to make map.rep1.txt
+#perl makeInteract.pl map.rep1.txt promoters.bed cross_references.txt > enhancerGeneInteract.bed
+#Found 30793 interactions with promoters, 1171 with missing promoters
+
+# NOTE: Missing 374 promoters (will need to get these from GB annotation)
+
+# NOTE: Problem with cross-references file. At least one instance of error 
+# Here the ENSG 63524 should be Eno1, not Eno1b (acc to GENCODE V20)
+ENSMUSG00000059040      Eno1b   NM_001025388    Mus musculus enolase 1B, retrotransposed (Eno1b), mRNA.
+ENSMUSG00000063524      Eno1b   NM_023119       Mus musculus enolase 1B, retrotransposed (Eno1b), mRNA.
+
+# NOTE: Problem with duplicated interactions in enhancerGene file -- 2 ENSG's mapped to two
+# gene names (so interactions appear twice):
+
+chr17   13666000        13671200        ENSMUSG00000038347.10   Tcte2   8.69E-01        1.85E+00        0.03221258      0.002229654
+chr17   13666000        13671200        ENSMUSG00000038347.10   2700054A10Rik   8.69E-01        1.85E+00        0.03221258      0.002229654
+
+chr12     57514900        57516900        ENSMUSG00000046782.10   4921506M07Rik   4.82E-01        1.75E+00        0.039890209     0.04302926
+chr12     57514900        57516900        ENSMUSG00000046782.10   Ttc6    4.82E-01        1.75E+00        0.039890209     0.04302926
+
+
+bedSort enhancerGene.rep1.txt.out encode3EnhancerPromoterInteract.bed
+
+# Biggify interactions
+mkdir gbdb
+set sizes = /hive/data/genomes/mm10/chrom.sizes
+bedToBigBed -type=bed5+13 -as=enhancerPromoterInteract.as encode3EnhancerPromoterInteract.bed \
+                  $sizes gbdb/encode3EnhancerPromoterInteract.bb
+cd /gbdb
+ln -s `pwd`/encode3EnhancerPromoterInteract.bb /gbdb/mm10/bbi
+cd ..
+
+# Biggify promoters file
+bedSort MM_EPDnew.bed.out epdPromoters3.bed
+
+# TODO: add more fields ?
+bedToBigBed -type=bed9 epdPromoters3.bed \
+                $sizes gbdb/epdProomoters3.bb
+cd gbdb
+ln -s `pwd`/epdPromoters3.bb /gbdb/mm10/bbi
+cd ..