src/hg/makeDb/doc/hg38/reg.txt 76e480f51fbf36171248a26c9f24c3bd0e95af7f

76e480f51fbf36171248a26c9f24c3bd0e95af7f
kate
  Thu Jun 4 11:38:18 2020 -0700
Save build script and schema for CCRE track. refs #24668

diff --git src/hg/makeDb/doc/hg38/reg.txt src/hg/makeDb/doc/hg38/reg.txt
index 697ad22..75aa882 100644
--- src/hg/makeDb/doc/hg38/reg.txt
+++ src/hg/makeDb/doc/hg38/reg.txt
@@ -1,683 +1,685 @@
 # for emacs: -*- mode: sh; -*-
 
 # Regulation tracks for hg38 / GRCh38
 
 #############################################################################
 # Building UW DNAse I ENCODE2 tracks (In progress 2014-09-03 Jim Kent)
 
 #These tracks contain the results of DNAse I hypersensitivity experiments from the 
 #John Stamatoyannopoulos lab at the University of Washington done for the ENCODE Project
 #phase 2.
 
 #The data was processed according to the July 2014 version of the ENCODE 3 DNAse
 #processing pipeline.  At a high level this means aligning the reads
 #with the bwa program against hg38 with the 'sponge' sequence, removing multiple
 #mapping reads, and reads that aligned to the sponge or mitochondria,  pooling
 #the results for all replicates, and running the hotspot program.  The bigWig output
 #was normalized so that the average value genome-wide is 1.
 
 #The bam files were created by the encode analysis pipeline on each replicate separately
 #and the process for doing this won't be described here.  It is a bit complex, and
 #really will just need to be reworked into something simpler now that we are no
 #longer working directly on that contract.  This build assumes that the relevant bam
 #files are in the /hive/groups/encode/3/eap/cach directory.
 
 # To do the mapping again you'd start with fastq files and use the script
 # eap_run_bwa_se on an index that included the sponge as well as hg38
 # chromosomes (but not alternative haplotypes).   Bwa itself is run in a
 # rather vanilla mode, with no options beyond -t 4 to parallelize the
 # first pass of the alignment in 4 threads.
 # The first section of methods here is to create a hub with peaks, hotspots, and signal
 # from pooled replicates.
 
 #The detailed instructions after the bam files are available are: 
 
 ##In more detail.  First 
 mkdir /hive/data/genomes/hg38/bed/uwDnase1
 
 ## Run program to generate most of parasol batches
 ssh encode-02
 cd /hive/data/genomes/hg38/bed/uwDnase1
 dnaseHg38Batch batchDir
 
 ## Hand-edit to split batchDir into pooled and single replicate versions in directories
 ## run_pooled and run_replicates (sorry for the hand work)
 
 ## Do parasol runs on pooled
 ssh ku
 cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled
 para make
 para time
 #Completed: 95 of 95 jobs
 #CPU time in finished jobs:    2908517s   48475.28m   807.92h   33.66d  0.092 y
 #IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 #Average job time:               27838s     463.96m     7.73h    0.32d
 #Longest finished job:          128043s    2134.05m    35.57h    1.48d
 #Submission to last job:        128747s    2145.78m    35.76h    1.49d
 #Estimated complete:                 0s       0.00m     0.00h    0.00d
 
 ## Do parasol runs on replicates (these are not actually currently used)
 ssh ku
 cd /hive/data/genomes/hg38/bed/uwDnase1/run_replicates
 para make
 para time
 #completed: 189 of 189 jobs
 #CPU time in finished jobs:    4025020s   67083.66m  1118.06h   46.59d  0.128 y
 #IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 #Average job time:               20115s     335.25m     5.59h    0.23d
 #Longest finished job:          110245s    1837.42m    30.62h    1.28d
 #Submission to last job:        111410s    1856.83m    30.95h    1.29d
 #Estimated complete:                 0s       0.00m     0.00h    0.00d
 #Note that one of the experiments only has replicate 2.  It's because both
 #iterations of replicate 1 were deprecated.
 
 ## Augment metadata
 ssh hgwdev
 cd /hive/data/genomes/hg38/bed/uwDnase1
 dnaseHg38AddTreatments batchDir/meta.tab meta.tab
 
 ## Do correlations between all pooled experiments 
 ssh ku
 cd /hive/data/genomes/hg38/bed/uwDnase1
 mkdir run_correlations
 cd run_correlations
 
 # Create little script to make tab separated output out of bigWigCorrelate results
 cat << '_EOF_' > corr2
 #!/bin/tcsh -efx
 echo -n "$1\t$2\t" > $3
 bigWigCorrelate $1 $2 >> $3
 '_EOF_'
   # << happy emacs
 
 # Create gensub2 input
 cat << '_EOF_' > gsub
 #LOOP
 corr2 $(path1) $(path2) out/$(root1)_vs_$(root2)
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
 # Run gensub2 with brand new selfPair method on all pooled files
 ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig > fileList
 gensub2 fileList selfPair gsub jobList
 
 # The parasol run using just 10 CPUs because we are i/o heavy
 para create jobList
 para push -maxJob=10
 para time
 #Completed: 4465 of 4465 jobs
 #CPU time in finished jobs:     349724s    5828.74m    97.15h    4.05d  0.011 y
 #IO & Wait Time:                 58019s     966.98m    16.12h    0.67d  0.002 y
 #Average job time:                  91s       1.52m     0.03h    0.00d
 #Longest finished job:             701s      11.68m     0.19h    0.01d
 #Submission to last job:         47080s     784.67m    13.08h    0.54d
 #Estimated complete:                 0s       0.00m     0.00h    0.00d
 
 # Concatenate results
 cat out/* > ../correlation.tab
 
 # Set up inputs for clustering run to choose colors and make tree
 cd /hive/data/genomes/hg38/bed/uwDnase1
 ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig > pooled.lst
 grep -v '^#' meta.tab | cut -f 6 > foo
 paste pooled.lst foo > pooled.labels
 
 # Run clustering program, which takes about 20 hours
 mkdir /hive/data/genomes/hg38/bed/uwDnase1/calcGraph
 cd /hive/data/genomes/hg38/bed/uwDnase1/calcGraph
 mkdir -p /scratch/kent/tmpDir
 bigWigCluster ../pooled.lst /hive/data/genomes/hg38/chrom.sizes uwDnase1.json uwDnase1.tab -precalc=../correlation.tab -threads=10 -tmpDir=/scratch/kent/tmpDir -labels=../pooled.labels
 
 ## Make normalized versions of wigs (might be able to incorporate this into
 # the pooled job maker in the future)
 ssh ku
 cd /hive/data/genomes/hg38/bed/uwDnase1
 mkdir run_normalized
 ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig | \
 	sed 's/.pooled.bigWig//' > run_normalized/fileList
 cd run_normalized
 mkdir out
 
 # Make normalization script
 cat << '_EOF_' > norm1
 #!/bin/tcsh -efx
 set m = `bigWigInfo $1 | awk '/mean/ {print 1.0/$2}'`
 bigWigToBedGraph $1 stdout | colTransform 4 stdin 0 $m tmp.bedGraph
 bedGraphToBigWig tmp.bedGraph /hive/data/genomes/hg38/chrom.sizes tmp.bw
 rm tmp.bedGraph
 mv tmp.bw $2
 '_EOF_'
   # << happy emacs
  
 # Create gensub2 input
 cat << '_EOF_' > gsub
 #LOOP
 edwCdJob /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/norm1 $(path1).pooled.bigWig /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out/$(root1).norm.bw
 #ENDLOOP
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
 # Do parasol run
 gensub2 fileList single gsub jobList
 para make jobList -maxJob=20
 para time
 #Completed: 95 of 95 jobs
 #CPU time in finished jobs:      20273s     337.88m     5.63h    0.23d  0.001 y
 #IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 #Average job time:                 189s       3.15m     0.05h    0.00d
 #Longest finished job:             364s       6.07m     0.10h    0.00d
 #Submission to last job:          2006s      33.43m     0.56h    0.02d
 #Estimated complete:                 0s       0.00m     0.00h    0.00d
 
 
 # Link results into pooled directory
 ln -s /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out/*.bw /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/
 sed 's/pooled.bigWig/norm.bw/' < calcGraph/uwDnase1.tab > colors.tab
 
 # Run program to generate trackDb file.  The source is in /hg/makeDb/outside/uwDnaseTrackHub
 cd /hive/data/genomes/hg38/bed/uwDnase1
 uwDnaseTrackHub meta.tab run_pooled colors.tab hub
 
 
 ##########################################################
 # Create DNase tracks from the hub files (In progress 2014-12-09 Kate)
 
 # There are 3 tracks (Redmine #14353):
 #       1) Composite with peaks, hotspots, and signal (as on hub)
 #       2) Multiwig of signal, colored by similarity (as on hub)
 #       3) Clusters (as on hg19)
 
 # The hub data file dir is:  /hive/data/genomes/hg38/bed/uwDnase1/run_pooled
 # (and normalized signal files are linked into that dir from:
 # /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out)
 # The hub trackDb file is: /hive/data/genomes/hg38/bed/uwDnase1/hub/hg38/trackDb.txt
 
 #################################
 # DNAse peaks, hotspots, and signal track, and the multiwig
 
 # Add scores to bigBeds (use signalValue)
 
     cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled
     mkdir -p scored/in scored/out
     foreach f (*.narrowPeak *.broadPeak)
         bigBedToBed $f scored/in/$f
     end
     cd scored/in
     bedScore -uniform -method=reg -col=7 *.narrowPeak ../out >&! score.out
     bedScore -uniform -method=reg -col=7 *.broadPeak ../out >>&! score.out &
 
     cd ../out
     foreach f (*.broadPeak)
         echo $f
         bedToBigBed -type=bed6+3 -as=$HOME/kent/src/hg/lib/encode/broadPeak.as \
                $f /hive/data/genomes/hg38/chrom.sizes ../$f.bb
     end
 
 # Link data files into gbdb and create bbi tables
     mkdir /hive/data/gbdb/hg38/bbi/uwDnase
     cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/scored
     set bbi = /gbdb/hg38/bbi/uwDnase/scored
     mkdir $bbi
 
     foreach f (*.broadPeak.bb)
         echo $f
         ln -s `pwd`/$f $bbi
         set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
         hgBbiDbLink hg38 uwEnc2DnaseHot${exp} $bbi/$f
     end
 
     foreach f (*.narrowPeak.bb)
         echo $f
         ln -s `pwd`/$f $bbi
         set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
         hgBbiDbLink hg38 uwEnc2DnasePeaks${exp} $bbi/$f
     end
 
     cd /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out
     foreach f (*.bw)
         ln -s `pwd`/$f $bbi/$f
         set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
         hgBbiDbLink hg38 uwEnc2DnaseSignal${exp} $bbi/$f
         hgBbiDbLink hg38 uwEnc2DnaseWig${exp} $bbi/$f
     end
 
 # Load peaks into database (needed by hgc. we may be able to drop these with code changes)
     cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/scored/out
     foreach f (*.narrowPeak)
         echo $f
         set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'`
         #hgLoadBed -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 uwEnc2DnaseBedPeaks${exp} $f
         hgLoadBed -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 uwEnc2DnaseBedPeaks${exp} $f
     end
 
 # Use cell curation to make more informative long labels
     cd /hive/data/genomes/hg38/bed/uwDnase1
     hgsql hg38 -Ne 'select * from wgEncodeCell' > cells/cellInfo.tab
     uwDnaseTrackHub -cellFile=cells/cellInfo.tab meta.tab run_pooled colors.tab kateHub4
 
 # Convert trackDb from hub to native
     cd /hive/data/genomes/hg38/bed/uwDnase1/
     mkdir tracks
     cd tracks
     cp ../kateHub4/hg38/trackDb.txt .
     sed -e 's/type bigBed/type bigBed 6 +/' -e '/bigDataUrl/d' trackDb.txt > trackDb.ra
     cp trackDb.ra ~/kent/src/hg/makeDb/trackDb/human/hg38/uwDnase.ra
 
 # NOTE: hotspot5 had a bug causing it to silently abort peak identification on some datasets 
 # before completing all chromosomes.  I fixed this issue, and created hotspot5.1 version, which
 # was used to remake the broad and narrow peaks.  The corrected bigBeds are dated Feb. 13-19.
 #
 # The hotspot tool is hosted on hive at:  /hive/groups/encode/encode3/tools/hotspot-distr
 # The version is v4 (though the tool prints hotspot5 as its name)
 # The fix is:
 #
 # diff  hotspot-deploy-made/src/InputDataReader.cpp  hotspot-deploy-fixed/src/InputDataReader.cpp
 # 74a75
 # >                         numLines++;
 # I ran this by Bob Thurman who currently maintains the package, and he thought it reasonable -- said
 # he would test but we've not heard back.  I will post it to GitHub as an issue sometime.
 
 # Scored files are:
 #       /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/scored/*broadPeak.bb
 #       /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/scored/out/*.narrowPeak
 
 #################################
 # DNase clusters track (Kate Feb 2015)
 
     cd /hive/data/genomes/hg38/bed/uwDnase1
     mkdir clusters
     # cd run_pooled/scored/out
     cd run_pooled_hotspot5.1/scored/out
     ls *.pooled.narrowPeak > ../../../clusters/peak.lst
 
     # calculate normalization factor
     regClusterMakeTableOfTables -verbose=3 eapDnase01Hg38 \
         ../../../clusters/peak.lst ../../../clusters/peak.table >&! ../../../clusters/regTable.out &
 
     # cluster
     regCluster -bedSources ../../../clusters/peak.table /dev/null ../../../clusters/peak.bed \
         >&! ../../../clusters/regCluster.out &
     # 2011652 singly-linked clusters, 2076756 clusters in 96 chromosomes
     # NOTE: more clusters (2.2M) in hg19 (which included Duke data)
 
     # filter out low scoring
     cd ../../../clusters
     awk '$5 >= 100' peak.bed > peak.filtered.bed
     wc -l peak.filtered.bed
     # 1330766 peak.filtered.bed
     # retained 66% vs 83% in hg19 (seems low ??)
 
     # --> keep them all for now, filter with UI
 
     # format to BED5+floatscore+sources for hgBedSources 
     #   which will extract, uniquify, and assign ID's to sources
     awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, 0, $7;}' peak.bed > peak.bed6
     hgBedSources peak.bed6 regDnase
     mv regDnaseSources.tab uwEnc2DnaseSources.tab
 
     # hand edit to fix curation --  add RA treatments where needed
     # NHBE_RA -> NHBE_RA+RA
     # SK-N-SH_RA -> SK-N-SH_RA+RA
 
     # load sources table
     autoSql $HOME/kent/src/hg/lib/idName.as idName
     hgLoadSqlTab hg38 uwEnc2DnaseSources idName.sql uwEnc2DnaseSources.tab
 
     # merge files and format to BED5+sourceCount+sourceIds+sourceVals
     awk '{print $8}' peak.bed > peak.vals
     awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, $7, $8;}' regDnase.bed | \
         paste - peak.vals > uwEnc2DnaseClustered.bed
     hgLoadBed hg38 uwEnc2DnaseClustered -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \
         -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as uwEnc2DnaseClustered.bed
 
     # create inputs file to display metadata on details page
     # NOTE: this can probably be jettisoned in favor of new code, since source info
     # is now in the BED file
 cat > makeInputs.csh << 'EOF'
     set tables = `hgsql hg38 -Ne "show tables like 'uwEnc2DnasePeaks%'"`
     foreach t ($tables)
         set exp = `echo $t | sed 's/uwEnc2DnasePeaksWgEncode/wgEncode/'`
         set t = `echo $t | sed 's/Peaks/BedPeaks/'`
         set cell = `encodeExp show $exp cellType`
         set treatment = `encodeExp show $exp treatment`
         echo "$t\t$cell\t$treatment"
     end
 'EOF'
     csh makeInputs.csh > inputs.tab
     hgLoadSqlTab hg38 uwEnc2DnaseInputs ~/kent/src/hg/lib/clusterInputEapDnase.sql inputs.tab
 
     # try bigBed version
     sed 's/BedPeaks/Peaks/' inputs.tab > bigInputs.tab
     hgsql hg38 -e 'alter table uwEnc2DnaseInputs rename to uwEnc2DnaseInputs_old'
     hgLoadSqlTab hg38 uwEnc2DnaseInputs ~/kent/src/hg/lib/clusterInputEapDnase.sql bigInputs.tab
 
     #Hmm, hgc peakClusters doesn't appear to work with bigBed peak files...
     # Revert trackDb to BED peak files
 
 
 #################
 # Load up data.  Clean up peaks & hotspots a little first (truncate signal, -log10 of pVal) 
 # and then rename all -- replacing wgEncodeEH* with cell and treatment, to help users.  
 # Change prefix to match hg19
 # (kate 2015-06-17)
 
     cd /hive/data/genomes/hg38/bed/uwDnase1
     mkdir load
     cd load
 cat > expToFile.csh << 'EOF'
 #!/bin/csh -ef
     set exp = `echo $1 | sed 's/wgEncodeEH\([0-9]*\).*/\1/'`
     set cell = `encodeExp show $exp cellType | sed -e 's/[-+_]//g' -e 's/\(.\)\(.*\)/\U\1\L\2/'`
     set treatment = `encodeExp show $exp treatment | sed -e 's/[-+_]//g' -e 's/\(.\)\(.*\)/\U\1\L\2/'`
     echo ${cell}${treatment}
 'EOF'
 
 cat > renameWig.csh << 'EOF'
 #!/bin/csh -ef
 set bbi = /gbdb/hg38/bbi/wgEncodeRegDnase
 mkdir -p $bbi
 set path = (`pwd` $path)
 set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase
 set data = run_normalized/out
 set wd = `pwd`
 pushd $build/$data
 rm $wd/edit.csh
 foreach f (*.norm.bw)
     echo "${f} "
     set exp = $f:r:r
     set vars = `expToFile.csh $exp`
     set t = wgEncodeRegDnaseUw${vars}Signal
     echo "-e s/uwEnc2DnaseSignal${exp}/$t/ \" >> $wd/edit.csh
     ln -s $build/$data/$f $bbi/$t.bw
     hgBbiDbLink hg38 $t $bbi/$t.bw
 end
 popd
 'EOF'
 
 cat > loadPeak.csh << 'EOF'
 #!/bin/csh -ef
 set path = (`pwd` $path)
 set wd = `pwd`
 set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase
 set data = run_pooled/scored/out
 pushd $build/$data
 foreach f (*.narrowPeak)
     set exp = $f:r:r
     set vars = `expToFile.csh $exp`
     set t = wgEncodeRegDnaseUw${vars}Peak
     echo "${f}   ${t}"
     # truncate signal value to int and transform pValue to -log10
     # -log10 pValue
     awk 'BEGIN {OFS="\t"} {{if ($8==0) $8=4.94e-324; {$7=int($7); $8=-log($8)/log(10);print}}}' $f | \
         hgLoadBed  -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql \
         -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 $t stdin
 end
 'EOF'
 
 cat > loadHot.csh << 'EOF'
 #!/bin/csh -ef
 set bbi = /gbdb/hg38/bbi/wgEncodeRegDnase
 mkdir -p $bbi
 set path = (`pwd` $path)
 set wd = `pwd`
 set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase
 set data = run_pooled/scored/out
 pushd $build/$data
 foreach f (*.broadPeak)
     set exp = $f:r:r
     set vars = `expToFile.csh $exp`
     set t = wgEncodeRegDnaseUw${vars}Hotspot
     echo "${f}   ${t}"
     #echo "-e s/uwEnc2DnaseHot${exp}/$t/ \\" >> $wd/edit2.csh
     awk 'BEGIN {OFS="\t"} {{if ($8==0) $8=4.94e-324; {$7=int($7); $8=-log($8)/log(10);print}}}' $f > bed.tmp
     bedToBigBed -type=bed6+3 -as=${HOME}/kent/src/hg/lib/encode/broadPeak.as \
                 bed.tmp /hive/data/genomes/hg38/chrom.sizes ../$f.bb
     #ln -s $build/$data/$f $bbi/$t.broadPeak.bb
     #hgBbiDbLink hg38 $t $bbi/$t.broadPeak.bb
 end
 'EOF'
 
 # Manually tweak edit.csh and run to replace table and track names in trackDb.ra
 # Do same for broadPeak bigBeds, and narrowPeak tables
 
 csh edit5.csh ../clusters/bigInputs.tab > ../clusters/wgEncodeRegDnaseClusteredInputs.tab
 cd ../clusters
 hgLoadSqlTab hg38 wgEncodeRegDnaseClusteredInputs \
         ~/kent/src/hg/lib/clusterInputEapDnase.sql wgEncodeRegDnaseClusteredInputs.tab
 
 hgsql hg38 -e "alter table uwEnc2DnaseClustered rename to wgEncodeRegDnaseClustered"
 hgsql hg38 -e "alter table uwEnc2DnaseSources rename to wgEncodeRegDnaseClusteredSources"
 
 
 #################
 # Cell table (use for cell metadata, instead of metaDb)
 
     cd /hive/data/genomes/hg38/bed/uwDnase1
     mkdir cells
     cd cells
 
     # collect cell info from ENCODE2 and ENCODE3
     ~/kent/src/hg/encode3/cellsFromEncode3.py > cells.tsv
 
     # load to google spreadsheet and clean 
     #https://docs.google.com/a/soe.ucsc.edu/spreadsheets/d/10EWdr-JTtDvfLKKLPvP3T2ft6MZ5KVdKbBzY76SRaug/edit#gid=1783710206
 
     # extract useful columns to file and load
     tail -n +2 wgEncodeCell.tab | \
         nl -v 0 | \
         hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql stdin
 
     # add order URL's
     #https://docs.google.com/spreadsheets/d/14HvZfqJdClt6mfcwf2w0xRPvdhk5o7bgV77LMc1qTcU/edit?usp=sharing
     tail -n +2 wgEncodeCellUrl.tsv | \
         nl -v 0 | \
         hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql stdin
 
     # to verify links
     checkUrlsInTable hg38 wgEncodeCell > errs.txt
 
 # Update table when cron of above reports errors
 
 cat > badUrls.txt << 'EOF'
 http://www.sciencellonline.com/site/productInformation.php?keyword=1830 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=1820 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=7110 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=1810 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=1000 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=1100 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=6300 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=6320 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=200  302
 http://www.sciencellonline.com/site/productInformation.php?keyword=1310 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=6570 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=2720 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=2620 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=6560 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=7630 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=6580 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=3120 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=3300 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=4000 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=6540 302
 http://www.sciencellonline.com/site/productInformation.php?keyword=7130 302
 'EOF'
 
 # NOTE: site query string has changed to:
 #       http://sciencellonline.com/catalogsearch/result/?q=1820
     hgsql hg38 -e 'select * from wgEncodeCell' > wgEncodeCell.2015-05-06.tab
     sed -e 's^www.sciencellonline.com/site/productInformation.php^sciencellonline.com/catalogsearch/result/^' -e 's^keyword=^q=^' wgEncodeCell.2015-05-06.tab > wgEncodeCell.2015-05-07.tab
     ln -s wgEncodeCell.2015-05-07.tab wgEncodeCell.latest.tab
     # save old table for now
     hgsql hg38 -e "alter table wgEncodeCell rename to wgEncodeCell_old"
     hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql wgEncodeCell.latest.tab
     # check, then remove old table
     hgsql hg38 -e "drop table wgEncodeCell_old"
     checkUrlsInTable hg38 wgEncodeCell > errs.txt
 
 # Nov 2015 -- sciencell links are reported as permanently moved (HTTP status code 301)
 # Time to fix them!  21 total
 # navigate to links and grab correct URL
 # These must be fixed in the wgEncodeCell table (for hg38) and also in cv.ra (for hg19 tracks)
 #  (2015-11 kate)
 
 cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/cells
 checkUrlsInTable hg38 wgEncodeCell > wgEncodeCell.errs
 
 # hand edited below
 http://sciencellonline.com/catalogsearch/result/?q=1830 301
         http://www.sciencellonline.com/human-astrocytes-hippocampal.html HA-h
 http://sciencellonline.com/catalogsearch/result/?q=1820 301
         http://www.sciencellonline.com/human-astrocytes-spinal-cord.html HA-sp
 http://sciencellonline.com/catalogsearch/result/?q=7110 301
         http://www.sciencellonline.com/haepic.html HAEpiC
 http://sciencellonline.com/catalogsearch/result/?q=1810 301
         http://www.sciencellonline.com/human-astrocytes-cerebellar.html HA-c
 http://sciencellonline.com/catalogsearch/result/?q=1000 301
         http://www.sciencellonline.com/human-brain-microvascular-endothelial-cells.html HBMEC
 http://sciencellonline.com/catalogsearch/result/?q=1100 301
         http://www.sciencellonline.com/human-brain-vascular-smooth-muscle-cells.html HBVSMC
 http://sciencellonline.com/catalogsearch/result/?q=6300 301
         http://www.sciencellonline.com/human-cardiac-fibroblasts.html HCF
 http://sciencellonline.com/catalogsearch/result/?q=6320 301
         http://www.sciencellonline.com/human-cardiac-fibroblasts-adult-atrial.html HCF-aa
 http://sciencellonline.com/catalogsearch/result/?q=200  301
         http://www.sciencellonline.com/human-cardiac-myocytes.html HCM
 http://sciencellonline.com/catalogsearch/result/?q=1310 301
         http://www.sciencellonline.com/human-choroid-plexus-epithelial-cells.html HCPEpiC
 http://sciencellonline.com/catalogsearch/result/?q=6570 301
         http://www.sciencellonline.com/hconf.html HConF
 http://sciencellonline.com/catalogsearch/result/?q=2720 301
         http://www.sciencellonline.com/heepic.html HEEpiC
 http://sciencellonline.com/catalogsearch/result/?q=2620 301
         http://www.sciencellonline.com/human-gingival-fibroblasts.html HGnF
 http://sciencellonline.com/catalogsearch/result/?q=6560 301
         http://www.sciencellonline.com/hipepic.html HIPEpiC
 http://sciencellonline.com/catalogsearch/result/?q=7630 301
         http://www.sciencellonline.com/hmf.html HMF
 http://sciencellonline.com/catalogsearch/result/?q=6580 301
         http://www.sciencellonline.com/hnpcepic.html HNPCEpiC
 http://sciencellonline.com/catalogsearch/result/?q=3120 301
         http://www.sciencellonline.com/hpaaf.html HPAAF
 http://sciencellonline.com/catalogsearch/result/?q=3300 301
         http://www.sciencellonline.com/hpf.html HPF
 http://sciencellonline.com/catalogsearch/result/?q=4000 301
         http://www.sciencellonline.com/human-renal-glomerular-endothelial-cells.html HRGEC
 http://sciencellonline.com/catalogsearch/result/?q=6540 301
         http://www.sciencellonline.com/hrpepic.html HRPEpiC
 http://sciencellonline.com/catalogsearch/result/?q=7130 301
         http://www.sciencellonline.com/hvmf.html HVMF
 ###
 # hand-munge above into a shell script to edit .tab file
 # sed -e 's^bad-url^good-url^'
 hgsql hg38 -Ne 'select * from wgEncodeCell' > wgEncodeCell.2015-11-11.tab
 csh fixCells.csh wgEncodeCell.2015-11-11.tab > wgEncodeCell.2015-11-11.fixed.tab
 hgsql hg38 -e "alter table wgEncodeCell rename to wgEncodeCell_old"
 hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql wgEncodeCell.2015-11-11.fixed.tab
 # check, then remove old table
 checkUrlsInTable hg38 wgEncodeCell > errs.txt
 hgsql hg38 -e "drop table wgEncodeCell_old"
 
 
 
 
 
 # Treatment table
 # TBD
 
 # create term/description tab sep file (currently just treatments in UW DNase)
 #cd /hive/data/genomes/hg38/bed/uwDnase1
 #cd cells
 #tail -n +2 treatments.tab | \
     #nl -v 0 | \
     #hgLoadSqlTab hg38 wgEncodeTreatment ~/kent/src/hg/lib/encode/wgEncodeTreatment.sql stdin
 
 
 # Comparison to lifted track (Chris Eisenhart)
 # Visually, new track appears noisy. This is not unexpected as it excludes Duke data.
 # There are more elements in new track: 2076756 vs 1867194
 
 # Coverage is similar:
 
 featureBits hg38 wgEncodeRegDnaseClustered
 # 451551920 bases of 3049335806 (14.808%) in intersection
 
 featureBits hg38 wgEncodeRegDnaseClusteredLifted
 # 477271764 bases of 3049335806 (15.652%) in intersection
 
 # Comparing chr1:
 
 featureBits hg38 -chrom=chr1 wgEncodeRegDnaseClustered
 #41505780 bases of 230486321 (18.008%) in intersection
 featureBits hg38 -chrom=chr1 wgEncodeRegDnaseClusteredLifted
 #46543116 bases of 230486321 (20.193%) in intersection
 
 # Greater number of elements must be due to mappings on new alt chroms
 # (96 chroms in new track, 38 in old)
 
 
 ##############################################################################
 # wgEncodeReg ENCODE Regulatory tracks (Done Chris Eisenhart)
 # Transcription, Layered H3K4Me1, Layered H3K4Me3, Layered H3K27Ac
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k27ac
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k4me1
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k4me3
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTxn
     mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg
     liftManyBigWigs /cluster/home/ceisenhart/kent/src/hg/utils/liftList/bigWigList.ra
 
 ##############################################################################
 # ENCODE Registry of Candidate cis-Regulatory Elements
 #
 # 2020-04-14  kate
 #
 # From ENCODE 3 Data Analysis Center at U Mass Med Center (Zlab)
 # Data contacts:  Henry Pratt, Jill Moore, Zhiping Weng PI
 #
 # RM #24668
 #
 # Download BED file (hosted on their integrative hub)
 
 cd /hive/data/outside/encode3/ccre
 wget http://gcp.wenglab.org/hubs/integrative1/data/GRCh38/CTA/GRCh38-ccREs.bigBed
 
 # Later Jill asked to add scores, download that
 
 wget -nd https://users.wenglab.org/moorej3/Human-maxZ-DNase.txt.gz
 gunzip Human-maxZ-DNase.txt.gz
 
 # check score distribution
 textHistogram -real -col=2 Human-maxZ-DNase.txt
 1.000000 ***************** 102913
 2.000000 ************************************************************ 362463
 3.000000 ************************************************* 294351
 4.000000 ********************* 128201
 5.000000 ***** 29345
 6.000000 * 6991
 7.000000  2096
 8.000000  168
 9.000000  5
 10.000000  2
 
 # noting that order of accessions in score file doesn't match bed file ;-(
 sort Human-maxZ-DNase.txt > Human-maxZ-DNase.sorted.txt
 sort -k 4 GRCh38-ccREs.bed > GRCh38-ccREs.sorted.bed
 
 paste GRCh38-ccREs.sorted.bed Human-maxZ-DNase.sorted.txt > ccres.prescored.bed
 # manually sanity check start and end of file to see that accessions match
 
 # score using zscore, min(zscore*100, 1000), and reformat
 awk '{OFS="\t"; print $1, $2, $3, $4, ($13>10)? 1000 : int($13 * 100), $6, $7, $8, $9, $10, $13}' \
         ccres.prescored.bed | bedSort stdin ccres.scored.bed
 
 # Reformat to add fields for filtering and mouseover, etc.
 set f = encodeCcreCombined
-perl makeCcreCombined.pl < ccres.scored.bed > $f.bed
-bedToBigBed -tab -type=bed9+6 -as=$f.as $f.bed /hive/data/genomes/hg38/chrom.sizes $f.bb
+set bin = ~/kent/src/hg/makeDb/outside/encode3/ccre
+perl $bin/makeCcreCombined.pl < ccres.scored.bed > $f.bed
+set lib = ~/kent/src/hg/lib
+bedToBigBed -tab -type=bed9+6 -as=$lib/$f.as $f.bed /hive/data/genomes/hg38/chrom.sizes $f.bb
 ln -s `pwd`/$f.bb /gbdb/hg38/encode3/ccre/
 
 ###############################