76e480f51fbf36171248a26c9f24c3bd0e95af7f kate Thu Jun 4 11:38:18 2020 -0700 Save build script and schema for CCRE track. refs #24668 diff --git src/hg/makeDb/doc/hg38/reg.txt src/hg/makeDb/doc/hg38/reg.txt index 697ad22..75aa882 100644 --- src/hg/makeDb/doc/hg38/reg.txt +++ src/hg/makeDb/doc/hg38/reg.txt @@ -1,683 +1,685 @@ # for emacs: -*- mode: sh; -*- # Regulation tracks for hg38 / GRCh38 ############################################################################# # Building UW DNAse I ENCODE2 tracks (In progress 2014-09-03 Jim Kent) #These tracks contain the results of DNAse I hypersensitivity experiments from the #John Stamatoyannopoulos lab at the University of Washington done for the ENCODE Project #phase 2. #The data was processed according to the July 2014 version of the ENCODE 3 DNAse #processing pipeline. At a high level this means aligning the reads #with the bwa program against hg38 with the 'sponge' sequence, removing multiple #mapping reads, and reads that aligned to the sponge or mitochondria, pooling #the results for all replicates, and running the hotspot program. The bigWig output #was normalized so that the average value genome-wide is 1. #The bam files were created by the encode analysis pipeline on each replicate separately #and the process for doing this won't be described here. It is a bit complex, and #really will just need to be reworked into something simpler now that we are no longer #working directly on that contract. This build assumes that the relevant bam #files are in the /hive/groups/encode/3/eap/cach directory. # To do the mapping again you'd start with fastq files and use the script # eap_run_bwa_se on an index that included the sponge as well as hg38 # chromosomes (but not alternative haplotypes). Bwa itself is run in a # rather vanilla mode, with no options beyond -t 4 to parallelize the # first pass of the alignment in 4 threads. 
# The first section of methods here are to create a hub with peaks, hotspots, and signal # from pooled replicates. #The detailed instructions after the bam files are available are: ##In more detail. First mkdir /hive/data/genomes/hg38/bed/uwDnase1 ## Run program to generate most of parasol batches ssh encode-02 cd /hive/data/genomes/hg38/bed/uwDnase1 dnaseHg38Batch batchDir ## By hand edit split batchDir into pooled and single replicate versions in directories ## run_pooled and run_replicates (sorry for the hand work) ## Do parasol runs on pooled ssh ku cd cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled para make para time #Completed: 95 of 95 jobs #CPU time in finished jobs: 2908517s 48475.28m 807.92h 33.66d 0.092 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 27838s 463.96m 7.73h 0.32d #Longest finished job: 128043s 2134.05m 35.57h 1.48d #Submission to last job: 128747s 2145.78m 35.76h 1.49d #Estimated complete: 0s 0.00m 0.00h 0.00d ## Do parasol runs on replicates (these are not actually currently used) ssh ku cd /hive/data/genomes/hg38/bed/uwDnase1/run_replicates para make para time #completed: 189 of 189 jobs #CPU time in finished jobs: 4025020s 67083.66m 1118.06h 46.59d 0.128 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 20115s 335.25m 5.59h 0.23d #Longest finished job: 110245s 1837.42m 30.62h 1.28d #Submission to last job: 111410s 1856.83m 30.95h 1.29d #Estimated complete: 0s 0.00m 0.00h 0.00d #Note that one of the experiments only has replicate 2. It's because both #iterations of replicate 1 were deprecated. 
## Augment metadata ssh hgwdev cd /hive/data/genomes/hg38/bed/uwDnase1 dnaseHg38AddTreatments batchDir/meta.tab meta.tab ## Do correlations between all pooled experiments ssh ku cd /hive/data/genomes/hg38/bed/uwDnase1 mkdir run_correlations cd run_correlations # Create little script to make tab separated output out of bigWigCorrelate results cat << '_EOF_' > corr2 #!/bin/tcsh -efx echo -n "$1\t$2\t" > $3 bigWigCorrelate $1 $2 >> $3 '_EOF_' # << happy emacs # Create gensub2 input cat << '_EOF_' > gsub #LOOP corr2 $(path1) $(path2) out/$(root1)_vs_$(root2) #ENDLOOP '_EOF_' # << happy emacs # Run gensub2 with brand new selfPair method on all pooled files ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig > fileList gensub2 fileList selfPair gsub jobList # The parasol run using just 10 CPUs because we are i/o heavy para create jobList para push -maxJob=10 para time #Completed: 4465 of 4465 jobs #CPU time in finished jobs: 349724s 5828.74m 97.15h 4.05d 0.011 y #IO & Wait Time: 58019s 966.98m 16.12h 0.67d 0.002 y #Average job time: 91s 1.52m 0.03h 0.00d #Longest finished job: 701s 11.68m 0.19h 0.01d #Submission to last job: 47080s 784.67m 13.08h 0.54d #Estimated complete: 0s 0.00m 0.00h 0.00d # Concatenate results cat out/* > ../correlation.tab # Set up inputs for clustering run to choose colors and make tree cd /hive/data/genomes/hg38/bed/uwDnase1 ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig > ../pooled.lst grep -v '^#' meta.tab | cut -f 6 > foo paste pooled.lst foo > pooled.labels # Run clustering program, which takes about 20 hours mkdir /hive/data/genomes/hg38/bed/uwDnase1/calcGraph cd /hive/data/genomes/hg38/bed/uwDnase1/calcGraph mkdir -p /scratch/kent/tmpDir bigWigCluster ../pooled.lst /hive data/genomes/hg38/chrom.sizes uwDnase1.json uwDnase1.tab -precalc=../correlation.tab -threads=10 -tmpDir=/scratch/kent/tmpDir -labels=../pooled.labels ## Make normalized versions of wigs (Might be able to incorporate this into # the pooled job 
maker in the future ssh ku cd /hive/data/genomes/hg38/bed/uwDnase1 mkdir run_normalized ls -1 /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/*.bigWig | \ sed 's/.pooled.bigwig//' > run_normalized/fileList cd run_normalized mkdir out # Make normalization script cat << '_EOF_' > norm1 #!/bin/tcsh -efx set m = `bigWigInfo $1 | awk '/mean/ {print 1.0/$2}'` bigWigToBedGraph $1 stdout | colTransform 4 stdin 0 $m tmp.bedGraph bedGraphToBigWig tmp.bedGraph /hive/data/genomes/hg38/chrom.sizes tmp.bw rm tmp.bedGraph mv tmp.bw $2 '_EOF_' # << happy emacs # Create gensub2 input cat << '_EOF_' > gsub #LOOP edwCdJob /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/norm1 $(path1).pooled.bigWig /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out/$(root1).norm.bw #ENDLOOP #ENDLOOP '_EOF_' # << happy emacs # Do parasol run gensub2 fileList single gsub jobList para make jobList -maxJob=20 para time #Completed: 95 of 95 jobs #CPU time in finished jobs: 20273s 337.88m 5.63h 0.23d 0.001 y #IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y #Average job time: 189s 3.15m 0.05h 0.00d #Longest finished job: 364s 6.07m 0.10h 0.00d #Submission to last job: 2006s 33.43m 0.56h 0.02d #Estimated complete: 0s 0.00m 0.00h 0.00d # Link results into pooled directory ln -s /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out/*.bw /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/ sed 's/pooled.bigWig/norm.bw/' < calcGraph/uwDnase1.tab > colors.tab # Run program to generate trackDb file. 
The source is in /hg/makeDb/outside/uwDnaseTrackHub cd /hive/data/genomes/hg38/bed/uwDnase1 uwDnaseTrackHub meta.tab run_pooled colors.tab hub ########################################################## # Create DNase tracks from the hub files (In progress 2014-12-09 Kate) # There are 3 tracks (Redmine #14353): # 1) Composite with peaks, hotspots, and signal (as on hub) # 2) Multiwig of signal, colored by similarity (as on hub) # 3) Clusters (as on hg19) # The hub data file dir is: /hive/data/genomes/hg38/bed/uwDnase1/run_pooled # (and normalized signal files are linked into that dir from : # /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out # The hub trackDb file is: /hive/data/genomes/hg38/bed/uwDnase1/hub/hg38/trackDb.txt ################################# # DNAse peaks, hotspots, and signal track, and the multiwig # Add scores to bigBeds (use signalValue) cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled mkdir -p scored/in scored/out foreach f (*.narrowPeak *.broadPeak) bigBedToBed $f scored/in/$f end cd scored/in bedScore -uniform -method=reg -col=7 *.narrowPeak ../out >&! score.out bedScore -uniform -method=reg -col=7 *.broadPeak ../out >>&! 
score.out & cd ../out foreach f (*.broadPeak) echo $f bedToBigBed -type=bed6+3 -as=$HOME/kent/src/hg/lib/encode/broadPeak.as \ $f /hive/data/genomes/hg38/chrom.sizes ../$f.bb end # Link data files into gbdb and create bbi tables mkdir /hive/data/gbdb/hg38/bbi/uwDnase cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/scored set bbi = /gbdb/hg38/bbi/uwDnase/scored mkdir $bbi foreach f (*.broadPeak.bb) echo $f ln -s `pwd`/$f $bbi set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'` hgBbiDbLink hg38 uwEnc2DnaseHot${exp} $bbi/$f end foreach f (*.narrowPeak.bb) echo $f ln -s `pwd`/$f $bbi set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'` hgBbiDbLink hg38 uwEnc2DnasePeaks${exp} $bbi/$f end cd /hive/data/genomes/hg38/bed/uwDnase1/run_normalized/out foreach f (*.bw) ln -s `pwd`/$f $bbi/$f set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'` hgBbiDbLink hg38 uwEnc2DnaseSignal${exp} $bbi/$f hgBbiDbLink hg38 uwEnc2DnaseWig${exp} $bbi/$f end # Load peaks into database (needed by hgc. 
we may be able to drop these with code changes) cd /hive/data/genomes/hg38/bed/uwDnase1/run_pooled/scored/out foreach f (*.narrowPeak) echo $f set exp = `echo $f | sed 's/wgEncodeEH\([0-9]*\).*/WgEncodeEH\1/'` #hgLoadBed -fillInScore=signalValue -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 uwEnc2DnaseBedPeaks${exp} $f hgLoadBed -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 uwEnc2DnaseBedPeaks${exp} $f end # Use cell curation to make more informative long labels cd /hive/data/genomes/hg38/bed/uwDnase1 hgsql hg38 -Ne 'select * from wgEncodeCell' > cells/cellInfo.tab uwDnaseTrackHub -cellFile=cells/cellInfo.tab meta.tab run_pooled colors.tab kateHub4 # Convert trackDb from hub to native cd /hive/data/genomes/hg38/bed/uwDnase1/ mkdir tracks cd tracks cp ../kateHub4/hg38/trackDb.txt . sed -e 's/type bigBed/type bigBed 6 +/' -e '/bigDataUrl/d' trackDb.txt > trackDb.ra cp trackDb.ra ~/kent/src/hg/makeDb/trackDb/human/hg38/uwDnase.ra # NOTE: hotspot5 had a bug causing it to silently abort peak identification on some datasets # before completing all chromosomes. I fixed this issue, and created hotspot5.1 version, which # was used to remake the broad and narrow peaks. The corrected bigBeds are dated Feb. 13-19. # # The hotspot tool is hosted on hive at: /hive/groups/encode/encode3/tools/hotspot-distr # The version is v4 (though the tool prints hotspot5 as its name) # The fix is: # # diff hotspot-deploy-made/src/InputDataReader.cpp hotspot-deploy-fixed/src/InputDataReader.cpp # 74a75 # > numLines++; # I ran this by Bob Thurman who currently maintains the package, and he thought it reasonable -- said # he would test but we've not heard back. I will post it to gitHub as an issue sometime. 
# Scored files are: # /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/scored/*broadPeak.bb # /hive/data/genomes/hg38/bed/wgEncodeRegDnase/run_pooled_hotspot5.1/scored/out/*.narrowPeak ################################# # DNase clusters track (Kate Feb 2015) cd /hive/data/genomes/hg38/bed/uwDnase1 mkdir clusters # cd run_pooled/scored/out cd run_pooled_hotspot5.1/scored/out ls *.pooled.narrowPeak > ../../../clusters/peak.lst # calculate normalization factor regClusterMakeTableOfTables -verbose=3 eapDnase01Hg38 \ ../../../clusters/peak.lst ../../../clusters/peak.table >&! ../../../clusters/regTable.out & # cluster regCluster -bedSources ../../../clusters/peak.table /dev/null ../../../clusters/peak.bed \ >&! ../../../clusters/regCluster.out & 2011652 singly-linked clusters, 2076756 clusters in 96 chromosomes # NOTE: more clusters (2.2M) in hg19 (which included Duke data) # filter out low scoring cd ../../../clusters awk '$5 >= 100' peak.bed > peak.filtered.bed wc -l peak.filtered.bed # 1330766 peak.filtered.bed # retained 66% vs 83% in hg19 (seems low ??) 
# --> keep them all for now, filter with UI # format to BED5+floatscore+sources for hgBedSources # which will extract, uniquify, and assign ID's to sources awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, 0, $7;}' peak.bed > peak.bed6 hgBedSources peak.bed6 regDnase mv regDnaseSources.tab uwEnc2DnaseSources.tab # hand edit to fix curation -- add RA treatments where needed # NHBE_RA -> NHBE_RA+RA # SK-N-SH_RA -> SK-N-SH_RA+RA # load sources table autoSql $HOME/kent/src/hg/lib/idName.as idName hgLoadSqlTab hg38 uwEnc2DnaseSources idName.sql uwEnc2DnaseSources.tab # merge files and format to BED5+sourceCount+sourceIds+sourceVals awk '{print $8}' peak.bed > peak.vals awk 'BEGIN {OFS="\t"}{print $1, $2, $3, $4, $5, $7, $8;}' regDnase.bed | \ paste - peak.vals > uwEnc2DnaseClustered.bed hgLoadBed hg38 uwEnc2DnaseClustered -sqlTable=$HOME/kent/src/hg/lib/bed5SourceVals.sql \ -renameSqlTable -as=$HOME/kent/src/hg/lib/bed5SourceVals.as uwEnc2DnaseClustered.bed # create inputs file to display metadata on details page # NOTE: this can probably be jettisoned in favor of new code, since source info # is now in the BED file cat > makeInputs.csh << 'EOF' set tables = `hgsql hg38 -Ne "show tables like 'uwEnc2DnasePeaks%'"` foreach t ($tables) set exp = `echo $t | sed 's/uwEnc2DnasePeaksWgEncode/wgEncode/'` set t = `echo $t | sed 's/Peaks/BedPeaks/'` set cell = `encodeExp show $exp cellType` set treatment = `encodeExp show $exp treatment` echo "$t\t$cell\t$treatment" end 'EOF' csh makeInputs.csh > inputs.tab hgLoadSqlTab hg38 uwEnc2DnaseInputs ~/kent/src/hg/lib/clusterInputEapDnase.sql inputs.tab # try bigBed version sed 's/BedPeaks/Peaks/' inputs.tab > bigInputs.tab hgsql hg38 -e 'alter table uwEnc2DnaseInputs rename to uwEnc2DnaseInputs_old' hgLoadSqlTab hg38 uwEnc2DnaseInputs ~/kent/src/hg/lib/clusterInputEapDnase.sql bigInputs.tab #Hmm, hgc peakClusters doesn't appear to work with bigBed peak files... # Revert trackDb to BED peak files ################# # Load up data. 
Clean up peaks & hotspots a little first (truncate signal, -log10 of pVal) # and then rename all -- replacing wgEncodeEH* with cell and treatment, to help users. # Change prefix to match hg19 # (kate 2015-06-17) cd /hive/data/genomes/hg38/bed/uwDnase1 mkdir load cd load cat > expToFile.csh << 'EOF' #!/bin/csh -ef set exp = `echo $1 | sed 's/wgEncodeEH\([0-9]*\).*/\1/'` set cell = `encodeExp show $exp cellType | sed -e 's/[-+_]//g' -e 's/\(.\)\(.*\)/\U\1\L\2/'` set treatment = `encodeExp show $exp treatment | sed -e 's/[-+_]//g' -e 's/\(.\)\(.*\)/\U\1\L\2/'` echo ${cell}${treatment} 'EOF' cat > renameWig.csh << 'EOF' #!/bin/csh -ef set bbi = /gbdb/hg38/bbi/wgEncodeRegDnase mkdir -p $bbi set path = (`pwd` $path) set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase set data = run_normalized/out set wd = `pwd` pushd $build/$data rm $wd/edit.csh foreach f (*.norm.bw) echo "${f} " set exp = $f:r:r set vars = `expToFile.csh $exp` set t = wgEncodeRegDnaseUw${vars}Signal echo "-e s/uwEnc2DnaseSignal${exp}/$t/ \" >> $wd/edit.csh ln -s $build/$data/$f $bbi/$t.bw hgBbiDbLink hg38 $t $bbi/$t.bw end popd 'EOF' cat > loadPeak.csh << 'EOF' #!/bin/csh -ef set path = (`pwd` $path) set wd = `pwd` set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase set data = run_pooled/scored/out pushd $build/$data foreach f (*.narrowPeak) set exp = $f:r:r set vars = `expToFile.csh $exp` set t = wgEncodeRegDnaseUw${vars}Peak echo "${f} ${t}" # truncate signal value to int and transform pValue to -log10 # -log10 pValue awk 'BEGIN {OFS="\t"} {{if ($8==0) $8=4.94e-324; {$7=int($7); $8=-log($8)/log(10);print}}}' $f | \ hgLoadBed -trimSqlTable -sqlTable=$HOME/kent/src/hg/lib/encode/narrowPeak.sql \ -renameSqlTable -as=$HOME/kent/src/hg/lib/encode/narrowPeak.as hg38 $t stdin end 'EOF' cat > loadHot.csh << 'EOF' #!/bin/csh -ef set bbi = /gbdb/hg38/bbi/wgEncodeRegDnase mkdir -p $bbi set path = (`pwd` $path) set wd = `pwd` set build = /hive/data/genomes/hg38/bed/wgEncodeRegDnase set data = 
run_pooled/scored/out pushd $build/$data foreach f (*.broadPeak) set exp = $f:r:r set vars = `expToFile.csh $exp` set t = wgEncodeRegDnaseUw${vars}Hotspot echo "${f} ${t}" #echo "-e s/uwEnc2DnaseHot${exp}/$t/ \\" >> $wd/edit2.csh awk 'BEGIN {OFS="\t"} {{if ($8==0) $8=4.94e-324; {$7=int($7); $8=-log($8)/log(10);print}}}' $f > bed.tmp bedToBigBed -type=bed6+3 -as=${HOME}/kent/src/hg/lib/encode/broadPeak.as \ bed.tmp /hive/data/genomes/hg38/chrom.sizes ../$f.bb #ln -s $build/$data/$f $bbi/$t.broadPeak.bb #hgBbiDbLink hg38 $t $bbi/$t.broadPeak.bb end 'EOF' # Manually tweak edit.csh and run to replace table and track names in trackDb.ra # Do same for broadPeak bigBeds, and narrowPeak tables csh edit5.csh ../clusters/bigInputs.tab > ../clusters/wgEncodeRegDnaseClusteredInputs.tab cd ../clusters hgLoadSqlTab hg38 wgEncodeRegDnaseClusteredInputs \ ~/kent/src/hg/lib/clusterInputEapDnase.sql wgEncodeRegDnaseClusteredInputs.tab hgql hg38 -e "alter table uwEnc2DnaseClustered rename to wgEncodeRegDnaseClustered" hgsql hg38 -e "alter table uwEnc2DnaseSources rename to wgEncodeRegDnaseClusteredSources" ################# # Cell table (use for cell metadata, instead of metaDb) cd /hive/data/genomes/hg38/bed/uwDnase1 mkdir cells cd cells # collect cell info from ENCODE2 and ENCODE3 ~/kent/src/hg/encode3/cellsFromEncode3.py > cells.tsv # load to google spreadsheet and clean #https://docs.google.com/a/soe.ucsc.edu/spreadsheets/d/10EWdr-JTtDvfLKKLPvP3T2ft6MZ5KVdKbBzY76SRaug/edit#gid=1783710206 # extract useful columns to file and load tail -n +2 wgEncodeCell.tab | \ nl -v 0 | \ hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql stdin # add order URL's #https://docs.google.com/spreadsheets/d/14HvZfqJdClt6mfcwf2w0xRPvdhk5o7bgV77LMc1qTcU/edit?usp=sharing tail -n +2 wgEncodeCellUrl.tsv | \ nl -v 0 | \ hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql stdin # to verify links checkUrlsInTable hg38 wgEncodeCell > errs.txt # Update table when cron 
of above reports errors cat > badUrls.txt << 'EOF' http://www.sciencellonline.com/site/productInformation.php?keyword=1830 302 http://www.sciencellonline.com/site/productInformation.php?keyword=1820 302 http://www.sciencellonline.com/site/productInformation.php?keyword=7110 302 http://www.sciencellonline.com/site/productInformation.php?keyword=1810 302 http://www.sciencellonline.com/site/productInformation.php?keyword=1000 302 http://www.sciencellonline.com/site/productInformation.php?keyword=1100 302 http://www.sciencellonline.com/site/productInformation.php?keyword=6300 302 http://www.sciencellonline.com/site/productInformation.php?keyword=6320 302 http://www.sciencellonline.com/site/productInformation.php?keyword=200 302 http://www.sciencellonline.com/site/productInformation.php?keyword=1310 302 http://www.sciencellonline.com/site/productInformation.php?keyword=6570 302 http://www.sciencellonline.com/site/productInformation.php?keyword=2720 302 http://www.sciencellonline.com/site/productInformation.php?keyword=2620 302 http://www.sciencellonline.com/site/productInformation.php?keyword=6560 302 http://www.sciencellonline.com/site/productInformation.php?keyword=7630 302 http://www.sciencellonline.com/site/productInformation.php?keyword=6580 302 http://www.sciencellonline.com/site/productInformation.php?keyword=3120 302 http://www.sciencellonline.com/site/productInformation.php?keyword=3300 302 http://www.sciencellonline.com/site/productInformation.php?keyword=4000 302 http://www.sciencellonline.com/site/productInformation.php?keyword=6540 302 http://www.sciencellonline.com/site/productInformation.php?keyword=7130 302 'EOF' # NOTE: site query string has changed to: # http://sciencellonline.com/catalogsearch/result/?q=1820 hgsql hg38 -e 'select * from wgEncodeCell' > wgEncodeCell.2015-05-06.tab sed -e 's^www.sciencellonline.com/site/productInformation.php^sciencellonline.com/catalogsearch/result/^' -e 's^keyword=^q=^' wgEncodeCell.2015-05-06.tab > 
wgEncodeCell.2015-05-07.tab ln -s wgEncodeCell.2015-05-07.tab wgEncodeCell.latest.tab # save old table for now hgsql hg38 -e "alter table wgEncodeCell rename to wgEncodeCell_old" hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql wgEncodeCell.latest.tab # check, then remove old table hgsql hg38 -e "drop table wgEncodeCell_old" checkUrlsInTable hg38 wgEncodeCell > errs.txt # Nov 2015 -- sciencell links are reported as permanently moved (HTTP status code 301) # Time to fix them! 21 total # navigate to links and grab correct URL # These must be fixed in the wgEncodeCell table (for hg38) and also in cv.ra (for hg19 tracks) # (2015-11 kate) cd /hive/data/genomes/hg38/bed/wgEncodeRegDnase/cells checkUrlsInTable hg38 wgEncodeCell > wgEncodeCell.errs # hand edited below http://sciencellonline.com/catalogsearch/result/?q=1830 301 http://www.sciencellonline.com/human-astrocytes-hippocampal.html HA-h http://sciencellonline.com/catalogsearch/result/?q=1820 301 http://www.sciencellonline.com/human-astrocytes-spinal-cord.html HA-sp http://sciencellonline.com/catalogsearch/result/?q=7110 301 http://www.sciencellonline.com/haepic.html HAEpiC http://sciencellonline.com/catalogsearch/result/?q=1810 301 http://www.sciencellonline.com/human-astrocytes-cerebellar.html HA-c http://sciencellonline.com/catalogsearch/result/?q=1000 301 http://www.sciencellonline.com/human-brain-microvascular-endothelial-cells.html HBMEC http://sciencellonline.com/catalogsearch/result/?q=1100 301 http://www.sciencellonline.com/human-brain-vascular-smooth-muscle-cells.html HBVSMC http://sciencellonline.com/catalogsearch/result/?q=6300 301 http://www.sciencellonline.com/human-cardiac-fibroblasts.html HCF http://sciencellonline.com/catalogsearch/result/?q=6320 301 http://www.sciencellonline.com/human-cardiac-fibroblasts-adult-atrial.html HCF-aa http://sciencellonline.com/catalogsearch/result/?q=200 301 http://www.sciencellonline.com/human-cardiac-myocytes.html HCM 
http://sciencellonline.com/catalogsearch/result/?q=1310 301 http://www.sciencellonline.com/human-choroid-plexus-epithelial-cells.html HCPEpiC http://sciencellonline.com/catalogsearch/result/?q=6570 301 http://www.sciencellonline.com/hconf.html HConF http://sciencellonline.com/catalogsearch/result/?q=2720 301 http://www.sciencellonline.com/heepic.html HEEpiC http://sciencellonline.com/catalogsearch/result/?q=2620 301 http://www.sciencellonline.com/human-gingival-fibroblasts.html HGnF http://sciencellonline.com/catalogsearch/result/?q=6560 301 http://www.sciencellonline.com/hipepic.html HIPEpiC http://sciencellonline.com/catalogsearch/result/?q=7630 301 http://www.sciencellonline.com/hmf.html HMF http://sciencellonline.com/catalogsearch/result/?q=6580 301 http://www.sciencellonline.com/hnpcepic.html HNPCEpiC http://sciencellonline.com/catalogsearch/result/?q=3120 301 http://www.sciencellonline.com/hpaaf.html HPAAF http://sciencellonline.com/catalogsearch/result/?q=3300 301 http://www.sciencellonline.com/hpf.html HPF http://sciencellonline.com/catalogsearch/result/?q=4000 301 http://www.sciencellonline.com/human-renal-glomerular-endothelial-cells.html HRGEC http://sciencellonline.com/catalogsearch/result/?q=6540 301 http://www.sciencellonline.com/hrpepic.html HRPEpiC http://sciencellonline.com/catalogsearch/result/?q=7130 301 http://www.sciencellonline.com/hvmf.html HVMF ### # hand-munge above into a shell script to edit .tab file # sed -e 's^bad-url^good-url^' hgsql hg38 -Ne 'select * from wgEncodeCell' > wgEncodeCell.2015-11-11.tab csh fixCells.csh wgEncodeCell.2015-11-11.tab > wgEncodeCell.2015-11-11.fixed.tab hgsql hg38 -e "alter table wgEncodeCell rename to wgEncodeCell_old" hgLoadSqlTab hg38 wgEncodeCell ~/kent/src/hg/lib/encode/wgEncodeCell.sql wgEncodeCell.2015-11-11.fixed.tab # check, then remove old table checkUrlsInTable hg38 wgEncodeCell > errs.txt hgsql hg38 -e "drop table wgEncodeCell_old" # Treatment table # TBD # create term/description tab sep file 
(currently just treatments in UW DNase) #cd /hive/data/genomes/hg38/bed/uwDnase1 #cd cells #tail -n +2 treatments.tab | \ #nl -v 0 | \ #hgLoadSqlTab hg38 wgEncodeTreatment ~/kent/src/hg/lib/encode/wgEncodeTreatment.sql stdin # Comparison to lifted track (Chris Eisenhart) # Visually, new track appears noisy. This is not unexpected as it excludes Duke data. # There are more elements in new track: 2076756 vs 1867194 # Coverage is similar: featureBits hg38 wgEncodeRegDnaseClustered # 451551920 bases of 3049335806 (14.808%) in intersection featureBits hg38 wgEncodeRegDnaseClusteredLifted # 477271764 bases of 3049335806 (15.652%) in intersection # Comparing chr1: featureBits hg38 -chrom=chr1 wgEncodeRegDnaseClustered #41505780 bases of 230486321 (18.008%) in intersection featureBits hg38 -chrom=chr1 wgEncodeRegDnaseClusteredLifted #46543116 bases of 230486321 (20.193%) in intersection # Greater number of elements must be due to mappings on new alt chroms # (96 chroms in new track, 38 in old) ############################################################################## # wgEncodeReg ENCODE Regulatory tracks (Done Chris Eisenhart) # Transcription, Layered H3K4Me1, Layered H3K4Me3, Layered H3K27Ac mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k27ac mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH34me1 mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegMarkH3k4me3 mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTfbsClusteredV3 mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg/wgEncodeRegTxn mkdir /hive/data/genomes/hg38/bed/hg19MassiveLift/wgEncodeReg liftManyBigWigs /cluster/home/ceisenhart/kent/src/hg/utils/liftList/bigWigList.ra ############################################################################## # ENCODE Registry of Candidate cis-Regulatory Elements # # 2020-04-14 kate # # 
From ENCODE 3 Data Analysis Center at U Mass Med Center (Zlab) # Data contacts: Henry Pratt, Jill Moore, Zhiping Weng PI # # RM #24668 # # Download BED file (hosted on their integrative hub) cd /hive/data/outside/encode3/ccre wget http://gcp.wenglab.org/hubs/integrative1/data/GRCh38/CTA/GRCh38-ccREs.bigBed # Later Jill asked to add scores, download that wget -nd https://users.wenglab.org/moorej3/Human-maxZ-DNase.txt.gz gunzip Human-maxZ-DNase.txt.gz # check score distribution textHistogram -real -col=2 Human-maxZ-DNase.txt 1.000000 ***************** 102913 2.000000 ************************************************************ 362463 3.000000 ************************************************* 294351 4.000000 ********************* 128201 5.000000 ***** 29345 6.000000 * 6991 7.000000 2096 8.000000 168 9.000000 5 10.000000 2 # noting that order of accessions in score file doesn't match bed file ;-( sort Human-maxZ-DNase.txt > Human-maxZ-DNase.sorted.txt sort -k 4 GRCh38-ccREs.bed > GRCh38-ccREs.sorted.bed paste GRCh38-ccREs.sorted.bed Human-maxZ-DNase.sorted.txt > ccres.prescored.bed # manually sanity check start and end of file to see that accessions match # score using zscore, min(zscore*100),1000), and reformat awk '{OFS="\t"; print $1, $2, $3, $4, ($13>10)? 1000 : int($13 * 100), $6, $7, $8, $9, $10, $13}' \ ccres.prescored.bed | bedSort stdin ccres.scored.bed # Reformat to add fields for filtering and mouseover, etc. set f = encodeCcreCombined -perl makeCcreCombined.pl < ccres.scored.bed > $f.bed -bedToBigBed -tab -type=bed9+6 -as=$f.as $f.bed /hive/data/genomes/hg38/chrom.sizes $f.bb +set bin = ~/kent/src/hg/makeDb/outside/encode3/ccre +perl $bin/makeCcreCombined.pl < ccres.scored.bed > $f.bed +set lib = ~/kent/src/hg/lib +bedToBigBed -tab -type=bed9+6 -as=$lib/$f.as $f.bed /hive/data/genomes/hg38/chrom.sizes $f.bb ln -s `pwd`/$f.bb /gbdb/hg38/encode3/ccre/ ###############################