be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 36fa16a..4b9bc09 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -32138,43 +32138,59 @@ chr21 45650008 45650008 rs145424134 8 ENSG00000160223.12 ICOSLG -10841 -0.070 - 0 1 esophagusMuscular, -0.070, 5.106, 0.008, # refine generated trackDb.gtexEqtl.ra file and install in makeDb/trackDb/human/hg19 ######## # Load 44 per-tissue tracks: gtexEqtlTissue<tissueName> csh $bin/getxEqtlLoadTissues.csh UCSC_output >&! loadTissuesV2.log & #NOTE: V2 was a second release that followed immediately after first release (which was timed to coincide # with Nature paper pub. V2 revised schema (added ensembl gene ID, additional summary fields) # and color conventions. ########################################################################### # HGMD (updated 12/10/19 max) # HGMD (updated 01/25/18 max) -# got hgmd 2017 from Frank Schacherer Frank.Schacherer@qiagen.com and Rupert Yip Rupert.Yip@qiagen.com +# HGMD (updated 12/12/20 max) +# got hgmd from Frank Schacherer Frank.Schacherer@qiagen.com and Rupert Yip Rupert.Yip@qiagen.com # see also the file hg38/hgmd.txt -year=2019 +year=2020 cd /hive/data/genomes/hg19/bed/hgmd cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg19.tsv | grep -v \# | tawk '{if ($5=="I") {start=$4-1; end=$4+1; col="100,100,100"} else if ($5=="D") {start=$4-1; end=$4; col="170,170,170"} else {start=$4-1; end=$4; col="0,0,0"}; print "chr"$3,start,end,$2":"$1,0,".",start,end,col,$2,$1,$5}' | sed -e 's/M$/substitution/' | sed -e 's/I$/insertion (between the two basepairs, sequence not provided by HGMD)/' | sed -e 's/D$/deletion (endpoint not provided by HGMD)/' | sed -e 's/X$/insertion-deletion (endpoint not provided by HGMD)/' | sed -e 's/R$/regulatory variant/' | sed -e 's/S$/splicing variant/' | sort -k1,1 -k2,2n > hgmd.bed bedToBigBed hgmd.bed 
/hive/data/genomes/hg19/chrom.sizes hgmd.bb -type=bed9+ -as=hgmd.as -tab ln -s /hive/data/genomes/hg19/bed/hgmd/hgmd.bb /gbdb/hg19/bbi/hgmd.bb hgBbiDbLink hg19 hgmd /gbdb/hg19/bbi/hgmd.bb # Forgot, finally done Oct 24: also updated hgBeacon bigBedToBed /gbdb/hg19/bbi/hgmd.bb /tmp/temp.bed -/usr/local/apache/cgi-bin/hgBeacon -f hgmd temp.bed hgmd +python2 /usr/local/apache/cgi-bin/hgBeacon -f hgmd /tmp/temp.bed hgmd # Forgot, finally done June 26: updated GBIB as qateam scp /gbdb/hg19/bbi/hgmd.bb hgdownload:/usr/local/apache/gbib/prot/ +# next restrict RefSeq down to HGMD subset + +# addition of HGMD-restricted subset, Max, Jan 29 2019, updated Dec 10 2019 +cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2020-10-27/ +year=2019 +# change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019 +# adding "." so NM_123 doesn't match NM_123123 +cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. -f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt +cat process/hg19.curated.gp.gz | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp +hgLoadGenePred -genePredExt hg19 ncbiRefSeqHgmd hgmd.curated.gp +$ wc -l hgmd.curated.gp +7965 hgmd.curated.gp in 2019 +8971 hgmd.curated.gp in 2020 + +# now continue the process at ../hg38/hgmd.txt ############################################################################# # LASTZ human/hg19 vs. 
pig/susScr11 - (DONE - 2018-04-02 - Hiram) mkdir /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02 cd /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02 printf '# human vs pig BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 @@ -32776,42 +32792,30 @@ # real 62m32.858s cat fb.ponAbe3.chainHg19Link.txt # 2690870339 bases of 3043444524 (88.415%) in intersection cat fb.ponAbe3.chainSynHg19Link.txt # 2675805099 bases of 3043444524 (87.920%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev \ -buildDir=`pwd` ponAbe3 hg19) > rbest.log 2>&1 # real 76m24.498s cat fb.ponAbe3.chainRBest.Hg19.txt # 2641865423 bases of 3043444524 (86.805%) in intersection -############################################################################## -# addition of HGMD-restricted subset, Max, Jan 29 2019, updated Dec 10 2019 -cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2019-11-21/ -year=2019 -#cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | sort -u > hgmdTranscripts.txt -# change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019 -# adding "." so NM_123 doesn't match NM_123123 -cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. 
-f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt -cat process/hg19.curated.gp | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp -hgLoadGenePred -genePredExt hg19 ncbiRefSeqHgmd hgmd.curated.gp -$ wc -l hgmd.curated.gp -7965 hgmd.curated.gp ############################################################################# # genomenom mastermind track, Max, Feb 2019 cd /hive/data/genomes/hg19/bed/mastermind/ wget 'https://mastermind.genomenon.com/cvr/download?format=csv' -O - > mastermind.2018.11.26.csv.gz unzip mastermind.2018.11.26.csv.zip mv mastermind_cited_variants_reference-2018.11.26-csv/ 2018-11-26 hgsql hg19 -NB -e 'select alias, chrom from chromAlias where source = "refseq";' > chromAlias.tab python ~/kent/src/hg/makeDb/mastermind/mastermindToBed.py 2018-11-26/mastermind_cited_variants_reference-2018.11.26.csv bedSort mastermind.bed mastermind.bed bedToBigBed -type=bed9+ -as=~/kent/src/hg/makeDb/mastermind/mastermind.as -tab mastermind.bed /hive/data/genomes/hg19/chrom.sizes mastermind.bb ln -s `pwd`/mastermind.bb /gbdb/hg19/bbi/mastermind.bb ############################################################################## # DGV GOLD (DATABASE OF GENOMIC VARIANTS GOLD STANDARD) (DONE 5/06/19 ChrisL) # Redmine #23371 ############################################################################## @@ -33697,38 +33701,40 @@ $buildDir/$db.rna.fa \ $pre) # pslMismatchGapToBed: NM_001365372.1 gapIx 9 shifted right 74 bases, but next block size is only 38; report to NCBI # pslMismatchGapToBed: NM_001288811.1 gapIx 1 shifted left 6 bases, but previous block size is only 5; report to NCBI # real 0m21.265s bedToBigBed -type=bed9+ -tab -as=$HOME/kent/src/hg/lib/txAliDiff.as $pre.bed \ /hive/data/genomes/$db/chrom.sizes $pre.bb # pass1 - making usageList (180 chroms): 77 millis # pass2 - checking and writing primary data (27362 records, 20 fields): 234 millis ln -sf `pwd`/$pre.bb /gbdb/hg19/ncbiRefSeq/$pre.bb 
############################################################################# -# clinvarSubLolly track IN PROGRESS BRANEY 10/17/2020 +# clinvarSubLolly track DONE BRANEY 12/14/2020 mkdir /cluster/data/hg19/bed/clinvarSubLolly cd /cluster/data/hg19/bed/clinvarSubLolly bigBedToBed /gbdb/hg19/bbi/clinvar/clinvarMain.bb stdout | tawk '{print $40, $1,$2,$2+1,$4}' | sort -S 40g > sort.main.bed hgsql hg19 -Ne "select varId,clinSign,scv from clinvarSub" | sort -S 40g > clinvarSubSub.txt join -t $'\t' sort.main.bed clinvarSubSub.txt | tawk '{print $2,$3,$4,$5,$6,$1, $7}' | sort -S 40g -k1,1 -k2,2n -k5,5 | tawk -f makeFranklin | tawk -f assignColors > tmp1 -tawk '{print $1":"$2 + 1"-"$3"āVariants (submissions):"$11}' tmp1 > tmp2 +# add the line break after v409 +#tawk '{print $1":"$2 + 1"-"$3" <br>Variants (submissions):"$11}' tmp1 > tmp2 +tawk '{print $1":"$2 + 1"-"$3" Variants (submissions):"$11}' tmp1 > tmp2 paste tmp1 tmp2 > bigBedInput.bed bedToBigBed -as=$HOME/kent/src/hg/lib/clinvarSubLolly.as -type=bed9+5 -tab bigBedInput.bed /cluster/data/hg19/chrom.sizes clinvarSubLolly.bb mkdir -p /gbdb/hg19/clinvarSubLolly ln -s `pwd`/clinvarSubLolly.bb /gbdb/hg19/clinvarSubLolly/clinvarSubLolly.bb bigBedToBed /gbdb/hg19/bbi/clinvar/clinvarMain.bb stdout | tawk '{print $40, $1,$2,$2+1,$4,$13,$15,$18,$19}' | sort -S 40g > sort.main.bed hgsql hg19 -Ne "select * from clinvarSub" | sort -S 40g > clinvarSubSub.txt join -t $'\t' sort.main.bed clinvarSubSub.txt | tawk '{print $2,$3,$4,$5,0,"+",0,0,"0,0,0",$6,$20,$8, $9,$1,$10,$7,$11,$12,$13,$14,$15,$16,$17,$18,$19,$21}' | sort -S 40g -k1,1 -k2,2n | tawk -f assignScore > bigBedInput.bed bedToBigBed -as=clinvarSubBB.as -type=bed9+11 -tab bigBedInput.bed /cluster/data/hg19/chrom.sizes clinvarSub.bb ln -s `pwd`/clinvarSub.bb /gbdb/hg19/clinvarSubLolly/clinvarSub.bb ############################################################################# @@ -33842,15 +33848,499 @@ # sys 12m0.858s cd .. 
time cat hg19/genomes/*.bed | ./gnomadVcfBedToBigBed stdin stdout | sort -k1,1 -k2,2n > gnomad.v2.1.1.genomes.bed # real 199m48.619s # user 186m49.769s # sys 29m12.841s # now South Asian variants in the genomes file, change type: time bedToBigBed -type=bed9+47 -tab -as=genomes.as gnomad.v2.1.1.genomes.bed /hive/data/genomes/hg19/chrom.sizes genomes.bb # pass1 - making usageList (23 chroms): 165336 millis # pass2 - checking and writing primary data (253556152 records, 55 fields): 4909106 millis # # real 89m3.165s # user 86m41.554s # sys 2m15.722s + +############################################################################# +# LASTZ Cow bosTau9 (DONE - 2020-12-07 - Hiram) + mkdir /hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07 + cd /hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07 + + printf '# human vs Cow +BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz +BLASTZ_T=2 +BLASTZ_O=400 +BLASTZ_E=30 +BLASTZ_M=254 +# default BLASTZ_Q score matrix: +# A C G T +# A 91 -114 -31 -123 +# C -114 100 -125 -31 +# G -31 -125 100 -114 +# T -123 -31 -114 91 + +# TARGET: human hg19 +SEQ1_DIR=/hive/data/genomes/hg19/hg19.2bit +SEQ1_LEN=/hive/data/genomes/hg19/chrom.sizes +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 + +# QUERY: Cow bosTau9 +SEQ2_DIR=/hive/data/genomes/bosTau9/bosTau9.2bit +SEQ2_LEN=/hive/data/genomes/bosTau9/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LIMIT=10 +SEQ2_LAP=0 + +BASE=/hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07 +TMPDIR=/dev/shm +' > DEF + + time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ + -chainMinScore=3000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -syntenicNet) > do.log 2>&1 + # real 239m35.175s + + cat fb.hg19.chainBosTau9Link.txt + # 1407432462 bases of 2991710746 (47.044%) in intersection + + cat fb.hg19.chainSynBosTau9Link.txt + # 1354159575 bases of 2991710746 (45.264%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` hg19 bosTau9) > rbest.log 2>&1 & + # real 
274m55.811s + + cat fb.hg19.chainRBest.BosTau9.txt + # 1290531802 bases of 2991710746 (43.137%) in intersection + + # running the swap + mkdir /hive/data/genomes/bosTau9/bed/blastz.hg19.swap + cd /hive/data/genomes/bosTau9/bed/blastz.hg19.swap + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg19/bed/lastzBosTau9.2020-12-07/DEF \ + -swap -syntenicNet -workhorse=hgwdev \ + -smallClusterHub=hgwdev -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 + # real 72m28.826s + + cat fb.bosTau9.chainHg19Link.txt + # 1342159887 bases of 2715853792 (49.419%) in intersection + cat fb.bosTau9.chainSynHg19Link.txt + # 1305558878 bases of 2715853792 (48.072%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` bosTau9 hg19) > rbest.log 2>&1 & +XXX - running - Tue Dec 8 09:13:34 PST 2020 + # real 272m15.176s + + cat fb.bosTau9.chainRBest.Hg19.txt + # 1290810412 bases of 2715853792 (47.529%) in intersection + +############################################################################# +# Exome Probesets composite track +# Tue Jan 5 02:25:06 PST 2021 Made by Ana, Tiana, Pranav, Beagan, reviewed and committed by Max +# Download data for hg19: +cd /hive/data/genomes/hg19/bed/exomeProbesets +We made tracks for the main Exome Kit Vendors: IDT, Twist Biosciences, MGI, Agilent, Roche, and Illumina. + +Note: IDT, Agilent and Roche have bed files for the Probes and for the Target Regions. Twist, MGI, and Illumina have bed files for the Target Regions (but not for Probes). 
+ +Data downloaded in my windows desktop and copied to hgwdev: +scp <file.bed> ana@hgwdev.gi.ucsc.edu://hive/data/genomes/hg19/bed/exonArrays/raw/idt + +# IDT Datasets: + +Track: IDT - xGen Exome Research Panel Probes +Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-probesbe255a1532796e2eaa53ff00001c1b3c.bed?sfvrsn=425c3407_7&download=true +File name: xgen-exome-research-panel-probes-hg19.bed + +Track: IDT - xGen Exome Research Panel Target Regions +Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-targetsae255a1532796e2eaa53ff00001c1b3c.bed?sfvrsn=435c3407_7&download=true +File name: xgen-exome-research-panel-targets-hg19.bed + +Track: IDT - xGen Exome Research Panel V2 Probes +Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-v2-probes-hg1952a5791532796e2eaa53ff00001c1b3c.bed?sfvrsn=1dd1707_6&download=true +File name: xgen-exome-research-panel-v2-probes-hg19.bed + +Track: IDT - xGen Exome Research Panel V2 Target Regions +Download: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/supplementary-product-info/xgen-exome-research-panel-v2-targets-hg1902a5791532796e2eaa53ff00001c1b3c.bed?sfvrsn=6dd1707_10&download=true +File name: xgen-exome-research-panel-v2-targets-hg19.bed + +# Twist Biosciences Datasets: + +Track: Twist - RefSeq Exome Panel Target Regions +Download: https://www.twistbioscience.com/sites/default/files/resources/2019-09/Twist_Exome_RefSeq_targets_hg19_0.bed +File name: Twist_Exome_RefSeq_targets_hg19_0.bed + +Track: Twist - Core Exome Panel Target Regions +Download: https://www.twistbioscience.com/sites/default/files/resources/2018-09/Twist_Exome_Target_hg19.bed +File name: Twist_Exome_Target_hg19.bed + +Track: Twist - Comprehensive Exome Panel Target Regions +Download: 
https://www.twistbioscience.com/sites/default/files/resources/2020-09/Twist_ComprehensiveExome_targets_hg19.bed +File name: Twist_ComprehensiveExome_targets_hg19.bed + +# MGI Datasets: + +Track: MGI - Easy Exome Capture V4 Target Regions +Download: https://en.mgitech.cn/Uploads/Temp/file/20191225/5e03126e808a0.zip +File name: MGI_Exome_Capture_V4.bed + +Track: MGI - Easy Exome Capture V5 Target Regions +Download: https://en.mgitech.cn/Uploads/Temp/file/20191225/5e0312a7be43e.zip +File name: MGI_Exome_Capture_V5.bed + +# Agilent Datasets: +Download for all Agilent files: https://earray.chem.agilent.com/suredesign/ - Password needed (from Ana) + +Track: Agilent - SureSelect Clinical Research Exome Covered by Probes +File name: S06588914_Covered.bed + +Track: Agilent - SureSelect Clinical Research Exome Target Regions +File name: S06588914_Regions.bed + +Track: Agilent - SureSelect Clinical Research Exome V2 Covered by Probes +File name: S30409818_Covered.bed + +Track: Agilent - SureSelect Clinical Research Exome V2 Target Regions +File name: S30409818_Regions.bed + +Track: Agilent - SureSelect Focused Exome Covered by Probes +File name: S07084713_Covered.bed + +Track: Agilent - SureSelect Focused Exome Target Regions +File name: S07084713_Regions.bed + +Track: Agilent - SureSelect All Exon V4 Covered by Probes +File name: S03723314_Covered.bed + +Track: Agilent - SureSelect All Exon V4 Target Regions +File name: S03723314_Regions.bed + +Track: Agilent - SureSelect All Exon V4 + UTRs Covered by Probes +File name: S03723424_Covered.bed + +Track: Agilent - SureSelect All Exon V4 + UTRs Target Regions +File name: S03723424_Regions.bed + +Track: Agilent - SureSelect All Exon V5 Covered by Probes +File name: S04380110_Covered.bed + +Track: Agilent - SureSelect All Exon V5 Target Regions +File name: S04380110_Regions.bed + +Track: Agilent - SureSelect All Exon V5 + UTRs Covered by Probes +File name: S04380219_Covered.bed + +Track: Agilent - SureSelect All Exon V5 + UTRs 
Target Regions +File name: S04380219_Regions.bed + +Track: Agilent - SureSelect All Exon V6 r2 Covered by Probes +File name: S07604514_Covered.bed + +Track: Agilent - SureSelect All Exon V6 r2 Target Regions +File name: S07604514_Regions.bed + +Track: Agilent - SureSelect All Exon V6 + COSMIC r2 Covered by Probes +File name: S07604715_Covered.bed + +Track: Agilent - SureSelect All Exon V6 + COSMIC r2 Target Regions +File name: S07604715_Regions.bed + +Track: Agilent - SureSelect All Exon V6 + UTR r2 Covered by Probes +File name: S07604624_Covered.bed + +Track: Agilent - SureSelect All Exon V6 + UTR r2 Target Regions +File name: S07604624_Regions.bed + +Track: Agilent - SureSelect All Exon V7 Covered by Probes +File name: S31285117_Covered.bed + +Track: Agilent - SureSelect All Exon V7 Target Regions +File name: S31285117_Regions.bed + +# Roche Datasets: + +Track: Roche - KAPA HyperExome Capture Probe Footprint +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/design-files/KAPA%20HyperExome%20Design%20files%20hg19.zip +File name: KAPA_HyperExome_hg19_capture_targets.bed + +Track: Roche - KAPA HyperExome Primary Target Regions +Download: +https://sequencing.roche.com/content/dam/rochesequence/worldwide/design-files/KAPA%20HyperExome%20Design%20files%20hg19.zip +File name: KAPA_HyperExome_hg19_primary_targets.bed + +Track: Roche - SeqCap EZ Exome V3 Capture Probe Footprint +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/SeqCapEZ_Exome_v3.0_Design_Annotation_files.zip +File name: SeqCap_EZ_Exome_v3_hg19_capture_targets.bed + +Track: Roche - SeqCap EZ Exome V3 Primary Target Regions +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/SeqCapEZ_Exome_v3.0_Design_Annotation_files.zip +File name: SeqCap_EZ_Exome_v3_hg19_primary_targets.bed + +Track: Roche - SeqCap EZ Exome V3 + UTR Capture Probe Footprint +Download: 
https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/Exome_UTR_Design_Annotation_Files.zip +File name: SeqCap_EZ_ExomeV3_Plus_UTR_hg19_capture_annotated.bed + +Track: Roche - SeqCap EZ Exome V3 + UTR Primary Target Regions +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/Exome_UTR_Design_Annotation_Files.zip +File name: SeqCap_EZ_ExomeV3_Plus_UTR_hg19_primary_annotated.bed + +Track: Roche - SeqCap EZ MedExome Capture Probe Footprint +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExome_design_files.zip +File name: SeqCap_EZ_MedExome_hg19_capture_targets.bed + +Track: Roche - SeqCap EZ MedExome Empirical Target Regions +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExome_design_files.zip +File name: SeqCap_EZ_MedExome_hg19_empirical_targets.bed + +Track: Roche - SeqCap EZ MedExome + Mito Capture Probe Footprint +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExomePlusMito_design_files.zip +File name: SeqCap_EZ_MedExomePlusMito_hg19_capture_targets.bed + +Track: Roche - SeqCap EZ MedExome + Mito Empirical Target Regions +Download: https://sequencing.roche.com/content/dam/rochesequence/worldwide/shared-designs/MedExomePlusMito_design_files.zip +File name: SeqCap_EZ_MedExomePlusMito_hg19_empirical_targets.bed + +# Illumina Datasets: + +Track: Illumina - Nextera DNA Exome V1.2 Target Regions +Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/nextera-dna-exome/nextera-dna-exome-targeted-regions-manifest-bed.zip +File name: nextera-dna-exome-targeted-regions-manifest-v1-2.bed + +Track: Illumina - Nextera Rapid Capture Exome Target Regions +Download: https://support.illumina.com/softwaredownload.html?assetId=d2c2bc7e-75e5-4f20-bfb7-780839390565&assetDetails=nexterarapidcapture_exome_targetedregions.bed - Password 
needed (from Ana) +File name: nexterarapidcapture_exome_targetedregions.bed + +Track: Illumina - Nextera Rapid Capture Exome V1.2 Target Regions +Download: https://support.illumina.com/softwaredownload.html?assetId=197e4b2b-161d-4576-a52f-1204833567c5&assetDetails=nexterarapidcapture_exome_targetedregions_v1.2.bed - Password needed (from Ana) +File name: nexterarapidcapture_exome_targetedregions_v1.2.bed + +Track: Illumina - Nextera Rapid Capture Expanded Exome Target Regions +Download: https://support.illumina.com/softwaredownload.html?assetId=f020d708-dad9-44e4-8c7c-439add28536c&assetDetails=nexterarapidcapture_expandedexome_targetedregions.bed - Password needed (from Ana) +File name: nexterarapidcapture_expandedexome_targetedregions.bed + +Track: Illumina - TruSeq DNA Exome V1.2 Target Regions +Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/truseq/truseq-dna-exome/truseq-dna-exome-targeted-regions-manifest-v1-2-bed.zip +File name: truseq-dna-exome-targeted-regions-manifest-v1-2.bed + +Track: Illumina - TruSeq Rapid Exome V1.2 Target Regions +Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/truseq/truseq-rapid-exome-targeted-regions-manifest-v1-2-bed.zip +File name: truseq-rapid-exome-targeted-regions-manifest-v1-2.bed + +Track: Illumina - TruSight ONE V1.1 Target Regions +Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/trusight/trusight-one-file-for-ucsc-browser-v1-1.zip +File name: TruSight_One_v1.1.bed + +Track: Illumina - TruSight ONE Expanded V2.0 Target Regions +Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/nextera/nextera-flex-for-enrichment/trusight-one-expanded-targeted-regions-v2-0.zip +File name: TSOne_Expanded_Final_TargetedRegions_v2 + +Track: Illumina - TruSight Exome Target Regions +Download: 
https://support.illumina.com/content/dam/illumina-support/documents/documentation/chemistry_documentation/trusight/trusight_exome_manifest_a.bed +File name: trusight_exome_manifest_a.bed + +Track: Illumina - AmpliSeq Exome Panel Target Regions +Download: https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/ampliseq-for-illumina/ampliseq-for-illumina-exome-panel-manifest-file-bed.zip +File name: Exome.dna_manifest.20180509.bed + +# Converting bed files for hg19: + +All files were converted from bed to bigBed using the Genome Browser documentation. All of the files underwent the following steps, with the exception of a few files that are described below. (NOTE: the documentation includes a step to remove any header lines -- only a couple files had headers, and those were simply removed within vi/vim.) + +1. Sort all bed files +sort -k1,1 -k2,2n unsorted.bed > input.bed + +2. fetchChromSizes (run once) +fetchChromSizes hg19 > hg19.chrom.sizes + +Note: this only needs to be run once, since one hg19.chrom.sizes file can be used for all bedToBigBed runs. + +3. bedToBigBed for all files +bedToBigBed input.bed hg19.chrom.sizes myBigBed.bb + +Here's an example using the MGI Exome Capture V4 file: + +sort -k1,1 -k2,2n MGI_Exome_Capture_V4.bed > sorted_MGI_Exome_Capture_V4.bed + +fetchChromSizes hg19 > hg19.chrom.sizes + +bedToBigBed sorted_MGI_Exome_Capture_V4.bed hg19.chrom.sizes MGI_Exome_Capture_V4.bb + +-- + +The following files from Roche had long entries in col4, causing these files to have rows that were too long for bedToBigBed. Therefore, all the input bed files had col4 cut. (Note: these were just the ensembl and ccds ids, which did not provide any other substantial information.) + +We ran the command + +> cut -f1,2,3 + +for all such files. 
Here's an example for the Roche - KAPA HyperExome Capture Probe: + +Footprint file: + +cut -f1,2,3 sorted-KAPA_HyperExome_hg19_capture_targets.bed > sorted-cut-KAPA_HyperExome_hg19_capture_targets.bed +############################################################################# + +############################################################################# +# skinSoleBoldo JimK 01-14-2020 +# This describes how we got the skinSoleBoldo data set into the +# Genome Browser from the Cell Browser. +############################################################################# + +# Create working directory and go there +mkdir /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo +cd /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo + +# Create output dir for binaries +mkdir bbi + +# Downloaded files from the UCSC Cell Browser like so +wget https://cells.ucsc.edu/aging-human-skin/meta.tsv +wget https://cells.ucsc.edu/aging-human-skin/exprMatrix.tsv.gz + +# Get the first line (fields) out of meta.tsv and also make stats on it +head -1 meta.tsv > meta.fields +tabInfo meta.tsv -vals=20 > meta.20 + + +# Make a bunch of smaller matrices by clustering columns. Mostly we'll use the cluster one +# but some of the others are good to look at sometimes too. This is the time consuming step. +mkdir clust +matrixClusterColumns -makeIndex=clust/exprMatrix.ix exprMatrix.tsv.gz meta.tsv \ + Celltype clust/cell_type.matrix bbi/cell_type.stats \ + subj clust/donor.matrix bbi/donor.stats \ + age clust/age.matrix bbi/age.stats \ + Celltype_and_Age clust/age_cell_type.matrix bbi/age_cell_type.stats + +# Get the first column (the genes) out of expression matrix. 
+cut -f 1 clust/cell_type.matrix > gene.lst + +# Figure out the geneset they used and generate mapping file +gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed +# best is gencodeV19 as sym on hg19 with 21217 of 21353 (99.3631%) hits + +# Turn some into barChart, and then bigBarChart +foreach s (cell_type donor age age_cell_type) + matrixToBarChartBed clust/$s.matrix mapping.bed clust/$s.bed -stats=bbi/$s.stats -trackDb=clust/$s.ra + bedSort clust/$s.bed clust/$s.bed + bedToBigBed clust/$s.bed /hive/data/genomes/hg19/chrom.sizes bbi/$s.bb -type=bed6+3 -as=/cluster/home/kent/src/hg/lib/simpleBarChartBed.as +end + +# Make up special colors for cell_type. First manually create two column +# file that relates at least some of sample labels to cell types we have colors for. +# Call this file clust/cell_type.labels. +matrixClusterColumns clust/cell_type.matrix clust/cell_type.labels cluster clust/cell_type.unnormed clust/cell_type.restats +matrixNormalize column sum clust/cell_type.unnormed clust/cell_type.ref + +# Use same colors for sample +foreach s (cell_type donor age age_cell_type) + hcaColorCells clust/cell_type.ref ../typeColors.tsv clust/$s.matrix clust/$s.refStats -trackDb=clust/$s.colors -stats=bbi/$s.stats +end + +# Link files needed by browser at runtime to the /gbdb dir +mkdir /gbdb/hg19/bbi/skinSoleBoldo +foreach s (cell_type donor age age_cell_type) + ln -s /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo/bbi/$s.bb /gbdb/hg19/bbi/skinSoleBoldo/ + ln -s /hive/data/genomes/hg19/bed/singleCell/skinSoleBoldo/bbi/$s.stats /gbdb/hg19/bbi/skinSoleBoldo/ +end + +# Add the bits from clust/*.ra and clust/*.colors to hg19/trackDb.ra and you should be good. 
+rm -f tracks.ra +foreach s (cell_type donor age age_cell_type) + grep -v barChartColors clust/$s.ra >>tracks.ra + cat clust/$s.colors >> tracks.ra + echo transformFunc NONE >> tracks.ra + echo barChartLimit 2 >> tracks.ra + echo "" >> tracks.ra +end + + +############################################################################# +# fetalGeneAtlas JimK 01-19-2020 +############################################################################ +# This is the RNA-seq part of the data set described in +# "A human cell atlas of fetal gene expression" by Cao, Day et al +# Science 13 Nov 2020. This was imported from Cell Browser + +# Create directory for work. + +mkdir -p /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas +cd /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas + +# Create output dir for binaries +mkdir bbi + +# link in files from cell browser +ln -s /hive/data/inside/cells/datasets/fetal-gene-atlas/genes/all/meta.tsv . +ln -s /hive/data/inside/cells/datasets/fetal-gene-atlas/genes/all/exprMatrix.tsv.gz . + +# Get the first line (fields) out of meta.tsv and also make stats on it +head -1 meta.tsv > meta.fields +tabInfo meta.tsv -vals=20 > meta.20 + + +# Make a bunch of smaller matrices by clustering columns. Mostly we'll use the cluster one +# but some of the others are good to look at sometimes too. This is the time consuming step. +mkdir clust +matrixClusterColumns -makeIndex=clust/exprMatrix.ix exprMatrix.tsv.gz meta.tsv \ + Main_cluster_name clust/cell_type.matrix bbi/cell_type.stats \ + Assay clust/Assay.matrix bbi/Assay.stats \ + Experiment_batch clust/Experiment_batch.matrix bbi/Experiment_batch.stats \ + Fetus_id clust/donor.matrix bbi/donor.stats \ + Organ clust/Organ.matrix bbi/Organ.stats \ + Organ_cell_lineage clust/Organ_cell_lineage.matrix bbi/Organ_cell_lineage.stats \ + RT_group clust/RT_group.matrix bbi/RT_group.stats \ + sex clust/sex.matrix bbi/sex.stats + +# Get the first column (the genes) out of expression matrix. 
+cut -f 1 clust/cell_type.matrix > gene.lst + + +# Figure out the geneset they used and generate mapping file +gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed +# best is gencodeV19 as id on hg19 with 60284 of 63562 (94.8428%) hits + + +# Turn some into barChart, and then bigBarChart +foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex) + matrixToBarChartBed clust/$s.matrix mapping.bed clust/$s.bed -stats=bbi/$s.stats -trackDb=clust/$s.ra + bedSort clust/$s.bed clust/$s.bed + bedToBigBed clust/$s.bed /hive/data/genomes/hg19/chrom.sizes bbi/$s.bb -type=bed6+3 -as=/cluster/home/kent/src/hg/lib/simpleBarChartBed.as +end + +# Make up special colors for cell_type. First manually create two column +# file that relates at least some of sample labels to cell types we have colors for. +# Call this file cell_type.labels. +matrixClusterColumns clust/cell_type.matrix cell_type.labels cluster clust/cell_type.unnormed clust/cell_type.restats +matrixNormalize column sum clust/cell_type.unnormed clust/cell_type.ref +#hcaColorCells clust/cell_type.ref ../typeColors.tsv clust/cell_type.matrix clust/cell_type.refStats -trackDb=clust/cell_type.colors -stats=bbi/cell_type.stats + +# Use same colors for some others +foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex) + hcaColorCells clust/cell_type.ref ../typeColors.tsv clust/$s.matrix clust/$s.refStats -trackDb=clust/$s.colors -stats=bbi/$s.stats +end + +# Link files needed by browser at runtime to the /gbdb dir +mkdir /gbdb/hg19/bbi/fetalGeneAtlas +foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex) + ln -s /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas/bbi/$s.bb /gbdb/hg19/bbi/fetalGeneAtlas/ + ln -s /hive/data/genomes/hg19/bed/singleCell/fetalGeneAtlas/bbi/$s.stats /gbdb/hg19/bbi/fetalGeneAtlas/ +end + + +hgBbiDbLink hg19 fetalGeneAtlasCellTypes 
/gbdb/hg19/bbi/fetalGeneAtlas/cell_type.bb +hgBbiDbLink hg19 fetalGeneAtlasDonor /gbdb/hg19/bbi/fetalGeneAtlas/donor.bb +hgBbiDbLink hg19 fetalGeneAtlasAssay /gbdb/hg19/bbi/fetalGeneAtlas/Assay.bb +hgBbiDbLink hg19 fetalGeneAtlasExperiment /gbdb/hg19/bbi/fetalGeneAtlas/Experiment_batch.bb +hgBbiDbLink hg19 fetalGeneAtlasOrgan /gbdb/hg19/bbi/fetalGeneAtlas/Organ.bb +hgBbiDbLink hg19 fetalGeneAtlasOrganCellLineage /gbdb/hg19/bbi/fetalGeneAtlas/Organ_cell_lineage.bb +hgBbiDbLink hg19 fetalGeneAtlasRtGroup /gbdb/hg19/bbi/fetalGeneAtlas/RT_group.bb +hgBbiDbLink hg19 fetalGeneAtlasSex /gbdb/hg19/bbi/fetalGeneAtlas/sex.bb + +# Add the bits from clust/*.ra and clust/*.colors to hg19/trackDb.ra and you should be good. +foreach s (cell_type Assay Experiment_batch donor Organ Organ_cell_lineage RT_group sex) + echo >> clust/$s.ra +end +cat clust/*.ra > tracks.ra +