50c0049649c7268bd12ab0689cda7f8261b46a63 chmalee Mon May 6 16:11:42 2019 -0700 DGV Gold track mostly ready for QA, refs #23371 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 47b51f2..fb4fb50 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33352,15 +33352,124 @@ # Update human/hg19/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. [ONLY if it's going to be pushed] # edit human/hg19/trackDb.wgEncode.ra to add new .ra file include make DBS=hg19 # edit all.joiner to add ~/tmp/gencodeV30lift37.joiner # verify with: pushd /hive/data/genomes/hg19/bed/gencodeV30lift37 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck # commit all ############################################################################## +# DGV GOLD (DATABASE OF GENOMIC VARIANTS GOLD STANDARD) (DONE 5/06/19 ChrisL) +# Redmine #23371 +############################################################################## + TODAY=`date +%y%m%d` + mkdir -p /hive/data/genomes/hg19/bed/dgv/$TODAY + cd /hive/data/genomes/hg19/bed/dgv/$TODAY + wget http://dgv.tcag.ca/dgv/docs/DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 + + # GFF3 with the 9th field full of extra info that we need to recreate the blocks + # as seen at the DGV website. See note-6 in the redmine (23371) for an example + # of the different cnv representations (1, 2, or 3 blocks). + + # what sub-fields are in the 9th field: + head -1 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | cut -f9 | tr ';' '\n' | cut -d'=' -f1 + # ID + # Name + # variant_type + # variant_sub_type + # outer_start + # inner_start + # inner_end + # outer_end + # inner_rank + # num_variants + # variants + # num_studies + # Studies + # num_platforms + # Platforms + # number_of_algorithms + # algorithms + # num_samples + # samples + # Frequency + # PopulationSummary + # num_unique_samples_tested + + # and how many unique CNV regions? 
+ cut -f9 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | tr ';' '\t' | cut -f1 | cut -d'=' -f2 | sort -u | wc -l + # 38185 + wc -l DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 + # 114555 DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 + calc 114555 /3 + # 114555 /3 = 38185.000000 + + # run script to process the bedlines out of each of the gff lines + ~/kent/src/hg/utils/automation/translateDgvGold.py DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 | sort -k1,1 -k2,2n | uniq > dgvGold.bed12 + + cat << EOF > dgvGold.as +table dgvGold +"Database of Genomic Variants Gold Standard Curated Variants" + ( + string chrom; "Chromosome name" + uint chromStart; "Maximum boundary of CNV" + uint chromEnd; "Maximum boundary of CNV" + string name; "Name from gff" + uint score; "Not used" + char[1] strand; "Not used" + uint thickStart; "Same as chromEnd" + uint thickEnd; "Same as chromEnd" + uint reserved; "Color of item. Blue for gain and red for loss" + int blockCount; "Number of blocks" + int[blockCount] blockSizes; "Size of each block" + int[blockCount] chromStarts; "Start position of each block relative to chromStart" + string dgvID; "Name of CNV from DGV" + string variant_type; "CNV" + string variant_sub_type; "Gain or Loss" + int inner_rank; "Rank used to assign the blocks" + int num_variants; "Number of variants coalesced to form the entire region" + string[num_variants] variants; "Supporting variants" + int num_studies; "Number of studies" + string[num_studies] Studies; "Study names in 'Name Year' format" + int num_platforms; "Number of sequencing platforms" + string[num_platforms] Platforms; "Sequencing platform names" + int number_of_algorithms; "Number of CNV detection algorithms" + string[number_of_algorithms] algorithms; "CNV detection algorithms used" + int num_samples; "Number of samples" + string[num_samples] samples; "Sample names" + string 
PopulationSummary; "Populations tested across all studies" + int num_unique_samples_tested; "Number of unique samples tested" + ) +EOF + + CHROMSIZES=/hive/data/genomes/hg19/chrom.sizes + bedToBigBed -type=bed12+17 -as=dgvGold.as -tab dgvGold.bed12 $CHROMSIZES dgvGold.bb + bigBedInfo dgvGold.bb + # version: 4 + # fieldCount: 29 + # hasHeaderExtension: yes + # isCompressed: yes + # isSwapped: 0 + # extraIndexCount: 0 + # itemCount: 38,185 + # primaryDataSize: 30,841,362 + # primaryIndexSize: 6,892 + # zoomLevels: 8 + # chromCount: 24 + # basesCovered: 580,564,080 + # meanDepth (of bases covered): 3.668451 + # minDepth: 1.000000 + # maxDepth: 81.000000 + # std of depth: 5.825349 + + # link into gbdb + mkdir -p /gbdb/hg19/dgv + ln -s `pwd`/dgvGold.bb /gbdb/hg19/dgv/ + +##############################################################################