2735c94211fbf200b2aaf0d349c4f5ceb74fe633 angie Thu Nov 14 14:48:08 2019 -0800 hg38 finally gets 1000Genomes Phase 3 Variants, yay! refs #21805
diff --git src/hg/makeDb/doc/hg38/variation.txt src/hg/makeDb/doc/hg38/variation.txt
index ec1c7c7..d1d5cfb 100644
--- src/hg/makeDb/doc/hg38/variation.txt
+++ src/hg/makeDb/doc/hg38/variation.txt
@@ -1807,15 +1807,56 @@
 ##############################################################################
 # SNPMASKED SEQUENCE FOR SNP151 (DONE 4/23/18 angie)
 # Redmine #21010
     screen -S mask -t mask
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 151 -debug
 # *** Steps were performed in /hive/data/genomes/hg38/snp151Mask.2018-04-23
     cd /hive/data/genomes/hg38/snp151Mask.2018-04-23
     ~/kent/src/hg/utils/automation/doDbSnpMaskSequence.pl hg38 151 \
       >>& do.log &
     tail -f do.log
 # *** All done!
 ##############################################################################
+# DBSNP152 / DBSNP153 / bigDbSnp: see ../bigDbSnp.txt
+
+
+##############################################################################
+# 1000 GENOMES PHASE 3 VARIANT CALLS (DONE 11/13/19 angie)
+    screen -S tgp -t tgp
+    mkdir /hive/data/genomes/hg38/bed/1000GenomesPhase3
+    cd /hive/data/genomes/hg38/bed/1000GenomesPhase3
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_README.txt
+    wget --timestamping \
+      ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr\*.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_MANIFEST.txt
+    # Check md5sums
+    tawk '{print $3, $1;}' 20190312_biallelic_SNV_and_INDEL_MANIFEST.txt | g -v tbi > md5sums.check
+    md5sum -c md5sums.check
+#./ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
+#...
+#./ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
+    # Index them locally instead of downloading, probably quicker.
+    for f in *.vcf.gz; do
+      echo indexing $f
+      tabix -p vcf $f
+    done
+    # Install the files
+    mkdir /gbdb/hg38/1000Genomes
+    ln -s `pwd`/*.vcf.gz* /gbdb/hg38/1000Genomes/
+    cp /dev/null tgpPhase3.txt
+    for c in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X; do
+      file=ALL.chr$c.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+      echo -e "/gbdb/hg38/1000Genomes/$file\tchr$c" >> tgpPhase3.txt
+    done
+    # hgBbiDbLink doesn't support the seq column so use hgLoadSqlTab:
+    hgLoadSqlTab hg38 tgpPhase3 ~/kent/src/hg/lib/bbiChroms.sql tgpPhase3.txt
+    # Make a chromosomes line for trackDb (no alts, no Y!):
+    hgsql hg38 -NBe 'select seqName from tgpPhase3' | xargs echo | sed -e 's/ /,/g'
+#chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX
+    # I don't see counts of SNPs / indels documented anywhere, so extract:
+    time (zcat ALL.chr*.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | g -v ^# | cut -f 8 | sed -re 's/.*VT=//; s/;.*//' | sort | uniq -c | head -100)
+
+
+##############################################################################