418b11748d3ae9c1217714a92d4a7fe46b6819b9
chmalee
  Thu May 28 17:14:54 2020 -0700
Added a few more trios to the 1000 Genomes trio track QA ready, refs #25582

diff --git src/hg/makeDb/doc/hg38/variation.txt src/hg/makeDb/doc/hg38/variation.txt
index 1e06434..a61f1c8 100644
--- src/hg/makeDb/doc/hg38/variation.txt
+++ src/hg/makeDb/doc/hg38/variation.txt
@@ -1861,16 +1861,111 @@
 
 ##############################################################################
 # DGV with filters - DONE Jan 01 2020
     zcat dgvSupporting.bed.gz | tawk '{print $0, $3-$2}' | sort -k1,1 -k2,2n > dgvSupportingWithSize.bed
     zcat dgvMerged.bed.gz | tawk '{print $0, $3-$2}' | sort -k1,1 -k2,2n > dgvMergedWithSize.bed
     bedToBigBed -tab -as=/hive/data/genomes/hg19/bed/dgv/160810/dgvPlusSize.as -type=bed9+14 dgvMergedWithSize.bed  /hive/data/genomes/hg38/chrom.sizes dgvMerged.bb
     # pass1 - making usageList (41 chroms): 333 millis
     # pass2 - checking and writing primary data (389106 records, 23 fields): 9803 millis
     bedToBigBed -tab -as=/hive/data/genomes/hg19/bed/dgv/160810/dgvPlusSize.as -type=bed9+14 dgvSupportingWithSize.bed  /hive/data/genomes/hg38/chrom.sizes dgvSupporting.bb
     # pass1 - making usageList (41 chroms): 2695 millis
     # pass2 - checking and writing primary data (6584665 records, 23 fields): 27859 millis
     mkdir -p /gbdb/hg38/dgv/
     cd /gbdb/hg38/dgv
     ln -s /hive/data/genomes/hg38/bed/dgv/160906/dgvMerged.bb
     ln -s /hive/data/genomes/hg38/bed/dgv/160906/dgvSupporting.bb
+
+##############################################################################
+# Trios for 1000 Genomes Phase 3 Data - DONE 05/28/20 - ChrisL
+
+    # download the related samples
+    cd /hive/data/genomes/hg38/bed/1000GenomesPhase3
+    mkdir related_samples
+    cd related_samples/
+
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_README.txt
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/20190312_biallelic_SNV_and_INDEL_MANIFEST.txt
+    time for i in {1..22}; do wget --timestamping http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/supporting/related_samples/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz; done &> wget.log
+    # real    43m6.593s
+    # user    0m2.132s
+    # sys 0m16.292s
+    wget --timestamping http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/supporting/related_samples/ALL.chrX.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz                  
+
+    # Check md5sums
+    tawk '{print $3, $1;}' 20190312_related_biallelic_SNV_and_INDEL_MANIFEST.txt | grep -v tbi > md5sums.check
+    md5sum -c md5sums.check
+    #./ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
+    #...
+    #./ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz: OK
+    # Index them locally instead of downloading, probably quicker.
+
+    for f in *.vcf.gz; do
+      echo indexing $f
+      tabix -p vcf $f
+    done
+
+    # Download 1000 Genomes Family tree info and find existing trios:
+    wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v2.20130502.ALL.ped
+    tawk '{if ($2 != 0 && $3 != 0 && $4 != 0) printf "%s\t%s,%s\t%s_%s_%s\n", $2,$4,$3,$2,$1,$7 }' integrated_call_samples_v2.20130502.ALL.ped > allTrios.txt
+
+    # this is not strictly "child" IDs if there are grandparents or something but close enough
+    cut -f1 allTrios.txt | tail -n +2 > childIds.txt
+    bcftools query -l ../related_samples/ALL.chr22.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz | grep -Fwf childIds.txt  - | grep -wf - allTrios.txt > trackTrios.txt
+    bcftools query -l ../ALL.chr22.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | grep -Fwf childIds.txt  - | grep -wf - allTrios.txt >> trackTrios.txt
+
+    # this below script runs the various bcftools commands to merge VCFs together:
+    # subset sample: bcftools view --no-version -s 'sample1,sample2,...'
+    # merge two VCFs: "bcftools merge --no-version file1 file2
+    # remove homozygous reference variants (artifacts of subsetting the whole pop dataset):
+    #   bcftools view --no-version -i \"GT[*]!='RR'\" "
+    # this is a parasol run as it can take ~20 minutes to run on one chromosome
+
+    # first make a configuration file so we know where the VCFs are and stuff:
+    cat << _EOF > conf.txt
+    relationshipFile=trackTrios.txt
+    primaryVCF=/hive/data/genomes/hg38/bed/1000GenomesPhase3/ALL.*.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+    suppVCF=/hive/data/genomes/hg38/bed/1000GenomesPhase3/related_samples/ALL.*.shapeit2_integrated_snvindels_v2a_related_samples_27022019.GRCh38.phased.vcf.gz
+    _EOF
+
+    ./makeTrio.py -v conf.txt
+
+    cd work
+    para time
+    # 161 jobs in batch
+    # 0 jobs (including everybody's) in Parasol queue or running.
+    # Checking finished jobs
+    # Completed: 161 of 161 jobs
+    # CPU time in finished jobs:     277100s    4618.34m    76.97h    3.21d  0.009 y
+    # IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
+    # Average job time:                1634s      27.23m     0.45h    0.02d
+    # Longest finished job:            3753s      62.55m     1.04h    0.04d
+    # Submission to last job:          3766s      62.77m     1.05h    0.04d
+    # Estimated complete:                 0s       0.00m     0.00h    0.00d
+
+    # now load the per chrom VCF tables:
+    ./loadTables.sh
+
+    # Get some stats:
+    for d in output/*; do echo $d; for f in $d/*.vcf.gz; do zgrep -v "^#"  $f | cut -f8 | sed -re 's/.*VT=//; s/;.*//' | sort | uniq -c; done | awk '{arr[$2] += $1}END{for (i in arr){printf "%s: %d\n", i, arr[i]}}'; done
+    output/HG00702_SH089_CHS
+    SNP: 4522294
+    INDEL: 589214
+    output/HG00733_PR05_PUR
+    SNP: 4783649
+    INDEL: 621948
+    output/HG02024_VN049_KHV
+    SNP: 4554542
+    INDEL: 597609
+    output/NA12878_1463_CEU
+    SNP: 4565394
+    INDEL: 593052
+    output/NA19240_Y117_YRI
+    SNP: 5954728
+    INDEL: 728480
+    output/NA19675_m004_MXL
+    SNP: 4763107
+    INDEL: 619293
+    output/NA19685_m011_MXL
+    SNP: 4758396
+    INDEL: 619294
+
 ##############################################################################