src/hg/makeDb/doc/hg19.txt 1.115

1.115 2010/05/27 23:52:00 angie
Made snp131-masked genome sequences.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.114
retrieving revision 1.115
diff -b -B -U 4 -r1.114 -r1.115
--- src/hg/makeDb/doc/hg19.txt	26 May 2010 16:17:24 -0000	1.114
+++ src/hg/makeDb/doc/hg19.txt	27 May 2010 23:52:00 -0000	1.115
@@ -9834,8 +9834,138 @@
       snp131CodingDbSnp.bed
 #Loaded 443454 elements of size 11
 
 
+############################################################################
+# SNPMASKED SEQUENCE FOR SNP131 (DONE 5/27/10 angie)
+    mkdir /hive/data/genomes/hg19/snp131Mask
+    cd /hive/data/genomes/hg19/snp131Mask
+
+    # Identify rsIds with various problems -- we will exclude those.
+    # MultipleAlignments is kinda broad because anything that maps on
+    # both chrN and chrN_foo_hap1 will be excluded... similarly, extra
+    # matches on chrN_random might disqualify good matches on chrN.
+    # Well, erring on the side of caution is good.
+    awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
+      /hive/data/outside/dbSNP/131/human/snp131Exceptions.bed \
+      | sort -u \
+      > snp131ExcludeRsIds.txt
+    time grep -vFwf snp131ExcludeRsIds.txt \
+      /hive/data/outside/dbSNP/131/human/snp131.bed \
+      > snp131Cleaned.bed
+#193.507u 5.203s 4:07.62 80.2%   0+0k 0+0io 0pf+0w
+
+    # Substitutions:
+    mkdir substitutions
+    snpMaskSingle snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
+    | faSplit byname stdin substitutions/
+    # 180 warnings about differing observed strings at same base position --
+    # saved as diffObserved.txt.
+#Masked 17377658 snps in 17377491 out of 3099287100 genomic bases
+#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3099287100 (difference is 37874164)
+#52.903u 10.964s 4:49.08 22.0%   0+0k 0+0io 3pf+0w
+    # Check that 37874164 is the total #bases in sequences with nothing in snp131Cleaned:
+    grep -Fw single snp131Cleaned.bed | cut -f 1 | uniq > /tmp/1
+    grep -vwf /tmp/1 ../chrom.sizes
+    grep -vwf /tmp/1 ../chrom.sizes \
+    | awk 'BEGIN {TOTAL = 0;}  {TOTAL += $2;}  END {printf "%d\n", TOTAL;}'
+#37874164
+#TODO: send list to dbSNP.
+    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
+    # and first diffs' case is preserved:
+    foreach f (substitutions/chr*.fa)
+      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ"
+    end
+#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10491 (y != c)
+#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 61004 (r != a)
+#...
+#(output OK -- ambiguous bases replacing [agct] at SNP positions)
+    foreach f (substitutions/chr*.fa)
+      echo $f:t:r
+      mv $f $f:r.subst.fa
+      gzip $f:r.subst.fa
+    end
+
+    # Insertions:
+    mkdir insertions
+    snpMaskAddInsertions snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
+    | faSplit byname stdin insertions/
+#Added 2496221 snps totaling 5939697 bases to 3098816404 genomic bases
+#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3098816404 (difference is 38344860)
+#52.764u 12.593s 3:55.47 27.7%   0+0k 0+0io 2pf+0w
+    # Again, that just means that some chroms didn't have filtered SNPs.
+    # Make sure that all sizes have increased relative to original:
+    foreach f (insertions/chr*.fa)
+      echo -n "${f:t:r}: "
+      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ" \
+      |& perl -we '$_=<>; \
+           if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
+             if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
+             else {die "ERROR: ins size $1 <= $2\n";} \
+           } else {die $_;}'
+    end
+#chr1: OK: ins size 249717078 > 249250621
+#chr10: OK: ins size 135805198 > 135534747
+#...
+#(output OK -- new sizes > old)
+    foreach f (insertions/chr*.fa)
+      mv $f $f:r.ins.fa
+      gzip $f:r.ins.fa
+    end
+
+    # Deletions:
+    mkdir deletions
+    snpMaskCutDeletions snp131Cleaned.bed /hive/data/genomes/hg19/hg19.2bit stdout \
+    | faSplit byname stdin deletions/
+#Cut 1522178 snps totaling 3455905 bases from 3098701788 genomic bases
+#/hive/data/genomes/hg19/hg19.2bit has 3137161264 total bases, but the total number of bases in sequences for which we masked snps is 3098701788 (difference is 38459476)
+#114.251u 20.911s 4:24.26 51.1%  0+0k 0+0io 3pf+0w
+    # Again, that just means that some chroms didn't have filtered SNPs.
+    # Make sure that all sizes have decreased relative to original:
+    foreach f (deletions/chr*.fa)
+      echo -n "${f:t:r}: "
+      faCmp -softMask $f ../[1-9UMXY]*/$f:t |& grep -v "that differ" \
+      |& perl -we '$_=<>; \
+           if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
+             if ($1 < $2) {print "OK: del size $1 < $2\n";} \
+             else {die "ERROR: del size $1 >= $2\n";} \
+           } else {die $_;}'
+    end
+#chr1: OK: del size 248968549 < 249250621
+#chr10: OK: del size 135378065 < 135534747
+#...
+#(output OK -- del sizes < old)
+    foreach f (deletions/chr*.fa)
+      mv $f $f:r.del.fa
+      gzip $f:r.del.fa
+    end
+
+    # Clean up and prepare for download:
+    gzip snp131Cleaned.bed
+    foreach d (substitutions insertions deletions)
+      pushd $d
+        md5sum *.gz > md5sum.txt
+        cp /hive/data/genomes/hg18/snp130Mask/$d/README.txt .
+      popd
+    end
+    # Edit the README.txt in each subdir.
+
+    # Create download links on hgwdev.
+    # NOTE: Currently we offer only the substitutions.
+    # If we get any user requests, then maybe we can put the insertions
+    # and deletions out there.
+    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask
+    ln -s /hive/data/genomes/hg19/snp131Mask/substitutions/* \
+      /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/
+## If there is user demand for ins & del, then start over with an empty
+## goldenPath/snp131Mask and do this:
+##    foreach type (substitutions insertions deletions)
+##      mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/$type
+##      ln -s /hive/data/genomes/hg19/snp131Mask/$type/* \
+##        /usr/local/apache/htdocs-hgdownload/goldenPath/hg19/snp131Mask/$type/
+##    end
+
+
 ##############################################################################
 #  RE-BUILD sno/miRNA TRACK (DONE - 04-20-2010 - Chin)
 
     # The data in this track is out of date so update the track.