src/hg/makeDb/doc/hg18.txt 1.406

1.406 2010/03/25 21:35:39 angie
Some really old changes: snp130 func annotation fixes, hapmap and 1000 genomes experimental work (some interrupted).
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.405
retrieving revision 1.406
diff -b -B -U 4 -r1.405 -r1.406
--- src/hg/makeDb/doc/hg18.txt	19 Mar 2010 00:23:03 -0000	1.405
+++ src/hg/makeDb/doc/hg18.txt	25 Mar 2010 21:35:39 -0000	1.406
@@ -3323,8 +3323,9 @@
 #LOOP
 /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc {check out line+ out/$(root2)/$(root1).$(root2).psl}
 #ENDLOOP
 '_EOF_'
+    # << emacs
     gensub2 contigs.lst bacends.lst template jobList
     foreach f (`cat bacends.lst`)
         set d = $f:r:t
         echo $d
@@ -21630,9 +21631,8 @@
 # snp129Seq) to .2008-08-08 00:00:00 to avoid unwarranted joinerCheck
 # time discrepancy errors. (8/8/08, brooke)
 
     # Set up build directory
-    ssh kkstore06
     mkdir -p /cluster/store3/dbSNP129/{human,shared}
     ln -s /cluster/store3/dbSNP129 /cluster/data/dbSNP/129
 
     # Get field encodings -- if there are changes or additions to the
@@ -22367,8 +22367,10 @@
 
 ############################################################################
 # dbSNP BUILD 130 (UPDATED 8/18/09 angie)
 # Originally done 5/22/09.
+# Functional annotations restricted by mapping position 7/7.
+# dbSNP corrections applied to func field 8/18.
     # Set up build directory
     mkdir -p /hive/data/outside/dbSNP/130/{human,shared}
 
     # Get field encodings -- if there are changes or additions to the
@@ -22513,33 +22515,46 @@
 
     #################### EXTRACT INFO FROM NCBI TABLES ####################
     # Glom each SNP's function codes together and load up a new hg18Snp130 table.
     # Also extract NCBI's annotations of coding SNPs' effects on translation.
-    # ContigLocusId includes contig_acc and asn_{from,to} but those are not comprehensive!
-    # If a SNP has been mapped to multiple contigs, one is randomly selected, and if 
-    # it is not a reference contig, we miss out if we restrict by contig.  We may end
-    # up getting a few spurious functional annotations from mappings to other assemblies
-    # but them's the breaks.
+    # We extract ContigLocusId info only for reference assembly mapping.
+    # Some SNPs' functional annotations are for an alternate assembly, so we will
+    # have no NCBI functional annotations to display for those (but our own are 
+    # available).
     cd /hive/data/outside/dbSNP/130/human
-    hgsql hg18snp130 -NBe 'select snp_id, mrna_acc, fxn_class, \
-                           reading_frame, allele, residue, codon from ContigLocusId' \
+    hgsql hg18snp130 -NBe 'select snp_id, ci.contig_acc, asn_from, asn_to, mrna_acc, \
+                           fxn_class, reading_frame, allele, residue, codon, cli.ctg_id \
+                           from ContigLocusId as cli, ContigInfo as ci \
+                           where cli.ctg_id = ci.ctg_id and \
+                           group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")' \
       > ncbiFuncAnnotations.txt
-    cut -f 1,3 ncbiFuncAnnotations.txt \
-    | sort -u -k1,1 -k2,2n  \
-    | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
-            else { if (prevId) {print prevId "\t" prevFunc;} \
-                                prevFunc = $2 ","; }} \
-           {prevId = $1;} \
-           END {print prevId "\t" prevFunc;}' \
+    # Ignore function code 8 (cds-reference, which just means that some allele matches the reference)
+    # and glom functions for each SNP id:
+    cut -f 1-4,6,11 ncbiFuncAnnotations.txt \
+    | sort -u -k1n,1n -k6n,6n -k3n,3n -k5n,5n \
+    | perl -we 'while (<>) { chomp; \
+                  ($id, undef, $s, $e, $f, $c) = split; \
+                  if (defined $prevId && $id == $prevId && $c == $prevC && $s == $prevS) { \
+                    $prevFunc .= "$f," unless ($f == 8); \
+                  } else { \
+                    print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n" if (defined $prevId); \
+                    $prevFunc = ($f == 8) ? "" : "$f,"; \
+                  } \
+                  ($prevId, $prevC, $prevS, $prevE) = ($id, $c, $s, $e); \
+                } \
+                print "$prevId\t$prevC\t$prevS\t$prevE\t$prevFunc\n"' \
       > ucscFunc.txt
-
     wc -l ucscFunc.txt
-#7344853 ucscFunc.txt
+#7035685 ucscFunc.txt
     cat > ucscFunc.sql <<EOF
 CREATE TABLE ucscFunc (
         snp_id int NOT NULL ,
+        ctg_id int(10) NOT NULL ,
+        asn_from int(10) NOT NULL ,
+        asn_to int(10) NOT NULL ,
         fxn_class varchar(255) NOT NULL ,
-        INDEX snp_id (snp_id)
+        INDEX snp_id (snp_id),
+        INDEX ctg_id (ctg_id)
 );
 EOF
     hgLoadSqlTab hg18snp130 ucscFunc{,.sql,.txt}
 
@@ -22568,9 +22583,9 @@
     # Add indices to tables for a big join (5 or 6 minutes):
     hgsql hg18snp130 -e \
       'alter table ContigLoc  add index (ctg_id); \
        alter table ContigInfo add index (ctg_id); \
-       alter table ContigLoc  add index (snp_id); \
+       alter table ContigLocusId add index (snp_id); \
        alter table SNP        add index (snp_id); \
        alter table MapInfo    add index (snp_id);'
 
     # Big leftie join to bring together all of the columns that we want in snp130,
@@ -22586,18 +22601,22 @@
                   ci.group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")) \
           LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
          LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
         LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
-       LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id;' \
+       LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id and uf.ctg_id = cl.ctg_id \
+                                and uf.asn_from = cl.asn_from;' \
       > ucscNcbiSnp.ctg.bed
 #on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0%   0+0k 0+0io 0pf+0w
-#on a very busy hgwdev:   78.753u 41.304s 30:19.77 6.5%   0+0k 0+0io 0pf+0w
+#on a busy hgwdev:   78.753u 41.304s 30:19.77 6.5%   0+0k 0+0io 0pf+0w
+#on hgwdev with giant chains loading in parallel:
+#                         78.213u 33.826s 58:16.41 3.2%   0+0k 0+0io 0pf+0w
     wc -l ucscNcbiSnp.ctg.bed 
 #19189750 ucscNcbiSnp.ctg.bed
+
     liftUp ucscNcbiSnp.bed \
-      /cluster/data/hg18/jkStuff/liftContigs.lft warn \
+      /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn \
       ucscNcbiSnp.ctg.bed
-#116.027u 7.078s 2:27.93 83.2%   0+0k 0+0io 0pf+0w
+#119.644u 8.992s 2:36.67 82.1%   0+0k 0+0io 3pf+0w
 
     # Drum roll please... translate NCBI's encoding into UCSC's, and
     # perform a bunch of checks.  This is where developer involvement
     # is most likely as NCBI extends the encodings used in dbSNP.
@@ -22612,9 +22631,9 @@
 #count of snps with weight  2 = 389501
 #count of snps with weight  3 = 1189989
 #count of snps with weight 10 = 281391
 #Found no errors.
-#143.111u 14.313s 3:15.18 80.6%  0+0k 0+0io 0pf+0w
+#163.878u 10.302s 3:33.84 81.4%	0+0k 0+0io 0pf+0w
     wc -l snp*
 #  18833531 snp130.bed
 #        22 snp130.sql
 #         0 snp130Errors.bed
@@ -22645,24 +22664,20 @@
     cd /hive/data/outside/dbSNP/130/human
     time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
       hg18 snp130 -sqlTable=snp130.sql snp130.bed
 #Loaded 18833531 elements of size 17
-#107.246u 11.546s 10:49.23 18.2% 0+0k 0+0io 0pf+0w
+#114.088u 12.924s 12:54.18 16.4%	0+0k 0+0io 0pf+0w
     time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
       hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
       snp130Exceptions.bed
 #15.255u 1.257s 1:11.11 23.2%    0+0k 0+0io 0pf+0w
-    sed -e 's/snp125/snp130/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
-      > snp130ExceptionDesc.sql
-    hgLoadSqlTab hg18 snp130ExceptionDesc snp130ExceptionDesc.sql \
+    hgLoadSqlTab hg18 snp130ExceptionDesc ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
       snp130ExceptionDesc.tab
     # Load up sequences.
-    sed -e 's/snpSeq/snp130Seq/' ~/kent/src/hg/lib/snpSeq.sql \
-      > snp130Seq.sql
     mkdir -p /gbdb/hg18/snp
     ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa
-    time nice hgLoadSqlTab hg18 snp130Seq snp130Seq.sql snp130Seq.tab
-#0.001u 0.004s 3:41.13 0.0%      0+0k 0+0io 0pf+0w
+    time nice hgLoadSqlTab hg18 snp130Seq ~/kent/src/hg/lib/snpSeq.sql snp130Seq.tab
+#0.005u 0.002s 6:02.78 0.0%	0+0k 0+0io 0pf+0w
 
     # Put in a link where one would expect to find the track build dir...
     ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130
 
@@ -22717,9 +22732,11 @@
     # those.  E.g. rs437678.
 
 
 #######################################################################
-# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/15/09 angie)
+# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/26/09 angie)
+# Originally done 5/15; reloaded 5/26 after making sure no coords had changed;
+# reloaded 7/7/09 to bump timestamp.
     mkdir /hive/data/genomes/hg18/bed/snp130Ortho
     cd /hive/data/genomes/hg18/bed/snp130Ortho
 
 # Following Heather's lead in snp126orthos, filter SNPs to keep
@@ -22876,8 +22893,198 @@
     nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed
     rm -r run*/split tmp.txt *.orthoGlom.txt
 
 
+#######################################################################
+# DBSNP CODING ANNOTATIONS (DONE 7/7/09 angie)
+# Originally done 6/2/09; redone w/snp130, using the mapping positions of dbSNP's functional annotations.
+    cd /hive/data/outside/dbSNP/130/human
+    # Field 9 (residue) is empty unless the annotation is coding; keep coding rows:
+    awk '$9 != "" {print;}' ncbiFuncAnnotations.txt \
+    | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n | uniq > ncbiCodingAnnotations.txt
+    wc -l ncbiCodingAnnotations.txt
+#576726 ncbiCodingAnnotations.txt
+    # How many & what kinds of function types?
+    cut -f 6 ncbiCodingAnnotations.txt \
+    | sort -n | uniq -c
+# 107963  3  (coding-synon)
+# 276197  8  (cds-reference)
+#   4664 41  (nonsense)
+# 146908 42  (missense)
+#  40994 44  (frameshift)
+    # Does everybody have a reference annotation?
+    awk '$6 == 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp1
+    awk '$6 != 8 {print $1 "\t" $5;}' ncbiCodingAnnotations.txt | uniq > tmp2
+    wc -l tmp1 tmp2
+#  276113 tmp1
+#  279647 tmp2
+    # Doh! Not everybody.  So hgTracks will sometimes have to process ref itself...
+    # Gather up multiple annotation lines into one line per {snp, gene, frame}:
+    perl -e  'while (<>) { chomp; \
+                my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \
+                if (defined $lastRs && \
+                    ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \
+                     $lastTx ne $txId || $lastFrm ne $frm)) { \
+                  if (defined $refRow) { \
+                    $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
+                    $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
+                  } \
+                  print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+                        "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
+                  $refRow = undef;  @rows = ();  ($count, $fxns, $nts, $codons, $aas) = (); \
+                } \
+                ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \
+                    ($rsId, $ctg, $s, $e, $txId, $frm); \
+                $count++; \
+                if ($fxn == 8) { \
+                  $refRow = [$fxn, $nt, $aa, $codon]; \
+                } else { \
+                 $fxns .= "$fxn,";  $nts .= "$nt,";  $aas .= "$aa,";  $codons .= "$codon,"; \
+                } \
+              } \
+              if (defined $refRow) { \
+                $fxns = "$refRow->[0],$fxns";  $nts = "$refRow->[1],$nts"; \
+                $aas = "$refRow->[2],$aas";    $codons = "$refRow->[3],$codons"; \
+              } \
+              print "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \
+                    "$count\t$fxns\t$nts\t$codons\t$aas\n";' \
+      ncbiCodingAnnotations.txt \
+        > snp130CodingDbSnp.ctg.txt
+    liftUp snp130CodingDbSnp.bed \
+      /hive/data/genomes/hg18/jkStuff/liftContigs.lft warn snp130CodingDbSnp.ctg.txt
+    hgLoadBed hg18 snp130CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
+      -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
+      snp130CodingDbSnp.bed
+#Loaded 279815 elements of size 11
+
+
+#######################################################################
+# SNPMASKED SEQUENCE FOR SNP130 (DONE 7/10/09 angie)
+    mkdir /hive/data/genomes/hg18/snp130Mask
+    cd /hive/data/genomes/hg18/snp130Mask
+
+    # Identify rsIds with various problems -- we will exclude those.
+    # MultipleAlignments is kinda broad because anything that maps on
+    # both chrN and chrN_foo_hap1 will be excluded... similarly, extra
+    # matches on chrN_random might disqualify good matches on chrN.
+    # Well, erring on the side of caution is good.
+    awk '$5 ~ /^(MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved)$/ {print $4;}' \
+      /hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \
+      | sort -u \
+      > snp130ExcludeRsIds.txt
+    time grep -vFwf snp130ExcludeRsIds.txt \
+      /hive/data/outside/dbSNP/130/human/snp130.bed \
+      > snp130Cleaned.bed
+#185.202u 4.847s 3:22.55 93.8%   0+0k 0+0io 0pf+0w
+
+    # Substitutions:
+    mkdir substitutions
+    snpMaskSingle snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \
+    | faSplit byname stdin substitutions/
+#Masked 12142171 snps in 12141860 out of 3091592211 genomic bases
+#/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091592211 (difference is 16085062)
+#94.376u 16.038s 3:10.37 57.9%   0+0k 0+0io 0pf+0w
+    # Check that 16085062 is the total #bases in sequences with nothing in snp130Cleaned:
+    cut -f 1 snp130Cleaned.bed | uniq > /tmp/1
+    grep -vwf /tmp/1 ../chrom.sizes
+    grep -vwf /tmp/1 ../chrom.sizes \
+    | awk 'BEGIN {TOTAL = 0 ; } {TOTAL += $2 ; } END {printf "%d\n", TOTAL ; }'
+#16085062
+    # 338 warnings about differing observed strings at same base position --
+    # saved as diffObserved.txt.
+#TODO: send list to dbSNP.
+    # Make sure that sizes are identical, first diffs are normal -> IUPAC,
+    # and first diffs' case is preserved:
+    foreach f (substitutions/chr*.fa)
+      faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ"
+    end
+#chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 491 (y != c)
+#chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 55877 (s != c)
+#...
+#(output OK -- ambiguous bases replacing [agct] at SNP positions)
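+#(IUPAC ambiguity codes seen above: y = c/t, s = c/g, r = a/g, w = a/t, k = g/t, m = a/c)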
+    foreach f (substitutions/chr*.fa)
+      echo $f:t:r
+      mv $f $f:r.subst.fa
+      gzip $f:r.subst.fa
+    end
+
+    # Insertions:
+    mkdir insertions
+    snpMaskAddInsertions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \
+    | faSplit byname stdin insertions/
+#Added 2464798 snps totaling 5891837 bases to 3085167749 genomic bases
+#/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
+#99.269u 17.928s 3:31.80 55.3%   0+0k 0+0io 1pf+0w
+    # Again, that just means that some chroms didn't have filtered SNPs.
+    # Make sure that all sizes have increased relative to original:
+    foreach f (insertions/chr*.fa)
+      faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
+      |& perl -we '$_=<>; \
+           if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
+             if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
+             else {die "ERROR: ins size $1 <= $2\n";} \
+           } else {die $_;}'
+    end
+#OK: ins size 247711739 > 247249719
+#OK: ins size 135642480 > 135374737
+#...
+#(output OK -- new sizes > old)
+    foreach f (insertions/chr*.fa)
+      mv $f $f:r.ins.fa
+      gzip $f:r.ins.fa
+    end
+
+    # Deletions:
+    mkdir deletions
+    snpMaskCutDeletions snp130Cleaned.bed /hive/data/genomes/hg18/hg18.2bit stdout \
+    | faSplit byname stdin deletions/
+#Cut 1514798 snps totaling 3554896 bases from 3086962619 genomic bases
+#/hive/data/genomes/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3086962619 (difference is 20714654)
+#103.312u 31.094s 3:56.12 56.9%  0+0k 0+0io 1pf+0w
+    # Again, that just means that some chroms didn't have filtered SNPs.
+    # Make sure that all sizes have decreased relative to original:
+    foreach f (deletions/chr*.fa)
+      faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
+      |& perl -we '$_=<>; \
+           if (/^\w+ in \S+ has (\d+) bases.  \w+ in \S+ has (\d+) bases/) { \
+             if ($1 < $2) {print "OK: del size $1 < $2\n";} \
+             else {die "ERROR: del size $1 >= $2\n";} \
+           } else {die $_;}'
+    end
+#OK: del size 246960459 < 247249719
+#OK: del size 135214654 < 135374737
+#...
+#(output OK -- del sizes < old)
+    foreach f (deletions/chr*.fa)
+      mv $f $f:r.del.fa
+      gzip $f:r.del.fa
+    end
+
+    # Clean up and prepare for download:
+    gzip snp130Cleaned.bed
+    foreach d (substitutions insertions deletions)
+      pushd $d
+        md5sum *.gz > md5sum.txt
+        cp ../../snp129Mask/$d/README.txt .
+      popd
+    end
+    # Edit the README.txt in each subdir.
+
+    # Create download links on hgwdev.
+    # NOTE: Currently we offer only the substitutions.
+    # If we get any user requests, then maybe we can put the insertions
+    # and deletions out there.
+    mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask
+    ln -s /hive/data/genomes/hg18/snp130Mask/substitutions/* \
+      /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/
+## If there is user demand for ins & del, then start over with an empty
+## goldenPath/snp130Mask and do this:
+##    foreach type (substitutions insertions deletions)
+##      mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type
+##      ln -s /hive/data/genomes/hg18/snp130Mask/$type/* \
+##        /usr/local/apache/htdocs/goldenPath/hg18/snp130Mask/$type/
+##    end
+
+
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
@@ -28104,8 +28311,32 @@
       -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapPhaseIIISummary.sql
 #Loaded 4166007 elements of size 18
 #33.401u 3.275s 1:46.95 34.2%    0+0k 0+0io 0pf+0w
 
+#############################################################################
+# DOWNLOAD HAPMAP PHASED GENOTYPES (PHASE III)  (DONE 2/23/09 angie)
+    mkdir -p /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2
+    cd /hive/data/outside/hapmap/phasing/2009-02_phaseIII/HapMap3_r2
+    wget --timestamping \
+      ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/\*
+    foreach pop (ASW CEU CHD GIH JPT+CHB LWK MEX MKK TSI YRI)
+      foreach type (DUOS TRIOS UNRELATED)
+        mkdir -p $pop/$type
+        pushd $pop/$type
+        wget --timestamping \
+          ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2009-02_phaseIII/HapMap3_r2/$pop/$type/\*
+        popd
+      end
+    end
+    # Looks like phased genotypes are given only for the populations with 
+    # family structure: ASW, CEU, MEX, MKK, and YRI.
+    # Next: use these data to make LD tracks (see r^2 sketch in the HAPMAP LD section below).
+
+
+#############################################################################
+# HAPMAP LD COMPUTED ON PHASED & UNPHASED GENOTYPES (TODO angie)
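+# Not started.  Sketch of the core r^2 computation for one SNP pair from
+# phased haplotypes (input format hypothetical: one haplotype per line,
+# two 0/1-coded alleles per line -- SNP A then SNP B; assumes both SNPs
+# are polymorphic in the sample):
+##    perl -we '($n, $cA, $cB, $cAB) = (0, 0, 0, 0); \
+##              while (<>) { my ($a, $b) = split; $n++; \
+##                $cA += $a;  $cB += $b;  $cAB += $a*$b; } \
+##              my ($pA, $pB, $pAB) = ($cA/$n, $cB/$n, $cAB/$n); \
+##              my $D = $pAB - $pA*$pB; \
+##              printf("D=%.4f  r^2=%.4f\n", $D, \
+##                     $D*$D / ($pA*(1-$pA)*$pB*(1-$pB)));' snpPairHaps.txt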
+
+
 
 #############################################################################
 # GERP Conservation scoring and elements for Ensembl 31-way alignments
 # From Javier Guerroro
@@ -28130,8 +28361,9 @@
         echo $f
         bzcat $f | tail -n +2 | wigEncode stdin temp.wig temp.wib
     end
 'EOF'
+    # << emacs
 
     bzcat lab/*.wig.bz2 | tail -n +2 | \
         wigEncode stdin ensembl31wayGerpScores.wig ensembl31wayGerpScores.wib
 
@@ -28170,8 +28403,82 @@
 #checked: 62958 failed: 0
     hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
     hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
 
+
+#############################################################################
+# COVERAGE FOR 1000 GENOMES HIGH-COV INDIVIDS (IN PROGRESS 6/10/09 angie)
+#TODO: try again now that wigToBigWig is more mem-efficient
+# also, new alignments have probably become available since then.
+    # wigBedToStep ran out of memory on hgwdev (w/limit of 32G)... roll our own:
+    cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes
+    foreach s (NA12878  NA12891  NA12892  NA19238  NA19239  NA19240)
+      pushd data/$s/alignment
+      foreach p (454 SLX)
+        echo "==== $s $p ===="
+        ls -1 $s.chrom*.$p.SRP000032.2009_04.bam \
+        | grep -v chromMT \
+        | xargs -L 1 samtools pileup \
+        | perl -pe '($c, $start, undef, $depth) = split; \
+                    if ($c ne $lastC || $start != $lastStart+1) { \
+                      print "fixedStep chrom=chr$c start=$start step=1 span=1\n"; \
+                    } \
+                    $_ = "$depth\n"; \
+                    ($lastC, $lastStart) = ($c, $start);' \
+        | gzip -c > cov${s}By${p}.fixedStep.gz
+        echo ""
+      end
+      popd
+    end
+
+#TODO
+    # Killing memory -- run separately:
+        | wigToBigWig -clip stdin /hive/data/genomes/hg18/chrom.sizes cov${s}By$p.bw
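+    # e.g., per sample & platform, something like this (a sketch reusing the
+    # fixedStep.gz files made above; not yet run):
+##      zcat cov${s}By${p}.fixedStep.gz \
+##      | wigToBigWig -clip stdin /hive/data/genomes/hg18/chrom.sizes cov${s}By${p}.bw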
+
+#[bam_pileup] fail to read the header of NA12878.chromY.454.SRP000032.2009_04.bam: non-exisiting file or wrong format.
+    # NA12878.chromY.454.SRP000032.2009_04.bam is an empty file.
+
+    # Load tables
+    foreach bw (`find /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes \
+                   -name cov\*.bw`)
+      ln -s $bw /gbdb/hg18/bbi/
+      hgsql hg18 -e "drop table if exists $bw:t:r; \
+                     create table $bw:t:r (fileName varchar(255) not null); \
+                     insert into $bw:t:r values ('/gbdb/hg18/bbi/$bw:t');"
+    end
+
+
+#############################################################################
+# 1000 GENOMES HIGH-COV INDIVIDS READ ALIGNMENTS (DONE 11/30/09 angie)
+    # one-off to test BAM as track type:
+    cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes
+    set testBam = NA12878/alignment/NA12878.chrom22.SRP000032.2009_02.bam
+    ln -s `pwd`/$testBam{,.bai} \
+       /gbdb/hg18/bbi/
+    hgsql hg18 -e "drop table if exists bamNA12878; \
+                   create table bamNA12878 (fileName varchar(255) not null); \
+                   insert into bamNA12878 values ('/gbdb/hg18/bbi/$testBam:t');"
+
+    # 9/14/09: update bamNA12878 to use new seqName column and try samtools'
+    # capability to fetch ftp sparsely:
+    hgsql hg18 -e "drop table if exists bamNA12878; \
+                   create table bamNA12878 (fileName varchar(255) not null, \
+                                            seqName varchar(255) not null); \
+                   insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom21.SLX.maq.SRP000032.2009_07.bam', '21'); \
+                   insert into bamNA12878 values ('/gbdb/hg18/bbi/NA12878.chrom22.SLX.SRP000032.2009_04.bam', '22');"
+    # 11/30/09: Add more remote files:
+    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Y)
+      hgsql hg18 -e "insert into bamNA12878 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12878/alignment/NA12878.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');"
+    end
+    # Add an all-remote NA12891 for testing composite track:
+    hgsql hg18 -e "create table bamNA12891 (fileName varchar(255) not null, \
+                                            seqName varchar(255) not null);"
+    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
+      hgsql hg18 -e "insert into bamNA12891 values ('ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12891/alignment/NA12891.chrom$c.SLX.maq.SRP000032.2009_07.bam', '$c');"
+    end
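+    # A quick way to confirm sparse ftp fetching works (sketch, not run as
+    # part of this build; region chosen arbitrarily; needs the remote .bai
+    # alongside the .bam):
+##    samtools view \
+##      ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/data/NA12891/alignment/NA12891.chrom21.SLX.maq.SRP000032.2009_07.bam \
+##      21:10000000-10010000 | head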
+
+
+
 ##############################################################################
 # UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
     mkdir /hive/data/genomes/hg18/bed/ucscToEnsembl
     cd /hive/data/genomes/hg18/bed/ucscToEnsembl
@@ -28813,8 +29120,52 @@
     # Added code to src/hg/hgTracks/simpleTracks.c to register a track handler
     # for vegaGeneComposite that is now used for this data. This used
     # vegaGeneMethods to display the name2 field (gene) as the item label in
     # the track.
+
+
+############################################################################
+# EPO ANCESTRAL REGIONS (DONE 8/5/09 angie)
+    # Use Aspera client to download 1000Genomes' Enredo-Pecan-Ortheus 
+    # four-catarrhini ancestral-tree calls for genome regions, as well as 
+    # their annotated fasta (requested by Sol Katzman):
+    cd /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/
+    set asperaInst = /opt/aspera/connect
+    set ascp = $asperaInst/bin/ascp
+    set aKey = $asperaInst/etc/asperaweb_id_dsa.putty
+    set aOpts = "-i $aKey -QTr -l300M -q"
+    set server = anonftp@ftp-private.ncbi.nlm.nih.gov
+    set aliDir = technical/reference/ancestral_alignments
+    mkdir -p $aliDir
+    cd $aliDir
+    foreach f (MD5SUM README README.ancestral_alignments summary.txt)
+      $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/$f .
+    end
+    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
+      $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor_$c.bed .
+      $ascp $aOpts $server\:/1000genomes/ftp/$aliDir/human_ancestor_$c.fa.bz2 .
+    end
+    chmod 444 *
+    # Check md5sums:
+    perl -wpe 'chomp; ($expSum, $f) = split; $actSum = `md5sum $f`; $actSum =~ s/ .*\n//; \
+               $_ = ""; \
+               if ($expSum ne $actSum) { warn "MISMATCH: $f exp=$expSum, actual=$actSum\n"; } \
+               else {print "$f OK\n";}' MD5SUM
+    # Shortcut requested by Sol:
+    ln -s `pwd` /hive/data/outside/ancestral.epo.hg18
+    # Load up the regions:
+    mkdir /hive/data/genomes/hg18/bed/epoAncestralRegions
+    cd /hive/data/genomes/hg18/bed/epoAncestralRegions
+    set aliPath = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/$aliDir
+    sed -e 's/^/chr/' $aliPath/human_ancestor_*.bed > epoAncestralRegions.bed
+    hgLoadBed hg18 epoAncestralRegions epoAncestralRegions.bed -tab -allowStartEqualEnd
+#Loaded 10195 elements of size 4
+    featureBits hg18 epoAncestralRegions
+#2778857014 bases of 2881515245 (96.437%) in intersection
+    featureBits hg18 -countGaps gap epoAncestralRegions
+#6232933 bases of 3107677273 (0.201%) in intersection
+
+
     # 2009-08-16 (hartera)
     # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
     # There is an index on the protein field so it can not be NULL. 
     # If there is no protein, the gene name is given.
@@ -29292,8 +29643,18 @@
     hgLoadBed hg18 gwasCatalog gwasCatalog.bed \
       -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
 #Loaded 2930 elements of size 22
 
+    # For David: find examples of risk alleles for which dbSNP observed
+    # alleles are complementary (A/T or C/G) -- how do we know what strand the
+    # risk allele is on??  -- asking corresp. author Teri Manolio.
+    hgsql hg18 -e 'select snp.name,gc.riskAllele,snp.strand,snp.refNcbi,snp.observed \
+                   from gwasCatalog as gc, snp130 as snp \
+                   where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \
+                         gc.name = snp.name and snp.observed in ("C/G", "A/T") \
+                   order by gc.name limit 20;'
+
+
 #############################################################################
 # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)
 # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
 # from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona.