src/hg/makeDb/doc/hg18.txt 1.356

1.356 2009/03/12 18:24:04 angie
Fixed coords of hgdpHzyBantu and reloaded.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.355
retrieving revision 1.356
diff -b -B -U 4 -r1.355 -r1.356
--- src/hg/makeDb/doc/hg18.txt	11 Mar 2009 18:31:44 -0000	1.355
+++ src/hg/makeDb/doc/hg18.txt	12 Mar 2009 18:24:04 -0000	1.356
@@ -26841,16 +26841,15 @@
 #Loaded 660280 elements of size 7
 
 
 #############################################################################
-# HGDP HETEROZYGOSITY (DONE 2/12/09 angie, except for Bantu 3/02/09)
+# HGDP HETEROZYGOSITY (DONE 2/12/09 angie, except for Bantu 3/12/09)
     mkdir /hive/data/genomes/hg18/bed/hgdpHzy
     cd /hive/data/genomes/hg18/bed/hgdpHzy
     foreach continent (african americas easia european mideast oceania sasia)
       wget --timestamping http://hgdp.uchicago.edu/data/hzy/$continent.gff.gz
     end
     wget --timestamping http://hgdp.uchicago.edu/data/hzy/allbantu.hzy.gff.gz
-#*** waiting to hear back from Joe about whether that's the right file
     foreach continent (african allbantu americas easia european mideast oceania sasia)
       set bedGraph = `echo $continent \
                       | sed -re 's/can$/ca/; s/pean$/pe/; s/asia/Asia/; s/allbantu/bantu/; \
                                  s/(.*)/hgdpHzy\u\1.bedGraph/'`
@@ -26858,17 +26857,32 @@
       zcat $continent.gff.gz \
       | awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
         > $bedGraph
     end
+
+    # 3/12/09: All of the original files' coords were intervals between SNPs,
+    # but the Bantu file had SNP coordinates, and therefore one more line per
+    # chrom than the others (N SNPs bound only N-1 intervals).  So (after
+    # getting OK from Joe) I am transforming the Bantu SNP coords to intervals
+    # like the others.
+    perl -we 'while (<>) { \
+      chomp; ($c, $s, undef, $h) = split; \
+      if (defined $lastC) { \
+        if ($lastC eq $c) { \
+          print "$c\t$lastS\t$s\t$lastH\n"; \
+        } # Discarding last SNP on each chrom \
+      } \
+      ($lastC, $lastS, $lastH) = ($c, $s, $h); \
+    }' \
+      hgdpHzyBantu.bedGraph > tmp
+    mv tmp hgdpHzyBantu.bedGraph
+
     # Using bedGraph, not wig, because there are only 640k datapoints and 
     # some are over the 10Mbase wiggle item size limit.
     foreach f (*.bedGraph)
       hgLoadBed hg18 $f:r $f -bedGraph=4
     end
     # All have the same size:
 #Loaded 640676 elements of size 4
-    # except for Bantu:
-#Loaded 640698 elements of size 4
 
 
 #############################################################################
 # HGDP FST (DONE 2/12/09 angie)