src/hg/makeDb/doc/hg19.txt 1.80

1.80 2010/02/09 16:10:41 hartera
Loaded new data for segmental duplications.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.79
retrieving revision 1.80
diff -b -B -U 4 -r1.79 -r1.80
--- src/hg/makeDb/doc/hg19.txt	6 Feb 2010 02:47:48 -0000	1.79
+++ src/hg/makeDb/doc/hg19.txt	9 Feb 2010 16:10:41 -0000	1.80
@@ -8190,21 +8190,23 @@
 # field so the loader read only 28 words instead of 29. E-mailed Tin to
 # ask for the data to be fixed. 
 # 2010-02-03 Received new data as the previous data had empty fields.
 # 2010-02-04 Loaded new data into hg19 database.
+# 2010-02-09 Received new data on 2010-02-08 because further errors in the
+# code had again left some fields in the data empty.
     mkdir /hive/data/genomes/hg19/bed/genomicSuperDups
     cd /hive/data/genomes/hg19/bed/genomicSuperDups
     # Remove old data
     rm *
     wget --timestamping \
-        ftp://mesh.gs.washington.edu/pub/UCSC/hg19genomicSuperDups.gz    
-    gunzip hg19genomicSuperDups.gz
+        ftp://mesh.gs.washington.edu/pub/UCSC/hg19genomicSuperDups.tab.gz    
+    gunzip hg19genomicSuperDups.tab.gz
     # Fix incorrect chromosome names in data. Check both chrom and otherChrom.
     # Previously, found several cases where the last letter of random was
     # missing for the names of the random contigs. They all look good this
     # time.
-    awk '{print $1}' hg19genomicSuperDups | sort | uniq > chroms
-    awk '{print $7}' hg19genomicSuperDups | sort | uniq > otherChroms
+    awk '{print $1}' hg19genomicSuperDups.tab | sort | uniq > chroms
+    awk '{print $7}' hg19genomicSuperDups.tab | sort | uniq > otherChroms
     hgsql -Ne 'select chrom from chromInfo;' hg19 | sort | uniq > chromInfo.txt 
     comm -23 chroms chromInfo.txt
     comm -23 otherChroms chromInfo.txt
     # chroms and otherChroms match chromosome names in chromInfo.
@@ -8213,13 +8215,14 @@
     # The awk command was necessary for some recent other species
     # genomicSuperDups that had some too-short regions.  It does not seem
     # to be necessary here, but doesn't hurt and may be useful in
     # future builds.
-    sed -e 's/\t_\t/\t-\t/' hg19genomicSuperDups \
+    hgsql -e 'drop table genomicSuperDups;' hg19
+    sed -e 's/\t_\t/\t-\t/' hg19genomicSuperDups.tab \
     | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
     | hgLoadBed hg19 genomicSuperDups stdin \
       -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
-# Loaded 51549 elements of size 29
+# Loaded 51599 elements of size 29
 # Sorted
 # Creating table definition for genomicSuperDups
 # Saving bed.tab
 # Loading hg19