src/hg/makeDb/doc/hg19.txt 1.77

1.77 2010/02/02 20:11:44 hartera
Re-loading new data for the genomic segmental duplications track - in progress.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.76
retrieving revision 1.77
diff -b -B -U 4 -r1.76 -r1.77
--- src/hg/makeDb/doc/hg19.txt	28 Jan 2010 23:49:54 -0000	1.76
+++ src/hg/makeDb/doc/hg19.txt	2 Feb 2010 20:11:44 -0000	1.77
@@ -8169,91 +8169,60 @@
     mkdir -p $pd
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
 
-######################
 ############################################################################
-# SEGMENTAL DUPLICATIONS (2009-10-31 and 2009-11-04, hartera, DONE)
+# SEGMENTAL DUPLICATIONS (2010-02-02, hartera, in progress)
 # File emailed from Tin Louie <tinlouie at u.washington.edu>
-# in Evan Eichler's lab.
-# (2009-11-07, hartera, DONE) Re-loaded table after correcting some chromosome
-# names.  
+# in Evan Eichler's lab on 01/28/10. This is a data update since it was
+# thought that the last data set was incorrect so the pipeline had to be
+# re-run.
 # NOTE: Received e-mail from Tin Louie suggesting that the otherSize 
 # column could be dropped. It is just the size of the otherChrom and it 
 # does not seem to be used for the track display or details page. It has the
 # correct description in the table schema so it is ok to keep it for now. 
 # In the future, this column could be dropped if it is not useful.
     mkdir /hive/data/genomes/hg19/bed/genomicSuperDups
     cd /hive/data/genomes/hg19/bed/genomicSuperDups
-   
-    wget ftp://mesh.gs.washington.edu/pub/UCSC/hg19genomicSuperDups.gz
+    # Remove old data
+    rm *
+    wget --timestamping \
+       ftp://mesh.gs.washington.edu/pub/UCSC/hg19genomicSuperDups.gz     
     gunzip hg19genomicSuperDups.gz
-    # The sed command is necessary to fix "_" used as strand.
+    # Fix incorrect chromosome names in data. Check both chrom and otherChrom.
+    # Previously, found several cases where the last letter of random was
+    # missing for the names of the random contigs. They all look good this
+    # time.
+    awk '{print $1}' hg19genomicSuperDups | sort | uniq > chroms
+    awk '{print $7}' hg19genomicSuperDups | sort | uniq > otherChroms
+    hgsql -Ne 'select chrom from chromInfo;' hg19 | sort | uniq > chromInfo.txt 
+    comm -23 chroms chromInfo.txt
+    comm -23 otherChroms chromInfo.txt
+    # chroms and otherChroms match chromosome names in chromInfo.
+
+    # The sed command is necessary to change "_" used as strand to "-".
     # The awk command was necessary for some recent other species
     # genomicSuperDups that had some too-short regions.  It does not seem
     # to be necessary here, but doesn't hurt and may be useful in
     # future builds.
     sed -e 's/\t_\t/\t-\t/' hg19genomicSuperDups \
     | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
     | hgLoadBed hg19 genomicSuperDups stdin \
       -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
+    # Loader says:
+    #   "Expecting 29 words line 29 of stdin got 28."
+    # The problem is that there are two consecutive tabs with a blank indelS
+    # field on this line, so the loader, splitting on tabs, only reads 28
+    # fields for this line. The same problem occurs in other lines of the data.
+    # Contacted Tin to see if this can be fixed.
+
 # Reading stdin
 # Loaded 63463 elements of size 29
 # Sorted
 # Creating table definition for genomicSuperDups
 # Saving bed.tab
 # Loading hg19
     # Updated details page with suggested text and an additional reference. 
     # src/hg/makeDb/trackDb/genomicSuperDups.html
-    # (2009-11-07, hartera)
-    # Fix incorrect chromosomes in data. 
-    # The script, runCheckTableCoordsDayOld, found this error in the table:
-    # hg19.genomicSuperDups has 28 records with chrom not described in
-    # chromInfo. 
-    cd /hive/data/genomes/hg19/bed/genomicSuperDup
-    awk '{print $1}' hg19genomicSuperDups | sort | uniq > chroms
-    awk '{print $7}' hg19genomicSuperDups | sort | uniq > otherChroms
-    # Found several cases where the last letter of random is missed off
-    # for the random contigs. These are not concatenated into a random chrom
-    # for hg19.  
-    hgsql -Ne 'select chrom from chromInfo;' hg19 | sort | uniq \
-          > chroms.chromInfo
-    # These are the contig names for random chroms that need to be corrected:
-    grep -v random chroms | grep rando | sort | uniq > chromsToCorrect
-    grep -v random otherChroms | grep rando | sort | uniq > otherChromsToCorrect
-    diff chromsToCorrect otherChromsToCorrect
-    # No difference so use chromsToCorrect file. There are 7 chrom names that 
-    # need to be corrected and they are:
-#chr11_gl000202_rando
-#chr17_gl000203_rando
-#chr17_gl000204_rando
-#chr17_gl000205_rando
-#chr17_gl000206_rando
-#chr19_gl000209_rando
-#chr21_gl000210_rando 
-    cp hg19genomicSuperDups hg19genomicSuperDups.orig
-    foreach c (`cat chromsToCorrect`)
-       perl -pi.bak -e "s/${c}/${c}m/g" hg19genomicSuperDups
-    end
-    # Check chroms and otherChroms again. 
-    awk '{print $1}' hg19genomicSuperDups | sort | uniq > chromsNew
-    awk '{print $7}' hg19genomicSuperDups | sort | uniq > otherChromsNew
-    # These look good now. 
-    # Drop the old table and re-load the corrected data.
-    hgsql -e 'drop table hg19genomicSuperDups;' hg19
-    
-    sed -e 's/\t_\t/\t-\t/' hg19genomicSuperDups \
-    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
-    | hgLoadBed hg19 genomicSuperDups stdin \
-      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
-# Reading stdin
-# Loaded 63463 elements of size 29
-# Sorted
-# Creating table definition for genomicSuperDups
-# Saving bed.tab
-# Loading hg19
-    # Cleanup
-    # Remove chroms* otherChroms* *.bak
 ############################################################################
 # ADD LINK TO GENENETWORK (DONE. 12/02/09 Fan).
 
 # Received geneNetwork ID list file, GN_human_RefSeq.txt, for hg19 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].