src/hg/makeDb/doc/mm9.txt 1.90
1.90 2009/02/18 03:07:03 angie
Updated komp (now called IKMC) w/new data from Carol Bult.
Index: src/hg/makeDb/doc/mm9.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm9.txt,v
retrieving revision 1.89
retrieving revision 1.90
diff -b -B -U 4 -r1.89 -r1.90
--- src/hg/makeDb/doc/mm9.txt 30 Jan 2009 00:09:27 -0000 1.89
+++ src/hg/makeDb/doc/mm9.txt 18 Feb 2009 03:07:03 -0000 1.90
@@ -8611,56 +8611,73 @@
rm bed.tab
#########################################################################
-# KOMP (KNOCKOUT MOUSE PROJECT) (DONE 10/21/08 angie)
+# KOMP (KNOCKOUT MOUSE PROJECT) (DONE 2/12/09 angie)
+# Previously done 10/21/08 with files emailed from Carol Bult 10/18.
ssh hgwdev
- mkdir /hive/data/genomes/mm9/bed/komp
- cd /hive/data/genomes/mm9/bed/komp
- # Save files emailed from Carol Bult 10/18 as
- # CSD_GFF_10182008.txt and Regeneron_GFF_10172008.txt.
- # Make bed9 (would be bed4, but using itemRgb):
- perl -wpe 'chomp; @w = split("\t"); \
- if ($w[3] eq "") { warn "$_\n"; s/^.*//; \
- next; } # Some lines have no coords. \
- $color = ($w[9] eq "Yellow") ? "255,215,0" : \
- ($w[9] eq "Green") ? "0,240,0" : \
- ($w[9] eq "Blue") ? "0,0,200" : "0,0,0"; \
- $w[3]--; \
- $_ = join("\t", $w[0], $w[3], $w[4], $w[10], \
- 0, ".", $w[3], $w[3], $color) . "\n";' \
- CSD_GFF_10182008.txt Regeneron_GFF_10172008.txt > komp.bed
-#chr10 . CSD . . . MGI:3645882;In Progress Yellow EG633971
-#chr11 . CSD . . . MGI:3650936;Withdrawn/Problematic Black OTTMUSG00000002671
-#chr2 . CSD . . . MGI:3702423;Withdrawn/Problematic Black OTTMUSG00000016613
-#chr2 . CSD . . . MGI:3706568;Withdrawn/Problematic Black OTTMUSG00000016226
-#chr2 . CSD . . . MGI:1913626;In Progress Yellow 3100002L24Rik
-#chr2 . CSD . . . MGI:3649840;Withdrawn/Problematic Black OTTMUSG00000016576
-#chr4 . CSD . . . MGI:3645499;Withdrawn/Problematic Black EG433762
-#chr8 . CSD . . . MGI:3642269;Withdrawn/Problematic Black OTTMUSG00000018948
-#chr8 . CSD . . . MGI:3647176;Withdrawn/Problematic Black EG654453
-#chr11 . Regeneron . . . MGI:1890761;Not Started/On Hold Blue Sox16
-#chr5 . Regeneron . . . MGI:3648938;Reagent(s) Available Green Pvrig
-#chr8 . Regeneron . . . MGI:2387432;Not Started/On Hold Blue Trim61
+ mkdir -p /hive/data/genomes/mm9/bed/komp/2009_02
+ cd /hive/data/genomes/mm9/bed/komp/2009_02
+ # Save files emailed from Carol Bult 2/12 as
+ # csd_gff_021109.gz and regeneron_gff_021109.gz
+ # Make bed12 with itemRgb:
+ zcat *.gz \
+ | perl -we \
+ 'while (<>) { \
+ s/\r?\n$//; \
+ ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
+ if ($s eq "") { warn "$_\n"; s/^.*//; next; } # Some lines have no coords. \
+ $col = ($col eq "Yellow") ? "255,215,0" : \
+ ($col eq "Green") ? "0,240,0" : \
+ ($col eq "Blue") ? "0,0,200" : "0,0,0"; \
+ $s--; \
+ my $geneId = join("|", $chr, $ctr, $n, $id); \
+ push @{$geneBlks{$geneId}}, [$s, $e, $col]; \
+ } \
+ warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
+ foreach my $geneId (keys %geneBlks) { \
+ my @blks = @{$geneBlks{$geneId}}; \
+ my ($chrom, $center, $name) = split(/\|/, $geneId); \
+ my $blkCount = @blks; \
+ @blks = sort {$a->[0] <=> $b->[0]} @blks; \
+ my $chromStart = $blks[0]->[0]; \
+ my $chromEnd = $blks[$blkCount-1]->[1]; \
+ my $color = $blks[0]->[2]; \
+ my $blkStarts = ""; \
+ my $blkSizes = ""; \
+ foreach my $blk (@blks) { \
+ my ($start, $end, $col) = @{$blk}; \
+ $blkStarts .= ($start - $chromStart) . ","; \
+ $blkSizes .= ($end - $start) . ","; \
+ if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
+ } \
+ print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
+ $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
+ }' \
+ | sort -k 1,1 -k 2n,2n > komp.bed
+#Got 16665 genes.
+ # No stderr empty-coord warnings this time (no unmapped items).
# Make an alias-style table with associated info (MGI ID and status):
- perl -wpe 'chomp; @w = split("\t"); \
+ zcat *.gz \
+ | perl -wpe 's/\r?\n$//; @w = split("\t"); \
if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
$w[8] =~ m/^(MGI:\d+);\s*(\w.*)/ || die; \
($mgi, $status) = ($1, $2); \
$_ = "$w[10]\t$mgi,$w[2],$status\n";' \
- CSD_GFF_10182008.txt Regeneron_GFF_10172008.txt \
- | sort > kompExtra.tab
+ | sort -u > kompExtra.tab
# Load 'em up:
hgLoadBed mm9 komp komp.bed
-#Loaded 9049 elements of size 9
+#Loaded 16665 elements of size 12
sed -e 's/genericAlias/kompExtra/' $HOME/kent/src/hg/lib/genericAlias.sql \
> kompExtra.sql
hgLoadSqlTab mm9 kompExtra kompExtra.sql kompExtra.tab
checkTableCoords -verbose=2 mm9 komp
- # No output, good.
+#mm9.komp item Tekt3 chr11:62887195-62896116: blocks 3 and 4 overlap.
+#mm9.komp has 1 records with overlapping blocks.
+ # Carol is talking to the Sanger folks about that one... I think we can waive it.
+
runJoiner.csh mm9 komp
-#Checking keys on database mm9
-# mm9.kompExtra.name - hits 9049 of 9049 ok
+# mm9.kompExtra.name - hits 16665 of 16665 ok
#########################################################################
### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram)