src/hg/makeDb/doc/hg18.txt 1.401
1.401 2010/03/02 23:04:52 chinhli
Adding H-Inv(7.0) update
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.400
retrieving revision 1.401
diff -b -B -U 4 -r1.400 -r1.401
--- src/hg/makeDb/doc/hg18.txt 1 Mar 2010 21:05:25 -0000 1.400
+++ src/hg/makeDb/doc/hg18.txt 2 Mar 2010 23:04:52 -0000 1.401
@@ -29382,5 +29382,104 @@
cd /hive/data/genomes/hg18/bed/par/
# create hg18.par using the documented coordinates
hgPar hg18 hg18.par par
#####################################################################
+# HH-Inv(7.0)
+ mkdir /hive/data/genomes/hg18/bed/hinv7.0
+ cd /hive/data/genomes/hg18/bed/hinv7.0
+ cat << '_EOF_' > hinvToBed.pl
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my $chr="";
+my $start="";
+my $end="";
+my %accNoDups;
+my $invId = "";
+my $invIdVer = "";
+my $accNo = "";
+my $strand = "";
+my $cai = 0;
+open (FH, "zcat FCDNA.gz|") or die "can not zcat FCDNA.gz";
+while (my $line = <FH>) {
+ my ($id, $tag, $rest) = split('\s+', $line, 3);
+ if ($line =~ m/^CDNA_H-INVITATIONAL-ID:/ ) {
+ $invId = $tag;
+ } elsif ($line =~ m/^CDNA_H-INVITATIONAL-ID-VERSION:/ ) {
+ $invIdVer = $tag;
+ } elsif ($line =~ m/^CDNA_CHROMOSOME-NUMBER:/ ) {
+ $chr = $tag;
+ } elsif ($line =~ m/^CDNA_STRAND:/ ) {
+ $strand = $tag;
+ } elsif ($line =~ m/^PREDICTED-ORF_CAI:/ ) {
+ $cai = int($tag * 1000);
+ } elsif ($line =~ m/^CDNA_START:/ ) {
+ $start = $tag;
+ } elsif ($line =~ m/^CDNA_END:/ ) {
+ $end = $tag;
+ } elsif ($line =~ m/^CDNA_ACCESSION-NO:/ ) {
+ $accNo = $tag;
+ } elsif ($line =~ m/CDNA_CLUSTER-ID:/ ) {
+ if (length($accNo) > 0) {
+ next if ($chr eq "UM");
+ if (length($start) < 1 || length($end) < 1) {
+ printf STDERR "no start,end ? chr%s\t%s\n", $chr, $invIdVer;
+ } else {
+ die "have accession but no ID ?" if (length($invId) < 1);
+ $invIdVer =~ s/\.[0-9]+$//;
+ printf "chr%s\t%d\t%d\t%s\t%d\t%s\n",
+ $chr, $start, $end, $invIdVer, $cai, $strand;
+ }
+ }
+ $accNo = "";
+ $invId = "";
+ $invIdVer = "";
+ $chr = "";
+ $start = "";
+ $end = "";
+ $cai = 0;
+ $strand = "";
+ }
+}
+close (FH);
+'_EOF_'
+ # << happy emacs
+ ln -s /hive/data/outside/hinv/H-InvDB_7.0/FCDNA.gz .
+ chmod +x hinvToBed.pl
+ time ./hinvToBed.pl | grep -v chr6_hla_hap | sort -k1.1 -k2.2n > hinv7.0.bed
+ # zcat: FCDNA.gz: decompression OK, trailing garbage ignored
+ # real 3m1.060s
+ # user 3m14.142s
+ # sys 0m10.961s
+ # verify the new table does not exist
+ hgsql -e "show tables" hg18 | grep -i hinv
+ hgLoadBed -verbose=2 hg18 HInvGeneMrnaBed hinv7.0.bed
+ # Reading hinv7.0.bed
+ # Loaded 217721 elements of size 4
+ hgsql -e "show tables" hg18 | grep -i hinv
+ # HInv
+ # HInvGeneMrna
+ # HInvGeneMrnaBed
+ # knownToHInv
+ # knownXToHInv
+ # check the coverage
+ featureBits hg18 HInvGeneMrnaBed
+ # 1350541623 bases of 2881515245 (46.869%) in intersection
+ # exon only
+ featureBits hg18 HInvGeneMrna
+ # 82136473 bases of 2881515245 (2.850%) in intersection
+ # measure exon and intron to compare
+ hgsql -N -e "select tName, tStart, tEnd, qName, strand from HInvGeneMrna;" \
+ hg18 > hinvGeneMrna.bed
+ # 988629029 bases of 2881515245 (34.309%) in intersection
+
+ featureBits hg18 HInvGeneMrnaBed -countGaps gap
+ # 4523138 bases of 3107677273 (0.146%) in intersection
+
+
+
+hgsql -e "show tables" hg18 | grep -i hinv
+#####################################################################
+