src/hg/makeDb/doc/hg18.txt 1.366
1.366 2009/06/25 18:15:15 cline
Added instructions for making the (35-mer) uniqueness track data
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.365
retrieving revision 1.366
diff -b -B -U 4 -r1.365 -r1.366
--- src/hg/makeDb/doc/hg18.txt 13 Jun 2009 20:39:58 -0000 1.365
+++ src/hg/makeDb/doc/hg18.txt 25 Jun 2009 18:15:15 -0000 1.366
@@ -28051,4 +28051,244 @@
# track to wgRna. Will keep the old track around for a while until
# new track checked and QA'd.
hgsql -e 'alter table wgRna rename wgRnaOld;' hg18
hgsql -e 'alter table wgRnaJun09 rename wgRna;' hg18
+
+
+
+##################
+## Uniqueness Track: Step one (courtesy of John Castle, Rosetta)
+## Make oligos of length XX
+
+# Perl one-liner to make a batch file
+# I've included the perl files CNV_makereads2.pl (simply uses substr on a chromosome) and fastagrep.pl (to remove sequences with Ns # The files chr$x.fa are the individual chromosomes
+
+perl -e 'for ($i = 1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} if ($i == 24) {$x = 'Y';} if ($i == 25) {$x = 'M';} print "~/DTcode/CNV_makereads2.pl 100 /info/genome/Projects/721/ref/chr$x.fa | fastagrep.pl -v n > chr$x.fa\n";}' > batch_chr_get
+
+#!/usr/bin/perl -w
+#---------------------------------------------------------------------
+# C O P Y R I G H T N O T I C E
+#---------------------------------------------------------------------
+# Copyright (c) 2001 Rosetta Inpharmatics, Inc.
+# 12040 115th Avenue NE, Kirkland, WA 98034-6900
+# All Rights Reserved. Reproduction, adaptation, or
+# translation without prior written permission of
+# Rosetta Inpharmatics, Inc. is prohibited.
+#---------------------------------------------------------------------
+# CNV_makereads.pl
+# $Id$
+
+
+#use lib ('/home/castlej/perl/','/home/castlej/OSDTools/','/home/castlej/DTcode/');
+#use strict;
+
+my $oligo_length = $ARGV[0];
+my $file = $ARGV[1];
+
+open(IN,$file);
+$/ = "\n>";# change input line separator to '>' to suck up FASTA sequences
+while ($line= <IN>) {
+ $line =~ s/^>//m;
+ # remove '>' from end of $line
+ $line =~ s/>$//m;
+ # remove Unigene lines starting with '#'
+ $line =~ s/\n\#.*$//m;
+ # get sequence id
+ $line =~ /^\s*(\S+).*([^\0]*)/;
+ $id = $1;
+ $seq = $2;
+ $seq =~ s/\n//g;
+}
+
+if ($id =~ /(chr\S+)\.nib/) {
+ $chr = $1;
+} elsif ($id =~ /(chr\S+)/) {
+ $chr = $1;
+}
+
+for ($i = 0; $i <length($seq)-$oligo_length; $i++) {
+ $a = substr($seq,$i,$oligo_length);
+ $j = $i+$oligo_length;
+ print ">$chr:$i-$j\n$a\n";
+}
+
+
+#!/usr/bin/perl -w
+#---------------------------------------------------------------------
+# C O P Y R I G H T N O T I C E
+#---------------------------------------------------------------------
+# Copyright (c) 2000,2001,2002 Rosetta Inpharmatics, Inc.
+# 12040 115th Avenue NE, Kirkland, WA 98034-6900
+# All Rights Reserved. Reproduction, adaptation, or
+# translation without prior written permission of
+# Rosetta Inpharmatics, Inc. is prohibited.
+#---------------------------------------------------------------------
+#
+# $Id$
+#
+# finds selected sequences in FASTA by regex matching in defline or sequence
+
+use strict;
+
+my( $option,
+ $regex,
+ @regexes,
+ %tofind,
+ $exceptflag,
+ $key,
+ $value,
+ $line,
+ );
+
+$exceptflag = 0;
+
+unless (scalar(@ARGV)) {
+ print "\nUsage: $0 [OPTION] PATTERN [FASTAFILE]\n";
+ print "$0 finds sequences by pattern matching in FASTA format data\n\n";
+ exit;
+}
+
+while ((scalar(@ARGV)) && ($ARGV[0] =~ /^-(\w+)/)) {
+ $option = $1;
+ shift(@ARGV);
+ if ($option =~ /v/) { # user wants sequences NOT matching regex(es)
+ $exceptflag = 1;
+ }
+ if ($option =~ /s/) { # regex on command line
+ push(@regexes, shift(@ARGV));
+ }
+
+ if ($option =~ /f/) { # user wants list of regexes from file
+ open(INHANDLE, "<$ARGV[0]") ||
+ die "$0: error, can't open regex list file $ARGV[0]\n";
+ while (defined($regex = <INHANDLE>)) {
+ chomp $regex;
+ push(@regexes, $regex);
+ }
+ shift(@ARGV);
+ }
+}
+
+if (scalar(@regexes) < 1) { push(@regexes, shift(@ARGV)); }
+$/ = "\n>"; # change input line separator to suck up FASTA sequences
+
+SEQUENCE:
+while (defined($line = <>)) {
+ # remove '>' from start of first $line
+ $line =~ s/^>//m;
+ # stick '>' back on all $lines
+ $line = '>'.$line;
+ # remove '>' from end of $line
+ $line =~ s/>$//m;
+ # remove Unigene lines starting with '#'
+ $line =~ s/\n\#.*$//m;
+ foreach $regex (@regexes) {
+ if ($line =~ /$regex/) {
+ unless ($exceptflag) { print $line; }
+ next SEQUENCE;
+ }
+ }
+ if ($exceptflag) { print $line; }
+}
+
+
+
+# Submit batch file to cluster (we use LSF), each line is a submission
+perl -ne 'chomp; $a = "bsub -q short64 \"$_\"\n"; system($a);' batch_chr_get
+
+
+
+####################
+# Uniqueness Step two # I've used an older version of BWA. The newer version from sourceforge outputs a binary file which then must be converted to a text file
+# HG18 is the human genome
+# I could include banything_2GBNew.pl but it is simply a cluster "chunk and submit" code
+# Method 1 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "banything_2GbNew2.pl -a /ifs65/dtap/bin/bwa/bwa-0.2.0/bwa -z 1000000 -in chr$x.fa -o chr$x.bwa -stdout chr$x.bwa -pre \"aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 \" -suf \" \" \n";}' >! batch_banything
+ chmod +777 batch_banything
+batch_banything
+
+# Method 2 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "/ifs65/dtap/bin/bwa/bwa-0.2.0/bwa aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 chr$x.fa > chr$x.bwa\n"}' >! batch_banything
+ chmod +777 batch_banything
+perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_anything
+
+
+#####################
+# Uniqueness Step three
+# I ran this one-liner from a higher level directory
+perl -e '$pwd = `pwd`; chomp($pwd); @a = `ls`; foreach $dir (@a) {chomp ($dir); unless ($dir =~ /(\d+)mer_2nd/) {next;}; @b = `ls $dir/*fa.bwa`; foreach $file (@b) {chomp($file); $f = "$pwd/$file"; $f =~ /^(\S+chr[^\.]+)\.*/; $e = $1; print "~/DTcode/CNV_parseBWA_wiggle.pl 100 1 $f\* > $e.quality.100.wiggle\n";}}' > batch_wiggle
+# Submit batch file to cluster (we use LSF), each line is a submission
+perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_wiggle
+
+#!/usr/bin/perl -w
+# John Castle
+# May 19, 2009
+# $Cap a maximum value to clip data with
+# $Use_score whether to output the uniqueness score or the number of hits
+# @FilesIn the BWA text output files to scan
+# ** NOTE ** The newer BWA algorithm outputs a binary file that is then made into a text file using BWA again.
+# However, the text file output has a slightly different format so the parsing will need to change.
+
+
+($Cap, $Use_score, @FilesIn) = @ARGV;
+
+if ($FilesIn[0] =~ /\.gz/) {
+ open(IN,"gzip -dc $FilesIn[0] |")
+} else {
+ open(IN,$FilesIn[0]);
+}
+
+#### Description
+@a = split("\t",<IN>);
+$a[6] =~ /(\d+)/;
+$len = $1;
+close(IN);
+
+### Wiggle header text
+if ($Use_score == 0) {
+ print "track type=wiggle_0 name=\"Alignment scores of $len\mer as\" description=\"Unique $len mer alignments\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
+} else {
+ print "track type=wiggle_0 name=\"$len\mer alignment scores\" description=\"$len\mer alignment scores from BWA/MAQ, where 37 indicates a unique alignment\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
+}
+
+### Parse through file(s)
+foreach $file (@FilesIn) {
+ if ($file =~ /\.gz/) {
+ open(IN,"gzip -dc $file |");
+ } else {
+ open(IN,$file);
+ }
+ @a = split("\t",<IN>);
+ $a[0] =~ /(chr\S+):(\d+)/;
+ $Chr = $1;
+ $start = $2;
+ $score = $a[5];
+ $hits = $a[11];
+ if ($hits > $Cap) {$hits = $Cap;}
+ if ($Use_score == 1) {$value = $score;}
+ else {$value = $hits;}
+
+ while (<IN>) { # Make wiggle track, with start and end coordinates for same scoring regions
+ @a = split("\t",$_);
+ if ($#a <15) {
+ next;
+ }
+
+ $a[0] =~ /(chr\S+):(\d+)/;
+ $chr = $1;
+ $pos = $2;
+ $score = $a[5];
+ $hits = $a[11];
+ if ($hits > $Cap) {$hits = $Cap;}
+
+ if ($Use_score == 1) {$x = $score;
+ } else {$x = $hits;}
+
+ if ($x != $value) {
+ print "$Chr\t$start\t$pos\t$value\n";
+ $Chr = $chr;
+ $value = $x;
+ $start = $pos;
+ }
+ }
+ print "$Chr\t$start\t$pos\t$value\n";
+ close(IN);
+}
+