src/hg/utils/automation/n50.pl 1.3

1.3 2010/02/17 06:48:42 hiram
Now works even if contig names are identical, and ignores extra columns on input.
Index: src/hg/utils/automation/n50.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/n50.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/utils/automation/n50.pl	7 Aug 2009 18:16:48 -0000	1.2
+++ src/hg/utils/automation/n50.pl	17 Feb 2010 06:48:42 -0000	1.3
@@ -16,8 +16,9 @@
     printf STDERR "\tcontigName size\n";
     exit 255;
 }
 
+my $ix = 0;
 while (my $sizeFile = shift) {
     my $sizeCount = 0;
 
     my %sizes;	# key is contigName, value is size
@@ -27,20 +28,22 @@
 	while (my $line = <>) {
 	    next if ($line =~ m/^\s*#/);
 	    ++$sizeCount;
 	    chomp ($line);
-	    my ($name, $size) = split('\s+', $line);
-	    $sizes{$name} = $size;
+	    my ($name, $size, $rest) = split('\s+', $line, 3);
+	    my $key = sprintf("%s_X_%d", $name, $ix++);
+	    $sizes{$key} = $size;
 	}
     } else {
 	printf STDERR "#\treading: $sizeFile\n";
 	open (FH, "<$sizeFile") or die "can not read $sizeFile";
 	while (my $line = <FH>) {
 	    next if ($line =~ m/^\s*#/);
 	    ++$sizeCount;
 	    chomp ($line);
-	    my ($name, $size) = split('\s+', $line);
-	    $sizes{$name} = $size;
+	    my ($name, $size, $rest) = split('\s+', $line, 3);
+	    my $key = sprintf("%s_X_%d", $name, $ix++);
+	    $sizes{$key} = $size;
 	}
 	close (FH);
     }
 
@@ -61,13 +64,17 @@
     foreach my $key (sort { $sizes{$b} <=> $sizes{$a} } keys %sizes) {
 	++$contigCount;
 	$totalSize += $sizes{$key};
 	if ($totalSize > $n50Size) {
+	    my $prevName = $prevContig;
+	    $prevName =~ s/_X_[0-9]+//;
+	    my $origName = $key;
+	    $origName =~ s/_X_[0-9]+//;
 	    printf "# cumulative\tN50 count\tcontig\tcontig size\n";
 	    printf "%d\t%d\t%s\t%d\n",
-		$totalSize-$sizes{$key},$contigCount-1,$prevContig, $prevSize;
+		$totalSize-$sizes{$key},$contigCount-1,$prevName, $prevSize;
 	    printf "%d one half size\n", $n50Size;
-	    printf "%d\t%d\t%s\t%d\n", $totalSize, $contigCount, $key, $sizes{$key};
+	    printf "%d\t%d\t%s\t%d\n", $totalSize, $contigCount, $origName, $sizes{$key};
 	    last;
 	}
 	$prevContig = $key;
 	$prevSize = $sizes{$key};