src/utils/findAccession.pl 1.2
1.2 2009/05/04 17:30:12 hiram
Transitioned and updated file to src/hg/stsMarkers/findAccession.pl
Index: src/utils/findAccession.pl
===================================================================
RCS file: src/utils/findAccession.pl
diff -N src/utils/findAccession.pl
--- src/utils/findAccession.pl 3 Feb 2006 22:49:47 -0000 1.1
+++ /dev/null 1 Jan 1970 00:00:00 -0000
@@ -1,269 +0,0 @@
-#!/usr/bin/env perl
-
-# DO NOT EDIT the /cluster/bin/scripts copy of this file --
-# edit ~/kent/src/utils/findAccession.pl instead.
-
-# $Id$
-
-# File: findAccession
-# Author: Terry Furey
-# Date: 5/17/01
-# Project: Human
-# Description: Finds the accession for a marker on the golden path
-
-# Usage message
-if ($#ARGV < 1) {
- print stderr "USAGE: findAccession [-agp -ncbi -mouse -rat -chicken -all] <file> <accession_info>\n";
- exit(1);
-}
-
-# Read parameters
-$agp = 0;
-$ncbi = 0;
-$mouse = 0;
-$rat = 0;
-$chicken = 0;
-$all = 0;
-@chroms = qw/1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M NA UL 22_h2_hap1 6_cox_hap1 5_h2_hap1 6_qbl_hap2/;
-
-while ($#ARGV > 1) {
- $arg = shift(@ARGV);
- if ($arg eq "-agp") {
- $agp = 1;
- } elsif ($arg eq "-ncbi") {
- $ncbi = 1;
- @chroms = (1..22,X,Y,M,Un);
- } elsif ($arg eq "-mouse") {
- $mouse = 1;
- @chroms = (1..19,X,Y,Un,M);
- } elsif ($arg eq "-rat") {
- $rat = 1;
- @chroms = (1..20,X,Y,Un);
- } elsif ($arg eq '-chicken') {
- $chicken = 1;
- @chroms = (1..24,26..28,32,'E22C19W28','E26C13','E50C23','W','Z','Un');
- } elsif ($arg eq "-all") {
- $all = 1;
- } else {
- print stderr "$arg is not a valid parameter\n";
- }
-}
-$file = shift(@ARGV);
-$info = shift(@ARGV);
-open(FILE, "<$file") || die("Could not open $file");
-open(OUT, ">$file.acc");
-
-# Get chromosomes from agp directory, if possible
-if (($agp) && (-e "$info/chrom_names")) {
- print STDERR "reading chrom names from $info/chrom_names\n";
- @chroms = split("\n", `cat $info/chrom_names`);
-}
-
-# Read in the positions of the accessions
-# Either read from accession_info.rdb file
-if (!$agp) {
- print stderr "Reading accession_info from $info file\n";
- if (! -e $info) { die("$info does not exist\n"); }
- `sorttbl Chr -r Ord Start < $info > $$.sort`;
- open(INFO, "<$$.sort") || die("Count not open $$.sort\n");
- $line = <INFO>;
- $line = <INFO>;
- $index = 0;
- while ($line = <INFO>) {
- chomp($line);
- ($chr, $ord, $start, $end, $mid, $ctg, $ctgIdx, $bargeIdx, $clone, $acc, @rest) = split("\t",$line);
- $acc[$index] = $acc;
- if ($ord == 1) {
- $chr[$index] = $chr;
- } else {
- $chr[$index] = "${chr}_random";
- }
- $start[$index] = $start;
- $end[$index] = $end;
- $index++;
- }
- close(INFO);
-# Or read directly from agp file
-} else {
- print stderr "Reading accession_info from agp files\n";
- $index = -1;
- foreach $chr (@chroms) {
- print stderr "Loading chr$chr agp info\n";
- for ($ord = 0; $ord <=1; $ord++) {
- if ($ord == 0) {
- $file = "chr${chr}_random.agp";
- } else {
- $file = "chr$chr.agp";
- }
- if ( ! -e "$info/$chr/$file" ) { next; }
- print STDERR "reading agp file $info/$chr/$file\n";
- if (open(AGP, "<$info/$chr/$file")) {
- $prevacc = "";
- while ($line = <AGP>) {
- chomp($line);
- ($thischr, $start, $end, $idx, $type, $acc, @rest) = split(' ',$line);
- next if ($type eq "N");
-
- if (! $chicken) {
- ($acc, $vers) = split(/\./,$acc);
- }
- if ($acc ne $prevacc) {
- $index++;
- $acc[$index] = $acc;
- if ($ord == 1) {
- $chr[$index] = "chr$chr";
- } else {
- $chr[$index] = "chr${chr}_random";
- }
- $start[$index] = $start;
- $end[$index] = $end;
- } else {
- $end[$index] = $end;
- }
- $prevacc = $acc;
- }
- close(AGP);
- } else {
- print stderr ("Could not open $info/$chr/$file\n");
- }
- }
- }
- $index++;
-}
-
-# Process each line in the position file
-$i = 0;
-$count = 0;
-print stderr "Starting processing\n";
-while($line = <FILE>) {
- chomp($line);
- ($chr, $start, $end, @rest) = split(' ',$line);
- $count++;
- if ($count%10000 == 0) {
- print stderr "$count\n";
- }
-
- # Make sure at current chromosome
- if ($chr ne $chr[$i]) {
- $i = 0;
- print stderr "Processing $chr\n";
- while ($chr[$i] ne $chr) {
- $i++;
- if ($i > $#chr) {
- die("Can't find chrom $chr in \@chr (\$i=$i)\n");
- }
- }
- $firsti = $i;
- $largest = $end[$i];
- $largesti = $i;
- }
-
- # Find a range that overlaps the placement
- $acc = "";
- $i = $largesti;
- $reset = 0;
- # If want all accessions, make sure start with first accession that overlaps
- if ($all) {
- while (($end[$i] > $start) && ($i > $firsti)) {
- $i--;
- }
- while (($start[$i] <= $end) && ($chr[$i] eq $chr)) {
- if (($start[$i] <= $end) && ($end[$i] >= $start)) {
- if ($acc) {
- $acc .= ",$acc[$i]";
- } else {
- $acc = $acc[$i];
- }
- }
- $i++;
- }
- # If want only best accession that overlaps
- } else {
- while (!$acc) {
- # Check if completely contained
- if (($start >= $start[$i]) && ($end <= $end[$i])) {
- # Check if this is a very long accession and is contained in next accession
- if (($end[$i] - $start[$i]) >= 500000) {
- $diffi = $end[$i] - $start[$i];
- $j = $i + 1;
- while (($start <= $end[$j]) && ($chr[$j] eq $chr)) {
- if (($start >= $start[$j]) && ($end <= $end[$j]) && (($end[$j] - $start[$j]) < $diffi)) {
- $i = $j;
- $diffi = $end[$j] - $start[$j];
- if ($end[$j] > $largest) {
- $largest = $end[$j];
- $largesti = $j;
- }
- }
- $j++;
- }
- }
- $acc = $acc[$i];
- if ($chr[$i] ne $chr) {
- die("Doesn't work - $chr $start $end\n");
- }
- # Check if out of range for chromosome
- } elsif (($start >= $largest)) {
- $i++;
- if ($end[$i] > $largest) {
- $largest = $end[$i];
- $largesti = $i;
- }
- if ($chr ne $chr[$i]) {
- die("1 - Out of chrom - $chr $start $end - $chr[$i] $start[$i] $end[$i] $i\n");
- }
- # Check if accession really does contain feature
- } elsif ($start[$i] > $end) {
- $i--;
- if ($i < $largesti) {
- if ($reset) {
- die("Doesn't line up - $chr $start $end - $chr[$i] $start[$i] $end[$i] $i\n");
- }
- $reset = 1;
- $i = $firsti;
- $largest = $end[$i];
- $largesti = $i;
- }
- if ($chr ne $chr[$i]) {
- die("2 - Out of chrom - $chr $start $end - $chr[$i] $start[$i] $end[$i] $i\n");
- }
-
- # If feature rrosses the border of more than one accession - find the largest overlap
- } else {
- while (($end[$i] > $start) && ($chr eq $chr[$i]) && ($i >= 0)) {
- $i--;
- }
- $i++;
- # Find largest overlap
- $overlap = 0;
- $acc = "";
- while (($start[$i] < $end) && ($chr eq $chr[$i]) && ($i < $index)) {
- $test = $end - $start + 1;
- if ($start < $start[$i]) {
- $test -= ($start[$i] - $start);
- }
- if ($end > $end[$i]) {
- $test -= ($end - $end[$i]);
- }
- if ($test > $overlap) {
- $overlap = $test;
- $acc = $acc[$i];
- }
- $i++;
- }
- if ($chr ne $chr[$i]) {
- $i--;
- }
- }
- }
- }
- # Print it out
- print OUT "$line\t$acc\n";
-}
-close(FILE);
-close(OUT);
-if (!$agp) {
- `rm $$.*`;
-}
-
-