#!/usr/bin/env perl

# Provenance: commit 05219e2ffe569b3403687b96701006e2ff16116d
#   (hiram, Tue Jul 23 13:13:17 2019 -0700), refs #23734:
#   "creating index from genePred file for assembly ncbiGene track searching"
#   file: src/hg/utils/automation/gpToIx.pl
#
# construct ix.txt index from a genePred file
# index key is the name of the gene from column 1 of the genePred
# alias names for that are constructed:
#   the name without any .nnn suffix
#   name2 if present, and name2 without .nnn suffix
#
# Output (STDOUT): one line per genePred row that has at least one alias:
#   name <tab> alias [<tab> alias ...]
# Rows with no alias produce no output.  Lines starting with '#' are skipped.

use strict;
use warnings;

# Return $id with a trailing ".<digits>" version suffix removed.
# An $id without such a suffix is returned unchanged.
sub stripVersion {
  my ($id) = @_;
  $id =~ s/\.[0-9]+$//;
  return $id;
}

my $argc = scalar(@ARGV);

if ($argc != 1) {
  print STDERR "usage: gpToIx.pl <file.gp[.gz]> | sort -u > ix.txt\n";
  print STDERR "then run ixIxx on ix.txt:\n";
  print STDERR " ixIxx ix.txt out.ix out.ixx\n";
  exit 255;
}

my $gpFile = shift;

# Lexical handle with 3-arg / list-form open: the file name is never
# interpreted as an open mode nor passed through a shell.
my $fh;
if ($gpFile =~ m/\.gz$/) {
  open($fh, '-|', 'zcat', $gpFile)
    or die "ERROR: gpToIx.pl can not read '$gpFile': $!";
} elsif ($gpFile eq '-') {
  # the former two-argument open read STDIN for '-', keep that working
  $fh = \*STDIN;
} else {
  open($fh, '<', $gpFile)
    or die "ERROR: gpToIx.pl can not read '$gpFile': $!";
}

while (my $line = <$fh>) {
  next if ($line =~ m/^#/);
  chomp($line);
  # genePred columns: name is column 1, name2 is column 12 (extended
  # genePred only, hence possibly undefined); other columns are not needed
  my ($name, $name2) = (split(/\s+/, $line))[0, 11];
  # blank lines and lines with leading white space have no usable key
  next if (!defined($name) || length($name) < 1);

  my @aliases;
  my $noSuffix = stripVersion($name);
  push(@aliases, $noSuffix)
    if (($noSuffix ne $name) && (length($noSuffix) > 0));

  # name2 is only useful when it is not already found (case-insensitive)
  # inside the primary name
  if (defined($name2) && ($name !~ m/\Q$name2\E/i)) {
    push(@aliases, $name2);
    $noSuffix = stripVersion($name2);
    push(@aliases, $noSuffix)
      if (($noSuffix ne $name2) && (length($noSuffix) > 0));
  }

  printf "%s\t%s\n", $name, join("\t", @aliases) if (scalar(@aliases) > 0);
}

# close() on a pipe reports a failed zcat (e.g. corrupt .gz); say so rather
# than silently leaving a short index.  Exit status is left unchanged.
if (!close($fh)) {
  my $why = $! ? "$!" : sprintf("zcat exit status %d", $? >> 8);
  warn "WARNING: gpToIx.pl problem closing '$gpFile': $why\n";
}