7a5cff1bced06596181624623f86a803c4be59f6 hiram Mon Aug 12 14:00:53 2019 -0700 correct searching enabled on xenoRefGene track refs #23734 diff --git src/hg/utils/automation/xenoRefGeneIx.pl src/hg/utils/automation/xenoRefGeneIx.pl new file mode 100755 index 0000000..adf9bee --- /dev/null +++ src/hg/utils/automation/xenoRefGeneIx.pl @@ -0,0 +1,119 @@ +#!/usr/bin/env perl + +# construct ix.txt index from a xenoRefGene bigGenePred file +# index key is the name of the gene from column 4 of the bigGenePred +# alias names for that are constructed: +# the name without any .nnn suffix +# name2 if present, and name2 without .nnn suffix +# geneName if present, and geneName without .nnn suffix +# and geneName2 to allow searching the species of origin for the mRNA + +# table bigGenePred +# "bigGenePred gene models" +# ( +# string chrom; "Reference sequence chromosome or scaffold" +# uint chromStart; "Start position in chromosome" +# uint chromEnd; "End position in chromosome" +# 4 # string name; "Name or ID of item, ideally both human readable and unique" +# uint score; "Score (0-1000)" +# char[1] strand; "+ or - for strand" +# uint thickStart; "Start of where display should be thick (start codon)" +# uint thickEnd; "End of where display should be thick (stop codon)" +# uint reserved; "RGB value (use R,G,B string in input file)" +# int blockCount; "Number of blocks" +# int[blockCount] blockSizes; "Comma separated list of block sizes" +# int[blockCount] chromStarts; "Start positions relative to chromStart" +# 13 # string name2; "Alternative/human readable name" +# string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)" +# string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)" +# int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon" +# string type; "Transcript type" +# 18 # string geneName; "Primary identifier for gene" +# 19 # string geneName2; "species of origin of the mRNA" +# string geneType; "Gene type" +# ) + + +use strict; +use warnings; + +my $argc = scalar(@ARGV); + +if ($argc != 1 ) { + printf STDERR "usage: xenoRefGeneIx.pl | sort -u > ix.txt\n"; + printf STDERR "then run ixIxx on ix.txt:\n"; + printf STDERR " ixIxx ix.txt out.ix out.ixx\n"; + exit 255; +} + +# some of the Arabidopsis thaliana genes are coming in as pairs +# of different names separated by '; ', process both names + +sub processOne($$) { + my ($nameHash, $someName) = @_; + if ($someName =~ m/; /) { + my ($name1, $name2) = split('; ', $someName); + $nameHash->{$name1} += 1; + $nameHash->{$name2} += 1; + $nameHash->{lc($name1)} += 1; + $nameHash->{lc($name2)} += 1; + my $noSuffix = $name1; + $noSuffix =~ s/\.[0-9][0-9]*$//; + $nameHash->{$noSuffix} += 1; + $nameHash->{lc($noSuffix)} += 1; + $noSuffix = $name2; + $noSuffix =~ s/\.[0-9][0-9]*$//; + $nameHash->{$noSuffix} += 1; + $nameHash->{lc($noSuffix)} += 1; + } elsif ($someName =~ m/\s/) { + $nameHash->{$someName} += 1; + $nameHash->{lc($someName)} += 1; + my @names = split('\s+', $someName); + foreach my $name (@names) { + $nameHash->{$name} += 1; + $nameHash->{lc($name)} += 1; + my $noSuffix = $name; + $noSuffix =~ s/\.[0-9][0-9]*$//; + $nameHash->{$noSuffix} += 1; + $nameHash->{lc($noSuffix)} += 1; + } + } else { + $nameHash->{$someName} += 1; + $nameHash->{lc($someName)} += 1; + my $noSuffix = $someName; + $noSuffix =~ s/\.[0-9][0-9]*$//; + $nameHash->{$noSuffix} += 1; + $nameHash->{lc($noSuffix)} += 1; + } +} + +my $gpFile = shift; + +if ($gpFile =~ m/.gz$/) { + open (FH, "zcat $gpFile|") or die "ERROR: xenoRefGeneIx.pl can not read '$gpFile'"; +} else { + open (FH, "<$gpFile") or die "ERROR: xenoRefGeneIx.pl can not read '$gpFile'"; +} +while (my $line = ) { + next if ($line =~ m/^#/); + chomp ($line); + my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $reserved, $blockCount, $blockSizes, $chromStarts, $name2, $cdsStartStat, $cdsEndStat, $exonFrames, $type, $geneName, $geneName2, $geneType) = split('\t', $line); + my %allNames; # key is name, value is count of times seen + processOne(\%allNames, $name) if (length($name)); + processOne(\%allNames, $name2) if (length($name)); + processOne(\%allNames, $geneName) if (length($geneName)); + processOne(\%allNames, $geneName2) if (length($geneName2)); + my $howMany = 0; + foreach my $someName (keys %allNames) { + $howMany += 1; + } + if ($howMany > 1) { + printf "%s", $name; + foreach my $someName (sort keys %allNames) { + next if ($someName =~ m/\Q$name\E/i); + printf "\t%s", $someName; + } + printf "\n"; + } +} +close (FH);