#!/usr/bin/env perl

#--------------------------------------------------------------------------
#
# parseGeneMap.pl
#
# Provenance: commit 4389d5f48220ed09aff243f8ff13b064e7dc7e0b
# (braney, Wed Feb 22 17:32:10 2012 -0800):
# "moving various pieces of code to build the omim table into one place
# #6943"; new file src/utils/omim/parseGeneMap.pl.
#
#--------------------------------------------------------------------------

use strict;
use warnings;


#--------------------------------------------------------------------------
#
# Documentation
#
#   ./parseGeneMap.pl --gene-map-file=genemap.txt
#
# Reads a '|'-delimited gene map file (up to 18 fields per line) and
# writes one tab-separated record per phenotype to STDOUT:
#
#   mimNumber <TAB> phenotype <TAB> phenotypeMimNumber <TAB> phenotypeMappingKey
#
# The mim number is taken from field 9 (0-based); the phenotype text is
# fields 13, 14 and 15 joined with single spaces, then split on ';'.
#


#--------------------------------------------------------------------------
#
# Required packages
#


#--------------------------------------------------------------------------
#
# Package Constants
#

BEGIN {

    # Set the locale to utf8
    $ENV{'LC_ALL'}   = 'en_US.UTF-8';
    $ENV{'LANGUAGE'} = 'en_US.UTF-8';
    $ENV{'LANG'}     = 'en_US.UTF-8';

}


#--------------------------------------------------------------------------
#
# Function: main()
#
# Purpose: Parse the command line, read the gene map file and print one
#          tab-separated line per phenotype to the currently selected
#          output handle (STDOUT by default).
#
# Called by: the script entry point at the bottom of this file
#
# Parameters: @args - command line arguments:
#                       --gene-map-file=path   gene map file to read
#                       --help | -?            print usage and stop
#
# Global Variables: none
#
# Returns: process exit status: 0 on success (or after printing help),
#          1 on an invalid command line option.
#          Dies if the path is empty or the file cannot be opened.
#
sub main {

    my @args = @_;

    # Number of '|'-separated fields expected on each line
    my $fieldCount = 18;

    # Gene map file path
    my $defaultGeneMapFilePath = './genemap.txt';
    my $geneMapFilePath        = $defaultGeneMapFilePath;


    # Check for command parameters
    foreach my $option (@args) {

        if ( $option =~ /^--gene-map-file=(.*)$/i ) {
            $geneMapFilePath = $1;
        }
        # The '?' must be escaped: an unescaped '-?' means "optional
        # hyphen" and would never match the documented '-?' switch.
        elsif ( $option =~ /^(?:--help|-\?)$/ ) {
            _printUsage($defaultGeneMapFilePath);
            # Asking for help must not also start processing a file
            return 0;
        }
        else {
            # Report on STDERR and fail, so callers can detect the mistake
            # and the message does not end up mixed into the data output.
            printf STDERR "Error: Invalid action: '%s', type '%s --help' for help.\n", $option, $0;
            return 1;
        }
    }


    # Check the gene map file path
    if ( !defined($geneMapFilePath) || $geneMapFilePath eq '' ) {
        die "Undefined gene map file path";
    }


    # Open the gene map file (three-argument open, read-only, so that the
    # path can never be interpreted as an open mode or a command)
    open( my $geneMapFile, '<', $geneMapFilePath )
        or die "Failed to open the gene map file: '$geneMapFilePath': $!";


    # Read the gene map file
    while ( my $line = <$geneMapFile> ) {

        chomp $line;

        # Split the fields
        my @fields = split( /\|/, $line, $fieldCount );

        # Pad short (or blank) lines with empty fields so that the code
        # below never touches an undefined value.
        push @fields, ('') x ( $fieldCount - scalar(@fields) );

        my $mimNumber = $fields[9];

        # Re-assemble the phenotypes, which are spread over three fields,
        # and clean the result
        my $phenotypes = _cleanText( join( ' ', @fields[ 13 .. 15 ] ) );

        # Split out the phenotypes
        foreach my $rawPhenotype ( split( /;\s*/, $phenotypes ) ) {

            my ( $phenotype, $phenotypeMimNumber, $phenotypeMappingKey )
                = _parsePhenotype($rawPhenotype);

            # Copy the mim number to the phenotype mim number for phenotype
            # mapping key == 2 (disabled in the original script)
#           if ( $phenotypeMappingKey == 2 ) {
#               $phenotypeMimNumber = $mimNumber;
#           }

            # Write out the data
            printf( "%s\t%s\t%s\t%s\n",
                $mimNumber, $phenotype, $phenotypeMimNumber, $phenotypeMappingKey );

        }

    }


    # Close the gene map file
    close($geneMapFile);

    return 0;

}


#--------------------------------------------------------------------------
#
# Function: _printUsage()
#
# Purpose: Print the usage message to the currently selected output handle.
#
# Parameters: $defaultPath - default gene map file path shown in the message
#
# Returns: void
#
sub _printUsage {

    my ($defaultPath) = @_;

    printf("Usage:\n");
    print("\t[--help|-?] print out this message.\n");
    printf("\n");
    printf("\t[--gene-map-file=path] gene map file path, defaults to: '%s'.\n", $defaultPath);
    printf("\n");

    return;

}


#--------------------------------------------------------------------------
#
# Function: _parsePhenotype()
#
# Purpose: Split a single phenotype entry into its text, its six digit
#          phenotype mim number and its one digit mapping key.
#
# Parameters: $phenotype - one ';'-separated phenotype entry
#
# Returns: list of ( text, phenotype mim number, phenotype mapping key );
#          the number and the key are empty strings when not present.
#
sub _parsePhenotype {

    my ($phenotype) = @_;

    my $phenotypeMimNumber  = '';
    my $phenotypeMappingKey = '';

    # Extract the phenotype mim number (first run of six digits) and remove it
    if ( $phenotype =~ s/(\d{6})// ) {
        $phenotypeMimNumber = $1;
    }

    # Extract the phenotype mapping key, a single digit in parentheses, and
    # keep only the text in front of it. The greedy '.*' selects the last
    # such group in the entry.
    if ( $phenotype =~ /(.*)\((\d)\)/ ) {
        $phenotypeMappingKey = $2;
        $phenotype           = $1;
    }

    # Strip trailing comma from the phenotype
    $phenotype =~ s/,\s*$//;

    # Final cleaning
    $phenotype = _cleanText($phenotype);

    return ( $phenotype, $phenotypeMimNumber, $phenotypeMappingKey );

}


#--------------------------------------------------------------------------
#
# Function: _cleanText()
#
# Purpose: Normalise white space in a string: every run of white space
#          (including new lines) becomes a single space, and leading and
#          trailing white space is removed.
#
# Parameters: $text - the text to clean
#
# Returns: the cleaned text, or undef if $text is undefined
#
sub _cleanText {

    my ($text) = @_;

    # Check the parameter
    return if !defined($text);

    # Replace new lines with spaces and deduplicate spaces
    # (\s already covers \n and \r)
    $text =~ s/\s+/ /g;

    # Trim leading and trailing spaces
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return $text;

}


#--------------------------------------------------------------------------
#
# Script entry point: only run when executed directly, so that the
# functions above can be loaded (e.g. by a test) without side effects.
#

exit( main(@ARGV) ) unless caller();

1;

#--------------------------------------------------------------------------