e00236df7c5e59c822460caa58e662972af379cf hiram Fri Mar 13 10:02:00 2020 -0700 the quotes should be 'single' quotes refs #25090 diff --git src/hg/utils/phyloTrees/binaryTree.pl src/hg/utils/phyloTrees/binaryTree.pl index 004bdbb..8f3bf04 100755 --- src/hg/utils/phyloTrees/binaryTree.pl +++ src/hg/utils/phyloTrees/binaryTree.pl @@ -1,417 +1,417 @@ #!/usr/bin/env perl # convert NCBI taxonomy 'phylip' file.phy into a binary newick .nh tree # These NCBI outputs are from the tool: # https://www.ncbi.nlm.nih.gov/Taxonomy/CommonTree/wwwcmt.cgi # this type of output is from the function: 'Save as' 'phylip tree' # Input file format: It is actually newick format output, however it is # conveniently organized for parsing line by line instead of # having to scan input character by character. Each line is one element of # the graph: # 1. lines that are only left paren: ( start a new branch to the tree # 2. lines that start: ')name:length' are internal node names # and close a branch, # 3. each leaf of the tree is a line without ( or ) # they have a name in 'single quotes' and a :length 'distance' # the spaces in the name will be replaced by _ and the # 'single' quote marks removed. Use :length if given or add :0.1 # NCBI appears to output all lengths as '4' # 4. For our purpose here, the commas in the input can be ignored. # They do appear correctly after node names or leaf element names to # indicate a sister element follows this element. There is no comma # when it is the final element in a branch and or leaf with no sister # # The NCBI tree is often polytomy, for our purpose it needs to be converted # to a binary tree. The conversion to binary is done simply making binary # nodes out of the polytomy elements, thereby pushing the sister elements # into branches below the polytomy elements. # parsing and node construction: # each node in the tree is one of two possible types: # 1. a 'branch' with right and left pointers to other elements. # can have an internal node name: name:distance # 2. a 'leaf' element with a name:distance # # Start with 'root' node as an empty current node # There is a 'nextLeft' boolean that means the next element of this node # goes on the left, otherwise new items are always placed on the right. # This boolean helps rectify polytomys into binary branches. # 1. encounter open paren ( operation: # This starts a new 'branch' node. It is inserted as the right # pointer of the current node when nextLeft FALSE # then nextLeft set to TRUE, else it goes into the left pointer # and nextLeft stays at TRUE. This new 'branch' node # becomes the current node. Polytomy is detected when # the left pointer is already occupied. This left pointer moves to # the new node right pointer, and this new node becomes the 'current' # node. nextLeft is set TRUE for this new node since right is taken. # 2. encounter: 'string':length # This becomes a new 'leaf' node, 'name' and 'distance' are set. # Inserted on right pointer of current node when nextLeft FALSE # and nextLeft set to TRUE, else inserted on left pointer, # nextLeft stays at TRUE. # If nextLeft is TRUE and the left pointer is # already in use, this is a polytomy which creates a new 'branch' # node inserted into the left pointer, the left elements move down # as the right pointer of the new branch node, and this leaf # becomes the left pointer element. This new branch node becomes # the current. nextLeft stays at TRUE. # 3. encounter )'string':length # This 'string' becomes the internal node 'name' of this branch, and # the distance is recorded. The 'parent' of this branch becomes # the 'current' node. # 4 effectively ignoring the commas. They are supposed to mean the next # element goes into the left, but we are keeping track of this # with the nextLeft boolean. The polytomys are a series of elements # more than two, with commas at the ends of the element names # # Tree print: recursive function: # 1. if right pointer exists output ( and print right tree then # if left pointer, output )nodeName:length, and print left tree # 2. no pointers, this is a leaf, print name:length when parent right # pointer got here, else print ,name:length when left pointer got here use strict; use warnings; use Getopt::Long; ############################################################################## sub usage() { printf STDERR "usage: binaryTree.pl [options] file.phy options: -noInternal - do not output internal node names -defaultDistance=0.1 - use this distance when not given in input -allDistances=0.1 - use this distance for everything, default use input -lineOutput - output one line per leaf output, indented per depth - -quoteNames - add \"quotes\" on node names, default not quoted + -quoteNames - add 'quotes' on node names, default not quoted -nameTranslate=<file> - two column file, translate names from input file, first column is name in input file, second column is output name tab separation columns -verbose=N - specify verbose debug printout, 0 nothing, 1 a bit, 2 more, etc reads 'phylip' file format from NCBI taxonomy and outputs binary newick tree format, resolving the polytomys common to NCBI output format. Output is to 'stdout'.\n"; exit 255; } ############################################################################## # globals and options my $noInternal = 0; # option -noInternal - do not output internal node names my $defaultDistance = "0.1"; # to set distances when not given in input my $verbose = 0; # verbose debug level, integer my $allDistances = ""; # to set all distances to this value, default use input my $lineOutput = 0; # one line per leaf output format my $quoteNames = 0; # add "quotes" on node names my $nameTranslate = ""; # two column tab separated: inputName<tab>outputName my %translateName; # key is input name, value is output name # establish empty root branch parent my %root; my $root = \%root; # pointer handle to root node $root->{'parent'} = undef; $root->{'right'} = undef; $root->{'left'} = undef; $root->{'name'} = 'root'; $root->{'distance'} = $defaultDistance; $root->{'nextLeft'} = 0; # starts out false # the following two are only on this root node for global bookeeping $root->{'branchCount'} = 0; $root->{'leafCount'} = 0; ########################################################################## # forward declaration for recursive function sub printTree($); my $printDepth = 0; my $printNewLine = 0; # Tree print: # 1. if this node is not a leaf: # a. print ( and print right tree # b. print , and print left tree # c. print ) # 2. then print the name:distance of this node # # the if defines were added to the prints "(" "," and ")" since it # was printing too many of those things. It seems to work OK. # this algorithm was found in the paper: # http://people.sc.fsu.edu/~pbeerli/BSC-5936/08-29-05/Lecture_Notes_1.pdf # algorithm 3 page 17 sub printTree($) { my ($node) = @_; if ($lineOutput && $printNewLine) { for (my $i =0; $i < ($printDepth-2); ++$i) { printf " "; } $printNewLine = 0; } ++$printDepth; if (! $node->{'isLeaf'}) { # not a leaf, follow pointers right,left printf "(" if defined($node->{'left'}); if (defined($node->{'right'})) { printTree($node->{'right'}); printf "," if (defined($node->{'left'})); if ($lineOutput && defined($node->{'left'})) { printf "\n"; $printNewLine = 1; } } printTree($node->{'left'}) if (defined($node->{'left'})); printf ")" if defined($node->{'left'}); } my $distOut = sprintf("%.9f", $node->{'distance'}); $distOut =~ s/0+$//g; $distOut = "0.000001" if ($distOut eq "0."); if ( $node->{'isLeaf'} ) { if ($quoteNames) { - printf "\"%s\":%s", $node->{'name'}, $distOut; + printf "'%s':%s", $node->{'name'}, $distOut; } else { printf "%s:%s", $node->{'name'}, $distOut; } } elsif ( ! $noInternal) { if ($quoteNames) { - printf "\"%s\":%s", $node->{'name'}, $distOut; + printf "'%s':%s", $node->{'name'}, $distOut; } else { printf "%s:%s", $node->{'name'}, $distOut; } } else { printf ":%s", $distOut; } $printDepth -= 1; } # sub printTree($) ############################################################################## # start a new node element sub newNode($$$$$$) { my ($parent, $right, $left, $name, $distance, $nextLeft) = @_; my %node; $node{'parent'} = $parent; $node{'right'} = $right; $node{'left'} = $left; $node{'name'} = $name; $node{'isLeaf'} = 0; $node{'distance'} = $distance; $node{'nextLeft'} = $nextLeft; return \%node; } # sub newNode($$$$$$) ############################################################################## # Note functional description above of how this insert functions to resolve # polytomys # to the current node, add a new node, # returns new node as current node sub addNewBranch($) { my ($curNode) = @_; $root->{'branchCount'} += 1; my $nodeName = sprintf("branch.%d", $root->{'branchCount'}); my $newNode = newNode($curNode, undef, undef, $nodeName, $defaultDistance, 0); if (0 == $curNode->{'nextLeft'}) { # when false, insert on right if (defined($curNode->{'right'})) { printf STDERR "# ERROR: addNewBranch: nextLeft is FALSE but right pointer is already used on '%s'\n", $curNode->{'name'}; printf STDERR "# not expecting this ?\n"; exit 255; } else { # good to go, newNode becomes right of current $newNode->{'parent'} = $curNode; $curNode->{'right'} = $newNode; $curNode->{'nextLeft'} = 1; printf STDERR "# add new node %s as right child of parent %s\n", $newNode->{'name'}, $curNode->{'name'} if ($verbose > 2); } } else { # nextLeft is TRUE, see if node is fully occupied == POLYTOMY ! if (defined($curNode->{'left'})) { # move entire curNode down my $parent = $curNode->{'parent'}; if (!defined($parent)) { # only root has no parent $root->{'branchCount'} += 1; my $tmpName = sprintf("branch.%d", $root->{'branchCount'}); printf STDERR "# create new node '%s' to take pointers from %s\n", $tmpName, $curNode->{'name'} if ($verbose > 2); # need to make up a new node to hold the two pointers from curNode, # and this new node becomes the right branch of this curNode, # thereby freeing up the left branch of curNode my $extraNode = newNode($curNode, $curNode->{'right'}, $curNode->{'left'}, $tmpName, $defaultDistance, 1); $curNode->{'right'} = $extraNode; $newNode->{'parent'} = $curNode; $curNode->{'left'} = $newNode; printf STDERR "# add new node %s as left child of parent %s\n", $newNode->{'name'}, $curNode->{'name'} if ($verbose > 2); return $newNode; # all done here, do nothing else } else { # left pointer already occupied == POLYTOMY ! printf STDERR "# polytomy parent '%s' of current node '%s'\n", $parent->{'name'}, $curNode->{'name'} if ($verbose > 2); if ($parent->{'right'} == $curNode) { printf STDERR "# gets new right child '%s'\n", $newNode->{'name'} if ($verbose > 2); $parent->{'right'} = $newNode; } elsif ($parent->{'left'} == $curNode) { printf STDERR "# gets new left child '%s'\n", $newNode->{'name'} if ($verbose > 2); $parent->{'left'} = $newNode; } else { printf STDERR "# what ? curNode '%s' has no pointer to it from parent '%s'\n", $curNode->{'name'}, $parent->{'name'}; exit 255; } printf STDERR "# previous child %s becomes right child of new node %s\n", $curNode->{'name'}, $newNode->{'name'} if ($verbose > 2); $newNode->{'right'} = $curNode; $newNode->{'parent'} = $curNode->{'parent'}; $curNode->{'parent'} = $newNode; $newNode->{'nextLeft'} = 1; } } else { # if (defined($curNode->{'left'})) == left is free $newNode->{'parent'} = $curNode; $curNode->{'left'} = $newNode; printf STDERR "# add new node %s as left child of parent %s\n", $newNode->{'name'}, $curNode->{'name'} if ($verbose > 2); } } # been managing leftNext TRUE return $newNode; } # sub addNewBranch($$) ############################################################################## # clean up a leaf name, or use a translated name if given ############################################################################## sub cleanLeafName($) { my ($cleanName) = @_; my ($leafName, $leafDistance) = split(':', $cleanName); $leafName =~ s/'//g; # eliminate 'single quotes' $leafName =~ s/,//g; # eliminate commas $leafName = "noName" if (length($leafName) < 1); $leafDistance = $defaultDistance if (!defined($leafDistance)); $leafDistance = $allDistances if (length($allDistances) > 0); $leafDistance =~ s/,//; # eliminate trailing comma if (defined($translateName{$leafName})) { $leafName = $translateName{$leafName}; } else { $leafName=~ s/\s+/_/g; # blanks to _ } return ($leafName, $leafDistance); } ############################################################################## # clean up a node name, or use a translated name if given ############################################################################## sub cleanNodeName($) { my ($cleanName) = @_; my ($nodeName, $nodeDistance) = split(':', $cleanName); $nodeName =~ s/\)//g; # eliminate right ) paren $nodeName =~ s/;//g; # eliminate ending semi-colon ; $nodeName =~ s/,//g; # eliminate commas $nodeName =~ s/'//g; # eliminate 'single quotes' $nodeName = "noName" if (length($nodeName) < 1); $nodeDistance = $defaultDistance if (!defined($nodeDistance)); $nodeDistance = $allDistances if (length($allDistances) > 0); $nodeDistance =~ s/,//; # eliminate trailing comma if (defined($translateName{$nodeName})) { $nodeName = $translateName{$nodeName}; } else { $nodeName =~ s/\s+/_/g; # blanks to _ } return ($nodeName, $nodeDistance); } ############################################################################## # main starts here ############################################################################## my $argc = scalar(@ARGV); if ($argc < 1) { usage; } GetOptions ("noInternal" => \$noInternal, "defaultDistance=f" => \$defaultDistance, "verbose=i" => \$verbose, "nameTranslate=s" => \$nameTranslate, "lineOutput" => \$lineOutput, "quoteNames" => \$quoteNames, "allDistances=f" => \$allDistances) or die "Error in command line arguments\n"; $defaultDistance = $allDistances if (length($allDistances)); printf STDERR "# noInternal: %s\n", $noInternal ? "TRUE" : "FALSE"; printf STDERR "# defaultDistance: %f\n", $defaultDistance; printf STDERR "# allDistances: %f\n", $allDistances if (length($allDistances)); printf STDERR "# nameTranslate from: %s\n", $nameTranslate if (length($nameTranslate)); printf STDERR "# lineOutput '%s'\n", $lineOutput ? "TRUE" : "FALSE"; printf STDERR "# quoteNames '%s'\n", $quoteNames ? "TRUE" : "FALSE"; printf STDERR "# verbose: %d\n", $verbose; if (length($nameTranslate)) { open (FH, "<$nameTranslate") or die "can not read nameTranslate file '$nameTranslate'"; while (my $line = <FH>) { chomp $line; my ($inName, $outName) = split('\t', $line); $translateName{$inName} = $outName; } close (FH); } my $phyFile = shift; my $currentNode = $root; printf STDERR "# reading %s\n", $phyFile if ($verbose > 0); ############################################################################# # processing the input file ############################################################################# open (FH, "<$phyFile") or die "can not read $phyFile"; while (my $line = <FH>) { chomp $line; $line =~ s/^\s+//; # eliminate leading space, just in case garbage $line =~ s/\s+$//; # eliminate trailing space, just in case garbage if ($line =~ m/^\(/) { $currentNode = addNewBranch($currentNode); } elsif ($line =~ m/^\)/) { # name:length usually follows ) to name this my ($nodeName, $nodeDistance) = cleanNodeName($line); # clean up the name printf STDERR "# node: '%s' becomes '%s:%s'\n", $currentNode->{'name'}, $nodeName, $nodeDistance if ($verbose > 2); $currentNode->{'name'} = $nodeName; $currentNode->{'distance'} = $nodeDistance; $currentNode = $currentNode->{'parent'} if (defined($currentNode->{'parent'})); printf STDERR "# new currentNode: '%s'\n", $currentNode->{'name'} if ($verbose > 2); } else { # encounter 'string:length,' See procedure description above my ($leafName, $leafDistance) = cleanLeafName($line); my $newNode = newNode($currentNode, undef, undef, $leafName, $leafDistance, 0); $newNode->{'isLeaf'} = 1; if ($currentNode->{'nextLeft'}) { if (defined($currentNode->{'left'})) { # POLYTOMY add new branch $currentNode = addNewBranch($currentNode); } $currentNode->{'left'} = $newNode; $newNode->{'parent'} = $currentNode; printf STDERR "# new leaf '%s' added to left parent '%s'\n", $newNode->{'name'}, $currentNode->{'name'} if ($verbose > 2); } else { # new leaf goes on right if (defined($currentNode->{'right'})) { printf STDERR "# ERROR: adding new leaf nextLeft is FALSE but right pointer is used on '%s:%s'\n", $currentNode->{'name'}, $currentNode->{'distance'}; printf STDERR "# not expecting this ?\n"; exit 255; } else { $currentNode->{'right'} = $newNode; $newNode->{'parent'} = $currentNode; $currentNode->{'nextLeft'} = 1; printf STDERR "# new leaf '%s' added to right parent '%s'\n", $newNode->{'name'}, $currentNode->{'name'} if ($verbose > 2); } } $root->{'leafCount'} += 1; } # been managing 'string':length leaf element } # while (my $line = <FH>) close (FH); ######################## output ########################################## printf STDERR "# tree print summary:\n" if ($verbose > 0); printf STDERR "# branch count: %d\n", $root->{'branchCount'} if ($verbose > 0); printf STDERR "# leaf count: %d\n", $root->{'leafCount'} if ($verbose > 0); printTree($root); printf ";\n"; # end of 'root' branch