src/hg/utils/automation/parseLrgXml.pl 06d7be056190c14b85e71bc12523f18ea6815b5e

06d7be056190c14b85e71bc12523f18ea6815b5e
markd
  Mon Dec 7 00:50:29 2020 -0800
BLAT mmap index support merge with master

diff --git src/hg/utils/automation/parseLrgXml.pl src/hg/utils/automation/parseLrgXml.pl
index d33dd84..bb28d0f 100755
--- src/hg/utils/automation/parseLrgXml.pl
+++ src/hg/utils/automation/parseLrgXml.pl
@@ -1,39 +1,44 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/parseLrgXml.pl instead.
 
 use warnings;
 use strict;
 
 use Encode;  # Of the UTF-8 sort, not the ENCODE project. :)
 use XML::LibXML;
 use Data::Dumper; #print Data::Dumper->Dump([$dom], [qw(dom)]);
 
 my $assemblyPrefix = shift @ARGV;
+my $workdir = shift @ARGV;
+if (not defined $workdir) { $workdir = "./";}
 
 sub usage {
   my ($status) = @_;
   my $base = $0;
   $base =~ s/^(.*\/)?//;
   print STDERR "
-usage: $base assemblyPrefix
+usage: $base assemblyPrefix [pathToFiles]
     Parses LRG_*.xml files in current directory into BED (LRG regions)
     and genePred+Cdna+Pep (transcripts in LRG coordinates.)
     assemblyPrefix is something like \"GRCh37\" to match GRCh37.p5 etc.
 
+    If pathToFiles is given, LRG_*.xml files are read from that directory
+    instead; output files are always written to the current directory.
+
 ";
   exit $status;
 } # usage
 
 my %gbAccToHg19Alt = ( gl000250 => 'chr6_apd_hap1',
                        gl000251 => 'chr6_cox_hap2',
                        gl000252 => 'chr6_dbb_hap3',
                        gl000253 => 'chr6_mann_hap4',
                        gl000254 => 'chr6_mcf_hap5',
                        gl000255 => 'chr6_qbl_hap6',
                        gl000256 => 'chr6_ssto_hap7',
                        gl000257 => 'chr4_ctg9_hap1',
                        gl000258 => 'chr17_ctg5_hap1',
                      );
 
@@ -294,33 +299,33 @@
 
     if (@codingRegions) {
       my @ps = $codingRegions[0]->findnodes('translation');
       # I expect one for protein-coding genes, none for non-coding:
       print $pepF "$txName\t" . $ps[0]->findvalue('sequence') . "\n";
     }
   } # each transcript
 } # parseOneLrg
 
 ################################ MAIN #################################
 
 if (! $assemblyPrefix) {
   usage(1);
 }
 
-my @xmlFiles = <LRG_*.xml>;
+my @xmlFiles = <"$workdir/LRG_*.xml">;
 if (@xmlFiles == 0) {
-  warn "No LRG_*.xml files found in current directory -- need to cd somewhere?\n";
+  warn "No LRG_*.xml files found in $workdir -- wrong pathToFiles, or need to cd somewhere?\n";
 }
 
 open(my $bedF, "| sort -k1,1 -k2n,2n >lrg.bed")
   || die "Can't open lrg.bed for (sorting and) writing): $!\n";
 open(my $gpF, "| sort -k2,2 -k4n,4n >lrgTranscriptsUnmapped.gp")
   || die "Can't open lrgTranscriptsUnmapped.gp for (sorting and) writing: $!\n";
 open(my $cdnaF, ">lrgCdna.tab") || die "Can't open lrgCdna.tab for writing: $!\n";
 open(my $pepF, ">lrgPep.tab") || die "Can't open lrgPep.tab for writing: $!\n";
 
 foreach my $file (@xmlFiles) {
   parseOneLrg($file, $bedF, $gpF, $cdnaF, $pepF);
 }
 
 close($bedF);
 close($gpF);