src/hg/utils/automation/makeGenomeDb.pl 1.25

1.25 2009/11/20 00:06:49 galt
Add the subsetLittleIds option: the AGP is accepted if all of its IDs in column 6 are found in the FASTA files (a subset, rather than an exact match).
Index: src/hg/utils/automation/makeGenomeDb.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/makeGenomeDb.pl,v
retrieving revision 1.24
retrieving revision 1.25
diff -b -B -U 4 -r1.24 -r1.25
--- src/hg/utils/automation/makeGenomeDb.pl	7 Aug 2009 21:13:12 -0000	1.24
+++ src/hg/utils/automation/makeGenomeDb.pl	20 Nov 2009 00:06:49 -0000	1.25
@@ -155,8 +155,13 @@
 
 mitoSize N
   - to override the internal default of max size for mitochondrial
     sequence of $maxMitoSize e.g. for yeast: mitoSize 90000
+    
+subsetLittleIds Y      
+  - ok if agp little ids (col6) are a subset of fasta sequences
+    rather than requiring an exact match 
+
 " if ($detailed);
   print STDERR "\n";
   exit $status;
 } # usage
@@ -179,9 +184,9 @@
 # Conditionally required config parameters:
 my ($fakeAgpMinContigGap, $fakeAgpMinScaffoldGap,
     $clade, $genomeCladePriority);
 # Optional config parameters:
-my ($commonName, $agpFiles, $qualFiles, $mitoSize);
+my ($commonName, $agpFiles, $qualFiles, $mitoSize, $subsetLittleIds);
 # Other globals:
 my ($gotMito, $gotAgp, $gotQual, $topDir, $chromBased);
 my ($bedDir, $scriptDir, $endNotes);
 
@@ -258,8 +263,9 @@
   $commonName = &optionalVar('commonName', \%config);
   $agpFiles = &optionalVar('agpFiles', \%config);
   $qualFiles = &optionalVar('qualFiles', \%config);
   $mitoSize = &optionalVar('mitoSize', \%config);
+  $subsetLittleIds = &optionalVar('subsetLittleIds', \%config);
   # Make sure no unrecognized variables were given.
   my @stragglers = sort keys %config;
   if (scalar(@stragglers) > 0) {
     die "Error: config file $CONFIG has unrecognized variables:\n" .
@@ -382,8 +388,12 @@
   # care of its own splitting (better yet, make it use 2bit specs).
 
   my $acat = "cat";
   my $fcat = "cat";
+  my $sli = "";
+  if ($subsetLittleIds eq "Y") {
+    $sli = "-1 ";  
+  }
   foreach my $file (`ls $fastaFiles 2> /dev/null`) {
     if ($file =~ m/\.gz$/) {
       $fcat = "zcat";
       last;
@@ -406,17 +416,17 @@
 set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
 $acat $agpFiles | awk '{print \$1;}' | sort -u \\
   > \$agpBigIds
 set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
-$acat $agpFiles | awk '\$5 != "N" {print \$6;}' | sort -u \\
+$acat $agpFiles | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
   > \$agpLittleIds
 
 # Compare fasta IDs to first and sixth columns of AGP:
 set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l`
-set diffLittleCount = `comm -3 \$fastaIds \$agpLittleIds | wc -l`
+set diffLittleCount = `comm $sli-3 \$fastaIds \$agpLittleIds | wc -l`
 
 # If AGP "big" IDs match sequence IDs, use sequence as-is.
-# If AGP "little" IDs match sequence IDs, assemble sequence with agpToFa.
+# If AGP "little" IDs match sequence IDs, or are a subset, assemble sequence with agpToFa.
 if (\$diffLittleCount == 0) then
   set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX`
   $acat $agpFiles > \$agpTmp
   $fcat $fastaFiles \\