src/hg/utils/automation/makeGenomeDb.pl 1.25
1.25 2009/11/20 00:06:49 galt
Adding the subsetLittleIds option: the AGP is accepted if all of its IDs in column 6 are found in the FASTA files.
Index: src/hg/utils/automation/makeGenomeDb.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/makeGenomeDb.pl,v
retrieving revision 1.24
retrieving revision 1.25
diff -b -B -U 4 -r1.24 -r1.25
--- src/hg/utils/automation/makeGenomeDb.pl 7 Aug 2009 21:13:12 -0000 1.24
+++ src/hg/utils/automation/makeGenomeDb.pl 20 Nov 2009 00:06:49 -0000 1.25
@@ -155,8 +155,13 @@
mitoSize N
- to override the internal default of max size for mitochondrial
sequence of $maxMitoSize e.g. for yeast: mitoSize 90000
+
+subsetLittleIds Y
+ - ok if agp little ids (col6) are a subset of fasta sequences
+ rather than requiring an exact match
+
" if ($detailed);
print STDERR "\n";
exit $status;
} # usage
@@ -179,9 +184,9 @@
# Conditionally required config parameters:
my ($fakeAgpMinContigGap, $fakeAgpMinScaffoldGap,
$clade, $genomeCladePriority);
# Optional config parameters:
-my ($commonName, $agpFiles, $qualFiles, $mitoSize);
+my ($commonName, $agpFiles, $qualFiles, $mitoSize, $subsetLittleIds);
# Other globals:
my ($gotMito, $gotAgp, $gotQual, $topDir, $chromBased);
my ($bedDir, $scriptDir, $endNotes);
@@ -258,8 +263,9 @@
$commonName = &optionalVar('commonName', \%config);
$agpFiles = &optionalVar('agpFiles', \%config);
$qualFiles = &optionalVar('qualFiles', \%config);
$mitoSize = &optionalVar('mitoSize', \%config);
+ $subsetLittleIds = &optionalVar('subsetLittleIds', \%config);
# Make sure no unrecognized variables were given.
my @stragglers = sort keys %config;
if (scalar(@stragglers) > 0) {
die "Error: config file $CONFIG has unrecognized variables:\n" .
@@ -382,8 +388,12 @@
# care of its own splitting (better yet, make it use 2bit specs).
my $acat = "cat";
my $fcat = "cat";
+ my $sli = "";
+ if ($subsetLittleIds eq "Y") {
+ $sli = "-1 ";
+ }
foreach my $file (`ls $fastaFiles 2> /dev/null`) {
if ($file =~ m/\.gz$/) {
$fcat = "zcat";
last;
@@ -406,17 +416,17 @@
set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
$acat $agpFiles | awk '{print \$1;}' | sort -u \\
> \$agpBigIds
set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
-$acat $agpFiles | awk '\$5 != "N" {print \$6;}' | sort -u \\
+$acat $agpFiles | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
> \$agpLittleIds
# Compare fasta IDs to first and sixth columns of AGP:
set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l`
-set diffLittleCount = `comm -3 \$fastaIds \$agpLittleIds | wc -l`
+set diffLittleCount = `comm $sli-3 \$fastaIds \$agpLittleIds | wc -l`
# If AGP "big" IDs match sequence IDs, use sequence as-is.
-# If AGP "little" IDs match sequence IDs, assemble sequence with agpToFa.
+# If AGP "little" IDs match sequence IDs, or are a subset, assemble sequence with agpToFa.
if (\$diffLittleCount == 0) then
set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX`
$acat $agpFiles > \$agpTmp
$fcat $fastaFiles \\