src/hg/utils/automation/makeGenomeDb.pl 1.26
1.26 2010/02/04 18:33:19 hiram
fixup usage of AGP files with comments and FASTA with Genbank ID lines
Index: src/hg/utils/automation/makeGenomeDb.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/makeGenomeDb.pl,v
retrieving revision 1.25
retrieving revision 1.26
diff -b -B -U 4 -r1.25 -r1.26
--- src/hg/utils/automation/makeGenomeDb.pl 20 Nov 2009 00:06:49 -0000 1.25
+++ src/hg/utils/automation/makeGenomeDb.pl 4 Feb 2010 18:33:19 -0000 1.26
@@ -408,17 +408,17 @@
if ($gotAgp) {
$bossScript->add(<<_EOF_
# Get sorted IDs from fasta sequence files:
set fastaIds = `mktemp -p /tmp makeGenomeDb.fastaIds.XXXXXX`
-$fcat $fastaFiles | grep '^>' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\
+$fcat $fastaFiles | grep '^>' | sed -e 's/^>.*gb\|/>/; s/\|.*//' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\
> \$fastaIds
# Get sorted "big" (1st column) and "little" (6th column) IDs from AGP files:
set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
-$acat $agpFiles | awk '{print \$1;}' | sort -u \\
+$acat $agpFiles | grep -v '^#' | awk '{print \$1;}' | sort -u \\
> \$agpBigIds
set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
-$acat $agpFiles | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
+$acat $agpFiles | grep -v '^#' | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
> \$agpLittleIds
# Compare fasta IDs to first and sixth columns of AGP:
set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l`
@@ -427,15 +427,16 @@
# If AGP "big" IDs match sequence IDs, use sequence as-is.
# If AGP "little" IDs match sequence IDs, or are a subset, assemble sequence with agpToFa.
if (\$diffLittleCount == 0) then
set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX`
- $acat $agpFiles > \$agpTmp
- $fcat $fastaFiles \\
+ $acat $agpFiles | grep -v '^#' > \$agpTmp
+ $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
| agpToFa -simpleMultiMixed \$agpTmp all stdout stdin \\
| faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
rm -f \$agpTmp
else if (\$diffBigCount == 0) then
- faToTwoBit -noMask $fastaFiles $chrM $db.unmasked.2bit
+ $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
+ | faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
else
echo "Error: IDs in fastaFiles ($fastaFiles)"
echo "do not perfectly match IDs in either the first or sixth columns of"
echo "agpFiles ($agpFiles)"
@@ -450,9 +451,10 @@
);
} else {
# No AGP -- just make an unmasked 2bit.
$bossScript->add(<<_EOF_
-faToTwoBit -noMask $fastaFiles $chrM $db.unmasked.2bit
+$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
+ faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
_EOF_
);
}
@@ -471,9 +473,9 @@
$bossScript->add(<<_EOF_
if (`wc -l < chrom.sizes` < 1000) then
# Install per-chrom .agp files for download.
- $acat $agpFiles \\
+ $acat $agpFiles | grep -v '^#' \\
| splitFileByColumn -col=1 -ending=.agp stdin $topDir -chromDirs
endif
_EOF_
);
@@ -559,9 +561,9 @@
# efficient to run this one chrom at a time. However, since the filenames
# are arbitrary, I'm not sure how to identify the pairings of AGP and fa
# files. So cat 'em all together and check everything at once:
-$acat $agpFiles | sort -k1,1 -k2n,2n > $allAgp
+$acat $agpFiles | grep -v '^#' | sort -k1,1 -k2n,2n > $allAgp
set result = `checkAgpAndFa $exclude $allAgp $db.unmasked.2bit | tail -1`
if ("\$result" != 'All AGP and FASTA entries agree - both files are valid') then