src/hg/utils/automation/makeGenomeDb.pl 1.26

1.26 2010/02/04 18:33:19 hiram
fixup usage of AGP files with comments and FASTA with Genbank ID lines
Index: src/hg/utils/automation/makeGenomeDb.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/makeGenomeDb.pl,v
retrieving revision 1.25
retrieving revision 1.26
diff -b -B -U 4 -r1.25 -r1.26
--- src/hg/utils/automation/makeGenomeDb.pl	20 Nov 2009 00:06:49 -0000	1.25
+++ src/hg/utils/automation/makeGenomeDb.pl	4 Feb 2010 18:33:19 -0000	1.26
@@ -408,17 +408,17 @@
   if ($gotAgp) {
     $bossScript->add(<<_EOF_
 # Get sorted IDs from fasta sequence files:
 set fastaIds = `mktemp -p /tmp makeGenomeDb.fastaIds.XXXXXX`
-$fcat $fastaFiles | grep '^>' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\
+$fcat $fastaFiles | grep '^>' | sed -e 's/^>.*gb\|/>/; s/\|.*//' | perl -wpe 's/^>(\\S+).*/\$1/' | sort \\
   > \$fastaIds
 
 # Get sorted "big" (1st column) and "little" (6th column) IDs from AGP files:
 set agpBigIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
-$acat $agpFiles | awk '{print \$1;}' | sort -u \\
+$acat $agpFiles | grep -v '^#' | awk '{print \$1;}' | sort -u \\
   > \$agpBigIds
 set agpLittleIds = `mktemp -p /tmp makeGenomeDb.agpIds.XXXXXX`
-$acat $agpFiles | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
+$acat $agpFiles | grep -v '^#' | awk '((\$5 != "N") && (\$5 != "U")) {print \$6;}' | sort -u \\
   > \$agpLittleIds
 
 # Compare fasta IDs to first and sixth columns of AGP:
 set diffBigCount = `comm -3 \$fastaIds \$agpBigIds | wc -l`
@@ -427,15 +427,16 @@
 # If AGP "big" IDs match sequence IDs, use sequence as-is.
 # If AGP "little" IDs match sequence IDs, or are a subset, assemble sequence with agpToFa.
 if (\$diffLittleCount == 0) then
   set agpTmp = `mktemp -p /tmp makeGenomeDb.agp.XXXXXX`
-  $acat $agpFiles > \$agpTmp
-  $fcat $fastaFiles \\
+  $acat $agpFiles | grep -v '^#' > \$agpTmp
+  $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
   | agpToFa -simpleMultiMixed \$agpTmp all stdout stdin \\
   | faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
   rm -f \$agpTmp
 else if (\$diffBigCount == 0) then
-  faToTwoBit -noMask $fastaFiles $chrM $db.unmasked.2bit
+  $fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
+  | faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
 else
   echo "Error: IDs in fastaFiles ($fastaFiles)"
   echo "do not perfectly match IDs in either the first or sixth columns of"
   echo "agpFiles ($agpFiles)"
@@ -450,9 +451,10 @@
     );
   } else {
     # No AGP -- just make an unmasked 2bit.
     $bossScript->add(<<_EOF_
-faToTwoBit -noMask $fastaFiles $chrM $db.unmasked.2bit
+$fcat $fastaFiles | sed -e 's/^>.*gb\|/>/; s/\|.*//' \\
+    faToTwoBit -noMask stdin $chrM $db.unmasked.2bit
 _EOF_
     );
   }
 
@@ -471,9 +473,9 @@
     $bossScript->add(<<_EOF_
 
 if (`wc -l < chrom.sizes` < 1000) then
   # Install per-chrom .agp files for download.
-  $acat $agpFiles \\
+  $acat $agpFiles | grep -v '^#' \\
   | splitFileByColumn -col=1 -ending=.agp stdin $topDir -chromDirs
 endif
 _EOF_
       );
@@ -559,9 +561,9 @@
 # efficient to run this one chrom at a time.  However, since the filenames
 # are arbitrary, I'm not sure how to identify the pairings of AGP and fa
 # files.  So cat 'em all together and check everything at once:
 
-$acat $agpFiles | sort -k1,1 -k2n,2n > $allAgp
+$acat $agpFiles | grep -v '^#' | sort -k1,1 -k2n,2n > $allAgp
 
 set result = `checkAgpAndFa $exclude $allAgp $db.unmasked.2bit | tail -1`
 
 if ("\$result" != 'All AGP and FASTA entries agree - both files are valid') then