src/hg/utils/automation/doEnsGeneUpdate.pl ecc8358a1e03a67aba51417799cca013f98eacc5

ecc8358a1e03a67aba51417799cca013f98eacc5
hiram
  Mon Feb 17 16:38:17 2020 -0800
correct construction of name index file from the gtfToGenePred infoOut.txt newer format refs #23891

diff --git src/hg/utils/automation/doEnsGeneUpdate.pl src/hg/utils/automation/doEnsGeneUpdate.pl
index a6a6445..6451c8c 100755
--- src/hg/utils/automation/doEnsGeneUpdate.pl
+++ src/hg/utils/automation/doEnsGeneUpdate.pl
@@ -1,903 +1,905 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doEnsGeneUpdate.pl instead.
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use EnsGeneAutomate;
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 use File::Basename;
 
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_ensVersion
     $opt_vegaGene
     $opt_buildDir
     $opt_chromSizes
     /;
 
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'process',   func => \&doProcess },
       { name => 'load',   func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
       { name => 'makeDoc', func => \&doMakeDoc },
     ]
 );
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $vegaSpecies = "human";
 my $vegaPep = "Homo_sapiens.VEGA";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 my (@versionList) = &EnsGeneAutomate::ensVersionList();
 my $versionListString = join(", ", @versionList);
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base -ensVersion=NN <db>.ensGene.ra
 required options:
   -ensVersion=NN - must specify desired Ensembl version
                  - possible values: $versionListString
   <db>.ensGene.ra - configuration file with database and other options
 
 other options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -vegaGene             Special processing for vegaGene instead of Ensembl
                           works only on Human, Mouse, Zebrafish
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ensGene.<ensVersion #>
                           (necessary if experimenting with builds).
     -chromSizes filePath  Use filePath for chrom.size file instead of database
                           chromInfo request
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'bigClusterHub' => '',
 						'smallClusterHub' => '');
   print STDERR "
 Automates UCSC's Ensembl gene updates.  Steps:
     download: fetch GFF and protein data files from Ensembl FTP site
     process: parse the GFF file, create coding and non-coding gff files
 		lift random contigs to UCSC random coordinates
     load: load the coding and non-coding tracks, and the proteins
     cleanup: Removes or compresses intermediate files.
     makeDoc: Displays the make doc text entry to stdout for the procedure
 	which would be done for this build.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ensGene.<ensVersion #> unless -buildDir is given.
 <db>.ensGene.ra describes the species, assembly and downloaded files.
 To see detailed information about what should appear in config.ra,
 run \"$base -help\".
 ";
   # Detailed help (-help):
   if ($detailed) {
   print STDERR <<_EOF_
 -----------------------------------------------------------------------------
 Required <db>.ensGene.ra settings:
 
 db xxxYyyN
   - The UCSC database name and assembly identifier.  xxx is the first three
     letters of the genus and Yyy is the first three letters of the species.
     N is the sequence number of this species' build at UCSC.  For some
     species that we have been processing since before this convention, the
     pattern is xyN.
 
 Optional <db>.ensGene.ra settings:
 
 liftRandoms yes
   - Need to lift Ensembl contig coordinates to UCSC _random coordinates.
   - Will use UCSC ctgPos information about contigs to perform the lift.
 nameTranslation "<sed translation pattern>"
   - sed translation statement to change Ensembl chrom numbers and MT to
     UCSC chrN and chrM chrom names.  Example:
     nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
 geneScaffolds yes
   - need to translate Ensembl GeneScaffold coordinates to UCSC scaffolds
     Will fetch and use the Ensembl MySQL tables seq_region and assembly
     to perform the translation
 skipInvalid yes
   - during the loading of the gene pred, skip all invalid genes
 haplotypeLift <path/to/file.lift>
   - a lift-down file for Ensembl full chromosome haplotype coordinates
   - to UCSC simple haplotype coordinates
 liftUp <path/to/ensGene.lft>
   - after everything else is done, lift the final gene pred via this
   - lift file.  Comes in handy if Ensembl chrom names are simply
   - different than UCSC chrom names.  For example in bushbaby, otoGar1
 liftMtOver <path/to/ensGene.Mt.overChain>
   - before exit process step, lift the gene pred in chrM  via this lift over 
   - file if Ensembl is using chrM sequence different from the sequence 
   - used by UCSC. For example in turkey, melGal1.
 
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
    \$db.2bit.
    (can be changed with the -chromSizes option)
 3. The \$db.2bit files have already been distributed to cluster-scratch
    (/scratch/hg or /iscratch, /san etc).
 4. anything else here ?
 _EOF_
   }
   print "\n";
   exit $status;
 }
 
 
 
 # Globals:
 my ($species, $ensGtfUrl, $ensGtfFile, $ensPepUrl, $ensPepFile,
     $ensMySqlUrl, $ensVersionDateReference, $previousEnsVersion, $buildDir,
     $chromSizes, $previousBuildDir, $vegaGene);
 # Command line argument:
 my $CONFIG;
 # Required command line arguments:
 my ($ensVersion);
 # Required config parameters:
 my ($db);
 # Conditionally required config parameters:
 my ($liftRandoms, $nameTranslation, $geneScaffolds, $knownToEnsembl,
     $skipInvalid, $haplotypeLift, $liftUp, $liftMtOver);
 # Other globals:
 my ($topDir, $chromBased);
 my ($bedDir, $scriptDir, $endNotes);
 my ($secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'ensVersion=s',
 		      'vegaGene',
 		      'buildDir=s',
 		      'chromSizes=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: load [dbHost]
 sub doLoad {
   my $runDir = "$buildDir";
   if (! -d "$buildDir/process" && ! $opt_debug) {
     die "ERROR: load: directory: '$buildDir/process' does not exist.\n" .
       "The process step appears to have not been done.\n" .
 	"Run with -continue=process before this step.\n";
   }
 
   my $whatItDoes = "load processed ensGene data into local database.";
   my $bossScript = new HgRemoteScript("$runDir/doLoad.csh", $dbHost,
 				      $runDir, $whatItDoes);
 
   my $thisGenePred = "$buildDir" . "/process/$db.allGenes.gp.gz";
   my $prevGenePred = "$previousBuildDir" . "/process/$db.allGenes.gp.gz";
   printf STDERR "# checking previous: $previousBuildDir\n";
   my $thisGtp = "$buildDir" . "/process/ensGtp.tab";
   my $prevGtp = "$previousBuildDir" . "/process/ensGtp.tab";
   my $thisGeneName = "$buildDir" . "/process/ensemblToGeneName.tab";
   my $prevGeneName = "$previousBuildDir" . "/process/ensemblToGeneName.tab";
   my $thisSource = "$buildDir" . "/process/ensemblSource.tab";
   my $prevSource = "$previousBuildDir" . "/process/ensemblSource.tab";
   my $identicalToPrevious = 1;
   if ( -f $prevGenePred && -f $prevGtp && -f $prevGeneName && -f $prevSource ) {
       my $thisSum = `zcat $thisGenePred | sort | sum`;
       chomp $thisSum;
       my $prevSum = `zcat $prevGenePred | sort | sum`;
       chomp $prevSum;
       printf STDERR "# genePred prev: $prevSum %s this: $thisSum\n",
          $prevSum eq $thisSum ? "==" : "!=";
       $identicalToPrevious = 0 if ($prevSum ne $thisSum);
 
       $thisSum = `sort $thisGtp | sum`;
       chomp $thisSum;
       $prevSum = `sort $prevGtp | sum`;
       chomp $prevSum;
       $identicalToPrevious = 0 if ($prevSum ne $thisSum);
       printf STDERR "# ensGtp prev: $prevSum %s this: $thisSum\n",
          $prevSum eq $thisSum ? "==" : "!=";
 
       $thisSum = `sort $thisGeneName | sum`;
       chomp $thisSum;
       $prevSum = `sort $prevGeneName | sum`;
       chomp $prevSum;
       $identicalToPrevious = 0 if ($prevSum ne $thisSum);
       printf STDERR "# ensemblToGeneName prev: $prevSum %s this: $thisSum\n",
          $prevSum eq $thisSum ? "==" : "!=";
 
       $thisSum = `sort $thisSource | sum`;
       chomp $thisSum;
       $prevSum = `sort $prevSource | sum`;
       chomp $prevSum;
       $identicalToPrevious = 0 if ($prevSum ne $thisSum);
       printf STDERR "# ensemblSource prev: $prevSum %s this: $thisSum\n",
          $prevSum eq $thisSum ? "==" : "!=";
 
       if (1 == $identicalToPrevious) {
 	print STDERR "previous genes same as new genes";
       }
   } else {
     $identicalToPrevious = 0;
   }
 
 $identicalToPrevious = 0;
 
   if ($identicalToPrevious ) {
       $bossScript->add(<<_EOF_
 hgsql -e 'INSERT INTO trackVersion \\
     (db, name, who, version, updateTime, comment, source, dateReference) \\
     VALUES("$db", "ensGene", "$ENV{'USER'}", "$ensVersion", now(), \\
 	"identical to previous version $previousEnsVersion", \\
 	"identical to previous version $previousEnsVersion", \\
 	"$ensVersionDateReference" );' hgFixed
 _EOF_
 	  );
   } else {
       my $skipInv = "";
       $skipInv = "-skipInvalid" if (defined $skipInvalid);
 
       if ($opt_vegaGene) {
       $bossScript->add(<<_EOF_
 hgLoadGenePred $skipInv -genePredExt $db \\
     vegaGene process/not.vegaPseudo.gp.gz >& load.not.pseudo.errors.txt
 hgLoadGenePred $skipInv -genePredExt $db \\
     vegaPseudoGene process/vegaPseudo.gp.gz >& load.pseudo.errors.txt
 hgsql -N -e "select name from vegaPseudoGene;" $db > pseudo.name
 hgsql -N -e "select name from vegaGene;" $db > not.pseudo.name
 sort -u pseudo.name not.pseudo.name > vegaGene.name
 sed -e "s/20/40/; s/19/39/" $ENV{'HOME'}/kent/src/hg/lib/ensGtp.sql \\
     > vegaGtp.sql
 hgLoadSqlTab $db vegaGtp vegaGtp.sql process/ensGtp.tab
 zcat download/$ensPepFile \\
 	| sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz
 zcat vegaPep.txt.gz \\
     | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \\
 	 | sed -e '/^\$/d; s/\*\$//' | sort > vegaPepAll.$db.fa.tab
 join vegaPepAll.$db.fa.tab vegaGene.name | sed -e "s/ /\\t/" \\
     > vegaPep.$db.fa.tab
 hgPepPred $db tab vegaPep vegaPep.$db.fa.tab
 # verify names in vegaGene is a superset of names in vegaPep
 hgsql -N -e "select name from vegaPep;" $db | sort > vegaPep.name
 set geneCount = `cat vegaGene.name | wc -l`
 set pepCount = `cat vegaPep.name | wc -l`
 set commonCount = `comm -12 vegaPep.name vegaGene.name | wc -l`
 set percentId = \\
     `echo \$commonCount \$pepCount | awk '{printf "%d", 100.0*\$1/\$2}'`
 echo "gene count: \$geneCount, peptide count: \$pepCount, common name count: \$commonCount"
 echo "percentId: \$percentId"
 if (! (\$percentId > 95)) then
     echo "ERROR: percent coverage of peptides to genes: \$percentId"
     echo "ERROR: should be greater than 95"
     exit 255
 endif
 _EOF_
 	  );
       } elsif (defined $geneScaffolds) {
 	  $bossScript->add(<<_EOF_
 hgLoadGenePred $skipInv -genePredExt $db ensGene process/$db.allGenes.gp.gz \\
 	>& loadGenePred.errors.txt
 checkTableCoords $db -table=ensGene
 zcat process/ensemblGeneScaffolds.$db.bed.gz | sort > to.clean.GeneScaffolds.bed
 cut -f1 /hive/data/genomes/$db/chrom.sizes | sort > legitimate.names
 join -t'\t' legitimate.names to.clean.GeneScaffolds.bed \\
     | sed -e "s/GeneScaffold/GS/" | hgLoadBed $db ensemblGeneScaffold stdin
 checkTableCoords $db -table=ensemblGeneScaffold
 _EOF_
 	  );
       } else {
       $bossScript->add(<<_EOF_
 hgLoadGenePred $skipInv -genePredExt $db \\
     ensGene process/$db.allGenes.gp.gz >& loadGenePred.errors.txt
 _EOF_
 	  );
       }
 
       if (! $opt_vegaGene) {
       $bossScript->add(<<_EOF_
 zcat download/$ensPepFile \\
 	| sed -e 's/^>.* transcript:/>/; s/ CCDS.*\$//; s/ .*\$//' | gzip > ensPep.txt.gz
 zcat ensPep.txt.gz \\
     | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \\
 	 | sed -e '/^\$/d; s/\*\$//' | sort > ensPep.$db.fa.tab
 hgPepPred $db tab ensPep ensPep.$db.fa.tab
 hgLoadSqlTab $db ensGtp ~/kent/src/hg/lib/ensGtp.sql process/ensGtp.tab
 hgLoadSqlTab $db ensemblToGeneName process/ensemblToGeneName.sql process/ensemblToGeneName.tab
 hgLoadSqlTab $db ensemblSource process/ensemblSource.sql process/ensemblSource.tab
 # verify names in ensGene is a superset of names in ensPep
 hgsql -N -e "select name from ensPep;" $db | sort > ensPep.name
 hgsql -N -e "select name from ensGene;" $db | sort > ensGene.name
 set geneCount = `cat ensGene.name | wc -l`
 set pepCount = `cat ensPep.name | wc -l`
 set commonCount = `comm -12 ensPep.name ensGene.name | wc -l`
 set percentId = \\
     `echo \$commonCount \$pepCount | awk '{printf "%d", 100.0*\$1/\$2}'`
 echo "gene count: \$geneCount, peptide count: \$pepCount, common name count: \$commonCount"
 echo "percentId: \$percentId"
 if (! (\$percentId > 95)) then
     echo "ERROR: percent coverage of peptides to genes: \$percentId"
     echo "ERROR: should be greater than 95"
     exit 255
 endif
 _EOF_
       );
       }
       if (! $opt_vegaGene && defined $knownToEnsembl) {
 	  $bossScript->add(<<_EOF_
 hgMapToGene $db ensGene knownGene knownToEnsembl
 _EOF_
 	  );
       }
       if ($opt_vegaGene) {
       $bossScript->add(<<_EOF_
 hgsql -e 'INSERT INTO trackVersion \\
     (db, name, who, version, updateTime, comment, source, dateReference) \\
     VALUES("$db", "vegaGene", "$ENV{'USER'}", "$ensVersion", now(), \\
 	"with peptides $ensPepFile", \\
 	"$ensGtfUrl", \\
 	"$ensVersionDateReference" );' hgFixed
 _EOF_
       );
       } else {
       $bossScript->add(<<_EOF_
 hgsql -e 'INSERT INTO trackVersion \\
     (db, name, who, version, updateTime, comment, source, dateReference) \\
     VALUES("$db", "ensGene", "$ENV{'USER'}", "$ensVersion", now(), \\
 	"with peptides $ensPepFile", \\
 	"$ensGtfUrl", \\
 	"$ensVersionDateReference" );' hgFixed
 _EOF_
       );
       }
   }
   $bossScript->execute() if (! $opt_debug);
 } # doLoad
 
 #########################################################################
 # * step: process [dbHost]
 sub doProcess {
   my $runDir = "$buildDir/process";
   # First, make sure we're starting clean.
   if (-d "$runDir" && ! $opt_debug) {
     die "ERROR: process: looks like this was run successfully already\n" .
       "($runDir exists)\nEither run with -continue=load or some later\n" .
 	"stage, or move aside/remove\n$runDir\nand run again.\n";
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "process downloaded data from Ensembl FTP site into locally usable data.";
   my $lifting = 0;
   my $bossScript;
   if (defined $liftRandoms) {
       $lifting = 1;
   }
   $bossScript = new HgRemoteScript("$runDir/doProcess.csh", $dbHost,
 				  $runDir, $whatItDoes);
   #  if lifting, create the lift file
   if ($lifting) {
       $bossScript->add(<<_EOF_
 rm -f randoms.$db.lift
 foreach C (`cut -f1 /hive/data/genomes/$db/chrom.sizes | grep random`)
    set size = `grep \$C /hive/data/genomes/$db/chrom.sizes | cut -f2`
    hgsql -N -e \\
 "select chromStart,contig,size,chrom,\$size from ctgPos where chrom='\$C';" \\
 	$db  | awk '{gsub("\\\\.[0-9]+", "", \$2); print }' >> randoms.$db.lift
 end
 _EOF_
       );
   }
   #  somg
   $bossScript->add(<<_EOF_
 zcat ../download/$ensGtfFile \\
 _EOF_
   );
   #  translate Ensemble chrom names to UCSC chrom name
   if (defined $nameTranslation) {
   $bossScript->add(<<_EOF_
 	| sed -e $nameTranslation \\
 _EOF_
   );
   }
   # lift down haplotypes if necessary
   if (defined $haplotypeLift) {
       $bossScript->add(<<_EOF_
 	| liftUp -type=.gtf stdout $haplotypeLift carry stdin \\
 _EOF_
       );
   }
   #  lift randoms if necessary
   if ($lifting) {
       $bossScript->add(<<_EOF_
 	| liftUp -type=.gtf stdout randoms.$db.lift carry stdin \\
 _EOF_
       );
   }
   #  result is protein coding set
   $bossScript->add(<<_EOF_
 	| gzip > allGenes.gtf.gz
 _EOF_
   );
   $bossScript->add(<<_EOF_
 gtfToGenePred -includeVersion -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \\
     | gzip > $db.allGenes.gp.gz
 $Bin/extractGtf.pl infoOut.txt > ensGtp.tab
 $Bin/ensemblInfo.pl infoOut.txt > ensemblToGeneName.tab
 $Bin/extractSource.pl allGenes.gtf.gz | sort -u > ensemblSource.tab
 set NL = `awk 'BEGIN{max=0}{if (length(\$1) > max) max=length(\$1)}END{print max}' ensemblToGeneName.tab`
 set VL = `awk 'BEGIN{max=0}{if (length(\$2) > max) max=length(\$2)}END{print max}' ensemblToGeneName.tab`
 sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name(\$NL)/; s/value(12)/value(\$VL)/" \\
             $ENV{'HOME'}/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql
 sed -e "s/name(15)/name(\$NL)/" \\
 	$ENV{'HOME'}/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql
 _EOF_
   );
   if ($opt_vegaGene) {
   $bossScript->add(<<_EOF_
 zcat allGenes.gtf.gz | grep -i pseudo \\
   | sed -e 's/gene_id/other_gene_id/; s/gene_name/gene_id/' > vegaPseudo.gtf
 zcat allGenes.gtf.gz | grep -v -i pseudo \\
   | sed -e 's/gene_id/other_gene_id/; s/gene_name/gene_id/' > not.vegaPseudo.gtf
 gtfToGenePred -includeVersion -genePredExt vegaPseudo.gtf vegaPseudo.gp
 gtfToGenePred -includeVersion -genePredExt not.vegaPseudo.gtf not.vegaPseudo.gp
 gzip vegaPseudo.gp not.vegaPseudo.gp
 _EOF_
   );
   }
   if (defined $geneScaffolds) {
       $bossScript->add(<<_EOF_
 mv $db.allGenes.gp.gz $db.allGenes.beforeLift.gp.gz
 $Bin/ensGeneScaffolds.pl ../download/seq_region.txt.gz \\
 	../download/assembly.txt.gz | gzip > $db.ensGene.lft.gz
 liftAcross -warn -bedOut=ensemblGeneScaffolds.$db.bed $db.ensGene.lft.gz \\
 	$db.allGenes.beforeLift.gp.gz $db.allGenes.gp >& liftAcross.err.out
 gzip ensemblGeneScaffolds.$db.bed $db.allGenes.gp liftAcross.err.out
 _EOF_
       );
   }
   if (defined $liftMtOver) {
       $bossScript->add(<<_EOF_
 cp $db.allGenes.gp.gz $db.allGenes.beforeLiftMtOver.gp.gz
 zcat $db.allGenes.gp.gz > all.gp
 grep chrM all.gp | liftOver -genePred stdin $liftMtOver chrMLifted.gp noMap.chrM
 grep -v chrM all.gp | cat - chrMLifted.gp > allLifted.gp
 gzip -c allLifted.gp > $db.allGenes.gp.gz
 rm *.gp
 _EOF_
       );
     }
   if (defined $liftUp) {
       $bossScript->add(<<_EOF_
 mv $db.allGenes.gp.gz $db.allGenes.beforeLiftUp.gp.gz
 liftUp -extGenePred -type=.gp $db.allGenes.gp \\
     $liftUp carry \\
     $db.allGenes.beforeLiftUp.gp.gz
 gzip $db.allGenes.gp
 _EOF_
       );
       if (defined $geneScaffolds) {
       $bossScript->add(<<_EOF_
 mv ensemblGeneScaffolds.$db.bed.gz ensemblGeneScaffolds.$db.beforeLiftUp.bed.gz
 liftUp -type=.bed ensemblGeneScaffolds.$db.bed \\
     $liftUp carry \\
     ensemblGeneScaffolds.$db.beforeLiftUp.bed.gz
 gzip ensemblGeneScaffolds.$db.bed
 _EOF_
 	  );
       }
   }
   # if all of these are supposed to be valid, they should be able to
   #	pass genePredCheck right now
   if (! defined $skipInvalid) {
       if ($opt_vegaGene) {
       $bossScript->add(<<_EOF_
 genePredCheck -db=$db vegaPseudo.gp.gz
 genePredCheck -db=$db not.vegaPseudo.gp.gz
 _EOF_
 	  );
       }
       if (-s "$chromSizes") {
       $bossScript->add(<<_EOF_
 genePredCheck -chromSizes=$chromSizes $db.allGenes.gp.gz
 _EOF_
          );
       } else {
       $bossScript->add(<<_EOF_
 genePredCheck -db=$db $db.allGenes.gp.gz
 _EOF_
          );
       }
       $bossScript->add(<<_EOF_
 # construct bigBed and index for assembly hub
 mkdir -p bbi
 genePredToBed $db.allGenes.gp.gz stdout | sort -k1,1 -k2,2n > $db.ensGene.bed
 _EOF_
 	  );
       if (-s "$chromSizes") {
       $bossScript->add(<<_EOF_
 bedToBigBed -extraIndex=name $db.ensGene.bed $chromSizes bbi/$db.ensGene.bb
 _EOF_
 	  );
       } else {
       $bossScript->add(<<_EOF_
 bedToBigBed -extraIndex=name $db.ensGene.bed ../../../chrom.sizes bbi/$db.ensGene.bb
 _EOF_
 	  );
       }
       $bossScript->add(<<_EOF_
-grep -v "^#" infoOut.txt | awk '{printf "%s\\t%s,%s,%s,%s,%s\\n", \$1,\$2,\$3,\$8,\$9,\$10}' > $db.ensGene.nameIndex.txt
-ixIxx $db.ensGene.nameIndex.txt $db.ensGene.name.ix $db.ensGene.name.ixx
+grep -v "^#" infoOut.txt \\
+  | awk -F\$'\\t' '{printf "%s\\t%s,%s,%s,%s\\n", \$1,\$2,\$8,\$9,\$10}' \\
+    | sed -e 's/,,/,/g; s/,\$//;' > $db.ensGene.nameIndex.txt
+ixIxx $db.ensGene.nameIndex.txt $db.ensGene.ix $db.ensGene.ixx
 
 _EOF_
 	  );
       }
   $bossScript->execute() if (! $opt_debug);
 } # doProcess
 
 #########################################################################
 # * step: download [dbHost]
 sub doDownload {
   my $runDir = "$buildDir/download";
   # check if been already done
   if (-s "$runDir/$ensGtfFile" && -s "$runDir/$ensPepFile" ) {
      &HgAutomate::verbose(1,
          "# step download is already completed, continuing...\n");
      return;
   }
   # If not already done, then it should be clean.
   if (-d "$runDir" && ! $opt_debug) {
     die "ERROR: download: looks like this was attempted unsuccessfully" .
       " before.\n($runDir exists, download files do not)\n";
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "download data from Ensembl FTP site.";
   my $bossScript = new HgRemoteScript("$runDir/doDownload.csh", $dbHost,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 wget --tries=2 --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
 $ensGtfUrl \\
 -O $ensGtfFile
 wget --tries=2 --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
 $ensPepUrl \\
 -O $ensPepFile
 _EOF_
   );
   if (defined $geneScaffolds) {
       $bossScript->add(<<_EOF_
 wget --tries=2 --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
 $ensMySqlUrl/seq_region.txt.gz \\
 -O seq_region.txt.gz
 wget --tries=2 --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
 $ensMySqlUrl/assembly.txt.gz \\
 -O assembly.txt.gz
 _EOF_
       );
   }
   $bossScript->execute() if (! $opt_debug);
 } # doDownload
 
 #########################################################################
 # * step: cleanup [dbHost]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $dbHost,
 				      $runDir, $whatItDoes);
   if ($opt_vegaGene) {
     $bossScript->add(<<_EOF_
 rm -f pseudo.name not.pseudo.name vegaGene.name vegaPepAll.$db.fa.tab vegaPep.name
 gzip vegaPep.$db.fa.tab
 _EOF_
     );
   } else {
     $bossScript->add(<<_EOF_
 rm -f bed.tab ensPep.txt.gz ensPep.$db.fa.tab ensPep.name ensGene.name
 _EOF_
     );
   }
   $bossScript->execute() if (! $opt_debug);
 } # doCleanup
 
 #########################################################################
 # * step: makeDoc [dbHost]
 sub doMakeDoc {
   my $runDir = "$buildDir";
   my $whatItDoes = "Display the make doc text to stdout.";
 
   my $updateTime = `hgsql -N -e 'select updateTime from trackVersion where db = "$db" order by updateTime DESC limit 1;' hgFixed`;
   chomp $updateTime;
   $updateTime =~ s/ .*//;	#	removes time
   my $organism = `hgsql -N -e 'select organism from dbDb where name = "$db";' hgcentraltest`;
   chomp $organism;
   if (length($organism) < 1) {
      if ( -s "$HgAutomate::clusterData/$db/species.name.txt" ) {
         $organism = `cat $HgAutomate::clusterData/$db/species.name.txt`;
         chomp $organism;
      } else {
         $organism = "species name not found";
      }
   }
 
   my $vegaOpt = "";
   my $trackName = "Ensembl";
   my $tableName = "ensGene";
   my $workDir = "ensGene";
   $tableName = "vegaGene" if ($opt_vegaGene);
   $trackName = "Vega" if ($opt_vegaGene);
   $vegaOpt = "-vegaGene" if ($opt_vegaGene);
   $workDir = "vega" if ($opt_vegaGene);
   if ($opt_vegaGene) {
     print <<_EOF_
 ############################################################################
 #  $db - $organism - $trackName Genes version $ensVersion  (DONE - $updateTime - $ENV{'USER'})
     ssh $dbHost
     cd /hive/data/genomes/$db
 _EOF_
   ;
   } else {
     print <<_EOF_
 ############################################################################
 #  $db - $organism - $trackName Genes version $ensVersion  (DONE - $updateTime - $ENV{'USER'})
     ssh $dbHost
     cd /hive/data/genomes/$db
     cat << '_EOF_' > $db.$tableName.ra
 _EOF_
   ;
     print `cat $db.ensGene.ra`;
     print "'_EOF_'\n";
     print "#  << happy emacs\n\n";
   }
   print "    doEnsGeneUpdate.pl ${vegaOpt} -ensVersion=$ensVersion $db.ensGene.ra\n";
   print "    ssh hgwdev\n";
   print "    cd /hive/data/genomes/$db/bed/$workDir.$ensVersion\n";
   print "    featureBits $db $tableName\n";
   print "    # ";
   print `featureBits $db $tableName`;
   if ($opt_vegaGene) {
     print "    featureBits $db vegaPseudoGene\n";
     print "    # ";
     print `featureBits $db vegaPseudoGene`;
   }
   print "############################################################################\n";
 
 } # doMakeDoc
 
 sub requireVar {
   # Ensure that var is in %config and return its value.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var}
     || die "ERROR: requireVar: $CONFIG is missing required variable \"$var\".\n" .
       "For a detailed list of required variables, run \"$base -help\".\n";
   delete $config->{$var};
   return $val;
 } # requireVar
 
 sub optionalVar {
   # If var has a value in %config, return it.
   # Remove it from %config so we can check for unrecognized contents.
   my ($var, $config) = @_;
   my $val = $config->{$var};
   delete $config->{$var} if ($val);
   return $val;
 } # optionalVar
 
 sub parseConfig {
   # Parse config.ra file, make sure it contains the required variables.
   my ($configFile) = @_;
   my %config = ();
   my $fh = &HgAutomate::mustOpen($configFile);
   while (<$fh>) {
     next if (/^\s*#/ || /^\s*$/);
     if (/^\s*(\w+)\s*(.*)$/) {
       my ($var, $val) = ($1, $2);
       if (! exists $config{$var}) {
 	$config{$var} = $val;
       } else {
 	die "ERROR: parseConfig: Duplicate definition for $var line $. of config file $configFile.\n";
       }
     } else {
       die "ERROR: parseConfig: Can't parse line $. of config file $configFile:\n$_\n";
     }
   }
   close($fh);
 
   # Required variables.
   $db = &requireVar('db', \%config);
   $species = &HgAutomate::getSpecies($dbHost, $db);
   &HgAutomate::verbose(1,
 	"\n db: $db, species: '$species'\n");
 
   # Optional variables.
   $liftRandoms = &optionalVar('liftRandoms', \%config);
   $nameTranslation = &optionalVar('nameTranslation', \%config);
   $geneScaffolds = &optionalVar('geneScaffolds', \%config);
   $knownToEnsembl = &optionalVar('knownToEnsembl', \%config);
   $skipInvalid = &optionalVar('skipInvalid', \%config);
   $haplotypeLift = &optionalVar('haplotypeLift', \%config);
   $liftUp = &optionalVar('liftUp', \%config);
   $liftMtOver = &optionalVar('liftMtOver', \%config);
   #	verify they actually do say yes
   if (defined $liftRandoms && $liftRandoms !~ m/^yes$/i) {
 	undef $liftRandoms;
   }
   if (defined $geneScaffolds && $geneScaffolds !~ m/^yes$/i) {
 	undef $geneScaffolds;
   }
   if (defined $knownToEnsembl && $knownToEnsembl !~ m/^yes$/i) {
 	undef $knownToEnsembl;
   }
   if (defined $skipInvalid && $skipInvalid !~ m/^yes$/i) {
 	undef $skipInvalid;
   }
   if (defined $haplotypeLift) {
     if (! -s $haplotypeLift) {
         print STDERR "ERROR: config file specifies a haplotypeLift file:\n";
 	die "$haplotypeLift can not be found.\n";
     }
   }
   if (defined $liftUp) {
     if (! -s $liftUp) {
         print STDERR "ERROR: config file specifies a liftUp file:\n";
 	die "$liftUp can not be found.\n";
     }
   }
   if (defined $liftMtOver) {
     if (! -s $liftMtOver) {
         print STDERR "ERROR: config file specifies a liftMtOver file:\n";
         die "$liftMtOver can not be found.\n";
     }
   } 
 
   # Make sure no unrecognized variables were given.
   my @stragglers = sort keys %config;
   if (scalar(@stragglers) > 0) {
     die "ERROR: parseConfig: config file $CONFIG has unrecognized variables:\n"
       . "    " . join(", ", @stragglers) . "\n" .
       "For a detailed list of supported variables, run \"$base -help\".\n";
   }
   $topDir = "/hive/data/genomes/$db";
 } # parseConfig
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 
 if (!defined $ENV{'USER'}) {
     print STDERR "ERROR: your shell environment does not define a USER";
     print STDERR "The USER identity is required for history";
     exit 255;
 }
 
 if (!defined $opt_ensVersion) {
     print STDERR "ERROR: must specify -ensVersion=NN on command line\n";
     &usage(1);
 }
 
 $ensVersion = $opt_ensVersion;
 
 if (! $opt_vegaGene) {
   if ($versionListString !~ m/$ensVersion/) {
     print STDERR "ERROR: do not recognize given -ensVersion=$ensVersion\n";
     die "must be one of: $versionListString\n";
   }
 }
 
 ($CONFIG) = @ARGV;
 &parseConfig($CONFIG);
 $bedDir = "$topDir/$HgAutomate::trackBuild";
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 # $opt_verbose = 3 if ($opt_verbose < 3);
 printf STDERR "# running debug mode, will not execute scripts\n" if ($opt_debug);
 
 # Establish previous version
 $previousEnsVersion = `hgsql -Ne 'select max(version) from trackVersion where name="ensGene" AND db="$db" AND version<$ensVersion;' hgFixed`;
 chomp $previousEnsVersion;
 if ( $previousEnsVersion eq 'NULL') { $previousEnsVersion=0;}
 
 $chromSizes = $opt_chromSizes ? $opt_chromSizes: "";
 
 # Establish what directory we will work in, tack on the ensembl version ID.
 if ($opt_vegaGene) {
   if ($db !~ m/^mm|^hg|^danRer/) {
     printf STDERR "ERROR: this is database: $db\n";
     die "vegaGene build only good on human, mouse or zebrafish\n";
   }
   $buildDir = $opt_buildDir ? $opt_buildDir :
     "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/vega.$ensVersion";
   $previousBuildDir =
     "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/vega.$previousEnsVersion";
 } else {
   $buildDir = $opt_buildDir ? $opt_buildDir :
     "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ensGene.$ensVersion";
   $previousBuildDir =
     "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ensGene.$previousEnsVersion";
 }
 
 if ($opt_vegaGene) {
     $vegaSpecies = "mouse" if ($db =~ m/^mm/);
     $vegaSpecies = "human" if ($db =~ m/^hg/);
     $vegaSpecies = "danio" if ($db =~ m/^danRer/);
     $vegaPep = "Mus_musculus.VEGA$ensVersion" if ($db =~ m/^mm/);
     $vegaPep = "Homo_sapiens.VEGA$ensVersion" if ($db =~ m/^hg/);
     $vegaPep = "Danio_rerio.VEGA$ensVersion" if ($db =~ m/^danRer/);
     $ensGtfUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/gtf_file.gz";
     $ensGtfUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/gtf_human_rel$ensVersion.gz" if ($db =~ m/^hg/);;
     $ensPepUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/pep/$vegaPep.$ensVersion.pep.all.fa.gz";
     $ensMySqlUrl = "";
     $ensVersionDateReference = `date "+%Y-%m-%d"`;
     chomp $ensVersionDateReference;
 } else {
   ($ensGtfUrl, $ensPepUrl, $ensMySqlUrl, $ensVersionDateReference) =
     &EnsGeneAutomate::ensGeneVersioning($db, $ensVersion );
 }
 
 die "ERROR: download: can not find Ensembl version $ensVersion FTP URL for UCSC database $db"
     if (!defined $ensGtfUrl);
 
 $ensGtfFile = basename($ensGtfUrl);
 $ensPepFile = basename($ensPepUrl);
 &HgAutomate::verbose(2,"Ensembl GTF URL: $ensGtfUrl\n");
 &HgAutomate::verbose(2,"Ensembl GTF File: $ensGtfFile\n");
 &HgAutomate::verbose(2,"Ensembl PEP URL: $ensPepUrl\n");
 &HgAutomate::verbose(2,"Ensembl PEP File: $ensPepFile\n");
 &HgAutomate::verbose(2,"Ensembl MySql URL: $ensMySqlUrl\n");
 &HgAutomate::verbose(2,"Ensembl Date Reference: $ensVersionDateReference\n");
 
 # Do everything.
 $stepper->execute();
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 &HgAutomate::verbose(1,
 	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 &HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 &HgAutomate::verbose(1, "\n");