src/hg/utils/automation/doEnsGeneUpdate.pl 1.20

1.20 2010/01/15 22:25:55 hiram
adding vegaGene processing option
Index: src/hg/utils/automation/doEnsGeneUpdate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doEnsGeneUpdate.pl,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 4 -r1.19 -r1.20
--- src/hg/utils/automation/doEnsGeneUpdate.pl	27 Oct 2009 21:26:00 -0000	1.19
+++ src/hg/utils/automation/doEnsGeneUpdate.pl	15 Jan 2010 22:25:55 -0000	1.20
@@ -21,8 +21,9 @@
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_ensVersion
+    $opt_vegaGene
     $opt_buildDir
     /;
 
 
@@ -37,8 +38,10 @@
 );
 
 # Option defaults:
 my $dbHost = 'hgwdev';
+my $vegaSpecies = "human";
+my $vegaPep = "Homo_sapiens.VEGA";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 my (@versionList) = &EnsGeneAutomate::ensVersionList();
@@ -58,18 +61,17 @@
 other options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
-    -ensVersion NN        Request one of the Ensembl versions:
-                          Currently available: $versionListString
+    -vegaGene             Special processing for vegaGene instead of Ensembl
+                          works only on Human, Mouse, Zebrafish
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ensGene.<ensVersion #>
                           (necessary if experimenting with builds).
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
-						'fileServer' => '',
 						'bigClusterHub' => '',
 						'smallClusterHub' => '');
   print STDERR "
 Automates UCSC's Ensembl gene updates.  Steps:
@@ -140,12 +142,12 @@
 
 # Globals:
 my ($species, $ensGtfUrl, $ensGtfFile, $ensPepUrl, $ensPepFile,
     $ensMySqlUrl, $ensVersionDateReference, $previousEnsVersion, $buildDir,
-    $previousBuildDir);
+    $previousBuildDir, $vegaGene);
 # Command line argument:
 my $CONFIG;
-# Required command line argumen:
+# Required command line arguments:
 my ($ensVersion);
 # Required config parameters:
 my ($db);
 # Conditionally required config parameters:
@@ -158,8 +160,9 @@
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'ensVersion=s',
+		      'vegaGene',
 		      'buildDir=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
@@ -211,9 +214,45 @@
   } else {
       my $skipInv = "";
       $skipInv = "-skipInvalid" if (defined $skipInvalid);
 
-      if (defined $geneScaffolds) {
+      if ($opt_vegaGene) {
+      $bossScript->add(<<_EOF_
+hgLoadGenePred $skipInv -genePredExt $db \\
+    vegaGene process/not.vegaPseudo.gp.gz >& load.not.pseudo.errors.txt
+hgLoadGenePred $skipInv -genePredExt $db \\
+    vegaPseudoGene process/vegaPseudo.gp.gz >& load.pseudo.errors.txt
+hgsql -N -e "select name from vegaPseudoGene;" $db > pseudo.name
+hgsql -N -e "select name from vegaGene;" $db > not.pseudo.name
+sort -u pseudo.name not.pseudo.name > vegaGene.name
+sed -e "s/20/40/; s/19/39/" $ENV{'HOME'}/kent/src/hg/lib/ensGtp.sql \\
+    > vegaGtp.sql
+hgLoadSqlTab $db vegaGtp vegaGtp.sql process/ensGtp.tab
+zcat download/$ensPepFile \\
+	| sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz
+zcat vegaPep.txt.gz \\
+    | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \\
+	 | sed -e '/^\$/d; s/\*\$//' | sort > vegaPepAll.$db.fa.tab
+join vegaPepAll.$db.fa.tab vegaGene.name | sed -e "s/ /\\t/" \\
+    > vegaPep.$db.fa.tab
+hgPepPred $db tab vegaPep vegaPep.$db.fa.tab
+# verify names in vegaGene is a superset of names in vegaPep
+hgsql -N -e "select name from vegaPep;" $db | sort > vegaPep.name
+set geneCount = `cat vegaGene.name | wc -l`
+set pepCount = `cat vegaPep.name | wc -l`
+set commonCount = `comm -12 vegaPep.name vegaGene.name | wc -l`
+set percentId = \\
+    `echo \$commonCount \$pepCount | awk '{printf "%d", 100.0*\$1/\$2}'`
+echo "gene count: \$geneCount, peptide count: \$pepCount, common name count: \$commonCount"
+echo "percentId: \$percentId"
+if (! (\$percentId > 95)) then
+    echo "ERROR: percent coverage of peptides to genes: \$percentId"
+    echo "ERROR: should be greater than 95"
+    exit 255
+endif
+_EOF_
+	  );
+      } elsif (defined $geneScaffolds) {
 	  $bossScript->add(<<_EOF_
 hgLoadGenePred $skipInv -genePredExt $db ensGene process/$db.allGenes.gp.gz \\
 	>& loadGenePred.errors.txt
 zcat process/ensemblGeneScaffolds.$db.bed.gz \\
@@ -227,8 +266,9 @@
 _EOF_
 	  );
       }
 
+      if (! $opt_vegaGene) {
       $bossScript->add(<<_EOF_
 zcat download/$ensPepFile \\
 	| sed -e 's/^>.* transcript:/>/; s/ CCDS.*\$//;' | gzip > ensPep.txt.gz
 zcat ensPep.txt.gz \\
@@ -252,14 +292,25 @@
     exit 255
 endif
 _EOF_
       );
-      if (defined $knownToEnsembl) {
+      }
+      if (! $opt_vegaGene && defined $knownToEnsembl) {
 	  $bossScript->add(<<_EOF_
 hgMapToGene $db ensGene knownGene knownToEnsembl
 _EOF_
 	  );
       }
+      if ($opt_vegaGene) {
+      $bossScript->add(<<_EOF_
+hgsql -e 'INSERT INTO trackVersion \\
+    (db, name, who, version, updateTime, comment, source) \\
+    VALUES("$db", "vegaGene", "$ENV{'USER'}", "$ensVersion", now(), \\
+	"with peptides $ensPepFile", \\
+	"$ensGtfUrl" );' hgFixed
+_EOF_
+      );
+      } else {
       $bossScript->add(<<_EOF_
 hgsql -e 'INSERT INTO trackVersion \\
     (db, name, who, version, updateTime, comment, source) \\
     VALUES("$db", "ensGene", "$ENV{'USER'}", "$ensVersion", now(), \\
@@ -267,8 +318,9 @@
 	"$ensGtfUrl" );' hgFixed
 _EOF_
       );
   }
+  }
   $bossScript->execute();
 } # doLoad
 
 #########################################################################
@@ -340,8 +392,20 @@
     | gzip > $db.allGenes.gp.gz
 $Bin/extractGtf.pl infoOut.txt > ensGtp.tab
 _EOF_
   );
+  if ($opt_vegaGene) {
+  $bossScript->add(<<_EOF_
+zcat allGenes.gtf.gz | grep -i pseudo \\
+  | sed -e 's/gene_id/other_gene_id/; s/gene_name/gene_id/' > vegaPseudo.gtf
+zcat allGenes.gtf.gz | grep -v -i pseudo \\
+  | sed -e 's/gene_id/other_gene_id/; s/gene_name/gene_id/' > not.vegaPseudo.gtf
+gtfToGenePred -genePredExt vegaPseudo.gtf vegaPseudo.gp
+gtfToGenePred -genePredExt not.vegaPseudo.gtf not.vegaPseudo.gp
+gzip vegaPseudo.gp not.vegaPseudo.gp
+_EOF_
+  );
+  }
   if (defined $geneScaffolds) {
       $bossScript->add(<<_EOF_
 mv $db.allGenes.gp.gz $db.allGenes.beforeLift.gp.gz
 $Bin/ensGeneScaffolds.pl ../download/seq_region.txt.gz \\
@@ -374,8 +438,15 @@
   }
   # if all of these are supposed to be valid, they should be able to
   #	pass genePredCheck right now
   if (! defined $skipInvalid) {
+      if ($opt_vegaGene) {
+      $bossScript->add(<<_EOF_
+genePredCheck -db=$db vegaPseudo.gp.gz
+genePredCheck -db=$db not.vegaPseudo.gp.gz
+_EOF_
+	  );
+      }
       $bossScript->add(<<_EOF_
 genePredCheck -db=$db $db.allGenes.gp.gz
 _EOF_
 	  );
@@ -383,9 +454,9 @@
   $bossScript->execute();
 } # doProcess
 
 #########################################################################
-# * step: download [fileServer]
+# * step: download [dbHost]
 sub doDownload {
   my $runDir = "$buildDir/download";
   # First, make sure we're starting clean.
   if (-d "$runDir") {
@@ -395,10 +466,9 @@
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "download data from Ensembl FTP site.";
-  my $fileServer = &HgAutomate::chooseFileServer($runDir);
-  my $bossScript = new HgRemoteScript("$runDir/doDownload.csh", $fileServer,
+  my $bossScript = new HgRemoteScript("$runDir/doDownload.csh", $dbHost,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 wget --tries=2 --timestamping --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
@@ -424,14 +494,13 @@
 } # doDownload
 
 
 #########################################################################
-# * step: cleanup [fileServer]
+# * step: cleanup [dbHost]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
-  my $fileServer = &HgAutomate::chooseFileServer($runDir);
-  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
+  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $dbHost,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 rm -f bed.tab ensPep.txt.gz ensPep.$db.fa.tab ensPep.name ensGene.name
 _EOF_
@@ -439,13 +508,12 @@
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
-# * step: makeDoc [fileServer]
+# * step: makeDoc [dbHost]
 sub doMakeDoc {
   my $runDir = "$buildDir";
   my $whatItDoes = "Display the make doc text to stdout.";
-  my $fileServer = &HgAutomate::chooseFileServer($runDir);
 
   my $updateTime = `hgsql -N -e 'select updateTime from trackVersion where db = "$db" order by updateTime DESC limit 1;' hgFixed`;
   chomp $updateTime;
   $updateTime =~ s/ .*//;	#	removes time
@@ -454,9 +522,9 @@
 
   print <<_EOF_
 ############################################################################
 #  $db - $organism - Ensembl Genes version $ensVersion  (DONE - $updateTime - $ENV{'USER'})
-    ssh $fileServer
+    ssh $dbHost
     cd /hive/data/genomes/$db
     cat << '_EOF_' > $db.ensGene.ra
 _EOF_
   ;
@@ -587,11 +655,13 @@
 
 $ensVersion = $opt_ensVersion;
 $previousEnsVersion = $ensVersion - 1;
 
-if ($versionListString !~ m/$ensVersion/) {
+if (! $opt_vegaGene) {
+  if ($versionListString !~ m/$ensVersion/) {
     print STDERR "ERROR: do not recognize given -ensVersion=$ensVersion\n";
     die "must be one of: $versionListString\n";
+  }
 }
 
 ($CONFIG) = @ARGV;
 &parseConfig($CONFIG);
@@ -601,15 +671,43 @@
 # $opt_debug = 1;
 # $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in, tack on the ensembl version ID.
-$buildDir = $opt_buildDir ? $opt_buildDir :
+if ($opt_vegaGene) {
+  if ($db !~ m/^mm|^hg|^danRer/) {
+    printf STDERR "ERROR: this is database: $db\n";
+    die "vegaGene build only good on human, mouse or zebrafish\n";
+  }
+  $buildDir = $opt_buildDir ? $opt_buildDir :
+    "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/vega.$ensVersion";
+  $previousBuildDir =
+    "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/vega.$previousEnsVersion";
+} else {
+  $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ensGene.$ensVersion";
-$previousBuildDir =
+  $previousBuildDir =
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ensGene.$previousEnsVersion";
+}
 
-($ensGtfUrl, $ensPepUrl, $ensMySqlUrl, $ensVersionDateReference) =
+if ($opt_vegaGene) {
+    $vegaSpecies = "mouse" if ($db =~ m/^mm/);
+    $vegaSpecies = "human" if ($db =~ m/^hg/);
+    $vegaSpecies = "danio" if ($db =~ m/^danRer/);
+    $vegaPep = "Mus_musculus.VEGA$ensVersion" if ($db =~ m/^mm/);
+    $vegaPep = "Homo_sapiens.VEGA" if ($db =~ m/^hg/);
+    $vegaPep = "Danio_rerio.VEGA$ensVersion" if ($db =~ m/^danRer/);
+}
+
+if ($opt_vegaGene) {
+  $ensGtfUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/gtf_file.gz";
+  $ensPepUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/pep/$vegaPep.$ensVersion.pep.all.fa.gz";
+  $ensMySqlUrl = "";
+  $ensVersionDateReference = `date "+%Y-%m-%d"`;
+  chomp $ensVersionDateReference;
+} else {
+  ($ensGtfUrl, $ensPepUrl, $ensMySqlUrl, $ensVersionDateReference) =
 	&EnsGeneAutomate::ensGeneVersioning($db, $ensVersion );
+}
 
 die "ERROR: download: can not find Ensembl version $ensVersion FTP URL for UCSC database $db"
     if (!defined $ensGtfUrl);