bf57fe2786a732c3e49ae6453aed45b8062b6ed6
hiram
  Thu Aug 1 14:57:57 2019 -0700
add twoBitDup check and construct liftOver files and add doNcbiGene refs #23734

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 514dee5..5b64ed1 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -35,30 +35,31 @@
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
       { name => 'repeatMasker',   func => \&doRepeatMasker },
       { name => 'simpleRepeat',   func => \&doSimpleRepeat },
       { name => 'allGaps',   func => \&doAllGaps },
       { name => 'idKeys',   func => \&doIdKeys },
       { name => 'windowMasker',   func => \&doWindowMasker },
       { name => 'addMask',   func => \&doAddMask },
       { name => 'gapOverlap',   func => \&doGapOverlap },
       { name => 'tandemDups',   func => \&doTandemDups },
       { name => 'cpgIslands',   func => \&doCpgIslands },
+      { name => 'ncbiGene',   func => \&doNcbiGene },
       { name => 'augustus',   func => \&doAugustus },
       { name => 'trackDb',   func => \&doTrackDb },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $augustusSpecies = "human";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
@@ -277,38 +278,41 @@
 }	# sub compositeFasta($$$)
 
 #########################################################################
 # process NCBI unlocalized AGP file, perhaps translate into UCSC naming scheme
 #   the agpNames result file is a naming correspondence file for later use
 sub unlocalizedAgp($$$$) {
   my ($chr2acc, $agpSource, $agpOutput, $agpNames) = @_;
 
   my %accToChr;
   readChr2Acc($chr2acc, \%accToChr);
   if ($ucscNames) {
     foreach my $acc (keys %accToChr) {
       my $ucscAcc = $acc;
       $ucscAcc =~ s/\./v/;
       $accToChr{$ucscAcc} = $accToChr{$acc};
-      $accToChr{$acc} = undef;
+      delete $accToChr{$acc};
     }
   }
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
+  my %chrNDone;	# key is chrom name, value is 1 when done
   foreach my $acc (keys %accToChr) {
     my $chrN = $accToChr{$acc};
+    next if (exists($chrNDone{$chrN}));
+    $chrNDone{$chrN} = 1;
     my $agpFile =  "$agpSource/chr$chrN.unlocalized.scaf.agp.gz";
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     while (my $line = <FH>) {
       if ($line !~ m/^#/) {
          chomp $line;
          my (@a) = split('\t', $line);
          my $ncbiAcc = $a[0];
          my $acc = $ncbiAcc;
          $acc =~ s/\./v/ if ($ucscNames);
          die "ERROR: chrN $chrN not correct for $acc"
              if ($accToChr{$acc} ne $chrN);
          my $ucscName = "${acc}";
          $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames);
          printf NAMES "%s\t%s\n", $ucscName, $ncbiAcc;
          printf AGP "%s", $ucscName;    # begin AGP line with accession name
@@ -584,50 +588,61 @@
 
 ### XXX TO BE DONE - construct sequence when no AGP files exist
 
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 
 
 if [ -s ../\$asmId.chrom.sizes ]; then
   printf "sequence step previously completed\\n" 1>&2
   exit 0
 fi
 
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
+twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt
+(grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt
+if [ -s "\$asmId.dups.txt" ]; then
+  printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\n" 1>&2
+  grep "are identical" \$asmId.dupCheck.txt 1>&2
+  exit 255
+else
+  rm -f \$asmId.dups.txt
+fi
 touch -r ../download/\$asmId.2bit ../\$asmId.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz
 twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes
 touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes
 # verify everything is there
 twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes
 export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
   printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2
   printf "# \$newTotal != \$oldTotal\n" 1>&2
   exit 255
 fi
 rm source.\$asmId.chrom.sizes
 export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1`
 if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then
   printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2
   exit 255
 fi
+join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift
+join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift
 
 twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz
 touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz
 zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt
 touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt
 _EOF_
   );
   $bossScript->execute();
 } # doSequence
 
 #########################################################################
 # * step: assemblyGap [workhorse]
 sub doAssemblyGap {
   my $runDir = "$buildDir/trackData/assemblyGap";
   &HgAutomate::mustMkdir($runDir);
@@ -793,31 +808,33 @@
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then
 export species=`echo $species | sed -e 's/_/ /g;'`
 
 doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\
   -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
 
 gzip \$asmId.sorted.fa.out \$asmId.fa.out \$asmId.nestedRepeats.bed
 
 doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\
   -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
 
 \$HOME/kent/src/hg/utils/automation/asmHubRepeatMasker.sh \$asmId `pwd`/\$asmId.sorted.fa.out.gz `pwd`
-
+else
+  printf "# repeatMasker step previously completed\\n" 1>&2
+  exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # repeatMasker
 
 #########################################################################
 # * step: simpleRepeat [workhorse]
 sub doSimpleRepeat {
   my $runDir = "$buildDir/trackData/simpleRepeat";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct TRF/simpleRepeat track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doSimpleRepeat.bash",
                     $workhorse, $runDir, $whatItDoes);
@@ -1150,57 +1167,128 @@
   doCpgIslands.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` \\
     -dbHost=$dbHost \\
     -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -workhorse=$workhorse \\
     -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \\
     -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId
 else
   printf "# cpgIslands masked previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # sub doCpgIslands
 
 #########################################################################
+# * step: ncbiGene [workhorse]
+sub doNcbiGene {
+  my $gffFile = "$sourceDir/${asmId}_genomic.gff.gz";
+  if ( ! -s "${gffFile}" ) {
+    printf STDERR "# step ncbiGene: no gff file found at:\n#  %s\n", $gffFile;
+    return;
+  }
+  if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) {
+    printf STDERR "# ERROR: ncbiGene: can not find ../../sequence/$asmId.ncbiToUcsc.lift\n";
+    exit 255;
+  }
+  my $runDir = "$buildDir/trackData/ncbiGene";
+
+  &HgAutomate::mustMkdir($runDir);
+
+  my $whatItDoes = "translate NCBI GFF3 gene definitions into a track";
+  my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash",
+                    $workhorse, $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+export asmId=$asmId
+export gffFile=$gffFile
+
+function cleanUp() {
+  rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred
+  rm -f \$asmId.geneAttrs.ncbi.txt
+}
+
+if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then
+  (gff3ToGenePred -warnAndContinue -useName \\
+    -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\
+      2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\
+        | gzip -c > \$asmId.ncbiGene.genePred.gz
+  genePredCheck \$asmId.ncbiGene.genePred.gz
+  export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'`
+  if [ "\${howMany}" -eq 0 ]; then
+     printf "# ncbiGene: no gene definitions found in \$gffFile\n";
+     cleanUp
+     exit 0
+  fi
+  liftUp -extGenePred -type=.gp stdout \\
+      ../../sequence/\$asmId.ncbiToUcsc.lift error \\
+       \$asmId.ncbiGene.genePred.gz | gzip -c \\
+          > \$asmId.ncbiGene.ucsc.genePred.gz
+  ~/kent/src/hg/utils/automation/gpToIx.pl \$asmId.ncbiGene.ucsc.genePred.gz \\
+    | sort -u > \$asmId.ncbiGene.ix.txt
+  ixIxx \$asmId.ncbiGene.ix.txt \$asmId.ncbiGene.ix \$asmId.ncbiGene.ixx
+  rm -f \$asmId.ncbiGene.ix.txt
+  genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\
+      | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed
+  (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\
+      -extraIndex=name \$asmId.ncbiGene.bed \\
+        ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true)
+  if [ ! -s "\$asmId.ncbiGene.bb" ]; then
+    printf "# ncbiGene: failing bedToBigBed\\n" 1>&2
+    exit 255
+  fi
+  touch -r\$gffFile \$asmId.ncbiGene.bb
+  bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\
+    | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt
+  LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo
+else
+  printf "# ncbiGene previously completed\\n" 1>&2
+fi
+_EOF_
+  );
+  $bossScript->execute();
+} # doNcbiGene
+
+
+#########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
   my $runDir = "$buildDir/trackData/augustus";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run Augustus gene prediction procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then
   time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1
   time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1
 else
   printf "# augustus genes previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
-} # windowMasker
+} # doAugustus
 
 #########################################################################
 # * step: trackDb [workhorse]
 sub doTrackDb {
   my $runDir = "$buildDir";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct asmId.trackDb.txt file";
   my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 \$HOME/kent/src/hg/utils/automation/asmHubTrackDb.sh $genbankRefseq \$asmId $runDir \\