src/hg/utils/automation/doNcbiRefSeq.pl be2fb5e3a2af8d3e1f73c1cc5c6bd5ff05bad8a5

be2fb5e3a2af8d3e1f73c1cc5c6bd5ff05bad8a5
hiram
  Wed Apr 8 07:52:51 2020 -0700
handle the problem of duplicate contigs and make all download files including archive refs #24547

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index 06758c7..177d402 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -231,34 +231,36 @@
 faToTwoBit \${asmId}_genomic.fna.gz \${asmId}.ncbi.2bit
 _EOF_
     );
   }
 
   my $haveLiftFile = 0;
   if ( -s "$liftFile" ) {
        $haveLiftFile = 1;
   } elsif ($dbExists) {
      # check if db.ucscToRefSeq exists?
      my $sql = "show tables like 'ucscToRefSeq';";
      my $result = `hgsql $db -BN -e "$sql"`;
      chomp $result;
      if ( $result =~ m/ucscToRefSeq/ ) {
        $haveLiftFile = 1;
+### the sort -k2,2 -u eliminates the problem of duplicate sequences existing
+### in the target $db assembly
        $bossScript->add(<<_EOF_
 cd \$runDir
 hgsql -N -e 'select 0,name,chromEnd,chrom,chromEnd from ucscToRefSeq;' \${db} \\
-     > \${asmId}To\${db}.lift
+     | sort -k2,2 -u > \${asmId}To\${db}.lift
 _EOF_
        );
     }
   }
 
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
 
   if (! $haveLiftFile ) {
        $bossScript->add(<<_EOF_
 # generate idKeys for the NCBI sequence to translate names to UCSC equivalents
 
 rm -rf \$runDir/idKeys
 mkdir -p \$runDir/idKeys
 cd \$runDir/idKeys
@@ -319,30 +321,31 @@
     $fakePslSizes = "-chromSize=\$db.chrom.sizes";
   }
 
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
 
   $bossScript->add(<<_EOF_
 # establish all variables to use here
 
 export asmId=$asmId
 export downloadDir=$downloadDir
 export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz
 export db=$db
 export gff3ToRefLink=$gff3ToRefLink
 export gbffToCds=$gbffToCds
+export dateStamp=`date "+%F"`
 
 export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'`
 if [ "\$annotationRelease" == "" ]; then
   export annotationRelease=\$asmId
 fi
 export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'`
 echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt
 
 # this produces the genePred in NCBI coordinates
 # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type):
 zcat \$ncbiGffGz \\
   | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
   | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
       -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp
 genePredCheck \$asmId.gp
@@ -359,30 +362,34 @@
 # extract labels from semi-structured text in gbff COMMENT/description sections:
 zcat \$downloadDir/\${asmId}_rna.gbff.gz \\
   | (grep ' :: ' || true) \\
     | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\
       | sort -u \\
         > pragmaLabels.txt
 
 # extract cross reference text for refLink
 \$gff3ToRefLink \$downloadDir/\$asmId.raFile.txt \$ncbiGffGz pragmaLabels.txt 2> \$db.refLink.stderr.txt \\
   | sort > \$asmId.refLink.tab
 
 # converting the NCBI coordinates to UCSC coordinates
 liftUp -extGenePred -type=.gp stdout $localLiftFile warn \$asmId.gp \\
   | gzip -c > \$asmId.\$db.gp.gz
 $genePredCheckDb \$asmId.\$db.gp.gz
+zcat \$asmId.\$db.gp.gz > ncbiRefSeq.\$dateStamp
+genePredToGtf -utr file ncbiRefSeq.\$dateStamp stdout | gzip -c \\
+  > ../\$db.\$dateStamp.ncbiRefSeq.gtf.gz
+rm -f ncbiRefSeq.\$dateStamp
 
 # curated subset of all genes
 (zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp
 # may not be any curated genes
 if [ ! -s \$db.curated.gp ]; then
   rm -f \$db.curated.gp
 elif [ -s \$asmId.refseqSelectTranscripts.txt ]; then
   cat \$db.curated.gp | fgrep -f \$asmId.refseqSelectTranscripts.txt - \\
     > \$db.refseqSelect.curated.gp
   # may not be any refseqSelect.curated genes
   if [ ! -s \$db.refseqSelect.curated.gp ]; then
     rm -f \$db.refseqSelect.curated.gp
   fi
 fi
 
@@ -720,30 +727,38 @@
   pslCat -nohead process/\$asmId.\$db.psl.gz \\
     | grep -Fwf \$db.noRna.available.list \\
       | grep chrM > missingChrMFa.psl
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin $dbTwoBit stdout >> \$db.rna.fa
   fi
 fi
 
 mkdir -p $gbdbDir
 ln -f -s `pwd`/\$db.rna.fa $gbdbDir/seqNcbiRefSeq.rna.fa
 hgLoadSeq -drop -seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq \$db $gbdbDir/seqNcbiRefSeq.rna.fa
 
 hgLoadPsl \$db -table=ncbiRefSeqPsl process/\$asmId.\$db.psl.gz
 
+if [ -d "/usr/local/apache/htdocs-hgdownload/goldenPath/archive" ]; then
+ gtfFile=`ls \$db.*.ncbiRefSeq.gtf.gz`
+ mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq
+ rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/\$gtfFile
+ ln -s `pwd`/\$db.*.ncbiRefSeq.gtf.gz \\
+   /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/
+fi
+
 featureBits \$db ncbiRefSeq > fb.ncbiRefSeq.\$db.txt 2>&1
 cat fb.ncbiRefSeq.\$db.txt 2>&1
 _EOF_
     );
   }	#	if ($dbExists)
   $bossScript->execute();
 } # doLoad
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
 				      $runDir, $whatItDoes);