src/hg/utils/automation/asmHubNcbiRefSeq.pl 9374faeb653ab0653e07e6ba890d6531395ae476

9374faeb653ab0653e07e6ba890d6531395ae476
hiram
  Fri May 24 09:59:56 2024 -0700
correctly identify the source for the genes when coming from GCF onto GCA no redmine

diff --git src/hg/utils/automation/asmHubNcbiRefSeq.pl src/hg/utils/automation/asmHubNcbiRefSeq.pl
index e79b908..c254633 100755
--- src/hg/utils/automation/asmHubNcbiRefSeq.pl
+++ src/hg/utils/automation/asmHubNcbiRefSeq.pl
@@ -10,45 +10,54 @@
 my $argc = scalar(@ARGV);
 
 if ($argc != 3) {
-  printf STDERR "usage: asmHubNcbiGene.pl asmId asmId.names.tab .../trackData/\n";
+  printf STDERR "usage: asmHubNcbiRefSeq.pl asmId asmId.names.tab .../trackData/\n";
   printf STDERR "where asmId is the assembly identifier,\n";
   printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
   exit 255;
 }
 
 my $asmId = shift;
 my @parts = split('_', $asmId, 3);
 my $accession = "$parts[0]_$parts[1]";
 my $namesFile = shift;
 my $trackDataDir = shift;
 my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb";
-my $asmType = "refseq";
+my $srcGff = `ls $trackDataDir/ncbiRefSeq/download/*_genomic.gff.gz | head -1`;  # first downloaded GFF, if any
+chomp $srcGff;
+my $srcAsmId = $asmId;  # default: the annotations come from this same assembly
+my $gcfToGcaLiftedText = "";  # stays empty unless the GFF names a different assembly
+if (length($srcGff) > 10) {  # only output longer than 10 characters is treated as a found path
+  $srcAsmId = basename($srcGff);
+  $srcAsmId =~ s/_genomic\.gff\.gz$//;  # strip only the literal trailing file suffix
+  if ($srcAsmId ne $asmId) {
+    $gcfToGcaLiftedText = "RefSeq annotations from $srcAsmId were lifted to this $asmId assembly to provide these gene annotations on this corresponding assembly.";
+  }
+}
 my $asmIdPath = &AsmHub::asmIdToPath($asmId);
 my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.ncbiRefSeq.gtf.gz";
 
 if ( ! -s $ncbiRefSeqBbi ) {
   printf STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n";
   exit 255;
 }
 
-my @partNames = split('_', $asmId);
+my @partNames = split('_', $srcAsmId);  # build the FTP path from the annotation source assembly
 my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0],
    substr($partNames[1],0,3), substr($partNames[1],3,3),
-   substr($partNames[1],6,3), $asmId);
+   substr($partNames[1],6,3), $srcAsmId);
 
-$asmType = "genbank" if ($partNames[0] =~ m/GCA/);
 my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`;
 chomp $totalBases;
 my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`;
 chomp $geneStats;
 my ($itemCount, $basesCovered) = split('\s+', $geneStats);
 my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases);
 $itemCount = &AsmHub::commify($itemCount);
 $basesCovered = &AsmHub::commify($basesCovered);
 my $totalBasesCmfy = &AsmHub::commify($totalBases);
 
 my $em = "<em>";
 my $noEm = "</em>";
 my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`;
 chomp $assemblyDate;
 my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`;
@@ -164,34 +173,36 @@
    including the gene name, OMIM identifier and accession names, or turn off 
    the label completely.</li>
   <li><strong>Codon coloring:</strong> This track has an optional codon
    coloring feature that allows users to quickly validate and compare gene
    predictions. To display codon colors, select the <em>genomic codons</em>
    option from the <em>Color track by codons</em> pull-down menu. For more 
    information about this feature, go to the <a href="../goldenPath/help/hgCodonColoring.html" 
   target="_blank">Coloring Gene Predictions and Annotations by Codon</a> page.</li>
 </ul>
 
 <a name="methods"></a>
 <h2>Methods</h2>
 <p>
 The RefSeq annotation and RefSeq RNA alignment tracks
 were created at UCSC using data from the NCBI RefSeq project. GFF format
-data files were downloaded from the file <b>${asmId}_genomic.gff.gz</b>
+data files were downloaded from the file <b>${srcAsmId}_genomic.gff.gz</b>
 delivered with the NCBI RefSeq genome assemblies at the FTP location:<br>
 <a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/' target='_blank'>ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/</a>
 
+$gcfToGcaLiftedText
+
 The GFF file was converted to the
 genePred and PSL table formats for display in the Genome Browser.
 Information about the NCBI annotation pipeline can be found 
 <a href="https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/"
 target="_blank">here</a>.
 </p>
 
 <h2>Track statistics summary</h2>
 <p>
 <b>Total genome size: </b>$totalBasesCmfy <b>bases</b><br><br>
 <b>Curated and Predicted Gene count: </b>$itemCount<br>
 <b>Bases in these genes: </b>$basesCovered<br>
 <b>Percent genome coverage: </b>% $percentCoverage<br>
 </p>