9374faeb653ab0653e07e6ba890d6531395ae476 hiram Fri May 24 09:59:56 2024 -0700 correctly identify the source for the genes when coming from GCF onto GCA no redmine diff --git src/hg/utils/automation/asmHubNcbiRefSeq.pl src/hg/utils/automation/asmHubNcbiRefSeq.pl index e79b908..c254633 100755 --- src/hg/utils/automation/asmHubNcbiRefSeq.pl +++ src/hg/utils/automation/asmHubNcbiRefSeq.pl @@ -10,45 +10,54 @@ my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "usage: asmHubNcbiGene.pl asmId asmId.names.tab .../trackData/\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; exit 255; } my $asmId = shift; my @parts = split('_', $asmId, 3); my $accession = "$parts[0]_$parts[1]"; my $namesFile = shift; my $trackDataDir = shift; my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb"; -my $asmType = "refseq"; +my $srcGff = `ls $trackDataDir/ncbiRefSeq/download/*_genomic.gff.gz | head -1`; +chomp $srcGff; +my $srcAsmId = $asmId; +my $gcfToGcaLiftedText = ""; +if (length($srcGff) > 10) { + $srcAsmId = basename($srcGff); + $srcAsmId =~ s/_genomic.gff.gz//; + if ($srcAsmId ne $asmId) { + $gcfToGcaLiftedText = "RefSeq annotations from $srcAsmId were lifted to this $asmId assembly to provide these gene annotations on this corresponding assembly." + } +} my $asmIdPath = &AsmHub::asmIdToPath($asmId); my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.ncbiRefSeq.gtf.gz"; if ( ! -s $ncbiRefSeqBbi ) { printf STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n"; exit 255; } -my @partNames = split('_', $asmId); +my @partNames = split('_', $srcAsmId); my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $asmId); -$asmType = "genbank" if ($partNames[0] =~ m/GCA/); my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; chomp $totalBases; my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; chomp $geneStats; my ($itemCount, $basesCovered) = split('\s+', $geneStats); my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); $itemCount = &AsmHub::commify($itemCount); $basesCovered = &AsmHub::commify($basesCovered); my $totalBasesCmfy = &AsmHub::commify($totalBases); my $em = "<em>"; my $noEm = "</em>"; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; @@ -164,34 +173,36 @@ including the gene name, OMIM identifier and accession names, or turn off the label completely.</li> <li><strong>Codon coloring:</strong> This track has an optional codon coloring feature that allows users to quickly validate and compare gene predictions. To display codon colors, select the <em>genomic codons</em> option from the <em>Color track by codons</em> pull-down menu. For more information about this feature, go to the <a href="../goldenPath/help/hgCodonColoring.html" target="_blank">Coloring Gene Predictions and Annotations by Codon</a> page.</li> </ul> <a name="methods"></a> <h2>Methods</h2> <p> The RefSeq annotation and RefSeq RNA alignment tracks were created at UCSC using data from the NCBI RefSeq project. GFF format -data files were downloaded from the file <b>${asmId}_genomic.gff.gz</b> +data files were downloaded from the file <b>${srcAsmId}_genomic.gff.gz</b> delivered with the NCBI RefSeq genome assemblies at the FTP location:<br> <a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/' target='_blank'>ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/</a> +$gcfToGcaLiftedText + The GFF file was converted to the genePred and PSL table formats for display in the Genome Browser. Information about the NCBI annotation pipeline can be found <a href="https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/" target="_blank">here</a>. </p> <h2>Track statistics summary</h2> <p> <b>Total genome size: </b>$totalBasesCmfy <b>bases</b><br><br> <b>Curated and Predicted Gene count: </b>$itemCount<br> <b>Bases in these genes: </b>$basesCovered<br> <b>Percent genome coverage: </b>% $percentCoverage<br> </p>