9374faeb653ab0653e07e6ba890d6531395ae476 hiram Fri May 24 09:59:56 2024 -0700 correctly identify the source for the genes when coming from GCF onto GCA no redmine diff --git src/hg/utils/automation/asmHubNcbiRefSeq.pl src/hg/utils/automation/asmHubNcbiRefSeq.pl index e79b908..c254633 100755 --- src/hg/utils/automation/asmHubNcbiRefSeq.pl +++ src/hg/utils/automation/asmHubNcbiRefSeq.pl @@ -10,45 +10,54 @@ my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "usage: asmHubNcbiGene.pl asmId asmId.names.tab .../trackData/\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; exit 255; } my $asmId = shift; my @parts = split('_', $asmId, 3); my $accession = "$parts[0]_$parts[1]"; my $namesFile = shift; my $trackDataDir = shift; my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb"; -my $asmType = "refseq"; +my $srcGff = `ls $trackDataDir/ncbiRefSeq/download/*_genomic.gff.gz | head -1`; +chomp $srcGff; +my $srcAsmId = $asmId; +my $gcfToGcaLiftedText = ""; +if (length($srcGff) > 10) { + $srcAsmId = basename($srcGff); + $srcAsmId =~ s/_genomic.gff.gz//; + if ($srcAsmId ne $asmId) { + $gcfToGcaLiftedText = "RefSeq annotations from $srcAsmId were lifted to this $asmId assembly to provide these gene annotations on this corresponding assembly." + } +} my $asmIdPath = &AsmHub::asmIdToPath($asmId); my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.ncbiRefSeq.gtf.gz"; if ( ! -s $ncbiRefSeqBbi ) { printf STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n"; exit 255; } -my @partNames = split('_', $asmId); +my @partNames = split('_', $srcAsmId); my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $asmId); -$asmType = "genbank" if ($partNames[0] =~ m/GCA/); my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; chomp $totalBases; my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; chomp $geneStats; my ($itemCount, $basesCovered) = split('\s+', $geneStats); my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); $itemCount = &AsmHub::commify($itemCount); $basesCovered = &AsmHub::commify($basesCovered); my $totalBasesCmfy = &AsmHub::commify($totalBases); my $em = ""; my $noEm = ""; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; @@ -164,34 +173,36 @@ including the gene name, OMIM identifier and accession names, or turn off the label completely.
The RefSeq annotation and RefSeq RNA alignment tracks
were created at UCSC using data from the NCBI RefSeq project. GFF format
-data files were downloaded from the file ${asmId}_genomic.gff.gz
+data files were downloaded from the file ${srcAsmId}_genomic.gff.gz
delivered with the NCBI RefSeq genome assemblies at the FTP location:
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/
+$gcfToGcaLiftedText
+
The GFF file was converted to the
genePred and PSL table formats for display in the Genome Browser.
Information about the NCBI annotation pipeline can be found
here.
Total genome size: $totalBasesCmfy bases
Curated and Predicted Gene count: $itemCount
Bases in these genes: $basesCovered
Percent genome coverage: % $percentCoverage