9374faeb653ab0653e07e6ba890d6531395ae476 hiram Fri May 24 09:59:56 2024 -0700 correctly identify the source for the genes when coming from GCF onto GCA no redmine diff --git src/hg/utils/automation/asmHubNcbiRefSeq.pl src/hg/utils/automation/asmHubNcbiRefSeq.pl index e79b908..c254633 100755 --- src/hg/utils/automation/asmHubNcbiRefSeq.pl +++ src/hg/utils/automation/asmHubNcbiRefSeq.pl @@ -10,45 +10,54 @@ my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "usage: asmHubNcbiGene.pl asmId asmId.names.tab .../trackData/\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; exit 255; } my $asmId = shift; my @parts = split('_', $asmId, 3); my $accession = "$parts[0]_$parts[1]"; my $namesFile = shift; my $trackDataDir = shift; my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb"; -my $asmType = "refseq"; +my $srcGff = `ls $trackDataDir/ncbiRefSeq/download/*_genomic.gff.gz | head -1`; +chomp $srcGff; +my $srcAsmId = $asmId; +my $gcfToGcaLiftedText = ""; +if (length($srcGff) > 10) { + $srcAsmId = basename($srcGff); + $srcAsmId =~ s/_genomic.gff.gz//; + if ($srcAsmId ne $asmId) { + $gcfToGcaLiftedText = "RefSeq annotations from $srcAsmId were lifted to this $asmId assembly to provide these gene annotations on this corresponding assembly." + } +} my $asmIdPath = &AsmHub::asmIdToPath($asmId); my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.ncbiRefSeq.gtf.gz"; if ( ! -s $ncbiRefSeqBbi ) { printf STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n"; exit 255; } -my @partNames = split('_', $asmId); +my @partNames = split('_', $srcAsmId); my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $asmId); -$asmType = "genbank" if ($partNames[0] =~ m/GCA/); my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; chomp $totalBases; my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; chomp $geneStats; my ($itemCount, $basesCovered) = split('\s+', $geneStats); my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); $itemCount = &AsmHub::commify($itemCount); $basesCovered = &AsmHub::commify($basesCovered); my $totalBasesCmfy = &AsmHub::commify($totalBases); my $em = ""; my $noEm = ""; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; @@ -164,34 +173,36 @@ including the gene name, OMIM identifier and accession names, or turn off the label completely.
  • Codon coloring: This track has an optional codon coloring feature that allows users to quickly validate and compare gene predictions. To display codon colors, select the genomic codons option from the Color track by codons pull-down menu. For more information about this feature, go to the Coloring Gene Predictions and Annotations by Codon page.
  • Methods

    The RefSeq annotation and RefSeq RNA alignment tracks were created at UCSC using data from the NCBI RefSeq project. GFF format -data files were downloaded from the file ${asmId}_genomic.gff.gz +data files were downloaded from the file ${srcAsmId}_genomic.gff.gz delivered with the NCBI RefSeq genome assemblies at the FTP location:
    ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/ +$gcfToGcaLiftedText + The GFF file was converted to the genePred and PSL table formats for display in the Genome Browser. Information about the NCBI annotation pipeline can be found here.

    Track statistics summary

    Total genome size: $totalBasesCmfy bases

    Curated and Predicted Gene count: $itemCount
    Bases in these genes: $basesCovered
    Percent genome coverage: % $percentCoverage