#!/usr/bin/env perl

# asmHubNcbiRefSeq.pl - write the description page text for the NCBI RefSeq
# genes composite track of an assembly hub to STDOUT.
#
# usage: asmHubNcbiRefSeq.pl asmId asmId.names.tab .../trackData/
#
# State of the script as of commit 9374faeb653ab0653e07e6ba890d6531395ae476
# (hiram, Fri May 24 09:59:56 2024 -0700): "correctly identify the source for
# the genes when coming from GCF onto GCA" (no redmine).

use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use AsmHub;
use File::Basename;
use File::Glob qw(bsd_glob);

my $argc = scalar(@ARGV);

if ($argc != 3) {
  print STDERR "usage: asmHubNcbiRefSeq.pl asmId asmId.names.tab .../trackData/\n";
  print STDERR "where asmId is the assembly identifier,\n";
  print STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
  exit 255;
}

my $asmId = shift;
# the first two '_' separated fields of the asmId form the accession
my @parts = split('_', $asmId, 3);
my $accession = "$parts[0]_$parts[1]";
my $namesFile = shift;
my $trackDataDir = shift;
my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb";

# The downloaded GFF file name identifies the assembly the annotations were
# taken from.  It differs from $asmId when the annotations from one assembly
# were lifted onto another.  bsd_glob returns sorted matches, and an empty
# list when nothing matches, so no shell is needed to find the file.
my ($srcGff) = bsd_glob("$trackDataDir/ncbiRefSeq/download/*_genomic.gff.gz");
my $srcAsmId = $asmId;
my $gcfToGcaLiftedText = "";
if (defined($srcGff)) {
  $srcAsmId = basename($srcGff);
  # strip only the literal file name suffix
  $srcAsmId =~ s/_genomic\.gff\.gz\z//;
  if ($srcAsmId ne $asmId) {
    $gcfToGcaLiftedText = "RefSeq annotations from $srcAsmId were lifted to this $asmId assembly to provide these gene annotations on this corresponding assembly.";
  }
}

my $asmIdPath = AsmHub::asmIdToPath($asmId);
# NOTE(review): $downloadGtf is not referenced in the visible page text;
# presumably it is used in markup that is not shown here - confirm before removing.
my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.ncbiRefSeq.gtf.gz";

if ( ! -s $ncbiRefSeqBbi ) {
  print STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n";
  exit 255;
}

# The FTP directory is built entirely from the source assembly: the path
# components and the final directory name must name the same assembly,
# otherwise a lifted annotation set points at a directory mixing two accessions.
my @partNames = split('_', $srcAsmId);
my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0],
   substr($partNames[1],0,3), substr($partNames[1],3,3),
   substr($partNames[1],6,3), $srcAsmId);

my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`;
chomp $totalBases;
# a missing or empty chrom.sizes would otherwise end in a division by zero
if ($totalBases !~ m/\A\d+\z/ || $totalBases == 0) {
  print STDERR "ERROR: can not determine total genome size from ${asmId}.chrom.sizes\n";
  exit 255;
}

# second column of each line of the stats file: item count, then bases covered
my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`;
chomp $geneStats;
my ($itemCount, $basesCovered) = split('\s+', $geneStats);
if (!defined($itemCount) || !defined($basesCovered)) {
  print STDERR "ERROR: can not read item count and bases covered from ${asmId}.ncbiRefSeq.stats.txt\n";
  exit 255;
}
my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases);
$itemCount = AsmHub::commify($itemCount);
$basesCovered = AsmHub::commify($basesCovered);
my $totalBasesCmfy = AsmHub::commify($totalBases);

# NOTE(review): these wrap the organism name in the page text and are empty
# strings here; presumably they held emphasis markup - confirm.
my $em = "";
my $noEm = "";

# values are taken from columns 9, 10 and 5 of the non-comment lines of the names file
my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`;
chomp $assemblyDate;
my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`;
chomp $ncbiAssemblyId;
my $organism = `grep -v "^#" $namesFile | cut -f5`;
chomp $organism;

print <<_EOF_

Description

The NCBI RefSeq Genes composite track shows $assemblyDate $em${organism}$noEm/$asmId protein-coding and non-protein-coding genes taken from the NCBI RNA reference sequences collection (RefSeq). All subtracks use coordinates provided by RefSeq. See the Methods section for more details about how the different tracks were created.

Please visit NCBI's Feedback for Gene and Reference Sequences (RefSeq) page to make suggestions, submit additions and corrections, or ask for help concerning RefSeq records.

For more information on the different gene tracks, see our Genes FAQ.

Data Access

Download $asmId.ncbiRefSeq.gtf.gz GTF file.

Display Conventions and Configuration

To show only a selected set of subtracks, uncheck the boxes next to the tracks that you wish to hide.

The tracks available here can include (not all may be present):
RefSeq annotations and alignments

The RefSeq All, RefSeq Curated and RefSeq Predicted tracks follow the display conventions for gene prediction tracks. The color shading indicates the level of review the RefSeq record has undergone: predicted (light), provisional (medium), or reviewed (dark), as defined by RefSeq.

Color Level of review
Reviewed: the RefSeq record has been reviewed by NCBI staff or by a collaborator. The NCBI review process includes assessing available sequence data and the literature. Some RefSeq records may incorporate expanded sequence and annotation information.
Provisional: the RefSeq record has not yet been subject to individual review. The initial sequence-to-gene association has been established by outside collaborators or NCBI staff.
Predicted: the RefSeq record has not yet been subject to individual review, and some aspect of the RefSeq record is predicted.

The RefSeq Alignments track follows the display conventions for PSL tracks.

The item labels and codon display properties for features within this track can be configured through the controls at the top of the track description page. To adjust the settings for an individual subtrack, click the wrench icon next to the track name in the subtrack list.

Methods

The RefSeq annotation and RefSeq RNA alignment tracks were created at UCSC using data from the NCBI RefSeq project. GFF format data files were downloaded from the file ${srcAsmId}_genomic.gff.gz delivered with the NCBI RefSeq genome assemblies at the FTP location:
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/
$gcfToGcaLiftedText

The GFF file was converted to the genePred and PSL table formats for display in the Genome Browser. Information about the NCBI annotation pipeline can be found here.

Track statistics summary

Total genome size: $totalBasesCmfy bases

Curated and Predicted Gene count: $itemCount
Bases in these genes: $basesCovered
Percent genome coverage: % $percentCoverage

_EOF_
;

# statsSection - print the statistics paragraph for one gene category.
#
#   $statsFile   - path to the category stats file; the second column of its
#                  first two lines is read as item count and bases covered
#   $genomeSize  - total bases in the assembly, used for percent coverage
#   $countLabel  - label printed in front of the item count
#   $basesLabel  - label printed in front of the bases covered
#   $noneText    - sentence printed when the stats file is missing or empty;
#                  undef prints nothing in that case
#
# print is used rather than printf so the interpolated values are never
# interpreted as a format string; the text written is the same.
sub statsSection {
  my ($statsFile, $genomeSize, $countLabel, $basesLabel, $noneText) = @_;

  if ( ! -s $statsFile ) {
    print "\n${noneText}\n\n" if (defined($noneText));
    return;
  }

  my $stats = `cat $statsFile | awk '{printf "%d\\n", \$2}' | xargs echo`;
  chomp $stats;
  my ($count, $bases) = split('\s+', $stats);
  my $percent = sprintf("%.3f", 100.0 * $bases / $genomeSize);
  $count = AsmHub::commify($count);
  $bases = AsmHub::commify($bases);

  print "\n${countLabel}: ${count}\n",
        "${basesLabel}: ${bases}\n",
        "Percent genome coverage: % ${percent}\n\n";
  return;
}

statsSection("$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqCurated.stats.txt",
  $totalBases, "Curated gene count", "Bases in curated genes",
  "There are no curated gene annotations.");

statsSection("$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqPredicted.stats.txt",
  $totalBases, "Predicted gene count", "Bases in genes",
  "there are no predicted gene annotations");

# the 'other' category has no message when its stats file is absent
statsSection("$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqOther.stats.txt",
  $totalBases, "Other annotation count", "Bases in other annotations", undef);

printf <<_EOF_

Credits

This track was produced at UCSC from data generated by scientists worldwide and curated by the NCBI RefSeq project.

References

Kent WJ. BLAT - the BLAST-like alignment tool. Genome Res. 2002 Apr;12(4):656-64. PMID: 11932250; PMC: PMC187518

Pruitt KD, Brown GR, Hiatt SM, Thibaud-Nissen F, Astashyn A, Ermolaeva O, Farrell CM, Hart J, Landrum MJ, McGarvey KM et al. RefSeq: an update on mammalian reference sequences. Nucleic Acids Res. 2014 Jan;42(Database issue):D756-63. PMID: 24259432; PMC: PMC3965018

Pruitt KD, Tatusova T, Maglott DR. NCBI Reference Sequence (RefSeq): a curated non-redundant sequence database of genomes, transcripts and proteins. Nucleic Acids Res. 2005 Jan 1;33(Database issue):D501-4. PMID: 15608248; PMC: PMC539979

_EOF_