e7dfb28b932077142b9e5db6f43a21ddbd32e7fa hiram Thu Mar 6 09:36:52 2025 -0800 function to update column 18 of bigGenePred to be GeneID for link to NCBI refs #34704 diff --git src/hg/utils/automation/updateName2.pl src/hg/utils/automation/updateName2.pl new file mode 100755 index 00000000000..bc0a8188702 --- /dev/null +++ src/hg/utils/automation/updateName2.pl @@ -0,0 +1,131 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); +if ($argc != 2) { + printf STDERR "usage: updateName2.pl <file.attrs.txt> <file.gp>\n"; + printf STDERR "will read the file.gp converting it to a bigGenePred\n"; + printf STDERR "with column 18 of the bigGenePred outputting the\n"; + printf STDERR "GeneID identifier when found in the file.attrs.txt\n"; + printf STDERR "output of the revised bigBigGenePred will be to stdout.\n"; + printf STDERR "This function is used in the doNcbiRefSeq.pl script.\n"; + exit 255; +} +my $attrs = shift; +my $gpFile = shift; + +my %dupList; # key is gene name, placed on this list when duplicate IDs found + # for the same named gene, avoid these. +my %geneId; # key is gene name from colunn 1, value is GeneID number + +open (my $fh, "-|", "grep GeneID $attrs") or die "can not grep $attrs"; +while (my $line = <$fh>) { + chomp $line; + my @a = split(/\s+/, $line); + my $geneName = $a[0]; + next if (defined($dupList{$geneName})); + my $geneId = $a[-1]; + my @b = split(/,/, $geneId); + # could be csv list of gene IDs here, looking for the 'GeneID' + for my $maybeGeneId (@b) { + if ($maybeGeneId =~ m/^GeneID/i) { + $maybeGeneId =~ s/GeneID://i; + if (defined($geneId{$geneName})) { + if ($geneId{$geneName} ne $maybeGeneId) { + $dupList{$geneName} = "$geneId{$geneName},${maybeGeneId}"; + delete $geneId{$geneName}; + next; + } + } else { + $geneId{$geneName} = $maybeGeneId; + } + last; + } + } +} +close ($fh); + +my $updatedNames = 0; +my $totalItems = 0; +open ($fh, "-|", "genePredToBigGenePred $gpFile stdout") or die "can not read $gpFile"; +while (my $line = <$fh>) { + chomp $line; + ++$totalItems; + # the -1 keeps the trailing empty field at the end of the line + my @a = split(/\t/, $line, -1); + # if name is equal to name2 see if name2 can be improved + if ($a[3] eq $a[17]) { + if (defined($geneId{$a[3]})) { + $a[17] = $geneId{$a[3]}; + ++$updatedNames; + } + } + printf "%s\n", join("\t", @a); +} +close ($fh); +printf STDERR "### updated %d items of total %d - %s\n", $updatedNames, $totalItems, $gpFile; + +__END__ + +00 NW_027257890.1 +01 99779 +02 105382 +03 XM_071090490.1 +04 0 +05 + +06 99779 +07 105239 +08 0 +09 2 +10 58,349, +11 0,5254, +12 LOC139361723 +13 cmpl +14 cmpl +15 0,1, +16 +17 XM_071090490.1 +18 LOC139361723 +19 + +bigGenePredFields: + + 1 string chrom; "Reference sequence chromosome or scaffold" + 2 uint chromStart; "Start position in chromosome" + 3 uint chromEnd; "End position in chromosome" + 4 string name; "Name or ID of item, ideally both human readable and unique" + 5 uint score; "Score (0-1000)" + 6 char[1] strand; "+ or - for strand" + 7 uint thickStart; "Start of where display should be thick (start codon)" + 8 uint thickEnd; "End of where display should be thick (stop codon)" + 9 uint reserved; "RGB value (use R,G,B string in input file)" + 10 int blockCount; "Number of blocks" + 11 int[blockCount] blockSizes; "Comma separated list of block sizes" + 12 int[blockCount] chromStarts; "Start positions relative to chromStart" + 13 string name2; "Alternative/human readable name" + 14 string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)" + 15 string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)" + 16 int[blockCount] exonFrames; "Reading frame of the start of the CDS region of the exon, in the direction of transcription (0,1,2), or -1 if there is no CDS region." + 17 string type; "Transcript type" + 18 string geneName; "Primary identifier for gene" + 19 string geneName2; "Alternative/human readable gene name" + 20 string geneType; "Gene type" + + +grep LOC139361723 GCF_043159975.1_mMacNem.hap1.attrs.txt +gene-LOC139361723 ID gene-LOC139361723 +gene-LOC139361723 Dbxref GeneID:139361723 +gene-LOC139361723 Name LOC139361723 +gene-LOC139361723 description beta-defensin 109 +gene-LOC139361723 gbkey Gene +gene-LOC139361723 gene LOC139361723 +gene-LOC139361723 gene_biotype protein_coding +gene-LOC139361723 Seqid NW_027257890.1 +gene-LOC139361723 Source Gnomon +gene-LOC139361723 Type gene +gene-LOC139361723 Start 99779 +gene-LOC139361723 End 105382 +gene-LOC139361723 Strand + +XM_071090490.1 Parent gene-LOC139361723