b4924336d330068ac4c39222f6cd730d33f486a1 hiram Thu Mar 6 15:35:53 2025 -0800 can also fixup geneName2 when transition takes place refs #32704 diff --git src/hg/utils/automation/updateName2.pl src/hg/utils/automation/updateName2.pl index bc0a8188702..aa4d9cb3cf2 100755 --- src/hg/utils/automation/updateName2.pl +++ src/hg/utils/automation/updateName2.pl @@ -1,131 +1,135 @@ #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 2) { printf STDERR "usage: updateName2.pl <file.attrs.txt> <file.gp>\n"; printf STDERR "will read the file.gp converting it to a bigGenePred\n"; printf STDERR "with column 18 of the bigGenePred outputting the\n"; printf STDERR "GeneID identifier when found in the file.attrs.txt\n"; printf STDERR "output of the revised bigBigGenePred will be to stdout.\n"; printf STDERR "This function is used in the doNcbiRefSeq.pl script.\n"; exit 255; } my $attrs = shift; my $gpFile = shift; my %dupList; # key is gene name, placed on this list when duplicate IDs found # for the same named gene, avoid these. my %geneId; # key is gene name from colunn 1, value is GeneID number open (my $fh, "-|", "grep GeneID $attrs") or die "can not grep $attrs"; while (my $line = <$fh>) { chomp $line; my @a = split(/\s+/, $line); my $geneName = $a[0]; next if (defined($dupList{$geneName})); my $geneId = $a[-1]; my @b = split(/,/, $geneId); # could be csv list of gene IDs here, looking for the 'GeneID' for my $maybeGeneId (@b) { if ($maybeGeneId =~ m/^GeneID/i) { $maybeGeneId =~ s/GeneID://i; if (defined($geneId{$geneName})) { if ($geneId{$geneName} ne $maybeGeneId) { $dupList{$geneName} = "$geneId{$geneName},${maybeGeneId}"; delete $geneId{$geneName}; next; } } else { $geneId{$geneName} = $maybeGeneId; } last; } } } close ($fh); my $updatedNames = 0; my $totalItems = 0; open ($fh, "-|", "genePredToBigGenePred $gpFile stdout") or die "can not read $gpFile"; while (my $line = <$fh>) { chomp $line; ++$totalItems; # the -1 keeps the trailing empty field at the end of the line my @a = split(/\t/, $line, -1); - # if name is equal to name2 see if name2 can be improved + # if name is equal to geneName see if geneName can be improved if ($a[3] eq $a[17]) { if (defined($geneId{$a[3]})) { $a[17] = $geneId{$a[3]}; ++$updatedNames; + # if name2 is equal to geneName2 reproduce name in geneName2 + if ($a[12] eq $a[18]) { + $a[18] = $a[3]; + } } } printf "%s\n", join("\t", @a); } close ($fh); printf STDERR "### updated %d items of total %d - %s\n", $updatedNames, $totalItems, $gpFile; __END__ 00 NW_027257890.1 01 99779 02 105382 03 XM_071090490.1 04 0 05 + 06 99779 07 105239 08 0 09 2 10 58,349, 11 0,5254, 12 LOC139361723 13 cmpl 14 cmpl 15 0,1, 16 17 XM_071090490.1 18 LOC139361723 19 bigGenePredFields: 1 string chrom; "Reference sequence chromosome or scaffold" 2 uint chromStart; "Start position in chromosome" 3 uint chromEnd; "End position in chromosome" 4 string name; "Name or ID of item, ideally both human readable and unique" 5 uint score; "Score (0-1000)" 6 char[1] strand; "+ or - for strand" 7 uint thickStart; "Start of where display should be thick (start codon)" 8 uint thickEnd; "End of where display should be thick (stop codon)" 9 uint reserved; "RGB value (use R,G,B string in input file)" 10 int blockCount; "Number of blocks" 11 int[blockCount] blockSizes; "Comma separated list of block sizes" 12 int[blockCount] chromStarts; "Start positions relative to chromStart" 13 string name2; "Alternative/human readable name" 14 string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)" 15 string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)" 16 int[blockCount] exonFrames; "Reading frame of the start of the CDS region of the exon, in the direction of transcription (0,1,2), or -1 if there is no CDS region." 17 string type; "Transcript type" 18 string geneName; "Primary identifier for gene" 19 string geneName2; "Alternative/human readable gene name" 20 string geneType; "Gene type" grep LOC139361723 GCF_043159975.1_mMacNem.hap1.attrs.txt gene-LOC139361723 ID gene-LOC139361723 gene-LOC139361723 Dbxref GeneID:139361723 gene-LOC139361723 Name LOC139361723 gene-LOC139361723 description beta-defensin 109 gene-LOC139361723 gbkey Gene gene-LOC139361723 gene LOC139361723 gene-LOC139361723 gene_biotype protein_coding gene-LOC139361723 Seqid NW_027257890.1 gene-LOC139361723 Source Gnomon gene-LOC139361723 Type gene gene-LOC139361723 Start 99779 gene-LOC139361723 End 105382 gene-LOC139361723 Strand + XM_071090490.1 Parent gene-LOC139361723