97199fbf1ff56c9ee01956deb736b97244ea3ac6 hiram Sun Mar 1 21:52:28 2020 -0800 calculating featureBits like measurement for gene tracks, removing duplicates for ncbiRefSeq, remove blanks from gene names for ncbiRefSeq, and fix fundamental bug reference to geneToId in ncbiRefSeqOtherAttrs.pl refs #23891 diff --git src/hg/utils/automation/ncbiRefSeqOtherAttrs.pl src/hg/utils/automation/ncbiRefSeqOtherAttrs.pl index fa82a6c..77898eb 100755 --- src/hg/utils/automation/ncbiRefSeqOtherAttrs.pl +++ src/hg/utils/automation/ncbiRefSeqOtherAttrs.pl @@ -1,203 +1,205 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/ncbiRefSeqOtherAttrs.pl instead. # This script reads predigested attributes (gff3ToGenePred -attrsOut=...) and BED 12 input. # It outputs BED 12 + with attributes in extra fields. use warnings; use strict; my $base = $0; $base =~ s/^(.*\/)?//; my $asFile = "ncbiRefSeqOther.as"; sub usage { print STDERR " usage: $base \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed Adds GFF3 attributes to BED to make BED+ for ncbiRefSeqOther bigBed. Generates $asFile autoSql file. See doNcbiRefSeq.pl. "; } # usage # NCBI GFF3 attribute (or derivative) names, bigBed column names and descriptions # (used as hgc table labels), in the order of extra fields: my @attributes = (['gene', 'gene', "Gene name"], ['GeneID', 'GeneID', "Entrez Gene"], ['MIM', 'MIM', "OMIM"], ['HGNC', 'HGNC', "HGNC"], ['MGI', 'MGI', "MGI"], ['WormBase', 'WormBase', "WormBase"], ['XenBase', 'XenBase', "XenBase"], ['BGD', 'BGD', "BGD"], ['RGD', 'RGD', "RGD"], ['SGD', 'SGD', "SGD"], ['ZFIN', 'ZFIN', "ZFIN"], ['FlyBase', 'FlyBase', "FlyBase"], ['miRBase', 'miRBase', "miRBase"], ['description', 'description', "Description"], ['Note', 'Note', "Note"], ['exception', 'exception', "Exceptional properties"], ['product', 'product', "Gene product"], ['gene_synonym', 'geneSynonym', "Gene synonyms"], ['model_evidence', 'modelEvidence', "Supporting evidence for gene model"], ['gbkey', 'gbkey', "Feature type"], ['gene_biotype', 'geneBiotype', "Gene biotype"], ['pseudo', 'pseudo', "'true' if pseudogene"], ['partial', 'partial', "'true' if partial"], ['anticodon', 'anticodon', "Anticodon position within tRNA"], ['codons', 'codons', "Number of codons in anticodon range"], ['start_range', 'startRange', "Start range on genome"], ['end_range', 'endRange', "End range on genome"], ['ID', 'ID', "Unique ID in RefSeq GFF3"] ); my @attrOrder = map { $_->[0] } @attributes; # Command line args my $bedFile = shift @ARGV; my $attrsFile = shift @ARGV; if (not (defined $bedFile && defined $attrsFile && scalar @ARGV == 0)) { usage(); exit 1; } # Map attribute names to indexes in extra field order: my %attrToIx = (); my @attrFound = (); for (my $i = 0; $i <= $#attrOrder; $i++) { $attrToIx{$attrOrder[$i]} = $i; $attrFound[$i] = 0; } # Map item IDs to extra columns: my %itemAttrs = (); my %ignoredAttrs = (); my %geneToId = (); my %idToParent = (); open(my $ATTRS, $attrsFile) || die "Can't open attributes file '$attrsFile': $!\n"; while (<$ATTRS>) { chomp; my ($id, $attr, $val) = split("\t"); + $id =~ s/ /_/g; if ($attr eq 'Dbxref') { # Dbxref is one attribute, but split it up into multiple output columns for URL generation my @xrefs = split(',', $val); foreach my $xref (@xrefs) { foreach my $source qw(GeneID MIM HGNC MGI miRBase WormBase XenBase BGD RGD SGD ZFIN FLYBASE) { if ($xref =~ s/^$source://) { my $ix = $attrToIx{$source}; $itemAttrs{$id}->[$ix] = $xref if (! defined $itemAttrs{$id}->[$ix]); $attrFound[$ix] = 1; } } } } elsif ($attr eq 'Parent') { + $val =~ s/ /_/g; $idToParent{$id} = $val; } else { my $ix = $attrToIx{$attr}; if (defined $ix) { if ($attr eq 'gene') { $geneToId{$val} = $id; } $itemAttrs{$id}->[$ix] = $val; $attrFound[$ix] = 1; } else { $ignoredAttrs{$attr}++; } } } close($ATTRS); foreach my $attr (sort { $ignoredAttrs{$b} <=> $ignoredAttrs{$a} } keys %ignoredAttrs) { warn "Ignored attribute '$attr' " . $ignoredAttrs{$attr} . " times\n"; } # Make autoSql open (my $AS, ">$asFile") || die "Can't open $asFile for writing: $!\n"; print $AS <<_EOF_ table ncbiRefSeqOther "Additional information for NCBI 'Other' annotation" ( string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Placeholder for BED format (score 0-1000)" char[1] strand; "+ or -" uint thickStart; "CDS start/end in chromosome if coding" uint thickEnd; "CDS start/end in chromosome if coding" uint reserved; "Placeholder for BED format (itemRgb)" int blockCount; "Number of alignment blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Block start positions relative to chromStart" _EOF_ ; for (my $ix = 0; $ix <= $#attributes; $ix++) { if ($attrFound[$ix]) { my ($attr, $colName, $desc) = @{$attributes[$ix]}; if ($colName eq "Note" || $colName eq "geneSynonym") { print $AS " lstring $colName; \"$desc\"\n"; } else { print $AS " string $colName; \"$desc\"\n"; } } } print $AS ")\n"; close($AS); # Make a list of indexes of empty columns, in descending order, for splicing out: for (my $ix = 0; $ix <= $#attrOrder; $ix++) { if (! $attrFound[$ix]) { warn "No values found for $attrOrder[$ix]; removing column from output.\n"; } } # Read BED and append extra columns: open(my $BED, $bedFile) || die "Can't open BED file '$bedFile': $!\n"; while (<$BED>) { chomp; my @bedCols = split; my $id = $bedCols[3]; my $extraCols = $itemAttrs{$id}; if (! defined $extraCols) { - $id = geneToId{$id}; + $id = $geneToId{$id}; $extraCols = $itemAttrs{$id}; } die "No attributes for bed name '$bedCols[3]' (id '$id')" unless defined $extraCols; foreach my $ix (0..$#attrOrder) { if ($attrFound[$ix]) { # If the desired attribute isn't there for $id, but $id has ancestors, look for attr there my $parentId = $idToParent{$id}; while (! defined $extraCols->[$ix] && defined $parentId) { my $parent = $itemAttrs{$parentId} || $itemAttrs{$geneToId{$parentId}}; if (defined $parent) { if (defined $parent->[$ix]) { $extraCols->[$ix] = $parent->[$ix]; } else { $parentId = $idToParent{$parentId}; } } else { warn "Can't find parent for id '$id'\n"; last; } } if (! defined $extraCols->[$ix]) { $extraCols->[$ix] = ""; } } } print join ("\t", @bedCols); # Print out only non-empty extra columns. foreach my $ix (0..$#attrOrder) { if ($attrFound[$ix]) { print "\t" . $extraCols->[$ix]; } } print "\n"; } close($BED);