f68f5f242e08e5273b10f63d4fbc65f2528ee954 hiram Fri Mar 18 15:22:54 2022 -0700 initial version of conversion from chromAlias.txt to chromAlias.bb refs #29109 diff --git src/hg/utils/automation/aliasTextToBed.pl src/hg/utils/automation/aliasTextToBed.pl new file mode 100755 index 0000000..6383e70 --- /dev/null +++ src/hg/utils/automation/aliasTextToBed.pl @@ -0,0 +1,125 @@ +#!/usr/bin/env perl + +use Getopt::Long; +use strict; +use warnings; + +sub usage($) { + my ($msg) = @_; + if (length($msg)) { + printf STDERR "%s\n", $msg; + } + printf STDERR "usage: aliasTextToBed.pl -chromSizes=chrom.sizes \\\n\t-aliasText=chromAlias.txt -aliasBed=chromAlias.bed \\\n\t-aliasAs=chromAlias.as -aliasBigBed=chromAlias.bb\n"; + printf STDERR "converts the chromAlias.txt file into a bed file and\n\tcorresponding .as definition\n"; + exit 255; +} + +my $argc = scalar(@ARGV); +usage("Note: must have five arguments.") if ($argc != 5); + +use vars qw/ + $opt_chromSizes + $opt_aliasText + $opt_aliasBed + $opt_aliasBigBed + $opt_aliasAs + /; + +my $optsOk = GetOptions( + "chromSizes=s", + "aliasText=s", + "aliasBed=s", + "aliasBigBed=s", + "aliasAs=s", + ); + +usage("Note: Cannot recognize the arguments properly ?") if (!$optsOk); + +printf STDERR "# chromSizes: %s\n", $opt_chromSizes; +printf STDERR "# aliasText %s\n", $opt_aliasText; +printf STDERR "# aliasBed %s\n", $opt_aliasBed; +printf STDERR "# aliasBigBed %s\n", $opt_aliasBigBed; +printf STDERR "# aliasAs %s\n", $opt_aliasAs; + +open (FH, "<$opt_aliasText") or die "can not read $opt_aliasText"; +my $titleLine = ; +chomp $titleLine; +if ($titleLine !~ m/^#\s/) { + printf STDERR "ERROR: unrecognized alias file title line:\n%s\n", $titleLine; + exit 255; +} +if ($titleLine =~ m/sequenceName/) { + printf STDERR "ERROR: this is an older style alias file:\n%s\n", $titleLine; + exit 255; +} + +my %chromSizes; # key is chrom name, value is size +open (SZ, "<$opt_chromSizes") or die "can not read the chromSizes: $opt_chromSizes\n"; +while (my $line = ) { + chomp $line; + my ($chrom, $size) = split('\s+', $line); + $chromSizes{$chrom} = $size; +} +close (SZ); + +my %nameLabels = ( + "assembly" => "Assembly", + "genbank" => "GenBank", + "ncbi" => "NCBI", + "refseq" => "RefSeq", + "ucsc" => "UCSC", + "ensembl" => "Ensembl" +); + +open (AS, ">$opt_aliasAs") or die "can not write to $opt_aliasAs"; + +$titleLine =~ s/^#\s+//; +my @legendNames = split('\s+', $titleLine); +my $expectFieldCount = scalar(@legendNames); +my $i = 0; +# output the .as definition +printf AS "table chromAlias\n"; +printf AS " \"chromAlias bigBed index\"\n"; +printf AS " (\n"; +printf AS " string chrom;\t\"native sequence name\"\n"; +printf AS " uint chromStart;\t\"always 0\"\n"; +printf AS " uint chromEnd;\t\"chromosome size\"\n"; +foreach my $title (@legendNames) { + printf AS " string %s;\t\"%s name\"\n", $title, $nameLabels{$title}; +} +printf AS " )\n"; +close (AS); + +open (BD, ">$opt_aliasBed") or die "can not write to $opt_aliasBed"; + +while (my $line = ) { + chomp $line; + my @a = split('\t', $line, -1); # the -1 keeps all empty fields too + if (scalar(@a) != $expectFieldCount) { + printf STDERR "ERROR: expected field count %d =! %d on line %d\n", $expectFieldCount, scalar(@a), $.; + exit 255; + } + my $nameIndex = 0; + printf BD "%s\t0\t%d", $a[$nameIndex], $chromSizes{$a[$nameIndex]}; + foreach my $name (@a) { + printf BD "\t%s", $a[$nameIndex++]; + } + printf BD "\n"; +} +close (FH); +close (BD); + +print `bedToBigBed -tab -type=bed3+5 -as=$opt_aliasAs -extraIndex=ucsc,assembly,genbank,ncbi,refseq $opt_aliasBed $opt_chromSizes $opt_aliasBigBed`; + +__END__ + +# ucsc assembly genbank ncbi refseq +NW_011332701v1_alt HG2139_PATCH KN538374.1 NW_011332701.1 +chr1 1 CM000663.2 1 NC_000001.11 +chr10 10 CM000672.2 10 NC_000010.11 +chr10_NT_187579v1_alt HSCHR10_1_CTG3 KI270824.1 NT_187579.1 +chr10_NT_187580v1_alt HSCHR10_1_CTG4 KI270825.1 NT_187580.1 +chr10_NW_003315934v1_alt HSCHR10_1_CTG1 GL383545.1 NW_003315934.1 +chr10_NW_003315935v1_alt HSCHR10_1_CTG2 GL383546.1 NW_003315935.1 +chr10_NW_009646202v1_alt HG2191_PATCH KN196480.1 NW_009646202.1 +chr10_NW_011332692v1_alt HG2241_PATCH KN538365.1 NW_011332692.1