#!/usr/bin/env perl

# aliasTextToBed.pl - convert a chromAlias.txt file into a sorted bed file,
#   write the matching autoSql (.as) definition, and run bedToBigBed to
#   build a bigBed file with an extra index on every alias name column.
#
# Source text recovered from the diff of
#   src/hg/utils/automation/aliasTextToBed.pl
#   commit 7aefb4e6059a96c47f3ab3b89745bf0f1c697fba
#   hiram Thu Sep 8 14:26:08 2022 -0700
#   "perfectly strict recognizing old style files" (no redmine)
# The script below is the post-patch side of that diff.
#
# Exit status: 0 on success, 255 on any usage or input error.

use Getopt::Long;
use strict;
use warnings;

# usage - print an optional message plus the usage text to STDERR and
#   exit 255.  Never returns.
#   $msg: note to print before the usage text; may be empty.
sub usage {
  my ($msg) = @_;
  if (length($msg)) {
    printf STDERR "%s\n", $msg;
  }
  printf STDERR "usage: aliasTextToBed.pl -chromSizes=chrom.sizes \\\n\t-aliasText=chromAlias.txt -aliasBed=chromAlias.bed \\\n\t-aliasAs=chromAlias.as -aliasBigBed=chromAlias.bb\n";
  printf STDERR "converts the chromAlias.txt file into a bed file and\n\tcorresponding .as definition\n";
  exit 255;
}

my $argc = scalar(@ARGV);
usage("Note: must have five arguments.") if ($argc != 5);

# lexical option variables, filled in by reference from GetOptions
my ($opt_chromSizes, $opt_aliasText, $opt_aliasBed, $opt_aliasBigBed,
    $opt_aliasAs);

my $optsOk = GetOptions(
  "chromSizes=s"  => \$opt_chromSizes,
  "aliasText=s"   => \$opt_aliasText,
  "aliasBed=s"    => \$opt_aliasBed,
  "aliasBigBed=s" => \$opt_aliasBigBed,
  "aliasAs=s"     => \$opt_aliasAs,
);

usage("Note: Cannot recognize the arguments properly ?") if (!$optsOk);

# Five arguments can still leave an option unset (a repeated option or a
# stray non-option word), so verify each one explicitly.
my %required = (
  "chromSizes"  => $opt_chromSizes,
  "aliasText"   => $opt_aliasText,
  "aliasBed"    => $opt_aliasBed,
  "aliasBigBed" => $opt_aliasBigBed,
  "aliasAs"     => $opt_aliasAs,
);
foreach my $optName (sort keys %required) {
  if (!defined($required{$optName})) {
    usage("Note: missing required argument -$optName");
  }
}

printf STDERR "# chromSizes: %s\n", $opt_chromSizes;
printf STDERR "# aliasText %s\n", $opt_aliasText;
printf STDERR "# aliasBed %s\n", $opt_aliasBed;
printf STDERR "# aliasBigBed %s\n", $opt_aliasBigBed;
printf STDERR "# aliasAs %s\n", $opt_aliasAs;

open (my $aliasFh, "<", $opt_aliasText)
  or die "can not read $opt_aliasText: $!";

# The first line is the title line: '#', white space, then one column
# name per alias naming scheme.
my $titleLine = <$aliasFh>;
if (!defined($titleLine)) {
  printf STDERR "ERROR: empty alias file: %s\n", $opt_aliasText;
  exit 255;
}
chomp $titleLine;
if ($titleLine !~ m/^#\s/) {
  printf STDERR "ERROR: unrecognized alias file title line:\n%s\n", $titleLine;
  exit 255;
}

# Reject files whose title line contains '# sequenceName'.
if ($titleLine =~ m/# sequenceName/) {
  printf STDERR "ERROR: this is an older style alias file:\n%s\n", $titleLine;
  exit 255;
}

my %chromSizes;	# key is chrom name, value is size

open (my $sizesFh, "<", $opt_chromSizes)
  or die "can not read the chromSizes: $opt_chromSizes\n";
while (my $line = <$sizesFh>) {
  chomp $line;
  next if ($line =~ m/^\s*$/);	# a blank line carries no chrom name
  my ($chrom, $size) = split('\s+', $line);
  $chromSizes{$chrom} = $size;
}
close ($sizesFh);

# label used in the .as comment string for each known column name
my %nameLabels = (
  "assembly" => "Assembly",
  "genbank" => "GenBank",
  "ncbi" => "NCBI",
  "refseq" => "RefSeq",
  "ucsc" => "UCSC",
  "ensembl" => "Ensembl"
);

$titleLine =~ s/^#\s+//;
my @legendNames = split('\s+', $titleLine);
my $expectFieldCount = scalar(@legendNames);
if ($expectFieldCount < 1) {
  printf STDERR "ERROR: no column names found on the alias file title line\n";
  exit 255;
}

# every alias column gets an extra index in the bigBed file
my $indexNames = join(",", @legendNames);

open (my $asFh, ">", $opt_aliasAs)
  or die "can not write to $opt_aliasAs: $!";

# output the .as definition
printf $asFh "table chromAlias\n";
printf $asFh " \"chromAlias bigBed index\"\n";
printf $asFh " (\n";
printf $asFh " string chrom;\t\"native sequence name\"\n";
printf $asFh " uint chromStart;\t\"always 0\"\n";
printf $asFh " uint chromEnd;\t\"chromosome size\"\n";
foreach my $title (@legendNames) {
  # an unknown column name falls back to the name itself as its label
  my $label = exists($nameLabels{$title}) ? $nameLabels{$title} : $title;
  printf $asFh " string %s;\t\"%s name\"\n", $title, $label;
}
printf $asFh " )\n";
close ($asFh) or die "can not finish writing $opt_aliasAs: $!";

printf STDERR "# indexNames: '%s'\n", $indexNames;

# Pipe the bed lines through sort; the list form of open runs sort
# directly, so the output path is never interpreted by a shell.
open (my $bedFh, "|-", "sort", "-k1,1", "-k2,2n", "-o", $opt_aliasBed)
  or die "can not write to $opt_aliasBed: $!";

while (my $line = <$aliasFh>) {
  chomp $line;
  my @a = split('\t', $line, -1);	# the -1 keeps all empty fields too
  if (scalar(@a) != $expectFieldCount) {
    printf STDERR "ERROR: expected field count %d =! %d on line %d\n", $expectFieldCount, scalar(@a), $.;
    exit 255;
  }
  # The first column is the native name, which is also the bed chrom.
  my $chrom = $a[0];
  if (!exists($chromSizes{$chrom})) {
    printf STDERR "ERROR: sequence '%s' on line %d is not in %s\n", $chrom, $., $opt_chromSizes;
    exit 255;
  }
  # bed3 (chrom, 0, size) followed by every alias column, first included
  printf $bedFh "%s\t0\t%d\t%s\n", $chrom, $chromSizes{$chrom}, join("\t", @a);
}
close ($aliasFh);
# closing a pipe waits for sort and reports its exit status
close ($bedFh) or die "sort into $opt_aliasBed failed (status $?)\n";

# The count of extra bed fields follows the title line instead of being
# fixed at 5.
my @bigBedCmd = (
  "bedToBigBed", "-tab",
  sprintf("-type=bed3+%d", $expectFieldCount),
  "-as=$opt_aliasAs",
  "-extraIndex=$indexNames",
  $opt_aliasBed, $opt_chromSizes, $opt_aliasBigBed
);

printf STDERR "%s\n", join(" ", @bigBedCmd);
# list form of system: no shell involved, bedToBigBed output goes
# directly to this script's own stdout/stderr
if (system(@bigBedCmd) != 0) {
  printf STDERR "ERROR: bedToBigBed failed, status: %d\n", $?;
  exit 255;
}

__END__

Example chromAlias.txt input.  Columns are tab-separated.  Rows showing
only four values presumably have an empty ncbi column; verify against a
real chromAlias.txt file.

# ucsc	assembly	genbank	ncbi	refseq
NW_011332701v1_alt	HG2139_PATCH	KN538374.1		NW_011332701.1
chr1	1	CM000663.2	1	NC_000001.11
chr10	10	CM000672.2	10	NC_000010.11
chr10_NT_187579v1_alt	HSCHR10_1_CTG3	KI270824.1		NT_187579.1
chr10_NT_187580v1_alt	HSCHR10_1_CTG4	KI270825.1		NT_187580.1
chr10_NW_003315934v1_alt	HSCHR10_1_CTG1	GL383545.1		NW_003315934.1
chr10_NW_003315935v1_alt	HSCHR10_1_CTG2	GL383546.1		NW_003315935.1
chr10_NW_009646202v1_alt	HG2191_PATCH	KN196480.1		NW_009646202.1
chr10_NW_011332692v1_alt	HG2241_PATCH	KN538365.1		NW_011332692.1