8c3236684dfa12ba4f8841fab2dbe43127091148 hiram Thu Sep 30 18:19:24 2021 -0700 fixups to function correctly with both target and query are assembly hubs no redmine
diff --git src/hg/utils/automation/asmHubChainNet.pl src/hg/utils/automation/asmHubChainNet.pl
new file mode 100755
index 0000000..a2d6bc4
--- /dev/null
+++ src/hg/utils/automation/asmHubChainNet.pl
@@ -0,0 +1,201 @@
+#!/usr/bin/env perl
+
+# asmHubChainNet.pl - print the chain/net track description page to stdout.
+#
+# Arguments (all four are required, in this order):
+#   asmId            - identifier of the target assembly
+#   asmId.names.tab  - tab-separated naming file for the target assembly;
+#                      column 10 and column 5 of its first data line are
+#                      used as the NCBI assembly id and the scientific name
+#   queryId          - asmId or database name of the other organism
+#   hubPath          - path to this assembly directory in .../hubs/
+#
+# Exits with status 255 and a usage message when the argument count is
+# wrong, and dies with a message on STDERR when the query assembly
+# directory or the names file can not be resolved.
+
+use strict;
+use warnings;
+use FindBin qw($Bin);
+use lib "$Bin";
+use AsmHub;
+use HgAutomate;
+use File::Basename;
+
+my $argc = scalar(@ARGV);
+
+if ($argc != 4) {
+  printf STDERR "usage: asmHubChainNet.pl asmId asmId.names.tab queryId hubPath > asmId.chainNet.html\n";
+  printf STDERR "where asmId is the assembly identifier,\n";
+  printf STDERR "and asmId.names.tab is naming file for this assembly,\n";
+  printf STDERR "and queryId is the asmId or db of the other organism,\n";
+  printf STDERR "and hubPath is the path to this assembly directory in .../hubs/.\n";
+  exit 255;
+}
+
+# specific to UCSC environment
+my $dbHost = "hgwdev";
+
+my $asmId = shift;
+my $namesFile = shift;
+my $queryId = shift;
+# taken from the hubPath argument; no use of it is visible in the script
+# logic preceding the description text
+my $hubUrl = shift;
+
+# if assembly hub, need to find the real full assembly ID
+if ($queryId =~ m/^GC/) {
+  # The three letter prefix and the first nine digits after the underscore
+  # select the build directory: .../<prefix>/<ddd>/<ddd>/<ddd>/
+  # Validate the shape up front so a short or malformed id fails loudly
+  # instead of building a nonsense path.
+  if ($queryId !~ m/^(GC\w)_(\d{3})(\d{3})(\d{3})/) {
+    die "ERROR: can not parse an assembly accession from queryId '$queryId'\n";
+  }
+  my ($gcX, $d0, $d1, $d2) = ($1, $2, $3, $4);
+  my $hubBuildDir = ($gcX eq "GCA") ? "genbankBuild" : "refseqBuild";
+  my $buildDir = "/hive/data/genomes/asmHubs/$hubBuildDir/$gcX/$d0/$d1/$d2";
+  # glob() instead of `ls -d`: no shell is involved, and the result is a
+  # list that can be checked rather than one newline-joined string
+  my @candidates = map { basename($_) } glob("$buildDir/${gcX}*");
+  # when several assemblies share this directory, keep only the names
+  # that begin with the queryId that was given
+  my @startsWithQuery = grep { index($_, $queryId) == 0 } @candidates;
+  @candidates = @startsWithQuery if (scalar(@startsWithQuery) > 0);
+  if (scalar(@candidates) < 1) {
+    die "ERROR: no assembly directory found for '$queryId' in $buildDir/\n";
+  }
+  if (scalar(@candidates) > 1) {
+    die "ERROR: queryId '$queryId' is ambiguous in $buildDir/: " . join(" ", @candidates) . "\n";
+  }
+  $queryId = $candidates[0];
+}
+
+# Read the first non-comment line of the names file once, directly in perl,
+# rather than running 'grep | cut' through the shell for each column.
+open (my $namesFh, "<", $namesFile) or die "ERROR: can not read names file '$namesFile': $!\n";
+my @nameFields = ();
+while (my $line = <$namesFh>) {
+  next if ($line =~ m/^#/);
+  chomp $line;
+  # limit of -1 keeps trailing empty columns so the column numbers hold
+  @nameFields = split(/\t/, $line, -1);
+  last;
+}
+close ($namesFh);
+# column 10 and column 5 (1-based), empty string when the column is absent
+my $ncbiAssemblyId = defined($nameFields[9]) ? $nameFields[9] : "";
+my $sciName = defined($nameFields[4]) ? $nameFields[4] : "";
+
+my ($tGenome, $tDate, $tSource) = HgAutomate::getAssemblyInfo($dbHost, $asmId);
+my ($qGenome, $qDate, $qSource) = HgAutomate::getAssemblyInfo($dbHost, $queryId);
+
+print <<_EOF_
+
+This track shows regions of the genome that are alignable +to other genomes ("chain" subtracks) or in synteny ("net" subtracks). +The alignable parts are shown with thick blocks that look like exons. +Non-alignable parts between these are shown like introns. +
++The chain track shows alignments of $qGenome ($qDate) to the +$tGenome/$sciName/$ncbiAssemblyId/$tDate genome using a gap scoring system that allows longer gaps +than traditional affine gap scoring systems. It can also tolerate gaps in both +$qGenome and $tGenome simultaneously. These +"double-sided" gaps can be caused by local inversions and +overlapping deletions in both species. +
++The chain track displays boxes joined together by either single or +double lines. The boxes represent aligning regions. +Single lines indicate gaps that are largely due to a deletion in the +$qGenome assembly or an insertion in the $tGenome +assembly. Double lines represent more complex gaps that involve substantial +sequence in both species. This may result from inversions, overlapping +deletions, an abundance of local mutation, or an unsequenced gap in one +species. In cases where multiple chains align over a particular region of +the $tGenome genome, the chains with single-lined gaps are often +due to processed pseudogenes, while chains with double-lined gaps are more +often due to paralogs and unprocessed pseudogenes.
++In the "pack" and "full" display +modes, the individual feature names indicate the chromosome, strand, and +location (in thousands) of the match for each matching alignment.
+ ++The net track shows the best $qGenome/$tGenome chain for +every part of the $tGenome genome. It is useful for +finding syntenic regions, possibly orthologs, and for studying genome +rearrangement. The $qGenome sequence used in this annotation is from +the $qDate assembly.
+ +By default, the chains to chromosome-based assemblies are colored +based on which chromosome they map to in the aligning organism. To turn +off the coloring, check the "off" button next to: Color +track based on chromosome.
+To display only the chains of one chromosome in the aligning +organism, enter the name of that chromosome (e.g. chr4) in the box next to: +Filter by chromosome.
+ ++At base level in full display mode, this track will show the +sequence of $qGenome as it aligned to $tGenome. When the view is +too large to show such detail, a graph of the alignment score will be +shown. +
+ ++The $qGenome genome was aligned to the $tGenome genome with lastz. +The resulting alignments were converted into axt format using the lavToAxt +program. The axt alignments were fed into axtChain, which organizes all +alignments between a single $qGenome chromosome and a single +$tGenome chromosome into a group and creates a kd-tree out +of the gapless subsections (blocks) of the alignments. A dynamic program +was then run over the kd-trees to find the maximally scoring chains of these +blocks. +
+ ++Chains were derived from lastz alignments, using the methods +described on the chain tracks description pages, and sorted with the +highest-scoring chains in the genome ranked first. The program +chainNet was then used to place the chains one at a time, trimming them as +necessary to fit into sections not already covered by a higher-scoring chain. +During this process, a natural hierarchy emerged in which a chain that filled +a gap in a higher-scoring chain was placed underneath that chain. The program +netSyntenic was used to fill in information about the relationship between +higher- and lower-level chains, such as whether a lower-level +chain was syntenic or inverted relative to the higher-level chain. +The program netClass was then used to fill in how much of the gaps and chains +contained Ns (sequencing gaps) in one or both species and how much +was filled with transposons inserted before and after the two organisms +diverged. +
+ ++The resulting net file was converted to axt format via netToAxt, +then converted to maf format via axtToMaf, then converted to +the bigMaf format with mafToBigMaf and bedToBigBed. +
+ ++lastz was developed by Robert Harris, Pennsylvania State University. +
++The axtChain program was developed at the University of California at +Santa Cruz by Jim Kent with advice from Webb Miller and David Haussler.
++The browser display and database storage of the chains and nets were created +by Robert Baertsch and Jim Kent.
++The chainNet, netSyntenic, and netClass programs +were developed at the University of California +Santa Cruz by Jim Kent.
+ ++Harris, R.S. +(2007) Improved pairwise alignment of genomic DNA +Ph.D. Thesis, The Pennsylvania State University +
+ ++Chiaromonte F, Yap VB, Miller W. +Scoring pairwise genomic sequence alignments. +Pac Symp Biocomput. 2002:115-26. +PMID: 11928468 +
+ ++Kent WJ, Baertsch R, Hinrichs A, Miller W, Haussler D. +Evolution's cauldron: +duplication, deletion, and rearrangement in the mouse and human genomes. +Proc Natl Acad Sci U S A. 2003 Sep 30;100(20):11484-9. +PMID: 14500911; PMC: PMC208784 +
+ ++Schwartz S, Kent WJ, Smit A, Zhang Z, Baertsch R, Hardison RC, +Haussler D, Miller W. +Human-mouse alignments with BLASTZ. +Genome Res. 2003 Jan;13(1):103-7. +PMID: 12529312; PMC: PMC430961 +
+ +_EOF_ + ;