src/hg/makeDb/doc/taeGut2.txt 77740770d513278f03c8cb1eac5f02c83c6c26c4

77740770d513278f03c8cb1eac5f02c83c6c26c4
gperez2
  Thu Feb 10 16:40:31 2022 -0800
Correcting makedoc location for taeGut2 vs. geoFor1 lastz/chain/net run for user, refs #28840

diff --git src/hg/makeDb/doc/taeGut2.txt src/hg/makeDb/doc/taeGut2.txt
index 8b41382..b8ea2a2 100644
--- src/hg/makeDb/doc/taeGut2.txt
+++ src/hg/makeDb/doc/taeGut2.txt
@@ -1,902 +1,1003 @@
 # for emacs: -*- mode: sh; -*-
 
 #	DATE:   08-Feb-2013
 #	ORGANISM:       Taeniopygia guttata
 #	TAXID:  59729
 #	ASSEMBLY LONG NAME:     Taeniopygia_guttata-3.2.4
 #	ASSEMBLY SHORT NAME:    Taeniopygia_guttata-3.2.4
 #	ASSEMBLY SUBMITTER:     Washington University Genome Sequencing Center
 #	ASSEMBLY TYPE:  Haploid
 #	NUMBER OF ASSEMBLY-UNITS:       1
 #	ASSEMBLY ACCESSION:     GCA_000151805.2
 #	FTP-RELEASE DATE: 12-Feb-2013
 
 
 #       http://www.ncbi.nlm.nih.gov/genome/367
 #       http://www.ncbi.nlm.nih.gov/genome/assembly/524908
 #       http://www.ncbi.nlm.nih.gov/bioproject/17289
 
 #       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABQF01
 #       Genome Coverage : 5.5x  PCAP v. 2008
 
 #       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=59729
 
 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Taeniopygia_guttata/Taeniopygia_guttata-3.2.4/
 
 ##########################################################################
 # Download sequence (DONE - 2013-06-05 - Hiram)
     mkdir -p /hive/data/genomes/taeGut2/genbank
     cd /hive/data/genomes/taeGut2/genbank
 
     time rsync -a -P \
 rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Taeniopygia_guttata/Taeniopygia_guttata-3.2.4/
     #   real    real    22m54.139s
 
     # verify the size of the sequence here:
     faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \
 Primary_Assembly/unlocalized_scaffolds/FASTA/chr*.unlocalized.scaf.fa.gz \
 Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
 
 # 1232118738 bases (9270900 N's 1222847838 real 1222847838 upper 0 lower)
 #    in 37095 sequences in 69 files
 # Total size: mean 33215.2 sd 1455020.7
 #   min 783 (gi|197861094|gb|ABQF01053133.1|)
 #   max 156412533 (gi|209446643|gb|CM000518.1|) median 3541
 
 ##########################################################################
 # fixup names for UCSC standards (DONE - 2013-06-05 - Hiram)
     mkdir /hive/data/genomes/taeGut2/ucsc
     cd /hive/data/genomes/taeGut2/ucsc
 
     ########################  Assembled Chromosomes
     cat << '_EOF_' > ucscCompositeAgp.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my %accToChr;
 
 open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
         die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
 while (my $line = <FH>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my ($chrN, $acc) = split('\s+', $line);
     $accToChr{$acc} = $chrN;
 }
 close (FH);
 
 foreach my $acc (keys %accToChr) {
     my $chrN =  $accToChr{$acc};
     print "$acc $accToChr{$acc}\n";
     open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz";
     open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
     while (my $line = <FH>) {
         if ($line =~ m/^#/) {
             print UC $line;
         } else {
             $line =~ s/^$acc/chr${chrN}/;
             print UC $line;
         }
     }
     close (FH);
     close (UC);
     open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
     open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
     while (my $line = <FH>) {
         if ($line =~ m/^>/) {
             printf UC ">chr${chrN}\n";
         } else {
             print UC $line;
         }
     }
     close (FH);
     close (UC);
 }
 '_EOF_'
 
     # << happy emacs
     chmod +x ucscCompositeAgp.pl
     time ./ucscCompositeAgp.pl
     # real    0m16.041s
 
     gzip *.fa *.agp &
 
     ########################  Unplaced scaffolds
     # verify we don't have any .acc numbers different from .1
     zcat \
     ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
 	| cut -f1 | grep -v '^#' \
 	| sed -e 's/[A-Z0-9]*//i' | sort | uniq -c
     #   81153 .1
 
     cat << '_EOF_' > unplaced.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
 my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
 open (FH, "zcat $agpFile|") or die "can not read $agpFile";
 open (UC, ">chrUn.agp") or die "can not write to chrUn.agp";
 while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print UC $line;
     } else {
         $line =~ s/\.1//;
         printf UC "chrUn_%s", $line;
     }
 }
 close (FH);
 close (UC);
 
 open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
 open (UC, ">chrUn.fa") or die "can not write to chrUn.fa";
 while (my $line = <FH>) {
     if ($line =~ m/^>/) {
         chomp $line;
         $line =~ s/.*gb\|//;
         $line =~ s/\.1. .*//;
         printf UC ">chrUn_$line\n";
     } else {
         print UC $line;
     }
 }
 close (FH);
 close (UC);
 '_EOF_'
     # << happy emacs
     chmod +x unplaced.pl
     time ./unplaced.pl
     # real    0m5.659s
 
     gzip chrUn.agp chrUn.fa &
 
     # make sure none of the names got to be over 31 characers long:
     zcat chrUn.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($1)}' | sort -rn | head -1
     # 18
 
     # Unlocalized scaffolds
 
     cat << '_EOF_' > unlocalized.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my %accToChr;
 my %chrNames;
 
 open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
         die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
 while (my $line = <FH>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my ($chrN, $acc) = split('\s+', $line);
     $acc =~ s/\.1$//;
     $accToChr{$acc} = $chrN;
     $chrNames{$chrN} += 1;
 }
 close (FH);
 
 foreach my $chrN (keys %chrNames) {
     my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
     my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
     while (my $line = <FH>) {
         if ($line =~ m/^#/) {
             print UC $line;
         } else {
             chomp $line;
             my (@a) = split('\t', $line);
             my $acc = $a[0];
             $acc =~ s/\.1//;
             die "ERROR: chrN $chrN not correct for $acc"
                 if ($accToChr{$acc} ne $chrN);
             my $ucscName = "chr${chrN}_${acc}_random";
             printf UC "%s", $ucscName;
             for (my $i = 1; $i < scalar(@a); ++$i) {
                 printf UC "\t%s", $a[$i];
             }
             printf UC "\n";
         }
     }
     close (FH);
     close (UC);
     printf "chr%s\n", $chrN;
     open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
     open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
     while (my $line = <FH>) {
         if ($line =~ m/^>/) {
             chomp $line;
             my $acc = $line;
             $acc =~ s/.*gb\|//;
             $acc =~ s/\.1. .*//;
             $acc =~ s/\.1//;
             die "ERROR: chrN $chrN not correct for $acc"
                 if ($accToChr{$acc} ne $chrN);
             my $ucscName = "chr${chrN}_${acc}_random";
             printf UC ">$ucscName\n";
         } else {
             print UC $line;
         }
     }
     close (FH);
     close (UC);
 }
 '_EOF_'
     # << happy emacs
 
     chmod +x unlocalized.pl
     time ./unlocalized.pl
     # real    0m2.140s
 
     gzip *.fa *.agp
 
     # verify nothing lost from original:
     time faSize *.fa.gz
     # real    0m32.541s
 
 # 1232118738 bases (9270900 N's 1222847838 real 1222847838 upper 0 lower)
 #    in 37095 sequences in 69 files
 # Total size: mean 33215.2 sd 1455020.7 min 783 (chr5_ABQF01053133_random)
 #    max 156412533 (chr2) median 3541
 
 
     # make sure none of the names got to be over 31 characers long:
     zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
         | sort -rn | uniq -c | head
 
 ##########################################################################
 # Initial makeGenomeDb.pl (DONE - 2013-06-05 - Hiram)
     cd /hive/data/genomes/taeGut2
     cat << '_EOF_' > taeGut2.config.ra
 # Config parameters for makeGenomeDb.pl:
 db taeGut2
 clade vertebrate
 genomeCladePriority 68
 scientificName Taeniopygia guttata
 commonName Zebra finch
 assemblyDate Feb. 2013
 assemblyLabel Washington University School of Medicine
 assemblyShortLabel WashU taeGut324
 orderKey 436
 mitoAcc NC_007897
 fastaFiles /cluster/data/taeGut2/ucsc/chr*.fa.gz
 agpFiles /cluster/data/taeGut2/ucsc/chr*.agp.gz
 # qualFiles none
 dbDbSpeciesDir zebraFinch
 photoCreditURL http://www.physci.ucla.edu/html/arnold.htm
 photoCreditName UCLA Department of Physiological Science
 ncbiGenomeId 367
 ncbiAssemblyId 524908
 ncbiAssemblyName Taeniopygia_guttata-3.2.4
 ncbiBioProject 17289
 genBankAccessionID GCA_000151805.2
 taxId   59729
 '_EOF_'
     # << happy emacs
 
     # verify sequence and agp are OK
     time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
         -stop=agp taeGut2.config.ra > agp.log 2>&1
     # verify no problem:
     tail -1 agp.log
     #  *** All done!  (through the 'agp' step)
 
     time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
         -continue=db taeGut2.config.ra > db.log 2>&1
     # real    10m29.132s
     #
     #	add the trackDb entries to the source tree, and the 2bit link:
     ln -s `pwd`/taeGut2.unmasked.2bit /gbdb/taeGut2/taeGut2.2bit
     #	browser should function now in sandbox
     #   trackDb files here:
     #   /hive/data/genomes/taeGut2/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/gibbon/taeGut2/
     #   into source tree
     #   now browser should function on hgwdev
 
 #########################################################################
 # running repeat masker (DONE - 2013-06-05 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/repeatMasker
     cd /hive/data/genomes/taeGut2/bed/repeatMasker
     time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
 	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
 	-smallClusterHub=encodek taeGut2 > do.log 2>&1 &
     # real    220m20.455s
 
     cat faSize.rmsk.txt
     # 1232135591 bases (9270900 N's 1222864691 real 1125277738 upper
     #   97586953 lower) in 37096 sequences in 1 files
     # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random)
     #   max 156412533 (chr2) median 3541
     # %7.92 masked total, %7.98 masked real
 
     egrep -i "versi|relea" do.log
     #    January 10 2013 (open-4-0-0) version of RepeatMasker
     #  CC   RELEASE 20120418;
 
     time featureBits -countGaps taeGut2 rmsk
     # 97640353 bases of 1232135591 (7.924%) in intersection
     # real    0m16.657s
 
     # why is it different than the faSize above ?
     # because rmsk masks out some N's as well as bases, the count above
     #	separates out the N's from the bases, it doesn't show lower case N's
 
 ##########################################################################
 # running simple repeat (DONE - 2013-06-05 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/simpleRepeat
     cd /hive/data/genomes/taeGut2/bed/simpleRepeat
     time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
 	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
 	taeGut2 > do.log 2>&1
     # about 14m30s
 
     cat fb.simpleRepeat
     #   25800814 bases of 1222864691 (2.110%) in intersection
 
     # considering rmsk %8 vs. WM %20 and this is a small genome,
     # use the rmsk result in order to have the classifications from
     # that available
     # add to rmsk after it is done:
     cd /hive/data/genomes/taeGut2
     twoBitMask taeGut2.rmsk.2bit \
 	-add bed/simpleRepeat/trfMask.bed taeGut2.2bit
     #	you can safely ignore the warning about fields >= 13
 
     twoBitToFa taeGut2.2bit stdout | faSize stdin > faSize.taeGut2.2bit.txt
     cat faSize.taeGut2.2bit.txt
 
     # 1232135591 bases (9270900 N's 1222864691 real 1123697742 upper
     #    99166949 lower) in 37096 sequences in 1 files
     # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random)
     #    max 156412533 (chr2) median 3541
     # %8.05 masked total, %8.11 masked real
 
     rm /gbdb/taeGut2/taeGut2.2bit
     ln -s `pwd`/taeGut2.2bit /gbdb/taeGut2/taeGut2.2bit
 
 #########################################################################
 # Verify all gaps are marked, add any N's not in gap as type 'other'
 #	(DONE - 2013-06-05 - Hiram)
 
     mkdir /hive/data/genomes/taeGut2/bed/gap
     cd /hive/data/genomes/taeGut2/bed/gap
 
     time nice findMotif -motif=gattaca -verbose=4 \
 	-strand=+ ../../taeGut2.unmasked.2bit > findMotif.txt 2>&1
     #   real	1m2.760s
 
     grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
     time featureBits taeGut2 -not gap -bed=notGap.bed
     # 1222864691 bases of 1222864691 (100.000%) in intersection
     # real    0m9.762s
 
     time featureBits taeGut2 allGaps.bed notGap.bed -bed=new.gaps.bed
     # 0 bases of 1222864691 (0.000%) in intersection
     #  real    19m53.371s
 
     # are there non-bridged gaps here:
     hgsql -N -e "select bridge from gap;" taeGut2 | sort | uniq -c
     #     326 no
     #   87384 yes
 
 ##########################################################################
 ## WINDOWMASKER (DONE- 2013-06-05 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/windowMasker
     cd /hive/data/genomes/taeGut2/bed/windowMasker
     time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
 	-dbHost=hgwdev taeGut2 > do.log 2>&1 &
     # real    66m52.863s
 
     # Masking statistics
     faSize.taeGut2.cleanWMSdust.txt
 # 1232135591 bases (9270900 N's 1222864691 real 973147255 upper
 #     249717436 lower) in 37096 sequences in 1 files
 # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random)
 #     max 156412533 (chr2) median 3541
 # %20.27 masked total, %20.42 masked real
 
     # how much does this window masker and repeat masker overlap:
     featureBits -countGaps taeGut2 rmsk windowmaskerSdust
     #   54554911 bases of 1232135591 (4.428%) in intersection
 
 ########################################################################
 # cpgIslands - (DONE - 2013-06-06 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/cpgIslands
     cd /hive/data/genomes/taeGut2/bed/cpgIslands
     time doCpgIslands.pl taeGut2 > do.log 2>&1
     #  real    17m40.265s
 
     cat fb.taeGut2.cpgIslandExt.txt
     #   16473980 bases of 1222864691 (1.347%) in intersection
 
 #############################################################################
 # CPG Islands Unmasked track
 
 mkdir /hive/data/genomes/taeGut2/bed/cpgIslandsUnmasked
 cd /hive/data/genomes/taeGut2/bed/cpgIslandsUnmasked
 time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
    -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
     -workhorse=hgwdev \
     -maskedSeq=/hive/data/genomes/taeGut2/taeGut2.unmasked.2bit \
         taeGut2 > do.log 2>&1
 
 #############################################################################
 
 #########################################################################
 # genscan - (DONE - 2013-06-06 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/genscan
     cd /hive/data/genomes/taeGut2/bed/genscan
     time doGenscan.pl taeGut2 > do.log 2>&1
     #  real    47m7.823s
     # XXX failed four jobs, running manually on hgwdev: chr9 chr1A chr3 chr2
     # with -window=2000000 - all bug chr9 finished
 # real    57m16.383s
 # user    133m46.445s
 # sys     2m55.584s
     # running chr9 with -window=1000000
 # real    10m14.291s
 # user    9m59.182s
 # sys     0m13.720s
 
     time doGenscan.pl -continue=makeBed taeGut2 > makeBed.log 2>&1
     # real    30m4.119s
 
     cat fb.taeGut2.genscan.txt
     #   52986693 bases of 1222864691 (4.333%) in intersection
     cat fb.taeGut2.genscanSubopt.txt
     #   28313661 bases of 1222864691 (2.315%) in intersection
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-06 - Hiram)
     # Use -repMatch=500, based on size -- for human we use 1024
     # use the "real" number from the faSize measurement,
     # hg19 is 2897316137, calculate the ratio factor for 1024:
     calc \( 1222864691 / 2897316137 \) \* 1024
     #  ( 1222864691 / 2897316137 ) * 1024 = 432.197725
 
     # round up to 500 (taeGut1 used 1024)
 
     cd /hive/data/genomes/taeGut2
     time blat taeGut2.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/taeGut2.11.ooc -repMatch=500
     # Wrote 12162 overused 11-mers to jkStuff/taeGut2.11.ooc
     # real    0m32.994s
 
     # there are non-bridged gaps, therefore, a lift file is needed for genbank
     hgsql -N -e "select bridge from gap;" taeGut2 | sort | uniq -c
     #      326 no
     #    87384 yes
 
     cd /hive/data/genomes/taeGut2/jkStuff
     gapToLift taeGut2 taeGut2.nonBridged.lift -bedFile=taeGut2.nonBridged.bed
     # largest non-bridged contig:
     awk '{print $3-$2,$0}' taeGut2.nonBridged.bed | sort -nr | head
     #   56928224 chr5   4758199 61686423        chr5.07
 
 #########################################################################
 # AUTO UPDATE GENBANK (DONE - 2013-03-08 - Pauline)
     # examine the file:
     /cluster/data/genbank/data/organism.lst
     # for your species to see what counts it has for:
 # organism       mrnaCnt estCnt  refSeqCnt
 # Taeniopygia guttata     5086    92144   987
 
     # to decide which "native" mrna or ests you want to specify in genbank.conf
 
     ssh hgwdev
     cd $HOME/kent/src/hg/makeDb/genbank
     git pull
     # edit etc/genbank.conf to add taeGut2 just after ce2
 
 # taeGut2 - Zebra Finch
 taeGut2.serverGenome = /hive/data/genomes/taeGut2/taeGut2.2bit
 taeGut2.clusterGenome = /hive/data/genomes/taeGut2/taeGut2.2bit
 taeGut2.ooc = /hive/data/genomes/taeGut2/jkStuff/taeGut2.11.ooc
 taeGut2.align.unplacedChroms = chrUn_*,chr*_random
 taeGut2.lift = /hive/data/genomes/taeGut2/jkStuff/taeGut2.nonBridged.lift
 taeGut2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 taeGut2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 taeGut2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 taeGut2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 taeGut2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 taeGut2.refseq.mrna.native.load = yes
 taeGut2.refseq.mrna.xeno.load = yes
 taeGut2.genbank.mrna.xeno.load = yes
 taeGut2.downloadDir = taeGut2
 taeGut2.upstreamGeneTbl = refGene
 
     # end of section added to etc/genbank.conf
     git commit -m "adding taeGut2 Zebra finch refs #10190" etc/genbank.conf
     git push
     make etc-update
 
     ssh hgwdev			# used to do this on "genbank" machine
     screen -S taeGut2           # long running job managed in screen
     cd /cluster/data/genbank
     time ./bin/gbAlignStep -initial taeGut2 &
 #  logFile: var/build/logs/2013.06.18-11:56:48.taeGut2.initalign.log
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
 
     time ./bin/gbDbLoadStep -drop -initialLoad taeGut2 &
     #   logFile: var/dbload/hgwdev/logs/2013.06.08-18:59:10.dbload.log
    #   real    29m9.092s
 
     # enable daily alignment and update of hgwdev (DONE - 2013-03-08 - Pauline)
     cd ~/kent/src/hg/makeDb/genbank
     git pull
     # add taeGut2 to:
     vi etc/align.dbs etc/hgwdev.dbs
     git commit -m "Added taeGut2. refs #10190" etc/align.dbs etc/hgwdev.dbs
     git push
     make etc-update
 
 #########################################################################
 # set default position similar to taeGut1 (DONE - 2013-06-19 - Hiram)
     hgsql -e \
 'update dbDb set defaultPos="chrZ:56283356-56343168" where name="taeGut2";' \
 	hgcentraltest
 
 #########################################################################
 #  BLATSERVERS ENTRY (DONE - 2013-06-19 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
      ssh hgwdev
 
      hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
  	VALUES ("taeGut2", "blat4c", "17848", "1", "0"); \
  	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
  	VALUES ("taeGut2", "blat4c", "17849", "0", "1");' \
  	    hgcentraltest
      #	test it with some sequence
 
 ############################################################################
 # reset dbDb description to be like the previous nomLeu
 #       (DONE - 2013-03-21 - Hiram)
     hgsql hgcentraltest -e 'update dbDb set description = "Oct. 2012 (GGSC Nleu3.0/taeGut2)"" where name = "taeGut2";' \
           | hgsql -h genome-testdb hgcentraltest
 
 ############################################################################
 # construct liftOver from taeGut1 to taeGut2 (DONE - 2013-06-19 - Hiram)
     # documentation for this step is in taeGut1 -  been done
 
 ############################################################################
 # downloads and pushQ entry (DONE - 2014-04-16 - Hiram)
     # after adding taeGut2 to the all.joiner file and verifying that
     #   joinerCheck is clean (i.e. run joinerCheck w -times and -keys flags
     #   to make sure there are no errors), can construct the downloads:
     cd /hive/data/genomes/taeGut2
     time  makeDownloads.pl -workhorse=hgwdev taeGut2
     #   real 14m53.730s
 
     mkdir /hive/data/genomes/taeGut2/pushQ
     cd /hive/data/genomes/taeGut2/pushQ
     time makePushQSql.pl taeGut2 2> stderr.txt > taeGut2.sql
     #  real	2m4.300s
 
     # check the stderr.txt for bad stuff, these kinds of warnings are OK:
 # WARNING: hgwdev does not have /gbdb/taeGut2/wib/gc5Base.wib
 # WARNING: hgwdev does not have /gbdb/taeGut2/wib/quality.wib
 # WARNING: hgwdev does not have /gbdb/taeGut2/bbi/quality.bw
 # WARNING: taeGut2 does not have seq
 # WARNING: taeGut2 does not have extFile
 # WARNING: taeGut2 does not have estOrientInfo
 
     scp -p taeGut2.sql qateam@hgwbeta:/tmp/
     ssh qateam@hgwbeta hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/taeGut2.sql"
 
 ############################################################################
 #  BLASTZ/CHAIN/NET Zebra finch vs Turkey (DONE - 2013-10-18,25 - Hiram)
 #   use a screen to manage this multi-day job
     screen -S melGal1
 
     mkdir /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18
     cd /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18
 
     cat << '_EOF_' > DEF
 # Zebra finch vs. Turkey
 BLASTZ_M=50
 
 # TARGET: Zebra finch TaeGut2
 SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
 SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=100
 
 # QUERY: Turkey MelGal1
 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
 SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18
 TMPDIR=/dev/shm
 '_EOF_'
     # << this line keeps emacs coloring happy
     # screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
     # real    4063m8.278s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -continue=cat -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1
     # real    66m17.485s
 
     cat fb.taeGut2.chainMelGal1Link.txt
     # 781765606 bases of 1222864691 (63.929%) in intersection
 
     cd /hive/data/genomes/taeGut2/bed
     ln -s lastzMelGal1.2013-10-18 lastz.melGal1
 
     #   and then swap
     mkdir /hive/data/genomes/melGal1/bed/blastz.taeGut2.swap
     cd /hive/data/genomes/melGal1/bed/blastz.taeGut2.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18/DEF \
         -swap -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
     # real    43m24.666s
     cat fb.melGal1.chainTaeGut2Link.txt
     #   661376423 bases of 935922386 (70.666%) in intersection
 
     cd /hive/data/genomes/melGal1/bed
     ln -s blastz.taeGut2.swap lastz.taeGut2
 
 ############################################################################
 # lifting ensGene from taeGut1 (DONE - 2011-03-23 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/ensGene.73
     cd /hive/data/genomes/taeGut2/bed/ensGene.73
 
     # start with taeGut1 ensGene
     hgsql -N -e "select * from ensGene;" taeGut1 | cut -f2- > taeGut1.ensGene.gp
     # lift over to taeGut2
     liftOver -genePred  taeGut1.ensGene.gp \
       /hive/data/genomes/taeGut1/bed/liftOver/taeGut1ToTaeGut2.over.chain.gz \
          taeGut2.ensGene.gp unMapped.txt
     # check to see which genes have gone bad
     genePredCheck -db=taeGut2 taeGut2.ensGene.gp > badOnes.txt 2>&1
     #	checked: 19226 failed: 2
     awk '{print $3}' badOnes.txt | sort -u | grep -v failed > toRemove.txt
     # remove the bad ones
     cp -p taeGut2.ensGene.gp taeGut2.unfiltered.ensGene.gp
 for N in `cat toRemove.txt`
 do
     rm -f t.gp
     grep -v "${N}" taeGut2.ensGene.gp > t.gp
     rm -f taeGut2.ensGene.gp
     mv t.gp taeGut2.ensGene.gp
 done
     genePredCheck -db=taeGut2 taeGut2.ensGene.gp
     #	checked: 19225 failed: 0
     # OK to load this since all are OK now
     hgLoadGenePred  -genePredExt taeGut2 ensGene taeGut2.ensGene.gp
     genePredCheck -db=taeGut2 ensGene
     #	checked: 19225 failed: 0
     # use the ensPep and ensGtp from taeGut1, but remove the bad ones
     zcat /hive/data/genomes/taeGut1/bed/ensGene.73/download/Taeniopygia_guttata.taeGut3.2.4.73.pep.all.fa.gz \
         | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//; s/ gene_biotype.*$//;' | gzip > ensPep.txt.gz
     zcat ensPep.txt.gz \
     | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
          | sed -e '/^$/d; s/*$//' | sort > ensPep.taeGut1.fa.tab
 
     cp -p ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab
     for N in `cat toRemove.txt`
 do
     rm -f t.fa.tab
     grep -v "${N}" ensPep.taeGut2.fa.tab > t.fa.tab
     rm -f ensPep.taeGut2.fa.tab
     mv t.fa.tab ensPep.taeGut2.fa.tab
     echo "'${N}'"
 done
 
     wc -l ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab
     # 18204 ensPep.taeGut1.fa.tab
     # 18203 ensPep.taeGut2.fa.tab
 
     awk '{print $1}' taeGut1.ensGene.gp | sort -u > taeGut1.name.list
     awk '{print $1}' taeGut2.ensGene.gp | sort -u > taeGut2.name.list
     cp -p /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensGtp.tab \
         ./taeGut1.ensGtp.tab
 
     cp -p taeGut1.ensGtp.tab taeGut2.ensGtp.tab
     comm -23 taeGut1.name.list taeGut2.name.list | while read N
 do
     rm -f t.gtp.tab
     grep -v "${N}" taeGut2.ensGtp.tab > t.gtp.tab
     rm -f taeGut2.ensGtp.tab
     mv t.gtp.tab taeGut2.ensGtp.tab
     echo "${N}"
 done
     wc -l *ensGtp*
     # 19334 taeGut1.ensGtp.tab
     # 19225 taeGut2.ensGtp.tab
 
     comm -23 taeGut1.name.list taeGut2.name.list | while read N
 do
     rm -f t.fa.tab
     grep -v "${N}" ensPep.taeGut2.fa.tab > t.fa.tab
     rm -f ensPep.taeGut2.fa.tab
     mv t.fa.tab ensPep.taeGut2.fa.tab
     echo "'${N}'"
 done
     wc -l ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab
     # 18204 ensPep.taeGut1.fa.tab
     # 18097 ensPep.taeGut2.fa.tab
 
     # now that those are clean, load them up
     hgPepPred taeGut2 tab ensPep ensPep.taeGut2.fa.tab
     hgLoadSqlTab taeGut2 ensGtp ~/kent/src/hg/lib/ensGtp.sql taeGut2.ensGtp.tab
     # and record version in trackVersion
 hgsql -e 'INSERT INTO trackVersion \
     (db, name, who, version, updateTime, comment, source, dateReference) \
     VALUES("taeGut2", "ensGene", "hiram", "73", now(), \
         "peptides lifted from taeGut1", \
         "genes lifted from taeGut1 ensGene v73", "current" );' hgFixed
 
     /cluster/bin/scripts/ensemblInfo.pl /hive/data/genomes/taeGut1/bed/ensGene.73/process/infoOut.txt | sort > ensemblToGeneName.taeGut1.tab
     /cluster/bin/scripts/ensemblInfo.pl -field2=source /hive/data/genomes/taeGut1/bed/ensGene.73/process/infoOut.txt | sort > ensemblSource.taeGut1.tab
 
     # default join output separates fields by space
     # we need tab separated files for hgLoadSqlTab
     # if it was join -t'\t' - it doesn't recognize \t as a tab
     # if it was the hidden tab character join -t'^I' it wouldn't cut and paste
     # so use the sed to change space into tab since we know our names do
     # not have space
     join taeGut2.name.list ensemblToGeneName.taeGut1.tab \
           | sed -e 's/ /\t/g' > ensemblToGeneName.tab
     join taeGut2.name.list ensemblSource.taeGut1.tab \
           | sed -e 's/ /\t/g' > ensemblSource.tab
 
     hgLoadSqlTab taeGut2 ensemblToGeneName \
       /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensemblToGeneName.sql \
            ensemblToGeneName.tab
     hgLoadSqlTab taeGut2 ensemblSource \
        /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensemblSource.sql \
            ensemblSource.tab
 
 #########################################################################
 # create ucscToINSDC name mapping (DONE - 2014-04-15 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/ucscToINSDC
     cd /hive/data/genomes/taeGut2/bed/ucscToINSDC
 
     # this script has been maturing over time, it is close to complete.
     # to find a latest copy of it:
     # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh
 
     cp -p /hive/data/genomes/eriEur2/bed/ucscToINSDC/translateNames.sh .
     ./translateNames.sh
     # it says:
 # need to find chrM accessions
     # so add this one:
     echo -e 'chrM\tNC_007897.1' >> ucscToINSDC.txt
     # needs to be sorted to work with join
     sort ucscToINSDC.txt > ucscToINSDC.tab
 
     awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \
         > name.coordinate.tab
 
     join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed
 
     cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
 # 28
 
     # use the 28 in this sed:
     sed -e "s/21/28/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab taeGut2 ucscToINSDC stdin ucscToINSDC.bed
     checkTableCoords taeGut2 ucscToINSDC
     # should cover all bases
     featureBits -countGaps taeGut2 ucscToINSDC
     # 1232135591 bases of 1232135591 (100.000%) in intersection
 
 ##############################################################################
 # cytoBandIdeo - (DONE - 2014-04-15 - Hiram)
     mkdir /hive/data/genomes/taeGut2/bed/cytoBand
     cd /hive/data/genomes/taeGut2/bed/cytoBand
     makeCytoBandIdeo.csh taeGut2
 
 ##############################################################################
 # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram)
     hgsql -N -e "select frag from gold;" taeGut2 | sort | head -1
 ABQF01000001.1
 
     hgsql -N -e "select frag from gold;" taeGut2 | sort | tail -2
 ABQF01124805.1
 NC_007897
 
     # verify this rule will find them all or eliminate them all:
     hgsql -N -e "select frag from gold;" taeGut2 | wc -l
     # 124806
 
     hgsql -N -e "select frag from gold;" taeGut2 | egrep -e '[AN][BC][Q_][F0]0[0-9]+(\.1)?' | wc -l
     # 124806
 
     hgsql -N -e "select frag from gold;" taeGut2 | egrep -v -e '[AN][BC][Q_][F0]0[0-9]+(\.1)?' | wc -l
     # 0
 
     # hence, add to trackDb/zebraFinch/taeGut2/trackDb.ra
 searchTable gold
 shortCircuit 1
 termRegex [AN][BC][Q_][F0]0[0-9]+(\.1)?
 query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
 searchPriority 8
 
 #########################################################################
 ##############################################################################
 # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
 ##############################################################################
 
 ############################################################################
 #  BLASTZ/CHAIN/NET Zebra finch vs Turkey (DONE - 2017-01-19,25 - Hiram)
 #   use a screen to manage this multi-day job
     screen -S melGal5
 
     mkdir /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19
     cd /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19
 
     printf '# Zebra finch vs. Turkey
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
 BLASTZ_M=254
 
 # TARGET: Zebra finch TaeGut2
 SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
 SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=300
 
 # QUERY: Turkey MelGal5
 SEQ2_DIR=/scratch/data/melGal5/melGal5.2bit
 SEQ2_LEN=/scratch/data/melGal5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=800
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19
 TMPDIR=/dev/shm
 ' > DEF
 
     time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1
     # real    266m59.980s
 
     cat fb.taeGut2.chainMelGal5Link.txt
     # 807888817 bases of 1222864691 (66.065%) in intersection
 
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` taeGut2 melGal5) \
            > rbest.log 2>&1 &
     # real    413m28.452s
 
     #   and then swap
     mkdir /hive/data/genomes/melGal5/bed/blastz.taeGut2.swap
     cd /hive/data/genomes/melGal5/bed/blastz.taeGut2.swap
     time (doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19/DEF \
         -swap -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1
     # real    119m13.393s
 
     cat fb.melGal5.chainTaeGut2Link.txt
     #   706578355 bases of 1093044709 (64.643%) in intersection
 
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` melGal5 taeGut2) \
            > rbest.log 2>&1
     # real    354m40.975s
 
 ############################################################################
+# LASTZ Zebra finch TaeGut2 vs. Medium ground finch GeoFor1 (DONE - 2022-02-03 -Gerardo)
+
+# should be able to run this from anywhere, this time it was run from:
+    cd kent/src/hg/utils/automation
+
+  time (~/kent/src/hg/utils/automation/pairLastz.sh \
+        taeGut2 geoFor1 other other) \
+           > zebraFinch_medFinch_20220202_secondTime.log 2>&1 &
+  # check the total time
+grep -w real  zebraFinch_medFinch_20220202_secondTime.log  | tail -1 | sed -e 's/^/    # /;'
+    # real      107m34.019s
+
+  # this zebraFinch_medFinch_20220202_secondTime.log log file happens to have a copy of the make doc, as well
+  # as the copy of the make doc left in the target assembly directory:
+# /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02/makeDoc.txt
+
+    # this command outputs this makeDoc text:
+
+    cat kent/src/hg/utils/automation/zebraFinch_medFinch_20220202_secondTime.log
+
+##############################################################################
+# LASTZ Zebra finch TaeGut2 vs. Medium ground finch GeoFor1
+#    (DONE - 2022-02-03 - Gerardo)
+
+    mkdir /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02
+    cd /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02
+
+    printf '# Medium ground finch GeoFor1 vs. Zebra finch TaeGut2
+BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
+
+# TARGET: Zebra finch TaeGut2
+SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
+SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=40
+
+# QUERY: Medium ground finch GeoFor1
+SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit
+SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=100
+
+BASE=/hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02
+TMPDIR=/dev/shm
+
+' > DEF
+
+    time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl  -verbose=2 `pwd`/DEF -syntenicNet \
+        -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
+        -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1
+    grep -w real do.log | sed -e 's/^/    # /;'
+    # real      159m10.037s
+
+    sed -e 's/^/    # /;' fb.taeGut2.chainGeoFor1Link.txt
+    # 1125215195 bases of 1222864691 (92.015%) in intersection
+    sed -e 's/^/    # /;' fb.taeGut2.chainSynGeoFor1Link.txt
+    # 1024055100 bases of 1222864691 (83.742%) in intersection
+
+    time (~/kent/src/hg/utils/automation/doRecipBest.pl  -load -workhorse=hgwdev -buildDir=`pwd` \
+       \
+       \
+        taeGut2 geoFor1) > rbest.log 2>&1
+
+    grep -w real rbest.log | sed -e 's/^/    # /;'
+
+    sed -e 's/^/    # /;' fb.taeGut2.chainRBest.GeoFor1.txt
+
+    ### and for the swap
+
+    cd /hive/data/genomes/geoFor1/bed/blastz.taeGut2.swap
+
+   time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl  -swap -verbose=2 \
+    /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02/DEF -swapDir=`pwd` \
+  -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
+    -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1
+
+    grep -w real swap.log | sed -e 's/^/    # /;'
+    # real      42m46.900s
+
+    sed -e 's/^/    # /;' fb.geoFor1.chainTaeGut2Link.txt
+    # 954457151 bases of 1041286029 (91.661%) in intersection
+    sed -e 's/^/    # /;' fb.geoFor1.chainSynTaeGut2Link.txt
+    # 912902982 bases of 1041286029 (87.671%) in intersection
+\    time (~/kent/src/hg/utils/automation/doRecipBest.pl  -load -workhorse=hgwdev -buildDir=`pwd` \
+    \
+    \
+   geoFor1 taeGut2) > rbest.log 2>&1
+
+    grep -w real rbest.log | sed -e 's/^/    # /;'
+    # real      64m46.642s
+
+    sed -e 's/^/    # /;' fb.geoFor1.chainRBest.TaeGut2.txt
+    # 919870835 bases of 1041286029 (88.340%) in intersection
+
+##############################################################################
+
+real    107m34.019s
+user    0m1.109s
+sys     0m1.771s