src/hg/makeDb/doc/hg38/cactus447.txt 923cb1e06520e8f302baabded147081a64db5666

923cb1e06520e8f302baabded147081a64db5666
hiram
  Tue Jun 27 14:20:10 2023 -0700
construction procedure for the 447-way alignment refs #31528

diff --git src/hg/makeDb/doc/hg38/cactus447.txt src/hg/makeDb/doc/hg38/cactus447.txt
new file mode 100644
index 0000000..3bddbf2
--- /dev/null
+++ src/hg/makeDb/doc/hg38/cactus447.txt
@@ -0,0 +1,305 @@
+
+################################################################################
+#### 2023-06-13 - fetch maf.gz - Hiram
+
+mkdir -p /hive/data/genomes/hg38/bed/cactus447/singleCover
+cd /hive/data/genomes/hg38/bed/cactus447/singleCover
+
+obtained maf file from Glenn, from the 'courtyard' login:
+
+/nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz
+
+-rw-r--r-- 1 hickey cgl 881994218891 Jun  5 11:06 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz
+
+There was some go around with a different version.  This one
+is 'single coverage'
+
+-rw-r--r-- 1 881994218891 Jun  5 11:06 447-mammalian-2022v1-single-copy.maf.gz
+
+################################################################################
+#### split this file into individual chromosome files 2023-06-21 - Hiram
+
+  mkdir  /hive/data/genomes/hg38/bed/cactus447/splitSingleCover
+  cd  /hive/data/genomes/hg38/bed/cactus447/splitSingleCover
+
+time mafSplit -byTarget -useFullSequenceName /dev/null ./  ../singleCover/447-mammalian-2022v1-single-copy.maf.gz
+
+Splitting 1 files by target sequence -- ignoring first argument /dev/null
+
+real    804m35.124s
+user    1061m9.580s
+sys     101m19.894s
+
+
+################################################################################
+### convert the sequence names to hg38 chrom names 2023-06-12 to 06-14 - Hiram
+
+  mkdir  /hive/data/genomes/hg38/bed/cactus447/ucscNames
+  cd  /hive/data/genomes/hg38/bed/cactus447/ucscNames
+
+   runOne script for processing each file:
+
+#!/bin/bash
+
+set -beEu -o pipefail
+
+export cactusMaf="${1}"
+export srcMaf="../splitSingleCover/${cactusMaf}"
+
+cactusId=${cactusMaf%.maf}
+
+export sedCommand=`grep -w "${cactusId}" cactus.to.ucsc.sed`
+export ucscName=`grep -w "${cactusId}" 2022v1.hg38.chrom.equiv.txt | cut -f2`
+
+case "${ucscName}" in
+    chrM)
+if [ ${srcMaf} -nt ${ucscName}.maf ]; then
+  sed -f cactus.to.ucsc.sed ${srcMaf} > ${ucscName}.maf
+  touch -r ${srcMaf} ${ucscName}.maf
+fi
+       ;;
+    chr[0-9][0-9])
+       rm -f "${ucscName}.maf"
+       ln ${srcMaf} "${ucscName}.maf"
+       ;;
+    chr[0-9])
+       rm -f "${ucscName}.maf"
+       ln ${srcMaf} "${ucscName}.maf"
+       ;;
+    G*.maf)
+    K*.maf)
+if [ ${srcMaf} -nt ${ucscName}.maf ]; then
+  sed -e "${sedCommand}" ${srcMaf} > ${ucscName}.maf
+  touch -r ${srcMaf} ${ucscName}.maf
+else
+  printf "DONE: ${ucscName}.maf\n" 1>&2
+fi
+       ;;
+esac
+
+if [ "${ucscName}.maf" -nt "nameScan/${ucscName}.seqCount.txt" ]; then
+  grep "^s." "${ucscName}.maf" | cut -d' ' -f2 | sort | uniq -c | sort -rn \
+    > "nameScan/${ucscName}.seqCount.txt"
+  touch -r "${ucscName}.maf" "nameScan/${ucscName}.seqCount.txt"
+  sed -e "s/^ \+//;" "nameScan/${ucscName}.seqCount.txt" | cut -d' ' -f2 \
+    | cut -d'.' -f1 | sort | uniq -c \
+       | sort -rn > "nameScan/${ucscName}.species.txt"
+fi
+#####################################################
+
+     template:
+#LOOP
+runOne $(path1)
+#ENDLOOP
+
+     using list of maf files:
+     ls ../splitSingleCover  | grep maf > cactus.maf.list
+
+     gensub2 cactus.maf.list single template jobList
+
+     ### made mistake of running too many of those at the same time
+     ### and their use of large amounts of memory caused hgwdev to
+     ### hang up and needed a reboot to recover.  So, run these carefully
+     ### only a couple at a time on hgwdev and on hgcompute-01, the large
+     ### ones get as large as 800 Gb in memory.
+
+     ### get the species names from the nameScan:
+
+   sed -e 's/^ \+//;' nameScan/chr2.species.txt | cut -d' ' -f2 \
+      | sort > ../species.list.txt
+
+################################################################################
+### get the chrom.sizes out of the maf files 2023-06-19 - Hiram
+
+   mkdir /hive/data/genomes/hg38/bed/cactus447/chromSizes
+   cd /hive/data/genomes/hg38/bed/cactus447/chromSizes
+
+   ### runOne script to run each one:
+
+#!/bin/bash
+
+export inMaf="${1}"
+export result="${2}"
+
+grep "^s " "${inMaf}" | awk '{printf "%s\t%s\n", $2,$6}' | sort -u > "${result}
+
+    ### working on the maf list:
+    ls ../ucscNames/chr*.maf > maf.list
+
+     ### template to construct jobList
+#LOOP
+runOne $(path1) {check out line+ result/$(root1).txt}
+#ENDLOOP
+
+     gensub2 maf.list single template jobList
+
+     para -ram=6g create jobList
+
+Completed: 101 of 101 jobs
+CPU time in finished jobs:      90801s    1513.35m    25.22h    1.05d  0.003 y
+IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
+Average job time:                 762s      12.69m     0.21h    0.01d
+Longest finished job:            7952s     132.53m     2.21h    0.09d
+Submission to last job:          8022s     133.70m     2.23h    0.09d
+
+    ### sizes.pl script to get the answer out of the result/*.txt files:
+
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my %chromSizes; # key is assembly name, value is a pointer to a hash
+                #  with key chromName, value size
+
+open (my $FH, "-|", "ls result/*.txt") or die "can not read ls result/*.txt";
+while (my $file = <$FH>) {
+  chomp $file;
+  printf STDERR "# reading: %s\n", $file;
+  open (my $R, "<", $file) or die "can not read $file";
+  while (my $line = <$R>) {
+    chomp $line;
+    my @a = split('\.', $line);
+    if (! defined($chromSizes{$a[0]})) {
+       my %h;
+       $chromSizes{$a[0]} = \%h;
+    }
+    my $asm = $chromSizes{$a[0]};
+    if (scalar(@a) == 3) {
+      my ($a2, $size) = split('\s+', $a[2]);
+      my $chr = "$a[1].$a2";
+      if (defined($asm->{$chr})) {
+         if ($asm->{$chr} != $size) {
+           printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s.%s\n", $asm->{$chr}, $size, $a[0], $a[1], $a[2];
+           exit 255;
+         }
+      } else {
+         $asm->{$chr} = $size;
+      }
+#      printf "%s\t%s.%s\n", $a[0], $a[1], $a[2];
+    } elsif (scalar(@a) == 2) {
+      my ($chr, $size) = split('\s+', $a[1]);
+      if (defined($asm->{$chr})) {
+         if ($asm->{$chr} != $size) {
+           printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s\n", $asm->{$chr}, $size, $a[0], $a[1];
+           exit 255;
+         }
+      } else {
+         $asm->{$chr} = $size;
+      }
+#      printf "%s\t%s\n", $a[0], $a[1];
+    } else {
+      printf STDERR "ERROR: not 2 or 3: '%s'\n", $line;
+      exit 255;
+    }
+  }
+  close ($R);
+}
+close ($FH);
+
+foreach my $asm (sort keys %chromSizes) {
+   printf STDERR "# output %s\n", $asm;
+   my $out = "sizes/${asm}.chrom.sizes";
+   open ($FH, "|-", "sort -k2,2nr > $out") or die "can not write to $out";
+   my $h = $chromSizes{$asm};
+   foreach my $chr (sort keys %$h) {
+#      printf  "%s\t%s\t%d\n", $asm, $chr, $h->{$chr};
+      printf $FH "%s\t%d\n", $chr, $h->{$chr};
+   }
+   close ($FH);
+}
+
+    mkdir sizes
+
+    time (./sizes.pl) > sizes.log 2>&1
+real    10m11.808s
+user    6m40.128s
+sys     3m37.475s
+
+
+################################################################################
+### add the iRow annotation 2023-06-19 to 2023-06-26 - Hiram
+
+  mkdir /hive/data/genomes/hg38/bed/cactus447/iRows
+  cd /hive/data/genomes/hg38/bed/cactus447/iRows
+
+  # obtained the N.bed files from Glenn
+  ##  symLink them into this directory:
+
+ls /hive/data/genomes/hg38/bed/cactus447/Nbeds/*.N.bed | grep -v "hg38.N.bed" \
+   | while read P
+do
+  B=`basename $P | sed -e 's/N.bed/bed/;'`
+  rm -f ./${P}
+  ln -s "${P}" ./${B}
+done
+
+  ### and symlink in the chrom.sizes files as .len files
+
+  ls /hive/data/genomes/hg38/bed/cactus447/chromSizes/sizes/*.chrom.sizes \
+ | grep -v hg38.chrom.sizes | while read P
+do
+  S=`basename ${P} | sed -e 's/chrom.sizes/len/;'`
+  printf "%s\n" "${S}"
+  ln -s "${P}" ./${S}
+done
+
+ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len
+
+   ls *.bed > nBeds
+   ls *.len > sizes
+
+   ### list of maf file to process:
+
+   ls ../ucscNames/chr*.maf > maf.list
+
+   ### template file to construct jobList
+#LOOP
+mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit result/$(file1)
+#ENDLOOP
+
+   gensub2 maf.list single template jobList
+
+   ### took a week to get these all done.
+   ### convert each file to a bigMaf file:
+
+   mkdir /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf
+   cd /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf
+
+   ### tried to sort them, but made a mistake on the sort -k argument:
+   for F in ../result/chr*.maf
+do
+   B=`basename $F | sed -e 's/.maf//;'`
+   mafToBigMaf hg38 "${F}" stdout | $HOME/bin/x86_64/gnusort -k2,2 -S1000G \
+        --parallel=32 > "${B}.bigMaf"
+done
+
+real    1588m59.209s
+user    1259m38.364s
+sys     371m13.448s
+
+   ### put these all together:
+   ls -S chr*.bigMaf | while read M
+do
+  cat "${M}"
+done > ../hg38.cactus447.bigMaf
+
+real    322m25.741s
+user    0m13.577s
+sys     82m41.509s
+
+   ### since they were sorted incorrectly above, sort this one huge file:
+
+   $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \
+     hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf
+real    752m12.425s
+user    47m34.825s
+sys     371m7.394s
+
+################################################################################
+### make the bigBed file from the sorted bigMaf 2023-06-27
+
+bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \
+ -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \
+     /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb
+XXX running - Tue Jun 27 14:16:31 PDT 2023