80af0d3fbbffc1d7f1a42972fd4630812d4550ae
hiram
  Mon Aug 21 14:58:11 2023 -0700
loading up a 90-way maf file refs #31561

diff --git src/hg/makeDb/doc/hg38/hprc90way.txt src/hg/makeDb/doc/hg38/hprc90way.txt
new file mode 100644
index 0000000..a6d49c4
--- /dev/null
+++ src/hg/makeDb/doc/hg38/hprc90way.txt
@@ -0,0 +1,202 @@
+#######  MAF file for HPRC - 90 genome assemblies
+###############################################################################
+
+# starting with this file:
+
+# -rw-r--r-- 1  11788468388 Aug 16 20:05 hprc-v1.0-mc-grch38.single-copy.maf.gz
+
+# as supplied from Glenn.
+###############################################################################
+
+mkdir -p /hive/data/genomes/hg38/bed/hprc/mafFile
+cd /hive/data/genomes/hg38/bed/hprc/mafFile
+
+### worked up this name translation table from existing trackDb entries:
+
+### head db.hgMatPat.asmName.longLabel.txt
+GCA_018466835.1 HG02257.pat     HG02257#1       HG02257.alt.pat.f1_v2 (May 2021 GCA_018466835.1_HG02257.alt.pat.f1_v2) HPRC project computed Chain Nets
+GCA_018466845.1 HG02257.mat     HG02257#2       HG02257.pri.mat.f1_v2 (May 2021 GCA_018466845.1_HG02257.pri.mat.f1_v2) HPRC project computed Chain Nets
+GCA_018466855.1 HG02559.pat     HG02559#1       HG02559.alt.pat.f1_v2 (May 2021 GCA_018466855.1_HG02559.alt.pat.f1_v2) HPRC project computed Chain Nets
+...
+GCA_018506955.1_HG00733.alt.pat.f1_v2) HPRC project computed Chain Nets
+GCA_018506975.1 HG00733.mat     HG00733#2       HG00733.pri.mat.f1_v2 (May 2021 GCA_018506975.1_HG00733.pri.mat.f1_v2) HPRC project computed Chain Nets
+GCA_018852585.1 HG02145.mat     HG02145#2       HG02145.pri.mat.f1_v2 (Jun. 2021 GCA_018852585.1_HG02145.pri.mat.f1_v2) HPRC project computed Chain Nets
+GCA_018852595.1 HG02145.pat     HG02145#1       HG02145.alt.pat.f1_v2 (Jun. 2021 GCA_018852595.1_HG02145.alt.pat.f1_v2) HPRC project computed Chain Nets
+
+### rework the assembly names with this perl script reName.pl
+
+###############################################################################
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my %asmNameDb;  # key is asmName in the hal file, value is dbName at UCSC
+
+open (my $fh, "<", "db.hgMatPat.asmName.longLabel.txt") or die "can not read db.hgMatPat.asmName.longLabel.txt";
+while (my $line = <$fh>) {
+  chomp $line;
+  my @a = split('\t', $line);
+  $asmNameDb{$a[2]} = $a[0];
+}
+close ($fh);
+$asmNameDb{'GRCh38'} = "hg38";
+$asmNameDb{'CHM13'} = "hs1";
+
+# foreach my $asmName (sort keys %asmNameDb) {
+#  printf "%s\t%s\n", $asmName, $asmNameDb{$asmName};
+# }
+
+open ($fh, "-|", "zcat hprc-v1.0-mc-grch38.single-copy.maf.gz") or die "can not zcat hprc-v1.0-mc-grch38.single-copy.maf.gz";
+while (my $line = <$fh>) {
+  chomp $line;
+  if ($line =~ m/^s\t/) {
+    my @a = split('\t', $line, 3);
+    my $asmName = $a[1];
+    $asmName =~ s/\..*//;
+    my $seqName = "hg38";
+    if (!defined($asmNameDb{$asmName})) {
+      printf "no equivalent name for: '%s'\n", $asmName;
+      exit 255;
+    }
+    $seqName = $asmNameDb{$asmName} if ($asmName ne "GRCh38");
+    $a[1] =~ s/^$asmName/$seqName/;
+    printf "%s\n", join("\t", @a);
+  } else {
+    printf "%s\n", $line;
+  }
+}
+close ($fh);
+###############################################################################
+
+
+   time ./reName.pl > reNamed.maf
+# real    25m41.974s
+
+###############################################################################
+### then, split that reNamed file:
+mkdir /hive/data/genomes/hg38/bed/hprc/mafFile/perChrom
+cd /hive/data/genomes/hg38/bed/hprc/mafFile/perChrom
+time  mafSplit -useFullSequenceName -byTarget /dev/null . ../reNamed.maf
+real    29m13.295s
+
+###############################################################################
+### loading this maf file
+
+[hiram@hgwdev /hive/data/genomes/hg38/bed/hprc/mafFile/perChrom] ln -s `pwd`/chr*.maf /gbdb/hg38/hprc/cactus90way
+
+cd /dev/shm
+time (hgLoadMaf -pathPrefix=/gbdb/hg38/hprc/cactus90way hg38 hprc90way) > load90way.log 2>&1 &
+# Loaded 1571098 mafs in 64 files from /gbdb/hg38/hprc/cactus90way
+# real    20m32.061s
+
+#  -rw-rw-r--    1   84132726 Aug 21 11:56 hprc90way.tab
+
+###############################################################################
+### and the summary table (did not work with the GCA_0123.1 dot suffix
+###    the .1 got trimmed off the names)
+time (cat /gbdb/hg38/hprc/cactus90way/*.maf \
+    | hgLoadMafSummary -verbose=2 -minSize=30000 \
+      -mergeGap=1500 -maxSize=200000 hg38 hprc90waySummary stdin) > do.log 2>&1
+# Created 7864892 summary blocks from 135565223 components and 1571098 mafs from stdin
+# real    44m52.247s
+
+# -rw-rw-r--    1  417328380 Aug 21 12:44 hprc90waySummary.tab
+
+
+### use this perl script to add the .1 to the GCA names
+##############################################################################
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+open (my $fh, "<", "hprc90waySummary.tab") or die "can not read hprc90waySummary.tab";
+while (my $line = <$fh>) {
+  chomp $line;
+  my @a = split('\s+', $line);
+  if ($a[4] =~ m/GCA_/) {
+     $a[4] .= ".1";
+  }
+  printf "%s\t\t\n", join("\t", @a);
+}
+close ($fh);
+##############################################################################
+
+   cd /dev/shm
+   ./addDot1.pl > withDot1.hprc90waySummary.tab
+
+   # reload the table
+   hgLoadSqlTab hg38 hprc90waySummary ~/kent/src/hg/lib/mafSummary.sql  \
+        withDot1.hprc90waySummary.tab
+
+__END__
+
+###############################################################################
+### adding iRows
+###
+    mkdir /hive/data/genomes/hg38/bed/hprc/mafFile/iRows
+    cd /hive/data/genomes/hg38/bed/hprc/mafFile/iRows
+
+    ### create N.bed files
+    ls /hive/data/genomes/hg38/bed/hprc/2bits/*.2bit | while read TB
+do
+  S=`basename $TB | sed -e 's/.2bit//;'`
+  twoBitInfo -nBed ${TB} ${S}.N.bed
+done
+
+    ### and sizes .len files:
+   ls /hive/data/genomes/hg38/bed/hprc/2bits/*.2bit | while read TB
+do
+  S=`basename $TB | sed -e 's/.2bit//;'`
+  twoBitInfo ${TB} stdout | sort -k2,2nr > ${S}.len
+done
+
+    ### and the two special ones:
+    ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len
+    ln -s /hive/data/genomes/hs1/chrom.sizes hs1.len
+    twoBitInfo -nBed /hive/data/genomes/hg38/hg38.2bit hg38.N.bed
+    twoBitInfo -nBed /hive/data/genomes/hs1/hs1.2bit hs1.N.bed
+    ls *.len > sizes
+    ls *.N.bed > nBeds
+
+    printf '#LOOP
+mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit {check out line+ result/$(file1)}
+#ENDLOOP
+' > template
+
+    ls ../perChrom/chr*.maf > maf.list
+    gensub2 maf.list single template jobList
+    para -ram=64g create jobList
+    mkdir result
+    para push
+    ### at 6g limit, 19 failed:
+Completed: 45 of 64 jobs
+Crashed: 19 jobs
+CPU time in finished jobs:        224s       3.74m     0.06h    0.00d  0.000 y
+IO & Wait Time:                   138s       2.30m     0.04h    0.00d  0.000 y
+Average job time:                   8s       0.13m     0.00h    0.00d
+Longest finished job:              80s       1.33m     0.02h    0.00d
+Submission to last job:           115s       1.92m     0.03h    0.00d
+
+    ### finished the 19 with 64g limit:
+Completed: 19 of 19 jobs
+CPU time in finished jobs:       2958s      49.29m     0.82h    0.03d  0.000 y
+IO & Wait Time:                  1005s      16.76m     0.28h    0.01d  0.000 y
+Average job time:                 209s       3.48m     0.06h    0.00d
+Longest finished job:             335s       5.58m     0.09h    0.00d
+Submission to last job:           346s       5.77m     0.10h    0.00d
+
+    cd result
+    rm /gbdb/hg38/hprc/cactus90way/chr*.maf
+    ln -s `pwd`/chr*.maf /gbdb/hg38/hprc/cactus90way
+
+     ### reload these iRow annotated maf files:
+
+    cd /dev/shm
+time (hgLoadMaf -pathPrefix=/gbdb/hg38/hprc/cactus90way hg38 hprc90way) > load90way.log 2>&1 &
+
+time (cat /gbdb/hg38/hprc/cactus90way/*.maf \
+    | hgLoadMafSummary -verbose=2 -minSize=30000 \
+      -mergeGap=1500 -maxSize=200000 hg38 hprc90waySummary stdin) > do.log 2>&1
+###############################################################################