80af0d3fbbffc1d7f1a42972fd4630812d4550ae hiram Mon Aug 21 14:58:11 2023 -0700 loading up a 90-way maf file refs #31561 diff --git src/hg/makeDb/doc/hg38/hprc90way.txt src/hg/makeDb/doc/hg38/hprc90way.txt new file mode 100644 index 0000000..a6d49c4 --- /dev/null +++ src/hg/makeDb/doc/hg38/hprc90way.txt @@ -0,0 +1,202 @@ +####### MAF file for HPRC - 90 genome assemblies +############################################################################### + +# starting with this file: + +# -rw-r--r-- 1 11788468388 Aug 16 20:05 hprc-v1.0-mc-grch38.single-copy.maf.gz + +# as supplied from Glenn. +############################################################################### + +mkdir -p /hive/data/genomes/hg38/bed/hprc/mafFile +cd /hive/data/genomes/hg38/bed/hprc/mafFile + +### worked up this name translation table from existing trackDb entries: + +### head db.hgMatPat.asmName.longLabel.txt +GCA_018466835.1 HG02257.pat HG02257#1 HG02257.alt.pat.f1_v2 (May 2021 GCA_018466835.1_HG02257.alt.pat.f1_v2) HPRC project computed Chain Nets +GCA_018466845.1 HG02257.mat HG02257#2 HG02257.pri.mat.f1_v2 (May 2021 GCA_018466845.1_HG02257.pri.mat.f1_v2) HPRC project computed Chain Nets +GCA_018466855.1 HG02559.pat HG02559#1 HG02559.alt.pat.f1_v2 (May 2021 GCA_018466855.1_HG02559.alt.pat.f1_v2) HPRC project computed Chain Nets +... +GCA_018506955.1_HG00733.alt.pat.f1_v2) HPRC project computed Chain Nets +GCA_018506975.1 HG00733.mat HG00733#2 HG00733.pri.mat.f1_v2 (May 2021 GCA_018506975.1_HG00733.pri.mat.f1_v2) HPRC project computed Chain Nets +GCA_018852585.1 HG02145.mat HG02145#2 HG02145.pri.mat.f1_v2 (Jun. 2021 GCA_018852585.1_HG02145.pri.mat.f1_v2) HPRC project computed Chain Nets +GCA_018852595.1 HG02145.pat HG02145#1 HG02145.alt.pat.f1_v2 (Jun. 2021 GCA_018852595.1_HG02145.alt.pat.f1_v2) HPRC project computed Chain Nets + +### rework the assembly names with this perl script reName.pl + +############################################################################### +#!/usr/bin/env perl + +use strict; +use warnings; + +my %asmNameDb; # key is asmName in the hal file, value is dbName at UCSC + +open (my $fh, "<", "db.hgMatPat.asmName.longLabel.txt") or die "can not read db.hgMatPat.asmName.longLabel.txt"; +while (my $line = <$fh>) { + chomp $line; + my @a = split('\t', $line); + $asmNameDb{$a[2]} = $a[0]; +} +close ($fh); +$asmNameDb{'GRCh38'} = "hg38"; +$asmNameDb{'CHM13'} = "hs1"; + +# foreach my $asmName (sort keys %asmNameDb) { +# printf "%s\t%s\n", $asmName, $asmNameDb{$asmName}; +# } + +open ($fh, "-|", "zcat hprc-v1.0-mc-grch38.single-copy.maf.gz") or die "can not zcat hprc-v1.0-mc-grch38.single-copy.maf.gz"; +while (my $line = <$fh>) { + chomp $line; + if ($line =~ m/^s\t/) { + my @a = split('\t', $line, 3); + my $asmName = $a[1]; + $asmName =~ s/\..*//; + my $seqName = "hg38"; + if (!defined($asmNameDb{$asmName})) { + printf "no equivalent name for: '%s'\n", $asmName; + exit 255; + } + $seqName = $asmNameDb{$asmName} if ($asmName ne "GRCh38"); + $a[1] =~ s/^$asmName/$seqName/; + printf "%s\n", join("\t", @a); + } else { + printf "%s\n", $line; + } +} +close ($fh); +############################################################################### + + + time ./reName.pl > reNamed.maf +# real 25m41.974s + +############################################################################### +### then, split that reNamed file: +mkdir /hive/data/genomes/hg38/bed/hprc/mafFile/perChrom +cd /hive/data/genomes/hg38/bed/hprc/mafFile/perChrom +time mafSplit -useFullSequenceName -byTarget /dev/null . ../reNamed.maf +real 29m13.295s + +############################################################################### +### loading this maf file + +[hiram@hgwdev /hive/data/genomes/hg38/bed/hprc/mafFile/perChrom] ln -s `pwd`/chr*.maf /gbdb/hg38/hprc/cactus90way + +cd /dev/shm +time (hgLoadMaf -pathPrefix=/gbdb/hg38/hprc/cactus90way hg38 hprc90way) > load90way.log 2>&1 & +# Loaded 1571098 mafs in 64 files from /gbdb/hg38/hprc/cactus90way +# real 20m32.061s + +# -rw-rw-r-- 1 84132726 Aug 21 11:56 hprc90way.tab + +############################################################################### +### and the summary table (did not work with the GCA_0123.1 dot suffix +### the .1 got trimmed off the names) +time (cat /gbdb/hg38/hprc/cactus90way/*.maf \ + | hgLoadMafSummary -verbose=2 -minSize=30000 \ + -mergeGap=1500 -maxSize=200000 hg38 hprc90waySummary stdin) > do.log 2>&1 +# Created 7864892 summary blocks from 135565223 components and 1571098 mafs from stdin +# real 44m52.247s + +# -rw-rw-r-- 1 417328380 Aug 21 12:44 hprc90waySummary.tab + + +### use this perl script to add the .1 to the GCA names +############################################################################## +#!/usr/bin/env perl + +use strict; +use warnings; + +open (my $fh, "<", "hprc90waySummary.tab") or die "can not read hprc90waySummary.tab"; +while (my $line = <$fh>) { + chomp $line; + my @a = split('\s+', $line); + if ($a[4] =~ m/GCA_/) { + $a[4] .= ".1"; + } + printf "%s\t\t\n", join("\t", @a); +} +close ($fh); +############################################################################## + + cd /dev/shm + ./addDot1.pl > withDot1.hprc90waySummary.tab + + # reload the table + hgLoadSqlTab hg38 hprc90waySummary ~/kent/src/hg/lib/mafSummary.sql \ + withDot1.hprc90waySummary.tab + +__END__ + +############################################################################### +### adding iRows +### + mkdir /hive/data/genomes/hg38/bed/hprc/mafFile/iRows + cd /hive/data/genomes/hg38/bed/hprc/mafFile/iRows + + ### create N.bed files + ls /hive/data/genomes/hg38/bed/hprc/2bits/*.2bit | while read TB +do + S=`basename $TB | sed -e 's/.2bit//;'` + twoBitInfo -nBed ${TB} ${S}.N.bed +done + + ### and sizes .len files: + ls /hive/data/genomes/hg38/bed/hprc/2bits/*.2bit | while read TB +do + S=`basename $TB | sed -e 's/.2bit//;'` + twoBitInfo ${TB} stdout | sort -k2,2nr > ${S}.len +done + + ### and the two special ones: + ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len + ln -s /hive/data/genomes/hs1/chrom.sizes hs1.len + twoBitInfo -nBed /hive/data/genomes/hg38/hg38.2bit hg38.N.bed + twoBitInfo -nBed /hive/data/genomes/hs1/hs1.2bit hs1.N.bed + ls *.len > sizes + ls *.N.bed > nBeds + + printf '#LOOP +mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit {check out line+ result/$(file1)} +#ENDLOOP +' > template + + ls ../perChrom/chr*.maf > maf.list + gensub2 maf.list single template jobList + para -ram=64g create jobList + mkdir result + para push + ### at 6g limit, 19 failed: +Completed: 45 of 64 jobs +Crashed: 19 jobs +CPU time in finished jobs: 224s 3.74m 0.06h 0.00d 0.000 y +IO & Wait Time: 138s 2.30m 0.04h 0.00d 0.000 y +Average job time: 8s 0.13m 0.00h 0.00d +Longest finished job: 80s 1.33m 0.02h 0.00d +Submission to last job: 115s 1.92m 0.03h 0.00d + + ### finished the 19 with 64g limit: +Completed: 19 of 19 jobs +CPU time in finished jobs: 2958s 49.29m 0.82h 0.03d 0.000 y +IO & Wait Time: 1005s 16.76m 0.28h 0.01d 0.000 y +Average job time: 209s 3.48m 0.06h 0.00d +Longest finished job: 335s 5.58m 0.09h 0.00d +Submission to last job: 346s 5.77m 0.10h 0.00d + + cd result + rm /gbdb/hg38/hprc/cactus90way/chr*.maf + ln -s `pwd`/chr*.maf /gbdb/hg38/hprc/cactus90way + + ### reload these iRow annotated maf files: + + cd /dev/shm +time (hgLoadMaf -pathPrefix=/gbdb/hg38/hprc/cactus90way hg38 hprc90way) > load90way.log 2>&1 & + +time (cat /gbdb/hg38/hprc/cactus90way/*.maf \ + | hgLoadMafSummary -verbose=2 -minSize=30000 \ + -mergeGap=1500 -maxSize=200000 hg38 hprc90waySummary stdin) > do.log 2>&1 +###############################################################################