5b1483d60866e637703c920a8ff02516b52d53a8 hiram Mon Aug 7 13:25:52 2023 -0700 add the frames procedure refs #31528 diff --git src/hg/makeDb/doc/hg38/cactus447.txt src/hg/makeDb/doc/hg38/cactus447.txt index 3bddbf2..a21cfba 100644 --- src/hg/makeDb/doc/hg38/cactus447.txt +++ src/hg/makeDb/doc/hg38/cactus447.txt @@ -1,305 +1,749 @@ ################################################################################ #### 2023-06-13 - fetch maf.gz - Hiram mkdir -p /hive/data/genomes/hg38/bed/cactus447/singleCover cd /hive/data/genomes/hg38/bed/cactus447/singleCover obtained maf file from Glenn, from the 'courtyard' login: /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz -rw-r--r-- 1 hickey cgl 881994218891 Jun 5 11:06 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz There was some go around with a different version. This one is 'single coverage' -rw-r--r-- 1 881994218891 Jun 5 11:06 447-mammalian-2022v1-single-copy.maf.gz ################################################################################ #### split this file into individual chromosome files 2023-06-21 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/splitSingleCover cd /hive/data/genomes/hg38/bed/cactus447/splitSingleCover time mafSplit -byTarget -useFullSequenceName /dev/null ./ ../singleCover/447-mammalian-2022v1-single-copy.maf.gz Splitting 1 files by target sequence -- ignoring first argument /dev/null real 804m35.124s user 1061m9.580s sys 101m19.894s ################################################################################ ### convert the sequence names to hg38 chrom names 2023-06-12 to 06-14 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/ucscNames cd /hive/data/genomes/hg38/bed/cactus447/ucscNames runOne script for processing each file: #!/bin/bash set -beEu -o pipefail export cactusMaf="${1}" export srcMaf="../splitSingleCover/${cactusMaf}" cactusId=${cactusMaf%.maf} export sedCommand=`grep -w "${cactusId}" cactus.to.ucsc.sed` export ucscName=`grep -w "${cactusId}" 2022v1.hg38.chrom.equiv.txt | cut -f2` case "${ucscName}" in chrM) if [ ${srcMaf} -nt ${ucscName}.maf ]; then sed -f cactus.to.ucsc.sed ${srcMaf} > ${ucscName}.maf touch -r ${srcMaf} ${ucscName}.maf fi ;; chr[0-9][0-9]) rm -f "${ucscName}.maf" ln ${srcMaf} "${ucscName}.maf" ;; chr[0-9]) rm -f "${ucscName}.maf" ln ${srcMaf} "${ucscName}.maf" ;; G*.maf) K*.maf) if [ ${srcMaf} -nt ${ucscName}.maf ]; then sed -e "${sedCommand}" ${srcMaf} > ${ucscName}.maf touch -r ${srcMaf} ${ucscName}.maf else printf "DONE: ${ucscName}.maf\n" 1>&2 fi ;; esac if [ "${ucscName}.maf" -nt "nameScan/${ucscName}.seqCount.txt" ]; then grep "^s." "${ucscName}.maf" | cut -d' ' -f2 | sort | uniq -c | sort -rn \ > "nameScan/${ucscName}.seqCount.txt" touch -r "${ucscName}.maf" "nameScan/${ucscName}.seqCount.txt" sed -e "s/^ \+//;" "nameScan/${ucscName}.seqCount.txt" | cut -d' ' -f2 \ | cut -d'.' -f1 | sort | uniq -c \ | sort -rn > "nameScan/${ucscName}.species.txt" fi ##################################################### template: #LOOP runOne $(path1) #ENDLOOP using list of maf files: ls ../splitSingleCover | grep maf > cactus.maf.list gensub2 cactus.maf.list single template jobList ### made mistake of running too many of those at the same time ### and their use of large amounts of memory caused hgwdev to ### hang up and needed a reboot to recover. So, run these carefully ### only a couple at a time on hgwdev and on hgcompute-01, the large ### ones get as large as 800 Gb in memory. ### get the species names from the nameScan: sed -e 's/^ \+//;' nameScan/chr2.species.txt | cut -d' ' -f2 \ | sort > ../species.list.txt ################################################################################ ### get the chrom.sizes out of the maf files 2023-06-19 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/chromSizes cd /hive/data/genomes/hg38/bed/cactus447/chromSizes ### runOne script to run each one: #!/bin/bash export inMaf="${1}" export result="${2}" grep "^s " "${inMaf}" | awk '{printf "%s\t%s\n", $2,$6}' | sort -u > "${result} ### working on the maf list: ls ../ucscNames/chr*.maf > maf.list ### template to construct jobList #LOOP runOne $(path1) {check out line+ result/$(root1).txt} #ENDLOOP gensub2 maf.list single template jobList para -ram=6g create jobList Completed: 101 of 101 jobs CPU time in finished jobs: 90801s 1513.35m 25.22h 1.05d 0.003 y IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y Average job time: 762s 12.69m 0.21h 0.01d Longest finished job: 7952s 132.53m 2.21h 0.09d Submission to last job: 8022s 133.70m 2.23h 0.09d ### sizes.pl script to get the answer out of the result/*.txt files: #!/usr/bin/env perl use strict; use warnings; my %chromSizes; # key is assembly name, value is a pointer to a hash # with key chromName, value size open (my $FH, "-|", "ls result/*.txt") or die "can not read ls result/*.txt"; while (my $file = <$FH>) { chomp $file; printf STDERR "# reading: %s\n", $file; open (my $R, "<", $file) or die "can not read $file"; while (my $line = <$R>) { chomp $line; my @a = split('\.', $line); if (! defined($chromSizes{$a[0]})) { my %h; $chromSizes{$a[0]} = \%h; } my $asm = $chromSizes{$a[0]}; if (scalar(@a) == 3) { my ($a2, $size) = split('\s+', $a[2]); my $chr = "$a[1].$a2"; if (defined($asm->{$chr})) { if ($asm->{$chr} != $size) { printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s.%s\n", $asm->{$chr}, $size, $a[0], $a[1], $a[2]; exit 255; } } else { $asm->{$chr} = $size; } # printf "%s\t%s.%s\n", $a[0], $a[1], $a[2]; } elsif (scalar(@a) == 2) { my ($chr, $size) = split('\s+', $a[1]); if (defined($asm->{$chr})) { if ($asm->{$chr} != $size) { printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s\n", $asm->{$chr}, $size, $a[0], $a[1]; exit 255; } } else { $asm->{$chr} = $size; } # printf "%s\t%s\n", $a[0], $a[1]; } else { printf STDERR "ERROR: not 2 or 3: '%s'\n", $line; exit 255; } } close ($R); } close ($FH); foreach my $asm (sort keys %chromSizes) { printf STDERR "# output %s\n", $asm; my $out = "sizes/${asm}.chrom.sizes"; open ($FH, "|-", "sort -k2,2nr > $out") or die "can not write to $out"; my $h = $chromSizes{$asm}; foreach my $chr (sort keys %$h) { # printf "%s\t%s\t%d\n", $asm, $chr, $h->{$chr}; printf $FH "%s\t%d\n", $chr, $h->{$chr}; } close ($FH); } mkdir sizes time (./sizes.pl) > sizes.log 2>&1 real 10m11.808s user 6m40.128s sys 3m37.475s ################################################################################ ### add the iRow annotation 2023-06-19 to 2023-06-26 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/iRows cd /hive/data/genomes/hg38/bed/cactus447/iRows # obtained the N.bed files from Glenn ## symLink them into this directory: ls /hive/data/genomes/hg38/bed/cactus447/Nbeds/*.N.bed | grep -v "hg38.N.bed" \ | while read P do B=`basename $P | sed -e 's/N.bed/bed/;'` rm -f ./${P} ln -s "${P}" ./${B} done ### and symlink in the chrom.sizes files as .len files ls /hive/data/genomes/hg38/bed/cactus447/chromSizes/sizes/*.chrom.sizes \ | grep -v hg38.chrom.sizes | while read P do S=`basename ${P} | sed -e 's/chrom.sizes/len/;'` printf "%s\n" "${S}" ln -s "${P}" ./${S} done ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len ls *.bed > nBeds ls *.len > sizes ### list of maf file to process: ls ../ucscNames/chr*.maf > maf.list ### template file to construct jobList #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit result/$(file1) #ENDLOOP gensub2 maf.list single template jobList ### took a week to get these all done. ### convert each file to a bigMaf file: mkdir /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf cd /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf ### tried to sort them, but made a mistake on the sort -k argument: for F in ../result/chr*.maf do B=`basename $F | sed -e 's/.maf//;'` mafToBigMaf hg38 "${F}" stdout | $HOME/bin/x86_64/gnusort -k2,2 -S1000G \ --parallel=32 > "${B}.bigMaf" done real 1588m59.209s user 1259m38.364s sys 371m13.448s ### put these all together: ls -S chr*.bigMaf | while read M do cat "${M}" done > ../hg38.cactus447.bigMaf real 322m25.741s user 0m13.577s sys 82m41.509s ### since they were sorted incorrectly above, sort this one huge file: $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \ hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf real 752m12.425s user 47m34.825s sys 371m7.394s ################################################################################ ### make the bigBed file from the sorted bigMaf 2023-06-27 bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \ -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \ /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb -XXX running - Tue Jun 27 14:16:31 PDT 2023 + +# pid=61008: VmPeak: 5682928 kB +# real 4157m14.091s + +# that time is 69 hours + +############################################################################## +# liftOver files to convert sequence names (DONE - 2023-08-04 - Hiram) +# +# The sequence names in the assemblies in the alignment do not match +# the sequence names in the corresponding genome browsers assemblies + + mkdir /hive/data/genomes/hg38/bed/cactus447/liftOver + cd /hive/data/genomes/hg38/bed/cactus447/liftOver + mkdir lifts + ln -s ../idKeys/447way.equivalent.txt . + +# using this script to construct liftOver files +#!/bin/bash + +TOP="/hive/data/genomes/hg38/bed/cactus447/liftOver" + +cd "${TOP}" +mkdir -p lifts + +cat 447way.equivalent.txt | while read L +do + seqName=`printf "%s" "${L}" | cut -f1` + asmId=`printf "%s" "${L}" | cut -f2` + seqIdKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt" + case ${asmId} in + GCA_* | GCF_* ) + gcX="${asmId:0:3}" + d0="${asmId:4:3}" + d1="${asmId:7:3}" + d2="${asmId:10:3}" + ab="/hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmId}" + buildDir=`realpath "${ab}"` + asmKeys="${buildDir}/trackData/idKeys/${asmId}.idKeys.txt" + asmSizes="${buildDir}/${asmId}.chrom.sizes" +# asmSizes="${buildDir}/idKeys/chrom.sizes" +ls -og "${asmKeys}" + ;; + [a-zA-Z0-9]*) + asmKeys="/hive/data/genomes/${asmId}/bed/idKeys/${asmId}.idKeys.txt" + asmSizes="/hive/data/genomes/${asmId}/chrom.sizes" + ;; + *) printf "ERROR not recognized $asmId\n" 1>&2 + exit 255 + ;; + esac + seqKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt" + seq2Bit="/hive/data/genomes/hg38/bed/cactus447/2bitFiles/${seqName}.2bit" + if [ ! -s "${seqKeys}" ]; then + printf "can not find seq idKeys\n%s\n" "${seqKeys}" 1>&2 + exit 255 + fi + if [ ! -s "${asmKeys}" ]; then + printf "can not find asm idKeys\n%s\n" "${asmKeys}" 1>&2 + exit 255 + fi + printf "%s\t%s\n" "${asmId}" "${seqName}" 1>&2 +printf "lifts/${seqName}.${asmId}.lift\n" 1>&2 +printf "$seqKeys\n" 1>&2 +printf "$asmKeys\n" 1>&2 +printf "$asmSizes\n" 1>&2 + if [ ! -s "lifts/${seqName}.${asmId}.lift" ]; then + join -t$'\t' "${seqKeys}" "${asmKeys}" | sort -k3,3 \ + | join -t$'\t' -2 3 <(sort -k1,1 ${asmSizes}) - \ + | awk -F$'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $4, $2}' \ + > lifts/${seqName}.${asmId}.lift + fi +done + + # a couple of those ended up with duplicate names in the liftOver + # due to duplicate contigs in the assemblies. They were edited + # manually, one was really bad and needed this perl script to clean it: + +#!/usr/bin/env perl + +use strict; +use warnings; + +my %idDone; # key is sequence name, value is string of names used + +open (my $fh, "<", "lifts/Otolemur_garnettii.otoGar3.lift") or die "can not read lifts/Otolemur_garnettii.otoGar3.lift"; +while (my $line = <$fh>) { + chomp $line; + my @a = split('\t', $line); + if (!defined($idDone{$a[1]})) { + $idDone{$a[1]} = $a[3]; + printf "%s\n", $line; + } else { + $idDone{$a[1]} .= "," . $a[3]; + } +} +close ($fh); + +foreach my $seq (sort keys %idDone) { + printf STDERR "%s\t%s\n", $seq, $idDone{$seq}; +} + +############################################################################## +# MAF FRAMES (working - 2023-08-01 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/hg38/bed/cactus447/frames + cd /hive/data/genomes/hg38/bed/cactus447/frames + + mkdir genes + +# survey all the genomes to find out what kinds of gene tracks they have + +printf '#/bin/bash + +egrep -v "GCA_|GCF_" ../idKeys/447way.equivalent.txt | cut -f2 | while read db +do + printf "# ${db}:" + for table in `hgsql -N -e "show tables;" $db | egrep "Gene|ncbiRefSeq" | egrep -v "Curated|Cds|Link|Other|PepTable|Predicted|Psl|wgEncode|gencode|uuGene|ToGeneName|ccds|encode|Ext|Old|Pep|Select|Bak|Verify|orfeome|Multi|transMap|Pfam|knownGeneMrna|sgpGene"` + do + count=`hgsql -N -e "select count(*) from $table;" $db` + printf " %%s: %%s" "${table}" "${count}" + done + printf "\\n" +done +' > showGenes.sh + + + chmod +x ./showGenes.sh + ./showGenes.sh > gene.survey.txt 2>&1 & + # most of them have ncbiRefSeq + grep ncbiRefSeq gene.survey.txt | cut -d' ' -f2 | sed -e 's/://;' \ + | sort | xargs echo | fold -s -w 72 +# bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 +# mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 +# rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1 + + # a couple do not: + grep -v ncbiRefSeq gene.survey.txt | cut -d' ' -f2 | sed -e 's/://;' \ + | sort | xargs echo | fold -s -w 72 +# cavApe1 eidHel1 ptePar1 + # but they do have ensGene or augustusGene: + egrep "cavApe1|eidHel1|ptePar1" gene.survey.txt +# cavApe1: augustusGene: 10114 ensGene: 22510 xenoRefGene: 322224 +# eidHel1: augustusGene: 49674 +# ptePar1: augustusGene: 68613 + + # 1. ncbiRefSeq - add hg38 here manually since it didn't come in above +# for db in hg38 +# do +# asmName="${db}" + + for db in bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1 +do + asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` + hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${db} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > genes/${asmName}.gp.gz + echo -n "# ${asmName}: " + genePredCheck -db=${db} genes/${asmName}.gp.gz +done +# hg38: checked: 22882 failed: 0 +# Bos_mutus: checked: 20457 failed: 0 +# CanFam4: checked: 21143 failed: 0 +# Chinchilla_lanigera: checked: 20482 failed: 0 +# Colobus_angolensis: checked: 20042 failed: 0 +# Dipodomys_ordii: checked: 19738 failed: 0 +# Elephantulus_edwardii: checked: 20384 failed: 0 +# Felis_catus: checked: 20051 failed: 0 +# Fukomys_damarensis: checked: 19811 failed: 0 +# Microtus_ochrogaster: checked: 20048 failed: 0 +# Mus_musculus: checked: 23375 failed: 0 +# Myotis_brandtii: checked: 19949 failed: 0 +# Myotis_davidii: checked: 18815 failed: 0 +# Myotis_lucifugus: checked: 19895 failed: 0 +# Odobenus_rosmarus: checked: 19346 failed: 0 +# Orcinus_orca: checked: 19136 failed: 0 +# Otolemur_garnettii: checked: 19536 failed: 0 +# Propithecus_coquerelli: checked: 19356 failed: 0 +# Pteropus_alecto: checked: 18326 failed: 0 +# Macaca_mulatta: checked: 21021 failed: 0 +# Sorex_araneus: checked: 19160 failed: 0 +# Ictidomys_tridecemlineatus: checked: 19892 failed: 0 +# Sus_scrofa: checked: 24180 failed: 0 +# Carlito_syrichta: checked: 19968 failed: 0 +# Tupaia_chinensis: checked: 21047 failed: 0 + + # 2. ensGene + for db in cavApe1 +do +asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` +hgsql -N -e "select +name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds +from ensGene" ${db} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /dev/shm/${db}.tmp.gz + mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz + echo -n "# ${asmName}: " + genePredCheck -db=${db} genes/${asmName}.gp.gz +done +# Cavia_aperea: checked: 14182 failed: 0 + + # 3. augustusGene + for db in eidHel1 ptePar1 +do +asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` +hgsql -N -e "select +name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds +from augustusGene" ${db} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /dev/shm/${db}.tmp.gz + mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz + echo -n "# ${asmName}: " + genePredCheck -db=${db} genes/${asmName}.gp.gz +done +# Eidolon_helvum: checked: 31495 failed: 0 +# Pteronotus_parnellii: checked: 47678 failed: 0 + + # verify counts for genes are reasonable: + for T in genes/*.gz +do + printf "# $T: " + zcat $T | cut -f1 | sort -u | wc -l +done +# genes/Bos_mutus.gp.gz: 20457 +# genes/CanFam4.gp.gz: 21143 +# genes/Canis_lupus_familiaris.gp.gz: 19812 +# genes/Carlito_syrichta.gp.gz: 19968 +# genes/Cavia_aperea.gp.gz: 14182 +# genes/Cercocebus_atys.gp.gz: 20534 +# genes/Chinchilla_lanigera.gp.gz: 20481 +# genes/Colobus_angolensis.gp.gz: 20042 +# genes/Condylura_cristata.gp.gz: 17842 +# genes/Dipodomys_ordii.gp.gz: 19738 +... +# genes/Pteropus_vampyrus.gp.gz: 19485 +# genes/Rattus_norvegicus.gp.gz: 22854 +# genes/Rhinopithecus_roxellana.gp.gz: 21027 +# genes/Saimiri_boliviensis.gp.gz: 20933 +# genes/Sorex_araneus.gp.gz: 19160 +# genes/Sus_scrofa.gp.gz: 24043 +# genes/Theropithecus_gelada.gp.gz: 20592 +# genes/Tupaia_chinensis.gp.gz: 21047 + +# And, can pick up ncbiRefSeq for GCF equivalent assemblies for those +# outside the UCSC database set of genomes + +grep GCF_ ../idKeys/447way.equivalent.txt | while read dbGCF +do + asmName=`printf "%s" "${dbGCF}" | cut -f1` + asmId=`printf "%s" "${dbGCF}" | cut -f2` + gcX="${asmId:0:3}" + d0="${asmId:4:3}" + d1="${asmId:7:3}" + d2="${asmId:10:3}" + buildDir="/hive/data/genomes/asmHubs/refseqBuild/$gcX/$d0/$d1/$d2/${asmId}" + if [ ! -d "${buildDir}" ]; then + printf "not found: %s\n" "${buildDir}" + else + + ncbiRefSeqGp="${buildDir}/trackData/ncbiRefSeq/process/${asmId}.ncbiRefSeq.gp" + sizes="${buildDir}/${asmId}.chrom.sizes" + if [ -s "${ncbiRefSeqGp}" ]; then + genePredSingleCover "${ncbiRefSeqGp}" stdout | gzip -c > genes/${asmName}.gp.gz + printf "# %s: " "${asmName}" + genePredCheck -chromSizes="${sizes}" genes/${asmName}.gp.gz + else + printf "MISSING: ${ncbiRefSeqGp}\n" 1>&2 + fi + fi + +done + +# verify counts for genes are reasonable: + for T in genes/*.gz +do + printf "# $T: " + zcat $T | cut -f1 | sort -u | wc -l +done +# genes/Bos_mutus.gp.gz: 20457 +# genes/CanFam4.gp.gz: 21143 +# genes/Canis_lupus_familiaris.gp.gz: 19812 +# genes/Carlito_syrichta.gp.gz: 19968 +# genes/Cavia_aperea.gp.gz: 14182 +# genes/Cercocebus_atys.gp.gz: 20534 +# genes/Chinchilla_lanigera.gp.gz: 20481 +# genes/Colobus_angolensis.gp.gz: 20042 +# genes/Condylura_cristata.gp.gz: 17842 +# genes/Dipodomys_ordii.gp.gz: 19738 +... +# genes/Pteropus_vampyrus.gp.gz: 19485 +# genes/Rattus_norvegicus.gp.gz: 22854 +# genes/Rhinopithecus_roxellana.gp.gz: 21027 +# genes/Saimiri_boliviensis.gp.gz: 20933 +# genes/Sorex_araneus.gp.gz: 19160 +# genes/Sus_scrofa.gp.gz: 24043 +# genes/Theropithecus_gelada.gp.gz: 20592 +# genes/Tupaia_chinensis.gp.gz: 21047 + + # these gene predictions need lifting to change the sequence names + mv genes genes.seqToAsm + mkdir genes + cd genes.seqToAsm + +for F in *.gp.gz +do + seqName=`echo $F | sed -e 's/.gp.gz//;'` + liftFile="`ls -d /hive/data/genomes/hg38/bed/cactus447/liftOver/lifts/${seqName}.*.lift 2> /dev/null`" + if [ -s "${liftFile}" ]; then + printf "%s\n" "${seqName}" 1>&2 + liftUp -type=.genepred stdout "${liftFile}" warn "${F}" | gzip -c \ + > "../genes/${seqName}.gp.gz" + else + printf "can not find lift file for $seqName\n" 1>&2 + fi +done + + # kluster job to annotate each maf file + screen -S hg38 # manage long running procedure with screen + ssh ku + cd /hive/data/genomes/hg38/bed/cactus447way/frames + + printf '#!/bin/bash +set -beEu -o pipefail + +export C="${1}" +export G="${2}" + +cat ../iRows/result/${C}.maf | genePredToMafFrames hg38 stdin stdout \ + ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz +' > runOne + + chmod +x runOne + + ls ../iRows/result | grep maf | sed -e "s/.maf//" > chr.list + ls genes | sed -e "s/.gp.gz//" > gene.list + + printf '#LOOP +runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} +#ENDLOOP +' > template + + mkdir parts + # run on the ku kluster + gensub2 chr.list gene.list template jobList + para -ram=128g create jobList + para try ... check ... push +# Completed: 4444 of 4444 jobs +# CPU time in finished jobs: 3955725s 65928.75m 1098.81h 45.78d 0.125 y +# IO & Wait Time: 1735948s 28932.46m 482.21h 20.09d 0.055 y +# Average job time: 1281s 21.35m 0.36h 0.01d +# Longest finished job: 34637s 577.28m 9.62h 0.40d +# Submission to last job: 55372s 922.87m 15.38h 0.64d + + # collect all results into one file: + cd /hive/data/genomes/hg38/bed/cactus447way/frames + find ./parts -type f | while read F; +do + zcat ${F} | awk '11 == NF' +done | sort -k1,1 -k2,2n > cactus447wayFrames.bed + + bedToBigBed -type=bed3+7 -as=$HOME/kent/src/hg/lib/mafFrames.as \ + -tab cactus447wayFrames.bed ../../../chrom.sizes cactus447wayFrames.bb + + bigBedInfo cactus447wayFrames.bb | sed -e 's/^/# /;' + +# version: 4 +# fieldCount: 11 +# hasHeaderExtension: yes +# isCompressed: yes +# isSwapped: 0 +# extraIndexCount: 0 +# itemCount: 16,063,019 +# primaryDataSize: 136,782,205 +# primaryIndexSize: 1,013,720 +# zoomLevels: 10 +# chromCount: 82 +# basesCovered: 65,486,909 +# meanDepth (of bases covered): 22.709888 +# minDepth: 1.000000 +# maxDepth: 44.000000 +# std of depth: 18.692097 + + # -rw-rw-r-- 1 1192312943 Aug 5 15:28 cactus447wayFrames.bed + # -rw-rw-r-- 1 166572857 Aug 5 15:29 cactus447wayFrames.bb + + gzip cactus447wayFrames.bed + + # verify there are frames on everything expected. Since the + # frames on the HL* asemblies did not work due to sequence name + # differences, this won't be everything. + # (ls genes | wc shows 44 'expected') + zcat cactus447wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ + | sed -e 's/^/# /;' > species.check.list + wc -l species.check.list + # 44 + +# 391443 Bos_mutus +# 414721 CanFam4 +# 417196 Canis_lupus_familiaris +# 328151 Carlito_syrichta +# 255097 Cavia_aperea +# 373053 Cercocebus_atys +# 398356 Chinchilla_lanigera +# 350442 Colobus_angolensis +# 346524 Condylura_cristata +# 363333 Dipodomys_ordii +... +# 379732 Pteropus_alecto +# 387484 Pteropus_vampyrus +# 389056 Rattus_norvegicus +# 360929 Rhinopithecus_roxellana +# 366019 Saimiri_boliviensis +# 336296 Sorex_araneus +# 402603 Sus_scrofa +# 357418 Theropithecus_gelada +# 372558 Tupaia_chinensis +# 195495 hg38 + + # load the resulting file, merely for measurement purposes here + ssh hgwdev + cd /hive/data/genomes/hg38/bed/cactus447way/frames + time hgLoadMafFrames hg38 cactus447wayFrames cactus447wayFrames.bed.gz + # real 3m14.979s + + hgsql -e 'select count(*) from cactus447wayFrames;' hg38 + # +----------+ + # | count(*) | + # +----------+ + # | 16063019 | + # +----------+ + + time featureBits -countGaps hg38 cactus447wayFrames + # 65486909 bases of 3299210039 (1.985%) in intersection + # real 1m50.539s + + # enable the trackDb entries: +# frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus447way/cactus447wayFrames.bb +# irows on + # zoom to base level in an exon to see codon displays + # appears to work OK + + # do not need this loaded table: + hgsql hg38 -e 'drop table cactus447wayFrames;' + +#########################################################################