33401c5cafc069c3f78bcecc3c0d6f2d0b77aeda hiram Wed Aug 30 14:36:08 2023 -0700 phyloP done for both all and primates subset refs #31528 diff --git src/hg/makeDb/doc/hg38/cactus447.txt src/hg/makeDb/doc/hg38/cactus447.txt index a21cfba..704e2d2 100644 --- src/hg/makeDb/doc/hg38/cactus447.txt +++ src/hg/makeDb/doc/hg38/cactus447.txt @@ -1,749 +1,1463 @@ ################################################################################ #### 2023-06-13 - fetch maf.gz - Hiram mkdir -p /hive/data/genomes/hg38/bed/cactus447/singleCover cd /hive/data/genomes/hg38/bed/cactus447/singleCover obtained maf file from Glenn, from the 'courtyard' login: /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz -rw-r--r-- 1 hickey cgl 881994218891 Jun 5 11:06 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz There was some go around with a different version. This one is 'single coverage' -rw-r--r-- 1 881994218891 Jun 5 11:06 447-mammalian-2022v1-single-copy.maf.gz ################################################################################ #### split this file into individual chromosome files 2023-06-21 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/splitSingleCover cd /hive/data/genomes/hg38/bed/cactus447/splitSingleCover time mafSplit -byTarget -useFullSequenceName /dev/null ./ ../singleCover/447-mammalian-2022v1-single-copy.maf.gz Splitting 1 files by target sequence -- ignoring first argument /dev/null real 804m35.124s user 1061m9.580s sys 101m19.894s ################################################################################ ### convert the sequence names to hg38 chrom names 2023-06-12 to 06-14 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/ucscNames cd /hive/data/genomes/hg38/bed/cactus447/ucscNames runOne script for processing each file: #!/bin/bash set -beEu -o pipefail export cactusMaf="${1}" export srcMaf="../splitSingleCover/${cactusMaf}" cactusId=${cactusMaf%.maf} export sedCommand=`grep -w "${cactusId}" 
cactus.to.ucsc.sed` export ucscName=`grep -w "${cactusId}" 2022v1.hg38.chrom.equiv.txt | cut -f2` case "${ucscName}" in chrM) if [ ${srcMaf} -nt ${ucscName}.maf ]; then sed -f cactus.to.ucsc.sed ${srcMaf} > ${ucscName}.maf touch -r ${srcMaf} ${ucscName}.maf fi ;; chr[0-9][0-9]) rm -f "${ucscName}.maf" ln ${srcMaf} "${ucscName}.maf" ;; chr[0-9]) rm -f "${ucscName}.maf" ln ${srcMaf} "${ucscName}.maf" ;; G*.maf | K*.maf) if [ ${srcMaf} -nt ${ucscName}.maf ]; then sed -e "${sedCommand}" ${srcMaf} > ${ucscName}.maf touch -r ${srcMaf} ${ucscName}.maf else printf "DONE: ${ucscName}.maf\n" 1>&2 fi ;; esac if [ "${ucscName}.maf" -nt "nameScan/${ucscName}.seqCount.txt" ]; then grep "^s." "${ucscName}.maf" | cut -d' ' -f2 | sort | uniq -c | sort -rn \ > "nameScan/${ucscName}.seqCount.txt" touch -r "${ucscName}.maf" "nameScan/${ucscName}.seqCount.txt" sed -e "s/^ \+//;" "nameScan/${ucscName}.seqCount.txt" | cut -d' ' -f2 \ | cut -d'.' -f1 | sort | uniq -c \ | sort -rn > "nameScan/${ucscName}.species.txt" fi ##################################################### template: #LOOP runOne $(path1) #ENDLOOP using list of maf files: ls ../splitSingleCover | grep maf > cactus.maf.list gensub2 cactus.maf.list single template jobList ### made mistake of running too many of those at the same time ### and their use of large amounts of memory caused hgwdev to ### hang up and needed a reboot to recover. So, run these carefully ### only a couple at a time on hgwdev and on hgcompute-01, the large ### ones get as large as 800 Gb in memory. 
### get the species names from the nameScan: sed -e 's/^ \+//;' nameScan/chr2.species.txt | cut -d' ' -f2 \ | sort > ../species.list.txt ################################################################################ ### get the chrom.sizes out of the maf files 2023-06-19 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/chromSizes cd /hive/data/genomes/hg38/bed/cactus447/chromSizes ### runOne script to run each one: #!/bin/bash export inMaf="${1}" export result="${2}" grep "^s " "${inMaf}" | awk '{printf "%s\t%s\n", $2,$6}' | sort -u > "${result}" ### working on the maf list: ls ../ucscNames/chr*.maf > maf.list ### template to construct jobList #LOOP runOne $(path1) {check out line+ result/$(root1).txt} #ENDLOOP gensub2 maf.list single template jobList para -ram=6g create jobList Completed: 101 of 101 jobs CPU time in finished jobs: 90801s 1513.35m 25.22h 1.05d 0.003 y IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y Average job time: 762s 12.69m 0.21h 0.01d Longest finished job: 7952s 132.53m 2.21h 0.09d Submission to last job: 8022s 133.70m 2.23h 0.09d ### sizes.pl script to get the answer out of the result/*.txt files: #!/usr/bin/env perl use strict; use warnings; my %chromSizes; # key is assembly name, value is a pointer to a hash # with key chromName, value size open (my $FH, "-|", "ls result/*.txt") or die "can not read ls result/*.txt"; while (my $file = <$FH>) { chomp $file; printf STDERR "# reading: %s\n", $file; open (my $R, "<", $file) or die "can not read $file"; while (my $line = <$R>) { chomp $line; my @a = split('\.', $line); if (! 
defined($chromSizes{$a[0]})) { my %h; $chromSizes{$a[0]} = \%h; } my $asm = $chromSizes{$a[0]}; if (scalar(@a) == 3) { my ($a2, $size) = split('\s+', $a[2]); my $chr = "$a[1].$a2"; if (defined($asm->{$chr})) { if ($asm->{$chr} != $size) { printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s.%s\n", $asm->{$chr}, $size, $a[0], $a[1], $a[2]; exit 255; } } else { $asm->{$chr} = $size; } # printf "%s\t%s.%s\n", $a[0], $a[1], $a[2]; } elsif (scalar(@a) == 2) { my ($chr, $size) = split('\s+', $a[1]); if (defined($asm->{$chr})) { if ($asm->{$chr} != $size) { printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s\n", $asm->{$chr}, $size, $a[0], $a[1]; exit 255; } } else { $asm->{$chr} = $size; } # printf "%s\t%s\n", $a[0], $a[1]; } else { printf STDERR "ERROR: not 2 or 3: '%s'\n", $line; exit 255; } } close ($R); } close ($FH); foreach my $asm (sort keys %chromSizes) { printf STDERR "# output %s\n", $asm; my $out = "sizes/${asm}.chrom.sizes"; open ($FH, "|-", "sort -k2,2nr > $out") or die "can not write to $out"; my $h = $chromSizes{$asm}; foreach my $chr (sort keys %$h) { # printf "%s\t%s\t%d\n", $asm, $chr, $h->{$chr}; printf $FH "%s\t%d\n", $chr, $h->{$chr}; } close ($FH); } mkdir sizes time (./sizes.pl) > sizes.log 2>&1 real 10m11.808s user 6m40.128s sys 3m37.475s ################################################################################ ### add the iRow annotation 2023-06-19 to 2023-06-26 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/iRows cd /hive/data/genomes/hg38/bed/cactus447/iRows # obtained the N.bed files from Glenn ## symLink them into this directory: ls /hive/data/genomes/hg38/bed/cactus447/Nbeds/*.N.bed | grep -v "hg38.N.bed" \ | while read P do B=`basename $P | sed -e 's/N.bed/bed/;'` rm -f ./${P} ln -s "${P}" ./${B} done ### and symlink in the chrom.sizes files as .len files ls /hive/data/genomes/hg38/bed/cactus447/chromSizes/sizes/*.chrom.sizes \ | grep -v hg38.chrom.sizes | while read P do S=`basename ${P} | sed -e 's/chrom.sizes/len/;'` 
printf "%s\n" "${S}" ln -s "${P}" ./${S} done ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len ls *.bed > nBeds ls *.len > sizes ### list of maf file to process: ls ../ucscNames/chr*.maf > maf.list ### template file to construct jobList #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit result/$(file1) #ENDLOOP gensub2 maf.list single template jobList ### took a week to get these all done. ### convert each file to a bigMaf file: mkdir /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf cd /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf ### tried to sort them, but made a mistake on the sort -k argument: for F in ../result/chr*.maf do B=`basename $F | sed -e 's/.maf//;'` mafToBigMaf hg38 "${F}" stdout | $HOME/bin/x86_64/gnusort -k2,2 -S1000G \ --parallel=32 > "${B}.bigMaf" done real 1588m59.209s user 1259m38.364s sys 371m13.448s ### put these all together: ls -S chr*.bigMaf | while read M do cat "${M}" done > ../hg38.cactus447.bigMaf real 322m25.741s user 0m13.577s sys 82m41.509s ### since they were sorted incorrectly above, sort this one huge file: $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \ hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf real 752m12.425s user 47m34.825s sys 371m7.394s ################################################################################ ### make the bigBed file from the sorted bigMaf 2023-06-27 bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \ -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \ /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb # pid=61008: VmPeak: 5682928 kB # real 4157m14.091s # that time is 69 hours ############################################################################## # liftOver files to convert sequence names (DONE - 2023-08-04 - Hiram) # # The sequence names in the assemblies in the alignment do not match # the sequence names in the corresponding genome browsers assemblies mkdir /hive/data/genomes/hg38/bed/cactus447/liftOver cd 
/hive/data/genomes/hg38/bed/cactus447/liftOver mkdir lifts ln -s ../idKeys/447way.equivalent.txt . # using this script to construct liftOver files #!/bin/bash TOP="/hive/data/genomes/hg38/bed/cactus447/liftOver" cd "${TOP}" mkdir -p lifts cat 447way.equivalent.txt | while read L do seqName=`printf "%s" "${L}" | cut -f1` asmId=`printf "%s" "${L}" | cut -f2` seqIdKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt" case ${asmId} in GCA_* | GCF_* ) gcX="${asmId:0:3}" d0="${asmId:4:3}" d1="${asmId:7:3}" d2="${asmId:10:3}" ab="/hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmId}" buildDir=`realpath "${ab}"` asmKeys="${buildDir}/trackData/idKeys/${asmId}.idKeys.txt" asmSizes="${buildDir}/${asmId}.chrom.sizes" # asmSizes="${buildDir}/idKeys/chrom.sizes" ls -og "${asmKeys}" ;; [a-zA-Z0-9]*) asmKeys="/hive/data/genomes/${asmId}/bed/idKeys/${asmId}.idKeys.txt" asmSizes="/hive/data/genomes/${asmId}/chrom.sizes" ;; *) printf "ERROR not recognized $asmId\n" 1>&2 exit 255 ;; esac seqKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt" seq2Bit="/hive/data/genomes/hg38/bed/cactus447/2bitFiles/${seqName}.2bit" if [ ! -s "${seqKeys}" ]; then printf "can not find seq idKeys\n%s\n" "${seqKeys}" 1>&2 exit 255 fi if [ ! -s "${asmKeys}" ]; then printf "can not find asm idKeys\n%s\n" "${asmKeys}" 1>&2 exit 255 fi printf "%s\t%s\n" "${asmId}" "${seqName}" 1>&2 printf "lifts/${seqName}.${asmId}.lift\n" 1>&2 printf "$seqKeys\n" 1>&2 printf "$asmKeys\n" 1>&2 printf "$asmSizes\n" 1>&2 if [ ! -s "lifts/${seqName}.${asmId}.lift" ]; then join -t$'\t' "${seqKeys}" "${asmKeys}" | sort -k3,3 \ | join -t$'\t' -2 3 <(sort -k1,1 ${asmSizes}) - \ | awk -F$'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $4, $2}' \ > lifts/${seqName}.${asmId}.lift fi done # a couple of those ended up with duplicate names in the liftOver # due to duplicate contigs in the assemblies. 
They were edited # manually, one was really bad and needed this perl script to clean it: #!/usr/bin/env perl use strict; use warnings; my %idDone; # key is sequence name, value is string of names used open (my $fh, "<", "lifts/Otolemur_garnettii.otoGar3.lift") or die "can not read lifts/Otolemur_garnettii.otoGar3.lift"; while (my $line = <$fh>) { chomp $line; my @a = split('\t', $line); if (!defined($idDone{$a[1]})) { $idDone{$a[1]} = $a[3]; printf "%s\n", $line; } else { $idDone{$a[1]} .= "," . $a[3]; } } close ($fh); foreach my $seq (sort keys %idDone) { printf STDERR "%s\t%s\n", $seq, $idDone{$seq}; } ############################################################################## # MAF FRAMES (working - 2023-08-01 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/cactus447/frames cd /hive/data/genomes/hg38/bed/cactus447/frames mkdir genes # survey all the genomes to find out what kinds of gene tracks they have printf '#/bin/bash egrep -v "GCA_|GCF_" ../idKeys/447way.equivalent.txt | cut -f2 | while read db do printf "# ${db}:" for table in `hgsql -N -e "show tables;" $db | egrep "Gene|ncbiRefSeq" | egrep -v "Curated|Cds|Link|Other|PepTable|Predicted|Psl|wgEncode|gencode|uuGene|ToGeneName|ccds|encode|Ext|Old|Pep|Select|Bak|Verify|orfeome|Multi|transMap|Pfam|knownGeneMrna|sgpGene"` do count=`hgsql -N -e "select count(*) from $table;" $db` printf " %%s: %%s" "${table}" "${count}" done printf "\\n" done ' > showGenes.sh chmod +x ./showGenes.sh ./showGenes.sh > gene.survey.txt 2>&1 & # most of them have ncbiRefSeq grep ncbiRefSeq gene.survey.txt | cut -d' ' -f2 | sed -e 's/://;' \ | sort | xargs echo | fold -s -w 72 # bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 # mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 # rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1 # a couple do not: grep -v ncbiRefSeq gene.survey.txt | cut -d' ' -f2 | sed -e 's/://;' \ | sort | xargs echo | fold -s -w 72 # cavApe1 eidHel1 ptePar1 # 
but they do have ensGene or augustusGene: egrep "cavApe1|eidHel1|ptePar1" gene.survey.txt # cavApe1: augustusGene: 10114 ensGene: 22510 xenoRefGene: 322224 # eidHel1: augustusGene: 49674 # ptePar1: augustusGene: 68613 # 1. ncbiRefSeq - add hg38 here manually since it didn't come in above # for db in hg38 # do # asmName="${db}" for db in bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1 do asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${db} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${asmName}.gp.gz echo -n "# ${asmName}: " genePredCheck -db=${db} genes/${asmName}.gp.gz done # hg38: checked: 22882 failed: 0 # Bos_mutus: checked: 20457 failed: 0 # CanFam4: checked: 21143 failed: 0 # Chinchilla_lanigera: checked: 20482 failed: 0 # Colobus_angolensis: checked: 20042 failed: 0 # Dipodomys_ordii: checked: 19738 failed: 0 # Elephantulus_edwardii: checked: 20384 failed: 0 # Felis_catus: checked: 20051 failed: 0 # Fukomys_damarensis: checked: 19811 failed: 0 # Microtus_ochrogaster: checked: 20048 failed: 0 # Mus_musculus: checked: 23375 failed: 0 # Myotis_brandtii: checked: 19949 failed: 0 # Myotis_davidii: checked: 18815 failed: 0 # Myotis_lucifugus: checked: 19895 failed: 0 # Odobenus_rosmarus: checked: 19346 failed: 0 # Orcinus_orca: checked: 19136 failed: 0 # Otolemur_garnettii: checked: 19536 failed: 0 # Propithecus_coquerelli: checked: 19356 failed: 0 # Pteropus_alecto: checked: 18326 failed: 0 # Macaca_mulatta: checked: 21021 failed: 0 # Sorex_araneus: checked: 19160 failed: 0 # Ictidomys_tridecemlineatus: checked: 19892 failed: 0 # Sus_scrofa: checked: 24180 failed: 0 # Carlito_syrichta: checked: 19968 failed: 0 # Tupaia_chinensis: checked: 21047 failed: 0 # 2. 
ensGene for db in cavApe1 do asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${db} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${db}.tmp.gz mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz echo -n "# ${asmName}: " genePredCheck -db=${db} genes/${asmName}.gp.gz done # Cavia_aperea: checked: 14182 failed: 0 # 3. augustusGene for db in eidHel1 ptePar1 do asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from augustusGene" ${db} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${db}.tmp.gz mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz echo -n "# ${asmName}: " genePredCheck -db=${db} genes/${asmName}.gp.gz done # Eidolon_helvum: checked: 31495 failed: 0 # Pteronotus_parnellii: checked: 47678 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do printf "# $T: " zcat $T | cut -f1 | sort -u | wc -l done # genes/Bos_mutus.gp.gz: 20457 # genes/CanFam4.gp.gz: 21143 # genes/Canis_lupus_familiaris.gp.gz: 19812 # genes/Carlito_syrichta.gp.gz: 19968 # genes/Cavia_aperea.gp.gz: 14182 # genes/Cercocebus_atys.gp.gz: 20534 # genes/Chinchilla_lanigera.gp.gz: 20481 # genes/Colobus_angolensis.gp.gz: 20042 # genes/Condylura_cristata.gp.gz: 17842 # genes/Dipodomys_ordii.gp.gz: 19738 ... 
# genes/Pteropus_vampyrus.gp.gz: 19485 # genes/Rattus_norvegicus.gp.gz: 22854 # genes/Rhinopithecus_roxellana.gp.gz: 21027 # genes/Saimiri_boliviensis.gp.gz: 20933 # genes/Sorex_araneus.gp.gz: 19160 # genes/Sus_scrofa.gp.gz: 24043 # genes/Theropithecus_gelada.gp.gz: 20592 # genes/Tupaia_chinensis.gp.gz: 21047 # And, can pick up ncbiRefSeq for GCF equivalent assemblies for those # outside the UCSC database set of genomes grep GCF_ ../idKeys/447way.equivalent.txt | while read dbGCF do asmName=`printf "%s" "${dbGCF}" | cut -f1` asmId=`printf "%s" "${dbGCF}" | cut -f2` gcX="${asmId:0:3}" d0="${asmId:4:3}" d1="${asmId:7:3}" d2="${asmId:10:3}" buildDir="/hive/data/genomes/asmHubs/refseqBuild/$gcX/$d0/$d1/$d2/${asmId}" if [ ! -d "${buildDir}" ]; then printf "not found: %s\n" "${buildDir}" else ncbiRefSeqGp="${buildDir}/trackData/ncbiRefSeq/process/${asmId}.ncbiRefSeq.gp" sizes="${buildDir}/${asmId}.chrom.sizes" if [ -s "${ncbiRefSeqGp}" ]; then genePredSingleCover "${ncbiRefSeqGp}" stdout | gzip -c > genes/${asmName}.gp.gz printf "# %s: " "${asmName}" genePredCheck -chromSizes="${sizes}" genes/${asmName}.gp.gz else printf "MISSING: ${ncbiRefSeqGp}\n" 1>&2 fi fi done # verify counts for genes are reasonable: for T in genes/*.gz do printf "# $T: " zcat $T | cut -f1 | sort -u | wc -l done # genes/Bos_mutus.gp.gz: 20457 # genes/CanFam4.gp.gz: 21143 # genes/Canis_lupus_familiaris.gp.gz: 19812 # genes/Carlito_syrichta.gp.gz: 19968 # genes/Cavia_aperea.gp.gz: 14182 # genes/Cercocebus_atys.gp.gz: 20534 # genes/Chinchilla_lanigera.gp.gz: 20481 # genes/Colobus_angolensis.gp.gz: 20042 # genes/Condylura_cristata.gp.gz: 17842 # genes/Dipodomys_ordii.gp.gz: 19738 ... 
# genes/Pteropus_vampyrus.gp.gz: 19485 # genes/Rattus_norvegicus.gp.gz: 22854 # genes/Rhinopithecus_roxellana.gp.gz: 21027 # genes/Saimiri_boliviensis.gp.gz: 20933 # genes/Sorex_araneus.gp.gz: 19160 # genes/Sus_scrofa.gp.gz: 24043 # genes/Theropithecus_gelada.gp.gz: 20592 # genes/Tupaia_chinensis.gp.gz: 21047 # these gene predictions need lifting to change the sequence names mv genes genes.seqToAsm mkdir genes cd genes.seqToAsm for F in *.gp.gz do seqName=`echo $F | sed -e 's/.gp.gz//;'` liftFile="`ls -d /hive/data/genomes/hg38/bed/cactus447/liftOver/lifts/${seqName}.*.lift 2> /dev/null`" if [ -s "${liftFile}" ]; then printf "%s\n" "${seqName}" 1>&2 liftUp -type=.genepred stdout "${liftFile}" warn "${F}" | gzip -c \ > "../genes/${seqName}.gp.gz" else printf "can not find lift file for $seqName\n" 1>&2 fi done # kluster job to annotate each maf file screen -S hg38 # manage long running procedure with screen ssh ku cd /hive/data/genomes/hg38/bed/cactus447way/frames printf '#!/bin/bash set -beEu -o pipefail export C="${1}" export G="${2}" cat ../iRows/result/${C}.maf | genePredToMafFrames hg38 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz ' > runOne chmod +x runOne ls ../iRows/result | grep maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list printf '#LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP ' > template mkdir parts # run on the ku kluster gensub2 chr.list gene.list template jobList para -ram=128g create jobList para try ... check ... 
push # Completed: 4444 of 4444 jobs # CPU time in finished jobs: 3955725s 65928.75m 1098.81h 45.78d 0.125 y # IO & Wait Time: 1735948s 28932.46m 482.21h 20.09d 0.055 y # Average job time: 1281s 21.35m 0.36h 0.01d # Longest finished job: 34637s 577.28m 9.62h 0.40d # Submission to last job: 55372s 922.87m 15.38h 0.64d # collect all results into one file: cd /hive/data/genomes/hg38/bed/cactus447way/frames find ./parts -type f | while read F; do zcat ${F} | awk '11 == NF' done | sort -k1,1 -k2,2n > cactus447wayFrames.bed bedToBigBed -type=bed3+7 -as=$HOME/kent/src/hg/lib/mafFrames.as \ -tab cactus447wayFrames.bed ../../../chrom.sizes cactus447wayFrames.bb bigBedInfo cactus447wayFrames.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 11 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 # itemCount: 16,063,019 # primaryDataSize: 136,782,205 # primaryIndexSize: 1,013,720 # zoomLevels: 10 # chromCount: 82 # basesCovered: 65,486,909 # meanDepth (of bases covered): 22.709888 # minDepth: 1.000000 # maxDepth: 44.000000 # std of depth: 18.692097 # -rw-rw-r-- 1 1192312943 Aug 5 15:28 cactus447wayFrames.bed # -rw-rw-r-- 1 166572857 Aug 5 15:29 cactus447wayFrames.bb gzip cactus447wayFrames.bed # verify there are frames on everything expected. Since the # frames on the HL* assemblies did not work due to sequence name # differences, this won't be everything. # (ls genes | wc shows 44 'expected') zcat cactus447wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' > species.check.list wc -l species.check.list # 44 # 391443 Bos_mutus # 414721 CanFam4 # 417196 Canis_lupus_familiaris # 328151 Carlito_syrichta # 255097 Cavia_aperea # 373053 Cercocebus_atys # 398356 Chinchilla_lanigera # 350442 Colobus_angolensis # 346524 Condylura_cristata # 363333 Dipodomys_ordii ... 
# 379732 Pteropus_alecto # 387484 Pteropus_vampyrus # 389056 Rattus_norvegicus # 360929 Rhinopithecus_roxellana # 366019 Saimiri_boliviensis # 336296 Sorex_araneus # 402603 Sus_scrofa # 357418 Theropithecus_gelada # 372558 Tupaia_chinensis # 195495 hg38 # load the resulting file, merely for measurement purposes here ssh hgwdev cd /hive/data/genomes/hg38/bed/cactus447way/frames time hgLoadMafFrames hg38 cactus447wayFrames cactus447wayFrames.bed.gz # real 3m14.979s hgsql -e 'select count(*) from cactus447wayFrames;' hg38 # +----------+ # | count(*) | # +----------+ # | 16063019 | # +----------+ time featureBits -countGaps hg38 cactus447wayFrames # 65486909 bases of 3299210039 (1.985%) in intersection # real 1m50.539s # enable the trackDb entries: # frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus447way/cactus447wayFrames.bb # irows on # zoom to base level in an exon to see codon displays # appears to work OK # do not need this loaded table: hgsql hg38 -e 'drop table cactus447wayFrames;' ######################################################################### +# Try loading this track data into database to see if download files +# such as upstream gene mafs can be constructed + mkdir -p /gbdb/hg38/cactus447way/maf + cd /hive/data/genomes/hg38/bed/cactus447/iRows/result + ln -s `pwd`/chr*.maf /gbdb/hg38/cactus447way/maf/ + + # this generates an immense cactus447way.tab file in the directory + # where it is running. Best to run this over in scratch. + # This is going to take all day. 
+ cd /dev/shm + time hgLoadMaf -pathPrefix=/gbdb/hg38/cactus447way/maf hg38 cactus447way + +# -rw-rw-r-- 1 55422976 Aug 8 09:22 cactus447way.tab + + hgsql -e 'select count(*) from cactus447way;' hg38 + + # Loading cactus447way into database + # Loaded 108225530 mafs in 101 files from /gbdb/hg38/cactus447way/maf + + # real 440m30.144s + # user 404m46.183s + # sys 19m27.664s + +######################################################################### +# Phylogenetic tree from 447-way (DONE - 2022-08-26 - Hiram) + mkdir /hive/data/genomes/hg38/bed/cactus447/4d + cd /hive/data/genomes/hg38/bed/cactus447/4d + + # the annotated maf's are in: + ../iRows/result/chr*.maf + + # using ncbiRefSeq for hg38, only transcribed genes and nothing + # from the randoms and other misc. + hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" hg38 \ + | egrep -E -v "chrM|chrUn|random|_alt|_fix" > ncbiRefSeq.gp + wc -l *.gp + # 130420 ncbiRefSeq.gp + + # verify it is only on the chroms: + cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' + # 12841 chr1 + # 9518 chr2 + # 8490 chr3 + # 7521 chr19 + # 7519 chr11 + # 7329 chr17 + # 7287 chr12 + # 6155 chr6 + # 6151 chr10 + # 5905 chr7 + # 5561 chr5 + # 5504 chr9 + # 5351 chr4 + # 5235 chr16 + # 4964 chr8 + # 4296 chrX + # 4255 chr15 + # 3959 chr14 + # 3089 chr20 + # 2721 chr22 + # 2521 chr18 + # 2471 chr13 + # 1411 chr21 + # 366 chrY + + genePredSingleCover ncbiRefSeq.gp stdout | sort > ncbiRefSeqNR.gp + wc -l ncbiRefSeqNR.gp + # 19542 ncbiRefSeqNR.gp + + ssh ku + mkdir /hive/data/genomes/hg38/bed/cactus447/4d/run + cd /hive/data/genomes/hg38/bed/cactus447/4d/run + mkdir ../mfa + + # newer versions of msa_view have a slightly different operation + # the sed of the gp file inserts the reference species in the chr name + # these processes are going to be copying the whole chromosome maf file + # to /scratch/tmp/ - these are huge files, make 
sure /scratch/tmp/ + # on the kluster nodes is clean enough to manage the largest chromosome + # maf file, for example: +# -rw-rw-r-- 1 661829442743 Jun 20 18:55 chr1.maf +# -rw-rw-r-- 1 682410918389 Jun 21 16:21 chr2.maf + + cat << '_EOF_' > 4d.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin +set r = "/hive/data/genomes/hg38/bed/cactus447" +set c = $1 +set infile = $r/iRows/result/$2 +set outfile = $3 +cd /scratch/tmp +# 'clean' maf, removes all chrom names, leaves only the db name +perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf +awk -v C=$c '$2 == C {print}' $r/4d/ncbiRefSeqNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp +set NL=`wc -l $c.gp| gawk '{print $1}'` +if ("$NL" != "0") then + $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss + $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile +else + echo "" > $r/4d/run/$outfile +endif +rm -f $c.gp $c.maf $c.ss +'_EOF_' + # << happy emacs + chmod +x 4d.csh + + ls -1S /hive/data/genomes/hg38/bed/cactus447/iRows/result/chr*.maf \ + | sed -e "s#.*cactus447/iRows/result/##" \ + | egrep -E -v "chrM|chrUn|random|_alt|_fix" > maf.list + + printf '#LOOP +4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} +#ENDLOOP +' > template + + gensub2 maf.list single template jobList + para -ram=128g create jobList + para try ... check ... push ... etc... 
+ para time +# CPU time in finished jobs: 290959s 4849.31m 80.82h 3.37d 0.009 y +# IO & Wait Time: 17027s 283.79m 4.73h 0.20d 0.001 y +# Average job time: 15399s 256.65m 4.28h 0.18d +# Longest finished job: 36250s 604.17m 10.07h 0.42d +# Submission to last job: 36265s 604.42m 10.07h 0.42d + + # combine mfa files + ssh hgwdev + cd /hive/data/genomes/hg38/bed/cactus447/4d + # verify no tiny files: + ls -og mfa | sort -k3nr | tail -3 + # -rw-rw-r-- 1 2583636 Aug 8 10:30 chr21.mfa + # -rw-rw-r-- 1 1759815 Aug 8 09:47 chrY.mfa + + #want comma-less species.list + time /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/msa_view \ + --aggregate "`cat ../species.list.txt`" mfa/*.mfa | sed s/"> "/">"/ \ + > 4d.all.mfa + # real 0m9.416s + # -rw-rw-r-- 1 303080280 Aug 8 21:48 4d.all.mfa + + # check they are all in there: + grep "^>" 4d.all.mfa | wc -l + # 447 + + sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + ../hg38.447way.nh + + sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + ../hg38.447way.nh > tree-commas.nh + + # use phyloFit to create tree model (output is phyloFit.mod) + time /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod REV \ + --tree tree-commas.nh 4d.all.mfa + # real 880m32.863s + + mv phyloFit.mod all.mod + + grep TREE all.mod +# TREE: (((((((((((((hg38:0.00899231,(Pan_paniscus:0.00233221, +# Pan_troglodytes:0.00214362):0.00450706):0.00200006, +# (Gorilla_gorilla:0.00120794, +# Gorilla_beringei:0.00131286):0.00731999):0.00848699, +# (Pongo_abelii:0.00175881,Pongo_pygmaeus:0.0021602):0.015206):0.00273536, +# (((((Hylobates_lar:0.000904688,Hylobates_pileatus:0.000921279):0.00271202, +# (((Hylobates_abbotti:0.00182513,Hylobates_muelleri:0.00188051):0.000678509, +# Hylobates_agilis:0.00217126):0.00068124, +# ... 
+# (Ceratotherium_simum:0.000693402, +# Ceratotherium_simum_cottoni:0.000665708):0.00612505):0.00973125):0.0392953):0.0100285):0.0323073):0.00299066):0.00383198):0.00906124):0.0213096):0.012194, +# (((Dasypus_novemcinctus:0.0705749,(Tolypeutes_matacus:0.036879, +# Chaetophractus_vellerosus:0.0367115):0.0193664):0.0421356, +# ((Tamandua_tetradactyla:0.0243613, +# Myrmecophaga_tridactyla:0.0216596):0.0822034, +# (Choloepus_didactylus:0.0057887, +# Choloepus_hoffmanni:0.00621468):0.0702582):0.018824):0.0529836, +# (((Trichechus_manatus:0.0621184,(Procavia_capensis:0.0107201, +# Heterohyrax_brucei:0.0105857):0.149548):0.00440411, +# Loxodonta_africana:0.0800445):0.0234193, +# ((((Microgale_talazaci:0.118008,Echinops_telfairi:0.077048):0.158928, +# Chrysochloris_asiatica:0.138211):0.0147081, +# Elephantulus_edwardii:0.20827):0.00712897, +# Orycteropus_afer:0.112233):0.00664842):0.0534765):0.012194); + + # compare these calculated lengths to what we started with + + /cluster/bin/phast/all_dists ../hg38.447way.nh | grep hg38 \ + | sed -e "s/hg38.//;" | sort > original.dists + + grep TREE all.mod | sed -e 's/TREE: //;' \ + | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 \ + | sed -e "s/hg38.//;" | sort > hg38.dists + + # printing out the 'original', the 'new' the 'difference' and + # percent difference/delta + join original.dists hg38.dists | awk '{ + printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n | cut -f2,3,4,6 | sort -k4,4n +Chiropotes_albinasus 0.044221 0.153018 -71.100786 +Hapalemur_gilberti 0.091144 0.238983 -61.861722 +Hapalemur_meridionalis 0.091511 0.239866 -61.849116 +Hapalemur_occidentalis 0.090759 0.237751 -61.826028 +Hapalemur_griseus 0.091379 0.239368 -61.824889 +Hapalemur_alaotrensis 0.091149 0.238685 -61.812012 +Eulemur_rufus 0.090624 0.236415 -61.667407 +Eulemur_fulvus 0.090508 0.236040 -61.655652 +Lemur_catta 0.090748 0.236490 -61.627130 +... 
+Tolypeutes_matacus 0.518057 0.332742 55.693300 +Chaetophractus_vellerosus 0.520177 0.332575 56.408930 +Catagonus_wagneri 0.578187 0.366272 57.857275 +Megaderma_lyra 0.575967 0.360862 59.608659 +Spermophilus_dauricus 0.530847 0.329338 61.186076 +Choloepus_didactylus 0.537227 0.329232 63.175815 +Vicugna_pacos 0.553177 0.338170 63.579561 +Choloepus_hoffmanni 0.539357 0.329658 63.611076 +Tursiops_truncatus 0.533317 0.325632 63.779051 +Myrmecophaga_tridactyla 0.589117 0.357048 64.996583 + + # also calculated with SSREV model: + mkdir /cluster/data/hg38/bed/cactus447/4dSSREV + cd /cluster/data/hg38/bed/cactus447/4dSSREV + cp -p ../4d/4d.all.mfa . + cp -p ../4d/tree-commas.nh . + time /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod SSREV \ + --tree tree-commas.nh 4d.all.mfa + # real 1406m51.018s + + mv phyloFit.mod all.mod + grep BACK all.mod + # BACKGROUND: 0.254978 0.245022 0.245022 0.254978 + +######################################################################### +# phyloP for 447-way (DONE - 2017-11-06 - Hiram) +# + # split SS files into 1M chunks, this business needs smaller files + # to complete + + ssh ku + mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP + cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP + mkdir ss run.split + cd run.split + + # the annotated maf's are in: + ../iRows/result/chr*.maf + + printf '#!/bin/csh -ef +set c = $1 +set MAF = /hive/data/genomes/hg38/bed/cactus447/iRows/result/$c.maf +set WINDOWS = /hive/data/genomes/hg38/bed/cactus447/consPhyloP/ss/$c +set NL = `grep -m 10 -c -v "^#" $MAF` +if ( -s $2 ) then + exit 0 +endif +if ( -s $2.running ) then + exit 0 +endif + +date >> $2.running + +rm -fr $WINDOWS +mkdir -p $WINDOWS +pushd $WINDOWS > /dev/null +if ( $NL > 0 ) then +/cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/msa_split \ + $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 +endif +popd > /dev/null +date >> $2 +rm -f 
$2.running +' > doSplit.csh + + chmod +x doSplit.csh + + # do the easy ones first to see some immediate results + ls -1SL -r ../../iRows/result | grep chr | sed -e "s/.maf//;" > maf.list + + # this needs a {check out line+ $(root1.done)} test for verification: + printf '#LOOP +./doSplit.csh $(root1) {check out line+ $(root1).done} +#ENDLOOP +' > template + + gensub2 maf.list single template jobList + # this was tricky, some of the jobs couldn't complete even with + # 128g, had to complete a number of these manually, and they + # take a long time, more than 12 hours for individual chromosomes + para -ram=128g create jobList + para try ... check ... push ... etc... + +# Completed: 79 of 101 jobs +# Crashed: 22 jobs +# CPU time in finished jobs: 8058s 134.29m 2.24h 0.09d 0.000 y +# IO & Wait Time: 380s 6.34m 0.11h 0.00d 0.000 y +# Average job time: 107s 1.78m 0.03h 0.00d +# Longest finished job: 3239s 53.98m 0.90h 0.04d +# Submission to last job: 5350s 89.17m 1.49h 0.06d + + # run phyloP with score=LRT + ssh ku + mkdir /cluster/data/hg38/bed/cactus447/consPhyloP + cd /cluster/data/hg38/bed/cactus447/consPhyloP + + mkdir run.phyloP + cd run.phyloP + # Adjust model file base composition background and rate matrix to be + # representative of the chromosomes in play + grep BACK ../../4dSSREV/all.mod + # BACKGROUND: 0.254978 0.245022 0.245022 0.254978 + # interesting, they are already paired up + + grep BACKGROUND ../../4dSSREV/all.mod | awk '{printf "%0.3f\n", $3 + $4}' + # 0.490 + /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/modFreqs \ + ../../4dSSREV/all.mod 0.490 > all.mod + # verify, the BACKGROUND should now be paired up: + grep BACK all.mod + # BACKGROUND: 0.255000 0.245000 0.245000 0.255000 + + printf '#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin +set f = $1 +set ssFile = $1:t +set out = $2 +set cName = $f:h +set n = $f:r:e +set grp = $cwd:t +set cons = /hive/data/genomes/hg38/bed/cactus447/consPhyloP +set tmp 
= $cons/tmp/$grp/$f +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +set ssSrc = "$cons/ss/$cName/$ssFile" +set useGrp = "$grp.mod" +/bin/ln -s $cons/run.phyloP/$grp.mod $tmp +pushd $tmp > /dev/null +echo source: $ssSrc.ss +$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ + -i SS $useGrp $ssSrc.ss > $ssFile.wigFix +popd > /dev/null +/bin/mkdir -p $out:h +sleep 4 +/bin/touch $out:h +/bin/mv $tmp/$ssFile.wigFix $out +/bin/rm -fr $tmp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp +' > doPhyloP.csh + + chmod +x doPhyloP.csh + + # Create list of chunks + find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list + # make sure the list looks good + wc -l ss.list + # 3031 ss.list + + # Create template file + # file1 == $chr/$chunk/file name without .ss suffix + printf '#LOOP +../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} +#ENDLOOP +' > template + + ###################### Running all species ####################### + # setup run for all species + mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP/all + cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP/all + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + # beware overloading the cluster with these quick and high I/O jobs + para -ram=64g create jobList + para try ... check ... 
+ para -maxJob=16 push + para time > run.time +Completed: 3031 of 3031 jobs +CPU time in finished jobs: 28585172s 476419.53m 7940.33h 330.85d 0.906 y +IO & Wait Time: 132590s 2209.83m 36.83h 1.53d 0.004 y +Average job time: 9475s 157.91m 2.63h 0.11d +Longest finished job: 13733s 228.88m 3.81h 0.16d +Submission to last job: 38568s 642.80m 10.71h 0.45d + + mkdir downloads + time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` +do + echo "working: $D" 1>&2 + find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phyloP447way.wigFix.gz +done + # real 34m31.522s + + du -hsc downloads + # 5.4G downloads + + # check integrity of data with wigToBigWig + zcat downloads/*.wigFix.gz > all.wigFix + time (wigToBigWig -verbose=2 all.wigFix /hive/data/genomes/hg38/chrom.sizes \ + phyloP447way.bw) > bigWig.log 2>&1 + + egrep "real|VmPeak" bigWig.log + # pid=236037: VmPeak: 33037308 kB + # real 25m42.759s + + bigWigInfo phyloP447way.bw | sed -e 's/^/# /;' +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 7,824,240,203 +# primaryIndexSize: 90,693,508 +# zoomLevels: 10 +# chromCount: 98 +# basesCovered: 2,874,168,445 +# mean: -0.022678 +# min: -20.000000 +# max: 8.123000 +# std: 1.692685 + + # encode those files into wiggle data +# time (zcat downloads/*.wigFix.gz \ + wigEncode all.wigFix phyloP447way.wig phyloP447way.wib +# Converted all.wigFix, upper limit 8.12, lower limit -20.00 +# real 8m8.231s + + +# -rw-rw-r-- 1 2874168445 Aug 30 14:08 phyloP447way.wib +# -rw-rw-r-- 1 298978228 Aug 30 14:08 phyloP447way.wig + + du -hsc *.wi? + # 2.7G phyloP447way.wib + # 286M phyloP447way.wig + + # Load gbdb and database with wiggle. 
+ rm -f /gbdb/hg38/cactus447way/phyloP447way.wib + ln -s `pwd`/phyloP447way.wib /gbdb/hg38/cactus447way/phyloP447way.wib + time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus447way hg38 \ + phyloP447way phyloP447way.wig + # real 0m27.533s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phyloP447way +# db.table min max mean count sumData +hg38.phyloP447way -20 8.123 -0.022678 2874168445 -6.51804e+07 +# 1.69269 viewLimits=-8.4861:8.123 +# stdDev viewLimits + + # that range is: 20+8.123= 28.123 for hBinSize=0.028123 + + # Create histogram to get an overview of all the data + time hgWiggle -doHistogram \ + -hBinSize=0.028123 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -db=hg38 phyloP447way > histogram.data 2>&1 + # real 1m39.861s + + + # x-axis range: + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ + | sed -e 's/^/# /;' +# Q1 -10.086650 +# median -4.012075 +# Q3 2.062495 +# average -4.017054 +# min -20.000000 +# max 8.123000 +# count 864 +# total -3470.734684 +# standard deviation 7.028195 + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ + | sed -e 's/^/# /;' +# Q1 0.000016 +# median 0.000074 +# Q3 0.000431 +# average 0.001157 +# min 0.000000 +# max 0.025240 +# count 864 +# total 1.000001 +# standard deviation 0.003283 + + # create plot of histogram: + printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" +set output "hg38.phyloP447.histo.png" +set size 1.0, 1.0 +set style line 1 lt 2 lc rgb "#ff88ff" lw 2 +set style line 2 lt 2 lc rgb "#66ff66" lw 2 +set style line 3 lt 2 lc rgb "#ffff00" lw 2 +set style line 4 lt 2 lc rgb "#ffffff" lw 2 +set border lc rgb "#ffff00" +set key left box ls 3 +set key tc variable +set grid noxtics +set grid ytics ls 4 +set y2tics border nomirror out tc rgb "#ffffff" +set ytics border nomirror out tc rgb "#ffffff" +set title 
" Human hg38 Histogram phyloP447way track" tc rgb "#ffffff" +set xlabel " hg38.phyloP447way score" tc rgb "#ffffff" +set ylabel " Relative Frequency" tc rgb "#ff88ff" +set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" +set xrange [-4:2.5] +set yrange [0:0.04] +set y2range [0:1] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 +' | gnuplot + + # verify it looks sane + display hg38.phyloP447.histo.png & + # it now shows the spike at -1.000 for the artifical filling of + # no data available. + + ###################### Running primate species ####################### + # computing 4d-sites phyloFit for primates + mkdir /cluster/data/hg38/bed/cactus447/4d/primates + cd /cluster/data/hg38/bed/cactus447/4d/primates + # the primates.list.txt was obtained from the hg38.447way.nh + # tree by taking the first 243 in that tree, ending in: + head -243 ../../hg38.447way.nh | tail -1 + Galagoides_demidoff:0.00802126):0.0124128):0.0334321):0.0146219):0.105, + + head -243 ../../hg38.447way.nh | sed -e 's/^ \+//;' \ + | tr -d '(' | sed -e 's/:.*//;' | sort > primates.list.txt + + # then only using the mfa sequences for those specific sequences + ls ../mfa/chr*.mfa | while read C +do + B=`basename $C` + faSomeRecords "${C}" primates.list.txt ${B} + printf "%s\n" "${B}" +done + # resulting in: + faSize chr*.mfa +# 151201516 bases (18902 N's 151182614 real 151182614 upper 0 lower) +# in 5832 sequences in 24 files + + # then running phyloFit: + sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' primates.nh > tree-commas.nh + + # note: using the SSREV model here: + /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod SSREV \ + --tree tree-commas.nh primates.mfa + # rename the result + mv phyloFit.mod primates.mod + + # adjust the frequencies + cd /cluster/data/hg38/bed/cactus447/consPhyloP/run.phyloP + grep BACK 
../../4d/primates/primates.mod + # BACKGROUND: 0.261935 0.238065 0.238065 0.261935 + # interesting, they are already paired up + + grep BACK ../../4d/primates/primates.mod | awk '{printf "%0.3f\n", $3 + $4}' + # 0.476 + /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/modFreqs \ + ../../4d/primates/primates.mod 0.476 > primates.mod + # verify, the BACKGROUND should now be paired up: + grep BACK all.mod + # BACKGROUND: 0.262000 0.238000 0.238000 0.262000 + + # setup run for primate species + mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP/primates + cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP/primates + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + para -ram=6g create jobList + para try ... check ... + para -maxJob=16 push + para time > run.time +Completed: 2438 of 2438 jobs +CPU time in finished jobs: 18916928s 315282.13m 5254.70h 218.95d 0.600 y +IO & Wait Time: 68909s 1148.49m 19.14h 0.80d 0.002 y +Average job time: 7787s 129.79m 2.16h 0.09d +Longest finished job: 12050s 200.83m 3.35h 0.14d +Submission to last job: 26707s 445.12m 7.42h 0.31d + + mkdir downloads + time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` +do + echo "working: $D" 1>&2 + find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phyloP447wayPrimates.wigFix.gz +done + # real 33m18.006s + + du -hsc downloads + # 4.9G downloads + + # check integrity of data with wigToBigWig + zcat downloads/*.wigFix.gz > primates.wigFix + time (wigToBigWig -verbose=2 primates.wigFix \ + /hive/data/genomes/hg38/chrom.sizes \ + phyloP447wayPrimates.bw) > bigWig.log 2>&1 + + egrep "real|VmPeak" bigWig.log + # pid=236355: VmPeak: 33037308 kB + # real 26m17.042s + + bigWigInfo phyloP447wayPrimates.bw | sed -e 's/^/# /;' +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 6,954,238,655 +# primaryIndexSize: 90,693,508 +# 
zoomLevels: 10 +# chromCount: 98 +# basesCovered: 2,874,168,445 +# mean: -0.066955 +# min: -20.000000 +# max: 1.587000 +# std: 1.479093 + + # encode those files into wiggle data +# time (zcat downloads/*.wigFix.gz \ + time wigEncode primates.wigFix phyloP447wayPrimates.wig phyloP447wayPrimates.wib +# Converted primates.wigFix, upper limit 1.59, lower limit -20.00 +# real 9m17.113s +XXX - running - Wed Aug 30 13:59:09 PDT 2023 + +# -rw-rw-r-- 1 2874168445 Aug 30 14:08 phyloP447wayPrimates.wib +# -rw-rw-r-- 1 320006223 Aug 30 14:08 phyloP447wayPrimates.wig + + du -hsc *.wi? + # 2.7G phyloP447way.wib + # 306M phyloP447way.wig + + # Load gbdb and database with wiggle. + rm -f /gbdb/hg38/cactus447way/phyloP447wayPrimates.wib + ln -s `pwd`/phyloP447wayPrimates.wib /gbdb/hg38/cactus447way/phyloP447wayPrimates.wib + time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus447way hg38 \ + phyloP447wayPrimates phyloP447wayPrimates.wig + # real 0m27.140s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phyloP447wayPrimates +# db.table min max mean count sumData +hg38.phyloP447wayPrimates -20 1.587 -0.0669553 2874168445 -1.92441e+08 +# 1.47909 viewLimits=-7.46242:1.587 +# stdDev viewLimits + + # that range is: 20+1.587= 21.587 for hBinSize=0.021587 + + # Create histogram to get an overview of all the data + time hgWiggle -doHistogram \ + -hBinSize=0.021587 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -db=hg38 phyloP447wayPrimates > histogram.data 2>&1 + # real 1m30.401s + # x-axis range: + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ + | sed -e 's/^/# /;' +# Q1 -11.645800 +# median -7.231290 +# Q3 -2.816750 +# average -7.238782 +# min -20.000000 +# max 1.565410 +# count 818 +# total -5921.323540 +# standard deviation 5.110470 + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ + | sed -e 's/^/# /;' +# Q1 0.000007 +# median 0.000032 
+# Q3 0.000340 +# average 0.001237 +# min 0.000001 +# max 0.025761 +# count 818 +# total 1.012033 +# standard deviation 0.003386 + + # create plot of histogram: + printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" +set output "hg38.phyloP447Primates.histo.png" +set size 1.0, 1.0 +set style line 1 lt 2 lc rgb "#ff88ff" lw 2 +set style line 2 lt 2 lc rgb "#66ff66" lw 2 +set style line 3 lt 2 lc rgb "#ffff00" lw 2 +set style line 4 lt 2 lc rgb "#ffffff" lw 2 +set border lc rgb "#ffff00" +set key left box ls 3 +set key tc variable +set grid noxtics +set grid ytics ls 4 +set y2tics border nomirror out tc rgb "#ffffff" +set ytics border nomirror out tc rgb "#ffffff" +set title " Human hg38 Histogram phyloP447wayPrimates track" tc rgb "#ffffff" +set xlabel " hg38.phyloP447wayPrimates score" tc rgb "#ffffff" +set ylabel " Relative Frequency" tc rgb "#ff88ff" +set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" +set xrange [-3:2] +set yrange [0:0.04] +set y2range [0:1] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 +' | gnuplot + + # verify it looks sane + display hg38.phyloP447Primates.histo.png & + # it now shows the spike at -1.000 for the artificial filling of + # no data available. + +#############################################################################