52e47105152b7e5dd2621b9e8439446e0d77336f braney Mon Oct 23 12:01:11 2023 -0700 build Jaspar 2024 motif table diff --git src/hg/makeDb/doc/hgFixed.txt src/hg/makeDb/doc/hgFixed.txt index 54518c8..b226141 100644 --- src/hg/makeDb/doc/hgFixed.txt +++ src/hg/makeDb/doc/hgFixed.txt @@ -1079,15 +1079,26 @@ sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted' ##### # Jaspar 2022 PFM (DONE 2021/12/08 braney) ##### mkdir -p /hive/data/outside/jaspar/2022/all cd /hive/data/outside/jaspar/2022/all wget "https://jaspar.genereg.net/download/data/2022/CORE/JASPAR2022_CORE_non-redundant_pfms_jaspar.zip" unzip JASPAR2022_CORE_non-redundant_pfms_jaspar.zip for i in *.jaspar; do f=`awk '{print $1; exit}' $i | tr -d '>'`; tail -n +1 $i | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' > /tmp/1; tail -n +2 $i | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) printf "%d\t", $ii; printf "\n"}' >> /tmp/1; tail -n +1 $i | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' | awk '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%d\t",mat[ii][jj]; printf "\n";}}' /tmp/1 | awk '{for (ii=2; ii <= NF; ii++) printf "%f\t",$ii/$1; printf "\n";}' | awk -v name=$f '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {printf "%s\t%d\t",name,NR;for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%g,",mat[ii][jj]; printf "\t";}} END {printf "\n"}' ; done > jasparMotif.tab hgLoadSqlTab hgFixed jasparCore2022 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab + +##### +# Jaspar 2024 PFM (DONE 2023/10/23 braney) +##### +mkdir -p /hive/data/outside/jaspar/2024/all +cd 
/hive/data/outside/jaspar/2024/all +wget "https://testjaspar.uio.no/download/data/2024/CORE/JASPAR2024_CORE_non-redundant_pfms_jaspar.zip" +unzip JASPAR2024_CORE_non-redundant_pfms_jaspar.zip +for i in *.jaspar; do f=`awk '{print $1; exit}' $i | tr -d '>'`; tail -n +1 $i | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' > /tmp/1; tail -n +2 $i | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) printf "%d\t", $ii; printf "\n"}' >> /tmp/1; tail -n +1 $i | awk '{count=NF - 3; for (ii=3; ii <= NF - 1; ii++) counts[ii - 3] += $ii;} END {for(ii=0; ii < count; ii++) printf "%d\t", counts[ii]; printf "\n"}' | awk '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%d\t",mat[ii][jj]; printf "\n";}}' /tmp/1 | awk '{for (ii=2; ii <= NF; ii++) printf "%f\t",$ii/$1; printf "\n";}' | awk -v name=$f '{numCols=NF; for(ii=1;ii <= numCols; ii++) mat[NR-1][ii] = $ii;} END {printf "%s\t%d\t",name,NR;for(jj=1; jj <= numCols; jj++) {for(ii=0; ii < NR; ii++) printf "%g,",mat[ii][jj]; printf "\t";}} END {printf "\n"}' ; done > jasparMotif.tab + +hgLoadSqlTab hgFixed jasparCore2024 ~/kent/src/hg/lib/dnaMotif.sql jasparMotif.tab