38942a57b41e68d9432452fa40e9bbae5a9f8085 hiram Thu Jan 5 16:32:57 2023 -0800 now with appropriate symlinks for bbi and ixIxx files and template description pages for tracks and hub documentation no redmine diff --git src/hg/utils/automation/faToHub.sh src/hg/utils/automation/faToHub.sh index 38cb4af..15513e4 100755 --- src/hg/utils/automation/faToHub.sh +++ src/hg/utils/automation/faToHub.sh @@ -1,194 +1,249 @@ #!/bin/bash # exit on any error set -beEu -o pipefail if [ $# -ne 2 ]; then - printf "usage: faToTwoBit.sh /path/to/assembly.fa /destination/build/directory/\n" 1>&2 - printf " will process the assembly.fa file into a set of files to use\n" 1>&2 - printf " as an assembly hub for the UCSC Genome Browser.\n" 1>&2 - printf "Expected that the /destination/build/directory/ already exists.\n" 1>&2 - printf "The 'name' of the assembly hub will be the 'assembly' name from\n" 1>&2 - printf " the 'assembly.fa' file name.\n" 1>&2 - printf "The assembly.fa file can be gzipped specified with the .gz extension:\n" 1>&2 - printf " assembly.fa.gz\n" 1>&2 + printf "usage: faToTwoBit.sh /path/to/assembly.fa /destination/build/directory/ + will process the assembly.fa file into a set of files to use + as an assembly hub for the UCSC Genome Browser. +Expected that the /destination/build/directory/ already exists. +The 'name' of the assembly hub will be the 'assembly' name from + the 'assembly.fa' file name. +The assembly.fa file can be gzipped specified with the .gz extension: + assembly.fa.gz\n" 1>&2 exit 255 fi +#### expected two arguments #### export asmFa="${1}" export destDir="${2}" +#### must have destDir existing #### if [ ! -d "${destDir}" ]; then printf "expecting the destination directory to already exist\n" 1>&2 printf "do not see '%s' as a directory\n" "${destDir}" 1>&2 exit 255 fi +#### establish fundamental variables #### export buildLog="${destDir}/build.log" - export asmFile=`basename ${asmFa}` export noGz="${asmFile%.gz}" export asmId="${noGz%.*}" +#### working in destDir #### cd "${destDir}" +mkdir -p html bbi ixIxx +#### record progress in buildLog #### printf "#### %s ####\n" "`date \"+%F %T\"`" >> $buildLog printf "# processing %s into assembly %s\n" "${asmFile}" "${asmId}" 1>&2 printf "# processing %s into assembly %s\n" "${asmFile}" "${asmId}" >> $buildLog +#### construct 2bit and 2bit.bpt from assembly.fa sequence #### if [ "${asmFa}" -nt "${asmId}.2bit" ]; then printf "# time faToTwoBit ${asmFa} ${asmId}.2bit\n" 1>&2 printf "# time faToTwoBit ${asmFa} ${asmId}.2bit\n" >> $buildLog time faToTwoBit "${asmFa}" "${asmId}.2bit" >> $buildLog 2>&1 printf "# time bptForTwoBit ${asmId}.2bit ${asmId}.2bit.bpt\n" >> $buildLog time bptForTwoBit "${asmId}.2bit" "${asmId}.2bit.bpt" >> $buildLog 2>&1 touch -r "${asmFa}" "${asmId}.2bit" touch -r "${asmFa}" "${asmId}.2bit.bpt" fi - +#### construct chrom.sizes.txt from 2bit file #### if [ "${asmId}.2bit" -nt "${asmId}.chrom.sizes.txt" ]; then printf "# time twoBitInfo ${asmId}.2bit stdout | sort -k2,2nr > ${asmId}.chrom.sizes.txt\n" 1>&2 printf "# time twoBitInfo ${asmId}.2bit stdout | sort -k2,2nr > ${asmId}.chrom.sizes.txt\n" >> $buildLog time twoBitInfo "${asmId}.2bit" stdout | sort -k2,2nr > "${asmId}.chrom.sizes.txt" 2>> $buildLog touch -r "${asmId}.2bit" "${asmId}.chrom.sizes.txt" fi +#### construct AGP file from assembly.fa sequence #### if [ "${asmFa}" -nt "${asmId}.agp.gz" ]; then printf "# time hgFakeAgp -minContigGap=1 -minScaffoldGap=1000 -singleContigs "${asmFa}" stdout | gzip -c > ${asmId}.agp.gz\n" 1>&2 printf "# time hgFakeAgp -minContigGap=1 -minScaffoldGap=1000 -singleContigs "${asmFa}" stdout | gzip -c > ${asmId}.agp.gz\n" >> $buildLog hgFakeAgp -minContigGap=1 -minScaffoldGap=1000 -singleContigs "${asmFa}" stdout | gzip -c > "${asmId}.agp.gz" 2>> $buildLog touch -r "${asmFa}" "${asmId}.agp.gz" fi +wget --timestamping "https://genome-source.gi.ucsc.edu/gitlist/kent.git/raw/master/src/hg/makeDb/doc/asmHubs/groups.txt" + # calculate a default position in the middle of the largest sequence export chrName=`head -1 $asmId.chrom.sizes.txt | cut -f1` export bigChrom=`head -1 $asmId.chrom.sizes.txt | awk '{print $NF}'` export oneThird=`awk -v s=$bigChrom 'BEGIN {printf "%d", s/3}'` export tenK=`awk -v t=$oneThird -v m=$bigChrom 'BEGIN {r=t + 10000; if (r > m) { r = m}; printf "%d", r}'` export defPos="${chrName}:${oneThird}-${tenK}" printf "hub $asmId genome assembly shortLabel $asmId longLabel $asmId useOneFile on email yourEmail@uni.edu -descriptionHtml html/$asmId.description.html +descriptionUrl html/$asmId.description.html genome $asmId groups groups.txt description $asmId twoBitPath $asmId.2bit twoBitBptUrl $asmId.2bit.bpt organism $asmId defaultPos $defPos scientificName $asmId htmlPath html/$asmId.description.html - " > ${destDir}/hub.txt +printf "

assembly description page

+

+Should be filled in with information about this assembly hub. +

\n" > ${destDir}/html/$asmId.description.html + ###### assembly/gap #################### mkdir -p "${destDir}/trackData/assemblyGap" cd "${destDir}/trackData/assemblyGap" if [ "../../${asmId}.agp.gz" -nt "${asmId}.assembly.bb" ]; then printf "# constructing assembly/gap tracks\n" 1>&2 - zcat ../../$asmId.agp.gz | grep -v "^#" | awk '$5 != "N" && $5 != "U"' \ + zgrep -v "^#" ../../$asmId.agp.gz | awk '$5 != "N" && $5 != "U"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1, $2-1, $3, $6, $9}' \ | sort -k1,1 -k2,2n > $asmId.assembly.bed - zcat ../../$asmId.agp.gz | grep -v "^#" | awk '$5 == "N" || $5 == "U"' \ + zgrep -v "^#" ../../$asmId.agp.gz | awk '$5 == "N" || $5 == "U"' \ | awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $8}' \ | sort -k1,1 -k2,2n > $asmId.gap.bed bedToBigBed -extraIndex=name -verbose=0 $asmId.assembly.bed \ ../../$asmId.chrom.sizes.txt $asmId.assembly.bb touch -r ../../$asmId.agp.gz $asmId.assembly.bb $HOME/kent/src/hg/utils/automation/genbank/nameToIx.pl \ $asmId.assembly.bed | sort -u > $asmId.assembly.ix.txt if [ -s $asmId.assembly.ix.txt ]; then ixIxx $asmId.assembly.ix.txt $asmId.assembly.ix $asmId.assembly.ixx fi if [ -s "${asmId}.gap.bed" ]; then bedToBigBed -extraIndex=name -verbose=0 $asmId.gap.bed \ ../../$asmId.chrom.sizes $asmId.gap.bb touch -r ../../$asmId.agp.gz $asmId.gap.bb + rm -f ${destDir}/bbi/$asmId.gap.bb + ln -s ../trackData/assemblyGap/$asmId.gap.bb ${destDir}/bbi/$asmId.gap.bb else rm -f "${asmId}.gap.bed" fi else printf "# assembly/gap previously completed\n" 1>&2 fi -printf "track assembly +cd "${destDir}" + +if [ -s "${destDir}/trackData/assemblyGap/$asmId.assembly.bb" ]; then + rm -f ${destDir}/bbi/$asmId.assembly.bb + ln -s ../trackData/assemblyGap/$asmId.assembly.bb ${destDir}/bbi/$asmId.assembly.bb + if [ -s "${destDir}/trackData/assemblyGap/$asmId.assembly.ix" ]; then + rm -f ${destDir}/ixIxx/$asmId.assembly.ix ${destDir}/ixIxx/$asmId.assembly.ixx + ln -s ../trackData/assemblyGap/$asmId.assembly.ix ${destDir}/ixIxx/$asmId.assembly.ix + ln -s ../trackData/assemblyGap/$asmId.assembly.ixx ${destDir}/ixIxx/$asmId.assembly.ixx + fi + + printf " +track assembly longLabel Assembly shortLabel Assembly visibility pack colorByStrand 150,100,30 230,170,40 color 150,100,30 altColor 230,170,40 bigDataUrl bbi/$asmId.assembly.bb type bigBed 6 html html/$asmId.assembly searchIndex name searchTrix ixIxx/$asmId.assembly.ix url https://www.ncbi.nlm.nih.gov/nuccore/$$ urlLabel NCBI Nucleotide database group map " >> ${destDir}/hub.txt +printf "

assembly track description page

+

+Should be filled in with information about this track. +

\n" > ${destDir}/html/$asmId.assembly.html + +fi + cd "${destDir}" ###### cytoBand #################### mkdir -p "${destDir}/trackData/cytoBand" cd "${destDir}/trackData/cytoBand" if [ ../../$asmId.chrom.sizes.txt -nt $asmId.cytoBand.bb ]; then printf "# construction cytoBand track\n" 1>&2 awk '{printf "%s\t0\t%d\t\tgneg\n", $1, $2}' ../../$asmId.chrom.sizes.txt | sort -k1,1 -k2,2n > $asmId.cytoBand.bed bedToBigBed -type=bed4+1 -as=$HOME/kent/src/hg/lib/cytoBand.as -tab $asmId.cytoBand.bed ../../$asmId.chrom.sizes.txt $asmId.cytoBand.bb >> $buildLog 2>&1 touch -r ../../$asmId.chrom.sizes.txt $asmId.cytoBand.bb else printf "# cytoBand previously completed\n" 1>&2 fi -printf "track cytoBandIdeo +cd "${destDir}" + +if [ -s "${destDir}/trackData/cytoBand/$asmId.cytoBand.bb" ]; then + rm -f ${destDir}/bbi/$asmId.cytoBand.bb + ln -s ../trackData/cytoBand/$asmId.cytoBand.bb ${destDir}/bbi/$asmId.cytoBand.bb + printf " +track cytoBandIdeo shortLabel Chromosome Band (Ideogram) longLabel Ideogram for Orientation group map visibility dense type bigBed 4 + bigDataUrl bbi/$asmId.cytoBand.bb " >> ${destDir}/hub.txt +fi + ###### gc5Base #################### mkdir -p "${destDir}/trackData/gc5Base" cd "${destDir}/trackData/gc5Base" if [ ../../$asmId.2bit -nt $asmId.gc5Base.bw ]; then printf "# constructin gc5Base track\n" 1>&2 hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 test \ ../../$asmId.2bit 2>> $buildLog \ | gzip -c > $asmId.wigVarStep.gz wigToBigWig $asmId.wigVarStep.gz ../../$asmId.chrom.sizes.txt $asmId.gc5Base.bw >> $buildLog 2>&1 rm -f $asmId.wigVarStep.gz touch -r ../../$asmId.2bit $asmId.gc5Base.bw else printf "# gc5Base previously completed\n" 1>&2 fi -printf "track gc5Base +cd "${destDir}" + +if [ -s "${destDir}/trackData/gc5Base/$asmId.gc5Base.bw" ]; then + rm -f ${destDir}/bbi/$asmId.gc5Base.bw + ln -s ../trackData/gc5Base/$asmId.gc5Base.bw ${destDir}/bbi/$asmId.gc5Base.bw + + printf " +track gc5Base shortLabel GC Percent longLabel GC Percent in 5-Base Windows group map visibility full autoScale Off maxHeightPixels 128:36:16 graphTypeDefault Bar gridDefault OFF windowingFunction Mean color 0,0,0 altColor 128,128,128 viewLimits 30:70 type bigWig 0 100 bigDataUrl bbi/$asmId.gc5Base.bw html html/$asmId.gc5Base " >> ${destDir}/hub.txt +printf "

gc5Base description page

+

+Should be filled in with information about this track. +

\n" > ${destDir}/html/$asmId.gc5Base.html + +fi