57d6df521aa11c79fa5e6d3f37f35804b3720a28
hiram
  Thu Jan 5 12:32:08 2023 -0800
beginning of a script that can convert a fasta file into an assembly hub no redmine

diff --git src/hg/utils/automation/faToHub.sh src/hg/utils/automation/faToHub.sh
new file mode 100755
index 0000000..38cb4af
--- /dev/null
+++ src/hg/utils/automation/faToHub.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+
+# exit on any error
+set -beEu -o pipefail
+
+if [ $# -ne 2 ]; then
+  printf "usage: faToTwoBit.sh /path/to/assembly.fa /destination/build/directory/\n" 1>&2
+  printf "  will process the assembly.fa file into a set of files to use\n" 1>&2
+  printf "  as an assembly hub for the UCSC Genome Browser.\n" 1>&2
+  printf "Expected that the /destination/build/directory/ already exists.\n" 1>&2
+  printf "The 'name' of the assembly hub will be the 'assembly' name from\n" 1>&2
+  printf "  the 'assembly.fa' file name.\n" 1>&2
+  printf "The assembly.fa file can be gzipped specified with the .gz extension:\n" 1>&2
+  printf "   assembly.fa.gz\n" 1>&2
+  exit 255
+fi
+
+export asmFa="${1}"
+export destDir="${2}"
+
+if [ ! -d "${destDir}" ]; then
+  printf "expecting the destination directory to already exist\n" 1>&2
+  printf "do not see '%s' as a directory\n" "${destDir}" 1>&2
+  exit 255
+fi
+
+export buildLog="${destDir}/build.log"
+
+export asmFile=`basename ${asmFa}`
+export noGz="${asmFile%.gz}"
+export asmId="${noGz%.*}"
+
+cd "${destDir}"
+
+printf "#### %s ####\n"  "`date \"+%F %T\"`" >> $buildLog
+printf "# processing %s into assembly %s\n" "${asmFile}" "${asmId}" 1>&2
+printf "# processing %s into assembly %s\n" "${asmFile}" "${asmId}" >> $buildLog
+
+if [ "${asmFa}" -nt "${asmId}.2bit" ]; then
+   printf "# time faToTwoBit ${asmFa} ${asmId}.2bit\n" 1>&2
+   printf "# time faToTwoBit ${asmFa} ${asmId}.2bit\n" >> $buildLog
+   time faToTwoBit "${asmFa}" "${asmId}.2bit" >> $buildLog 2>&1
+   printf "# time bptForTwoBit ${asmId}.2bit ${asmId}.2bit.bpt\n" >> $buildLog
+   time bptForTwoBit "${asmId}.2bit" "${asmId}.2bit.bpt" >> $buildLog 2>&1
+   touch -r "${asmFa}" "${asmId}.2bit"
+   touch -r "${asmFa}" "${asmId}.2bit.bpt"
+fi
+
+
+if [ "${asmId}.2bit" -nt "${asmId}.chrom.sizes.txt" ]; then
+   printf "# time twoBitInfo ${asmId}.2bit stdout | sort -k2,2nr > ${asmId}.chrom.sizes.txt\n" 1>&2
+   printf "# time twoBitInfo ${asmId}.2bit stdout | sort -k2,2nr > ${asmId}.chrom.sizes.txt\n" >> $buildLog
+   time twoBitInfo "${asmId}.2bit" stdout | sort -k2,2nr > "${asmId}.chrom.sizes.txt" 2>> $buildLog
+   touch -r "${asmId}.2bit" "${asmId}.chrom.sizes.txt"
+fi
+
+if [ "${asmFa}" -nt "${asmId}.agp.gz" ]; then
+  printf "# time hgFakeAgp -minContigGap=1 -minScaffoldGap=1000 -singleContigs  "${asmFa}" stdout | gzip -c > ${asmId}.agp.gz\n" 1>&2
+  printf "# time hgFakeAgp -minContigGap=1 -minScaffoldGap=1000 -singleContigs  "${asmFa}" stdout | gzip -c > ${asmId}.agp.gz\n" >> $buildLog
+  hgFakeAgp -minContigGap=1 -minScaffoldGap=1000 -singleContigs  "${asmFa}" stdout | gzip -c > "${asmId}.agp.gz" 2>> $buildLog
+   touch -r "${asmFa}" "${asmId}.agp.gz"
+fi
+
+# calculate a default position in the middle of the largest sequence
+export chrName=`head -1 $asmId.chrom.sizes.txt | cut -f1`
+export bigChrom=`head -1 $asmId.chrom.sizes.txt | awk '{print $NF}'`
+export oneThird=`awk -v s=$bigChrom 'BEGIN {printf "%d", s/3}'`
+export tenK=`awk -v t=$oneThird -v m=$bigChrom 'BEGIN {r=t + 10000; if (r > m) { r = m}; printf "%d", r}'`
+export defPos="${chrName}:${oneThird}-${tenK}"
+
+printf "hub $asmId genome assembly
+shortLabel $asmId
+longLabel $asmId
+useOneFile on
+email yourEmail@uni.edu
+descriptionHtml html/$asmId.description.html
+
+genome $asmId
+groups groups.txt
+description $asmId
+twoBitPath $asmId.2bit
+twoBitBptUrl $asmId.2bit.bpt
+organism $asmId
+defaultPos $defPos
+scientificName $asmId
+htmlPath html/$asmId.description.html
+
+" > ${destDir}/hub.txt
+
+###### assembly/gap ####################
+mkdir -p "${destDir}/trackData/assemblyGap"
+cd "${destDir}/trackData/assemblyGap"
+if [ "../../${asmId}.agp.gz" -nt "${asmId}.assembly.bb" ]; then
+  printf "# constructing assembly/gap tracks\n" 1>&2
+  zcat ../../$asmId.agp.gz | grep -v "^#" | awk '$5 != "N" && $5 != "U"' \
+     | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1, $2-1, $3, $6, $9}' \
+        | sort -k1,1 -k2,2n > $asmId.assembly.bed
+  zcat ../../$asmId.agp.gz | grep -v "^#" | awk '$5 == "N" || $5 == "U"' \
+     | awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $8}' \
+        | sort -k1,1 -k2,2n > $asmId.gap.bed
+
+  bedToBigBed -extraIndex=name -verbose=0 $asmId.assembly.bed \
+    ../../$asmId.chrom.sizes.txt $asmId.assembly.bb
+  touch -r ../../$asmId.agp.gz $asmId.assembly.bb
+  $HOME/kent/src/hg/utils/automation/genbank/nameToIx.pl \
+    $asmId.assembly.bed | sort -u > $asmId.assembly.ix.txt
+  if [ -s $asmId.assembly.ix.txt ]; then
+    ixIxx $asmId.assembly.ix.txt $asmId.assembly.ix $asmId.assembly.ixx
+  fi
+  if [ -s "${asmId}.gap.bed" ]; then
+    bedToBigBed -extraIndex=name -verbose=0 $asmId.gap.bed \
+      ../../$asmId.chrom.sizes $asmId.gap.bb
+    touch -r ../../$asmId.agp.gz $asmId.gap.bb
+  else
+    rm -f "${asmId}.gap.bed"
+  fi
+else
+  printf "# assembly/gap previously completed\n" 1>&2
+fi
+
+printf "track assembly
+longLabel Assembly
+shortLabel Assembly
+visibility pack
+colorByStrand 150,100,30 230,170,40
+color 150,100,30
+altColor 230,170,40
+bigDataUrl bbi/$asmId.assembly.bb
+type bigBed 6
+html html/$asmId.assembly
+searchIndex name
+searchTrix ixIxx/$asmId.assembly.ix
+url https://www.ncbi.nlm.nih.gov/nuccore/$$
+urlLabel NCBI Nucleotide database
+group map
+" >> ${destDir}/hub.txt
+
+cd "${destDir}"
+
+###### cytoBand ####################
+mkdir -p "${destDir}/trackData/cytoBand"
+cd "${destDir}/trackData/cytoBand"
+if [ ../../$asmId.chrom.sizes.txt -nt $asmId.cytoBand.bb ]; then
+  printf "# construction cytoBand track\n" 1>&2
+  awk '{printf "%s\t0\t%d\t\tgneg\n", $1, $2}' ../../$asmId.chrom.sizes.txt | sort -k1,1 -k2,2n > $asmId.cytoBand.bed
+  bedToBigBed -type=bed4+1 -as=$HOME/kent/src/hg/lib/cytoBand.as -tab $asmId.cytoBand.bed ../../$asmId.chrom.sizes.txt $asmId.cytoBand.bb >> $buildLog 2>&1
+
+  touch -r ../../$asmId.chrom.sizes.txt $asmId.cytoBand.bb
+else
+  printf "# cytoBand previously completed\n" 1>&2
+fi
+
+printf "track cytoBandIdeo
+shortLabel Chromosome Band (Ideogram)
+longLabel Ideogram for Orientation
+group map
+visibility dense
+type bigBed 4 +
+bigDataUrl bbi/$asmId.cytoBand.bb
+" >> ${destDir}/hub.txt
+
+###### gc5Base ####################
+mkdir -p "${destDir}/trackData/gc5Base"
+cd "${destDir}/trackData/gc5Base"
+if [ ../../$asmId.2bit -nt $asmId.gc5Base.bw ]; then
+  printf "# constructin gc5Base track\n" 1>&2
+  hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 test \
+    ../../$asmId.2bit 2>> $buildLog \
+      | gzip -c > $asmId.wigVarStep.gz
+  wigToBigWig $asmId.wigVarStep.gz ../../$asmId.chrom.sizes.txt $asmId.gc5Base.bw >> $buildLog 2>&1
+  rm -f $asmId.wigVarStep.gz
+  touch -r ../../$asmId.2bit $asmId.gc5Base.bw
+else
+  printf "# gc5Base previously completed\n" 1>&2
+fi
+
+printf "track gc5Base
+shortLabel GC Percent
+longLabel GC Percent in 5-Base Windows
+group map
+visibility full
+autoScale Off
+maxHeightPixels 128:36:16
+graphTypeDefault Bar
+gridDefault OFF
+windowingFunction Mean
+color 0,0,0
+altColor 128,128,128
+viewLimits 30:70
+type bigWig 0 100
+bigDataUrl bbi/$asmId.gc5Base.bw
+html html/$asmId.gc5Base
+" >> ${destDir}/hub.txt
+