src/utils/tabToBed/tabToBed d49be6c5dc4fb5e179730b4e56de7f917163a07c

d49be6c5dc4fb5e179730b4e56de7f917163a07c
max
  Tue May 6 02:39:45 2025 -0700
adding tabToBed tool, suggested by Lou

diff --git src/utils/tabToBed/tabToBed src/utils/tabToBed/tabToBed
new file mode 100755
index 00000000000..72ce5b0a0e1
--- /dev/null
+++ src/utils/tabToBed/tabToBed
@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+
+import logging, sys, optparse, re, os
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile
+
+# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
+parser = optparse.OptionParser("""usage: %prog [options] inTsv outBed outAs - convert tab-sep to BED and create an auto-Sql file for it.
+
+You can throw very wide tsv files on this script and it can pick out the chrom, start, end and other fields and make extra
+fields for all the other columns.
+Uses the header line of a tab-sep (tsv) file to guess where the chrom, start and end fields are.
+Searches for these field names in the header line, but you can replace names with --bedFields:
+  chrom, start, end, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, chromStarts
+Always makes a BED12 file.
+""")
+
+parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+#parser.add_option("-t", "--type", dest="type", action="store", help="number of BED columns, default %default", default=12)
+parser.add_option("", "--bedFields", dest="bedFields", action="store", help="in which columns to find the chrom-start-end-strand-name-score fields, can use 1-based indexes or field names e.g. chrom=1,start=START,end=posEnd,strand=8,name=12,score=10. First column is '1', like cut.")
+parser.add_option("-o", "--oneBased", dest="oneBased", action="store_true", help="Input coordinates are 1-based, so subtract 1 from all start positions. Default is 0-based.")
+#parser.add_option("-t", "--addTab", dest="addTab", action="store", help="add an additional tab file, first column is index field")
+#parser.add_option("-f", "--fields", dest="fields", action="store", help="read the field names and descriptions from this file, one per line, space-separated")
+#parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
+#parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
+(options, args) = parser.parse_args()
+
+if options.debug:
+    logging.basicConfig(level=logging.DEBUG)
+else:
+    logging.basicConfig(level=logging.INFO)
+# ==== FUNCTIONs =====
+asHead = """table bed
+"Browser extensible data (<=12 fields) "
+    (
+"""
+
+asLines = """
+    string chrom;      "Chromosome (or contig, scaffold, etc.)"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "Name of item"
+    uint   score;      "Score from 0-1000"
+    char[1] strand;    "+ or -"
+    uint thickStart;   "Start of where display should be thick (start codon)"
+    uint thickEnd;     "End of where display should be thick (stop codon)"
+    uint reserved;     "Used as itemRgb as of 2004-11-22"
+    int blockCount;    "Number of blocks"
+    int[blockCount] blockSizes; "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+""".split("\n")
+
+def guessFieldDesc(fieldNames):
+    "given a list of field names, try to guess to which fields in bed12 they correspond "
+    logging.info("Guessing fields from TSV headers: %s" % fieldNames)
+    fieldDesc = {}
+    skipFields = set()
+
+    for fieldIdx, fieldName in enumerate(fieldNames):
+        caseField = fieldName.lower()
+        if caseField in ["chrom", "chromosome"]:
+            name = "chrom"
+        elif caseField in ["start", "chromstart", "position"]:
+            name = "start"
+        elif caseField in ["Reference"]:
+            name = "refAllele"
+        elif caseField in ["strand"]:
+            name = "strand"
+        elif caseField in ["score"]:
+            name = "score"
+        else:
+            continue
+        fieldDesc[name] = fieldIdx
+        skipFields.add(fieldIdx)
+    logging.info("TSV <-> BED correspondance: %s" % fieldDesc)
+    return fieldDesc, skipFields
+
+def parseFieldDesc(fieldDescStr, fieldNames):
+    " given a string chrom=1,start=2,end=3,... return dict {chrom:1,...} "
+    #if not fieldDesc:
+        #return guessFieldDesc(fieldNames)
+    fieldDesc, skipFields = guessFieldDesc(fieldNames)
+
+    if fieldDescStr:
+        for part in fieldDescStr.split(","):
+            bedFieldName, fieldIdxOrName = part.split("=")
+
+            if fieldIdxOrName.isdigit():
+                fieldIdx = int(filedIdxOrName)-1
+            else:
+                fieldIdx = fieldNames.index(fieldIdxOrName)
+            fieldDesc[bedFieldName] = fieldIdx
+            skipFields.add(fieldIdx)
+
+    logging.info("TSV to BED assignment: %s" % fieldDesc)
+    return fieldDesc, skipFields
+
+def makeBedRow(row, fieldDesc, skipFields, isOneBased):
+    " given a row of a tsv file and a fieldDesc with BedFieldName -> field-index, return a bed12+ row with extra fields "
+    bedRow = []
+
+    # first construct the bed 12 fields
+    for fieldName in ["chrom", "start", "end", "name", "score", "strand",
+            "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "chromStarts"]:
+
+        fieldIdx = fieldDesc.get(fieldName)
+        if fieldIdx is not None:
+            if fieldName=="start":
+                chromStart = int(row[fieldDesc["start"]])
+                if isOneBased:
+                    chromStart = chromStart-1
+                val = str(chromStart)
+            elif fieldName=="end":
+                val = row[fieldIdx]
+                chromEnd = int(val)
+            else:
+                val = row[fieldIdx]
+        else:
+            if fieldName=="end":
+                chromEnd = chromStart+1
+                val = str(chromEnd)
+            elif fieldName=="score":
+                val = "0"
+            elif fieldName=="strand":
+                val = "."
+            elif fieldName=="thickStart":
+                val = str(chromStart)
+            elif fieldName=="thickEnd":
+                val = str(chromEnd)
+            elif fieldName=="itemRgb":
+                val = "0,0,0"
+            elif fieldName=="blockCount":
+                val = "1"
+            elif fieldName=="blockSizes":
+                val = str(chromEnd-chromStart)
+            elif fieldName=="chromStarts":
+                #val = str(chromStart)
+                val = "0"
+            else:
+                logging.error("Cannot find a field for %s" % fieldName)
+                sys.exit(1)
+
+        bedRow.append(val)
+
+    # now handle all other fields
+    for fieldIdx, val in enumerate(row):
+        if not fieldIdx in skipFields:
+            bedRow.append( row[fieldIdx] )
+
+    return bedRow
+
+# ----------- MAIN --------------
+if args==[]:
+    parser.print_help()
+    exit(1)
+
+tsvFname, outBedFname, outAsFname = args
+
+# join and output merged bed
+bigCols = set() # col names of columns with > 255 chars
+
+fh = open("temp.tabToBed.bed", "w")
+fieldNames = None
+isOneBased = options.oneBased
+
+for line in open(tsvFname):
+    row = line.rstrip("\r\n").split("\t")
+    if fieldNames is None:
+        fieldNames = row
+        fieldDesc, notExtraFields = parseFieldDesc(options.bedFields, fieldNames)
+        continue
+
+    # note fields with data > 255 chars.
+    for colName, colData in zip(fieldNames, row):
+        if len(colData)>255:
+            bigCols.add(colName)
+
+    bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased)
+
+    chrom = bedRow[0]
+    if chrom.isdigit() or chrom in ["X", "Y"]:
+        bedRow[0] = "chr"+bedRow[0]
+
+    fh.write( ("\t".join(bedRow)))
+    fh.write("\n")
+fh.close()
+
+cmd = "sort -k1,1 -k2,2n temp.tabToBed.bed > %s" % outBedFname
+assert(os.system(cmd)==0)
+os.remove("temp.tabToBed.bed")
+
+# generate autosql
+# BED fields
+#bedColCount = int(options.type.replace("bed", "").replace("+" , ""))
+bedColCount = 12
+fh = open(outAsFname, "w")
+fh.write(asHead)
+fh.write("\n".join(asLines[:bedColCount+1]))
+fh.write("\n")
+
+# extra fields
+#fieldNames = fieldNames[bedColCount:]
+for fieldIdx, field in enumerate(fieldNames):
+    if fieldIdx in notExtraFields:
+        continue
+    name = field.replace(" ","")
+    name = field.replace("%","perc_")
+    name = re.sub("[^a-zA-Z0-9]", "", name)
+    name = name[0].lower()+name[1:]
+    fType = "string"
+    if field in bigCols:
+        fType = "lstring"
+    fh.write('      %s %s; "%s" \n' % (fType, name, field))
+fh.write(")\n")
+fh.close()
+
+print ("run this command to convert BED file to bigBed:")
+print ("bedToBigBed %s /hive/data/genomes/DB/chrom.sizes %s -tab -type=bed%d+ -as=%s" % (outBedFname, outBedFname.replace(".bed", ".bb"), bedColCount, outAsFname))