1b2aa0c4e8b1b40a303328da21830a253749a57e
max
  Sat Feb 2 19:11:35 2019 +0100
adding small tool to convert MAF files to bigBed. refs #22742

diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed
new file mode 100755
index 0000000..4eb8e8a
--- /dev/null
+++ src/utils/cancerMafToBigBed
@@ -0,0 +1,239 @@
+#!/usr/bin/env python
+
+import logging, sys, optparse, gzip
+from collections import defaultdict, namedtuple
+from os.path import join, basename, dirname, isfile, walk, splitext
+from os import system
+
+# these fields are identical for all variants at the same position. 
+# only a single value is output to the bed
+fixedFields = [
+    "Hugo_Symbol",
+    "Entrez_Gene_Id",
+    "Variant_Classification",
+    "Variant_Type",
+    "Reference_Allele",
+    "Tumor_Seq_Allele1",
+    "Tumor_Seq_Allele2",
+    "dbSNP_RS",
+    "dbSNP_Val_Status"
+]
+
+# these fields vary between variants at the same position, they are output to the BED as a comma-separated list
+varFields = [
+    "Tumor_Sample_Barcode",
+    "Matched_Norm_Sample_Barcode"
+]
+
+# field descriptions for the AS file
+fieldDescs = {
+
+}
+
+# ==== functions =====
+def parseArgs():
+    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
+    parser = optparse.OptionParser("""usage: %prog [options] inDir outBb - summarize GDC MAF files to bigBed
+
+    inDir will be recursively searched for .maf.gz files. Their variants will be extracted, written to outBb
+
+    """)
+
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
+    #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
+    (options, args) = parser.parse_args()
+
+    if args==[]:
+        parser.print_help()
+        exit(1)
+
+    if options.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+    return args, options
+
+
+def findMafs(inDir):
+    " recursively find maf.gz files under inDir "
+    mafFnames = []
+
+    def gotFiles(arg, dirName, fnames):
+        " callback for walk "
+        for fname in fnames:
+            if fname.endswith(".maf.gz"):
+                mafFnames.append( join(dirName, fname) )
+
+    walk(inDir, gotFiles, None)
+    return mafFnames
+
+def parseMaf(fname, MafRec):
+    " parse maf file, yield a dict "
+    for line in gzip.open(fname):
+        if line.startswith("#"):
+            continue
+        if line.startswith("Hugo_Sym"):
+            headers = line.rstrip("\n").split("\t")
+            assert(tuple(headers)==MafRec._fields)
+            continue
+        row = line.rstrip("\n").split("\t")
+        #yield dict(zip(headers, row))
+        yield MafRec(*row)
+
+def makeRec(fname):
+    " return the namedtuple rec (=struct) made from first non-comment line in fname "
+    for line in gzip.open(fname):
+        if line.startswith("#"):
+            continue
+        headers = line.rstrip("\n").split("\t")
+        MafRec = namedtuple("mafRecord", headers)
+        return MafRec
+
+def mafsToBed(mafFnames, outBed):
+    " summarize mafFnames to outBed "
+    byPosAndAllele = defaultdict(list)
+    MafRec = None
+    allSampleIds = set()
+    # read the whole data into memory, index by chrom pos
+    for mafName in mafFnames:
+        if MafRec is None:
+            MafRec = makeRec(mafName)
+        logging.info("Reading %s" % mafName)
+        lineCount = 0
+        for var in parseMaf(mafName, MafRec):
+            chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position
+            all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2
+            key = (chrom, start, end, all1, all2)
+            byPosAndAllele[key].append(var)
+            lineCount += 1
+            allSampleIds.add(var.Tumor_Sample_Barcode)
+        logging.info("Read %d rows" % lineCount)
+
+    sampleCount = len(allSampleIds)
+    logging.info("Read variants from %d samples in total" % sampleCount)
+
+    logging.info("Writing variant summary to %s" % outBed)
+    # now summarize all variants for every chrom pos
+    longFields = set()
+    ofh = open(outBed, "w")
+    for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems():
+        var1 = varList[0]._asdict()
+        refAll = var1["Reference_Allele"]
+        varType = var1["Variant_Type"]
+
+        if varType=="SNP":
+            assert(refAll==all1)
+            #if all2=="":
+            name = refAll+">"+all2
+            #else:
+            #name = refAll+">"+all1+"/"+all2
+        elif varType=="DEL":
+            name = "del"+refAll
+        elif varType=="INS":
+            name = "ins"+all2
+        else:
+            invalidType
+
+        varSampleCount = len(varList) # score is number of variants with this nucl change
+        score = min(1000, varSampleCount) # don't exceed 1000
+
+        ftLen = str(int(end)-int(start))
+        row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"]
+
+        row.append( str(varSampleCount) )
+        row.append( str( float(varSampleCount) / sampleCount ) )
+
+        for fieldName in fixedFields:
+            row.append(var1[fieldName])
+
+        # must convert structs to dicts
+        dictList = []
+        for var in varList:
+            dictList.append(var._asdict())
+
+        for fieldName in varFields:
+            valList = []
+            for d in dictList:
+                valList.append(d[fieldName])
+            valStr = ",".join(valList)
+            row.append(valStr)
+            if len(valStr)>256:
+                longFields.add(fieldName)
+
+        ofh.write("\t".join(row))
+        ofh.write("\n")
+
+    ofh.close()
+    logging.info("Created file %s" % ofh.name)
+    return longFields
+
+def makeAs(extraFields, longFields, outFname):
+    " write the .as file to outFname "
+    minAs = """table bed12Mafs
+"somatic variants converted from MAF files obtained through the NCI GDC"
+    (
+    string chrom;      "Chromosome (or contig, scaffold, etc.)"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "Name of item"
+    uint   score;      "Score from 0-1000"
+    char[1] strand;    "+ or -"
+    uint thickStart;   "Start of where display should be thick (start codon)"
+    uint thickEnd;     "End of where display should be thick (stop codon)"
+    uint reserved;     "Used as itemRgb as of 2004-11-22"
+    int blockCount;    "Number of blocks"
+    int[blockCount] blockSizes; "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+    string sampleCount;    "number of samples with this variant"
+    string freq;    "variant frequency"
+    """
+    ofh = open(outFname, "w")
+    ofh.write(minAs)
+
+    for fieldList in extraFields:
+        for fieldName in fieldList:
+            fieldType = "string"
+            if fieldName in longFields:
+                fieldType = "lstring"
+            fieldDesc = fieldDescs.get(fieldName, fieldName)
+            ofh.write('    %s %s; "%s"\n' % (fieldType, fieldName, fieldDesc))
+    ofh.write(")\n")
+    ofh.close()
+    logging.info("Wrote %s" % ofh.name)
+
+def bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb):
+    " "
+    logging.info("Converting to bigBed")
+    cmd = "sort -k1,1 -k2,2n %s > %s" % (outBed, outBedSorted)
+    assert(system(cmd)==0)
+
+    cmd = "bedToBigBed %s %s  -type=bed12+ -as=%s -tab %s" % \
+        (outBedSorted, chromSizes, outAs, outBb)
+    assert(system(cmd)==0)
+    logging.info("Created %s" % outBb)
+
+# ----------- main --------------
+def main():
+    args, options = parseArgs()
+
+    inDir, outBb = args
+    #if options.test:
+        #logging.debug("test is set")
+        #f = open(options.file, "r")
+
+    mafFnames = findMafs(inDir)
+
+    outBase = splitext(outBb)[0] # strip file extension
+    outBed = outBase+".bed"
+    outBedSorted = outBase+".s.bed"
+    outAs = outBase+".as"
+
+    longFields = mafsToBed(mafFnames, outBed)
+
+    makeAs([fixedFields, varFields], longFields, outAs)
+
+    chromSizes = "/gbdb/hg38/chrom.sizes"
+    bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb)
+
+main()