174946a4b501d43409f582bfe2ef0838a30acd6e
max
  Sat Feb 2 19:35:11 2019 +0100
adding annotations to cancerMafToBigBed

diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed
index 4eb8e8a..7ad60bc 100755
--- src/utils/cancerMafToBigBed
+++ src/utils/cancerMafToBigBed
@@ -1,239 +1,311 @@
 #!/usr/bin/env python
 
+# possible features:
+# - add counts per diagnosis
+
 import logging, sys, optparse, gzip
-from collections import defaultdict, namedtuple
+from collections import defaultdict, namedtuple, Counter
 from os.path import join, basename, dirname, isfile, walk, splitext
 from os import system
 
 # these fields are identical for all variants at the same position. 
 # only a single value is output to the bed
 fixedFields = [
     "Hugo_Symbol",
     "Entrez_Gene_Id",
     "Variant_Classification",
     "Variant_Type",
     "Reference_Allele",
     "Tumor_Seq_Allele1",
     "Tumor_Seq_Allele2",
     "dbSNP_RS",
     "dbSNP_Val_Status"
 ]
 
 # these fields vary between variants at the same position, they are output to the BED as a comma-separated list
 varFields = [
     "Tumor_Sample_Barcode",
-    "Matched_Norm_Sample_Barcode"
+    "Matched_Norm_Sample_Barcode",
+    "case_id"
+]
+
+# these clinical annotations are summarized as counts
+annotCountFields = [
+    "gender",
+    "project_id",
+    "ethnicity",
+]
+
+# these clinical annotations are copied as a list into every variant
+annotListFields = [
+    # from annotations.tsv
+    "days_to_death",
+    # from exposure.tsv
+    "cigarettes_per_day",
+    "weight",
+    "alcohol_history",
+    "alcohol_intensity",
+    "bmi",
+    "years_smoked",
+    "height"
 ]
 
+
 # field descriptions for the AS file
 fieldDescs = {
 
 }
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] inDir outBb - summarize GDC MAF files to bigBed
 
     inDir will be recursively searched for .maf.gz files. Their variants will be extracted, written to outBb
 
     """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
     #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
     (options, args) = parser.parse_args()
 
     if args==[]:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 
 def findMafs(inDir):
     " recursively find maf.gz files under inDir "
     mafFnames = []
 
     def gotFiles(arg, dirName, fnames):
-        " callback for walk "
+        " callback for walk, called for every file "
         for fname in fnames:
             if fname.endswith(".maf.gz"):
                 mafFnames.append( join(dirName, fname) )
 
     walk(inDir, gotFiles, None)
     return mafFnames
 
-def parseMaf(fname, MafRec):
-    " parse maf file, yield a dict "
-    for line in gzip.open(fname):
+def openFile(fname):
+    if fname.endswith(".gz"):
+        return gzip.open(fname)
+    else:
+        return open(fname)
+
+def parseTsv(fname, Rec):
+    " parse maf file, use rows to fill the Rec (~struct) and yield Recs "
+    for line in openFile(fname):
         if line.startswith("#"):
             continue
         if line.startswith("Hugo_Sym"):
             headers = line.rstrip("\n").split("\t")
-            assert(tuple(headers)==MafRec._fields)
+            assert(tuple(headers)==Rec._fields)
             continue
         row = line.rstrip("\n").split("\t")
-        #yield dict(zip(headers, row))
-        yield MafRec(*row)
+        yield Rec(*row)
 
 def makeRec(fname):
     " return the namedtuple rec (=struct) made from first non-comment line in fname "
-    for line in gzip.open(fname):
+    for line in openFile(fname):
         if line.startswith("#"):
             continue
         headers = line.rstrip("\n").split("\t")
         MafRec = namedtuple("mafRecord", headers)
         return MafRec
 
-def mafsToBed(mafFnames, outBed):
+def mafsToBed(mafFnames, annots, outBed):
     " summarize mafFnames to outBed "
     byPosAndAllele = defaultdict(list)
     MafRec = None
     allSampleIds = set()
     # read the whole data into memory, index by chrom pos
     for mafName in mafFnames:
         if MafRec is None:
             MafRec = makeRec(mafName)
         logging.info("Reading %s" % mafName)
         lineCount = 0
-        for var in parseMaf(mafName, MafRec):
+        for var in parseTsv(mafName, MafRec):
             chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position
             all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2
             key = (chrom, start, end, all1, all2)
             byPosAndAllele[key].append(var)
             lineCount += 1
             allSampleIds.add(var.Tumor_Sample_Barcode)
         logging.info("Read %d rows" % lineCount)
 
     sampleCount = len(allSampleIds)
     logging.info("Read variants from %d samples in total" % sampleCount)
 
     logging.info("Writing variant summary to %s" % outBed)
     # now summarize all variants for every chrom pos
     longFields = set()
     ofh = open(outBed, "w")
     for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems():
         var1 = varList[0]._asdict()
         refAll = var1["Reference_Allele"]
         varType = var1["Variant_Type"]
 
         if varType=="SNP":
             assert(refAll==all1)
             #if all2=="":
             name = refAll+">"+all2
             #else:
             #name = refAll+">"+all1+"/"+all2
         elif varType=="DEL":
             name = "del"+refAll
         elif varType=="INS":
             name = "ins"+all2
         else:
             invalidType
 
         varSampleCount = len(varList) # score is number of variants with this nucl change
         score = min(1000, varSampleCount) # don't exceed 1000
 
         ftLen = str(int(end)-int(start))
         row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"]
 
-        row.append( str(varSampleCount) )
-        row.append( str( float(varSampleCount) / sampleCount ) )
+        # add the count/frequency fields that we derive from the data
+        row.append( str(varSampleCount) ) # raw count
+        row.append( str( float(varSampleCount) / sampleCount ) ) # frequency
 
+        # a few fields are always the same, so we just add the value from the first variant
         for fieldName in fixedFields:
             row.append(var1[fieldName])
 
-        # must convert structs to dicts
+        # some annotations are summarized as counts
+        for summCountField in annotCountFields:
+            vals = []
+            for var in varList:
+                annot = annots[var.case_id]
+                vals.append(annot[summCountField])
+
+            mostCommon = Counter(vals).most_common()
+            mostCommStrings = ["%s:%d" % (name, count) for name, count in mostCommon]
+            countStr = ", ".join(mostCommStrings)
+            row.append(countStr)
+
+        # some annotations are copied as a long list
+        for fieldName in annotListFields:
+            vals = []
+            for var in varList:
+                annot = annots[var.case_id]
+                vals.append(annot[fieldName])
+
+            row.append(",".join(vals))
+
+        # first convert structs to dicts
         dictList = []
         for var in varList:
             dictList.append(var._asdict())
 
+        # some variant values are copied as a long list 
         for fieldName in varFields:
             valList = []
             for d in dictList:
                 valList.append(d[fieldName])
             valStr = ",".join(valList)
             row.append(valStr)
-            if len(valStr)>256:
-                longFields.add(fieldName)
+            #if len(valStr)>256:
+                #longFields.add(fieldName)
 
         ofh.write("\t".join(row))
         ofh.write("\n")
 
     ofh.close()
     logging.info("Created file %s" % ofh.name)
-    return longFields
+    #return longFields
 
-def makeAs(extraFields, longFields, outFname):
+def makeAs(extraFields, outFname):
     " write the .as file to outFname "
     minAs = """table bed12Mafs
 "somatic variants converted from MAF files obtained through the NCI GDC"
     (
     string chrom;      "Chromosome (or contig, scaffold, etc.)"
     uint   chromStart; "Start position in chromosome"
     uint   chromEnd;   "End position in chromosome"
     string name;       "Name of item"
     uint   score;      "Score from 0-1000"
     char[1] strand;    "+ or -"
     uint thickStart;   "Start of where display should be thick (start codon)"
     uint thickEnd;     "End of where display should be thick (stop codon)"
     uint reserved;     "Used as itemRgb as of 2004-11-22"
     int blockCount;    "Number of blocks"
     int[blockCount] blockSizes; "Comma separated list of block sizes"
     int[blockCount] chromStarts; "Start positions relative to chromStart"
     string sampleCount;    "number of samples with this variant"
     string freq;    "variant frequency"
     """
     ofh = open(outFname, "w")
     ofh.write(minAs)
 
     for fieldList in extraFields:
         for fieldName in fieldList:
-            fieldType = "string"
-            if fieldName in longFields:
             fieldType = "lstring"
+            #if fieldName in longFields:
+                #fieldType = "lstring"
             fieldDesc = fieldDescs.get(fieldName, fieldName)
             ofh.write('    %s %s; "%s"\n' % (fieldType, fieldName, fieldDesc))
     ofh.write(")\n")
     ofh.close()
     logging.info("Wrote %s" % ofh.name)
 
 def bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb):
     " "
     logging.info("Converting to bigBed")
     cmd = "sort -k1,1 -k2,2n %s > %s" % (outBed, outBedSorted)
     assert(system(cmd)==0)
 
     cmd = "bedToBigBed %s %s  -type=bed12+ -as=%s -tab %s" % \
         (outBedSorted, chromSizes, outAs, outBb)
     assert(system(cmd)==0)
     logging.info("Created %s" % outBb)
 
+def parseAnnots(annotFn):
+    " read clinical annotations and return as dict caseId -> namedtuple "
+    Rec = makeRec(annotFn)
+    ret = {}
+    for annot in parseTsv(annotFn, Rec):
+        caseId = annot.case_id
+        assert(caseId not in ret)
+        ret[caseId] = annot
+    return ret
+
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
-    inDir, outBb = args
-    #if options.test:
-        #logging.debug("test is set")
-        #f = open(options.file, "r")
+    inDir, annotFn, expFn, outBb = args
+
+    annots = parseAnnots(annotFn)
+    exposures = parseAnnots(expFn)
+
+    # join annots and exposures into a single dict case_id -> field_name -> value
+    joinedAnnots = {}
+    for key, annot in annots.iteritems():
+        ad = annot._asdict()
+        ad.update(exposures[key]._asdict())
+        joinedAnnots[key] = ad
 
     mafFnames = findMafs(inDir)
 
     outBase = splitext(outBb)[0] # strip file extension
     outBed = outBase+".bed"
     outBedSorted = outBase+".s.bed"
     outAs = outBase+".as"
 
-    longFields = mafsToBed(mafFnames, outBed)
+    mafsToBed(mafFnames, joinedAnnots, outBed)
 
-    makeAs([fixedFields, varFields], longFields, outAs)
+    makeAs([fixedFields, annotCountFields, annotListFields, varFields], outAs)
 
     chromSizes = "/gbdb/hg38/chrom.sizes"
     bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb)
 
 main()