87e0a09f85192a066a9814afe5ba88925398305a
max
  Tue Feb 5 20:09:25 2019 -0800
adding converter and first demo trackDb (only Breast cancer), refs #22742

diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed
index 7ad60bc..55c1bf5 100755
--- src/utils/cancerMafToBigBed
+++ src/utils/cancerMafToBigBed
@@ -1,311 +1,320 @@
 #!/usr/bin/env python
 
 # possible features:
 # - add counts per diagnosis
 
 import logging, sys, optparse, gzip
 from collections import defaultdict, namedtuple, Counter
 from os.path import join, basename, dirname, isfile, walk, splitext
 from os import system
 
 # these fields are identical for all variants at the same position,
 # so only a single value is output to the BED
 fixedFields = [
     "Hugo_Symbol",
     "Entrez_Gene_Id",
     "Variant_Classification",
     "Variant_Type",
     "Reference_Allele",
     "Tumor_Seq_Allele1",
     "Tumor_Seq_Allele2",
     "dbSNP_RS",
     "dbSNP_Val_Status"
 ]
 
 # these fields vary between variants at the same position, they are output to the BED as a comma-separated list
 varFields = [
     "Tumor_Sample_Barcode",
     "Matched_Norm_Sample_Barcode",
     "case_id"
 ]
 
 # these clinical annotations are summarized as counts
 annotCountFields = [
     "gender",
     "project_id",
     "ethnicity",
 ]
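 # e.g. a variant seen in five samples might get a gender summary like
 # "female:3, male:2" (built in mafsToBed below)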
 
 # these clinical annotations are copied as a list into every variant
 annotListFields = [
     # from annotations.tsv
     "days_to_death",
     # from exposure.tsv
     "cigarettes_per_day",
     "weight",
     "alcohol_history",
     "alcohol_intensity",
     "bmi",
     "years_smoked",
     "height"
 ]
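 # e.g. "days_to_death" over three samples could come out as "1042,388,512"
 # (hypothetical values); overly long lists are truncated, see joinMaxLen below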
 
 
 # field descriptions for the AS file
 fieldDescs = {
 
 }
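 # a hypothetical entry would look like:
 #   fieldDescs["Hugo_Symbol"] = "HUGO gene symbol"
 # fields without an entry here fall back to the field name itself, see makeAs()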
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] inDir annotFn expFn outBb - summarize GDC MAF files to bigBed
 
     inDir will be recursively searched for .maf.gz files. Their variants will be extracted, joined
     with the clinical annotations in annotFn and the exposure annotations in expFn, and written to outBb
 
     """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
     #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
     (options, args) = parser.parse_args()
 
     if len(args)!=4:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 
 def findMafs(inDir):
     " recursively find maf.gz files under inDir "
     mafFnames = []
 
     def gotFiles(arg, dirName, fnames):
         " callback for walk, called for every file "
         for fname in fnames:
             if fname.endswith(".maf.gz"):
                 mafFnames.append( join(dirName, fname) )
 
     walk(inDir, gotFiles, None)
     return mafFnames
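 # note: os.path.walk exists only in Python 2; a minimal sketch of the same
 # search using os.walk, which also works in Python 3, would be:
 #
 #   from os import walk
 #   def findMafs(inDir):
 #       mafFnames = []
 #       for dirName, subDirs, fnames in walk(inDir):
 #           for fname in fnames:
 #               if fname.endswith(".maf.gz"):
 #                   mafFnames.append( join(dirName, fname) )
 #       return mafFnames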
 
 def openFile(fname):
     if fname.endswith(".gz"):
         return gzip.open(fname)
     else:
         return open(fname)
 
 def parseTsv(fname, Rec):
     " parse maf file, use rows to fill the Rec (~struct) and yield Recs "
     for line in openFile(fname):
         if line.startswith("#"):
             continue
         if line.startswith("Hugo_Sym"):
             headers = line.rstrip("\n").split("\t")
             assert(tuple(headers)==Rec._fields)
             continue
         row = line.rstrip("\n").split("\t")
         yield Rec(*row)
 
 def makeRec(fname):
     " return the namedtuple rec (=struct) made from first non-comment line in fname "
     for line in openFile(fname):
         if line.startswith("#"):
             continue
         headers = line.rstrip("\n").split("\t")
         MafRec = namedtuple("mafRecord", headers)
         return MafRec
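 # e.g. a header line "Hugo_Symbol\tEntrez_Gene_Id\t..." yields a namedtuple
 # class whose instances expose the columns as attributes, like var.Hugo_Symbol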
 
+def joinMaxLen(vals, maxLen=255):
+    " join strings with commas. If the result is longer than maxLen, keep only the first three values and append '...' "
+    newStr = ",".join(vals)
+    if len(newStr)>maxLen:
+        newStr = ",".join(vals[:3])+"..."
+    return newStr
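+# e.g. joinMaxLen(["TCGA-A1","TCGA-A2"]) -> "TCGA-A1,TCGA-A2" (hypothetical
+# barcodes), while a list of hundreds of sample barcodes collapses to its
+# first three values plus "...", keeping the BED field short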
+
 def mafsToBed(mafFnames, annots, outBed):
     " summarize mafFnames to outBed "
     byPosAndAllele = defaultdict(list)
     MafRec = None
     allSampleIds = set()
     # read the whole data into memory, index by chrom pos
     for mafName in mafFnames:
         if MafRec is None:
             MafRec = makeRec(mafName)
         logging.info("Reading %s" % mafName)
         lineCount = 0
         for var in parseTsv(mafName, MafRec):
             chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position
             all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2
             key = (chrom, start, end, all1, all2)
             byPosAndAllele[key].append(var)
             lineCount += 1
             allSampleIds.add(var.Tumor_Sample_Barcode)
         logging.info("Read %d rows" % lineCount)
 
     sampleCount = len(allSampleIds)
-    logging.info("Read variants from %d samples in total" % sampleCount)
+    logging.info("Read %d variants from %d samples" % (len(byPosAndAllele),sampleCount))
 
     logging.info("Writing variant summary to %s" % outBed)
     # now summarize all variants for every chrom pos
     ofh = open(outBed, "w")
     for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems():
         var1 = varList[0]._asdict()
         refAll = var1["Reference_Allele"]
         varType = var1["Variant_Type"]
 
         if varType=="SNP":
             assert(refAll==all1)
             #if all2=="":
             name = refAll+">"+all2
             #else:
             #name = refAll+">"+all1+"/"+all2
         elif varType=="DEL":
             name = "del"+refAll
         elif varType=="INS":
             name = "ins"+all2
         else:
             invalidType
 
         varSampleCount = len(varList) # score is number of variants with this nucl change
         score = min(1000, varSampleCount) # don't exceed 1000
 
         ftLen = str(int(end)-int(start))
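         # bed12 fields: thickStart/thickEnd span the whole feature, itemRgb is
         # black and a single block covers the full feature length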
         row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"]
 
         # add the count/frequency fields that we derive from the data
         row.append( str(varSampleCount) ) # raw count
         row.append( str( float(varSampleCount) / sampleCount ) ) # frequency
 
         # a few fields are always the same, so we just add the value from the first variant
         for fieldName in fixedFields:
             row.append(var1[fieldName])
 
         # some annotations are summarized as counts
         for summCountField in annotCountFields:
             vals = []
             for var in varList:
                 annot = annots[var.case_id]
                 vals.append(annot[summCountField])
 
             mostCommon = Counter(vals).most_common()
             mostCommStrings = ["%s:%d" % (val, count) for val, count in mostCommon]
             countStr = ", ".join(mostCommStrings)
             row.append(countStr)
 
         # some annotations are copied as a long list
         for fieldName in annotListFields:
             vals = []
             for var in varList:
                 annot = annots[var.case_id]
                 vals.append(annot[fieldName])
 
-            row.append(",".join(vals))
+            row.append(joinMaxLen(vals))
 
         # first convert structs to dicts
         dictList = []
         for var in varList:
             dictList.append(var._asdict())
 
         # some variant values are copied as a long list 
         for fieldName in varFields:
             valList = []
             for d in dictList:
                 valList.append(d[fieldName])
-            valStr = ",".join(valList)
+            valStr = joinMaxLen(valList)
             row.append(valStr)
 
         ofh.write("\t".join(row))
         ofh.write("\n")
 
     ofh.close()
     logging.info("Created file %s" % ofh.name)
 
 def makeAs(extraFields, outFname):
     " write the .as file to outFname "
     minAs = """table bed12Mafs
 "somatic variants converted from MAF files obtained through the NCI GDC"
     (
     string chrom;      "Chromosome (or contig, scaffold, etc.)"
     uint   chromStart; "Start position in chromosome"
     uint   chromEnd;   "End position in chromosome"
     string name;       "Name of item"
     uint   score;      "Score from 0-1000"
     char[1] strand;    "+ or -"
     uint thickStart;   "Start of where display should be thick (start codon)"
     uint thickEnd;     "End of where display should be thick (stop codon)"
     uint reserved;     "Used as itemRgb as of 2004-11-22"
     int blockCount;    "Number of blocks"
     int[blockCount] blockSizes; "Comma separated list of block sizes"
     int[blockCount] chromStarts; "Start positions relative to chromStart"
     uint sampleCount;    "Number of samples with this variant"
     float freq;    "Frequency of this variant across all samples"
     """
     ofh = open(outFname, "w")
     ofh.write(minAs)
 
     for fieldList in extraFields:
         for fieldName in fieldList:
             fieldType = "lstring"
             fieldDesc = fieldDescs.get(fieldName, fieldName)
             ofh.write('    %s %s; "%s"\n' % (fieldType, fieldName, fieldDesc))
     ofh.write(")\n")
     ofh.close()
     logging.info("Wrote %s" % ofh.name)
 
 def bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb):
     " "
     logging.info("Converting to bigBed")
     cmd = "sort -k1,1 -k2,2n %s > %s" % (outBed, outBedSorted)
     assert(system(cmd)==0)
 
     cmd = "bedToBigBed %s %s  -type=bed12+ -as=%s -tab %s" % \
         (outBedSorted, chromSizes, outAs, outBb)
     assert(system(cmd)==0)
     logging.info("Created %s" % outBb)
 
 def parseAnnots(annotFn):
     " read clinical annotations and return as dict caseId -> namedtuple "
     Rec = makeRec(annotFn)
     ret = {}
     for annot in parseTsv(annotFn, Rec):
         caseId = annot.case_id
         assert(caseId not in ret)
         ret[caseId] = annot
     return ret
 
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     inDir, annotFn, expFn, outBb = args
 
     annots = parseAnnots(annotFn)
     exposures = parseAnnots(expFn)
 
     # join annots and exposures into a single dict case_id -> field_name -> value
     joinedAnnots = {}
     for key, annot in annots.iteritems():
         ad = annot._asdict()
         ad.update(exposures[key]._asdict())
         joinedAnnots[key] = ad
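     # joinedAnnots now maps case_id -> dict that carries both the clinical
     # fields (e.g. gender, days_to_death) and the exposure fields (e.g. bmi)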
 
     mafFnames = findMafs(inDir)
 
     outBase = splitext(outBb)[0] # strip file extension
     outBed = outBase+".bed"
     outBedSorted = outBase+".s.bed"
     outAs = outBase+".as"
 
     mafsToBed(mafFnames, joinedAnnots, outBed)
 
     makeAs([fixedFields, annotCountFields, annotListFields, varFields], outAs)
 
-    chromSizes = "/gbdb/hg38/chrom.sizes"
+    chromSizes = "/hive/data/genomes/hg38/chrom.sizes"
     bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb)
 
 main()