src/utils/cancerMafToBigBed 87e0a09f85192a066a9814afe5ba88925398305a

87e0a09f85192a066a9814afe5ba88925398305a
max
  Tue Feb 5 20:09:25 2019 -0800
adding converter and first demo trackDb (only Breast cancer), refs #22742

diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed
index 7ad60bc..55c1bf5 100755
--- src/utils/cancerMafToBigBed
+++ src/utils/cancerMafToBigBed
@@ -109,52 +109,61 @@
             headers = line.rstrip("\n").split("\t")
             assert(tuple(headers)==Rec._fields)
             continue
         row = line.rstrip("\n").split("\t")
         yield Rec(*row)
 
 def makeRec(fname):
     " return the namedtuple rec (=struct) made from first non-comment line in fname "
     for line in openFile(fname):
         if line.startswith("#"):
             continue
         headers = line.rstrip("\n").split("\t")
         MafRec = namedtuple("mafRecord", headers)
         return MafRec
 
+def joinMaxLen(vals, maxLen=255):
+    " join strings together. If total length is > maxLen, just take the first 3 and add ... "
+    newLen = len(vals[0])*len(vals)
+    if newLen>250:
+        newStr = ",".join(vals[:3])
+    else:
+        newStr = ",".join(vals)
+    return newStr
+
 def mafsToBed(mafFnames, annots, outBed):
     " summarize mafFnames to outBed "
     byPosAndAllele = defaultdict(list)
     MafRec = None
     allSampleIds = set()
     # read the whole data into memory, index by chrom pos
     for mafName in mafFnames:
         if MafRec is None:
             MafRec = makeRec(mafName)
         logging.info("Reading %s" % mafName)
         lineCount = 0
         for var in parseTsv(mafName, MafRec):
             chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position
             all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2
             key = (chrom, start, end, all1, all2)
             byPosAndAllele[key].append(var)
             lineCount += 1
             allSampleIds.add(var.Tumor_Sample_Barcode)
         logging.info("Read %d rows" % lineCount)
 
     sampleCount = len(allSampleIds)
-    logging.info("Read variants from %d samples in total" % sampleCount)
+    logging.info("Read %d variants from %d samples" % (len(byPosAndAllele),sampleCount))
 
     logging.info("Writing variant summary to %s" % outBed)
     # now summarize all variants for every chrom pos
     longFields = set()
     ofh = open(outBed, "w")
     for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems():
         var1 = varList[0]._asdict()
         refAll = var1["Reference_Allele"]
         varType = var1["Variant_Type"]
 
         if varType=="SNP":
             assert(refAll==all1)
             #if all2=="":
             name = refAll+">"+all2
             #else:
@@ -187,43 +196,43 @@
                 annot = annots[var.case_id]
                 vals.append(annot[summCountField])
 
             mostCommon = Counter(vals).most_common()
             mostCommStrings = ["%s:%d" % (name, count) for name, count in mostCommon]
             countStr = ", ".join(mostCommStrings)
             row.append(countStr)
 
         # some annotations are copied as a long list
         for fieldName in annotListFields:
             vals = []
             for var in varList:
                 annot = annots[var.case_id]
                 vals.append(annot[fieldName])
 
-            row.append(",".join(vals))
+            row.append(joinMaxLen(vals))
 
         # first convert structs to dicts
         dictList = []
         for var in varList:
             dictList.append(var._asdict())
 
         # some variant values are copied as a long list 
         for fieldName in varFields:
             valList = []
             for d in dictList:
                 valList.append(d[fieldName])
-            valStr = ",".join(valList)
+            valStr = joinMaxLen(valList)
             row.append(valStr)
             #if len(valStr)>256:
                 #longFields.add(fieldName)
 
         ofh.write("\t".join(row))
         ofh.write("\n")
 
     ofh.close()
     logging.info("Created file %s" % ofh.name)
     #return longFields
 
 def makeAs(extraFields, outFname):
     " write the .as file to outFname "
     minAs = """table bed12Mafs
 "somatic variants converted from MAF files obtained through the NCI GDC"
@@ -293,19 +302,19 @@
         ad = annot._asdict()
         ad.update(exposures[key]._asdict())
         joinedAnnots[key] = ad
 
     mafFnames = findMafs(inDir)
 
     outBase = splitext(outBb)[0] # strip file extension
     outBed = outBase+".bed"
     outBedSorted = outBase+".s.bed"
     outAs = outBase+".as"
 
     mafsToBed(mafFnames, joinedAnnots, outBed)
 
     makeAs([fixedFields, annotCountFields, annotListFields, varFields], outAs)
 
-    chromSizes = "/gbdb/hg38/chrom.sizes"
+    chromSizes = "/hive/data/genomes/hg38/chrom.sizes"
     bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb)
 
 main()