src/utils/cancerMafToBigBed 7c0d137acafd9e780b872808a80b320564579944

7c0d137acafd9e780b872808a80b320564579944
braney
  Mon Apr 22 14:54:22 2019 -0700
some tweaks to get the format more in keeping with what hgc wants

diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed
index 5db043f..bf84e2d 100755
--- src/utils/cancerMafToBigBed
+++ src/utils/cancerMafToBigBed
@@ -19,47 +19,47 @@
     "Tumor_Seq_Allele1",
     "Tumor_Seq_Allele2",
     "dbSNP_RS",
     "dbSNP_Val_Status"
 ]
 
 # these fields vary between variants at the same position, they are output to the BED as a comma-separated list
 varFields = [
     "Tumor_Sample_Barcode",
     "Matched_Norm_Sample_Barcode",
     "case_id"
 ]
 
 # these clinical annotations are summarized as counts
 annotCountFields = [
-    "gender",
-    "project_id",
-    "ethnicity",
 ]
 
 # these clinical annotations are copied as a list into every variant
 annotListFields = [
     # from annotations.tsv
     "days_to_death",
     # from exposure.tsv
     "cigarettes_per_day",
     "weight",
     "alcohol_history",
     "alcohol_intensity",
     "bmi",
     "years_smoked",
-    "height"
+    "height",
+    "gender",
+    "project_id",
+    "ethnicity",
 ]
 
 
 # field descriptions for the AS file
 fieldDescs = {
 
 }
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] inDir annotationFile exposureFile outBb - summarize GDC MAF files to bigBed
 
     inDir will be recursively searched for .maf.gz files. Their variants will be extracted, written to outBb
 
@@ -116,33 +116,30 @@
         row = line.rstrip("\n").split("\t")
         yield Rec(*row)
 
 def makeRec(fname):
     " return the namedtuple rec (=struct) made from first non-comment line in fname "
     for line in openFile(fname):
         if line.startswith("#"):
             continue
         headers = line.rstrip("\n").split("\t")
         MafRec = namedtuple("mafRecord", headers)
         return MafRec
 
 def joinMaxLen(vals, maxLen=255):
     " join strings together. If total length is > maxLen, just take the first 3 and add ... "
     newLen = len(vals[0])*len(vals)
-    if newLen>250:
-        newStr = ",".join(vals[:3])
-    else:
     newStr = ",".join(vals)
     return newStr
 
 def mafsToBed(mafFnames, annots, outBed):
     " summarize mafFnames to outBed "
     byPosAndAllele = defaultdict(list)
     MafRec = None
     allSampleIds = set()
     # read the whole data into memory, index by chrom pos
     for mafName in mafFnames:
         if MafRec is None:
             MafRec = makeRec(mafName)
         logging.info("Reading %s" % mafName)
         lineCount = 0
         for var in parseTsv(mafName, MafRec):
@@ -170,32 +167,32 @@
             assert(refAll==all1)
             #if all2=="":
             name = refAll+">"+all2
             #else:
             #name = refAll+">"+all1+"/"+all2
         elif varType=="DEL":
             name = "del"+refAll
         elif varType=="INS":
             name = "ins"+all2
         else:
             invalidType
 
         varSampleCount = len(varList) # score is number of variants with this nucl change
         score = min(1000, varSampleCount) # don't exceed 1000
 
-        ftLen = str(int(end)-int(start))
-        row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"]
+        ftLen = str(int(end)-int(start) + 1)
+        row = [chrom, str(int(start) - 1), end, name, str(score), ".", str(int(start) - 1), end, "0,0,0", "1", ftLen, "0"]
 
         # add the count/frequency fields that we derive from the data
         row.append( str(varSampleCount) ) # raw count
         row.append( str( float(varSampleCount) / sampleCount ) ) # frequency
 
         # a few fields are always the same, so we just add the value from the first variant
         for fieldName in fixedFields:
             row.append(var1[fieldName])
 
         # some annotations are summarized as counts
         for summCountField in annotCountFields:
             vals = []
             for var in varList:
                 annot = annots[var.case_id]
                 vals.append(annot[summCountField])