174946a4b501d43409f582bfe2ef0838a30acd6e max Sat Feb 2 19:35:11 2019 +0100 adding annotations to cancerMafToBigBed diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed index 4eb8e8a..7ad60bc 100755 --- src/utils/cancerMafToBigBed +++ src/utils/cancerMafToBigBed @@ -1,239 +1,311 @@ #!/usr/bin/env python +# possible features: +# - add counts per diagnosis + import logging, sys, optparse, gzip -from collections import defaultdict, namedtuple +from collections import defaultdict, namedtuple, Counter from os.path import join, basename, dirname, isfile, walk, splitext from os import system # these fields are identical for all variants at the same position. # only a single value is output to the bed fixedFields = [ "Hugo_Symbol", "Entrez_Gene_Id", "Variant_Classification", "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status" ] # these fields vary between variants at the same position, they are output to the BED as a comma-separated list varFields = [ "Tumor_Sample_Barcode", - "Matched_Norm_Sample_Barcode" + "Matched_Norm_Sample_Barcode", + "case_id" +] + +# these clinical annotations are summarized as counts +annotCountFields = [ + "gender", + "project_id", + "ethnicity", +] + +# these clinical annotations are copied as a list into every variant +annotListFields = [ + # from annotations.tsv + "days_to_death", + # from exposure.tsv + "cigarettes_per_day", + "weight", + "alcohol_history", + "alcohol_intensity", + "bmi", + "years_smoked", + "height" ] + # field descriptions for the AS file fieldDescs = { } # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] inDir outBb - summarize GDC MAF files to bigBed inDir will be recursively searched for .maf.gz files. Their variants will be extracted, written to outBb """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") #parser.add_option("", "--test", dest="test", action="store_true", help="do something") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def findMafs(inDir): " recursively find maf.gz files under inDir " mafFnames = [] def gotFiles(arg, dirName, fnames): - " callback for walk " + " callback for walk, called for every file " for fname in fnames: if fname.endswith(".maf.gz"): mafFnames.append( join(dirName, fname) ) walk(inDir, gotFiles, None) return mafFnames -def parseMaf(fname, MafRec): - " parse maf file, yield a dict " - for line in gzip.open(fname): +def openFile(fname): + if fname.endswith(".gz"): + return gzip.open(fname) + else: + return open(fname) + +def parseTsv(fname, Rec): + " parse maf file, use rows to fill the Rec (~struct) and yield Recs " + for line in openFile(fname): if line.startswith("#"): continue if line.startswith("Hugo_Sym"): headers = line.rstrip("\n").split("\t") - assert(tuple(headers)==MafRec._fields) + assert(tuple(headers)==Rec._fields) continue row = line.rstrip("\n").split("\t") - #yield dict(zip(headers, row)) - yield MafRec(*row) + yield Rec(*row) def makeRec(fname): " return the namedtuple rec (=struct) made from first non-comment line in fname " - for line in gzip.open(fname): + for line in openFile(fname): if line.startswith("#"): continue headers = line.rstrip("\n").split("\t") MafRec = namedtuple("mafRecord", headers) return MafRec -def mafsToBed(mafFnames, outBed): +def mafsToBed(mafFnames, annots, outBed): " summarize mafFnames to outBed " byPosAndAllele = defaultdict(list) MafRec = None allSampleIds = set() # read the whole data into memory, index by chrom pos for mafName in mafFnames: if MafRec is None: MafRec = makeRec(mafName) logging.info("Reading %s" % mafName) lineCount = 0 - for var in parseMaf(mafName, MafRec): + for var in parseTsv(mafName, MafRec): chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2 key = (chrom, start, end, all1, all2) byPosAndAllele[key].append(var) lineCount += 1 allSampleIds.add(var.Tumor_Sample_Barcode) logging.info("Read %d rows" % lineCount) sampleCount = len(allSampleIds) logging.info("Read variants from %d samples in total" % sampleCount) logging.info("Writing variant summary to %s" % outBed) # now summarize all variants for every chrom pos longFields = set() ofh = open(outBed, "w") for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems(): var1 = varList[0]._asdict() refAll = var1["Reference_Allele"] varType = var1["Variant_Type"] if varType=="SNP": assert(refAll==all1) #if all2=="": name = refAll+">"+all2 #else: #name = refAll+">"+all1+"/"+all2 elif varType=="DEL": name = "del"+refAll elif varType=="INS": name = "ins"+all2 else: invalidType varSampleCount = len(varList) # score is number of variants with this nucl change score = min(1000, varSampleCount) # don't exceed 1000 ftLen = str(int(end)-int(start)) row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"] - row.append( str(varSampleCount) ) - row.append( str( float(varSampleCount) / sampleCount ) ) + # add the count/frequency fields that we derive from the data + row.append( str(varSampleCount) ) # raw count + row.append( str( float(varSampleCount) / sampleCount ) ) # frequency + # a few fields are always the same, so we just add the value from the first variant for fieldName in fixedFields: row.append(var1[fieldName]) - # must convert structs to dicts + # some annotations are summarized as counts + for summCountField in annotCountFields: + vals = [] + for var in varList: + annot = annots[var.case_id] + vals.append(annot[summCountField]) + + mostCommon = Counter(vals).most_common() + mostCommStrings = ["%s:%d" % (name, count) for name, count in mostCommon] + countStr = ", ".join(mostCommStrings) + row.append(countStr) + + # some annotations are copied as a long list + for fieldName in annotListFields: + vals = [] + for var in varList: + annot = annots[var.case_id] + vals.append(annot[fieldName]) + + row.append(",".join(vals)) + + # first convert structs to dicts dictList = [] for var in varList: dictList.append(var._asdict()) + # some variant values are copied as a long list for fieldName in varFields: valList = [] for d in dictList: valList.append(d[fieldName]) valStr = ",".join(valList) row.append(valStr) - if len(valStr)>256: - longFields.add(fieldName) + #if len(valStr)>256: + #longFields.add(fieldName) ofh.write("\t".join(row)) ofh.write("\n") ofh.close() logging.info("Created file %s" % ofh.name) - return longFields + #return longFields -def makeAs(extraFields, longFields, outFname): +def makeAs(extraFields, outFname): " write the .as file to outFname " minAs = """table bed12Mafs "somatic variants converted from MAF files obtained through the NCI GDC" ( string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Used as itemRgb as of 2004-11-22" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" string sampleCount; "number of samples with this variant" string freq; "variant frequency" """ ofh = open(outFname, "w") ofh.write(minAs) for fieldList in extraFields: for fieldName in fieldList: - fieldType = "string" - if fieldName in longFields: fieldType = "lstring" + #if fieldName in longFields: + #fieldType = "lstring" fieldDesc = fieldDescs.get(fieldName, fieldName) ofh.write(' %s %s; "%s"\n' % (fieldType, fieldName, fieldDesc)) ofh.write(")\n") ofh.close() logging.info("Wrote %s" % ofh.name) def bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb): " " logging.info("Converting to bigBed") cmd = "sort -k1,1 -k2,2n %s > %s" % (outBed, outBedSorted) assert(system(cmd)==0) cmd = "bedToBigBed %s %s -type=bed12+ -as=%s -tab %s" % \ (outBedSorted, chromSizes, outAs, outBb) assert(system(cmd)==0) logging.info("Created %s" % outBb) +def parseAnnots(annotFn): + " read clinical annotations and return as dict caseId -> namedtuple " + Rec = makeRec(annotFn) + ret = {} + for annot in parseTsv(annotFn, Rec): + caseId = annot.case_id + assert(caseId not in ret) + ret[caseId] = annot + return ret + # ----------- main -------------- def main(): args, options = parseArgs() - inDir, outBb = args - #if options.test: - #logging.debug("test is set") - #f = open(options.file, "r") + inDir, annotFn, expFn, outBb = args + + annots = parseAnnots(annotFn) + exposures = parseAnnots(expFn) + + # join annots and exposures into a single dict case_id -> field_name -> value + joinedAnnots = {} + for key, annot in annots.iteritems(): + ad = annot._asdict() + ad.update(exposures[key]._asdict()) + joinedAnnots[key] = ad mafFnames = findMafs(inDir) outBase = splitext(outBb)[0] # strip file extension outBed = outBase+".bed" outBedSorted = outBase+".s.bed" outAs = outBase+".as" - longFields = mafsToBed(mafFnames, outBed) + mafsToBed(mafFnames, joinedAnnots, outBed) - makeAs([fixedFields, varFields], longFields, outAs) + makeAs([fixedFields, annotCountFields, annotListFields, varFields], outAs) chromSizes = "/gbdb/hg38/chrom.sizes" bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb) main()