7c0d137acafd9e780b872808a80b320564579944 braney Mon Apr 22 14:54:22 2019 -0700 some tweaks to get the format more in keeping with what hgc wants diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed index 5db043f..bf84e2d 100755 --- src/utils/cancerMafToBigBed +++ src/utils/cancerMafToBigBed @@ -1,324 +1,321 @@ #!/usr/bin/env python # possible features: # - add counts per diagnosis import logging, sys, optparse, gzip from collections import defaultdict, namedtuple, Counter from os.path import join, basename, dirname, isfile, walk, splitext from os import system # these fields are identical for all variants at the same position. # only a single value is output to the bed fixedFields = [ "Hugo_Symbol", "Entrez_Gene_Id", "Variant_Classification", "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status" ] # these fields vary between variants at the same position, they are output to the BED as a comma-separated list varFields = [ "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode", "case_id" ] # these clinical annotations are summarized as counts annotCountFields = [ - "gender", - "project_id", - "ethnicity", ] # these clinical annotations are copied as a list into every variant annotListFields = [ # from annotations.tsv "days_to_death", # from exposure.tsv "cigarettes_per_day", "weight", "alcohol_history", "alcohol_intensity", "bmi", "years_smoked", - "height" + "height", + "gender", + "project_id", + "ethnicity", ] # field descriptions for the AS file fieldDescs = { } # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] inDir annotationFile exposureFile outBb - summarize GDC MAF files to bigBed inDir will be recursively searched for .maf.gz files. Their variants will be extracted, written to outBb annotationFile and exposureFile are two .tsv files, both are distributed by the GDC. See /hive/data/outside/gdc/pancan33-variants/annotations/ for the versions from Jan 2019 """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") #parser.add_option("", "--test", dest="test", action="store_true", help="do something") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def findMafs(inDir): " recursively find maf.gz files under inDir " mafFnames = [] def gotFiles(arg, dirName, fnames): " callback for walk, called for every file " for fname in fnames: if fname.endswith(".maf.gz"): mafFnames.append( join(dirName, fname) ) walk(inDir, gotFiles, None) return mafFnames def openFile(fname): if fname.endswith(".gz"): return gzip.open(fname) else: return open(fname) def parseTsv(fname, Rec): " parse maf file, use rows to fill the Rec (~struct) and yield Recs " for line in openFile(fname): if line.startswith("#"): continue if line.startswith("Hugo_Sym"): headers = line.rstrip("\n").split("\t") assert(tuple(headers)==Rec._fields) continue row = line.rstrip("\n").split("\t") yield Rec(*row) def makeRec(fname): " return the namedtuple rec (=struct) made from first non-comment line in fname " for line in openFile(fname): if line.startswith("#"): continue headers = line.rstrip("\n").split("\t") MafRec = namedtuple("mafRecord", headers) return MafRec def joinMaxLen(vals, maxLen=255): " join strings together. If total length is > maxLen, just take the first 3 and add ... " newLen = len(vals[0])*len(vals) - if newLen>250: - newStr = ",".join(vals[:3]) - else: newStr = ",".join(vals) return newStr def mafsToBed(mafFnames, annots, outBed): " summarize mafFnames to outBed " byPosAndAllele = defaultdict(list) MafRec = None allSampleIds = set() # read the whole data into memory, index by chrom pos for mafName in mafFnames: if MafRec is None: MafRec = makeRec(mafName) logging.info("Reading %s" % mafName) lineCount = 0 for var in parseTsv(mafName, MafRec): chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2 key = (chrom, start, end, all1, all2) byPosAndAllele[key].append(var) lineCount += 1 allSampleIds.add(var.Tumor_Sample_Barcode) logging.info("Read %d rows" % lineCount) sampleCount = len(allSampleIds) logging.info("Read %d variants from %d samples" % (len(byPosAndAllele),sampleCount)) logging.info("Writing variant summary to %s" % outBed) # now summarize all variants for every chrom pos longFields = set() ofh = open(outBed, "w") for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems(): var1 = varList[0]._asdict() refAll = var1["Reference_Allele"] varType = var1["Variant_Type"] if varType=="SNP": assert(refAll==all1) #if all2=="": name = refAll+">"+all2 #else: #name = refAll+">"+all1+"/"+all2 elif varType=="DEL": name = "del"+refAll elif varType=="INS": name = "ins"+all2 else: invalidType varSampleCount = len(varList) # score is number of variants with this nucl change score = min(1000, varSampleCount) # don't exceed 1000 - ftLen = str(int(end)-int(start)) - row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"] + ftLen = str(int(end)-int(start) + 1) + row = [chrom, str(int(start) - 1), end, name, str(score), ".", str(int(start) - 1), end, "0,0,0", "1", ftLen, "0"] # add the count/frequency fields that we derive from the data row.append( str(varSampleCount) ) # raw count row.append( str( float(varSampleCount) / sampleCount ) ) # frequency # a few fields are always the same, so we just add the value from the first variant for fieldName in fixedFields: row.append(var1[fieldName]) # some annotations are summarized as counts for summCountField in annotCountFields: vals = [] for var in varList: annot = annots[var.case_id] vals.append(annot[summCountField]) mostCommon = Counter(vals).most_common() mostCommStrings = ["%s:%d" % (name, count) for name, count in mostCommon] countStr = ", ".join(mostCommStrings) row.append(countStr) # some annotations are copied as a long list for fieldName in annotListFields: vals = [] for var in varList: annot = annots[var.case_id] vals.append(annot[fieldName]) row.append(joinMaxLen(vals)) # first convert structs to dicts dictList = [] for var in varList: dictList.append(var._asdict()) # some variant values are copied as a long list for fieldName in varFields: valList = [] for d in dictList: valList.append(d[fieldName]) valStr = joinMaxLen(valList) row.append(valStr) #if len(valStr)>256: #longFields.add(fieldName) ofh.write("\t".join(row)) ofh.write("\n") ofh.close() logging.info("Created file %s" % ofh.name) #return longFields def makeAs(extraFields, outFname): " write the .as file to outFname " minAs = """table bed12Mafs "somatic variants converted from MAF files obtained through the NCI GDC" ( string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Used as itemRgb as of 2004-11-22" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" string sampleCount; "number of samples with this variant" string freq; "variant frequency" """ ofh = open(outFname, "w") ofh.write(minAs) for fieldList in extraFields: for fieldName in fieldList: fieldType = "lstring" #if fieldName in longFields: #fieldType = "lstring" fieldDesc = fieldDescs.get(fieldName, fieldName) ofh.write(' %s %s; "%s"\n' % (fieldType, fieldName, fieldDesc)) ofh.write(")\n") ofh.close() logging.info("Wrote %s" % ofh.name) def bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb): " " logging.info("Converting to bigBed") cmd = "sort -k1,1 -k2,2n %s > %s" % (outBed, outBedSorted) assert(system(cmd)==0) cmd = "bedToBigBed %s %s -type=bed12+ -as=%s -tab %s" % \ (outBedSorted, chromSizes, outAs, outBb) assert(system(cmd)==0) logging.info("Created %s" % outBb) def parseAnnots(annotFn): " read clinical annotations and return as dict caseId -> namedtuple " Rec = makeRec(annotFn) ret = {} for annot in parseTsv(annotFn, Rec): caseId = annot.case_id assert(caseId not in ret) ret[caseId] = annot return ret # ----------- main -------------- def main(): args, options = parseArgs() inDir, annotFn, expFn, outBb = args annots = parseAnnots(annotFn) exposures = parseAnnots(expFn) # join annots and exposures into a single dict case_id -> field_name -> value joinedAnnots = {} for key, annot in annots.iteritems(): ad = annot._asdict() ad.update(exposures[key]._asdict()) joinedAnnots[key] = ad mafFnames = findMafs(inDir) outBase = splitext(outBb)[0] # strip file extension outBed = outBase+".bed" outBedSorted = outBase+".s.bed" outAs = outBase+".as" mafsToBed(mafFnames, joinedAnnots, outBed) makeAs([fixedFields, annotCountFields, annotListFields, varFields], outAs) chromSizes = "/hive/data/genomes/hg38/chrom.sizes" bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb) main()