7c0d137acafd9e780b872808a80b320564579944 braney Mon Apr 22 14:54:22 2019 -0700 some tweaks to get the format more in keeping with what hgc wants diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed index 5db043f..bf84e2d 100755 --- src/utils/cancerMafToBigBed +++ src/utils/cancerMafToBigBed @@ -19,47 +19,47 @@ "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "dbSNP_RS", "dbSNP_Val_Status" ] # these fields vary between variants at the same position, they are output to the BED as a comma-separated list varFields = [ "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode", "case_id" ] # these clinical annotations are summarized as counts annotCountFields = [ - "gender", - "project_id", - "ethnicity", ] # these clinical annotations are copied as a list into every variant annotListFields = [ # from annotations.tsv "days_to_death", # from exposure.tsv "cigarettes_per_day", "weight", "alcohol_history", "alcohol_intensity", "bmi", "years_smoked", - "height" + "height", + "gender", + "project_id", + "ethnicity", ] # field descriptions for the AS file fieldDescs = { } # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] inDir annotationFile exposureFile outBb - summarize GDC MAF files to bigBed inDir will be recursively searched for .maf.gz files. Their variants will be extracted, written to outBb @@ -116,33 +116,30 @@ row = line.rstrip("\n").split("\t") yield Rec(*row) def makeRec(fname): " return the namedtuple rec (=struct) made from first non-comment line in fname " for line in openFile(fname): if line.startswith("#"): continue headers = line.rstrip("\n").split("\t") MafRec = namedtuple("mafRecord", headers) return MafRec def joinMaxLen(vals, maxLen=255): " join strings together. If total length is > maxLen, just take the first 3 and add ... " newLen = len(vals[0])*len(vals) - if newLen>250: - newStr = ",".join(vals[:3]) - else: newStr = ",".join(vals) return newStr def mafsToBed(mafFnames, annots, outBed): " summarize mafFnames to outBed " byPosAndAllele = defaultdict(list) MafRec = None allSampleIds = set() # read the whole data into memory, index by chrom pos for mafName in mafFnames: if MafRec is None: MafRec = makeRec(mafName) logging.info("Reading %s" % mafName) lineCount = 0 for var in parseTsv(mafName, MafRec): @@ -170,32 +167,32 @@ assert(refAll==all1) #if all2=="": name = refAll+">"+all2 #else: #name = refAll+">"+all1+"/"+all2 elif varType=="DEL": name = "del"+refAll elif varType=="INS": name = "ins"+all2 else: invalidType varSampleCount = len(varList) # score is number of variants with this nucl change score = min(1000, varSampleCount) # don't exceed 1000 - ftLen = str(int(end)-int(start)) - row = [chrom, start, end, name, str(score), ".", start, end, "0,0,0", "1", ftLen, "0"] + ftLen = str(int(end)-int(start) + 1) + row = [chrom, str(int(start) - 1), end, name, str(score), ".", str(int(start) - 1), end, "0,0,0", "1", ftLen, "0"] # add the count/frequency fields that we derive from the data row.append( str(varSampleCount) ) # raw count row.append( str( float(varSampleCount) / sampleCount ) ) # frequency # a few fields are always the same, so we just add the value from the first variant for fieldName in fixedFields: row.append(var1[fieldName]) # some annotations are summarized as counts for summCountField in annotCountFields: vals = [] for var in varList: annot = annots[var.case_id] vals.append(annot[summCountField])