87e0a09f85192a066a9814afe5ba88925398305a max Tue Feb 5 20:09:25 2019 -0800 adding converter and first demo trackDb (only Breast cancer), refs #22742 diff --git src/utils/cancerMafToBigBed src/utils/cancerMafToBigBed index 7ad60bc..55c1bf5 100755 --- src/utils/cancerMafToBigBed +++ src/utils/cancerMafToBigBed @@ -109,52 +109,61 @@ headers = line.rstrip("\n").split("\t") assert(tuple(headers)==Rec._fields) continue row = line.rstrip("\n").split("\t") yield Rec(*row) def makeRec(fname): " return the namedtuple rec (=struct) made from first non-comment line in fname " for line in openFile(fname): if line.startswith("#"): continue headers = line.rstrip("\n").split("\t") MafRec = namedtuple("mafRecord", headers) return MafRec +def joinMaxLen(vals, maxLen=255): + " join strings together. If total length is > maxLen, just take the first 3 and add ... " + newLen = len(vals[0])*len(vals) + if newLen>250: + newStr = ",".join(vals[:3]) + else: + newStr = ",".join(vals) + return newStr + def mafsToBed(mafFnames, annots, outBed): " summarize mafFnames to outBed " byPosAndAllele = defaultdict(list) MafRec = None allSampleIds = set() # read the whole data into memory, index by chrom pos for mafName in mafFnames: if MafRec is None: MafRec = makeRec(mafName) logging.info("Reading %s" % mafName) lineCount = 0 for var in parseTsv(mafName, MafRec): chrom, start, end = var.Chromosome, var.Start_Position, var.End_Position all1, all2 = var.Tumor_Seq_Allele1, var.Tumor_Seq_Allele2 key = (chrom, start, end, all1, all2) byPosAndAllele[key].append(var) lineCount += 1 allSampleIds.add(var.Tumor_Sample_Barcode) logging.info("Read %d rows" % lineCount) sampleCount = len(allSampleIds) - logging.info("Read variants from %d samples in total" % sampleCount) + logging.info("Read %d variants from %d samples" % (len(byPosAndAllele),sampleCount)) logging.info("Writing variant summary to %s" % outBed) # now summarize all variants for every chrom pos longFields = set() ofh = open(outBed, "w") for (chrom, start, end, all1, all2), varList in byPosAndAllele.iteritems(): var1 = varList[0]._asdict() refAll = var1["Reference_Allele"] varType = var1["Variant_Type"] if varType=="SNP": assert(refAll==all1) #if all2=="": name = refAll+">"+all2 #else: @@ -187,43 +196,43 @@ annot = annots[var.case_id] vals.append(annot[summCountField]) mostCommon = Counter(vals).most_common() mostCommStrings = ["%s:%d" % (name, count) for name, count in mostCommon] countStr = ", ".join(mostCommStrings) row.append(countStr) # some annotations are copied as a long list for fieldName in annotListFields: vals = [] for var in varList: annot = annots[var.case_id] vals.append(annot[fieldName]) - row.append(",".join(vals)) + row.append(joinMaxLen(vals)) # first convert structs to dicts dictList = [] for var in varList: dictList.append(var._asdict()) # some variant values are copied as a long list for fieldName in varFields: valList = [] for d in dictList: valList.append(d[fieldName]) - valStr = ",".join(valList) + valStr = joinMaxLen(valList) row.append(valStr) #if len(valStr)>256: #longFields.add(fieldName) ofh.write("\t".join(row)) ofh.write("\n") ofh.close() logging.info("Created file %s" % ofh.name) #return longFields def makeAs(extraFields, outFname): " write the .as file to outFname " minAs = """table bed12Mafs "somatic variants converted from MAF files obtained through the NCI GDC" @@ -293,19 +302,19 @@ ad = annot._asdict() ad.update(exposures[key]._asdict()) joinedAnnots[key] = ad mafFnames = findMafs(inDir) outBase = splitext(outBb)[0] # strip file extension outBed = outBase+".bed" outBedSorted = outBase+".s.bed" outAs = outBase+".as" mafsToBed(mafFnames, joinedAnnots, outBed) makeAs([fixedFields, annotCountFields, annotListFields, varFields], outAs) - chromSizes = "/gbdb/hg38/chrom.sizes" + chromSizes = "/hive/data/genomes/hg38/chrom.sizes" bedToBb(outBed, outBedSorted, chromSizes, outAs, outBb) main()