src/hg/tcga/scripts/segToBed15.py 1.1
1.1 2009/08/06 22:56:50 jsanborn
initial commit
Index: src/hg/tcga/scripts/segToBed15.py
===================================================================
RCS file: src/hg/tcga/scripts/segToBed15.py
diff -N src/hg/tcga/scripts/segToBed15.py
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/scripts/segToBed15.py 6 Aug 2009 22:56:50 -0000 1.1
@@ -0,0 +1,195 @@
+import sys, os, string, math
+
+MISSING_VAL = -99
+
+class dummy:
+ pass
+
+
+def readSegFile(fname):
+ inFile = open(fname)
+
+ line = inFile.readline()
+ segData = {}
+ for line in inFile:
+ data = line[:-1].split('\t')
+
+ sample = data[0]
+ chrom = "chr" + data[1]
+ start = int(data[2])
+ stop = int(data[3])
+ val = float(data[5])
+
+ if sample not in segData:
+ segData[sample] = {}
+
+ if chrom not in segData[sample]:
+ segData[sample][chrom] = dummy()
+ segData[sample][chrom].starts = []
+ segData[sample][chrom].stops = []
+ segData[sample][chrom].vals = []
+ segData[sample][chrom].starts.append(start)
+ segData[sample][chrom].stops.append(stop)
+ segData[sample][chrom].vals.append(val)
+ inFile.close()
+ return segData
+
+
+def findBreaks(segData):
+
+ breaks = {}
+
+ for s in segData:
+ for c in segData[s]:
+ if c not in breaks:
+ breaks[c] = {}
+ for start in segData[s][c].starts:
+ breaks[c][start] = 1
+ for stop in segData[s][c].stops:
+ breaks[c][stop] = 1
+
+ sortBreaks = {}
+ for c in breaks:
+ tmp = breaks[c].keys()
+ tmp.sort()
+ sortBreaks[c] = tmp
+
+ return sortBreaks
+
+
+def mapBreaks(segData, breaks):
+ mapped = {}
+
+ for c in segData:
+ starts = []
+ stops = []
+ vals = []
+ for i in range(len(breaks[c])-1):
+ starts.append(breaks[c][i])
+ stops.append(breaks[c][i+1])
+ vals.append(MISSING_VAL)
+
+ if c not in mapped:
+ mapped[c] = dummy()
+ mapped[c].starts = starts
+ mapped[c].stops = stops
+ mapped[c].vals = vals
+
+ curIndex = 0
+ for i in range(len(segData[c].starts)):
+ start = segData[c].starts[i]
+ stop = segData[c].stops[i]
+ val = segData[c].vals[i]
+
+ while curIndex < len(mapped[c].starts) and mapped[c].starts[curIndex] < start:
+ curIndex += 1
+
+ while curIndex < len(mapped[c].stops) and mapped[c].stops[curIndex] <= stop:
+ mapped[c].vals[curIndex] = val
+ curIndex += 1
+
+ return mapped
+
+def mappedToBed15(prefix, mapped):
+
+ outname = prefix + '.bed'
+ outFile = open(outname, 'w')
+
+ samples = mapped.keys()
+
+ expIds = []
+ for i in range(len(samples)):
+ expIds.append(str(i))
+ expIds = ','.join(expIds)
+ numSamples = str(len(samples))
+
+ sample = samples[0]
+ mp0 = mapped[sample]
+ chroms = mp0.keys()
+
+ for c in chroms:
+ for i in range(len(mp0[c].starts)):
+ start = str(mp0[c].starts[i])
+ stop = str(mp0[c].stops[i])
+ length = str(mp0[c].stops[i] - mp0[c].starts[i])
+ name = c + '_' + start + '_' + stop
+
+ s = c + '\t' + start + '\t' + stop + '\t' + name + '\t'
+ s += '1000' + '\t' + '+' + '\t' + start + '\t' + stop + '\t'
+ s += '0' + '\t' + '1' + '\t' + length + '\t' + '0,' + '\t' + numSamples + '\t'
+ vals = []
+ for sample in samples:
+ val = mapped[sample][c].vals[i]
+ if val == MISSING_VAL:
+ valStr = ''
+ else:
+ valStr = str(val)
+ vals.append(valStr)
+
+ s += expIds + '\t' + ','.join(vals) + '\n'
+ outFile.write(s)
+ outFile.close()
+
+ # write out microarray groups entry
+ outname = prefix + '_entry.ra'
+ outFile = open(outname, 'w')
+
+ namesStr = "names " + ",".join(samples) + "\n"
+
+ groupSizes = "1" * len(samples)
+ groupSizesStr = "groupSizes " + ",".join(groupSizes) + "\n"
+
+ expIds = []
+ for i in range(len(samples)):
+ expIds.append(str(i))
+
+ expIdsStr = "expIds " + ",".join(expIds) + "\n"
+
+ s = "name " + prefix + "Groups\n"
+ s += "type groupings\n"
+ s += "all " + prefix + "All\n"
+ s += "\n"
+ s += "name " + prefix + "All\n"
+ s += "type all\n"
+ s += "description All Arrays\n"
+ s += expIdsStr
+ s += groupSizesStr
+ s += namesStr
+
+ outFile.write(s)
+ outFile.close()
+
+def main(fname, prefix):
+ segData = readSegFile(fname)
+
+ print "num samples", len(segData)
+
+ breaks = findBreaks(segData)
+
+ numBreaks = 0
+ for c in breaks:
+ numBreaks += len(breaks[c])
+ print "num breaks", numBreaks
+
+ count = 0
+ mapped = {}
+ for s in segData:
+ print s
+ mapped[s] = mapBreaks(segData[s], breaks)
+
+# if count > 5:
+# break
+# count += 1
+
+ mappedToBed15(prefix, mapped)
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ print "python segToBed15.py filename file_prefix"
+ sys.exit(0)
+
+ fname = sys.argv[1]
+ prefix = sys.argv[2]
+ main(fname, prefix)
+