src/hg/tcga/scripts/segToBed15.py 1.1

1.1 2009/08/06 22:56:50 jsanborn
initial commit
Index: src/hg/tcga/scripts/segToBed15.py
===================================================================
RCS file: src/hg/tcga/scripts/segToBed15.py
diff -N src/hg/tcga/scripts/segToBed15.py
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/scripts/segToBed15.py	6 Aug 2009 22:56:50 -0000	1.1
@@ -0,0 +1,195 @@
+import sys, os, string, math
+
+MISSING_VAL = -99
+
+class dummy:
+    pass
+
+
+def readSegFile(fname):
+    inFile = open(fname)
+
+    line = inFile.readline()
+    segData = {}
+    for line in inFile:
+        data = line[:-1].split('\t')
+
+        sample = data[0]
+        chrom = "chr" + data[1]
+        start = int(data[2])
+        stop = int(data[3])
+        val = float(data[5])
+
+        if sample not in segData:
+            segData[sample] = {}
+
+        if chrom not in segData[sample]:
+            segData[sample][chrom] = dummy()
+            segData[sample][chrom].starts = []
+            segData[sample][chrom].stops = []
+            segData[sample][chrom].vals = []
+        segData[sample][chrom].starts.append(start)
+        segData[sample][chrom].stops.append(stop)
+        segData[sample][chrom].vals.append(val)
+    inFile.close()
+    return segData
+
+
+def findBreaks(segData):
+
+    breaks = {}
+
+    for s in segData:
+        for c in segData[s]:
+            if c not in breaks:
+                breaks[c] = {}
+            for start in segData[s][c].starts:
+                breaks[c][start] = 1
+            for stop in segData[s][c].stops:
+                breaks[c][stop] = 1
+
+    sortBreaks = {}
+    for c in breaks:
+        tmp = breaks[c].keys()
+        tmp.sort()
+        sortBreaks[c] = tmp
+
+    return sortBreaks
+
+
+def mapBreaks(segData, breaks):
+    mapped = {}
+
+    for c in segData:
+        starts = []
+        stops = []
+        vals = []
+        for i in range(len(breaks[c])-1):
+            starts.append(breaks[c][i])
+            stops.append(breaks[c][i+1])
+            vals.append(MISSING_VAL)
+            
+        if c not in mapped:
+            mapped[c] = dummy()
+            mapped[c].starts = starts
+            mapped[c].stops = stops
+            mapped[c].vals = vals
+            
+        curIndex = 0
+        for i in range(len(segData[c].starts)):
+            start = segData[c].starts[i]
+            stop = segData[c].stops[i]
+            val = segData[c].vals[i]
+
+            while curIndex < len(mapped[c].starts) and mapped[c].starts[curIndex] < start:
+                curIndex += 1
+
+            while curIndex < len(mapped[c].stops) and mapped[c].stops[curIndex] <= stop:
+                mapped[c].vals[curIndex] = val
+                curIndex += 1
+
+    return mapped
+
+def mappedToBed15(prefix, mapped):
+
+    outname = prefix + '.bed'
+    outFile = open(outname, 'w')
+
+    samples = mapped.keys()
+
+    expIds = []
+    for i in range(len(samples)):
+        expIds.append(str(i))
+    expIds = ','.join(expIds)
+    numSamples = str(len(samples))
+    
+    sample = samples[0]
+    mp0 = mapped[sample]
+    chroms = mp0.keys()
+
+    for c in chroms:
+        for i in range(len(mp0[c].starts)):
+            start = str(mp0[c].starts[i])
+            stop = str(mp0[c].stops[i])
+            length = str(mp0[c].stops[i] - mp0[c].starts[i])
+            name = c + '_' + start + '_' + stop
+
+            s = c + '\t' + start + '\t' + stop + '\t' + name + '\t'
+            s += '1000' + '\t' + '+' + '\t' + start + '\t' + stop + '\t'
+            s += '0' + '\t' + '1' + '\t' + length + '\t' + '0,' + '\t' + numSamples + '\t'
+            vals = []
+            for sample in samples:
+                val = mapped[sample][c].vals[i]
+                if val == MISSING_VAL:
+                    valStr = ''
+                else:
+                    valStr = str(val)
+                vals.append(valStr)
+
+            s += expIds + '\t' + ','.join(vals) + '\n'
+            outFile.write(s)            
+    outFile.close()
+
+    # write out microarray groups entry
+    outname = prefix + '_entry.ra'
+    outFile = open(outname, 'w')
+
+    namesStr = "names " + ",".join(samples) + "\n"
+    
+    groupSizes = "1" * len(samples)
+    groupSizesStr = "groupSizes " + ",".join(groupSizes) + "\n"
+    
+    expIds = []
+    for i in range(len(samples)):
+        expIds.append(str(i))
+        
+    expIdsStr = "expIds " + ",".join(expIds) + "\n"
+    
+    s = "name " + prefix + "Groups\n"
+    s += "type groupings\n"
+    s += "all " + prefix + "All\n"
+    s += "\n"
+    s += "name " + prefix + "All\n"
+    s += "type all\n"
+    s += "description All Arrays\n"
+    s += expIdsStr
+    s += groupSizesStr
+    s += namesStr
+    
+    outFile.write(s)
+    outFile.close()
+
+def main(fname, prefix):
+    segData = readSegFile(fname)
+
+    print "num samples", len(segData)
+    
+    breaks = findBreaks(segData)
+
+    numBreaks = 0
+    for c in breaks:
+        numBreaks += len(breaks[c])
+    print "num breaks", numBreaks
+
+    count = 0
+    mapped = {}
+    for s in segData:
+        print s
+        mapped[s] = mapBreaks(segData[s], breaks)
+
+#        if count > 5:
+#            break
+#        count += 1
+        
+    mappedToBed15(prefix, mapped)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print "python segToBed15.py filename file_prefix"
+        sys.exit(0)
+
+    fname = sys.argv[1]
+    prefix = sys.argv[2]
+    main(fname, prefix)
+