src/hg/tcga/scripts/segToBed15.py 1.2
1.2 2009/09/18 23:40:43 jsanborn
fixed bug with sparse data
Index: src/hg/tcga/scripts/segToBed15.py
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/scripts/segToBed15.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/tcga/scripts/segToBed15.py 6 Aug 2009 22:56:50 -0000 1.1
+++ src/hg/tcga/scripts/segToBed15.py 18 Sep 2009 23:40:43 -0000 1.2
@@ -1,195 +1,199 @@
import sys, os, string, math
MISSING_VAL = -99
class dummy:
pass
def readSegFile(fname):
inFile = open(fname)
line = inFile.readline()
segData = {}
for line in inFile:
data = line[:-1].split('\t')
sample = data[0]
chrom = "chr" + data[1]
start = int(data[2])
stop = int(data[3])
- val = float(data[5])
+ val = float(data[4])
if sample not in segData:
segData[sample] = {}
if chrom not in segData[sample]:
segData[sample][chrom] = dummy()
segData[sample][chrom].starts = []
segData[sample][chrom].stops = []
segData[sample][chrom].vals = []
segData[sample][chrom].starts.append(start)
segData[sample][chrom].stops.append(stop)
segData[sample][chrom].vals.append(val)
inFile.close()
return segData
def findBreaks(segData):
breaks = {}
for s in segData:
for c in segData[s]:
if c not in breaks:
breaks[c] = {}
for start in segData[s][c].starts:
breaks[c][start] = 1
for stop in segData[s][c].stops:
breaks[c][stop] = 1
sortBreaks = {}
for c in breaks:
tmp = breaks[c].keys()
tmp.sort()
sortBreaks[c] = tmp
return sortBreaks
def mapBreaks(segData, breaks):
mapped = {}
for c in segData:
starts = []
stops = []
vals = []
for i in range(len(breaks[c])-1):
starts.append(breaks[c][i])
stops.append(breaks[c][i+1])
vals.append(MISSING_VAL)
if c not in mapped:
mapped[c] = dummy()
mapped[c].starts = starts
mapped[c].stops = stops
mapped[c].vals = vals
curIndex = 0
for i in range(len(segData[c].starts)):
start = segData[c].starts[i]
stop = segData[c].stops[i]
val = segData[c].vals[i]
while curIndex < len(mapped[c].starts) and mapped[c].starts[curIndex] < start:
curIndex += 1
while curIndex < len(mapped[c].stops) and mapped[c].stops[curIndex] <= stop:
mapped[c].vals[curIndex] = val
curIndex += 1
return mapped
def mappedToBed15(prefix, mapped):
outname = prefix + '.bed'
outFile = open(outname, 'w')
samples = mapped.keys()
expIds = []
for i in range(len(samples)):
expIds.append(str(i))
expIds = ','.join(expIds)
numSamples = str(len(samples))
sample = samples[0]
mp0 = mapped[sample]
chroms = mp0.keys()
for c in chroms:
for i in range(len(mp0[c].starts)):
start = str(mp0[c].starts[i])
stop = str(mp0[c].stops[i])
length = str(mp0[c].stops[i] - mp0[c].starts[i])
name = c + '_' + start + '_' + stop
s = c + '\t' + start + '\t' + stop + '\t' + name + '\t'
s += '1000' + '\t' + '+' + '\t' + start + '\t' + stop + '\t'
s += '0' + '\t' + '1' + '\t' + length + '\t' + '0,' + '\t' + numSamples + '\t'
vals = []
for sample in samples:
+ if c in mapped[sample]:
val = mapped[sample][c].vals[i]
+ else:
+ val = MISSING_VAL
+
if val == MISSING_VAL:
valStr = ''
else:
valStr = str(val)
vals.append(valStr)
s += expIds + '\t' + ','.join(vals) + '\n'
outFile.write(s)
outFile.close()
# write out microarray groups entry
outname = prefix + '_entry.ra'
outFile = open(outname, 'w')
namesStr = "names " + ",".join(samples) + "\n"
groupSizes = "1" * len(samples)
groupSizesStr = "groupSizes " + ",".join(groupSizes) + "\n"
expIds = []
for i in range(len(samples)):
expIds.append(str(i))
expIdsStr = "expIds " + ",".join(expIds) + "\n"
s = "name " + prefix + "Groups\n"
s += "type groupings\n"
s += "all " + prefix + "All\n"
s += "\n"
s += "name " + prefix + "All\n"
s += "type all\n"
s += "description All Arrays\n"
s += expIdsStr
s += groupSizesStr
s += namesStr
outFile.write(s)
outFile.close()
def main(fname, prefix):
segData = readSegFile(fname)
print "num samples", len(segData)
breaks = findBreaks(segData)
numBreaks = 0
for c in breaks:
numBreaks += len(breaks[c])
print "num breaks", numBreaks
count = 0
mapped = {}
for s in segData:
print s
mapped[s] = mapBreaks(segData[s], breaks)
# if count > 5:
# break
# count += 1
mappedToBed15(prefix, mapped)
if __name__ == "__main__":
if len(sys.argv) != 3:
print "python segToBed15.py filename file_prefix"
sys.exit(0)
fname = sys.argv[1]
prefix = sys.argv[2]
main(fname, prefix)