src/hg/tcga/scripts/segToBed15.py 1.2

1.2 2009/09/18 23:40:43 jsanborn
fixed bug with sparse data
Index: src/hg/tcga/scripts/segToBed15.py
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/scripts/segToBed15.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/tcga/scripts/segToBed15.py	6 Aug 2009 22:56:50 -0000	1.1
+++ src/hg/tcga/scripts/segToBed15.py	18 Sep 2009 23:40:43 -0000	1.2
@@ -1,195 +1,199 @@
 import sys, os, string, math
 
 MISSING_VAL = -99
 
 class dummy:
     pass
 
 
 def readSegFile(fname):
     inFile = open(fname)
 
     line = inFile.readline()
     segData = {}
     for line in inFile:
         data = line[:-1].split('\t')
 
         sample = data[0]
         chrom = "chr" + data[1]
         start = int(data[2])
         stop = int(data[3])
-        val = float(data[5])
+        val = float(data[4])
 
         if sample not in segData:
             segData[sample] = {}
 
         if chrom not in segData[sample]:
             segData[sample][chrom] = dummy()
             segData[sample][chrom].starts = []
             segData[sample][chrom].stops = []
             segData[sample][chrom].vals = []
         segData[sample][chrom].starts.append(start)
         segData[sample][chrom].stops.append(stop)
         segData[sample][chrom].vals.append(val)
     inFile.close()
     return segData
 
 
 def findBreaks(segData):
 
     breaks = {}
 
     for s in segData:
         for c in segData[s]:
             if c not in breaks:
                 breaks[c] = {}
             for start in segData[s][c].starts:
                 breaks[c][start] = 1
             for stop in segData[s][c].stops:
                 breaks[c][stop] = 1
 
     sortBreaks = {}
     for c in breaks:
         tmp = breaks[c].keys()
         tmp.sort()
         sortBreaks[c] = tmp
 
     return sortBreaks
 
 
 def mapBreaks(segData, breaks):
     mapped = {}
 
     for c in segData:
         starts = []
         stops = []
         vals = []
         for i in range(len(breaks[c])-1):
             starts.append(breaks[c][i])
             stops.append(breaks[c][i+1])
             vals.append(MISSING_VAL)
             
         if c not in mapped:
             mapped[c] = dummy()
             mapped[c].starts = starts
             mapped[c].stops = stops
             mapped[c].vals = vals
             
         curIndex = 0
         for i in range(len(segData[c].starts)):
             start = segData[c].starts[i]
             stop = segData[c].stops[i]
             val = segData[c].vals[i]
 
             while curIndex < len(mapped[c].starts) and mapped[c].starts[curIndex] < start:
                 curIndex += 1
 
             while curIndex < len(mapped[c].stops) and mapped[c].stops[curIndex] <= stop:
                 mapped[c].vals[curIndex] = val
                 curIndex += 1
 
     return mapped
 
 def mappedToBed15(prefix, mapped):
 
     outname = prefix + '.bed'
     outFile = open(outname, 'w')
 
     samples = mapped.keys()
 
     expIds = []
     for i in range(len(samples)):
         expIds.append(str(i))
     expIds = ','.join(expIds)
     numSamples = str(len(samples))
     
     sample = samples[0]
     mp0 = mapped[sample]
     chroms = mp0.keys()
 
     for c in chroms:
         for i in range(len(mp0[c].starts)):
             start = str(mp0[c].starts[i])
             stop = str(mp0[c].stops[i])
             length = str(mp0[c].stops[i] - mp0[c].starts[i])
             name = c + '_' + start + '_' + stop
 
             s = c + '\t' + start + '\t' + stop + '\t' + name + '\t'
             s += '1000' + '\t' + '+' + '\t' + start + '\t' + stop + '\t'
             s += '0' + '\t' + '1' + '\t' + length + '\t' + '0,' + '\t' + numSamples + '\t'
             vals = []
             for sample in samples:
+                if c in mapped[sample]:
                 val = mapped[sample][c].vals[i]
+                else:
+                    val = MISSING_VAL
+                    
                 if val == MISSING_VAL:
                     valStr = ''
                 else:
                     valStr = str(val)
                 vals.append(valStr)
 
             s += expIds + '\t' + ','.join(vals) + '\n'
             outFile.write(s)            
     outFile.close()
 
     # write out microarray groups entry
     outname = prefix + '_entry.ra'
     outFile = open(outname, 'w')
 
     namesStr = "names " + ",".join(samples) + "\n"
     
     groupSizes = "1" * len(samples)
     groupSizesStr = "groupSizes " + ",".join(groupSizes) + "\n"
     
     expIds = []
     for i in range(len(samples)):
         expIds.append(str(i))
         
     expIdsStr = "expIds " + ",".join(expIds) + "\n"
     
     s = "name " + prefix + "Groups\n"
     s += "type groupings\n"
     s += "all " + prefix + "All\n"
     s += "\n"
     s += "name " + prefix + "All\n"
     s += "type all\n"
     s += "description All Arrays\n"
     s += expIdsStr
     s += groupSizesStr
     s += namesStr
     
     outFile.write(s)
     outFile.close()
 
 def main(fname, prefix):
     segData = readSegFile(fname)
 
     print "num samples", len(segData)
     
     breaks = findBreaks(segData)
 
     numBreaks = 0
     for c in breaks:
         numBreaks += len(breaks[c])
     print "num breaks", numBreaks
 
     count = 0
     mapped = {}
     for s in segData:
         print s
         mapped[s] = mapBreaks(segData[s], breaks)
 
 #        if count > 5:
 #            break
 #        count += 1
         
     mappedToBed15(prefix, mapped)
 
 
 if __name__ == "__main__":
     if len(sys.argv) != 3:
         print "python segToBed15.py filename file_prefix"
         sys.exit(0)
 
     fname = sys.argv[1]
     prefix = sys.argv[2]
     main(fname, prefix)