src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed 26e07f7ed288daa8fd41e2ecdffc721d98a11432

26e07f7ed288daa8fd41e2ecdffc721d98a11432
ceisenhart
  Thu May 4 08:43:31 2017 -0700
Adding a new option, useMean, that lets the user choose mean or median as the middle calculation. refs #19155

diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
index ad1162b..f74e1d7 100755
--- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
+++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
@@ -21,32 +21,35 @@
             help = " The input matrix file, the first row should start with a hashtag, e.g. #." + \
                 "the samples in the first row should exactly match the ones in " + \
                 "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \
                 "the ones in the coordinate file.",
         type = argparse.FileType("r"))
     parser.add_argument ("coordinateMap",
         help = " Six column no header,  maps the column labels from the matrix to coordinates. Tab " + \
                 "separated; label, chr, strand, start coord, end coord, gene name. ",
         action = "store")
     parser.add_argument ("outputFile",
         help = " The output file. ",
         type =argparse.FileType("w"))
     # Optional arguments.  
     parser.add_argument ("--groupOrderFile",
         help = " Optional file to define the group order, list the groups in a single column in " + \
-                "the order desired. ",
+                "the order desired. The default ordering is alphabetical.",
         action = "store")
+    parser.add_argument ("--useMean",
+    help = " Calculate the group values using mean rather than median.",
+    action = "store_true")
     parser.add_argument ("--verbose",
     help = " Show runtime messages.",
     action = "store_true")
     
     parser.set_defaults(verbose = False)
     parser.set_defaults(groupOrderFile = None)
 
     if (len(sys.argv) == 1):
         parser.print_help()
         exit(1)
     options = parser.parse_args()
     return options
 
 def median(lst):
     lst = sorted(lst)
@@ -81,31 +84,31 @@
     for char in inFloat: 
         if char is ".":
             beforeDecimal = False
             result += char
             continue
         if beforeDecimal:
             result += char
         else:
             if count >= 2:
                 return result
             else:
                 count += 1
                 result += char
     return result
 
-def condenseMatrixIntoBedCols(matrix, groupOrder, sampleToGroup, validTpms, bedLikeFile):
+def condenseMatrixIntoBedCols(matrix, groupOrder, sampleToGroup, validTpms, bedLikeFile, useMean):
     """
     Take an expression matrix and a dictionary that maps the samples to groups. 
     Go through the expression matrix and calculate the average for each group, outputting
     it to an intermediate file as they are calculated. The intermediate file has
     three columns, the first is the average tpm for the entire gene, next is the
     number of groups and finally the average tpm for each group as a comma separated list. 
     validTpms - An empty list of integers. 
     sampleToGroup - A dictionary that maps string samples to string groups. 
     matrix - An expression matrix, samples are the x rows, transcripts the y rows.  
     bedLikeFile - An intermediate file, looks slightly like a bed. 
     """
     # Store some information on the bed file, most important is the order 
     # of the 8th column. 
     bedInfo = "" 
     firstLine = True
@@ -130,78 +133,95 @@
                     continue
                 group = sampleToGroup[col]
                 columnToGroup.setdefault(count, group)
                 count += 1  
             continue
         
         # Handle the tpm rows, calculating the average for each group per row.  
         groupAverages = dict()
         groupCounts = dict()
         firstCol = True 
         count = 1 
         for col in splitLine: 
             if firstCol:
                 firstCol = False
                 continue 
+            if (useMean): 
+                # First time this group is seen, add it to the groupCounts dict. 
+                if (groupAverages.get(columnToGroup[count]) == None):
+                    groupAverages.setdefault(columnToGroup[count], float(col))
+                    groupCounts.setdefault(columnToGroup[count], 1)
+                # This group has already been seen, update the TPM average.  
+                else:
+                    groupCounts[columnToGroup[count]] += 1 
+                    # Average calculation
+                    normal = float(groupCounts[columnToGroup[count]])
+                    newTpm = (float(col) * (1/normal)) 
+                    oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]])
+                    groupAverages[columnToGroup[count]] = newTpm + oldTpm 
+            else:
                 # First time this group is seen, add it to the groupCounts dict. 
                 if (groupAverages.get(columnToGroup[count]) == None):
                     groupAverages.setdefault(columnToGroup[count], [float(col)])
                     groupCounts.setdefault(columnToGroup[count], 1)
                 # This group has already been seen, update the TPM average.  
                 else:
                     groupCounts[columnToGroup[count]] += 1 
                     # Median preparation
                     groupAverages[columnToGroup[count]].append(float(col))
-                # Average calculation
-                #normal = float(groupCounts[columnToGroup[count]])
-                #newTpm = (float(col) * (1/normal)) 
-                #oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]])
-                #groupAverages[columnToGroup[count]] = newTpm + oldTpm 
+
             count += 1
        
         # Store some information on the bed file. Most important is the groupOrder. 
         if getBedInfo: 
             getBedInfo = False 
             bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;"
             if (groupOrder is not None):
                 for group in open(groupOrder, "r"): 
                     bedInfo += group.strip("\n") + " " 
             else: 
                 for key, value in sorted(groupAverages.iteritems()):
                     bedInfo += key + " "
                 bedInfo = bedInfo[:-1] + "\toffset\tlineLength"
 
         # Write out the transcript name, this is needed to join with coordinates later. 
         bedLikeFile.write(splitLine[0] + "\t")
         # Create a list of the average scores per group. 
         bedLine = ""
         # The fullAverage is used to assign a tpm score representative of the entire bed row. 
         fullAverage = 0.0
         count = 0.0
         if (groupOrder is not None):
             for group in open(groupOrder, "r"): 
                 # Averages
-                #value = groupAverages[group.strip("\n")]
+                if (useMean):
+                    value = groupAverages[group.strip("\n")]
+                else:
                     value = median(groupAverages[group.strip("\n")])
                 bedLine = bedLine + "," + floatRound(str(value))
                 count += 1.0
                 fullAverage += value
         else: 
             for key, value in sorted(groupAverages.iteritems()): 
+                if (useMean):
+                    bedLine = bedLine + "," + floatRound(str(value))
+                    fullAverage += value
+                else:
                     bedLine = bedLine + "," + floatRound(str(median(value)))
-                count += 1.0
                     fullAverage += median(value)
+                
+                count += 1.0
         # Create what will be columns 5, 7 and 8 of the final bed. 
         bedLine = str(fullAverage/count) + "\t" + str(int(count)) +  "\t" + bedLine[1:] + "\n"
         # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. 
         if (fullAverage > 0.0): 
             validTpms.append((fullAverage/count))
         # Write the bedLine to the intermediate bed-like file. 
         bedLikeFile.write(bedLine)
     # Return the bedInfo so it can be printed right before the script ends. 
     return bedInfo
 
 def expMatrixToBarchartBed(options):
     """
     Convert the expression matrix into a barchart bed file. 
     options - The command line options (file names, etc). 
         Use the meta data to map the sample names to their groups, then create a dict that maps
@@ -214,31 +234,32 @@
 
     # Create a dictionary that maps the sample names to their group.
     sampleToGroup = dict()
     for item in options.metaFile: 
         splitLine = item.strip("\n").split()
         sampleToGroup.setdefault(splitLine[0], splitLine[1])
 
     # Use an intermediate file to hold the average values for each group. 
     bedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
     # Keep a list of TPM scores greater than 0. This will be used later
     # to assign bed scores. 
     validTpms = []
 
     # Go through the matrix and condense it into a bed like file. Populate
     # the validTpms array and the bedInfo string.  
-    bedInfo = condenseMatrixIntoBedCols(options.matrixFile, options.groupOrderFile, sampleToGroup, validTpms, bedLikeFile)
+    bedInfo = condenseMatrixIntoBedCols(options.matrixFile, options.groupOrderFile, sampleToGroup, \
+                    validTpms, bedLikeFile, options.useMean)
     # Find the number which divides the list of non 0 TPM scores into ten blocks. 
     tpmMedian = sorted(validTpms)
     blockSizes = len(tpmMedian)/10
 
     # Create a list of the ten TPM values at the edge of each block.  
     # These used to cast a TPM score to one of ten value between 0-1000.  
     tpmCutoffs = []
     for i in range(1,10):
         tpmCutoffs.append(tpmMedian[blockSizes*i])
  
     # Sort the bed like file to prepare it for the join. 
     sortedBedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
     cmd = "sort " + bedLikeFile.name + " > " + sortedBedLikeFile.name
     os.system(cmd)