src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed bd7b3d34bfc78957316f16a13a2e0033194b3867

bd7b3d34bfc78957316f16a13a2e0033194b3867
ceisenhart
  Wed May 3 12:32:10 2017 -0700
Changing the group calculation of tpm from mean to median, refs #19155

diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
index 80f73d8..ad1162b 100755
--- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
+++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
@@ -36,30 +36,39 @@
                 "the order desired. ",
         action = "store")
     parser.add_argument ("--verbose",
     help = " Show runtime messages.",
     action = "store_true")
     
     parser.set_defaults(verbose = False)
     parser.set_defaults(groupOrderFile = None)
 
     if (len(sys.argv) == 1):
         parser.print_help()
         exit(1)
     options = parser.parse_args()
     return options
 
+def median(lst):
+    lst = sorted(lst)
+    if len(lst) < 1:
+        return None
+    if len(lst) %2 == 1:
+        return lst[((len(lst)+1)/2)-1]
+    else:
+        return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0
+
 def determineScore(tpmCutoffs, tpm):
     """
     Cast the tpm to a score between 0-1000. Since there are only
     9 visual blocks cast them to be in one of the 9 blocks.  
     tpmCutoffs - A list of integers
     tpm - An integer
     """
     count = 0
     for val in tpmCutoffs: 
         if (val > tpm): 
             return count*111
         count = count + 1
     return 999
 
 def floatRound(inFloat):
@@ -123,71 +132,76 @@
                 columnToGroup.setdefault(count, group)
                 count += 1  
             continue
         
         # Handle the tpm rows, calculating the average for each group per row.  
         groupAverages = dict()
         groupCounts = dict()
         firstCol = True 
         count = 1 
         for col in splitLine: 
             if firstCol:
                 firstCol = False
                 continue 
             # First time this group is seen, add it to the groupCounts dict. 
             if (groupAverages.get(columnToGroup[count]) == None):
-                groupAverages.setdefault(columnToGroup[count], float(col))
+                groupAverages.setdefault(columnToGroup[count], [float(col)])
                 groupCounts.setdefault(columnToGroup[count], 1)
             # This group has already been seen, update the TPM average.  
             else:
                 groupCounts[columnToGroup[count]] += 1 
-                normal = float(groupCounts[columnToGroup[count]])
-                newTpm = (float(col) * (1/normal)) 
-                oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]])
-                groupAverages[columnToGroup[count]] = newTpm + oldTpm 
+                # Median preparation
+                groupAverages[columnToGroup[count]].append(float(col))
+                # Average calculation
+                #normal = float(groupCounts[columnToGroup[count]])
+                #newTpm = (float(col) * (1/normal)) 
+                #oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]])
+                #groupAverages[columnToGroup[count]] = newTpm + oldTpm 
             count += 1
        
         # Store some information on the bed file. Most important is the groupOrder. 
         if getBedInfo: 
             getBedInfo = False 
             bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;"
             if (groupOrder is not None):
                 for group in open(groupOrder, "r"): 
                     bedInfo += group.strip("\n") + " " 
             else: 
                 for key, value in sorted(groupAverages.iteritems()):
                     bedInfo += key + " "
                 bedInfo = bedInfo[:-1] + "\toffset\tlineLength"
 
         # Write out the transcript name, this is needed to join with coordinates later. 
         bedLikeFile.write(splitLine[0] + "\t")
         # Create a list of the average scores per group. 
         bedLine = ""
         # The fullAverage is used to assign a tpm score representative of the entire bed row. 
         fullAverage = 0.0
         count = 0.0
         if (groupOrder is not None):
             for group in open(groupOrder, "r"): 
-                value = groupAverages[group.strip("\n")]
+                # Averages
+                #value = groupAverages[group.strip("\n")]
+                value = median(groupAverages[group.strip("\n")])
                 bedLine = bedLine + "," + floatRound(str(value))
                 count += 1.0
                 fullAverage += value
         else: 
             for key, value in sorted(groupAverages.iteritems()): 
-                bedLine = bedLine + "," + floatRound(str(value))
+                bedLine = bedLine + "," + floatRound(str(median(value)))
                 count += 1.0
-                fullAverage += value
+                fullAverage += median(value)
         # Create what will be columns 5, 7 and 8 of the final bed. 
         bedLine = str(fullAverage/count) + "\t" + str(int(count)) +  "\t" + bedLine[1:] + "\n"
         # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. 
         if (fullAverage > 0.0): 
             validTpms.append((fullAverage/count))
         # Write the bedLine to the intermediate bed-like file. 
         bedLikeFile.write(bedLine)
     # Return the bedInfo so it can be printed right before the script ends. 
     return bedInfo
 
 def expMatrixToBarchartBed(options):
     """
     Convert the expression matrix into a barchart bed file. 
     options - The command line options (file names, etc). 
         Use the meta data to map the sample names to their groups, then create a dict that maps