bd7b3d34bfc78957316f16a13a2e0033194b3867 ceisenhart Wed May 3 12:32:10 2017 -0700 Changing the group calculation of tpm from mean to median, refs #19155 diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed index 80f73d8..ad1162b 100755 --- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed +++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed @@ -36,30 +36,39 @@ "the order desired. ", action = "store") parser.add_argument ("--verbose", help = " Show runtime messages.", action = "store_true") parser.set_defaults(verbose = False) parser.set_defaults(groupOrderFile = None) if (len(sys.argv) == 1): parser.print_help() exit(1) options = parser.parse_args() return options +def median(lst): + lst = sorted(lst) + if len(lst) < 1: + return None + if len(lst) %2 == 1: + return lst[((len(lst)+1)/2)-1] + else: + return float(sum(lst[(len(lst)/2)-1:(len(lst)/2)+1]))/2.0 + def determineScore(tpmCutoffs, tpm): """ Cast the tpm to a score between 0-1000. Since there are only 9 visual blocks cast them to be in one of the 9 blocks. tpmCutoffs - A list of integers tpm - An integer """ count = 0 for val in tpmCutoffs: if (val > tpm): return count*111 count = count + 1 return 999 def floatRound(inFloat): @@ -123,71 +132,76 @@ columnToGroup.setdefault(count, group) count += 1 continue # Handle the tpm rows, calculating the average for each group per row. groupAverages = dict() groupCounts = dict() firstCol = True count = 1 for col in splitLine: if firstCol: firstCol = False continue # First time this group is seen, add it to the groupCounts dict. if (groupAverages.get(columnToGroup[count]) == None): - groupAverages.setdefault(columnToGroup[count], float(col)) + groupAverages.setdefault(columnToGroup[count], [float(col)]) groupCounts.setdefault(columnToGroup[count], 1) # This group has already been seen, update the TPM average. else: groupCounts[columnToGroup[count]] += 1 - normal = float(groupCounts[columnToGroup[count]]) - newTpm = (float(col) * (1/normal)) - oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]]) - groupAverages[columnToGroup[count]] = newTpm + oldTpm + # Median preparation + groupAverages[columnToGroup[count]].append(float(col)) + # Average calculation + #normal = float(groupCounts[columnToGroup[count]]) + #newTpm = (float(col) * (1/normal)) + #oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]]) + #groupAverages[columnToGroup[count]] = newTpm + oldTpm count += 1 # Store some information on the bed file. Most important is the groupOrder. if getBedInfo: getBedInfo = False bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;" if (groupOrder is not None): for group in open(groupOrder, "r"): bedInfo += group.strip("\n") + " " else: for key, value in sorted(groupAverages.iteritems()): bedInfo += key + " " bedInfo = bedInfo[:-1] + "\toffset\tlineLength" # Write out the transcript name, this is needed to join with coordinates later. bedLikeFile.write(splitLine[0] + "\t") # Create a list of the average scores per group. bedLine = "" # The fullAverage is used to assign a tpm score representative of the entire bed row. fullAverage = 0.0 count = 0.0 if (groupOrder is not None): for group in open(groupOrder, "r"): - value = groupAverages[group.strip("\n")] + # Averages + #value = groupAverages[group.strip("\n")] + value = median(groupAverages[group.strip("\n")]) bedLine = bedLine + "," + floatRound(str(value)) count += 1.0 fullAverage += value else: for key, value in sorted(groupAverages.iteritems()): - bedLine = bedLine + "," + floatRound(str(value)) + bedLine = bedLine + "," + floatRound(str(median(value))) count += 1.0 - fullAverage += value + fullAverage += median(value) # Create what will be columns 5, 7 and 8 of the final bed. bedLine = str(fullAverage/count) + "\t" + str(int(count)) + "\t" + bedLine[1:] + "\n" # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. if (fullAverage > 0.0): validTpms.append((fullAverage/count)) # Write the bedLine to the intermediate bed-like file. bedLikeFile.write(bedLine) # Return the bedInfo so it can be printed right before the script ends. return bedInfo def expMatrixToBarchartBed(options): """ Convert the expression matrix into a barchart bed file. options - The command line options (file names, etc). Use the meta data to map the sample names to their groups, then create a dict that maps