e86711604cdb883a23ee8af668b49e4d666dae5c chmalee Tue Apr 2 13:58:55 2019 -0700 Letting python handle printing of floats instead of custom float parser, also allowing fix or alt sequences, refs #23218 diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed index 7154139..c66c87c 100755 --- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed +++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed @@ -95,52 +95,30 @@ def determineScore(tpmCutoffs, tpm): """ Cast the tpm to a score between 0-1000. Since there are only 9 visual blocks cast them to be in one of the 9 blocks. tpmCutoffs - A list of integers tpm - An integer """ count = 0 for val in tpmCutoffs: if (val > tpm): return count*111 count = count + 1 return 999 -def floatRound(inFloat): - """ - Return a float that has at most 2 decimal places. - """ - beforeDecimal = True - result = "" - count = 0 - for char in inFloat: - if char is ".": - beforeDecimal = False - result += char - continue - if beforeDecimal: - result += char - else: - if count >= 2: - return result - else: - count += 1 - result += char - return result - def condenseMatrixIntoBedCols(matrix, groupOrder, autoSql, sampleToGroup, validTpms, bedLikeFile, useMean): """ Take an expression matrix and a dictionary that maps the samples to groups. Go through the expression matrix and calculate the average for each group, outputting it to an intermediate file as they are calculated. The intermediate file has three columns, the first is the average tpm for the entire gene, next is the number of groups and finally the average tpm for each group as a comma separated list. matrix - An expression matrix, samples are the x rows, transcripts the y rows. groupOrder - Optional order of categories autoSql - Optional description of extra fields sampleToGroup - A dictionary that maps string samples to string groups. validTpms - An empty list of integers. bedLikeFile - An intermediate file, looks slightly like a bed. """ @@ -225,40 +203,40 @@ # Write out the transcript name, this is needed to join with coordinates later. bedLikeFile.write(splitLine[0] + "\t") # Create a list of the average scores per group. bedLine = "" # The fullAverage is used to assign a tpm score representative of the entire bed row. fullAverage = 0.0 count = 0.0 if (groupOrder is not None): for group in open(groupOrder, "r"): # Averages if (useMean): value = groupAverages[group.strip("\n")] else: value = median(groupAverages[group.strip("\n")]) - bedLine = bedLine + "," + floatRound(str(value)) + bedLine = bedLine + "," + "%0.2g" % value count += 1.0 fullAverage += value else: for key, value in sorted(groupAverages.iteritems()): if (useMean): - bedLine = bedLine + "," + floatRound(str(value)) + bedLine = bedLine + "," + "%0.2g" % value fullAverage += value else: - bedLine = bedLine + "," + floatRound(str(median(value))) + bedLine = bedLine + "," + "%0.2g" % median(value) fullAverage += median(value) count += 1.0 # Create what will be columns 5, 7 and 8 of the final bed. bedLine = str(fullAverage/count) + "\t" + str(int(count)) + "\t" + bedLine[1:] + "\n" # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. if (fullAverage > 0.0): validTpms.append((fullAverage/count)) # Write the bedLine to the intermediate bed-like file. bedLikeFile.write(bedLine) # Return the bedInfo so it can be printed right before the script ends. return bedInfo def expMatrixToBarchartBed(options): """ @@ -317,33 +295,30 @@ os.system(cmd) # Join the bed-like file and the coordinate file, the awk accounts for any extra # fields that may be included, and keeps the file in standard bed 6+5 format joinedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) cmd = "join -t $'\\t' -1 4 -2 1 " + sortedCoords.name + " " + sortedBedLikeFile.name + \ " | awk -F'\t' -v OFS=\"\\t\" '{printf \"%s\\t%s\\t%s\\t%s\", $2,$3,$4,$1; " + \ "for (i=5;i<=NF;i++) {printf \"\\t%s\", $i}; printf\"\\n\";}' > " + joinedFile.name os.system(cmd) # Go through the joined file and re arrange the columns creating a bed 6+5+ file. # Also assign a scaled score 0 - 1000 to each tpm value. bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) for line in joinedFile: splitLine = line.strip("\n").split("\t") - if ("_" in splitLine[0]): - sys.stderr.write("This transcript: " + splitLine[0] + " was dropped for having a '_' in the name.\n") - continue # Ignore alt sequences. # Drop sequences where start is greater than end. if (float(splitLine[1]) > float(splitLine[2])): sys.stderr.write("This transcript: " + splitLine[0] + " was dropped since chr end, " + \ splitLine[2] + ", is smaller than chr start, " + splitLine[1] + ".\n") continue score = str(determineScore(tpmCutoffs, float(splitLine[-3]))) if autoSql: #skip the 4th field since we recalculated it #need a different ordering to account for possible extraFields bedLine = "\t".join(splitLine[:4] + [score] + splitLine[5:7] + splitLine[-2:] + splitLine[7:-3]) + "\n" else: #skip the 4th field since we recalculate it bedLine = "\t".join(splitLine[:4] + [score] + splitLine[5:7] + splitLine[8:]) + "\n" bedFile.write(bedLine)