f9690acb15db4552ef45f4565e962acc8b3d5998 ceisenhart Wed Apr 26 09:05:43 2017 -0700 Updating the expMatrixToBarchartBed script to include gene name in the final bed, refs #19155 diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed index 4c3d068..a2b0225 100755 --- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed +++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed @@ -1,43 +1,43 @@ #!/usr/bin/env python2.7 # expMatrixToBarchartBed """ -Generate a barChart bed6+4 file from a matrix, meta data, and coordinates. +Generate a barChart bed6+5 file from a matrix, meta data, and coordinates. """ import os import sys import argparse import tempfile def parseArgs(args): """ Parse the command line arguments. """ parser= argparse.ArgumentParser(description = __doc__) parser.add_argument ("metaFile", help = " Two column no header, the first column is the samples which should match " + \ "the matrix, the second is the grouping (cell type, tissue, etc)", type = argparse.FileType("r")) parser.add_argument ("matrixFile", help = " The input matrix file, the first row should start with a a #." + \ "the samples in the first row should exactly match the ones in " + \ "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \ "the ones in the coordinate file.", type = argparse.FileType("r")) parser.add_argument ("coordinateMap", - help = " Five column no header, maps the column labels from the matrix to coordinates. Tab " + \ - "separated; label, chr, strand, start coord, end coord. ", + help = " Six column no header, maps the column labels from the matrix to coordinates. Tab " + \ + "separated; label, chr, strand, start coord, end coord, gene name. ", action = "store") parser.add_argument ("outputFile", help = " The output file. ", type =argparse.FileType("w")) parser.add_argument ("--verbose", help = " Show runtime messages.", action = "store_true") parser.set_defaults(verbose = False) if (len(sys.argv) == 1): parser.print_help() exit(1) options = parser.parse_args() @@ -110,31 +110,31 @@ groupCounts.setdefault(columnToGroup[count], 1) # This group has already been seen, update the TPM average. else: groupCounts[columnToGroup[count]] += 1 normal = float(groupCounts[columnToGroup[count]]) newTpm = (float(col) * (1/normal)) oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]]) groupAverages[columnToGroup[count]] = newTpm + oldTpm count += 1 # Store some information on the bed file. Most important is the groupOrder. if getBedInfo: getBedInfo = False bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;" for key, value in groupAverages.iteritems(): - bedInfo += key + "," + bedInfo += key + " " bedInfo = bedInfo[:-1] + "\toffset\tlineLength" # Write out the transcript name, this is needed to join with coordinates later. bedLikeFile.write(splitLine[0] + "\t") # Create a list of the average scores per group. bedLine = "" # The fullAverage is used to assign a tpm score representative of the entire bed row. fullAverage = 0.0 count = 0.0 for key, value in groupAverages.iteritems(): bedLine = bedLine + "," + str(value) count += 1.0 fullAverage += value # Create what will be columns 5, 7 and 8 of the final bed. bedLine = str(fullAverage/count) + "\t" + str(int(count)) + "\t" + bedLine[1:] + "\n" @@ -197,36 +197,42 @@ joinedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) cmd = "join " + sortedCoords.name + " " + sortedBedLikeFile.name + " | awk -v " + \ "OFS=\"\\t\" '$1=$1' > " + joinedFile.name os.system(cmd) # Go through the joined file and re arrange the columns creating a bed 6+2 file. # Also assign a scaled score 0 - 1000 to each tpm value. bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) for line in joinedFile: splitLine = line.strip("\n").split() if ("_" in splitLine[0]): continue # Ignore alt sequences. # Drop sequences where start is greater than end. if (float(splitLine[3]) > float(splitLine[4])): continue newLine = (splitLine[1] + "\t" + str(splitLine[3]) + "\t" + str(splitLine[4]) + "\t" + \ - splitLine[0] + "\t" + str(determineScore(tpmCuttoffs, float(splitLine[5]))) + \ - "\t" + str(splitLine[2]) + "\t" + str(splitLine[6]) + "\t" + splitLine[7] + "\n") + splitLine[0] + "\t" + str(determineScore(tpmCuttoffs, float(splitLine[6]))) + \ + "\t" + str(splitLine[2]) + "\t" + str(splitLine[7]) + "\t" + splitLine[8] + \ + "\t" + splitLine[5] + "\n") bedFile.write(newLine) # Run Max's indexing script - cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + options.outputFile.name + indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) + cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + indexedBedFile.name + os.system(cmd) + + # Reformat with awk, move the gene name to behind the indeces. + cmd = "cat " + indexedBedFile.name + " | awk '{print $1\"\t\"$2\"\t\"$3\"\t\"$4\"\t\"$5\"\t\"$6\"\t\"$7\"\t\"$8\"\t\"$10\"\t\"$11\"\t\"$9}' > " + options.outputFile.name os.system(cmd) # Append the bed info to the end of the file. cmd = "echo '" + bedInfo + "' >> " + options.outputFile.name os.system(cmd) print ("The columns and order of the groups are; \n" + bedInfo) def main(args): """ Initialized options and calls other functions. """ options = parseArgs(args) expMatrixToBarchartBed(options)