f9690acb15db4552ef45f4565e962acc8b3d5998
ceisenhart
  Wed Apr 26 09:05:43 2017 -0700
Updating the expMatrixToBarchartBed script to include gene name in the final bed, refs #19155

diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
index 4c3d068..a2b0225 100755
--- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
+++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
@@ -1,43 +1,43 @@
 #!/usr/bin/env python2.7
 # expMatrixToBarchartBed
 """
-Generate a barChart bed6+4 file from a matrix, meta data, and coordinates.
+Generate a barChart bed6+5 file from a matrix, meta data, and coordinates.
 """
 import os
 import sys
 import argparse
 import tempfile
 
 def parseArgs(args):
     """
     Parse the command line arguments.
     """
     parser= argparse.ArgumentParser(description = __doc__)
     parser.add_argument ("metaFile",
         help = " Two column no header, the first column is the samples which should match " + \
                 "the matrix, the second is the grouping (cell type, tissue, etc)",
         type = argparse.FileType("r"))
     parser.add_argument ("matrixFile",
             help = " The input matrix file, the first row should start with a a #." + \
                 "the samples in the first row should exactly match the ones in " + \
                 "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \
                 "the ones in the coordinate file.",
         type = argparse.FileType("r"))
     parser.add_argument ("coordinateMap",
-        help = " Five column no header,  maps the column labels from the matrix to coordinates. Tab " + \
-                "separated; label, chr, strand, start coord, end coord. ",
+        help = " Six column no header,  maps the column labels from the matrix to coordinates. Tab " + \
+                "separated; label, chr, strand, start coord, end coord, gene name. ",
         action = "store")
     parser.add_argument ("outputFile",
         help = " The output file. ",
         type =argparse.FileType("w"))
     
     parser.add_argument ("--verbose",
     help = " Show runtime messages.",
     action = "store_true")
     
     parser.set_defaults(verbose = False)
 
     if (len(sys.argv) == 1):
         parser.print_help()
         exit(1)
     options = parser.parse_args()
@@ -110,31 +110,31 @@
                 groupCounts.setdefault(columnToGroup[count], 1)
             # This group has already been seen, update the TPM average.  
             else:
                 groupCounts[columnToGroup[count]] += 1 
                 normal = float(groupCounts[columnToGroup[count]])
                 newTpm = (float(col) * (1/normal)) 
                 oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]])
                 groupAverages[columnToGroup[count]] = newTpm + oldTpm 
             count += 1
        
         # Store some information on the bed file. Most important is the groupOrder. 
         if getBedInfo: 
             getBedInfo = False 
             bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;"
             for key, value in groupAverages.iteritems():
-                bedInfo += key + ","
+                bedInfo += key + " "
             bedInfo = bedInfo[:-1] + "\toffset\tlineLength"
 
         # Write out the transcript name, this is needed to join with coordinates later. 
         bedLikeFile.write(splitLine[0] + "\t")
         # Create a list of the average scores per group. 
         bedLine = ""
         # The fullAverage is used to assign a tpm score representative of the entire bed row. 
         fullAverage = 0.0
         count = 0.0
         for key, value in groupAverages.iteritems(): 
             bedLine = bedLine + "," + str(value)
             count += 1.0
             fullAverage += value
         # Create what will be columns 5, 7 and 8 of the final bed. 
         bedLine = str(fullAverage/count) + "\t" + str(int(count)) +  "\t" + bedLine[1:] + "\n"
@@ -197,36 +197,42 @@
     joinedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
     cmd = "join " + sortedCoords.name + " " + sortedBedLikeFile.name + " | awk -v " + \
             "OFS=\"\\t\" '$1=$1' > " + joinedFile.name
     os.system(cmd)
     
     # Go through the joined file and re arrange the columns creating a bed 6+2 file.
     # Also assign a scaled score 0 - 1000 to each tpm value. 
     bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
     for line in joinedFile:
         splitLine = line.strip("\n").split()
         if ("_" in splitLine[0]): continue # Ignore alt sequences.  
         # Drop sequences where start is greater than end. 
         if (float(splitLine[3]) > float(splitLine[4])):
             continue
         newLine = (splitLine[1] + "\t" + str(splitLine[3]) + "\t" + str(splitLine[4]) + "\t" + \
-                splitLine[0] + "\t" + str(determineScore(tpmCuttoffs, float(splitLine[5]))) + \
-                "\t" + str(splitLine[2]) + "\t" + str(splitLine[6]) + "\t" + splitLine[7] + "\n")
+                splitLine[0] + "\t" + str(determineScore(tpmCuttoffs, float(splitLine[6]))) + \
+                "\t" + str(splitLine[2]) + "\t" + str(splitLine[7]) + "\t" + splitLine[8] + \
+                "\t" + splitLine[5] + "\n")
         bedFile.write(newLine)
 
     # Run Max's indexing script
-    cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + options.outputFile.name
+    indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
+    cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + indexedBedFile.name
+    os.system(cmd)
+
+    # Reformat with awk, move the gene name to behind the indeces. 
+    cmd = "cat " + indexedBedFile.name + " | awk '{print $1\"\t\"$2\"\t\"$3\"\t\"$4\"\t\"$5\"\t\"$6\"\t\"$7\"\t\"$8\"\t\"$10\"\t\"$11\"\t\"$9}' > " + options.outputFile.name
     os.system(cmd)
 
     # Append the bed info to the end of the file.  
     cmd = "echo '" + bedInfo + "' >> " + options.outputFile.name
     os.system(cmd)
     
     print ("The columns and order of the groups are; \n" + bedInfo)
 
 def main(args):
     """
     Initialized options and calls other functions.
     """
     options = parseArgs(args)
     expMatrixToBarchartBed(options)