src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed 69e7bc29592bfd70f29d30ef9da060f91d3621be

69e7bc29592bfd70f29d30ef9da060f91d3621be
ceisenhart
  Fri May 5 10:20:14 2017 -0700
Fixing an output column error spotted by Chris Lee and updating documentation, refs #18736 and #19155

diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
index d3b4bd8..6bbb794 100755
--- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
+++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
@@ -15,31 +15,31 @@
     parser= argparse.ArgumentParser(description = __doc__)
     parser.add_argument ("metaFile",
         help = " Two column no header, the first column is the samples which should match " + \
                 "the matrix, the second is the grouping (cell type, tissue, etc)",
         type = argparse.FileType("r"))
     parser.add_argument ("matrixFile",
             help = " The input matrix file. The samples in the first row should exactly match the ones in " + \
                 "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \
                 "the ones in the coordinate file.",
         type = argparse.FileType("r"))
     parser.add_argument ("coordinateMap",
         help = " Bed5+1 format. File that maps the column labels from the matrix to coordinates. Tab " + \
                 "separated; chr, start coord, end coord, label, strand, gene name. ",
         action = "store")
     parser.add_argument ("outputFile",
-        help = " The output file. ",
+        help = " The output file, bed 6+5 format. See the schema in kent/src/hg/lib/barChartBed.as. ",
         type =argparse.FileType("w"))
     # Optional arguments.  
     parser.add_argument ("--groupOrderFile",
         help = " Optional file to define the group order, list the groups in a single column in " + \
                 "the order desired. The default ordering is alphabetical.",
         action = "store")
     parser.add_argument ("--useMean",
     help = " Calculate the group values using mean rather than median.",
     action = "store_true")
     parser.add_argument ("--verbose",
     help = " Show runtime messages.",
     action = "store_true")
     
     parser.set_defaults(verbose = False)
     parser.set_defaults(groupOrderFile = None)
@@ -161,31 +161,31 @@
                 # First time this group is seen, add it to the groupCounts dict. 
                 if (groupAverages.get(columnToGroup[count]) == None):
                     groupAverages.setdefault(columnToGroup[count], [float(col)])
                     groupCounts.setdefault(columnToGroup[count], 1)
                 # This group has already been seen, update the TPM average.  
                 else:
                     groupCounts[columnToGroup[count]] += 1 
                     # Median preparation
                     groupAverages[columnToGroup[count]].append(float(col))
 
             count += 1
        
         # Store some information on the bed file. Most important is the groupOrder. 
         if getBedInfo: 
             getBedInfo = False 
-            bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;"
+            bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tname2\tgroupCount\tgroupOrder;"
             if (groupOrder is not None):
                 for group in open(groupOrder, "r"): 
                     bedInfo += group.strip("\n") + " " 
             else: 
                 for key, value in sorted(groupAverages.iteritems()):
                     bedInfo += key + " "
                 bedInfo = bedInfo[:-1] + "\toffset\tlineLength"
 
         # Write out the transcript name, this is needed to join with coordinates later. 
         bedLikeFile.write(splitLine[0] + "\t")
         # Create a list of the average scores per group. 
         bedLine = ""
         # The fullAverage is used to assign a tpm score representative of the entire bed row. 
         fullAverage = 0.0
         count = 0.0
@@ -212,35 +212,36 @@
         # Create what will be columns 5, 7 and 8 of the final bed. 
         bedLine = str(fullAverage/count) + "\t" + str(int(count)) +  "\t" + bedLine[1:] + "\n"
         # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. 
         if (fullAverage > 0.0): 
             validTpms.append((fullAverage/count))
         # Write the bedLine to the intermediate bed-like file. 
         bedLikeFile.write(bedLine)
     # Return the bedInfo so it can be printed right before the script ends. 
     return bedInfo
 
 def expMatrixToBarchartBed(options):
     """
     Convert the expression matrix into a barchart bed file. 
     options - The command line options (file names, etc). 
         Use the meta data to map the sample names to their groups, then create a dict that maps
-    the columns to the groups.  Go through the matrix line by line and get the average for each 
-    group. Print this to an intermediate file, then use the unix 'join' command to link this with
-    the coordinates file, creating a bed like file.  Go through this file once more to clean up a
-    bit, re arrange columns and calculate a new score creating a bed 6+2.  Finally run
-    Max's bedJoinTabOffset to index the matrix, creating a bed 6+4 file.   
+    the columns to the groups.  Go through the matrix line by line and get the median or average for each 
+    group. Print this to an intermediate file, then use the unix 'join' command to link with
+    the coordinates file via the first matrix column. This creates a file with many of the bed fields
+    just in the wrong order.  Go through this file to re arrange the columns, check for and remove entries 
+    where chromsomes names include "_" and chr start > chr end.  Finally run Max's bedJoinTabOffset to 
+    index the matrix adding the dataOffset and dataLen columns and creating a bed 6+5 file.   
     """
 
     # Create a dictionary that maps the sample names to their group.
     sampleToGroup = dict()
     for item in options.metaFile: 
         splitLine = item.strip("\n").split()
         sampleToGroup.setdefault(splitLine[0], splitLine[1])
 
     # Use an intermediate file to hold the average values for each group. 
     bedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
     # Keep a list of TPM scores greater than 0. This will be used later
     # to assign bed scores. 
     validTpms = []
 
     # Go through the matrix and condense it into a bed like file. Populate
@@ -284,44 +285,39 @@
     # Go through the joined file and re arrange the columns creating a bed 6+2 file.
     # Also assign a scaled score 0 - 1000 to each tpm value. 
     bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
     for line in joinedFile:
         splitLine = line.strip("\n").split()
         if ("_" in splitLine[0]): 
             sys.stderr.write("This transcript " + splitLine[0] + " was dropped for having a '_' in the name.\n") 
             continue # Ignore alt sequences.  
         # Drop sequences where start is greater than end. 
         if (float(splitLine[2]) > float(splitLine[3])):
             sys.stderr.write("This transcript " + splitLine[0] + " was dropped since chr end, " + \
                     splitLine[3] + ", is smaller than chr start, " + splitLine[2] + "\n.") 
             continue
         newLine = (splitLine[1] + "\t" + str(splitLine[2]) + "\t" + str(splitLine[3]) + "\t" + \
                 splitLine[0] + "\t" + str(determineScore(tpmCutoffs, float(splitLine[6]))) + \
-                "\t" + str(splitLine[4]) + "\t" + str(splitLine[7]) + "\t" + splitLine[8] + \
-                "\t" + splitLine[5] + "\n")
+                "\t" + str(splitLine[4]) + "\t" + splitLine[5] + "\t" + str(splitLine[7]) + \
+                "\t" + splitLine[8] + "\n")
         bedFile.write(newLine)
     
-    # Run Max's indexing script
-    indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
-    cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + indexedBedFile.name
-    os.system(cmd)
-
     # Prepend the bed info to the start of the file.  
     cmd = "echo '" + bedInfo + "' > " + options.outputFile.name
     os.system(cmd)
 
-    # Reformat with awk, move the gene name to behind the indeces. 
-    cmd = "cat " + indexedBedFile.name + " | awk '{print $1\"\t\"$2\"\t\"$3\"\t\"$4\"\t\"" + \
-            "$5\"\t\"$6\"\t\"$7\"\t\"$8\"\t\"$10\"\t\"$11\"\t\"$9}' >> " + options.outputFile.name
+    # Run Max's indexing script
+    indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
+    cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + options.outputFile.name
     os.system(cmd)
 
     print ("The columns and order of the groups are; \n" + bedInfo)
 
 def main(args):
     """
     Initialized options and calls other functions.
     """
     options = parseArgs(args)
     expMatrixToBarchartBed(options)
 
 if __name__ == "__main__" : 
     sys.exit(main(sys.argv))