src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed e00c621783b60bd6b781488b7e82f14b749ea92d

e00c621783b60bd6b781488b7e82f14b749ea92d
ceisenhart
  Fri Apr 28 11:27:27 2017 -0700
Fixing up the script and addressing the feedback from Kate, closes #19287

diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
index a2b0225..80f73d8 100755
--- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
+++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed
@@ -6,74 +6,103 @@
 import os
 import sys
 import argparse
 import tempfile
 
 def parseArgs(args):
     """
     Parse the command line arguments.
     """
     parser= argparse.ArgumentParser(description = __doc__)
     parser.add_argument ("metaFile",
         help = " Two column no header, the first column is the samples which should match " + \
                 "the matrix, the second is the grouping (cell type, tissue, etc)",
         type = argparse.FileType("r"))
     parser.add_argument ("matrixFile",
-            help = " The input matrix file, the first row should start with a a #." + \
+            help = " The input matrix file, the first row should start with a hashtag, e.g. #." + \
                 "the samples in the first row should exactly match the ones in " + \
                 "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \
                 "the ones in the coordinate file.",
         type = argparse.FileType("r"))
     parser.add_argument ("coordinateMap",
         help = " Six column no header,  maps the column labels from the matrix to coordinates. Tab " + \
                 "separated; label, chr, strand, start coord, end coord, gene name. ",
         action = "store")
     parser.add_argument ("outputFile",
         help = " The output file. ",
         type =argparse.FileType("w"))
-    
+    # Optional arguments.  
+    parser.add_argument ("--groupOrderFile",
+        help = " Optional file to define the group order, list the groups in a single column in " + \
+                "the order desired. ",
+        action = "store")
     parser.add_argument ("--verbose",
     help = " Show runtime messages.",
     action = "store_true")
     
     parser.set_defaults(verbose = False)
+    parser.set_defaults(groupOrderFile = None)
 
     if (len(sys.argv) == 1):
         parser.print_help()
         exit(1)
     options = parser.parse_args()
     return options
 
-def determineScore(tpmCuttoffs, tpm):
+def determineScore(tpmCutoffs, tpm):
     """
     Cast the tpm to a score between 0-1000. Since there are only
     9 visual blocks cast them to be in one of the 9 blocks.  
-    tpmCuttoffs - A list of integers
+    tpmCutoffs - A list of integers
     tpm - An integer
     """
     count = 0
-    for val in tpmCuttoffs: 
+    for val in tpmCutoffs: 
         if (val > tpm): 
             return count*111
         count = count + 1
     return 999
 
-def condenseMatrixIntoBed8Col(validTpms, sampleToGroup, matrix, bedLikeFile):
+def floatRound(inFloat):
+    """
+    Return a float that has at most 2 decimal places.
+    """
+    beforeDecimal = True
+    result = ""
+    count = 0
+    for char in inFloat: 
+        if char is ".":
+            beforeDecimal = False
+            result += char
+            continue
+        if beforeDecimal:
+            result += char
+        else:
+            if count >= 2:
+                return result
+            else:
+                count += 1
+                result += char
+    return result
+
+def condenseMatrixIntoBedCols(matrix, groupOrder, sampleToGroup, validTpms, bedLikeFile):
     """
     Take an expression matrix and a dictionary that maps the samples to groups. 
     Go through the expression matrix and calculate the average for each group, outputting
-    it to an intermediate file as they are calculated. 
+    it to an intermediate file as they are calculated. The intermediate file has
+    three columns, the first is the average tpm for the entire gene, next is the
+    number of groups and finally the average tpm for each group as a comma separated list. 
     validTpms - An empty list of integers. 
     sampleToGroup - A dictionary that maps string samples to string groups. 
     matrix - An expression matrix, samples are the x rows, transcripts the y rows.  
     bedLikeFile - An intermediate file, looks slightly like a bed. 
     """
     # Store some information on the bed file, most important is the order 
     # of the 8th column. 
     bedInfo = "" 
     firstLine = True
     getBedInfo = True
     # Use the first line of the matrix and the sampleToGroup dict to create a dictionary that maps  
     # the column to a group.  
     columnToGroup = dict()
     
     # Go through the matrix line by line. The first line is used to build an index mapping columns 
@@ -109,43 +138,54 @@
                 groupAverages.setdefault(columnToGroup[count], float(col))
                 groupCounts.setdefault(columnToGroup[count], 1)
             # This group has already been seen, update the TPM average.  
             else:
                 groupCounts[columnToGroup[count]] += 1 
                 normal = float(groupCounts[columnToGroup[count]])
                 newTpm = (float(col) * (1/normal)) 
                 oldTpm = (((normal - 1) / normal) * groupAverages[columnToGroup[count]])
                 groupAverages[columnToGroup[count]] = newTpm + oldTpm 
             count += 1
        
         # Store some information on the bed file. Most important is the groupOrder. 
         if getBedInfo: 
             getBedInfo = False 
             bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;"
-            for key, value in groupAverages.iteritems():
+            if (groupOrder is not None):
+                for group in open(groupOrder, "r"): 
+                    bedInfo += group.strip("\n") + " " 
+            else: 
+                for key, value in sorted(groupAverages.iteritems()):
                     bedInfo += key + " "
                 bedInfo = bedInfo[:-1] + "\toffset\tlineLength"
 
         # Write out the transcript name, this is needed to join with coordinates later. 
         bedLikeFile.write(splitLine[0] + "\t")
         # Create a list of the average scores per group. 
         bedLine = ""
         # The fullAverage is used to assign a tpm score representative of the entire bed row. 
         fullAverage = 0.0
         count = 0.0
-        for key, value in groupAverages.iteritems(): 
-            bedLine = bedLine + "," + str(value)
+        if (groupOrder is not None):
+            for group in open(groupOrder, "r"): 
+                value = groupAverages[group.strip("\n")]
+                bedLine = bedLine + "," + floatRound(str(value))
+                count += 1.0
+                fullAverage += value
+        else: 
+            for key, value in sorted(groupAverages.iteritems()): 
+                bedLine = bedLine + "," + floatRound(str(value))
                 count += 1.0
                 fullAverage += value
         # Create what will be columns 5, 7 and 8 of the final bed. 
         bedLine = str(fullAverage/count) + "\t" + str(int(count)) +  "\t" + bedLine[1:] + "\n"
         # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. 
         if (fullAverage > 0.0): 
             validTpms.append((fullAverage/count))
         # Write the bedLine to the intermediate bed-like file. 
         bedLikeFile.write(bedLine)
     # Return the bedInfo so it can be printed right before the script ends. 
     return bedInfo
 
 def expMatrixToBarchartBed(options):
     """
     Convert the expression matrix into a barchart bed file. 
@@ -160,81 +200,94 @@
 
     # Create a dictionary that maps the sample names to their group.
     sampleToGroup = dict()
     for item in options.metaFile: 
         splitLine = item.strip("\n").split()
         sampleToGroup.setdefault(splitLine[0], splitLine[1])
 
     # Use an intermediate file to hold the average values for each group. 
     bedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
     # Keep a list of TPM scores greater than 0. This will be used later
     # to assign bed scores. 
     validTpms = []
 
     # Go through the matrix and condense it into a bed like file. Populate
     # the validTpms array and the bedInfo string.  
-    bedInfo = condenseMatrixIntoBed8Col(validTpms, sampleToGroup, options.matrixFile, bedLikeFile)
+    bedInfo = condenseMatrixIntoBedCols(options.matrixFile, options.groupOrderFile, sampleToGroup, validTpms, bedLikeFile)
     # Find the number which divides the list of non 0 TPM scores into ten blocks. 
     tpmMedian = sorted(validTpms)
     blockSizes = len(tpmMedian)/10
 
     # Create a list of the ten TPM values at the edge of each block.  
     # These used to cast a TPM score to one of ten value between 0-1000.  
-    tpmCuttoffs = []
+    tpmCutoffs = []
     for i in range(1,10):
-        tpmCuttoffs.append(tpmMedian[blockSizes*i])
+        tpmCutoffs.append(tpmMedian[blockSizes*i])
  
     # Sort the bed like file to prepare it for the join. 
     sortedBedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
     cmd = "sort " + bedLikeFile.name + " > " + sortedBedLikeFile.name
     os.system(cmd)
 
+    # Cut apart the coordinate bed to get the transcripts in the first column so it can be joined. 
+    coordBedPart1 = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
+    cmd = "cut -f 4  " + options.coordinateMap + " > " + coordBedPart1.name 
+    os.system(cmd)
+    coordBedPart2 = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
+    cmd = "cut -f 1-3,5,6 " + options.coordinateMap + " > " + coordBedPart2.name 
+    os.system(cmd)
+
     # Sort the coordinate file to prepare it for the join. 
     sortedCoords = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1)
-    cmd = "sort " + options.coordinateMap + " > " + sortedCoords.name 
+    cmd = "paste " + coordBedPart1.name + " " + coordBedPart2.name + " | sort > " + sortedCoords.name 
     os.system(cmd)
     
     # Join the bed-like file and the coordinate file. 
     joinedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
     cmd = "join " + sortedCoords.name + " " + sortedBedLikeFile.name + " | awk -v " + \
             "OFS=\"\\t\" '$1=$1' > " + joinedFile.name
     os.system(cmd)
     
     # Go through the joined file and re arrange the columns creating a bed 6+2 file.
     # Also assign a scaled score 0 - 1000 to each tpm value. 
     bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
     for line in joinedFile:
         splitLine = line.strip("\n").split()
-        if ("_" in splitLine[0]): continue # Ignore alt sequences.  
+        if ("_" in splitLine[0]): 
+            sys.stderr.write("This transcript " + splitLine[0] + " was dropped for having a '_' in the name.\n") 
+            continue # Ignore alt sequences.  
         # Drop sequences where start is greater than end. 
-        if (float(splitLine[3]) > float(splitLine[4])):
+        if (float(splitLine[2]) > float(splitLine[3])):
+            sys.stderr.write("This transcript " + splitLine[0] + " was dropped since chr end, " + \
+                    splitLine[3] + ", is smaller than chr start, " + splitLine[2] + "\n.") 
             continue
-        newLine = (splitLine[1] + "\t" + str(splitLine[3]) + "\t" + str(splitLine[4]) + "\t" + \
-                splitLine[0] + "\t" + str(determineScore(tpmCuttoffs, float(splitLine[6]))) + \
-                "\t" + str(splitLine[2]) + "\t" + str(splitLine[7]) + "\t" + splitLine[8] + \
+        newLine = (splitLine[1] + "\t" + str(splitLine[2]) + "\t" + str(splitLine[3]) + "\t" + \
+                splitLine[0] + "\t" + str(determineScore(tpmCutoffs, float(splitLine[6]))) + \
+                "\t" + str(splitLine[4]) + "\t" + str(splitLine[7]) + "\t" + splitLine[8] + \
                 "\t" + splitLine[5] + "\n")
         bedFile.write(newLine)
 
     # Run Max's indexing script
     indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1)
     cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + indexedBedFile.name
     os.system(cmd)
 
-    # Reformat with awk, move the gene name to behind the indeces. 
-    cmd = "cat " + indexedBedFile.name + " | awk '{print $1\"\t\"$2\"\t\"$3\"\t\"$4\"\t\"$5\"\t\"$6\"\t\"$7\"\t\"$8\"\t\"$10\"\t\"$11\"\t\"$9}' > " + options.outputFile.name
+    # Prepend the bed info to the start of the file.  
+    cmd = "echo '" + bedInfo + "' > " + options.outputFile.name
     os.system(cmd)
 
-    # Append the bed info to the end of the file.  
-    cmd = "echo '" + bedInfo + "' >> " + options.outputFile.name
+    # Reformat with awk, move the gene name to behind the indeces. 
+    cmd = "cat " + indexedBedFile.name + " | awk '{print $1\"\t\"$2\"\t\"$3\"\t\"$4\"\t\"" + \
+            "$5\"\t\"$6\"\t\"$7\"\t\"$8\"\t\"$10\"\t\"$11\"\t\"$9}' >> " + options.outputFile.name
     os.system(cmd)
     
     print ("The columns and order of the groups are; \n" + bedInfo)
 
 def main(args):
     """
     Initialized options and calls other functions.
     """
     options = parseArgs(args)
     expMatrixToBarchartBed(options)
 
 if __name__ == "__main__" : 
     sys.exit(main(sys.argv))