69e7bc29592bfd70f29d30ef9da060f91d3621be ceisenhart Fri May 5 10:20:14 2017 -0700 Fixing an output column error spotted by Chris Lee and updating documentation, refs #18736 and #19155 diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed index d3b4bd8..6bbb794 100755 --- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed +++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed @@ -15,31 +15,31 @@ parser= argparse.ArgumentParser(description = __doc__) parser.add_argument ("metaFile", help = " Two column no header, the first column is the samples which should match " + \ "the matrix, the second is the grouping (cell type, tissue, etc)", type = argparse.FileType("r")) parser.add_argument ("matrixFile", help = " The input matrix file. The samples in the first row should exactly match the ones in " + \ "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \ "the ones in the coordinate file.", type = argparse.FileType("r")) parser.add_argument ("coordinateMap", help = " Bed5+1 format. File that maps the column labels from the matrix to coordinates. Tab " + \ "separated; chr, start coord, end coord, label, strand, gene name. ", action = "store") parser.add_argument ("outputFile", - help = " The output file. ", + help = " The output file, bed 6+5 format. See the schema in kent/src/hg/lib/barChartBed.as. ", type =argparse.FileType("w")) # Optional arguments. parser.add_argument ("--groupOrderFile", help = " Optional file to define the group order, list the groups in a single column in " + \ "the order desired. The default ordering is alphabetical.", action = "store") parser.add_argument ("--useMean", help = " Calculate the group values using mean rather than median.", action = "store_true") parser.add_argument ("--verbose", help = " Show runtime messages.", action = "store_true") parser.set_defaults(verbose = False) parser.set_defaults(groupOrderFile = None) @@ -161,31 +161,31 @@ # First time this group is seen, add it to the groupCounts dict. if (groupAverages.get(columnToGroup[count]) == None): groupAverages.setdefault(columnToGroup[count], [float(col)]) groupCounts.setdefault(columnToGroup[count], 1) # This group has already been seen, update the TPM average. else: groupCounts[columnToGroup[count]] += 1 # Median preparation groupAverages[columnToGroup[count]].append(float(col)) count += 1 # Store some information on the bed file. Most important is the groupOrder. if getBedInfo: getBedInfo = False - bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tgroupCount\tgroupOrder;" + bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tname2\tgroupCount\tgroupOrder;" if (groupOrder is not None): for group in open(groupOrder, "r"): bedInfo += group.strip("\n") + " " else: for key, value in sorted(groupAverages.iteritems()): bedInfo += key + " " bedInfo = bedInfo[:-1] + "\toffset\tlineLength" # Write out the transcript name, this is needed to join with coordinates later. bedLikeFile.write(splitLine[0] + "\t") # Create a list of the average scores per group. bedLine = "" # The fullAverage is used to assign a tpm score representative of the entire bed row. fullAverage = 0.0 count = 0.0 @@ -212,35 +212,36 @@ # Create what will be columns 5, 7 and 8 of the final bed. bedLine = str(fullAverage/count) + "\t" + str(int(count)) + "\t" + bedLine[1:] + "\n" # If the fullAverage tpm is greater than 0 then consider it in the validTpm list. if (fullAverage > 0.0): validTpms.append((fullAverage/count)) # Write the bedLine to the intermediate bed-like file. bedLikeFile.write(bedLine) # Return the bedInfo so it can be printed right before the script ends. return bedInfo def expMatrixToBarchartBed(options): """ Convert the expression matrix into a barchart bed file. options - The command line options (file names, etc). Use the meta data to map the sample names to their groups, then create a dict that maps - the columns to the groups. Go through the matrix line by line and get the average for each - group. Print this to an intermediate file, then use the unix 'join' command to link this with - the coordinates file, creating a bed like file. Go through this file once more to clean up a - bit, re arrange columns and calculate a new score creating a bed 6+2. Finally run - Max's bedJoinTabOffset to index the matrix, creating a bed 6+4 file. + the columns to the groups. Go through the matrix line by line and get the median or average for each + group. Print this to an intermediate file, then use the unix 'join' command to link with + the coordinates file via the first matrix column. This creates a file with many of the bed fields + just in the wrong order. Go through this file to re arrange the columns, check for and remove entries + where chromsomes names include "_" and chr start > chr end. Finally run Max's bedJoinTabOffset to + index the matrix adding the dataOffset and dataLen columns and creating a bed 6+5 file. """ # Create a dictionary that maps the sample names to their group. sampleToGroup = dict() for item in options.metaFile: splitLine = item.strip("\n").split() sampleToGroup.setdefault(splitLine[0], splitLine[1]) # Use an intermediate file to hold the average values for each group. bedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1) # Keep a list of TPM scores greater than 0. This will be used later # to assign bed scores. validTpms = [] # Go through the matrix and condense it into a bed like file. Populate @@ -284,44 +285,39 @@ # Go through the joined file and re arrange the columns creating a bed 6+2 file. # Also assign a scaled score 0 - 1000 to each tpm value. bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) for line in joinedFile: splitLine = line.strip("\n").split() if ("_" in splitLine[0]): sys.stderr.write("This transcript " + splitLine[0] + " was dropped for having a '_' in the name.\n") continue # Ignore alt sequences. # Drop sequences where start is greater than end. if (float(splitLine[2]) > float(splitLine[3])): sys.stderr.write("This transcript " + splitLine[0] + " was dropped since chr end, " + \ splitLine[3] + ", is smaller than chr start, " + splitLine[2] + "\n.") continue newLine = (splitLine[1] + "\t" + str(splitLine[2]) + "\t" + str(splitLine[3]) + "\t" + \ splitLine[0] + "\t" + str(determineScore(tpmCutoffs, float(splitLine[6]))) + \ - "\t" + str(splitLine[4]) + "\t" + str(splitLine[7]) + "\t" + splitLine[8] + \ - "\t" + splitLine[5] + "\n") + "\t" + str(splitLine[4]) + "\t" + splitLine[5] + "\t" + str(splitLine[7]) + \ + "\t" + splitLine[8] + "\n") bedFile.write(newLine) - # Run Max's indexing script - indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) - cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + indexedBedFile.name - os.system(cmd) - # Prepend the bed info to the start of the file. cmd = "echo '" + bedInfo + "' > " + options.outputFile.name os.system(cmd) - # Reformat with awk, move the gene name to behind the indeces. - cmd = "cat " + indexedBedFile.name + " | awk '{print $1\"\t\"$2\"\t\"$3\"\t\"$4\"\t\"" + \ - "$5\"\t\"$6\"\t\"$7\"\t\"$8\"\t\"$10\"\t\"$11\"\t\"$9}' >> " + options.outputFile.name + # Run Max's indexing script + indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) + cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + options.outputFile.name os.system(cmd) print ("The columns and order of the groups are; \n" + bedInfo) def main(args): """ Initialized options and calls other functions. """ options = parseArgs(args) expMatrixToBarchartBed(options) if __name__ == "__main__" : sys.exit(main(sys.argv))