0502f1581f13bae7ba257fc290176d64e8bfd616 ceisenhart Wed May 10 15:25:52 2017 -0700 Changing the format of one of the input files and updating documentation a bit, refs #19155 diff --git src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed index 6bbb794..3e155c2 100755 --- src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed +++ src/utils/expMatrixToBarchartBed/expMatrixToBarchartBed @@ -1,41 +1,41 @@ #!/usr/bin/env python2.7 # expMatrixToBarchartBed """ Generate a barChart bed6+5 file from a matrix, meta data, and coordinates. """ import os import sys import argparse import tempfile def parseArgs(args): """ Parse the command line arguments. """ parser= argparse.ArgumentParser(description = __doc__) - parser.add_argument ("metaFile", + parser.add_argument ("sampleFile", help = " Two column no header, the first column is the samples which should match " + \ "the matrix, the second is the grouping (cell type, tissue, etc)", type = argparse.FileType("r")) parser.add_argument ("matrixFile", help = " The input matrix file. The samples in the first row should exactly match the ones in " + \ - "the metaFile. The labels (ex ENST*****) in the first column should exactly match " + \ - "the ones in the coordinate file.", + "the sampleFile. The labels (ex ENST*****) in the first column should exactly match " + \ + "the ones in the bed file.", type = argparse.FileType("r")) - parser.add_argument ("coordinateMap", - help = " Bed5+1 format. File that maps the column labels from the matrix to coordinates. Tab " + \ + parser.add_argument ("bedFile", + help = " Bed4+2 format. File that maps the column labels from the matrix to coordinates. Tab " + \ "separated; chr, start coord, end coord, label, strand, gene name. ", action = "store") parser.add_argument ("outputFile", help = " The output file, bed 6+5 format. See the schema in kent/src/hg/lib/barChartBed.as. ", type =argparse.FileType("w")) # Optional arguments. parser.add_argument ("--groupOrderFile", help = " Optional file to define the group order, list the groups in a single column in " + \ "the order desired. The default ordering is alphabetical.", action = "store") parser.add_argument ("--useMean", help = " Calculate the group values using mean rather than median.", action = "store_true") parser.add_argument ("--verbose", help = " Show runtime messages.", @@ -161,38 +161,38 @@ # First time this group is seen, add it to the groupCounts dict. if (groupAverages.get(columnToGroup[count]) == None): groupAverages.setdefault(columnToGroup[count], [float(col)]) groupCounts.setdefault(columnToGroup[count], 1) # This group has already been seen, update the TPM average. else: groupCounts[columnToGroup[count]] += 1 # Median preparation groupAverages[columnToGroup[count]].append(float(col)) count += 1 # Store some information on the bed file. Most important is the groupOrder. if getBedInfo: getBedInfo = False - bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tname2\tgroupCount\tgroupOrder;" + bedInfo += "#chr\tstart\tend\tname\tscore\tstrand\tname2\texpCount\texpScores;" if (groupOrder is not None): for group in open(groupOrder, "r"): bedInfo += group.strip("\n") + " " else: for key, value in sorted(groupAverages.iteritems()): bedInfo += key + " " - bedInfo = bedInfo[:-1] + "\toffset\tlineLength" + bedInfo = bedInfo[:-1] + "\t_offset\t_lineLength" # Write out the transcript name, this is needed to join with coordinates later. bedLikeFile.write(splitLine[0] + "\t") # Create a list of the average scores per group. bedLine = "" # The fullAverage is used to assign a tpm score representative of the entire bed row. fullAverage = 0.0 count = 0.0 if (groupOrder is not None): for group in open(groupOrder, "r"): # Averages if (useMean): value = groupAverages[group.strip("\n")] else: value = median(groupAverages[group.strip("\n")]) @@ -222,102 +222,118 @@ def expMatrixToBarchartBed(options): """ Convert the expression matrix into a barchart bed file. options - The command line options (file names, etc). Use the meta data to map the sample names to their groups, then create a dict that maps the columns to the groups. Go through the matrix line by line and get the median or average for each group. Print this to an intermediate file, then use the unix 'join' command to link with the coordinates file via the first matrix column. This creates a file with many of the bed fields just in the wrong order. Go through this file to re arrange the columns, check for and remove entries where chromsomes names include "_" and chr start > chr end. Finally run Max's bedJoinTabOffset to index the matrix adding the dataOffset and dataLen columns and creating a bed 6+5 file. """ # Create a dictionary that maps the sample names to their group. sampleToGroup = dict() - for item in options.metaFile: + count = 0 + for item in options.sampleFile: + count +=1 splitLine = item.strip("\n").split() + if (len(splitLine) is not 2): + print ("There was an error reading the sample file at line " + count) + exit(1) sampleToGroup.setdefault(splitLine[0], splitLine[1]) # Use an intermediate file to hold the average values for each group. bedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1) # Keep a list of TPM scores greater than 0. This will be used later # to assign bed scores. validTpms = [] # Go through the matrix and condense it into a bed like file. Populate # the validTpms array and the bedInfo string. bedInfo = condenseMatrixIntoBedCols(options.matrixFile, options.groupOrderFile, sampleToGroup, \ validTpms, bedLikeFile, options.useMean) # Find the number which divides the list of non 0 TPM scores into ten blocks. tpmMedian = sorted(validTpms) blockSizes = len(tpmMedian)/10 # Create a list of the ten TPM values at the edge of each block. # These used to cast a TPM score to one of ten value between 0-1000. tpmCutoffs = [] for i in range(1,10): tpmCutoffs.append(tpmMedian[blockSizes*i]) # Sort the bed like file to prepare it for the join. sortedBedLikeFile = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1) cmd = "sort " + bedLikeFile.name + " > " + sortedBedLikeFile.name os.system(cmd) # Cut apart the coordinate bed to get the transcripts in the first column so it can be joined. coordBedPart1 = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1) - cmd = "cut -f 4 " + options.coordinateMap + " > " + coordBedPart1.name + cmd = "cut -f 4 " + options.bedFile + " > " + coordBedPart1.name os.system(cmd) coordBedPart2 = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1) - cmd = "cut -f 1-3,5,6 " + options.coordinateMap + " > " + coordBedPart2.name + cmd = "cut -f 1-3,5-7 " + options.bedFile + " > " + coordBedPart2.name os.system(cmd) # Sort the coordinate file to prepare it for the join. sortedCoords = tempfile.NamedTemporaryFile( mode = "w+", bufsize = 1) cmd = "paste " + coordBedPart1.name + " " + coordBedPart2.name + " | sort > " + sortedCoords.name os.system(cmd) # Join the bed-like file and the coordinate file. joinedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) cmd = "join " + sortedCoords.name + " " + sortedBedLikeFile.name + " | awk -v " + \ "OFS=\"\\t\" '$1=$1' > " + joinedFile.name os.system(cmd) # Go through the joined file and re arrange the columns creating a bed 6+2 file. # Also assign a scaled score 0 - 1000 to each tpm value. bedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) for line in joinedFile: splitLine = line.strip("\n").split() if ("_" in splitLine[0]): sys.stderr.write("This transcript " + splitLine[0] + " was dropped for having a '_' in the name.\n") continue # Ignore alt sequences. # Drop sequences where start is greater than end. if (float(splitLine[2]) > float(splitLine[3])): sys.stderr.write("This transcript " + splitLine[0] + " was dropped since chr end, " + \ splitLine[3] + ", is smaller than chr start, " + splitLine[2] + "\n.") continue - newLine = (splitLine[1] + "\t" + str(splitLine[2]) + "\t" + str(splitLine[3]) + "\t" + \ - splitLine[0] + "\t" + str(determineScore(tpmCutoffs, float(splitLine[6]))) + \ - "\t" + str(splitLine[4]) + "\t" + splitLine[5] + "\t" + str(splitLine[7]) + \ - "\t" + splitLine[8] + "\n") - bedFile.write(newLine) + chrom = splitLine[1] + chrStart = splitLine[2] + chrEnd = splitLine[3] + name = splitLine[0] + score = str(determineScore(tpmCutoffs, float(splitLine[7]))) + strand = splitLine[5] + name2 = splitLine[6] + expCount = str(splitLine[7]) + expScores = splitLine[8] - # Prepend the bed info to the start of the file. - cmd = "echo '" + bedInfo + "' > " + options.outputFile.name - os.system(cmd) + bedLine = (chrom + "\t" + chrStart + "\t" + chrEnd + "\t" + name + "\t" \ + + score + "\t" + strand + "\t" + name2 + "\t" + expCount + "\t" \ + + expScores + "\n") + bedFile.write(bedLine) # Run Max's indexing script indexedBedFile = tempfile.NamedTemporaryFile(mode="w+", bufsize=1) - cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + options.outputFile.name + cmd = "bedJoinTabOffset " + options.matrixFile.name + " " + bedFile.name + " " + indexedBedFile.name + os.system(cmd) + + # Prepend the bed info to the start of the file. + cmd = "echo '" + bedInfo + "' > " + options.outputFile.name + os.system(cmd) + cmd = "cat " + indexedBedFile.name + " >> " + options.outputFile.name os.system(cmd) print ("The columns and order of the groups are; \n" + bedInfo) def main(args): """ Initialized options and calls other functions. """ options = parseArgs(args) expMatrixToBarchartBed(options) if __name__ == "__main__" : sys.exit(main(sys.argv))