src/utils/matrixMerge cd79064dde5eff6904e314e21df8baa54aafc889

cd79064dde5eff6904e314e21df8baa54aafc889
max
  Mon Aug 8 13:23:10 2016 -0700
adding a command to merge two gene expression matrices, currently useful
for CIRM CDW, but probably useful for other projects that deal with
RNA-seq. No redmine.

diff --git src/utils/matrixMerge src/utils/matrixMerge
new file mode 100755
index 0000000..ffe5aeb
--- /dev/null
+++ src/utils/matrixMerge
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+import logging, sys, optparse
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile
+
+# ==== functions =====
+    
+def parseArgs():
+    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
+    parser = optparse.OptionParser("usage: %prog [options] file1 file2 ... outFname - merge expression matrices with one line per gene into a big matrix. Matrices have to be of identical size and sorted by geneId.")
+
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") 
+    #parser.add_option("", "--test", dest="test", action="store_true", help="do something") 
+    (options, args) = parser.parse_args()
+
+    if args==[]:
+        parser.print_help()
+        exit(1)
+
+    if options.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+    return args, options
+
+def getAllFields(ifhs):
+    " give a list of file handles, get all non-gene headers and return as a list of names "
+    fieldToFname = {}
+    allFields = []
+    colCounts = []
+    for ifh in ifhs:
+        fields = ifh.readline().rstrip("\n").split("\t")
+        colCounts.append(len(fields))
+        assert(fields[0]=="#gene")
+        for i, field in enumerate(fields):
+            # make sure we have no field name overlaps
+            if field in fieldToFname and i!=0:
+                raise Exception("field %s seen twice, in %s and %s" % (field, ifh.name, fieldToFname[field]))
+            fieldToFname[field] = ifh.name
+
+            # only use first field name from first file
+            if i==0:
+                if len(allFields)==0:
+                    allFields.append(field)
+            else:
+                allFields.append(field)
+
+    return allFields, colCounts
+# ----------- main --------------
+def main():
+    args, options = parseArgs()
+
+    inFnames = args[:-1]
+    outFname = args[-1]
+    ofh = open(outFname, "w")
+
+    ifhs = []
+    for fn in inFnames:
+        ifhs.append( open(fn) )
+
+    headers, colCounts = getAllFields(ifhs)
+    ofh.write("\t".join(headers))
+    ofh.write("\n")
+
+    for i, ifh in enumerate(ifhs):
+        logging.info("File %s: %d columns with values" % (ifhs[i].name, colCounts[i]-1)) 
+
+    fileCount = len(inFnames)
+    progressStep = 1000
+
+    doProcess = True
+    lineCount = 0
+
+    while doProcess:
+        geneId = None
+
+        lineCount += 1
+        if lineCount % progressStep == 0:
+            logging.info("Processed %d rows" % (lineCount))
+
+        for i, ifh in enumerate(ifhs):
+            lineStr = ifh.readline()
+            # check if we're at EOF
+            if lineStr=='':
+                doProcess = False
+                break
+
+            fields = lineStr.rstrip("\n").split("\t")
+            if (len(fields)!=colCounts[i]): # check number of columns against header count
+                raise Exception("Illegal number of fields: file %s, line %d, field count: %d, expected %d" % 
+                    (ifh.name, lineCount, len(fields), colCounts[i]))
+
+            # check the gene ID
+            if i==0: # get the gene ID from the first file
+                geneId = fields[0]
+                allVals = [geneId]
+            else:
+                assert(geneId == fields[0]) # if this fails, then the rows are not in the same order
+
+            allVals.extend(fields[1:])
+
+        ofh.write("\t".join(allVals))
+        ofh.write("\n")
+
+    # make sure that we've read through all files
+    for ifh in ifhs:
+        #print ifh.name
+        assert(ifh.readline()=='') # a file has still lines left to read?
+
+    ofh.close()
+    logging.info("Wrote %d lines (not counting header)" % lineCount)
+
+main()