python/lib/ucscgenomics/ucscUtils.py fea4b164eb055075db2d11a8d96af2bf5659a54e

fea4b164eb055075db2d11a8d96af2bf5659a54e
wong
  Wed Jan 25 11:48:27 2012 -0800
moved to Needleman-Wunsch for alignment of lists, left in code to convert to Smith-Waterman if this proves to be a bad decision
diff --git python/lib/ucscgenomics/ucscUtils.py python/lib/ucscgenomics/ucscUtils.py
index faa5cfb..e6d4ab4 100644
--- python/lib/ucscgenomics/ucscUtils.py
+++ python/lib/ucscgenomics/ucscUtils.py
@@ -1,137 +1,152 @@
 #!/hive/groups/encode/dcc/bin/python
 import sys, string, os, re, argparse, subprocess, math
 
 def isGbdbFile(file, table, database):
     errors = []
     if os.path.isfile("/gbdb/%s/bbi/%s" % (database, file)):
         return 1
     else:
         cmd = "hgsql %s -e \"select fileName from (%s)\"" % (database, table)
         p = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
         cmdoutput = p.stdout.read()
         if os.path.isfile(cmdoutput.split("\n")[1]):
             return 1
         else:
             return 0
             
 def makeFileSizes(inlist, path=None):
     checklist = list()
 
     for i in inlist:
     	if path:
             checklist.append("%s/%s" % (path, i))    
         else:
             checklist.append(i)
     filesizes = 0
     for i in checklist:
         realpath = os.path.realpath(i)
         filesizes = filesizes + int(os.path.getsize(realpath))
 
     filesizes = math.ceil(float(filesizes) / (1024**2))
 
     return int(filesizes)
     
 def printIter(set, path=None):
     output = []
     for i in sorted(set):
         if path:
             output.append("%s/%s" % (path, i))
         else:
             output.append("%s" % (i))
     return output
 
 def zeros(m,n):
     cross = list()
     for i in range(m):
         cross.append(list())
         for j in range(n):
             cross[i].append(0)
     return cross
 
 def mergeList(list1,list2):
+    """ Takes in two lists, returns a single list merged into a consensus list using a modified Needleman-Wunsch
+        alignment. The comment lines with two comments are there to signalify a change to Smith-Waterman, remove the 
+        double commented lines to convert.
+        Single comment lines are for debugging, remove when this routine matures.
+    """
+
     m,n = len(list1),len(list2)
     score = zeros(m+1,n+1)
     pointer = zeros(m+1,n+1)
     penalty = -1
     max_i = 0
     max_j = 0
     maxScore = 0
 
+    #print "len_i = %s, len_j = %s" % (m,n)
     for i in range(1,m+1):
         for j in range(1,n+1):
             scoreUp = score[i-1][j]+penalty
             scoreDown = score[i][j-1]+penalty
             scoreDiagonal = score[i-1][j-1]
             if list1[i-1] == list2[j-1]:
                 scoreDiagonal = scoreDiagonal + 10
             score[i][j] = max(0,scoreUp,scoreDown,scoreDiagonal)
             if score[i][j] == 0:
                 pointer[i][j] = 0
             if score[i][j] == scoreUp:
                 pointer[i][j] = 1
             if score[i][j] == scoreDown:
                 pointer[i][j] = 2
             if score[i][j] == scoreDiagonal:
                 pointer[i][j] = 3
-            if score[i][j] >= maxScore:
-                max_i = i
-                max_j = j
-                maxScore = score[i][j]
+            ##if score[i][j] >= maxScore:
+            ##    max_i = i
+            ##    max_j = j
+            ##    maxScore = score[i][j]
 
-    align1,align2 = list(),list()
-    after_i,after_j = max_i,max_j
+    #for k in pointer:
+    #    line = ""
+    #    for l in k:
+    #        line = line + "%s " % l
+    #    print line
 
+    align1,align2 = list(),list()
+    ##after_i,after_j = max_i,max_j
+    #print "max_i = %s, max_j = %s" % (max_i, max_j)
     while pointer[i][j] != 0:
         if pointer[i][j] == 3:
             align1.append(str(list1[i-1]))
             align2.append(str(list2[j-1]))
             i = i-1
             j = j-1
         elif pointer[i][j] == 2:
             align1.append('-')
             align2.append(str(list2[j-1]))
             j = j-1
         elif pointer[i][j] == 1:
             align1.append(str(list1[i-1]))
             align2.append('-')
             i = i-1
 
     before_i,before_j = i,j
     align1.reverse()
     align2.reverse()
     consensus = list()
 
+    #for i in range(len(align1)):
+    #    print "%s = %s" % (align1[i], align2[i])
+
     for i in range(len(align1)):
         if align1[i] == align2[i]:
             consensus.append(align1[i])
         elif align1[i] == "-":
             consensus.append(align2[i])
         elif align2[i] == "-":
             consensus.append(align1[i])
         elif align1[i] != align2[i]:
             consensus.append(align1[i])
             consensus.append(align2[i])
-
     before = list()
     if before_i != 0:
         before.extend(list1[0:before_i])
     if before_j != 0:
         before.extend(list2[0:before_j])
-    if after_i < len(list1):
-        consensus.extend(list1[after_i:])
-    if after_j < len(list2):
-        consensus.extend(list2[after_j:])
+    ##if after_i < len(list1):
+    ##    consensus.extend(list1[after_i:])
+    ##if after_j < len(list2):
+    ##    consensus.extend(list2[after_j:])
     before.extend(consensus)
     setcon = list()
     p = re.compile('^#')
     p2 = re.compile('^\s*$')
     for i in before:
         if p.match(i) or p2.match(i):
             setcon.append(i)
             continue
         if i in setcon:
             continue
         else:
             setcon.append(i)
     
     return setcon