a44e3254174152b2d4b3f241d1935654cd139181
vsmalladi
  Tue May 8 10:11:21 2012 -0700
Renamed library from ucscgenomics to ucscGb. Redmine #7029.
diff --git python/lib/ucscGb/track.py python/lib/ucscGb/track.py
new file mode 100644
index 0000000..d4ca776
--- /dev/null
+++ python/lib/ucscGb/track.py
@@ -0,0 +1,458 @@
+import os, re
+from ucscgenomics import ra, mdb, encode
+
+class TrackFile(object):
+    '''
+    A file in the trackDb, which has useful information about itself.
+
+    CompositeTrack (below) has multiple dictionaries of TrackFiles, one for
+    the root downloads directory, and one for each release. The root directory
+    will link itself to the CompositeTrack's alpha metadata.
+    '''
+
+    @property
+    def name(self):
+        '''The file's name, without its directory'''
+        return self._name
+
+    @property
+    def fullname(self):
+        '''The file's full name including path'''
+        return self._path + self._name
+
+    @property
+    def path(self):
+        '''The file's path, ending in "/"'''
+        return self._path
+
+    @property
+    def md5sum(self):
+        '''The md5sum given to the constructor, else encode.hashFile(fullname), cached on first access'''
+        if self._md5sum is None:
+            self._md5sum = encode.hashFile(self.fullname)
+        return self._md5sum
+
+    @property
+    def extension(self):
+        '''The filetype: the last dotted suffix once a trailing .gz/.tgz is dropped, or None'''
+        return self._extension
+
+    @property
+    def size(self):
+        '''The size in bytes'''
+        return self._size
+
+    @property
+    def metaObject(self):
+        '''The metadata object given to the constructor, or None'''
+        return self._metaObj
+
+    def __init__(self, fullname, md5=None, metaObj=None):
+        '''Raises KeyError if fullname is not an existing regular file'''
+        fullname = os.path.abspath(fullname)
+        if not os.path.isfile(fullname):
+            raise KeyError('invalid file: %s' % fullname)
+        self._path, self._name = fullname.rsplit('/', 1)
+        self._path = self._path + '/'
+        self._fullname = fullname
+        self._size = os.stat(fullname).st_size
+        self._md5sum = md5
+        self._metaObj = metaObj
+
+        # Strings are immutable, so the stripped name has to be kept in a
+        # variable. Only a trailing compression suffix is removed, and the
+        # split is done once from the right so 'a.v2.bam' gives 'bam'.
+        stem = re.sub(r'\.(gz|tgz)$', '', self._name)
+        self._extension = stem.rsplit('.', 1)[1] if '.' in stem else None
+    
+class Release(object):
+    '''
+    Keeps track of a single release, stored within the track.
+    '''
+
+    @property
+    def index(self):
+        '''Which release, represented as an int starting with 1'''
+        return self._index
+
+    @property
+    def onAlpha(self):
+        '''True if the release status includes alpha'''
+        return self._alpha
+
+    @property
+    def onBeta(self):
+        '''True if the release status includes beta'''
+        return self._beta
+
+    @property
+    def onPublic(self):
+        '''True if the release status includes public'''
+        return self._public
+
+    @property
+    def files(self):
+        '''The files given to the constructor: a dictionary of TrackFiles keyed by filename, or None'''
+        return self._files
+
+    def __init__(self, index, status, files):
+        '''status is a comma-separated subset of alpha,beta,public; blank means all three'''
+        self._files = files
+        self._index = index
+        # real booleans in every case; spaces round the commas are tolerated
+        places = [place.strip() for place in status.split(',')]
+        everywhere = status.strip() == ''
+        self._alpha = everywhere or 'alpha' in places
+        self._beta = everywhere or 'beta' in places
+        self._public = everywhere or 'public' in places
+    
+class CompositeTrack(object):
+    '''
+    Stores an entire track, consisting mainly of its metadata and files.
+
+    To make a CompositeTrack, you must specify database and name of the track:
+        sometrack = CompositeTrack('hg19', 'wgEncodeCshlLongRnaSeq')
+
+    You can also specify a trackDb path in the event that yours is different
+    from the default, '~/kent/src/hg/makeDb/trackDb/':
+        sometrack = CompositeTrack('hg19', 'wgEncode...', '/weird/path')
+
+    The CompositeTrack does NOT load all of its information up front: the
+    constructor only reads trackDb.wgEncode.ra to locate the track's ra file,
+    and the ra files and file listings are loaded the first time they are
+    asked for. You can access a composite's ra files:
+        somemetadata = sometrack.alphaMetaDb
+
+    For more information on what you can do with ra files, check the ra.py
+    documentation.
+
+    You can also access a track's files. This is one of the more useful parts
+    of the composite track:
+        for file in sometrack.files.itervalues():
+            print '%s %s' % (file.name, file.size)
+
+    Each file is an instance of a TrackFile object, which is detailed in its
+    own documentation above. There are also dictionaries of these files for
+    each release associated with the track:
+        for filename in sometrack.releases[0]:
+            print filename in sometrack.releases[1]
+
+    Note that the files are indexed by their filename. This means that you can
+    easily compare multiple releases as in the above example.
+    '''
+
+    @property
+    def database(self):
+        '''The database for this composite, typically hg19 for humans'''
+        return self._database
+
+    @property
+    def name(self):
+        '''The composite name'''
+        return self._name
+
+    @property
+    def downloadsDirectory(self):
+        '''The location of files in downloads; KeyError if it is not a directory'''
+        if not os.path.isdir(self._downloadsDirectory):
+            raise KeyError(self._downloadsDirectory + ' does not exist')
+        return self._downloadsDirectory
+
+    @property
+    def httpDownloadsPath(self):
+        '''The downloadable files path in apache form; KeyError if it is not a directory'''
+        if not os.path.isdir(self._httpDownloadsPath):
+            raise KeyError(self._httpDownloadsPath + ' does not exist')
+        return self._httpDownloadsPath
+
+    @property
+    def files(self):
+        '''
+        A dictionary of TrackFiles for every regular file in the downloads
+        directory of this composite, keyed by filename. Built on first access.
+        '''
+        if hasattr(self, '_files'):
+            return self._files
+
+        md5sums = encode.readMd5sums(self._md5path)
+
+        # map each filename named in an alpha metaDb stanza to that stanza
+        radict = dict()
+        for stanza in self.alphaMetaDb.itervalues():
+            if 'fileName' in stanza:
+                for filename in stanza['fileName'].split(','):
+                    radict[filename] = stanza
+
+        # Fill a local dict and cache it only once complete, so a failure
+        # part-way through is not remembered as an empty or partial result.
+        downloads = self.downloadsDirectory
+        trackfiles = dict()
+        for filename in os.listdir(downloads):
+            if os.path.isfile(downloads + filename):
+                md5 = None
+                # readMd5sums may give None (the releases property checks for it)
+                if md5sums is not None and filename in md5sums:
+                    md5 = md5sums[filename]
+                trackfiles[filename] = TrackFile(downloads + filename, md5, radict.get(filename))
+
+        self._files = trackfiles
+        return self._files
+
+    def _makeQaDir(self, root):
+        '''Creates root/<database>/<name>/ if absent, records it as _qaDir and returns it'''
+        qaDir = root + self._database + '/' + self._name + '/'
+        if not os.path.isdir(qaDir):
+            os.makedirs(qaDir)
+        self._qaDir = qaDir
+        return qaDir
+
+    @property
+    def qaInitDir(self):
+        '''The QA directory for this composite, created on access if missing'''
+        return self._makeQaDir('/hive/groups/encode/encodeQa/')
+
+    @property
+    def qaInitDirTest(self):
+        '''The test QA directory for this composite, created on access if missing'''
+        return self._makeQaDir('/hive/groups/encode/encodeQa/test/')
+
+    @property
+    def releaseObjects(self):
+        '''
+        A list of Release objects, one per release from 1 up to the highest
+        release found for this composite in trackDb.wgEncode.ra, each with
+        files set to None. Empty when no line there names the composite.
+        '''
+        if hasattr(self, '_releaseObjects'):
+            return self._releaseObjects
+
+        maxcomposite = 0
+        statuses = dict()
+        with open(self._trackDbDir + 'trackDb.wgEncode.ra') as tdb:
+            for line in tdb:
+                if line.startswith('#') or line.strip() == '':
+                    continue
+                parts = line.split()
+                if len(parts) < 2:
+                    continue
+                composite = parts[1]
+                places = ''
+                if len(parts) > 2:
+                    places = parts[2]
+                if not composite.startswith(self.name):
+                    continue
+                compositeparts = composite.split('.')
+                if len(compositeparts) >= 2 and compositeparts[1].startswith('release'):
+                    index = int(compositeparts[1].replace('release', ''))
+                else:                       # THINK MORE ABOUT THIS REGION RE: PATCHES
+                    index = 1               # no releaseN component: counted as release 1
+                statuses[index] = places
+                maxcomposite = max(maxcomposite, index)
+
+        releaseObjects = list()
+        if maxcomposite > 0:
+            # a release without a line of its own takes the status of the
+            # nearest higher-numbered release that has one
+            lastplace = statuses[maxcomposite]
+            for i in range(maxcomposite, 0, -1):
+                if i in statuses:
+                    lastplace = statuses[i]
+                else:
+                    statuses[i] = lastplace
+            for i in range(1, maxcomposite + 1):
+                releaseObjects.append(Release(i, statuses[i], None))
+
+        self._releaseObjects = releaseObjects
+        return self._releaseObjects
+
+    @property
+    def releases(self):
+        '''
+        A list with one dictionary per releaseN directory under the downloads
+        directory (release1 first), mapping filename to TrackFile. Files inside
+        a subdirectory whose path matches 'supplemental' are keyed as
+        'subdir/filename'; other subdirectories are skipped.
+        '''
+        if hasattr(self, '_releaseFiles'):
+            return self._releaseFiles
+
+        downloads = self.downloadsDirectory
+        allreleases = list()
+        count = 1
+        while os.path.exists(downloads + 'release' + str(count)):
+            releasepath = downloads + 'release' + str(count) + '/'
+            md5s = encode.readMd5sums(releasepath + 'md5sum.txt')
+            releasefiles = dict()
+
+            for entry in os.listdir(releasepath):
+                entrypath = releasepath + entry
+                if os.path.isdir(entrypath):
+                    if not re.match('.*supplemental.*', entrypath):
+                        continue
+                    for innerfile in os.listdir(entrypath):
+                        pathfile = entry + "/" + innerfile
+                        releasefiles[pathfile] = TrackFile(releasepath + pathfile, None)
+                elif entry != 'md5sum.txt' and md5s is not None and entry in md5s:
+                    releasefiles[entry] = TrackFile(entrypath, md5s[entry])
+                else:
+                    # md5sum.txt itself, and files with no recorded md5sum
+                    releasefiles[entry] = TrackFile(entrypath, None)
+
+            allreleases.append(releasefiles)
+            count = count + 1
+
+        self._releaseFiles = allreleases
+        return self._releaseFiles
+
+    @property
+    def alphaMetaDb(self):
+        '''The Ra file in the alpha metaDb for this composite; KeyError if it is missing'''
+        if not hasattr(self, '_alphaMetaDb'):
+            if not os.path.isfile(self._alphaMdbPath):
+                raise KeyError(self._alphaMdbPath + ' does not exist')
+            self._alphaMetaDb = mdb.MdbFile(self._alphaMdbPath)
+        return self._alphaMetaDb
+
+    @property
+    def betaMetaDb(self):
+        '''The Ra file in the beta metaDb for this composite; KeyError if it is missing'''
+        if not hasattr(self, '_betaMetaDb'):
+            if not os.path.isfile(self._betaMdbPath):
+                raise KeyError(self._betaMdbPath + ' does not exist')
+            self._betaMetaDb = mdb.MdbFile(self._betaMdbPath)
+        return self._betaMetaDb
+
+    @property
+    def publicMetaDb(self):
+        '''The Ra file in the public metaDb for this composite; KeyError if it is missing'''
+        if not hasattr(self, '_publicMetaDb'):
+            if not os.path.isfile(self._publicMdbPath):
+                raise KeyError(self._publicMdbPath + ' does not exist')
+            self._publicMetaDb = mdb.MdbFile(self._publicMdbPath)
+        return self._publicMetaDb
+
+    @property
+    def trackDb(self):
+        '''The Ra file in the trackDb for this composite'''
+        if not hasattr(self, '_trackDb'):
+            self._trackDb = ra.RaFile(self._trackDbPath)
+        return self._trackDb
+
+    @property
+    def trackPath(self):
+        '''The track path for this composite'''
+        return self._trackPath
+
+    @property
+    def url(self):
+        '''The url on our site for this composite'''
+        return self._url
+
+    @property
+    def organism(self):
+        '''The organism that encode.organisms gives for this composite's database'''
+        return self._organism
+
+    @property
+    def currentTrackDb(self):
+        '''
+        The path (trackDb directory plus ra file name) taken from the first
+        non-comment line of trackDb.wgEncode.ra that names a ra file of this
+        composite followed by a word containing 'alpha'; None if no line does.
+        '''
+        # raw string keeps \S intact; escaping makes the name match literally
+        p = re.compile(r'.*(%s\S+) ?(\S+)' % re.escape(self._name))
+        # 'with' closes the file on every exit, including the early return
+        with open(self._trackDbDir + 'trackDb.wgEncode.ra', 'r') as f:
+            for line in f:
+                if re.match(r'^\s*#.*', line):
+                    continue
+                m = p.match(line)
+                if m and re.search('alpha', m.group(2)):
+                    return '%s%s' % (self._trackDbDir, m.group(1))
+        return None
+
+    def __init__(self, database, compositeName, trackPath=None, mdbCompositeName=None):
+        '''
+        database          e.g. 'hg19'; must be a key of encode.organisms
+        compositeName     name of the composite track
+        trackPath         trackDb root; defaults to ~/kent/src/hg/makeDb/trackDb/
+        mdbCompositeName  name of the metaDb ra file, when not compositeName
+        Raises KeyError for an unknown database or a missing trackDb ra file.
+        '''
+        if mdbCompositeName is None:
+            mdbCompositeName = compositeName
+
+        if trackPath is None:
+            trackPath = os.path.expanduser('~/kent/src/hg/makeDb/trackDb/')
+        elif not trackPath.endswith('/'):
+            trackPath = trackPath + '/'
+        self._trackPath = trackPath
+
+        if database not in encode.organisms:
+            raise KeyError(database + ' is not a valid database')
+        self._organism = encode.organisms[database]
+        self._database = database
+        self._name = compositeName
+        self._trackDbDir = self._trackPath + self._organism + '/' + database + '/'
+        self._alphaMdbDir = self._trackDbDir + 'metaDb/alpha/'
+        self._betaMdbDir = self._trackDbDir + 'metaDb/beta/'
+        self._publicMdbDir = self._trackDbDir + 'metaDb/public/'
+        self._alphaMdbPath = self._alphaMdbDir + mdbCompositeName + '.ra'
+        self._betaMdbPath = self._betaMdbDir + mdbCompositeName + '.ra'
+        self._publicMdbPath = self._publicMdbDir + mdbCompositeName + '.ra'
+        self._downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + compositeName + '/'
+        self._md5path = self._downloadsDirectory + 'md5sum.txt'
+        self._httpDownloadsPath = '/usr/local/apache/htdocs-hgdownload/goldenPath/' + database + '/encodeDCC/' + compositeName + '/'
+        self._rrHttpDir = '/usr/local/apache/htdocs/goldenPath/' + database + '/encodeDCC/' + compositeName + '/'
+        self._notesDirectory = os.path.expanduser("~/kent/src/hg/makeDb/doc/encodeDcc%s" % database.capitalize()) + '/'
+        self._url = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + compositeName
+
+        # the ra file listed for alpha wins; otherwise fall back to <compositeName>.ra
+        self._trackDbPath = self.currentTrackDb
+        if self._trackDbPath is None:
+            self._trackDbPath = self._trackDbDir + compositeName + '.ra'
+        if not os.path.isfile(self._trackDbPath):
+            raise KeyError(self._trackDbPath + ' does not exist')
+        
+
+class TrackCollection(dict):
+    '''
+    A collection that stores all the tracks for a given database, indexed by
+    its metaDb name.
+    '''
+
+    @property
+    def database(self):
+        '''The database given to the constructor'''
+        return self._database
+
+    @property
+    def organism(self):
+        '''The encode.organisms entry for the database'''
+        return self._organism
+
+    def __init__(self, database, trackPath=None):
+        '''Raises KeyError when database is not a key of encode.organisms'''
+        dict.__init__(self)
+        if database not in encode.organisms:
+            raise KeyError(database + ' is not a valid database')
+        self._database = database
+        self._organism = encode.organisms[database]
+
+        if trackPath is None:
+            trackPath = os.path.expanduser('~/kent/src/hg/makeDb/trackDb/')
+        elif not trackPath.endswith('/'):
+            trackPath = trackPath + '/'
+        self._trackPath = trackPath
+
+        dbDir = self._trackPath + self._organism + '/' + self._database + '/'
+        metaDb = dbDir + 'metaDb/alpha/'
+        for raName in os.listdir(metaDb):
+            if raName.endswith('.ra') and os.path.isfile(metaDb + raName):
+                # slice off the suffix only; replace() would also hit '.ra' inside the name
+                trackname = raName[:-len('.ra')]
+                if os.path.isfile(dbDir + raName):
+                    self[trackname] = CompositeTrack(self._database, trackname, self._trackPath)
+                
+                
\ No newline at end of file