# Reconstructed from the diff in commit a44e3254174152b2d4b3f241d1935654cd139181
# (vsmalladi, Tue May 8 10:11:21 2012 -0700): "Renamed library from
# ucscgenomics to ucscGb. Redmine #7029." -- new file python/lib/ucscGb/track.py
'''
Objects describing ENCODE composite tracks: the files in their downloads
directories, their releases, and their trackDb / metaDb ra files.
'''

import os
import re

# NOTE(review): the commit message says the library was renamed to ucscGb, yet
# this import still targets ucscgenomics -- confirm which package name is live.
from ucscgenomics import ra, mdb, encode

# Default location of the trackDb tree when the caller does not supply one.
_DEFAULT_TRACK_PATH = '~/kent/src/hg/makeDb/trackDb/'

# Suffixes that only describe compression and are ignored when working out
# a file's type.
_COMPRESSION_SUFFIXES = ('.gz', '.tgz')


def _fileExtension(name):
    '''
    Return the file type of name (the text after the last '.'), ignoring a
    trailing compression suffix, or None when name contains no '.'.

    'x.bam.gz' -> 'bam', 'a.b.bam' -> 'bam', 'x.gz' -> 'gz', 'README' -> None
    '''
    stem = name
    for suffix in _COMPRESSION_SUFFIXES:
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
            break
    if '.' not in stem:
        # Nothing but the compression suffix carried a dot; fall back to the
        # full name so that 'x.gz' still reports 'gz'.
        stem = name
    if '.' not in stem:
        return None
    return stem.rsplit('.', 1)[1]


def _resolveTrackPath(trackPath):
    '''Return trackPath (or the default trackDb path) with a trailing slash.'''
    if trackPath is None:
        trackPath = os.path.expanduser(_DEFAULT_TRACK_PATH)
    if not trackPath.endswith('/'):
        trackPath = trackPath + '/'
    return trackPath


def _lookupOrganism(database):
    '''Return the organism for database, raising KeyError if it is unknown.'''
    if database in encode.organisms:
        return encode.organisms[database]
    raise KeyError(database + ' is not a valid database')


class TrackFile(object):
    '''
    A file in the trackDb, which has useful information about itself.

    CompositeTrack (below) has multiple dictionaries of TrackFiles, one for
    the root downloads directory, and one for each release. The root directory
    will link itself to the CompositeTrack's alpha metadata.
    '''

    def __init__(self, fullname, md5=None, metaObj=None):
        '''
        fullname: path to an existing regular file (made absolute here).
        md5:      known md5sum, or None to have it computed on first access.
        metaObj:  metadata object associated with this file, if any.

        Raises KeyError if fullname is not a regular file.
        '''
        fullname = os.path.abspath(fullname)
        if not os.path.isfile(fullname):
            raise KeyError('invalid file: %s' % fullname)
        self._path, self._name = fullname.rsplit('/', 1)
        self._path = self._path + '/'
        self._fullname = fullname
        self._size = os.stat(fullname).st_size
        self._md5sum = md5
        self._metaObj = metaObj
        self._extension = _fileExtension(self._name)

    @property
    def name(self):
        '''The file's name'''
        return self._name

    @property
    def fullname(self):
        '''The file's full name including path'''
        return self._path + self._name

    @property
    def path(self):
        '''The file's path, ending in a slash'''
        return self._path

    @property
    def md5sum(self):
        '''The md5sum for this file; hashed from the file itself on first
        access when none was supplied to the constructor'''
        if self._md5sum is None:
            self._md5sum = encode.hashFile(self.fullname)
        return self._md5sum

    @property
    def extension(self):
        '''The filetype (compression suffixes ignored), or None'''
        return self._extension

    @property
    def size(self):
        '''The size in bytes'''
        return self._size

    @property
    def metaObject(self):
        '''The metadata object passed in for this file, or None'''
        return self._metaObj


class Release(object):
    '''
    Keeps track of a single release, stored within the track.
    '''

    def __init__(self, index, status, files):
        '''
        index:  which release this is.
        status: comma-separated list of the places the release is on
                ('alpha', 'beta', 'public'); blank means all of them.
        files:  the files belonging to this release (may be None).
        '''
        self._files = files
        self._index = index
        if status.strip() == '':
            self._alpha = self._beta = self._public = True
        else:
            places = status.split(',')
            self._alpha = 'alpha' in places
            self._beta = 'beta' in places
            self._public = 'public' in places

    @property
    def index(self):
        '''Which release, represented as an int starting with 1'''
        return self._index

    @property
    def onAlpha(self):
        '''Whether this release is on alpha'''
        return self._alpha

    @property
    def onBeta(self):
        '''Whether this release is on beta'''
        return self._beta

    @property
    def onPublic(self):
        '''Whether this release is on public'''
        return self._public

    @property
    def files(self):
        '''A dictionary of TrackFiles belonging to this release where the filename is the key'''
        return self._files


class CompositeTrack(object):
    '''
    Stores an entire track, consisting mainly of its metadata and files.

    To make a CompositeTrack, you must specify database and name of the track:
        sometrack = CompositeTrack('hg19', 'wgEncodeCshlLongRnaSeq')

    You can also specify a trackDb path in the event that yours is different
    from the default, '~/kent/src/hg/makeDb/trackDb/':
        sometrack = CompositeTrack('hg19', 'wgEncode...', '/weird/path')

    It's important to know that the CompositeTrack does NOT load all of its
    information up front. Therefore, there's no performance hit for using a
    CompositeTrack instead of just specifying a RaFile. In fact, it's
    beneficial, since it adds another layer of abstraction to your code. You
    can access a composite's ra files:
        somemetadata = sometrack.alphaMetaDb

    For more information on what you can do with ra files, check the ra.py
    documentation.

    You can also access a track's files. This is one of the more useful parts
    of the composite track:
        for trackfile in sometrack.files.values():
            print('%s %s' % (trackfile.name, trackfile.size))

    Each file is an instance of a TrackFile object, which is detailed in its
    own documentation above. There are also dictionaries of these files for
    each release associated with the track:
        for filename in sometrack.releases[0]:
            print(filename in sometrack.releases[1])

    Note that the files are indexed by their filename. This means that you can
    easily compare multiple releases as in the above example.
    '''

    def __init__(self, database, compositeName, trackPath=None, mdbCompositeName=None):
        '''
        database:         assembly name; must be a key of encode.organisms.
        compositeName:    name of the composite track.
        trackPath:        root of the trackDb tree (default
                          '~/kent/src/hg/makeDb/trackDb/').
        mdbCompositeName: name of the metaDb ra file when it differs from
                          compositeName.

        Raises KeyError for an unknown database or a missing trackDb ra file.
        '''
        if mdbCompositeName is None:
            mdbCompositeName = compositeName

        self._trackPath = _resolveTrackPath(trackPath)
        self._organism = _lookupOrganism(database)

        self._trackDbDir = self._trackPath + self._organism + '/' + database + '/'

        metaDbDir = self._trackDbDir + 'metaDb/'
        self._alphaMdbDir = metaDbDir + 'alpha/'
        self._betaMdbDir = metaDbDir + 'beta/'
        self._publicMdbDir = metaDbDir + 'public/'
        self._alphaMdbPath = self._alphaMdbDir + mdbCompositeName + '.ra'
        self._betaMdbPath = self._betaMdbDir + mdbCompositeName + '.ra'
        self._publicMdbPath = self._publicMdbDir + mdbCompositeName + '.ra'

        self._downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + compositeName + '/'
        self._httpDownloadsPath = '/usr/local/apache/htdocs-hgdownload/goldenPath/' + database + '/encodeDCC/' + compositeName + '/'
        self._rrHttpDir = '/usr/local/apache/htdocs/goldenPath/' + database + '/encodeDCC/' + compositeName + '/'
        self._notesDirectory = os.path.expanduser("~/kent/src/hg/makeDb/doc/encodeDcc%s" % database.capitalize()) + '/'
        self._url = 'http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=' + database + '&g=' + compositeName
        self._database = database
        self._name = compositeName
        self._md5path = self._downloadsDirectory + 'md5sum.txt'

        # Prefer the ra file that trackDb.wgEncode.ra lists for alpha; fall
        # back to <compositeName>.ra in the trackDb directory.
        self._trackDbPath = self.currentTrackDb
        if self._trackDbPath is None:
            self._trackDbPath = self._trackDbDir + compositeName + '.ra'
        if not os.path.isfile(self._trackDbPath):
            raise KeyError(self._trackDbPath + ' does not exist')

    def _ensureQaDir(self, root):
        '''Create (if needed), remember and return the QA directory under root.'''
        qaDir = root + self._database + '/' + self._name + '/'
        if not (os.path.exists(qaDir) and os.path.isdir(qaDir)):
            os.makedirs(qaDir)
        self._qaDir = qaDir
        return qaDir

    def _loadMetaDb(self, cacheName, path):
        '''Return the MdbFile at path, cached on self under cacheName.

        Raises KeyError if path is not a file.'''
        try:
            return getattr(self, cacheName)
        except AttributeError:
            pass  # not loaded yet
        if not os.path.isfile(path):
            raise KeyError(path + ' does not exist')
        metaDb = mdb.MdbFile(path)
        setattr(self, cacheName, metaDb)
        return metaDb

    @property
    def database(self):
        '''The database for this composite, typically hg19 for humans'''
        return self._database

    @property
    def name(self):
        '''The composite name'''
        return self._name

    @property
    def downloadsDirectory(self):
        '''The location of files in downloads; raises KeyError if it does not exist'''
        if not os.path.isdir(self._downloadsDirectory):
            raise KeyError(self._downloadsDirectory + ' does not exist')
        return self._downloadsDirectory

    @property
    def httpDownloadsPath(self):
        '''The location of the downloadable files path in apache form; raises
        KeyError if it does not exist'''
        if not os.path.isdir(self._httpDownloadsPath):
            raise KeyError(self._httpDownloadsPath + ' does not exist')
        return self._httpDownloadsPath

    @property
    def files(self):
        '''A dictionary of TrackFiles for all files in the downloads directory
        of this composite, keyed by filename. Built on first access.'''
        try:
            return self._files
        except AttributeError:
            pass  # not loaded yet

        # The releases property guards against readMd5sums returning None,
        # so do the same here instead of failing on the membership test.
        md5sums = encode.readMd5sums(self._md5path)
        if md5sums is None:
            md5sums = dict()

        # Map each filename named in the alpha metaDb to its stanza.
        stanzaByFile = dict()
        for stanza in self.alphaMetaDb.itervalues():
            if 'fileName' in stanza:
                for filename in stanza['fileName'].split(','):
                    stanzaByFile[filename] = stanza

        downloads = self.downloadsDirectory
        found = dict()
        for filename in os.listdir(downloads):
            fullname = downloads + filename
            if not os.path.isfile(fullname):
                continue
            md5 = md5sums[filename] if filename in md5sums else None
            found[filename] = TrackFile(fullname, md5, stanzaByFile.get(filename))

        # Cache only once fully built so that a failure part-way through
        # does not leave a partial dictionary behind.
        self._files = found
        return self._files

    @property
    def qaInitDir(self):
        '''The QA directory for this composite, created if it does not exist'''
        return self._ensureQaDir('/hive/groups/encode/encodeQa/')

    @property
    def qaInitDirTest(self):
        '''The test QA directory for this composite, created if it does not exist'''
        return self._ensureQaDir('/hive/groups/encode/encodeQa/test/')

    @property
    def releaseObjects(self):
        '''A list of Release objects describing each release, in release
        order, as listed in trackDb.wgEncode.ra. Empty when the composite is
        not listed there. Built on first access.'''
        try:
            return self._releaseObjects
        except AttributeError:
            pass  # not loaded yet

        maxcomposite = 0
        statuses = dict()
        with open(self._trackDbDir + 'trackDb.wgEncode.ra') as trackDbFile:
            for line in trackDbFile:
                if line.startswith('#') or line.strip() == '':
                    continue
                parts = line.split()
                if len(parts) < 2:
                    continue  # no ra file named on this line
                composite = parts[1]
                places = parts[2] if len(parts) > 2 else ''
                # NOTE(review): this is a prefix match, so a composite whose
                # name merely starts with ours is picked up too -- confirm
                # whether that is intended.
                if not composite.startswith(self.name):
                    continue
                compositeparts = composite.split('.')
                if len(compositeparts) >= 2 and compositeparts[1].startswith('release'):
                    index = int(compositeparts[1].replace('release', ''))
                else:  # THINK MORE ABOUT THIS REGION RE: PATCHES
                    index = 1
                statuses[index] = places
                maxcomposite = max(maxcomposite, index)

        releaseObjects = list()
        if maxcomposite > 0:
            # Releases missing from the file inherit the status of the next
            # higher release that is listed.
            lastplace = statuses[maxcomposite]
            for i in range(maxcomposite, 0, -1):
                if i not in statuses:
                    statuses[i] = lastplace
                else:
                    lastplace = statuses[i]

            # Per-release files are not attached here; see releases for them.
            for i in range(1, maxcomposite + 1):
                releaseObjects.append(Release(i, statuses[i], None))

        self._releaseObjects = releaseObjects
        return self._releaseObjects

    @property
    def releases(self):
        '''A list with one dictionary per release directory (release1,
        release2, ...) of this composite, mapping filename to TrackFile.
        Files inside "supplemental" subdirectories are keyed as
        "<subdir>/<filename>". Built on first access.'''
        try:
            return self._releaseFiles
        except AttributeError:
            pass  # not loaded yet

        downloads = self.downloadsDirectory
        allReleases = list()
        count = 1
        while os.path.exists(downloads + 'release' + str(count)):
            releasepath = downloads + 'release' + str(count) + '/'
            md5s = encode.readMd5sums(releasepath + 'md5sum.txt')
            releasefiles = dict()

            for entry in os.listdir(releasepath):
                entrypath = releasepath + entry
                if os.path.isdir(entrypath):
                    # Only supplemental directories are descended into.
                    if not re.match('.*supplemental.*', entrypath):
                        continue
                    for innerfile in os.listdir(entrypath):
                        pathfile = entry + "/" + innerfile
                        releasefiles[pathfile] = TrackFile(releasepath + pathfile, None)
                    continue

                md5 = None
                if entry != 'md5sum.txt' and md5s is not None and entry in md5s:
                    md5 = md5s[entry]
                releasefiles[entry] = TrackFile(entrypath, md5)

            allReleases.append(releasefiles)
            count = count + 1

        self._releaseFiles = allReleases
        return self._releaseFiles

    @property
    def alphaMetaDb(self):
        '''The Ra file in the alpha metaDb for this composite'''
        return self._loadMetaDb('_alphaMetaDb', self._alphaMdbPath)

    @property
    def betaMetaDb(self):
        '''The Ra file in the beta metaDb for this composite'''
        return self._loadMetaDb('_betaMetaDb', self._betaMdbPath)

    @property
    def publicMetaDb(self):
        '''The Ra file in the public metaDb for this composite'''
        return self._loadMetaDb('_publicMetaDb', self._publicMdbPath)

    @property
    def trackDb(self):
        '''The Ra file in the trackDb for this composite'''
        try:
            return self._trackDb
        except AttributeError:
            self._trackDb = ra.RaFile(self._trackDbPath)
            return self._trackDb

    @property
    def trackPath(self):
        '''The track path for this composite'''
        return self._trackPath

    @property
    def url(self):
        '''The url on our site for this composite'''
        return self._url

    @property
    def organism(self):
        '''The organism for this composite's database'''
        return self._organism

    @property
    def currentTrackDb(self):
        '''The path of the ra file that trackDb.wgEncode.ra lists for this
        composite on alpha, or None if there is no such line.'''
        trackDb = self._trackDbDir + "trackDb.wgEncode.ra"
        # The composite name is matched literally, so escape it.
        pattern = re.compile(r".*(%s\S+) ?(\S+)" % re.escape(self._name))
        with open(trackDb, "r") as trackDbFile:
            for line in trackDbFile:
                if re.match(r"^\s*#.*", line):
                    continue
                m = pattern.match(line)
                if m and re.search('alpha', m.group(2)):
                    return "%s%s" % (self._trackDbDir, m.group(1))
        return None


class TrackCollection(dict):
    '''
    A collection that stores all the tracks for a given database, indexed by
    its metaDb name.
    '''

    def __init__(self, database, trackPath=None):
        '''
        database:  assembly name; must be a key of encode.organisms.
        trackPath: root of the trackDb tree (default
                   '~/kent/src/hg/makeDb/trackDb/').

        Adds a CompositeTrack for every .ra file in the alpha metaDb
        directory that has a same-named ra file in the trackDb directory.
        Raises KeyError for an unknown database.
        '''
        dict.__init__(self)

        self._database = database
        self._organism = _lookupOrganism(database)
        self._trackPath = _resolveTrackPath(trackPath)

        trackDbDir = self._trackPath + self._organism + '/' + self._database + '/'
        metaDb = trackDbDir + 'metaDb/alpha/'

        for filename in os.listdir(metaDb):
            if not (os.path.isfile(metaDb + filename) and filename.endswith('.ra')):
                continue
            # Drop only the trailing '.ra', not every occurrence of it.
            trackname = filename[:-len('.ra')]
            if os.path.isfile(trackDbDir + filename):
                self[trackname] = CompositeTrack(self._database, trackname, self._trackPath)

    @property
    def database(self):
        '''The database this collection was built for'''
        return self._database

    @property
    def organism(self):
        '''The organism for this collection's database'''
        return self._organism