3aee104cf4c1e245dd020f743fbc58c17fd75976 mmaddren Mon Apr 9 12:12:44 2012 -0700 added encode.py to store global constants and other encode stuff, and made all other libraries interface correctly with it diff --git python/lib/ucscgenomics/track.py python/lib/ucscgenomics/track.py index 8a79340..bc45749 100644 --- python/lib/ucscgenomics/track.py +++ python/lib/ucscgenomics/track.py @@ -1,74 +1,47 @@ -import os, re, hashlib -from ucscgenomics import ra, mdb - -organisms = { - 'hg19': 'human', - 'hg18': 'human', - 'mm9': 'mouse', - 'encodeTest': 'human' -} - -def readMd5sums(filename): - if os.path.isfile(filename): - md5sums = dict() - md5file = open(filename, 'r') - for line in md5file: - key, val = map(str.strip, line.split(' ', 1)) - md5sums[key] = val - return md5sums - else: - return None - - -def hashfile(filename, hasher=hashlib.md5(), blocksize=65536): - afile = open(filename, 'rb') - buf = afile.read(blocksize) - while len(buf) > 0: - hasher.update(buf) - buf = afile.read(blocksize) - return hasher.hexdigest() +import os, re +from ucscgenomics import ra, mdb, encode class TrackFile(object): ''' A file in the trackDb, which has useful information about iself. CompositeTrack (below) has multiple dictionaries of TrackFiles, one for the root downloads directory, and one for each release. The root directory will link itself to the CompositeTrack's alpha metadata. ''' @property def name(self): '''The file's name''' return self._name @property def fullname(self): '''The file's full name including path''' return self._path + self._name @property def path(self): '''The file's path''' return self._path @property def md5sum(self): '''The md5sum for this file, stored in the md5sum.txt file in the downloads directory''' if self._md5sum == None: - self._md5sum = hashfile(self.fullname) + self._md5sum = encode.hashFile(self.fullname) return self._md5sum @property def extension(self): '''The filetype''' return self._extension @property def size(self): '''The size in bytes''' return self._size @property def metaObject(self): '''The size in bytes''' @@ -80,30 +53,54 @@ raise KeyError('invalid file: %s' % fullname) self._path, self._name = fullname.rsplit('/', 1) self._path = self._path + '/' self._fullname = fullname self._size = os.stat(fullname).st_size self._md5sum = md5 self._metaObj = metaObj self._extension = self._name self._extension.replace('.gz', '').replace('.tgz', '') if '.' in self._extension: self._extension = self._extension.rsplit('.')[1] else: self._extension = None +class Release(object): + ''' + Keeps track of a single release, stored within the track. + ''' + + @property + def index(self): + '''Which release, represented as an int starting with 1''' + return self._index + + # @property + # def status(self): + # '''A string representing the status of this release: alpha, beta, or public''' + # return self._status + + @property + def files(self): + '''A dictionary of TrackFiles where the filename is the key''' + return self._files + + def __init__(self, index, status, files): + self._files = files + self._index = index + self._status = status.split() class CompositeTrack(object): ''' Stores an entire track, consisting mainly of its metadata and files. To make a CompositeTrack, you must specify database and name of the track: sometrack = CompositeTrack('hg19', 'wgEncodeCshlLongRnaSeq') You can also specify a trackDb path in the event that yours is different from the default, '~/kent/src/hg/makeDb/trackDb/': sometrack = CompositeTrack('hg19', 'wgEncode...', '/weird/path') It's important to know that the CompositeTrack does NOT load all of its information up front. Therefore, there's no performance hit for using a CompositeTrack instead of just specifying a RaFile. In fact, it's @@ -147,31 +144,31 @@ return self._downloadsDirectory @property def httpDownloadsPath(self): '''The location of the downloadable files path in apache form''' if not os.path.isdir(self._httpDownloadsPath): raise KeyError(self._httpDownloadsPath + ' does not exist') return self._httpDownloadsPath @property def files(self): '''A list of all files in the downloads directory of this composite''' try: return self._files except AttributeError: - md5sums = readMd5sums(self._md5path) + md5sums = encode.readMd5sums(self._md5path) radict = dict() for stanza in self.alphaMetaDb.itervalues(): if 'fileName' in stanza: for file in stanza['fileName'].split(','): radict[file] = stanza self._files = dict() for file in os.listdir(self.downloadsDirectory): if os.path.isfile(self.downloadsDirectory + file): stanza = None if file in radict: stanza = radict[file] @@ -190,41 +187,95 @@ else: os.makedirs(qaDir) self._qaDir = qaDir return qaDir @property def qaInitDirTest(self): qaDir = '/hive/groups/encode/encodeQa/test/' + self._database + '/' + self._name + '/' if os.path.exists(qaDir) and os.path.isdir(qaDir): pass else: os.makedirs(qaDir) self._qaDir = qaDir return qaDir @property + def releaseObjects(self): + '''A set of release objects describing each release''' + + try: + return self._releaseObjects + except AttributeError: + self._releaseObjects = list() + count = 1 + + omit = ['README.txt', 'md5sum.txt', 'md5sum.history', 'files.txt'] + + maxcomposite = 0 + statuses = dict() + for line in open(self._trackDbDir): + parts = line.split() + composite = parts[1] + places = '' + if len(parts) > 2: + places = parts[2] + if composite.startswith(self.name): + compositeparts = composite.split('.') + if len(compositeparts) >= 2 and compositeparts[1].startswith('release'): + index = int(compositeparts[1].replace('release', '')) + statuses[index] = places + maxcomposite = max(maxcomposite, index) + else: # THINK MORE ABOUT THIS REGION RE: PATCHES + statuses[1] = places + maxcomposite = max(maxcomposite, 1) + + lastplace = statuses[maxcomposite] + for i in range(maxcomposite, 0, -1): + if i not in statuses: + statuses[i] = lastplace + else: + lastplace = statuses[i] + + while(1): + releasepath = self.downloadsDirectory + ('release%d' % count) + '/' + + if not os.path.exists(releasepath): + break + + md5s = encode.readMd5sums(releasepath + 'md5sum.txt') + releasefiles = dict() + + for file in os.listdir(releasepath): + if os.path.isfile(releasepath + file) and file not in omit: + if md5s != None and file in md5s: + releasefiles[file] = TrackFile(releasepath + file, md5s[file]) + else: + releasefiles[file] = TrackFile(releasepath + file, None) + + self._releaseObjects.append(Release(count, statuses[count], releasefiles)) + @property def releases(self): '''A list of all files in the release directory of this composite''' try: return self._releaseFiles except AttributeError: self._releaseFiles = list() count = 1 while os.path.exists(self.downloadsDirectory + 'release' + str(count)): releasepath = self.downloadsDirectory + 'release' + str(count) + '/' - md5s = readMd5sums(releasepath + 'md5sum.txt') + md5s = encode.readMd5sums(releasepath + 'md5sum.txt') releasefiles = dict() for file in os.listdir(releasepath): if file != 'md5sum.txt' and md5s != None and file in md5s and not os.path.isdir(releasepath + file): releasefiles[file] = TrackFile(releasepath + file, md5s[file]) elif not os.path.isdir(releasepath + file): releasefiles[file] = TrackFile(releasepath + file, None) elif os.path.isdir(releasepath + file): if not re.match('.*supplemental.*', releasepath + file): continue for innerfile in os.listdir(releasepath + file): pathfile = file + "/" + innerfile releasefiles[pathfile] = TrackFile(releasepath + pathfile, None) #releasefiles.sort() self._releaseFiles.append(releasefiles) @@ -305,32 +356,32 @@ return None def __init__(self, database, compositeName, trackPath=None, mdbCompositeName=None): if mdbCompositeName == None: mdbCompositeName = compositeName if trackPath == None: self._trackPath = os.path.expanduser('~/kent/src/hg/makeDb/trackDb/') else: self._trackPath = trackPath if not self._trackPath.endswith('/'): self._trackPath = self._trackPath + '/' - if database in organisms: - self._organism = organisms[database] + if database in encode.organisms: + self._organism = encode.organisms[database] else: raise KeyError(database + ' is not a valid database') #self._trackDbPath = self._trackPath + self._organism + '/' + database + '/' + compositeName + '.ra' self._trackDbDir = self._trackPath + self._organism + '/' + database + '/' self._alphaMdbPath = self._trackPath + self._organism + '/' + database + '/metaDb/alpha/' + mdbCompositeName + '.ra' self._betaMdbPath = self._trackPath + self._organism + '/' + database + '/metaDb/beta/' + mdbCompositeName + '.ra' self._publicMdbPath = self._trackPath + self._organism + '/' + database + '/metaDb/public/' + mdbCompositeName + '.ra' self._alphaMdbDir = self._trackPath + self._organism + '/' + database + '/metaDb/alpha/' self._betaMdbDir = self._trackPath + self._organism + '/' + database + '/metaDb/beta/' self._publicMdbDir = self._trackPath + self._organism + '/' + database + '/metaDb/public/' self._downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + compositeName + '/' self._httpDownloadsPath = '/usr/local/apache/htdocs-hgdownload/goldenPath/' + database + '/encodeDCC/' + compositeName + '/' self._rrHttpDir = '/usr/local/apache/htdocs/goldenPath/' + database + '/encodeDCC/' + compositeName + '/' @@ -353,32 +404,32 @@ ''' @property def database(self): return self._database @property def organism(self): return self._organism def __init__(self, database, trackPath=None): dict.__init__(self) self._database = database - if database in organisms: - self._organism = organisms[database] + if database in encode.organisms: + self._organism = encode.organisms[database] else: raise KeyError(database + ' is not a valid database') if trackPath == None: self._trackPath = os.path.expanduser('~/kent/src/hg/makeDb/trackDb/') else: self._trackPath = trackPath if not self._trackPath.endswith('/'): self._trackPath = self._trackPath + '/' metaDb = self._trackPath + self._organism + '/' + self._database + '/metaDb/alpha/' for file in os.listdir(metaDb): if os.path.isfile(metaDb + file) and file.endswith('.ra'): trackname = file.replace('.ra', '')