python/lib/ucscgenomics/mdb.py 0c719913e393d101a7039f3de6008baf5c6896e1

0c719913e393d101a7039f3de6008baf5c6896e1
mmaddren
  Mon Jan 30 10:52:08 2012 -0800
added geo library and metaDb ra file library
diff --git python/lib/ucscgenomics/mdb.py python/lib/ucscgenomics/mdb.py
new file mode 100644
index 0000000..b39400a
--- /dev/null
+++ python/lib/ucscgenomics/mdb.py
@@ -0,0 +1,194 @@
+from ucscgenomics import ra

+
+class DataType(object):
+    '''
+    Describes one data type: its name plus the molecule, strategy, source,
+    selection and type strings recorded for it.
+
+    Two placeholder conventions are understood by this class:
+      * the string 'REPLACE' in molecule, strategy, source or selection, or
+        a type of None, means the entry is not filled in yet (see valid);
+      * a type of 'NotGeo' means the data type is not to be submitted
+        (see shouldSubmit).
+    '''
+
+    def __init__(self, name, molecule, strategy, source, selection, type):
+        # 'type' shadows the builtin, but it is part of the public signature
+        # and attribute set, so it is kept as is.
+        self.name = name
+        self.molecule = molecule
+        self.strategy = strategy
+        self.source = source
+        self.selection = selection
+        self.type = type
+
+    @property
+    def valid(self):
+        '''True when no field is still 'REPLACE' and type is not None.'''
+        fields = (self.molecule, self.strategy, self.source, self.selection)
+        return 'REPLACE' not in fields and self.type is not None
+
+    @property
+    def shouldSubmit(self):
+        '''False only when type is 'NotGeo'.'''
+        return self.type != 'NotGeo'
+    
+# Registry of DataType objects keyed by data type name.
+#
+# Each row below lists the DataType constructor arguments in order:
+#   (name, molecule, strategy, source, selection, type)
+# and the row's name doubles as its key in the resulting dictionary.
+dataTypes = dict((row[0], DataType(*row)) for row in (
+    ('Cage', 'OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', 'HighThroughput'),
+    ('ChipSeq', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'),
+    ('DnaPet', 'genomic DNA', 'OTHER', 'genomic', 'size fractionation', 'HighThroughput'),
+    ('DnaseDgf', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'),
+    ('DnaseSeq', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'),
+    ('FaireSeq', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'),
+    ('MethylSeq', 'genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', 'HighThroughput'),
+    ('MethylRrbs', 'genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', 'HighThroughput'),
+    ('Orchid', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'),
+    ('Proteogenomics', 'protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', 'HighThroughput'),
+    ('RnaPet', 'OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', 'HighThroughput'),
+    ('RnaSeq', 'OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', 'HighThroughput'),
+
+    #these need to be curated: 'REPLACE' fields and None types are
+    #placeholders that have not been filled in yet
+    ('5C', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('AffyExonArray', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'MicroArray'),
+    ('Bip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
+    ('Cluster', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('Cnv', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('Combined', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('Genotype', 'genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('Gencode', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
+    ('ChiaPet', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('Mapability', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
+    ('MethylArray', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('NRE', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
+    ('Nucleosome', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('RnaChip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('RipGeneSt', 'OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'), #this isn't correct
+    ('RipTiling', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('RipChip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('RipSeq', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    ('Switchgear', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
+    ('TfbsValid', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
+))
+
+# GP ids keyed by '<organism> <source>'. The <source> half of each key is
+# meant to be compared against the source of a DataType ('genomic',
+# 'transcriptomic' or 'protein'), so the id chosen depends on the data type.
+# NOTE(review): the expansion of "GP" and the consumer of these ids are not
+# visible in this file -- confirm before changing any value.
+gpIds = {
+    'human genomic': '63443',
+    'human transcriptomic': '30709',
+    'human protein': '63447',
+    
+    'mouse genomic': '63471',
+    'mouse transcriptomic': '66167',
+    'mouse protein': '63475'
+}
+

+class MdbFile(ra.RaFile):
+    '''
+    An ra file from the metaDb.
+
+    Use this class for every metaDb ra file: it extends RaFile with
+    functionality specific to them, namely the composite stanza, its
+    experimental variables, the stanzas grouped by experiment ID and the data
+    type shared by those experiments.
+
+    Each property is computed on first access and cached on the instance. The
+    cache attribute is only assigned once the computation has succeeded, so a
+    failed access raises again on the next access instead of returning a
+    half-built value.
+    '''
+
+    def __init__(self, filepath):
+        '''Create the file object and read the ra file at filepath.'''
+        ra.RaFile.__init__(self)
+        self.read(filepath)
+
+    @property
+    def expVars(self):
+        '''the experimental variables used in this track'''
+        try:
+            return self._expVars
+        except AttributeError:
+            pass
+        # expVars is stored as one comma-separated string on the composite.
+        self._expVars = self.compositeStanza['expVars'].split(',')
+        return self._expVars
+
+    @property
+    def dataType(self):
+        '''The data type of the experiment. 'None' if inconsistent.'''
+        try:
+            return self._dataType
+        except AttributeError:
+            pass
+        shared = None
+        for exp in self.experiments.values():
+            expType = exp.dataType
+            # One experiment without a data type, or two experiments that
+            # disagree, makes the whole file inconsistent.
+            if expType is None or (shared is not None and shared != expType):
+                shared = None
+                break
+            shared = expType
+        self._dataType = shared
+        return self._dataType
+
+    @property
+    def compositeStanza(self):
+        '''
+        the stanza (typically first in file) describing the composite
+
+        Raises KeyError unless exactly one stanza has objType 'composite'.
+        '''
+        try:
+            return self._compositeStanza
+        except AttributeError:
+            pass
+        composites = self.filter(lambda s: s['objType'] == 'composite', lambda s: s)
+        if len(composites) != 1:
+            raise KeyError('expected exactly one composite stanza, found %d'
+                           % len(composites))
+        self._compositeStanza = composites[0]
+        return self._compositeStanza
+
+    @property
+    def experiments(self):
+        '''dictionary of MdbExp objects indexed by the expId'''
+        try:
+            return self._experiments
+        except AttributeError:
+            pass
+        pairs = self.filter(lambda s: s['objType'] != 'composite', lambda s: (s['expId'], s))
+        # Group the non-composite stanzas by their experiment ID.
+        grouped = dict()
+        for expId, stanza in pairs:
+            grouped.setdefault(expId, []).append(stanza)
+        experiments = dict()
+        for expId in grouped:
+            experiments[expId] = MdbExp(expId, self, grouped[expId])
+        self._experiments = experiments
+        return self._experiments
+
+    def readStanza(self, stanza, key=None):
+        '''
+        Read one stanza into a new MdbStanza owned by this file.
+
+        Returns (val1, val2, entry), where val1 and val2 are the pair returned
+        by MdbStanza.readStanza and entry is the new stanza, or
+        (None, None, None) when MdbStanza.readStanza returns None.
+        '''
+        entry = MdbStanza(self)
+        # Parse exactly once and reuse the result; parsing a second time into
+        # the same entry repeats the work for no benefit.
+        parsed = entry.readStanza(stanza, key)
+        if parsed is None:
+            return None, None, None
+        val1, val2 = parsed
+        return val1, val2, entry
+
+class MdbStanza(ra.RaStanza):
+    '''
+    A stanza that remembers the object it was created for, so that it can
+    build a title from that object's experimental variables.
+    '''
+
+    def __init__(self, parent):
+        '''parent must expose expVars (MdbFile.readStanza passes the file).'''
+        ra.RaStanza.__init__(self)
+        self._parent = parent
+
+    @property
+    def title(self):
+        '''
+        The expVars catted together, making the title used for GEO
+
+        The value of the first expVar (with every '-m' removed) comes first,
+        followed by '_' and the value of each later expVar that is present in
+        this stanza and is not the string 'None'. The title is None when the
+        first expVar is missing from this stanza. Computed once, then cached.
+        '''
+        try:
+            return self._title
+        except AttributeError:
+            pass
+        expVars = self._parent.expVars
+        if not expVars or expVars[0] not in self:
+            # Without the leading variable there is nothing to append to;
+            # appending to None used to raise TypeError here.
+            title = None
+        else:
+            parts = [self[expVars[0]].replace('-m', '')]
+            for expVar in expVars[1:]:
+                if expVar in self and self[expVar] != 'None':
+                    parts.append(self[expVar])
+            title = '_'.join(parts)
+        # Cache only the finished value.
+        self._title = title
+        return self._title
+        
+        
+class MdbExp(list):
+    '''
+    Describes a single experiment ID, which has a collection of its stanzas as
+    well as some additional data that should typically be consistent across all
+    the stanzas, as well as verifying that the data is in fact consistent.
+
+    The object is itself the list of its stanzas.
+    '''
+
+    def __init__(self, id, parent, stanzas):
+        '''
+        id is the experiment ID, parent the object that owns the experiment
+        (MdbFile.experiments passes the file) and stanzas the stanzas that
+        carry this experiment ID.
+        '''
+        # 'id' shadows the builtin but is part of the public signature.
+        list.__init__(self)
+        self.extend(stanzas)
+        self._id = id
+        self._parent = parent
+
+    @property
+    def name(self):
+        '''The experiment ID this object was created with.'''
+        return self._id
+
+    @property
+    def dataType(self):
+        '''
+        The data type of the experiment. 'None' if inconsistent.
+
+        Stanzas without a 'dataType' key are ignored; the result is also None
+        when no stanza has one. Raises KeyError when a stanza names a data
+        type that is not in dataTypes. Computed once, then cached.
+        '''
+        try:
+            return self._dataType
+        except AttributeError:
+            pass
+        found = None
+        for stanza in self:
+            if 'dataType' not in stanza:
+                continue
+            typeName = stanza['dataType']
+            if found is None:
+                found = dataTypes[typeName]
+            elif found.name != typeName:
+                found = None
+                break
+        # Assigned only after the lookups succeeded, so an unknown data type
+        # keeps raising instead of being remembered as None.
+        self._dataType = found
+        return self._dataType