e7f828d1d1d15fe187a2cb9a241dee10874af38f mmaddren Mon Feb 6 13:35:15 2012 -0800 cvValidate updated for new cv spec diff --git python/lib/ucscgenomics/mdb.py python/lib/ucscgenomics/mdb.py index b39400a..9ab868c 100644 --- python/lib/ucscgenomics/mdb.py +++ python/lib/ucscgenomics/mdb.py @@ -7,64 +7,73 @@ self.molecule = molecule self.strategy = strategy self.source = source self.selection = selection self.type = type @property def valid(self): return self.molecule != 'REPLACE' and self.strategy != 'REPLACE' and self.source != 'REPLACE' and self.selection != 'REPLACE' and self.type != None @property def shouldSubmit(self): return self.type != 'NotGeo' dataTypes = { - 'Cage': DataType('Cage', 'OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', 'HighThroughput'), + 'Cage': DataType( 'Cage', 'RNA', 'OTHER', 'transcriptomic', 'CAGE', 'HighThroughput'), 'ChipSeq': DataType('ChipSeq', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'), 'DnaPet': DataType('DnaPet', 'genomic DNA', 'OTHER', 'genomic', 'size fractionation', 'HighThroughput'), 'DnaseDgf': DataType('DnaseDgf', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'), 'DnaseSeq': DataType('DnaseSeq', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'), 'FaireSeq': DataType('FaireSeq', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'), 'MethylSeq': DataType('MethylSeq', 'genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', 'HighThroughput'), 'MethylRrbs': DataType('MethylRrbs', 'genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', 'HighThroughput'), 'Orchid': DataType('Orchid', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'), 'Proteogenomics': DataType('Proteogenomics', 'protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', 'HighThroughput'), - 'RnaPet': DataType('RnaPet', 'OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', 'HighThroughput'), - 'RnaSeq': DataType('RnaSeq', 'OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', 'HighThroughput'), + 'RnaPet': DataType( 'RnaPet', 'RNA', 'OTHER', 'transcriptomic', 'other', 'HighThroughput'), + 'RnaSeq': DataType( 'RnaSeq', 'RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', 'HighThroughput'), - #these need to be curated - '5C': DataType('5C', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'AffyExonArray': DataType('AffyExonArray', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'MicroArray'), + #doublecheck + 'ChiaPet': DataType( 'ChiaPet', 'genomic DNA', 'ChIP-Seq followed by ligation', 'genomic', 'other', 'HighThroughput'), + 'Nucleosome': DataType( 'Nucleosome', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'), + 'RipSeq': DataType( 'RipSeq', 'RNA', 'OTHER', 'transcriptomic', 'RNA binding protein antibody', 'HighThroughput'), + #for ripseq, ask geo about new 'ripseq' + + #not geo stuff + '5C': DataType('5C', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), 'Bip': DataType('Bip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), - 'Cluster': DataType('Cluster', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Cnv': DataType('Cnv', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Combined': DataType('Combined', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'Genotype': DataType('Genotype', 'genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Gencode': DataType('Gencode', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), - 'ChiaPet': DataType('ChiaPet', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Mapability': DataType('Mapability', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), - 'MethylArray': DataType('MethylArray', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('NRE', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), - 'Nucleosome': DataType('Nucleosome', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RnaChip': DataType('RnaChip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipGeneSt': DataType('RipGeneSt', 'OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'), #this isn't correct - 'RipTiling': DataType('RipTiling', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipChip': DataType('RipChip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), - 'RipSeq': DataType('RipSeq', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Switchgear': DataType('Switchgear', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), - 'TfbsValid': DataType('TfbsValid', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo') + 'TfbsValid': DataType('TfbsValid', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), + 'Cluster': DataType('Cluster', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'), + + #array + 'AffyExonArray': DataType( 'AffyExonArray', 'mRNA', 'RNA-Microarray', 'transcriptomic', 'polyA', 'MicroArray'), + 'MethylArray': DataType( 'MethylArray', 'genomic DNA', 'REPLACE', 'genomic', 'REPLACE', 'MicroArray'), + 'RipGeneSt': DataType( 'RipGeneSt', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'), #this isn't correct + 'RipTiling': DataType( 'RipTiling', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'), + + #these need to be curated + 'Cnv': DataType( 'Cnv', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Combined': DataType( 'Combined', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Genotype': DataType( 'Genotype', 'genomic DNA', 'REPLACE', 'genomic', 'REPLACE', None), + 'RnaChip': DataType( 'RnaChip', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', None), + 'RipChip': DataType( 'RipChip', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', None) + + } #compare this to the source in datatype, give GP ids depending on the type gpIds = { 'human genomic': '63443', 'human transcriptomic': '30709', 'human protein': '63447', 'mouse genomic': '63471', 'mouse transcriptomic': '66167', 'mouse protein': '63475' } class MdbFile(ra.RaFile): ''' @@ -78,34 +87,37 @@ try: return self._expVars except AttributeError: self._expVars = self.compositeStanza['expVars'].split(',') return self._expVars @property def dataType(self): '''The data type of the experiment. 'None' if inconsistent.''' try: return self._dataType except AttributeError: self._dataType = None for e in self.experiments.itervalues(): if self._dataType == None and e.dataType != None: + print e.dataType self._dataType = e.dataType elif self._dataType != e.dataType or e.dataType == None: + print 'multiple data types!' self._dataType = None break + print 'still none' return self._dataType @property def compositeStanza(self): '''the stanza (typically first in file) describing the composite''' try: return self._compositeStanza except AttributeError: self._compositeStanza = self.filter(lambda s: s['objType'] == 'composite', lambda s: s) if len(self._compositeStanza) != 1: raise KeyError else: self._compositeStanza = self._compositeStanza[0] return self._compositeStanza @@ -169,26 +181,30 @@ @property def name(self): return self._id @property def dataType(self): '''The data type of the experiment. 'None' if inconsistent.''' try: return self._dataType except AttributeError: self._dataType = None for s in self: if 'dataType' in s: if self._dataType == None: + print dataTypes[s['dataType']] self._dataType = dataTypes[s['dataType']] elif self._dataType.name != s['dataType']: + print 'exp multiple data types!' self._dataType = None break + + print 'still none (exp)' return self._dataType def __init__(self, id, parent, stanzas): list.__init__(self) self.extend(stanzas) self._id = id self._parent = parent