python/lib/ucscgenomics/encode.py 3aee104cf4c1e245dd020f743fbc58c17fd75976

3aee104cf4c1e245dd020f743fbc58c17fd75976
mmaddren
  Mon Apr 9 12:12:44 2012 -0700
added encode.py to store global constants and other encode stuff, and made all other libraries interface correctly with it
diff --git python/lib/ucscgenomics/encode.py python/lib/ucscgenomics/encode.py
new file mode 100644
index 0000000..4275eb3
--- /dev/null
+++ python/lib/ucscgenomics/encode.py
@@ -0,0 +1,115 @@
+import os, hashlib

+

+class DataType(object):

+

+    def __init__(self, name, molecule, strategy, source, selection, type):

+        self.name = name

+        self.molecule = molecule

+        self.strategy = strategy

+        self.source = source

+        self.selection = selection

+        self.type = type

+        

+    @property    

+    def valid(self):

+        return self.molecule != 'REPLACE' and self.strategy != 'REPLACE' and self.source != 'REPLACE' and self.selection != 'REPLACE' and self.type != None

+    

+    @property

+    def shouldSubmit(self):

+        return self.type != 'NotGeo'

+

+

+dataTypes = {

+    'Cage': DataType(           'Cage',             'RNA',          'OTHER',                                            'transcriptomic',   'CAGE',                                         'HighThroughput'),

+    'ChipSeq': DataType(        'ChipSeq',          'genomic DNA',  'ChIP-Seq',                                         'genomic',          'ChIP',                                         'HighThroughput'),

+    'DnaPet': DataType(         'DnaPet',           'genomic DNA',  'OTHER',                                            'genomic',          'size fractionation',                           'HighThroughput'),

+    'DnaseDgf': DataType(       'DnaseDgf',         'genomic DNA',  'DNase-Hypersensitivity',                           'genomic',          'DNase',                                        'HighThroughput'),

+    'DnaseSeq': DataType(       'DnaseSeq',         'genomic DNA',  'DNase-Hypersensitivity',                           'genomic',          'DNase',                                        'HighThroughput'),

+    'FaireSeq': DataType(       'FaireSeq',         'genomic DNA',  'OTHER',                                            'genomic',          'other',                                        'HighThroughput'),

+    'MethylSeq': DataType(      'MethylSeq',        'genomic DNA',  'MRE-Seq',                                          'genomic',          'Restriction Digest',                           'HighThroughput'),

+    'MethylRrbs': DataType(     'MethylRrbs',       'genomic DNA',  'Bisulfite-Seq',                                    'genomic',          'Reduced Representation',                       'HighThroughput'),

+    'Orchid': DataType(         'Orchid',           'genomic DNA',  'OTHER',                                            'genomic',          'other',                                        'HighThroughput'),

+    'Proteogenomics': DataType( 'Proteogenomics',   'protein',      'mass spectrometry-based proteogenomic mapping',    'protein',          'chromatographically fractionated peptides',    'HighThroughput'),

+    'RnaPet': DataType(         'RnaPet',           'RNA',          'OTHER',                                            'transcriptomic',   'other',                                        'HighThroughput'),

+    'RnaSeq': DataType(         'RnaSeq',           'RNA',          'RNA-Seq',                                          'transcriptomic',   'cDNA',                                         'HighThroughput'),

+    

+    #doublecheck

+    'ChiaPet': DataType(        'ChiaPet',          'genomic DNA',  'ChIP-Seq followed by ligation',                    'genomic',          'other',                                      'HighThroughput'),

+    'Nucleosome': DataType(     'Nucleosome',       'genomic DNA',  'ChIP-Seq',                                         'genomic',          'ChIP',                                         'HighThroughput'),

+    'RipSeq': DataType(         'RipSeq',           'RNA',          'OTHER',                                          'transcriptomic',   'RNA binding protein antibody',                 'HighThroughput'),

+    #for ripseq, ask geo about new 'ripseq'

+    

+    #not geo stuff

+    '5C': DataType('5C', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'Bip': DataType('Bip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'Gencode': DataType('Gencode', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'Mapability': DataType('Mapability', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'NRE': DataType('NRE', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'Switchgear': DataType('Switchgear', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'TfbsValid': DataType('TfbsValid', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    'Cluster': DataType('Cluster', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

+    

+    #array

+    'AffyExonArray': DataType(  'AffyExonArray',    'mRNA',         'RNA-Microarray',                                   'transcriptomic',   'polyA',                                        'MicroArray'),

+    'MethylArray': DataType(    'MethylArray',      'genomic DNA',  'REPLACE',                                          'genomic',          'REPLACE',                                      'MicroArray'),

+    'RipGeneSt': DataType(      'RipGeneSt',        'RNA',          'REPLACE',                                          'transcriptomic',   'RNA binding protein antibody',                 'MicroArray'), #this isn't correct

+    'RipTiling': DataType(      'RipTiling',        'RNA',          'REPLACE',                                          'transcriptomic',   'RNA binding protein antibody',                 'MicroArray'),

+    

+    #these need to be curated

+    'Cnv': DataType(            'Cnv',              'REPLACE',      'REPLACE',                                          'REPLACE',          'REPLACE',                                      None),

+    'Combined': DataType(       'Combined',         'REPLACE',      'REPLACE',                                          'REPLACE',          'REPLACE',                                      None),

+    'Genotype': DataType(       'Genotype',         'genomic DNA',  'REPLACE',                                          'genomic',          'REPLACE',                                      None),

+    'RnaChip': DataType(        'RnaChip',          'RNA',          'REPLACE',                                          'transcriptomic',   'RNA binding protein antibody',                 None),

+    'RipChip': DataType(        'RipChip',          'RNA',          'REPLACE',                                          'transcriptomic',   'RNA binding protein antibody',                 None)

+    

+    

+}

+

+#compare this to the source in datatype, give GP ids depending on the type

+gpIds = {

+    'human genomic': '63443',

+    'human transcriptomic': '30709',

+    'human protein': '63447',

+    

+    'mouse genomic': '63471',

+    'mouse transcriptomic': '66167',

+    'mouse protein': '63475'

+}

+

+organisms = {

+    'hg19': 'human',

+    'hg18': 'human',

+    'mm9': 'mouse',

+    'encodeTest': 'human'

+}

+

+def defaultTrackPath():

+    return os.path.expanduser('~/kent/src/hg/makeDb/trackDb/')

+

+def defaultCvPath():

+    return defaultTrackPath() + 'cv/alpha/cv.ra'

+    

+def downloadsPath(database, composite):

+    return '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'

+    

+def readMd5sums(filename):

+    '''Reads an md5sum.txt file and returns a dictionary of filename: md5'''

+    if os.path.isfile(filename):

+        md5sums = dict()

+        md5file = open(filename, 'r')

+        for line in md5file:

+            key, val = map(str.strip, line.split(' ', 1))

+            md5sums[key] = val

+        return md5sums

+    else:

+        return None

+

+def hashFile(filename, hasher=hashlib.md5(), blocksize=65536):

+    '''MD5's the file, and returns the number'''

+    afile = open(filename, 'rb')

+    buf = afile.read(blocksize)

+    while len(buf) > 0:

+        hasher.update(buf)

+        buf = afile.read(blocksize)

+    return hasher.hexdigest()

+    
\ No newline at end of file