# Provenance: commit 3aee104cf4c1e245dd020f743fbc58c17fd75976 by mmaddren,
# Mon Apr 9 12:12:44 2012 -0700 -- "added encode.py to store global constants
# and other encode stuff, and made all other libraries interface correctly
# with it".  New file: python/lib/ucscgenomics/encode.py
'''Global constants and small helpers shared by the ucscgenomics libraries.

Contents:
  DataType     -- record describing one data type (molecule, strategy, ...)
  dataTypes    -- table of known data types keyed by name
  gpIds        -- ids keyed by '<organism> <source>'
  organisms    -- database name -> organism
  *Path()      -- default filesystem locations
  readMd5sums  -- parse an md5sum.txt file
  hashFile     -- checksum a file in fixed-size blocks
'''

import os, hashlib


class DataType(object):
    '''Description of a single data type.

    Attributes mirror the constructor arguments: name, molecule, strategy,
    source, selection and type.  The string 'REPLACE' is used in the table
    below as a placeholder for a field that has not been filled in yet.
    '''

    def __init__(self, name, molecule, strategy, source, selection, type):
        self.name = name
        self.molecule = molecule
        self.strategy = strategy
        self.source = source
        self.selection = selection
        # 'type' shadows the builtin, but it is part of the public signature
        # and attribute set, so the name is kept.
        self.type = type

    @property
    def valid(self):
        '''True when no field is still the 'REPLACE' placeholder and a type
        has been assigned (type is not None).'''
        fields = (self.molecule, self.strategy, self.source, self.selection)
        return 'REPLACE' not in fields and self.type is not None

    @property
    def shouldSubmit(self):
        '''True unless the type is 'NotGeo'.

        Note that an uncurated entry (type None) also returns True here;
        check .valid separately.
        '''
        return self.type != 'NotGeo'


dataTypes = {
    'Cage': DataType('Cage', 'RNA', 'OTHER', 'transcriptomic', 'CAGE', 'HighThroughput'),
    'ChipSeq': DataType('ChipSeq', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'),
    'DnaPet': DataType('DnaPet', 'genomic DNA', 'OTHER', 'genomic', 'size fractionation', 'HighThroughput'),
    'DnaseDgf': DataType('DnaseDgf', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'),
    'DnaseSeq': DataType('DnaseSeq', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'),
    'FaireSeq': DataType('FaireSeq', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'),
    'MethylSeq': DataType('MethylSeq', 'genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', 'HighThroughput'),
    'MethylRrbs': DataType('MethylRrbs', 'genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', 'HighThroughput'),
    'Orchid': DataType('Orchid', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'),
    'Proteogenomics': DataType('Proteogenomics', 'protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', 'HighThroughput'),
    'RnaPet': DataType('RnaPet', 'RNA', 'OTHER', 'transcriptomic', 'other', 'HighThroughput'),
    'RnaSeq': DataType('RnaSeq', 'RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', 'HighThroughput'),

    # TODO: double-check these entries
    'ChiaPet': DataType('ChiaPet', 'genomic DNA', 'ChIP-Seq followed by ligation', 'genomic', 'other', 'HighThroughput'),
    'Nucleosome': DataType('Nucleosome', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'),
    'RipSeq': DataType('RipSeq', 'RNA', 'OTHER', 'transcriptomic', 'RNA binding protein antibody', 'HighThroughput'),
    # TODO: for RipSeq, ask GEO about a new 'ripseq' value

    # Not GEO: type 'NotGeo' makes shouldSubmit False for these
    '5C': DataType('5C', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Bip': DataType('Bip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Gencode': DataType('Gencode', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Mapability': DataType('Mapability', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'NRE': DataType('NRE', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Switchgear': DataType('Switchgear', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'TfbsValid': DataType('TfbsValid', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Cluster': DataType('Cluster', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

    # Arrays
    'AffyExonArray': DataType('AffyExonArray', 'mRNA', 'RNA-Microarray', 'transcriptomic', 'polyA', 'MicroArray'),
    'MethylArray': DataType('MethylArray', 'genomic DNA', 'REPLACE', 'genomic', 'REPLACE', 'MicroArray'),
    # FIXME: the RipGeneSt entry is known to be incorrect
    'RipGeneSt': DataType('RipGeneSt', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'),
    'RipTiling': DataType('RipTiling', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'),

    # These still need to be curated (type None makes .valid False)
    'Cnv': DataType('Cnv', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Combined': DataType('Combined', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Genotype': DataType('Genotype', 'genomic DNA', 'REPLACE', 'genomic', 'REPLACE', None),
    'RnaChip': DataType('RnaChip', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', None),
    'RipChip': DataType('RipChip', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', None),
}

# Keys are '<organism> <source>': compare the second word against
# DataType.source to pick the GP id for a given data type.
gpIds = {
    'human genomic': '63443',
    'human transcriptomic': '30709',
    'human protein': '63447',

    'mouse genomic': '63471',
    'mouse transcriptomic': '66167',
    'mouse protein': '63475',
}

# Database name -> organism (the organism names match the first word of
# the gpIds keys).
organisms = {
    'hg19': 'human',
    'hg18': 'human',
    'mm9': 'mouse',
    'encodeTest': 'human',
}


def defaultTrackPath():
    '''Returns the trackDb directory under the current user's home
    directory, with a trailing slash.'''
    return os.path.expanduser('~/kent/src/hg/makeDb/trackDb/')


def defaultCvPath():
    '''Returns the path of cv/alpha/cv.ra inside defaultTrackPath().'''
    return defaultTrackPath() + 'cv/alpha/cv.ra'


def downloadsPath(database, composite):
    '''Returns the pipeline downloads directory for the given database and
    composite, with a trailing slash.'''
    return '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'


def readMd5sums(filename):
    '''Reads an md5sum.txt file and returns a dictionary of filename: md5.

    Each line is split at its first space; the text before it becomes the
    key and the text after it the value, both stripped of surrounding
    whitespace.  Blank lines are skipped.  A non-blank line without a space
    raises ValueError.

    Returns None when filename is not an existing regular file.
    '''
    if not os.path.isfile(filename):
        return None

    md5sums = dict()
    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open(filename, 'r') as md5file:
        for line in md5file:
            if not line.strip():
                # e.g. a trailing empty line; nothing to record
                continue
            key, val = [part.strip() for part in line.split(' ', 1)]
            md5sums[key] = val
    return md5sums


def hashFile(filename, hasher=None, blocksize=65536):
    '''Hashes the file's contents and returns the hex digest string.

    filename  -- path of the file to read (opened in binary mode)
    hasher    -- a hashlib-style object providing update() and hexdigest();
                 when omitted or None a fresh hashlib.md5() is used
    blocksize -- number of bytes read per chunk

    The default hasher is created inside the call rather than in the
    signature: a default of hashlib.md5() would be built once at definition
    time and shared, so every call after the first would return a digest
    polluted by the previously hashed files.
    '''
    if hasher is None:
        hasher = hashlib.md5()

    with open(filename, 'rb') as afile:
        buf = afile.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(blocksize)
    return hasher.hexdigest()