# Provenance: contents of python/lib/ucscgenomics/encode.py (blob 2d1f944) as
# removed by commit a44e3254174152b2d4b3f241d1935654cd139181
# (vsmalladi, Tue May 8 10:11:21 2012 -0700):
# "Renamed library from ucscgenomics to ucscGb. Redmine #7029."
'''Lookup tables and small path/checksum helpers for ENCODE data.'''

import hashlib
import os


class DataType(object):
    '''Record describing one data type and the fields attached to it.

    Fields that have not been filled in yet hold the placeholder string
    'REPLACE'; a type of None marks an entry that still needs curation.
    '''

    def __init__(self, name, molecule, strategy, source, selection, type):
        # NOTE: the parameter name `type` shadows the builtin, but it is part
        # of the constructor's interface and is kept for callers.
        self.name = name
        self.molecule = molecule
        self.strategy = strategy
        self.source = source
        self.selection = selection
        self.type = type

    @property
    def valid(self):
        '''True when no field is the 'REPLACE' placeholder and type is set.'''
        return (
            self.molecule != 'REPLACE'
            and self.strategy != 'REPLACE'
            and self.source != 'REPLACE'
            and self.selection != 'REPLACE'
            and self.type is not None
        )

    @property
    def shouldSubmit(self):
        '''True unless the entry's type is 'NotGeo'.'''
        return self.type != 'NotGeo'


# Maps a data type name to its DataType record.
dataTypes = {
    'Cage': DataType('Cage', 'RNA', 'OTHER', 'transcriptomic', 'CAGE', 'HighThroughput'),
    'ChipSeq': DataType('ChipSeq', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'),
    'DnaPet': DataType('DnaPet', 'genomic DNA', 'OTHER', 'genomic', 'size fractionation', 'HighThroughput'),
    'DnaseDgf': DataType('DnaseDgf', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'),
    'DnaseSeq': DataType('DnaseSeq', 'genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', 'HighThroughput'),
    'FaireSeq': DataType('FaireSeq', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'),
    'MethylSeq': DataType('MethylSeq', 'genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', 'HighThroughput'),
    'MethylRrbs': DataType('MethylRrbs', 'genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', 'HighThroughput'),
    'Orchid': DataType('Orchid', 'genomic DNA', 'OTHER', 'genomic', 'other', 'HighThroughput'),
    'Proteogenomics': DataType('Proteogenomics', 'protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', 'HighThroughput'),
    'RnaPet': DataType('RnaPet', 'RNA', 'OTHER', 'transcriptomic', 'other', 'HighThroughput'),
    'RnaSeq': DataType('RnaSeq', 'RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', 'HighThroughput'),

    #doublecheck
    'ChiaPet': DataType('ChiaPet', 'genomic DNA', 'ChIP-Seq followed by ligation', 'genomic', 'other', 'HighThroughput'),
    'Nucleosome': DataType('Nucleosome', 'genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', 'HighThroughput'),
    'RipSeq': DataType('RipSeq', 'RNA', 'OTHER', 'transcriptomic', 'RNA binding protein antibody', 'HighThroughput'),
    #for ripseq, ask geo about new 'ripseq'

    #not geo stuff
    '5C': DataType('5C', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Bip': DataType('Bip', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Gencode': DataType('Gencode', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Mapability': DataType('Mapability', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'NRE': DataType('NRE', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Switchgear': DataType('Switchgear', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'TfbsValid': DataType('TfbsValid', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),
    'Cluster': DataType('Cluster', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', 'NotGeo'),

    #array
    'AffyExonArray': DataType('AffyExonArray', 'mRNA', 'RNA-Microarray', 'transcriptomic', 'polyA', 'MicroArray'),
    'MethylArray': DataType('MethylArray', 'genomic DNA', 'REPLACE', 'genomic', 'REPLACE', 'MicroArray'),
    'RipGeneSt': DataType('RipGeneSt', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'),  #this isn't correct
    'RipTiling': DataType('RipTiling', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', 'MicroArray'),

    #these need to be curated
    'Cnv': DataType('Cnv', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Combined': DataType('Combined', 'REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
    'Genotype': DataType('Genotype', 'genomic DNA', 'REPLACE', 'genomic', 'REPLACE', None),
    'RnaChip': DataType('RnaChip', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', None),
    'RipChip': DataType('RipChip', 'RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', None),
}

#compare this to the source in datatype, give GP ids depending on the type
gpIds = {
    'human genomic': '63443',
    'human transcriptomic': '30709',
    'human protein': '63447',

    'mouse genomic': '63471',
    'mouse transcriptomic': '66167',
    'mouse protein': '63475',
}

# Maps a database name to the organism name used as the prefix of gpIds keys.
organisms = {
    'hg19': 'human',
    'hg18': 'human',
    'mm9': 'mouse',
    'encodeTest': 'human',
}


def defaultTrackPath():
    '''Returns the trackDb directory under the user's home, with trailing slash.'''
    return os.path.expanduser('~/kent/src/hg/makeDb/trackDb/')


def defaultCvPath():
    '''Returns the path of cv/alpha/cv.ra below defaultTrackPath().'''
    return defaultTrackPath() + 'cv/alpha/cv.ra'


def downloadsPath(database, composite):
    '''Returns the pipeline downloads directory for database/composite, with trailing slash.'''
    return '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'


def readMd5sums(filename):
    '''Reads an md5sum.txt file and returns a dictionary of filename: md5.

    Each non-blank line is split at its first space; the stripped first part
    becomes the key and the stripped remainder the value. Blank lines are
    skipped. A non-blank line containing no space raises ValueError.

    Returns None when filename is not an existing regular file.
    '''
    if not os.path.isfile(filename):
        return None

    md5sums = dict()
    # Context manager so the handle is closed even if a malformed line raises.
    with open(filename, 'r') as md5file:
        for line in md5file:
            if not line.strip():
                # A blank (e.g. trailing) line cannot be unpacked into a pair.
                continue
            key, val = map(str.strip, line.split(' ', 1))
            md5sums[key] = val
    return md5sums


def hashFile(filename, hasher=None, blocksize=65536):
    '''Hashes the file's contents and returns the hex digest string.

    filename:  path of the file to read (opened in binary mode).
    hasher:    hashlib-style object providing update() and hexdigest();
               when None, a fresh hashlib.md5() is created for this call.
    blocksize: number of bytes read per chunk.
    '''
    # A default of hashlib.md5() in the signature would be built once at
    # definition time and shared by every call, so later calls would fold in
    # the bytes of earlier files. Create the default per call instead.
    if hasher is None:
        hasher = hashlib.md5()

    with open(filename, 'rb') as afile:
        # Read fixed-size chunks until read() returns the empty bytes object.
        for buf in iter(lambda: afile.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()