0bb1cef7954937ff2c3d0851ee8db34c07dc5008 mmaddren Tue Jul 19 14:42:36 2011 -0700 small bug in soft file, work on mkGeoPkg for microarray submissions (still not functional) and added a readme

diff --git python/programs/mkGeoPkg/trackInfo python/programs/mkGeoPkg/trackInfo
new file mode 100755
index 0000000..2247661
--- /dev/null
+++ python/programs/mkGeoPkg/trackInfo
@@ -0,0 +1,286 @@
+#!/hive/groups/encode/dcc/bin/python
+import sys, os, shutil, stat
+from rafile.RaFile import *
+from softfile.SoftFile import *
+from cvfile.CvFile import *
+
+class DataType(object):
+
+    def __init__(self, molecule, strategy, source, selection, soft):
+        self.molecule = molecule
+        self.strategy = strategy
+        self.source = source
+        self.selection = selection
+        self.soft = soft
+
+datatypes = {
+    'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile),
+    'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile),
+    'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile),
+    'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
+    'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile),
+    'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
+    'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile),
+    'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile),
+    'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile),
+    'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile),
+    'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile),
+    'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile),
+
+    # these need to be curated
+    '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
+}
+cvDetails = {
+    'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
+    'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
+}
+
+# if the term appears in the mdb, it must override the value in the cv
+cvOverride = [ 'sex' ]
+
+# talk to Venkat lol
+cvPretend = { 'antibody Input': 'control' }
+
+# if it's not in cvDetails, these are the fields we check by default
+cvDefaults = [ 'description' ]
+
+mdbWhitelist = [
+    'age',
+    'bioRep',
+    'control',
+    'controlId',
+    'fragSize',
+    'labExpId',
+    'labVersion',
+    'mapAlgorithm',
+    'obtainedBy',
+    'phase',
+    'readType',
+    'region',
+    'replicate',
+    'restrictionEnzyme',
+    'run',
+    'softwareVersion',
+    'spikeInPool',
+    'strain'
+]
+
+# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain values.
+# First we check rnaExtractMapping; if it's not there, we use the localization. This is because (at present)
+# polyA is the most important trait; otherwise it would be nonPolyA, which GEO doesn't accept.
+rnaExtractMapping = {
+    'shortPolyA': 'polyA RNA',
+    'longPolyA': 'polyA RNA',
+    'polyA': 'polyA RNA'
+}
+
+localizationMapping = {
+    'cytosol': 'cytoplasmic RNA',
+    'polysome': 'cytoplasmic RNA',
+    'membraneFraction': 'cytoplasmic RNA',
+    'mitochondria': 'cytoplasmic RNA',
+    'nucleus': 'nuclear RNA',
+    'nucleolus': 'nuclear RNA',
+    'nucleoplasm': 'nuclear RNA',
+    'nuclearMatrix': 'nuclear RNA',
+    'chromatin': 'nuclear RNA',
+    'cell': 'total RNA'
+}
+
+# map our instrument names to GEO's names
+instrumentModels = {
+    'Illumina_GA2x': 'Illumina Genome Analyzer II'
+}
+
+organisms = {
+    'hg19': 'human',
+    'hg18': 'human',
+    'mm9': 'mouse'
+}
+
+def filesize(val):
+    if val > 1099511627776:
+        return str(round(float(val) / 1099511627776, 2)) + 'TB'
+    if val > 1073741824:
+        return str(round(float(val) / 1073741824, 2)) + 'GB'
+    if val > 1048576:
+        return str(round(float(val) / 1048576, 2)) + 'MB'
+    if val > 1024:
+        return str(round(float(val) / 1024, 2)) + 'KB'
+    else:
+        return str(val) + 'B'
+
+
+
+def getFileType(filename):
+    filename = filename.replace('.gz', '')
+    return filename.rsplit('.', 1)[1]
+
+def isRawFile(filename):
+    return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta')
+
+def isSupplimentaryFile(filename):
+    return not isRawFile(filename)
+
+
+def createMappings(mdb):
+    expIds = dict()
+    geoMapping = dict()
+    expVars = None
+    series = None
+    datatype = None
+
+    for stanza in mdb.itervalues():
+
+        if 'objType' in stanza and stanza['objType'] == 'composite':
+            series = stanza
+            expVars = stanza['expVars'].split(',')
+            continue
+
+        if 'expId' not in stanza:
+            print stanza.name + ': no expId'
+            continue
+
+        if 'geoSampleAccession' not in stanza:
+            # if this hasn't been submitted to GEO yet, we'll add it to the submission list
+            if stanza['expId'] not in expIds:
+                expIds[stanza['expId']] = list()
+
+            expIds[stanza['expId']].append(stanza)
+
+        else:
+            # otherwise we keep track of the geo number for partially submitted samples
+            if stanza['expId'] not in geoMapping:
+                geoMapping[stanza['expId']] = stanza['geoSampleAccession']
+            elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']:
+                geoMapping[stanza['expId']] = 'Inconsistent'
+                print stanza.name + ': inconsistent geo mapping'
+
+        if datatype == None and 'dataType' in stanza:
+            datatype = stanza['dataType']
+        elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
+            raise KeyError(stanza.name + ': inconsistent data type')
+
+
+
+    datatype = datatypes[datatype]
+
+    return expIds, expVars, geoMapping, series, datatype
+
+
+def main():
+    database = sys.argv[1]
+    composite = sys.argv[2]
+    organism = organisms[database]
+
+    # list everything
+    mode = 0
+
+    if len(sys.argv) > 3:
+        submitStart = sys.argv[3]
+        # list individual
+        mode = 1
+
+        if len(sys.argv) > 4:
+            submitSize = int(sys.argv[4])
+            # list range
+            mode = 2
+
+
+    mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE
+    trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra'
+
+    downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/'
+
+
+    mdb = RaFile(mdbPath)
+    track = RaFile(trackPath)
+    expIds, expVars, geoMapping, series, datatype = createMappings(mdb)
+
+    submission = dict()
+    sortedIds = expIds.keys()
+    sortedIds.sort()
+
+    if mode == 1:
+        sortedIds = [submitStart]
+    elif mode == 2:
+        sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize]
+
+
+    minId = min(sortedIds)
+    maxId = max(sortedIds)
+    out = list()
+    totalsize = 0
+    filecount = 0
+
+    # 'Generating soft using expIds ' + minId + ' to ' + maxId
+
+    for idNum in sortedIds:
+
+        samplesize = 0
+        expId = expIds[idNum]
+
+        for stanza in expId:
+
+            if os.path.exists(downloadsDirectory + stanza['fileName']):
+
+                st = os.stat(downloadsDirectory + stanza['fileName'])
+                samplesize = samplesize + st.st_size
+                totalsize = totalsize + st.st_size
+                filecount = filecount + 1
+
+        strsub = '[Unsubmitted]'
+        if idNum in geoMapping and geoMapping[idNum] == 'Inconsistent':
+            strsub = '[Inconsistent]'
+        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
+            strsub = '[' + geoMapping[idNum] + ']'
+
+        out.append(' + ' + expId[0]['metaObject'] + ' (' + str(idNum) + ')' + '[' + filesize(samplesize) + ']' + strsub + ' - ' + str(len(expId)) + ' files')
+
+        for stanza in expId:
+
+            if not os.path.exists(downloadsDirectory + stanza['fileName']):
+                out.append(' | ' + stanza['fileName'] + ' MISSING FILE!')
+            else:
+
+                st = os.stat(downloadsDirectory + stanza['fileName'])
+                out.append(' | ' + stanza['fileName'] + ' [' + filesize(st.st_size) + ']')
+
+
+    strsub = '[Unsubmitted]'
+    if 'geoSeriesAccession' in series:
+        strsub = '[' + series['geoSeriesAccession'] + ']'
+
+    modestr = ''
+    if mode == 1:
+        modestr = ' <' + minId + '>'
+    elif mode == 2:
+        modestr = ' <' + minId + '-' + maxId + '>'
+
+    out.insert(0, composite + ' [' + filesize(totalsize) + ']' + strsub + modestr + ' - ' + str(filecount) + ' files')
+
+    for line in out:
+        print line
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
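
Usage note: main() selects its listing behaviour purely from the argument count, so invocation is presumably "trackInfo <database> <composite> [expId [count]]". The minimal sketch below mirrors that argv handling only; the composite name and expId in the example are hypothetical placeholders, not values taken from the commit.

    import sys

    def pick_mode(argv):
        # Mirrors main()'s argv handling: mode 0 lists everything, mode 1 lists a
        # single expId, mode 2 lists a range of expIds starting at submitStart.
        mode, submitStart, submitSize = 0, None, None
        if len(argv) > 3:
            submitStart = argv[3]
            mode = 1
            if len(argv) > 4:
                submitSize = int(argv[4])
                mode = 2
        return mode, submitStart, submitSize

    if __name__ == '__main__':
        # e.g. ['trackInfo', 'hg19', 'wgEncodeSomeComposite', '123', '5'] -> (2, '123', 5)
        print pick_mode(sys.argv)

In mode 2 the range is sliced from the sorted expId list, so <count> counts experiments, not files.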