d01a44fe987e619d7f14fd81a5d9beed28e433dc mmaddren Tue Jul 19 16:18:13 2011 -0700 fixed a minor bug in trackInfo diff --git python/programs/mkGeoPkg/trackInfo python/programs/mkGeoPkg/trackInfo index 2247661..2946d18 100755 --- python/programs/mkGeoPkg/trackInfo +++ python/programs/mkGeoPkg/trackInfo @@ -1,286 +1,285 @@ #!/hive/groups/encode/dcc/bin/python import sys, os, shutil, stat from rafile.RaFile import * from softfile.SoftFile import * from cvfile.CvFile import * class DataType(object): def __init__(self, molecule, strategy, source, selection, soft): self.molecule = molecule self.strategy = strategy self.source = source self.selection = selection self.soft = soft datatypes = { 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', HighThroughputSoftFile), 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', HighThroughputSoftFile), 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', HighThroughputSoftFile), 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', HighThroughputSoftFile), 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', HighThroughputSoftFile), 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', HighThroughputSoftFile), 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', HighThroughputSoftFile), 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', HighThroughputSoftFile), 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', HighThroughputSoftFile), 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', HighThroughputSoftFile), #these need to be curated '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Genotype': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipGeneSt': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), 'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None) } cvDetails = { 'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ], 'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ] } #if the term appears in the mdb and must overriding the value in the cv cvOverride = [ 'sex' ] #talk to Venkat lol cvPretend = { 'antibody Input': 'control' } #if its not in cvDetails, which things should we check by default cvDefaults = [ 'description' ] mdbWhitelist = [ 'age', 'bioRep', 'control', 'controlId', 'fragSize', 'labExpId', 'labVersion', 'mapAlgorithm', 'obtainedBy', 'phase', 'readType', 'region', 'replicate', 'restrictionEnzyme', 'run', 'softwareVersion', 'spikeInPool', 'strain' ] # if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields # first we check rnaExtractMapping. If its not there, we use the localization. This is because (at current) # polyA is the most important trait, otherwise its going to be nonPolyA which GEO doesn't accept that. rnaExtractMapping = { 'shortPolyA': 'polyA RNA', 'longPolyA': 'polyA RNA', 'polyA': 'polyA RNA' } localizationMapping = { 'cytosol': 'cytoplasmic RNA', 'polysome': 'cytoplasmic RNA', 'membraneFraction': 'cytoplasmic RNA', 'mitochondria': 'cytoplasmic RNA', 'nucleus': 'nuclear RNA', 'nucleolus': 'nuclear RNA', 'nucleoplasm': 'nuclear RNA', 'nuclearMatrix': 'nuclear RNA', 'chromatin': 'nuclear RNA', 'cell': 'total RNA' } # map our instrument names to GEO's names instrumentModels = { 'Illumina_GA2x': 'Illumina Genome Analyzer II' } organisms = { 'hg19': 'human', 'hg18': 'human', 'mm9': 'mouse' } def filesize(val): if val > 1099511627776: return str(round(float(val) / 1099511627776, 2)) + 'TB' if val > 1073741824: return str(round(float(val) / 1073741824, 2)) + 'GB' if val > 1048576: return str(round(float(val) / 1048576, 2)) + 'MB' if val > 1024: return str(round(float(val) / 1024, 2)) + 'KB' else: return str(val) + 'B' def getFileType(filename): filename.replace('.gz', '') return filename.rsplit('.')[1] def isRawFile(filename): return (getFileType(filename) == 'fastq' or getFileType(filename) == 'fasta') def isSupplimentaryFile(filename): return not isRawFile(filename) def createMappings(mdb): expIds = dict() geoMapping = dict() expVars = None series = None datatype = None for stanza in mdb.itervalues(): if 'objType' in stanza and stanza['objType'] == 'composite': series = stanza expVars = stanza['expVars'].split(',') continue if 'expId' not in stanza: print stanza.name + ': no expId' continue - if 'geoSampleAccession' not in stanza: # if this hasn't been submitted to GEO yet, we'll add it to the submission list if stanza['expId'] not in expIds: expIds[stanza['expId']] = list() expIds[stanza['expId']].append(stanza) - else: + if 'geoSampleAccession' in stanza: # otherwise we keep track of the geo number for partially submitted samples if stanza['expId'] not in geoMapping: geoMapping[stanza['expId']] = stanza['geoSampleAccession'] elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: geoMapping[stanza['expId']] = 'Inconsistent' print stanza.name + ': inconsistent geo mapping' if datatype == None and 'dataType' in stanza: datatype = stanza['dataType'] elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: raise KeyError(stanza.name + ': inconsistent data type') datatype = datatypes[datatype] return expIds, expVars, geoMapping, series, datatype def main(): database = sys.argv[1] composite = sys.argv[2] organism = organisms[database] #list everything mode = 0 if len(sys.argv) > 3: submitStart = sys.argv[3] #list individual mode = 1 if len(sys.argv) > 4: submitSize = int(sys.argv[4]) #list range mode = 2 mdbPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/metaDb/alpha/' + composite + '.ra' #CHANGE trackPath = '/cluster/home/mmaddren/kent/src/hg/makeDb/trackDb/' + organism + '/' + database + '/' + composite + '.ra' downloadsDirectory = '/hive/groups/encode/dcc/analysis/ftp/pipeline/' + database + '/' + composite + '/' mdb = RaFile(mdbPath) track = RaFile(trackPath) expIds, expVars, geoMapping, series, datatype = createMappings(mdb) submission = dict() sortedIds = expIds.keys() sortedIds.sort() if mode == 1: sortedIds = [submitStart] elif mode == 2: sortedIds = sortedIds[sortedIds.index(submitStart):sortedIds.index(submitStart) + submitSize] minId = min(sortedIds) maxId = max(sortedIds) out = list() totalsize = 0 filecount = 0 # 'Generating soft using expIds ' + minId + ' to ' + maxId for idNum in sortedIds: samplesize = 0 expId = expIds[idNum] for stanza in expId: if os.path.exists(downloadsDirectory + stanza['fileName']): st = os.stat(downloadsDirectory + stanza['fileName']) samplesize = samplesize + st.st_size totalsize = totalsize + st.st_size filecount = filecount + 1 strsub = '[Unsubmitted]' if idNum in geoMapping and geoMapping[idNum] == 'Inconsistent': strsub = '[Inconsistent]' if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': strsub = '[' + geoMapping[idNum] + ']' out.append(' + ' + expId[0]['metaObject'] + ' (' + str(idNum) + ')' + '[' + filesize(samplesize) + ']' + strsub + ' - ' + str(len(expId)) + ' files') for stanza in expId: if not os.path.exists(downloadsDirectory + stanza['fileName']): out.append(' | ' + stanza['fileName'] + ' MISSING FILE!') else: st = os.stat(downloadsDirectory + stanza['fileName']) out.append(' | ' + stanza['fileName'] + ' [' + filesize(st.st_size) + ']') strsub = '[Unsubmitted]' if 'geoSeriesAccession' in series: strsub = '[' + series['geoSeriesAccession'] + ']' modestr = '' if mode == 1: modestr = ' <' + minId + '>' elif mode == 2: modestr = ' <' + minId + '-' + maxId + '>' out.insert(0, composite + ' [' + filesize(totalsize) + ']' + strsub + modestr + ' - ' + str(filecount) + ' files') for line in out: print line if __name__ == '__main__': main() \ No newline at end of file