python/programs/mkGeoPkg/mkGeoPkg 464aa35bb5735dfb8f565621dbde8b3b2ead1581

464aa35bb5735dfb8f565621dbde8b3b2ead1581
mmaddren
  Thu May 10 14:53:09 2012 -0700
fixed ucscGb and some of the more important scripts using it since they broke during the move to the new library structure
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
index fdac241..96b1519 100755
--- python/programs/mkGeoPkg/mkGeoPkg
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -1,677 +1,690 @@
-#!/hive/groups/encode/dcc/bin/python
+#!/usr/bin/env python2.7
 
 import sys, os, shutil, stat, argparse, datetime, hashlib
-from ucscgenomics import track, ra, soft, cv, mdb, geo, encode
+
+from ucscGb.gbData.ra.raFile import RaFile
+from ucscGb.externalData.geo import submission
+from ucscGb.externalData.geo import soft
+from ucscGb.encode import encodeUtils
+from ucscGb.encode.cv import CvFile
+from ucscGb.encode.track import CompositeTrack, TrackFile
+
 
 '''
 mkGeoPkg - create a soft file and upload script for a given track, so that it
 may be sumbitted to GEO.
 
 To invoke the script, you must pass the composite and track name:
     mkGeoPkg hg19 wgEncodeSomeTrack
     
 This is typically not enough however; most tracks are not completely covered
 by their metadata, and it is necessary to supply additional information. The
 most commonly needed information is:
     !Series_summary - this is taken from the track's html page description. 
         In most cases it can be copied, one paragraph per line.
     !Series_overall_design - this is taken from the Methods section on the
         track's page. As with summary, 1 paragraph per line.
     !Series_contributor - this is taken from the contributors list on the 
         track's page. It has a special format: "Last,First" where each person
         is listed on a separate line.
     !Sample_instrument_model - this is a bit more difficult, as it technically
         supposed to be a per-sample field. Occasionally this appears in the
         metaDb for newer tracks, if so, it's automatically added in. Otherwise
         it must be either specified on a whole-series basis, or added to the
         metadata. In many cases, we don't actually know all of them. This is
         okay. GEO will allow us to submit whatever information we do have, and
         they can take care of the rest. The instrumentModels dictionary below
         gives a list of the allowable entered values (the keys). The values
         map to the "human readable" version used by GEO.
         
 You can supply all of the above information in an additional file and specify
 it using the r (replace) option:
     mkGeoPkg -r somefile hg19 wgEncodeSomeTrack
     
 The replace file must be organized with each of the above key value pairs as
 in a soft file:
     !Series_summary = some summary stuff ...
     !Series_summary = another paragraph of summary ...
     !Series_overall_design = ...
     !Series_contributor = Researcher,Some
     !Series_contributor = Researcher,Another
     !Sample_instrument_model = Illumina_GA2
     
 There is a template for this, named replaceTemplate in the directory.
 
 You may need to only submit a part of the track, in this case you can specify
 experiment ID numbers to include:
 
 One problem you may run into while running the script is that the DataType is
 not valid. This means that the huge dictionary called datatypes below isn't
 filled in for that entry. If you can get ahold of the information, modify the
 script and push the changes back out.
 
 You may also get an error when trying to submit MicroArray data. This is to be
 expected: MicroArray is currently not functional at all. We have no way as of
 current to map our data to the format expected by GEO, so we've punted on this
 issue for the time being.
 
 Once you've successfully run the script, you'll have generated a soft file and
 a script that will handle the uploading process. All of this will be put into
 a directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the
 upload, simply cd into the directoy and run upload.sh. This will start the
 aspera copy program, copying files, a files list, and the soft file itself.
 
 For each submission, you need to email GEO. Our current contact is:
     Steve Wilhite, wilhite@ncbi.nlm.nih.gov
     
 You must specify which track you're submitting. GEO will only allow us 1TB of
 space dedicated to ENCODE, so you must break down submissions larger than 1TB
 and only submit as many submissions as they have room to process at any given
 time. In a few days, GEO will get back to you with a list of accession numbers
 which need to be put back into our metadata (see encodeAddGeoAccessions).
 '''
 
 cvDetails = {
     'cell':    [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
     'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
 }
 
 #if the term appears in the mdb and must overriding the value in the cv
 cvOverride = [ 'sex' ]
 
 #talk to Venkat lol
 cvPretend = { 'antibody Input': 'control' }
 
 #if its not in cvDetails, which things should we check by default
 cvDefaults = [ 'description' ]
 
 mdbWhitelist = [
     'age',
     'bioRep',
     'control',
     'controlId',
     'fragSize',
     'labExpId',
     'labVersion',
     'mapAlgorithm',
     'obtainedBy',
     'phase',
     'readType',
     'region',
     'replicate',
     'restrictionEnzyme',
     'run',
     'softwareVersion',
     'spikeInPool',
     'strain'
 ]
     
 def isRawFile(file):
-    return (file.extension == 'fastq' or file.extension == 'fasta')
+    return (file.extension == 'fastq' or file.extension == 'csfasta' or file.extension == 'csqual')
     
 def isSupplimentaryFile(file):
-    return not isRawFile(file)
+    return (not isRawFile(file)) and file.extension != 'fasta' and file.extension != 'bam' and file.extension != 'bai'
     
 def sampleTitle(stanza, expVars, warn=False):
     concat = stanza[expVars[0]].replace('-m', '')
     for expVar in expVars[1:len(expVars)]:
         if expVar in stanza and stanza[expVar] != 'None':
             concat += '_' + stanza[expVar]
         elif warn:
             print 'warning: %s is None or not in %s' % (expVar, stanza.name)
     return concat
     
 def linkName(file, track):
     return '%s_%s' % (track.database, file.name)
     
 def createMappings(metadb):
     expIds = dict()
     geoMapping = dict()
     expVars = None
     series = None
     datatype = None
     
     for stanza in metadb.itervalues():
         
         if 'objType' in stanza and stanza['objType'] == 'composite':
             series = stanza
             expVars = stanza['expVars'].split(',')
             continue
 
         if 'expId' not in stanza:
             print stanza.name + ': no expId'
             continue
 
         if 'objStatus' in stanza:
             print stanza.name + ': skipping because ' + stanza['objStatus']
             continue
             
         if 'geoSampleAccession' not in stanza:
             # if this hasn't been submitted to GEO yet, we'll add it to the submission list
             if stanza['expId'] not in expIds:
                 expIds[stanza['expId']] = list()
                 
             expIds[stanza['expId']].append(stanza)
         
         else:
             # otherwise we keep track of the geo number for partially submitted samples
             if stanza['expId'] not in geoMapping:
                 geoMapping[stanza['expId']] = stanza['geoSampleAccession']
             elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']:
                 geoMapping[stanza['expId']] = 'Inconsistent'
                 print stanza.name + ': inconsistent geo mapping'
         
         if datatype == None and 'dataType' in stanza:
             datatype = stanza['dataType']
         elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
             raise KeyError(stanza.name + ': inconsistent data type') 
 
     try:
         dt = datatype
-        datatype = encode.dataTypes[dt]
+        datatype = encodeUtils.dataTypes[dt]
         datatype.name = dt
     except KeyError:
         raise KeyError(datatype)
     
     return expIds, expVars, geoMapping, series, datatype
     
     
 def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries):
     
     if 'geoSeriesAccession' in series:
         print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession']
         return
         
     print 'Writing series ' + series['composite']
     
     seriesStanza = soft.SeriesStanza()
     seriesStanza['^SERIES'] = series['composite']
     seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT
     
     if '!Series_summary' in replace:
         seriesStanza['!Series_summary'] = replace['!Series_summary']
     else:
         print 'warning: no series summary found. Please include in replace file.'
         seriesStanza['!Series_summary'] = '[REPLACE]'
         if audit:
             print seriesStanza.name + ': no summary'
         
     if '!Series_overall_design' in replace:
         seriesStanza['!Series_overall_design'] = replace['!Series_overall_design']
     else:
         print 'no series overall design found. Please include in replace file.'
         seriesStanza['!Series_overall_design'] = '[REPLACE]'
         if audit:
             print seriesStanza.name + ': no overall design'
             
     seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ]
     
     if '!Series_contributor' in replace:
         seriesStanza['!Series_contributor'] = replace['!Series_contributor']
     else:
         seriesStanza['!Series_contributor'] = '[REPLACE]'
         if audit:
             print seriesStanza.name + ': no contributor'
         
-    seriesStanza['!Series_gp_id'] = mdb.gpIds[compositeTrack.organism + ' ' + datatype.source]
+    seriesStanza['!Series_gp_id'] = encodeUtils.gpIds[compositeTrack.organism + ' ' + datatype.source]
     
     # could use !Series_variable_* and !Series_repeats_*
     
     if not argseries:
         seriesStanza['!Series_sample_id'] = list()
         
         for idNum in expIds.iterkeys():
             if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
                 seriesStanza['!Series_sample_id'].append(geoMapping[idNum])
             else:
                 seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars))
         
     softfile[series['composite']] = seriesStanza
     
 def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries):
     
     print 'Creating HighThroughput soft file'
 
     softfile = soft.HighThroughputSoftFile()
     fileList = list()
     
     createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries)
     
     if argseries:
         return softfile, fileList
     
     for idNum in expIds.iterkeys():
         
         expId = expIds[idNum]
         firstStanza = expId[0]
         print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
         sample = soft.HighThroughputSampleStanza()
 
         sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1)
         sample['!Sample_type'] = 'SRA'
         sample['!Sample_title'] = sample['^SAMPLE']
         
         if 'geoSeriesAccession' in series:
             sample['!Sample_series_id'] = series['geoSeriesAccession']
             
         count = 1
         
         #figure out if the instrument model is consistent across the entire sample
         instrumentModel = None
         for stanza in expId:    
             if 'seqPlatform' in stanza:
                 if instrumentModel == None:
-                    instrumentModel = geo.instrumentModels[stanza['seqPlatform']]
+                    instrumentModel = submission.instrumentModels[stanza['seqPlatform']]
                 else:
-                    if instrumentModel != geo.instrumentModels[stanza['seqPlatform']]:
+                    if instrumentModel != submission.instrumentModels[stanza['seqPlatform']]:
                         instrumentModel = None
                         if audit:
                             print 'expId' + str(expId) + ': inconsistent instrument model'
                         break
         
         for stanza in expId:
         
             for fname in stanza['fileName'].split(','):
               
                 file = compositeTrack.files[fname]
                 filelist = list()
                 
+                if file.extension == 'fasta':
+                    print 'WARNING: fastas detected!!!'
+                
                 if isRawFile(file):
                 
                     if file.name.endswith('.tgz') or file.name.endswith('.tar.gz'):
                     
                         if tarpath == None:
                             raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')
                         dirname = tarpath + file.name.split('.')[0] + '/'
                         if os.path.exists(dirname):
-                            raise IOError(dirname + ' already exists')
+                            print dirname + ' already exists, so not unzipping'
                         else:
                             os.mkdir(dirname)
-
                         os.system('tar -xf %s -C %s' % (file.path + file.name, dirname))
                         
                         for root, dirnames, filenames in os.walk(dirname):
                             for filename in filenames:
                                 if filename.endswith('.fastq') or filename.endswith('.txt'):
                                     os.system('gzip %s' % (root + '/' + filename))
                         
                         for root, dirnames, filenames in os.walk(dirname):
                             for filename in filenames:
                                 print root + '/' + filename
-                                filelist.append(track.TrackFile(root + '/' + filename))
+                                filelist.append(TrackFile(root + '/' + filename))
 
                     else:
                         filelist.append(file)
                         
                     for f in filelist:
                         
                         sample['!Sample_raw_file_' + str(count)] = linkName(f, compositeTrack)
                         if f.extension == 'txt':
                             sample['!Sample_raw_file_type_' + str(count)] = 'fastq'
+                        elif f.extension == 'csfasta':
+                            sample['!Sample_raw_file_type_' + str(count)] = 'SOLiD_native_csfasta'
+                        elif f.extension == 'csqual':
+                            sample['!Sample_raw_file_type_' + str(count)] = 'SOLiD_native_qual'
                         else:
                             sample['!Sample_raw_file_type_' + str(count)] = f.extension
                         
                         sample['!Sample_raw_file_checksum_' + str(count)] = f.md5sum
 
                         if instrumentModel == None and 'seqPlatform' in stanza:
-                            sample['!Sample_raw_file_instrument_model_' + str(count)] = geo.instrumentModels[stanza['seqPlatform']]
+                            sample['!Sample_raw_file_instrument_model_' + str(count)] = submission.instrumentModels[stanza['seqPlatform']]
                             
                         fileList.append(f)    
                         count = count + 1
             
         count = 1
             
         for stanza in expId:
         
             for fname in stanza['fileName'].split(','):
                 file = compositeTrack.files[fname]
         
                 if isSupplimentaryFile(file):
                     sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack)
                     
                     if file.md5sum != None:
                         sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum
                     
                     sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database
                     
                     if instrumentModel == None and 'seqPlatform' in stanza:
-                        sample['!Sample_supplementary_file_instrument_model_' + str(count)] = geo.instrumentModels[stanza['seqPlatform']]
+                        sample['!Sample_supplementary_file_instrument_model_' + str(count)] = submission.instrumentModels[stanza['seqPlatform']]
                     
                     fileList.append(file)
                     count = count + 1
             
         sample['!Sample_source_name'] = firstStanza['cell']
         sample['!Sample_organism'] = compositeTrack.organism
         
         sample['!Sample_characteristics'] = list()
         allVars = expVars + mdbWhitelist
         
         for var in allVars:
             if var in firstStanza:
                 foobar = var
                 sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var])
                 for pretend in cvPretend.iterkeys():
                     if var + ' ' + firstStanza[var] == pretend:
                         foobar = cvPretend[pretend]
                 if foobar in cvDetails:
                     for cvVar in cvDetails[foobar]:
                         if cvVar in cvOverride and cvVar in firstStanza:
                             sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar])
                         elif cvVar in cv[firstStanza[var]]:
                             sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar])
                 else:
                     for cvVar in cvDefaults:
                         if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]:
                             sample['!Sample_characteristics'].append(var + ' ' +  cvVar + ': ' + cv[firstStanza[var]][cvVar])
                 
         sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName']
         
         if 'treatment' in firstStanza:
             sample['!Sample_treatment_protocol'] = firstStanza['treatment']
         
         if 'protocol' in cv[firstStanza['cell']]:
             for protocol in cv[firstStanza['cell']]['protocol'].split(' '):
                     key, val = protocol.split(':')
                     if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']:
                         sample['!Sample_growth_protocol'] = val
         
         if datatype.molecule == 'RNA':
-            if firstStanza['rnaExtract'] in geo.rnaExtractMapping:
-                sample['!Sample_molecule'] = geo.rnaExtractMapping[firstStanza['rnaExtract']]
-            elif firstStanza['localization'] in geo.localizationMapping:
-                sample['!Sample_molecule'] = geo.localizationMapping[firstStanza['localization']]
+            if firstStanza['rnaExtract'] in submission.rnaExtractMapping:
+                sample['!Sample_molecule'] = submission.rnaExtractMapping[firstStanza['rnaExtract']]
+            elif firstStanza['localization'] in submission.localizationMapping:
+                sample['!Sample_molecule'] = submission.localizationMapping[firstStanza['localization']]
                 
         else:
             sample['!Sample_molecule'] = datatype.molecule
             
         if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
-            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (geo.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
+            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
         else:
             sample['!Sample_extract_protocol'] = compositeTrack.url
         sample['!Sample_library_strategy'] = datatype.strategy
         sample['!Sample_library_source'] = datatype.source
         sample['!Sample_library_selection'] = datatype.selection
         
         # if the instrumentModel is consistent, just use that
         # otherwise take the first seqPlatform value from metadata
         # if that still fails, check the replacement file
         # finally just make it say [REPLACE]
         if instrumentModel != None:
             sample['!Sample_instrument_model'] = instrumentModel
         else:
             for stanza in expId:    
                 if 'seqPlatform' in stanza:
-                    sample['!Sample_instrument_model'] = geo.instrumentModels[stanza['seqPlatform']]
+                    sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
                     break
             if '!Sample_instrument_model' not in sample:
                 if '!Sample_instrument_model' in replace:
-                    sample['!Sample_instrument_model'] = geo.instrumentModels[replace['!Sample_instrument_model'][0]]
+                    sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
             if '!Sample_instrument_model' not in sample:
                 sample['!Sample_instrument_model'] = '[REPLACE]'
                 if audit:
                     print stanza.name + ': no instrument'
                 
         sample['!Sample_data_processing'] = compositeTrack.url
 
         if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
             sample['!Sample_geo_accession'] = geoMapping[idNum]
         
         softfile[firstStanza['metaObject']] = sample
         
     return softfile, fileList
         
         
 def createMicroArraySoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit):
     
     #raise KeyError('microarray')
     
     print 'Creating MicroArray soft file'
 
     softfile = soft.MicroArraySoftFile()
     fileList = list()
     
     createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit)
     
     for idNum in expIds.iterkeys():
         
         expId = expIds[idNum]
         firstStanza = expId[0]
         print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
         sample = soft.MicroArraySampleStanza()
             
         sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1)
         
         if 'geoSeriesAccession' in series:
             sample['!Sample_series_id'] = series['geoSeriesAccession']    
             
         #sample['!Sample_title'] = concat
         
         count = 1
             
         count = 1
             
         for stanza in expId:
         
             for fname in stanza['fileName'].split(','):
                 file = compositeTrack.files[fname]
         
                 if isSupplimentaryFile(file):
                     sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack)
                     
                     if file.md5sum != None:
                         sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum
                     
                     sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database
                 
                     fileList.append(file)
                     count = count + 1
 
         # sample['!Sample_table'] = KeyOptional # CEL file
         # sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered
         # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered
         # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered
         # sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
         # sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus
         # sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus
         # sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus
         # sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus
         # sample['!Sample_platform_id'] = '[REPLACE]'
         # sample['!Sample_geo_accession'] = KeyOptional
         # sample['!Sample_anchor'] = KeyRequired SAGE ONLY
         # sample['!Sample_type'] = KeyRequired SAGE ONLY
         # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY
         # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY
         # sample['!Sample_table_begin'] = ''
         # sample['!Sample_table_end'] = ''
     
         if 'geoSampleAccession' in firstStanza:
             sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] 
     
         softfile[firstStanza['metaObject']] = sample
         
     return softfile, fileList
     
     
 def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype):
     softfile = SoftFile()
     fileList = list()
     
     createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
     
     for idNum in expIds.iterkeys():
         
         expId = expIds[idNum]
         firstStanza = expId[0]
         print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
         sample = HighThroughputSampleStanza()
         
         hasbigwig = 0
         for stanza in expId:
         
             if getFileType(stanza['fileName']) == 'bigWig':
                 hasbigwig = 1
                 
         if hasbigwig == 0:
             continue
         
         sample['^SAMPLE'] = firstStanza['geoSampleAccession']
         
         if 'geoSeriesAccession' in series:
             sample['!Sample_series_id'] = series['geoSeriesAccession']
             
         sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession']
         
         count = 1
             
         for stanza in expId:
         
             if getFileType(stanza['fileName']) == 'bigWig':
                 sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
                 
                 if 'checksum' in stanza:
                     sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
                 elif md5sums != None and stanza['fileName'] in md5sums:
                     sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
                 
                 # sample['!Sample_supplementary_file_build_' + str(count)] = database
                 
                 fileList.append(stanza['fileName'])
                 count = count + 1
                 
         softfile[firstStanza['geoSampleAccession']] = sample
         
     return softfile, fileList
         
 def main():
 
     parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.')
     parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/')
     parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model')
     parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors')
     parser.add_argument('-z', '--zip', help='Specifies a directory path to unzip tarred fastqs to, only applicable for tracks with tarred fastqs')
     parser.add_argument('-s', '--series', action='store_true', default=False, help='Only generates the series stanza, instead of generating the entire soft file')
     parser.add_argument('database', help='The database, typically hg19 or mm9')
     parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
     parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file')
     
     if len(sys.argv) == 1:
         parser.print_usage()
         return
     
     args = parser.parse_args(sys.argv[1:])
         
-    compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath)
+    compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath)
 
     cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra'
-    controlledVocab = cv.CvFile(cvPath)
+    controlledVocab = CvFile(cvPath)
     
     if args.zip != None and not args.zip.endswith('/'):
         args.zip += '/'
     
     replace = dict()
     if args.replace != None:
         for line in open(args.replace):
             if line == '':
                 continue
             key, val = map(str.strip, line.split('=', 1))
             if key not in replace:
                 replace[key] = list()
             replace[key].append(val)
         
     tempids = list()
     ids = list()
     
     for id in args.expIds:
         if '-' in id:
             start, end = id.split('-', 1)
             tempids.extend(range(int(start), int(end) + 1))
         else:
             tempids.append(int(id))
     for id in tempids:
         if str(id) in compositeTrack.alphaMetaDb.experiments.keys():
             ids.append(int(id))
 
     expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb)
   
     submission = dict()
     if len(ids) == 0:
         submission = expIds
     else:
         for expId in ids:
             if str(expId) in expIds.keys():
                 submission[str(expId)] = expIds[str(expId)]
     expIdStr = ' '
     for id in args.expIds:
         expIdStr = expIdStr + id + ',' 
     expIdStr = expIdStr[:len(expIdStr) - 1]
     print 'Generating soft using expIds ' + ','.join(submission.keys())
     
     if datatype.type == 'HighThroughput':
         softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit, args.zip, args.series)
     elif datatype.type == 'MicroArray':
         softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit, args.zip, args.series)
     else:
         raise KeyError('unsupported type ' + datatype.name)
     
     if not args.audit and not args.series:
         print 'Creating directory'
     
         d = datetime.datetime.today()
         datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day)
         
         dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring)
         linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name)
     
         os.mkdir(dirname)
         os.mkdir(dirname + linkdirname)
     
         print 'Writing file'
         
         outfileName = '%s%s_%s.soft' % (dirname + linkdirname, compositeTrack.database, compositeTrack.name)
         outfile = open(outfileName, 'w')
         outfile.write(str(softfile))
         fileslistname = '%sfiles.txt' % (dirname + linkdirname)
         fileslist = open(fileslistname, 'w')
         scriptname = '%supload.sh' % dirname
         outscript = open(scriptname, 'w')
         
         outscript.write('#!/bin/sh\n\n')
         outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname)
         outscript.close()
     elif args.series:
         outfileName = '%s_%s.soft' % (compositeTrack.database, compositeTrack.name)
         outfile = open(outfileName, 'w')
         outfile.write(str(softfile))
         outfile.close()
         
     for file in fileList:
         if not os.path.exists(file.path):
             print IOError(file.path + ' does not exist')
         elif not args.audit:
             linkname = linkName(file, compositeTrack)
             linkpath = linkdirname + linkname
             os.symlink(file.fullname, dirname + linkpath)
         
             #outscript.write(linkpath + ' \\\n')
             fileslist.write(linkname + '\n')
     
     if not args.audit and not args.series:
         #outscript.write()
         
         fileslist.close()
 
         os.system('chmod +x ' + scriptname)
         
         print 'Finished!'
     
 if __name__ == '__main__':
     main()
\ No newline at end of file