84807912f92036a1d36b282121e757868cd4e4bb
mmaddren
  Tue Dec 18 13:02:11 2012 -0800
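Updated libraries and some tools for clarity, as per code review v276 final. In mkGeoPkg, the soft-file helper functions are no longer defined locally and are called through the soft module instead, and a -b/--byrep option is added.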
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
index 96b1519..9378071 100755
--- python/programs/mkGeoPkg/mkGeoPkg
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -69,519 +69,37 @@
 a script that will handle the uploading process. All of this will be put into
 a directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the
 upload, simply cd into the directoy and run upload.sh. This will start the
 aspera copy program, copying files, a files list, and the soft file itself.
 
 For each submission, you need to email GEO. Our current contact is:
     Steve Wilhite, wilhite@ncbi.nlm.nih.gov
     
 You must specify which track you're submitting. GEO will only allow us 1TB of
 space dedicated to ENCODE, so you must break down submissions larger than 1TB
 and only submit as many submissions as they have room to process at any given
 time. In a few days, GEO will get back to you with a list of accession numbers
 which need to be put back into our metadata (see encodeAddGeoAccessions).
 '''
 
-cvDetails = {
-    'cell':    [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
-    'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
-}
-
-#if the term appears in the mdb and must overriding the value in the cv
-cvOverride = [ 'sex' ]
-
-#talk to Venkat lol
-cvPretend = { 'antibody Input': 'control' }
-
-#if its not in cvDetails, which things should we check by default
-cvDefaults = [ 'description' ]
-
-mdbWhitelist = [
-    'age',
-    'bioRep',
-    'control',
-    'controlId',
-    'fragSize',
-    'labExpId',
-    'labVersion',
-    'mapAlgorithm',
-    'obtainedBy',
-    'phase',
-    'readType',
-    'region',
-    'replicate',
-    'restrictionEnzyme',
-    'run',
-    'softwareVersion',
-    'spikeInPool',
-    'strain'
-]
-    
-def isRawFile(file):
-    return (file.extension == 'fastq' or file.extension == 'csfasta' or file.extension == 'csqual')
-    
-def isSupplimentaryFile(file):
-    return (not isRawFile(file)) and file.extension != 'fasta' and file.extension != 'bam' and file.extension != 'bai'
-    
-def sampleTitle(stanza, expVars, warn=False):
-    concat = stanza[expVars[0]].replace('-m', '')
-    for expVar in expVars[1:len(expVars)]:
-        if expVar in stanza and stanza[expVar] != 'None':
-            concat += '_' + stanza[expVar]
-        elif warn:
-            print 'warning: %s is None or not in %s' % (expVar, stanza.name)
-    return concat
-    
-def linkName(file, track):
-    return '%s_%s' % (track.database, file.name)
-    
-def createMappings(metadb):
-    expIds = dict()
-    geoMapping = dict()
-    expVars = None
-    series = None
-    datatype = None
-    
-    for stanza in metadb.itervalues():
-        
-        if 'objType' in stanza and stanza['objType'] == 'composite':
-            series = stanza
-            expVars = stanza['expVars'].split(',')
-            continue
-
-        if 'expId' not in stanza:
-            print stanza.name + ': no expId'
-            continue
-
-        if 'objStatus' in stanza:
-            print stanza.name + ': skipping because ' + stanza['objStatus']
-            continue
-            
-        if 'geoSampleAccession' not in stanza:
-            # if this hasn't been submitted to GEO yet, we'll add it to the submission list
-            if stanza['expId'] not in expIds:
-                expIds[stanza['expId']] = list()
-                
-            expIds[stanza['expId']].append(stanza)
-        
-        else:
-            # otherwise we keep track of the geo number for partially submitted samples
-            if stanza['expId'] not in geoMapping:
-                geoMapping[stanza['expId']] = stanza['geoSampleAccession']
-            elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']:
-                geoMapping[stanza['expId']] = 'Inconsistent'
-                print stanza.name + ': inconsistent geo mapping'
-        
-        if datatype == None and 'dataType' in stanza:
-            datatype = stanza['dataType']
-        elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']:
-            raise KeyError(stanza.name + ': inconsistent data type') 
-
-    try:
-        dt = datatype
-        datatype = encodeUtils.dataTypes[dt]
-        datatype.name = dt
-    except KeyError:
-        raise KeyError(datatype)
-    
-    return expIds, expVars, geoMapping, series, datatype
-    
-    
-def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries):
-    
-    if 'geoSeriesAccession' in series:
-        print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession']
-        return
-        
-    print 'Writing series ' + series['composite']
-    
-    seriesStanza = soft.SeriesStanza()
-    seriesStanza['^SERIES'] = series['composite']
-    seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT
-    
-    if '!Series_summary' in replace:
-        seriesStanza['!Series_summary'] = replace['!Series_summary']
-    else:
-        print 'warning: no series summary found. Please include in replace file.'
-        seriesStanza['!Series_summary'] = '[REPLACE]'
-        if audit:
-            print seriesStanza.name + ': no summary'
-        
-    if '!Series_overall_design' in replace:
-        seriesStanza['!Series_overall_design'] = replace['!Series_overall_design']
-    else:
-        print 'no series overall design found. Please include in replace file.'
-        seriesStanza['!Series_overall_design'] = '[REPLACE]'
-        if audit:
-            print seriesStanza.name + ': no overall design'
-            
-    seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ]
-    
-    if '!Series_contributor' in replace:
-        seriesStanza['!Series_contributor'] = replace['!Series_contributor']
-    else:
-        seriesStanza['!Series_contributor'] = '[REPLACE]'
-        if audit:
-            print seriesStanza.name + ': no contributor'
-        
-    seriesStanza['!Series_gp_id'] = encodeUtils.gpIds[compositeTrack.organism + ' ' + datatype.source]
-    
-    # could use !Series_variable_* and !Series_repeats_*
-    
-    if not argseries:
-        seriesStanza['!Series_sample_id'] = list()
-        
-        for idNum in expIds.iterkeys():
-            if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
-                seriesStanza['!Series_sample_id'].append(geoMapping[idNum])
-            else:
-                seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars))
-        
-    softfile[series['composite']] = seriesStanza
-    
-def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit, tarpath, argseries):
-    
-    print 'Creating HighThroughput soft file'
-
-    softfile = soft.HighThroughputSoftFile()
-    fileList = list()
-    
-    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit, argseries)
-    
-    if argseries:
-        return softfile, fileList
-    
-    for idNum in expIds.iterkeys():
-        
-        expId = expIds[idNum]
-        firstStanza = expId[0]
-        print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
-        sample = soft.HighThroughputSampleStanza()
-
-        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1)
-        sample['!Sample_type'] = 'SRA'
-        sample['!Sample_title'] = sample['^SAMPLE']
-        
-        if 'geoSeriesAccession' in series:
-            sample['!Sample_series_id'] = series['geoSeriesAccession']
-            
-        count = 1
-        
-        #figure out if the instrument model is consistent across the entire sample
-        instrumentModel = None
-        for stanza in expId:    
-            if 'seqPlatform' in stanza:
-                if instrumentModel == None:
-                    instrumentModel = submission.instrumentModels[stanza['seqPlatform']]
-                else:
-                    if instrumentModel != submission.instrumentModels[stanza['seqPlatform']]:
-                        instrumentModel = None
-                        if audit:
-                            print 'expId' + str(expId) + ': inconsistent instrument model'
-                        break
-        
-        for stanza in expId:
-        
-            for fname in stanza['fileName'].split(','):
-              
-                file = compositeTrack.files[fname]
-                filelist = list()
-                
-                if file.extension == 'fasta':
-                    print 'WARNING: fastas detected!!!'
-                
-                if isRawFile(file):
-                
-                    if file.name.endswith('.tgz') or file.name.endswith('.tar.gz'):
-                    
-                        if tarpath == None:
-                            raise IOError('this track contains tarred fastqs. Please specify a path through the -z option')
-                        dirname = tarpath + file.name.split('.')[0] + '/'
-                        if os.path.exists(dirname):
-                            print dirname + ' already exists, so not unzipping'
-                        else:
-                            os.mkdir(dirname)
-                            os.system('tar -xf %s -C %s' % (file.path + file.name, dirname))
-                        
-                            for root, dirnames, filenames in os.walk(dirname):
-                                for filename in filenames:
-                                    if filename.endswith('.fastq') or filename.endswith('.txt'):
-                                        os.system('gzip %s' % (root + '/' + filename))
-                        
-                        for root, dirnames, filenames in os.walk(dirname):
-                            for filename in filenames:
-                                print root + '/' + filename
-                                filelist.append(TrackFile(root + '/' + filename))
-
-                    else:
-                        filelist.append(file)
-                        
-                    for f in filelist:
-                        
-                        sample['!Sample_raw_file_' + str(count)] = linkName(f, compositeTrack)
-                        if f.extension == 'txt':
-                            sample['!Sample_raw_file_type_' + str(count)] = 'fastq'
-                        elif f.extension == 'csfasta':
-                            sample['!Sample_raw_file_type_' + str(count)] = 'SOLiD_native_csfasta'
-                        elif f.extension == 'csqual':
-                            sample['!Sample_raw_file_type_' + str(count)] = 'SOLiD_native_qual'
-                        else:
-                            sample['!Sample_raw_file_type_' + str(count)] = f.extension
-                        
-                        sample['!Sample_raw_file_checksum_' + str(count)] = f.md5sum
-
-                        if instrumentModel == None and 'seqPlatform' in stanza:
-                            sample['!Sample_raw_file_instrument_model_' + str(count)] = submission.instrumentModels[stanza['seqPlatform']]
-                            
-                        fileList.append(f)    
-                        count = count + 1
-            
-        count = 1
-            
-        for stanza in expId:
-        
-            for fname in stanza['fileName'].split(','):
-                file = compositeTrack.files[fname]
-        
-                if isSupplimentaryFile(file):
-                    sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack)
-                    
-                    if file.md5sum != None:
-                        sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum
-                    
-                    sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database
-                    
-                    if instrumentModel == None and 'seqPlatform' in stanza:
-                        sample['!Sample_supplementary_file_instrument_model_' + str(count)] = submission.instrumentModels[stanza['seqPlatform']]
-                    
-                    fileList.append(file)
-                    count = count + 1
-            
-        sample['!Sample_source_name'] = firstStanza['cell']
-        sample['!Sample_organism'] = compositeTrack.organism
-        
-        sample['!Sample_characteristics'] = list()
-        allVars = expVars + mdbWhitelist
-        
-        for var in allVars:
-            if var in firstStanza:
-                foobar = var
-                sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var])
-                for pretend in cvPretend.iterkeys():
-                    if var + ' ' + firstStanza[var] == pretend:
-                        foobar = cvPretend[pretend]
-                if foobar in cvDetails:
-                    for cvVar in cvDetails[foobar]:
-                        if cvVar in cvOverride and cvVar in firstStanza:
-                            sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar])
-                        elif cvVar in cv[firstStanza[var]]:
-                            sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar])
-                else:
-                    for cvVar in cvDefaults:
-                        if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]:
-                            sample['!Sample_characteristics'].append(var + ' ' +  cvVar + ': ' + cv[firstStanza[var]][cvVar])
-                
-        sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName']
-        
-        if 'treatment' in firstStanza:
-            sample['!Sample_treatment_protocol'] = firstStanza['treatment']
-        
-        if 'protocol' in cv[firstStanza['cell']]:
-            for protocol in cv[firstStanza['cell']]['protocol'].split(' '):
-                    key, val = protocol.split(':')
-                    if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']:
-                        sample['!Sample_growth_protocol'] = val
-        
-        if datatype.molecule == 'RNA':
-            if firstStanza['rnaExtract'] in submission.rnaExtractMapping:
-                sample['!Sample_molecule'] = submission.rnaExtractMapping[firstStanza['rnaExtract']]
-            elif firstStanza['localization'] in submission.localizationMapping:
-                sample['!Sample_molecule'] = submission.localizationMapping[firstStanza['localization']]
-                
-        else:
-            sample['!Sample_molecule'] = datatype.molecule
-            
-        if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown':
-            sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (submission.instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url)
-        else:
-            sample['!Sample_extract_protocol'] = compositeTrack.url
-        sample['!Sample_library_strategy'] = datatype.strategy
-        sample['!Sample_library_source'] = datatype.source
-        sample['!Sample_library_selection'] = datatype.selection
-        
-        # if the instrumentModel is consistent, just use that
-        # otherwise take the first seqPlatform value from metadata
-        # if that still fails, check the replacement file
-        # finally just make it say [REPLACE]
-        if instrumentModel != None:
-            sample['!Sample_instrument_model'] = instrumentModel
-        else:
-            for stanza in expId:    
-                if 'seqPlatform' in stanza:
-                    sample['!Sample_instrument_model'] = submission.instrumentModels[stanza['seqPlatform']]
-                    break
-            if '!Sample_instrument_model' not in sample:
-                if '!Sample_instrument_model' in replace:
-                    sample['!Sample_instrument_model'] = submission.instrumentModels[replace['!Sample_instrument_model'][0]]
-            if '!Sample_instrument_model' not in sample:
-                sample['!Sample_instrument_model'] = '[REPLACE]'
-                if audit:
-                    print stanza.name + ': no instrument'
-                
-        sample['!Sample_data_processing'] = compositeTrack.url
-
-        if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent':
-            sample['!Sample_geo_accession'] = geoMapping[idNum]
-        
-        softfile[firstStanza['metaObject']] = sample
-        
-    return softfile, fileList
-        
-        
-def createMicroArraySoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit):
-    
-    #raise KeyError('microarray')
-    
-    print 'Creating MicroArray soft file'
-
-    softfile = soft.MicroArraySoftFile()
-    fileList = list()
-    
-    createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit)
-    
-    for idNum in expIds.iterkeys():
-        
-        expId = expIds[idNum]
-        firstStanza = expId[0]
-        print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
-        sample = soft.MicroArraySampleStanza()
-            
-        sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1)
-        
-        if 'geoSeriesAccession' in series:
-            sample['!Sample_series_id'] = series['geoSeriesAccession']    
-            
-        #sample['!Sample_title'] = concat
-        
-        count = 1
-            
-        count = 1
-            
-        for stanza in expId:
-        
-            for fname in stanza['fileName'].split(','):
-                file = compositeTrack.files[fname]
-        
-                if isSupplimentaryFile(file):
-                    sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack)
-                    
-                    if file.md5sum != None:
-                        sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum
-                    
-                    sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database
-                
-                    fileList.append(file)
-                    count = count + 1
-
-        # sample['!Sample_table'] = KeyOptional # CEL file
-        # sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered
-        # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered
-        # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered
-        # sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered
-        # sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus
-        # sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus
-        # sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus
-        # sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus
-        # sample['!Sample_platform_id'] = '[REPLACE]'
-        # sample['!Sample_geo_accession'] = KeyOptional
-        # sample['!Sample_anchor'] = KeyRequired SAGE ONLY
-        # sample['!Sample_type'] = KeyRequired SAGE ONLY
-        # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY
-        # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY
-        # sample['!Sample_table_begin'] = ''
-        # sample['!Sample_table_end'] = ''
-    
-        if 'geoSampleAccession' in firstStanza:
-            sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] 
-    
-        softfile[firstStanza['metaObject']] = sample
-        
-    return softfile, fileList
-    
-    
-def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype):
-    softfile = SoftFile()
-    fileList = list()
-    
-    createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series)
-    
-    for idNum in expIds.iterkeys():
-        
-        expId = expIds[idNum]
-        firstStanza = expId[0]
-        print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')'
-        sample = HighThroughputSampleStanza()
-        
-        hasbigwig = 0
-        for stanza in expId:
-        
-            if getFileType(stanza['fileName']) == 'bigWig':
-                hasbigwig = 1
-                
-        if hasbigwig == 0:
-            continue
-        
-        sample['^SAMPLE'] = firstStanza['geoSampleAccession']
-        
-        if 'geoSeriesAccession' in series:
-            sample['!Sample_series_id'] = series['geoSeriesAccession']
-            
-        sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession']
-        
-        count = 1
-            
-        for stanza in expId:
-        
-            if getFileType(stanza['fileName']) == 'bigWig':
-                sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName']
-                
-                if 'checksum' in stanza:
-                    sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum']
-                elif md5sums != None and stanza['fileName'] in md5sums:
-                    sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']]
-                
-                # sample['!Sample_supplementary_file_build_' + str(count)] = database
-                
-                fileList.append(stanza['fileName'])
-                count = count + 1
-                
-        softfile[firstStanza['geoSampleAccession']] = sample
-        
-    return softfile, fileList
-        
 def main():
 
     parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.')
     parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/')
     parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model')
     parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors')
+    parser.add_argument('-b', '--byrep', action='store_true', default=False, help='Submit by replicate')
     parser.add_argument('-z', '--zip', help='Specifies a directory path to unzip tarred fastqs to, only applicable for tracks with tarred fastqs')
     parser.add_argument('-s', '--series', action='store_true', default=False, help='Only generates the series stanza, instead of generating the entire soft file')
     parser.add_argument('database', help='The database, typically hg19 or mm9')
     parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
     parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file')
     
     if len(sys.argv) == 1:
         parser.print_usage()
         return
     
     args = parser.parse_args(sys.argv[1:])
         
     compositeTrack = CompositeTrack(args.database, args.composite, args.trackPath)
 
     cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra'
@@ -601,49 +119,49 @@
             replace[key].append(val)
         
     tempids = list()
     ids = list()
     
     for id in args.expIds:
         if '-' in id:
             start, end = id.split('-', 1)
             tempids.extend(range(int(start), int(end) + 1))
         else:
             tempids.append(int(id))
     for id in tempids:
         if str(id) in compositeTrack.alphaMetaDb.experiments.keys():
             ids.append(int(id))
 
-    expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb)
+    expIds, expVars, geoMapping, series, datatype = soft.createMappings(compositeTrack.alphaMetaDb, False, args.byrep)
   
     submission = dict()
     if len(ids) == 0:
         submission = expIds
     else:
         for expId in ids:
             if str(expId) in expIds.keys():
                 submission[str(expId)] = expIds[str(expId)]
     expIdStr = ' '
     for id in args.expIds:
         expIdStr = expIdStr + id + ',' 
     expIdStr = expIdStr[:len(expIdStr) - 1]
     print 'Generating soft using expIds ' + ','.join(submission.keys())
     
     if datatype.type == 'HighThroughput':
-        softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit, args.zip, args.series)
+        softfile, fileList = soft.createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit, args.zip, args.series, rep=args.byrep)
     elif datatype.type == 'MicroArray':
-        softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit, args.zip, args.series)
+        softfile, fileList = soft.createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit, args.zip, args.series)
     else:
         raise KeyError('unsupported type ' + datatype.name)
     
     if not args.audit and not args.series:
         print 'Creating directory'
     
         d = datetime.datetime.today()
         datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day)
         
         dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring)
         linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name)
     
         os.mkdir(dirname)
         os.mkdir(dirname + linkdirname)
     
@@ -658,32 +176,35 @@
         outscript = open(scriptname, 'w')
         
         outscript.write('#!/bin/sh\n\n')
         outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname)
         outscript.close()
     elif args.series:
         outfileName = '%s_%s.soft' % (compositeTrack.database, compositeTrack.name)
         outfile = open(outfileName, 'w')
         outfile.write(str(softfile))
         outfile.close()
         
     for file in fileList:
         if not os.path.exists(file.path):
             print IOError(file.path + ' does not exist')
         elif not args.audit:
-            linkname = linkName(file, compositeTrack)
+            linkname = soft.linkName(file, compositeTrack)
             linkpath = linkdirname + linkname
+            if os.path.exists(dirname + linkpath):
+                print 'exists: ' + dirname + linkpath
+            else:
-            os.symlink(file.fullname, dirname + linkpath)
+                os.symlink(file.fullname, dirname + linkpath)
         
             #outscript.write(linkpath + ' \\\n')
             fileslist.write(linkname + '\n')
     
     if not args.audit and not args.series:
         #outscript.write()
         
         fileslist.close()
 
         os.system('chmod +x ' + scriptname)
         
         print 'Finished!'
     
 if __name__ == '__main__':