dbf1ed2fb55f85e23ce20f7ea1781dfe35f162e9 mmaddren Mon Oct 31 16:40:50 2011 -0700 somehow these got removed when I pulled??? I put them back
diff --git python/programs/mkGeoPkg/mkGeoPkg python/programs/mkGeoPkg/mkGeoPkg
new file mode 100755
index 0000000..22cef27
--- /dev/null
+++ python/programs/mkGeoPkg/mkGeoPkg
@@ -0,0 +1,707 @@
+#!/hive/groups/encode/dcc/bin/python
+import sys, os, shutil, stat, argparse, datetime
+from ucscgenomics import track, ra, soft, cv
+
+"""
+mkGeoPkg - create a soft file and upload script for a given track, so that it
+may be submitted to GEO.
+
+To invoke the script, you must pass the database and the composite (track) name:
+    mkGeoPkg hg19 wgEncodeSomeTrack
+
+This is typically not enough, however; most tracks are not completely covered
+by their metadata, and it is necessary to supply additional information. The
+most commonly needed information is:
+    !Series_summary - this is taken from the track's html page description.
+        In most cases it can be copied, one paragraph per line.
+    !Series_overall_design - this is taken from the Methods section on the
+        track's page. As with the summary, one paragraph per line.
+    !Series_contributor - this is taken from the contributors list on the
+        track's page. It has a special format: "Last,First" where each person
+        is listed on a separate line.
+    !Sample_instrument_model - this is a bit more difficult, as it is
+        technically supposed to be a per-sample field. Occasionally this
+        appears in the metaDb for newer tracks; if so, it's automatically
+        added in. Otherwise it must either be specified on a whole-series
+        basis or added to the metadata. In many cases, we don't actually know
+        all of them. This is okay: GEO will allow us to submit whatever
+        information we do have, and they can take care of the rest. The
+        instrumentModels dictionary below lists the allowable input values
+        (the keys); the values map to the "human readable" names used by GEO.
+
+You can supply all of the above information in an additional file and specify
+it using the -r (replace) option:
+    mkGeoPkg -r somefile hg19 wgEncodeSomeTrack
+
+The replace file must be organized with each of the above key-value pairs as
+in a soft file:
+    !Series_summary = some summary stuff ...
+    !Series_summary = another paragraph of summary ...
+    !Series_overall_design = ...
+    !Series_contributor = Researcher,Some
+    !Series_contributor = Researcher,Another
+    !Sample_instrument_model = Illumina_GA2
+
+There is a template for this, named replaceTemplate in the directory.
+
+You may need to submit only part of the track; in this case you can specify
+experiment ID numbers to include, either singly or as hyphenated ranges:
+    mkGeoPkg hg19 wgEncodeSomeTrack 140 150 160-170
+
+One problem you may run into while running the script is that the DataType is
+not valid. This means that the huge dictionary called datatypes below isn't
+filled in for that entry. If you can get hold of the information, modify the
+script and push the changes back out.
+
+You may also get an error when trying to submit MicroArray data. This is to
+be expected: MicroArray is currently not functional at all. We currently have
+no way to map our data to the format expected by GEO, so we've punted on this
+issue for the time being.
+
+Once you've successfully run the script, you'll have generated a soft file and
+a script that will handle the uploading process. All of this will be put into
+a directory named 'database_wgEncodeSomeTrack_year-month-day'. To begin the
+upload, simply cd into the directory and run upload.sh.
This will start the +aspera copy program, copying files, a files list, and the soft file itself. + +For each submission, you need to email GEO. Our current contact is: + Steve Wilhite, wilhite@ncbi.nlm.nih.gov + +You must specify which track you're submitting. GEO will only allow us 1TB of +space dedicated to ENCODE, so you must break down submissions larger than 1TB +and only submit as many submissions as they have room to process at any given +time. In a few days, GEO will get back to you with a list of accession numbers +which need to be put back into our metadata (see encodeAddGeoAccessions). + +""" + +class DataType(object): + + def __init__(self, molecule, strategy, source, selection, soft): + self.molecule = molecule + self.strategy = strategy + self.source = source + self.selection = selection + self.soft = soft + +datatypes = { + 'Cage': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'CAGE', soft.HighThroughputSoftFile), + 'ChipSeq': DataType('genomic DNA', 'ChIP-Seq', 'genomic', 'ChIP', soft.HighThroughputSoftFile), + 'DnaPet': DataType('genomic DNA', 'OTHER', 'genomic', 'size fractionation', soft.HighThroughputSoftFile), + 'DnaseDgf': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), + 'DnaseSeq': DataType('genomic DNA', 'DNase-Hypersensitivity', 'genomic', 'DNase', soft.HighThroughputSoftFile), + 'FaireSeq': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), + 'MethylSeq': DataType('genomic DNA', 'MRE-Seq', 'genomic', 'Restriction Digest', soft.HighThroughputSoftFile), + 'MethylRrbs': DataType('genomic DNA', 'Bisulfite-Seq', 'genomic', 'Reduced Representation', soft.HighThroughputSoftFile), + 'Orchid': DataType('genomic DNA', 'OTHER', 'genomic', 'other', soft.HighThroughputSoftFile), + 'Proteogenomics': DataType('protein', 'mass spectrometry-based proteogenomic mapping', 'protein', 'chromatographically fractionated peptides', soft.HighThroughputSoftFile), + 'RnaPet': DataType('OVERRIDE RNA', 'OTHER', 'transcriptomic', 'other', soft.HighThroughputSoftFile), + 'RnaSeq': DataType('OVERRIDE RNA', 'RNA-Seq', 'transcriptomic', 'cDNA', soft.HighThroughputSoftFile), + + #these need to be curated + '5C': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'AffyExonArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', soft.MicroArraySoftFile), + 'Bip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Cluster': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Cnv': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Combined': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Genotype': DataType('genomic DNA', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Gencode': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'ChiaPet': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Mapability': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'MethylArray': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'NRE': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'Nucleosome': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RnaChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RipGeneSt': DataType('OVERRIDE RNA', 'REPLACE', 'transcriptomic', 'RNA binding protein antibody', soft.HighThroughputSoftFile), #this isn't correct + 'RipTiling': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None), + 'RipChip': DataType('REPLACE', 'REPLACE', 'REPLACE', 
'REPLACE', None),
+    'RipSeq': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'Switchgear': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None),
+    'TfbsValid': DataType('REPLACE', 'REPLACE', 'REPLACE', 'REPLACE', None)
+}
+
+#compare this to the source in datatype to give GP ids depending on the type
+gpIds = {
+    'human genomic': '63443',
+    'human transcriptomic': '30709',
+    'human protein': '63447',
+
+    'mouse genomic': '63471',
+    'mouse transcriptomic': '66167',
+    'mouse protein': '63475'
+}
+
+cvDetails = {
+    'cell': [ 'organism', 'description', 'karyotype', 'lineage', 'sex' ],
+    'antibody': [ 'antibodyDescription', 'targetDescription', 'vendorName', 'vendorId' ]
+}
+
+#if the term appears in the mdb, it must override the value in the cv
+cvOverride = [ 'sex' ]
+
+#talk to Venkat lol
+cvPretend = { 'antibody Input': 'control' }
+
+#if it's not in cvDetails, which things should we check by default
+cvDefaults = [ 'description' ]
+
+mdbWhitelist = [
+    'age',
+    'bioRep',
+    'control',
+    'controlId',
+    'fragSize',
+    'labExpId',
+    'labVersion',
+    'mapAlgorithm',
+    'obtainedBy',
+    'phase',
+    'readType',
+    'region',
+    'replicate',
+    'restrictionEnzyme',
+    'run',
+    'softwareVersion',
+    'spikeInPool',
+    'strain'
+]
+
+# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields
+# first we check rnaExtractMapping. If it's not there, we use the localization. This is because (at present)
+# polyA is the most important trait; otherwise it's going to be nonPolyA, which GEO doesn't accept.
+rnaExtractMapping = {
+    'shortPolyA': 'polyA RNA',
+    'longPolyA': 'polyA RNA',
+    'polyA': 'polyA RNA'
+}
+
+localizationMapping = {
+    'cytosol': 'cytoplasmic RNA',
+    'polysome': 'cytoplasmic RNA',
+    'membraneFraction': 'cytoplasmic RNA',
+    'mitochondria': 'cytoplasmic RNA',
+    'nucleus': 'nuclear RNA',
+    'nucleolus': 'nuclear RNA',
+    'nucleoplasm': 'nuclear RNA',
+    'nuclearMatrix': 'nuclear RNA',
+    'chromatin': 'nuclear RNA',
+    'cell': 'total RNA'
+}
+
+# map our instrument names to GEO's names
+instrumentModels = {
+    'Illumina_GA2x': 'Illumina Genome Analyzer II',
+    'Illumina_GA2': 'Illumina Genome Analyzer II',
+    'Illumina_HiSeq_2000': 'Illumina HiSeq 2000',
+    'Illumina_GA1': 'Illumina Genome Analyzer',
+    'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II',
+    'SOLiD_Unknown': 'SOLiD',
+    'Unknown': 'Illumina Genome Analyzer'
+}
+
+
+def isRawFile(file):
+    return (file.extension == 'fastq' or file.extension == 'fasta')
+
+def isSupplimentaryFile(file):
+    return not isRawFile(file)
+
+def sampleTitle(stanza, expVars, warn=False):
+    concat = stanza[expVars[0]].replace('-m', '')
+    for expVar in expVars[1:len(expVars)]:
+        if expVar in stanza and stanza[expVar] != 'None':
+            concat += '_' + stanza[expVar]
+        elif warn:
+            print 'warning: %s is None or not in %s' % (expVar, stanza.name)
+    return concat
+
+def linkName(file, track):
+    return '%s_%s' % (track.database, file.name)
+
+def createMappings(mdb):
+    expIds = dict()
+    geoMapping = dict()
+    expVars = None
+    series = None
+    datatype = None
+
+    for stanza in mdb.itervalues():
+
+        if 'objType' in stanza and stanza['objType'] == 'composite':
+            series = stanza
+            expVars = stanza['expVars'].split(',')
+            continue
+
+        if 'expId' not in stanza:
+            print stanza.name + ': no expId'
+            continue
+
+        if 'geoSampleAccession' not in stanza or 1:
+            # if this hasn't been submitted to GEO yet, we'll add it to the submission list
+            if stanza['expId'] not in expIds:
+                expIds[stanza['expId']] = list()
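+            # collect every metaDb stanza that shares this expId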
+ + expIds[stanza['expId']].append(stanza) + + else: + # otherwise we keep track of the geo number for partially submitted samples + if stanza['expId'] not in geoMapping: + geoMapping[stanza['expId']] = stanza['geoSampleAccession'] + elif geoMapping[stanza['expId']] != 'Inconsistent' and geoMapping[stanza['expId']] != stanza['geoSampleAccession']: + geoMapping[stanza['expId']] = 'Inconsistent' + print stanza.name + ': inconsistent geo mapping' + + if datatype == None and 'dataType' in stanza: + datatype = stanza['dataType'] + elif datatype != None and 'dataType' in stanza and datatype != stanza['dataType']: + raise KeyError(stanza.name + ': inconsistent data type') + + try: + dt = datatype + datatype = datatypes[dt] + datatype.name = dt + except KeyError: + raise KeyError(datatype) + + return expIds, expVars, geoMapping, series, datatype + + +def createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit): + + if 'geoSeriesAccession' in series: + print 'Existing series ' + series['composite'] + ' using geoSeriesAccession ' + series['geoSeriesAccession'] + return + + print 'Writing series ' + series['composite'] + + seriesStanza = soft.SeriesStanza() + seriesStanza['^SERIES'] = series['composite'] + seriesStanza['!Series_title'] = compositeTrack.trackDb[compositeTrack.name]['longLabel'] #STILL INCORRECT + + if '!Series_summary' in replace: + seriesStanza['!Series_summary'] = replace['!Series_summary'] + else: + print 'warning: no series summary found. Please include in replace file.' + seriesStanza['!Series_summary'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no summary' + + if '!Series_overall_design' in replace: + seriesStanza['!Series_overall_design'] = replace['!Series_overall_design'] + else: + print 'no series overall design found. Please include in replace file.' 
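+        # fall back to a [REPLACE] placeholder so the soft file can still be generated and edited by hand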
+ seriesStanza['!Series_overall_design'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no overall design' + + seriesStanza['!Series_web_link'] = [ compositeTrack.url, 'http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html' ] + + if '!Series_contributor' in replace: + seriesStanza['!Series_contributor'] = replace['!Series_contributor'] + else: + seriesStanza['!Series_contributor'] = '[REPLACE]' + if audit: + print seriesStanza.name + ': no contributor' + + seriesStanza['!Series_gp_id'] = gpIds[compositeTrack.organism + ' ' + datatype.source] + + # could use !Series_variable_* and !Series_repeats_* + + seriesStanza['!Series_sample_id'] = list() + + for idNum in expIds.iterkeys(): + if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': + seriesStanza['!Series_sample_id'].append(geoMapping[idNum]) + else: + seriesStanza['!Series_sample_id'].append(sampleTitle(expIds[idNum][0], expVars)) + + softfile[series['composite']] = seriesStanza + +def createHighThroughputSoftFile(compositeTrack, cv, expIds, expVars, geoMapping, series, datatype, replace, audit): + + print 'Creating HighThroughput soft file' + + softfile = soft.HighThroughputSoftFile() + fileList = list() + + createSeries(softfile, compositeTrack, expIds, expVars, geoMapping, series, datatype, replace, audit) + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = soft.HighThroughputSampleStanza() + + sample['^SAMPLE'] = sampleTitle(firstStanza, expVars, 1) + sample['!Sample_type'] = 'SRA' + sample['!Sample_title'] = sample['^SAMPLE'] + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + count = 1 + + #figure out if the instrument model is consistent across the entire sample + instrumentModel = None + for stanza in expId: + if 'seqPlatform' in stanza: + if instrumentModel == None: + instrumentModel = stanza['seqPlatform'] + else: + if instrumentModel != stanza['seqPlatform']: + instrumentModel = None + if audit: + print 'expId' + str(expId) + ': inconsistent instrument model' + break + + for stanza in expId: + + for fname in stanza['fileName'].split(','): + file = compositeTrack.files[fname] + + if isRawFile(file): + sample['!Sample_raw_file_' + str(count)] = linkName(file, compositeTrack) + sample['!Sample_raw_file_type_' + str(count)] = file.extension + + if file.md5sum != None: + sample['!Sample_raw_file_checksum_' + str(count)] = file.md5sum + + if instrumentModel == None and 'seqPlatform' in stanza: + sample['!Sample_raw_file_instrument_model_' + str(count)] = stanza['seqPlatform'] + + fileList.append(file) + count = count + 1 + + count = 1 + + for stanza in expId: + + for fname in stanza['fileName'].split(','): + file = compositeTrack.files[fname] + + if isSupplimentaryFile(file): + sample['!Sample_supplementary_file_' + str(count)] = linkName(file, compositeTrack) + + if file.md5sum != None: + sample['!Sample_supplementary_file_checksum_' + str(count)] = file.md5sum + + sample['!Sample_supplementary_file_build_' + str(count)] = compositeTrack.database + + if instrumentModel == None and 'seqPlatform' in stanza: + sample['!Sample_supplementary_file_instrument_model_' + str(count)] = stanza['seqPlatform'] + + fileList.append(file) + count = count + 1 + + sample['!Sample_source_name'] = firstStanza['cell'] + sample['!Sample_organism'] = compositeTrack.organism + + sample['!Sample_characteristics'] = list() + allVars = expVars + mdbWhitelist + + for var in 
allVars: + if var in firstStanza: + foobar = var + sample['!Sample_characteristics'].append(var + ': ' + firstStanza[var]) + for pretend in cvPretend.iterkeys(): + if var + ' ' + firstStanza[var] == pretend: + foobar = cvPretend[pretend] + if foobar in cvDetails: + for cvVar in cvDetails[foobar]: + if cvVar in cvOverride and cvVar in firstStanza: + sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + firstStanza[cvVar]) + elif cvVar in cv[firstStanza[var]]: + sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) + else: + for cvVar in cvDefaults: + if firstStanza[var] in cv and cvVar in cv[firstStanza[var]]: + sample['!Sample_characteristics'].append(var + ' ' + cvVar + ': ' + cv[firstStanza[var]][cvVar]) + + sample['!Sample_biomaterial_provider'] = cv[firstStanza['cell']]['vendorName'] + + if 'treatment' in firstStanza: + sample['!Sample_treatment_protocol'] = firstStanza['treatment'] + + if 'protocol' in cv[firstStanza['cell']]: + for protocol in cv[firstStanza['cell']]['protocol'].split(' '): + key, val = protocol.split(':') + if key == 'ENCODE' or key == cv[firstStanza['lab']]['labPi']: + sample['!Sample_growth_protocol'] = val + + if datatype.molecule == 'OVERRIDE RNA': + if firstStanza['rnaExtract'] in rnaExtractMapping: + sample['!Sample_molecule'] = rnaExtractMapping[firstStanza['rnaExtract']] + elif firstStanza['localization'] in localizationMapping: + sample['!Sample_molecule'] = localizationMapping[firstStanza['localization']] + + else: + sample['!Sample_molecule'] = datatype.molecule + + if '!Sample_instrument_model' in replace and replace['!Sample_instrument_model'][0] == 'Unknown': + sample['!Sample_extract_protocol'] = 'Instrument model unknown. ("%s" specified by default). For more information, see %s' % (instrumentModels[replace['!Sample_instrument_model'][0]], compositeTrack.url) + else: + sample['!Sample_extract_protocol'] = compositeTrack.url + sample['!Sample_library_strategy'] = datatype.strategy + sample['!Sample_library_source'] = datatype.source + sample['!Sample_library_selection'] = datatype.selection + + # if the instrumentModel is consistent, just use that + # otherwise take the first seqPlatform value from metadata + # if that still fails, check the replacement file + # finally just make it say [REPLACE] + if instrumentModel != None: + sample['!Sample_instrument_model'] = instrumentModel + else: + for stanza in expId: + if 'seqPlatform' in stanza: + sample['!Sample_instrument_model'] = instrumentModels[stanza['seqPlatform']] + break + if '!Sample_instrument_model' not in sample: + if '!Sample_instrument_model' in replace: + sample['!Sample_instrument_model'] = instrumentModels[replace['!Sample_instrument_model'][0]] + if '!Sample_instrument_model' not in sample: + sample['!Sample_instrument_model'] = '[REPLACE]' + if audit: + print stanza.name + ': no instrument' + + sample['!Sample_data_processing'] = compositeTrack.url + + if idNum in geoMapping and geoMapping[idNum] != 'Inconsistent': + sample['!Sample_geo_accession'] = geoMapping[idNum] + + softfile[firstStanza['metaObject']] = sample + + return softfile, fileList + + +def createMicroArraySoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): + + raise KeyError('microarray') + + print 'Creating MicroArray soft file' + + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + + for idNum in 
expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = MicroArraySampleStanza() + + sample['^SAMPLE'] = sampleTitle(firstStanza, expVars) + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + sample['!Sample_title'] = concat + + count = 1 + + for stanza in expId: + + if isSupplimentaryFile(stanza['fileName']): + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + str(count)] = md5sums[stanza['fileName']] + + fileList.append(stanza['fileName']) + count = count + 1 + + # sample['!Sample_table'] = KeyOptional # CEL file + sample['!Sample_source_name_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_organism_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_characteristics_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + # sample['!Sample_biomaterial_provider_ch'] = KeyZeroPlusNumbered + # sample['!Sample_treatment_protocol_ch'] = KeyZeroPlusNumbered + # sample['!Sample_growth_protocol_ch'] = KeyZeroPlusNumbered + sample['!Sample_molecule_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_extract_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_label_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_label_protocol_ch_1'] = '[REPLACE]' #KeyOnePlusNumbered + sample['!Sample_hyb_protocol'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_scan_protocol'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_data_processing'] = '[REPLACE]' #KeyOnePlus + sample['!Sample_description'] = '[REPLACE]' #KeyZeroPlus + sample['!Sample_platform_id'] = '[REPLACE]' + # sample['!Sample_geo_accession'] = KeyOptional + # sample['!Sample_anchor'] = KeyRequired SAGE ONLY + # sample['!Sample_type'] = KeyRequired SAGE ONLY + # sample['!Sample_tag_count'] = KeyRequired SAGE ONLY + # sample['!Sample_tag_length'] = KeyRequired SAGE ONLY + sample['!Sample_table_begin'] = '' + sample['!Sample_table_end'] = '' + + softfile[firstStanza['accession']] = sample + + return softfile, fileList + + +def createSpecialSoftFile(database, composite, organism, compositeUrl, mdb, cv, track, md5sums, expIds, expVars, geoMapping, series, datatype): + softfile = SoftFile() + fileList = list() + + createSeries(softfile, composite, compositeUrl, track, expIds, geoMapping, series) + + for idNum in expIds.iterkeys(): + + expId = expIds[idNum] + firstStanza = expId[0] + print 'Writing sample ' + firstStanza['metaObject'] + ' (' + idNum + ')' + sample = HighThroughputSampleStanza() + + hasbigwig = 0 + for stanza in expId: + + if getFileType(stanza['fileName']) == 'bigWig': + hasbigwig = 1 + + if hasbigwig == 0: + continue + + sample['^SAMPLE'] = firstStanza['geoSampleAccession'] + + if 'geoSeriesAccession' in series: + sample['!Sample_series_id'] = series['geoSeriesAccession'] + + sample['!Sample_geo_accession'] = firstStanza['geoSampleAccession'] + + count = 1 + + for stanza in expId: + + if getFileType(stanza['fileName']) == 'bigWig': + sample['!Sample_supplementary_file_' + str(count)] = stanza['fileName'] + + if 'checksum' in stanza: + sample['!Sample_supplementary_file_checksum_' + str(count)] = stanza['checksum'] + elif md5sums != None and stanza['fileName'] in md5sums: + sample['!Sample_supplementary_file_checksum_' + 
str(count)] = md5sums[stanza['fileName']] + + # sample['!Sample_supplementary_file_build_' + str(count)] = database + + fileList.append(stanza['fileName']) + count = count + 1 + + softfile[firstStanza['geoSampleAccession']] = sample + + return softfile, fileList + +def main(): + + parser = argparse.ArgumentParser(description = 'Prepares a submission to GEO. Creates a soft file and shell script with the correct call to aspera.') + parser.add_argument('-t', '--trackPath', help='Overrides the default track path ~/kent/src/hg/makeDb/trackDb/') + parser.add_argument('-r', '--replace', help='Give the name of a file that has contents to be used to replace unspecified tags in metadata (description, contributers, etc) and instrument model') + parser.add_argument('-a', '--audit', action='store_true', default=False, help='Instead of building the files, will just give you a list of errors') + parser.add_argument('database', help='The database, typically hg19 or mm9') + parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance') + parser.add_argument('expIds', nargs='*', help='Any number of expIds separated by spaces, you can also specify a range by using a hyphen, "140 150 160-170" for instance, or leave blank to specify the entire file') + + if len(sys.argv) == 1: + parser.print_usage() + return + + args = parser.parse_args(sys.argv[1:]) + + compositeTrack = track.CompositeTrack(args.database, args.composite, args.trackPath) + + cvPath = compositeTrack.trackPath + 'cv/alpha/cv.ra' + controlledVocab = cv.CvFile(cvPath) + + replace = dict() + if args.replace != None: + for line in open(args.replace): + if line == '': + continue + key, val = map(str.strip, line.split('=', 1)) + if key not in replace: + replace[key] = list() + replace[key].append(val) + + + ids = list() + + for id in args.expIds: + if '-' in id: + start, end = id.split('-', 1) + ids.extend(range(int(start), int(end) + 1)) + else: + ids.append(int(id)) + + expIds, expVars, geoMapping, series, datatype = createMappings(compositeTrack.alphaMetaDb) + + submission = dict() + if len(ids) == 0: + submission = expIds + else: + for expId in ids: + submission[str(expId)] = expIds[str(expId)] + + expIdStr = ' ' + for id in args.expIds: + expIdStr = expIdStr + id + ',' + expIdStr = expIdStr[:len(expIdStr) - 1] + print 'Generating soft using expIds ' + expIdStr + + if datatype.soft == soft.HighThroughputSoftFile: + softfile, fileList = createHighThroughputSoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype, replace, args.audit) + elif datatype.soft == soft.MicroArraySoftFile: + softfile, fileList = createMicroArraySoftFile(compositeTrack, controlledVocab, submission, expVars, geoMapping, series, datatype) + else: + raise KeyError('unsupported type ' + datatype.name) + + if not args.audit: + print 'Creating directory' + + d = datetime.datetime.today() + datestring = '%4d-%02d-%02d' % (d.year, d.month, d.day) + + dirname = '%s_%s_%s/' % (compositeTrack.database, compositeTrack.name, datestring) + linkdirname = '%s_%s/' % (compositeTrack.database, compositeTrack.name) + + os.mkdir(dirname) + os.mkdir(dirname + linkdirname) + + print 'Writing file' + + outfileName = '%s%s_%s.soft' % (dirname + linkdirname, compositeTrack.database, compositeTrack.name) + outfile = open(outfileName, 'w') + outfile.write(str(softfile)) + fileslistname = '%sfiles.txt' % (dirname + linkdirname) + fileslist = open(fileslistname, 'w') + scriptname = '%supload.sh' % dirname + outscript = 
open(scriptname, 'w') + + outscript.write('#!/bin/sh\n\n') + outscript.write('/cluster/home/mmaddren/.aspera/connect/bin/ascp -i ~/encode_geo_key/encode_geo_key.ppk --symbolic-links=follow -QTdr -l300m %s asp-geo@upload.ncbi.nlm.nih.gov:ENCODE\n' % linkdirname) + outscript.close() + + for file in fileList: + if not os.path.exists(file.path): + print IOError(file.path + ' does not exist') + elif not args.audit: + linkname = linkName(file, compositeTrack) + linkpath = linkdirname + linkname + os.symlink(file.fullname, dirname + linkpath) + + #outscript.write(linkpath + ' \\\n') + fileslist.write(linkname + '\n') + + if not args.audit: + #outscript.write() + + fileslist.close() + + os.system('chmod +x ' + scriptname) + + print 'Finished!' + +if __name__ == '__main__': + main() \ No newline at end of file
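Beyond the command-line use shown in the docstring, the same pipeline can be driven from Python. A minimal sketch, assuming a host where the ucscgenomics package and the trackDb checkout are available; the database and composite names are illustrative, and createMappings/createHighThroughputSoftFile are the functions defined in the script above:

    from ucscgenomics import track, cv

    # build the composite track and controlled vocabulary the same way main() does
    composite = track.CompositeTrack('hg19', 'wgEncodeSomeTrack', None)
    vocab = cv.CvFile(composite.trackPath + 'cv/alpha/cv.ra')

    # group the metaDb stanzas by expId and pick up the series/datatype information
    expIds, expVars, geoMapping, series, datatype = createMappings(composite.alphaMetaDb)

    # the final True is the audit flag: the builder prints warnings for missing
    # summary, contributor, and instrument fields while assembling the soft file
    softfile, files = createHighThroughputSoftFile(composite, vocab, expIds, expVars,
                                                   geoMapping, series, datatype, dict(), True)
    print str(softfile)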