# Provenance: commit a44e3254174152b2d4b3f241d1935654cd139181
# (vsmalladi, Tue May 8 10:11:21 2012 -0700)
# "Renamed library from ucscgenomics to ucscGb. Redmine #7029."
# File: python/lib/ucscGb/geo.py
"""Vocabulary mappings for GEO and helpers that scrape a GEO accession page.

The mapping tables translate our metadata terms into the values GEO accepts.
The functions download the HTML of a GEO accession page and pull sample
accessions, supplementary file names and submission/update dates out of it
with regular expressions.
"""

import datetime
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: keep the module-level name so ``urllib2.urlopen`` still works.
    import urllib.request as urllib2

# If the molecule is RNA, we need to map our data into !Sample_molecule,
# which only takes certain fields. First we check rnaExtractMapping. If it's
# not there, we use the localization. This is because (at present) polyA is
# the most important trait; otherwise it's going to be nonPolyA, which GEO
# doesn't accept.
rnaExtractMapping = {
    'shortPolyA': 'polyA RNA',
    'longPolyA': 'polyA RNA',
    'polyA': 'polyA RNA'
}

localizationMapping = {
    'cytosol': 'cytoplasmic RNA',
    'polysome': 'cytoplasmic RNA',
    'membraneFraction': 'cytoplasmic RNA',
    'mitochondria': 'cytoplasmic RNA',
    'nucleus': 'nuclear RNA',
    'nucleolus': 'nuclear RNA',
    'nucleoplasm': 'nuclear RNA',
    'nuclearMatrix': 'nuclear RNA',
    'chromatin': 'nuclear RNA',
    'cell': 'total RNA'
}

# Map our instrument names to GEO's names.
instrumentModels = {
    'Illumina_GA2x': 'Illumina Genome Analyzer II',
    'Illumina_GA2': 'Illumina Genome Analyzer II',
    'Illumina_HiSeq_2000': 'Illumina HiSeq 2000',
    'Illumina_GA1': 'Illumina Genome Analyzer',
    'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II',
    'SOLiD_Unknown': 'SOLiD',
    'AB_SOLiD_3.5': 'AB SOLiD 3.5',
    'Unknown': 'Illumina Genome Analyzer'
}

# Page-scraping patterns, compiled once. They depend on the exact markup of
# the accession page (including the newline between table cells).
_GSM_ROW_RE = re.compile(r'(GSM[0-9]+)</a></td>\n<td valign="top">([^<]+)</td>')
_SUPP_FILE_RE = re.compile(r'<tr valign="top"><td bgcolor="#[0-9A-F]+">([^<]+)</td>')
_SUBMITTED_RE = re.compile(r'<td>Submission date</td>\n<td>([^<]+)</td>')
_UPDATED_RE = re.compile(r'<td>Last update date</td>\n<td>([^<]+)</td>')

# Date layout used on the page, e.g. "May 08, 2012".
_DATE_FORMAT = '%b %d, %Y'


class Submission(object):
    """Snapshot of a GEO accession page, fetched at construction time."""

    @property
    def accessions(self):
        """Dict mapping each sample's title text to its GSM accession."""
        return self._accessions

    @property
    def dateSubmitted(self):
        """Submission date as a datetime, or None if not found on the page."""
        return self._submitted

    @property
    def dateUpdated(self):
        """Last update date as a datetime, or None if not found on the page."""
        return self._updated

    def __init__(self, geoId):
        """Download the page for ``geoId`` and parse it.

        If the page cannot be fetched, ``accessions`` is an empty dict and
        both dates are None.
        """
        html = getHtml(geoId)
        self._accessions = getGSE(html)
        self._submitted = getDateSubmitted(html)
        self._updated = getDateUpdated(html)

    def getSample(self, geoId):
        """Download the page for ``geoId`` and return ``getGSM`` of it."""
        html = getHtml(geoId)
        return getGSM(html)


def getHtml(geoId):
    """Return the text of the GEO accession page for ``geoId``.

    Returns None when the page cannot be opened (best-effort, as before).
    Only ``Exception`` subclasses are swallowed, so Ctrl-C and interpreter
    shutdown are no longer silently eaten.
    """
    url = 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s' % geoId
    try:
        response = urllib2.urlopen(url)
    except Exception:
        return None
    try:
        page = response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()
    # On Python 3 read() yields bytes, which the str regexes below cannot
    # search. On Python 2 bytes is str, so this branch is skipped.
    if isinstance(page, bytes) and not isinstance(page, str):
        page = page.decode('utf-8', 'replace')
    return page


def getGSE(html):
    """Return a dict of sample title -> GSM accession found in ``html``.

    Returns an empty dict when ``html`` is None or empty. If two samples
    share a title, the later one wins.
    """
    if not html:
        return {}
    return dict((title, gsm) for gsm, title in _GSM_ROW_RE.findall(html))


def getGSM(html):
    """Return a dict of supplementary file name suffixes found in ``html``.

    Each matched cell is cut at its last underscore and the part after it
    is stored. A name with no underscore is kept whole rather than raising
    IndexError. Returns an empty dict when ``html`` is None or empty.
    """
    if not html:
        return {}
    files = {}
    for name in _SUPP_FILE_RE.findall(html):
        suffix = name.rsplit('_', 1)[-1]
        # NOTE(review): key and value are both the suffix, as in the
        # original; confirm whether the full name was meant as the value.
        files[suffix] = suffix
    return files


def _findDate(pattern, html):
    """Return the date captured by ``pattern`` in ``html``, or None.

    Raises ValueError if the captured text is not in the expected layout.
    """
    if not html:
        return None
    found = pattern.search(html)
    if found is None:
        return None
    return datetime.datetime.strptime(found.group(1), _DATE_FORMAT)


def getDateSubmitted(html):
    """Return the "Submission date" of the page as a datetime, or None."""
    return _findDate(_SUBMITTED_RE, html)


def getDateUpdated(html):
    """Return the "Last update date" of the page as a datetime, or None."""
    return _findDate(_UPDATED_RE, html)