a44e3254174152b2d4b3f241d1935654cd139181 vsmalladi Tue May 8 10:11:21 2012 -0700 Renamed library from ucscgenomics to ucscGb. Redmine #7029. diff --git python/lib/ucscgenomics/geo.py python/lib/ucscgenomics/geo.py deleted file mode 100644 index 90d0e22..0000000 --- python/lib/ucscgenomics/geo.py +++ /dev/null @@ -1,96 +0,0 @@ -import urllib2, re, datetime - -# if the molecule is RNA, we need to map our data into !Sample_molecule, which only takes certain fields -# first we check rnaExtractMapping. If its not there, we use the localization. This is because (at current) -# polyA is the most important trait, otherwise its going to be nonPolyA which GEO doesn't accept. -rnaExtractMapping = { - 'shortPolyA': 'polyA RNA', - 'longPolyA': 'polyA RNA', - 'polyA': 'polyA RNA' -} - -localizationMapping = { - 'cytosol': 'cytoplasmic RNA', - 'polysome': 'cytoplasmic RNA', - 'membraneFraction': 'cytoplasmic RNA', - 'mitochondria': 'cytoplasmic RNA', - 'nucleus': 'nuclear RNA', - 'nucleolus': 'nuclear RNA', - 'nucleoplasm': 'nuclear RNA', - 'nuclearMatrix': 'nuclear RNA', - 'chromatin': 'nuclear RNA', - 'cell': 'total RNA' -} - -# map our instrument names to GEO's names -instrumentModels = { - 'Illumina_GA2x': 'Illumina Genome Analyzer II', - 'Illumina_GA2': 'Illumina Genome Analyzer II', - 'Illumina_HiSeq_2000': 'Illumina HiSeq 2000', - 'Illumina_GA1': 'Illumina Genome Analyzer', - 'Illumina_GA1_or_GA2': 'Illumina Genome Analyzer, Illumina Genome Analyzer II', - 'SOLiD_Unknown': 'SOLiD', - 'AB_SOLiD_3.5': 'AB SOLiD 3.5', - 'Unknown': 'Illumina Genome Analyzer' -} - -class Submission(object): - - @property - def accessions(self): - return self._accessions - - @property - def dateSubmitted(self): - return self._submitted - - @property - def dateUpdated(self): - return self._updated - - def __init__(self, geoId): - html = getHtml(geoId) - self._accessions = getGSE(html) - self._submitted = getDateSubmitted(html) - self._updated = getDateUpdated(html) - - def getSample(self, geoId): - html = getHtml(geoId) - return getGSM(html) - -def getHtml(geoId): - try: - response = urllib2.urlopen('http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s' % geoId) - except: - return None - return response.read() - -def getGSE(html): - gsms = re.findall('(GSM[0-9]+)\n([^<]+)', html) - d = dict() - for gsm in gsms: - d[gsm[1]] = gsm[0] - return d - -def getGSM(html): - suppfiles = re.findall('([^<]+)', html) - d = dict() - for f in suppfiles: - print f - fname = f.rsplit('_', 1)[1] - d[fname] = fname - return d - - -def getDateSubmitted(html): - datestr = re.search('Submission date\n([^<]+)', html) - if datestr == None: - return None - return datetime.datetime.strptime(datestr.group(1), '%b %d, %Y') - -def getDateUpdated(html): - datestr = re.search('Last update date\n([^<]+)', html) - if datestr == None: - return None - return datetime.datetime.strptime(datestr.group(1), '%b %d, %Y') -