a789c33bb65dbe0efbf9c98784ee4b561e08c976 cline Mon Jun 27 23:27:25 2011 -0700
Added a script for scraping the cell type wiki table, producing CV stanzas and downloading approved cell protocol documents
diff --git src/hg/encode/antibodyWikiParser.py src/hg/encode/antibodyWikiParser.py
index c8d1114..f8bc976 100755
--- src/hg/encode/antibodyWikiParser.py
+++ src/hg/encode/antibodyWikiParser.py
@@ -8,32 +8,32 @@ import base64
 from BeautifulSoup import BeautifulSoup
 import HTMLParser
 from optparse import OptionParser
 import re
 import shlex
 import subprocess
 import sys
 import urllib2
 from ucscgenomics.rafile.RaFile import *
 
 def stripLeadingTrailingWhitespace(text):
     """Given a string, remove any leading or trailing whitespace"""
-    text = re.sub("^( )+", "", text)
-    text = re.sub("( )+$", "", text)
+    text = re.sub("^([" + string.whitespace + "])+", "", text)
+    text = re.sub("([" + string.whitespace + "])+$", "", text)
     return(text)
 
 def getContents(field):
     """Given an HTML field, return the contents"""
     contents = stripLeadingTrailingWhitespace(field.contents[0])
     if len(contents) == 0:
         contents = "missing"
     return(contents)
 
 def processSource(orderEntry):
     """
     Given the contents of the 'Source' column, parse out the vendor name,
     vendor ID, and order URL if any. We are assuming that from the textual
     contents of the cell (which excludes the order URL), the last word will
     be the vendor ID and the preceding words will make up the vendor name.
     Return a list containing, in order,