a789c33bb65dbe0efbf9c98784ee4b561e08c976
cline
  Mon Jun 27 23:27:25 2011 -0700
Added a script for scraping the cell type wiki table, producing CV stanzas and downloading approved cell protocol documents
diff --git src/hg/encode/antibodyWikiParser.py src/hg/encode/antibodyWikiParser.py
index c8d1114..f8bc976 100755
--- src/hg/encode/antibodyWikiParser.py
+++ src/hg/encode/antibodyWikiParser.py
@@ -8,32 +8,32 @@
 import base64
 from BeautifulSoup import BeautifulSoup
 import HTMLParser
 from optparse import OptionParser
 import re
 import shlex
 import subprocess
 import sys
 import urllib2 
 from ucscgenomics.rafile.RaFile import *
 
 
 
 def stripLeadingTrailingWhitespace(text):
     """Given a string, remove any leading or trailing whitespace"""
-    text = re.sub("^( )+", "", text)
-    text = re.sub("( )+$", "", text)
+    text = re.sub("^([" + string.whitespace + "])+", "", text)
+    text = re.sub("([" + string.whitespace + "])+$", "", text)
     return(text)
 
 def getContents(field):
     """Given an HTML field, return the contents"""
     contents = stripLeadingTrailingWhitespace(field.contents[0])
     if len(contents) == 0:
         contents = "missing"
     return(contents)
 
 def processSource(orderEntry):
     """ 
     Given the contents of the 'Source' column, parse out the vendor name, vendor ID,
     and order URL if any.  We are assuming that from the textual contents of the 
     cell (which excludes the order URL), the last word will be the vendor ID and the
     preceding words will make up the vendor name.  Return a list containing, in order,