2e5b5de5dbe263a3ae6de1787d118d9586fac0ce cline Wed Jun 22 18:11:14 2011 -0700 Added the functionality to download multiple validation sub-files and combine them as a single pdf file diff --git src/hg/encode/antibodyWikiParser.py src/hg/encode/antibodyWikiParser.py index 4f60d38..c8d1114 100755 --- src/hg/encode/antibodyWikiParser.py +++ src/hg/encode/antibodyWikiParser.py @@ -1,42 +1,47 @@ #!/usr/bin/env python # # antibodyWikiParser.py: parse any to-be-registered antibodies from the wiki, # and download any newly-approved validation documents. # import base64 from BeautifulSoup import BeautifulSoup import HTMLParser from optparse import OptionParser import re +import shlex +import subprocess import sys import urllib2 from ucscgenomics.rafile.RaFile import * def stripLeadingTrailingWhitespace(text): """Given a string, remove any leading or trailing whitespace""" text = re.sub("^( )+", "", text) text = re.sub("( )+$", "", text) return(text) def getContents(field): """Given an HTML field, return the contents""" - return(stripLeadingTrailingWhitespace(field.contents[0])) + contents = stripLeadingTrailingWhitespace(field.contents[0]) + if len(contents) == 0: + contents = "missing" + return(contents) def processSource(orderEntry): """ Given the contents of the 'Source' column, parse out the vendor name, vendor ID, and order URL if any. We are assuming that from the textual contents of the cell (which excludes the order URL), the last word will be the vendor ID and the preceding words will make up the vendor name. Return a list containing, in order, vendorName, vendorId, and orderUrl. """ urlClauses = orderEntry.findAll("a") if len(urlClauses) == 0: orderUrl = "" orderInfo = getContents(orderEntry) else: orderUrl = urlClauses[0]["href"] @@ -86,31 +91,31 @@ def processValidation(validationCell, species, antibody, lab, downloadsDir, username, password, wikiBaseUrl): """ (1) Given the contents of the 'Validation' cell, plus the species, antibody, and lab, assemble a validation filename that fits the naming convention. (2) If there's any evidence of a filename, set the validation string to it. Otherwise, set the validation string to "missing". (3) If there is a hyperlink to a validation file, download it into the filename assembled here, in the indicated download directory. Assumptions: - users are illogical - the number of ways users can do unexpected things cannot be counted """ validationCellContents = getContents(validationCell) - if validationCellContents is "missing" or len(validationCell.findAll("a")) == 0: + if validationCellContents is "missing" and len(validationCell.findAll("a")) == 0: validation = "missing" else: # # In the off-chance that the user has specified the validation type as the label # in the wiki table, where the filename is linked, pull out the validation type. # Assemble a validation label string for the CGIs, containing the antibody name, # and validation type in parentheses. # If the user has specified the antibody name in the label, it will need to be stripped # out to ensure that it's not included twice. Parentheses complicate things. # At the writing of this script (May 3, 2011), the naming convention for antibodies # is <targetName>_(<vendorId>), such as TRIM37_(SC-49548). When stripping out # the antibody name, be sure to escape any parentheses, so that they can be processed # as literals in the regular expression clause. # validationType = validationCellContents @@ -120,49 +125,66 @@ searchReplaceString = searchReplaceString + "\(" \ + antibodyPieces[1] + "\)" validationType = re.sub(searchReplaceString, "", validationType) # # Remove any extraneous parentheses, any any leading or trailing whitespace validationType = re.sub("[\(\)]", "", validationType) validationType = re.sub("^( )+", "", re.sub("( )+$", "", validationType)) validationLabel = antibody + "(" + validationType + ")" # # Here, download the validation document, if provided, into the expected filename. # When generating the document name, strip any parentheses # from the antibody name. Parentheses in filenames are just bad... documentName = species + "_" + re.sub("[\(\)]", "", antibody) \ + "_validation_" + lab + ".pdf" + targetDocumentName = downloadsDir + "/" + documentName + # + # If a single hyperlinked PDF exists, download the document to the + # downloads directory. If multiple hyperlinked PDFs exist, download the + # documents separately and combine them into a single file. If one or more + # hyperlinked document is not a PDF, give a warning but don't combine them. # - # If a hyperlink exists, download the document to the downloads directory urlClauses = validationCell.findAll("a") - if len(urlClauses) > 0: - if urlClauses[0].has_key("href"): - url = urlClauses[0]["href"] + downloadedFileList = "" + for ii in range(0,len(urlClauses)): + if urlClauses[ii].has_key("href"): + url = urlClauses[ii]["href"] # # If this document is not in PDF format (e.g. if it's a word doc, # print out a warning message and don't try to download it, but return the # target filename (generated above). If the document is a PDF, # download it and save it in the filename generated above. if re.search(".pdf$", url): validationData = accessWiki(wikiBaseUrl + url, username, password) if len(validationData) > 0: - newValidationFile = open(downloadsDir + "/" + documentName, "wb") + downloadFilename = targetDocumentName + str(ii) + newValidationFile = open(downloadFilename, "wb") newValidationFile.write(validationData) newValidationFile.close() + downloadedFileList = downloadedFileList + " " + downloadFilename else: print "Warning: not downloading", url, ": not a PDF file"; + if len(urlClauses) == 1: + renameCmd = "mv %s %s" % (downloadedFileList, targetDocumentName) + subprocess.Popen(renameCmd, shell=True) + #subprocess.Popen("rm %s" % (downloadedFileList), shell=True) + elif len(urlClauses) > 1: + combineCmd = "pdftk %s cat output %s" \ + % (downloadedFileList, targetDocumentName) + subprocess.Popen(combineCmd, shell=True) + #subprocess.Popen("rm %s" % (downloadedFileList), shell=True) validation = validationLabel + ":" + documentName validation = re.sub("(\s)+", "", validation) return(validation) # # Process an antibody table entry, in which the order of the columns is # (Antibody, AntibodyDescription, TargetDescription, Source, Lab, Lot(s), # FactorId, ValidationDocument) # def processAntibodyEntry(entry, species, downloadsDirectory, username, password, wikiBaseUrl): """ For a single wiki table entry, generate an appropriate RA file stanza and