e7c480dfff7785e7c2efd40aa13d55747affe961 jcasper Wed Feb 1 12:17:50 2023 -0800
Added a DOI option to getTrackReferences and updated its full-text link detection to match the
current PubMed page structure. No ticket.

diff --git src/hg/encode/getTrackReferences/getTrackReferences src/hg/encode/getTrackReferences/getTrackReferences
index 992318f..909bf53 100755
--- src/hg/encode/getTrackReferences/getTrackReferences
+++ src/hg/encode/getTrackReferences/getTrackReferences
@@ -1,39 +1,41 @@
 #!/usr/bin/env python2.7
-import sys, os, urllib2, argparse, re, cgi, textwrap
+import sys, os, urllib2, argparse, re, cgi, textwrap, requests
 from xml.etree import ElementTree as ET

 def parsePubmed(doc, id):
     infoDict = dict()
     infoDict['url'] = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % id
     attribList = ['PubDate', 'Source', 'Title', 'Volume', 'Issue', 'Pages', 'SO', 'CollectiveName']
     for element in doc:
         if element.tag != "DocSum":
             continue
         items = element.findall("Item")
         for i in items:
             if i.attrib['Name'] == 'AuthorList':
                 infoDict['Authors'] = list()
                 for j in i:
                     infoDict['Authors'].append(j.text)
                 continue
             if i.attrib['Name'] == "ArticleIds":
                 for j in i:
                     if (j.attrib['Name'] == 'pubmed'):
                         infoDict[j.attrib['Name']] = j.text
                     if (j.attrib['Name'] == 'pmc'):
                         infoDict[j.attrib['Name']] = j.text
+                    if (j.attrib['Name'] == 'doi'):
+                        infoDict[j.attrib['Name']] = j.text
                 continue
             if i.attrib['Name'] in attribList:
                 infoDict[i.attrib['Name']] = i.text
     return infoDict

 def parsePmc(doc, id):
     for node in doc.iter("ArticleId"):
         foundPubMedId = 0
         for child in node:
             if child.tag == "IdType" and child.text == "pmid":
                 foundPubMedId = 1
             if foundPubMedId == 1 and child.tag == "Value":
                 return parseInit(child.text)
@@ -50,31 +52,31 @@
     url = urlbase + "db=%s" % db + "&id=%s&version=2.0" % id
     sys.stderr.write("Accessing %s\n" % url)
     fetch = urllib2.urlopen(url)
     doc = ET.XML(fetch.read())
     if db == "Pubmed":
         infoDict = parsePubmed(doc, id)
     elif db == "PMC":
         infoDict = parsePmc(doc, id)
     return infoDict

 def htmlEscape(str):
     return cgi.escape(str).encode('ascii', 'xmlcharrefreplace')

-def makeHtml(infoDict, plain, verbose):
+def makeHtml(infoDict, plain, verbose, doi):
     authors = list()
     authcount = 0
     etal = 0
     if 'CollectiveName' in infoDict:
         authors.append(infoDict['CollectiveName'])
         authcount = 1
     for i in infoDict['Authors']:
         if authcount == 10 and not verbose:
             etal = 1
             break
         authors.append(i)
         authcount = authcount + 1
     sep = ", "
     authStr = sep.join(authors)
     authStr = htmlEscape(authStr)
@@ -97,87 +99,93 @@
         dateStr = dateStr + ";%s" % infoDict['Volume']
     if 'Issue' in infoDict:
         dateStr = dateStr + "(%s)" % infoDict['Issue']
     if 'Pagination' in infoDict:
         dateStr = dateStr + "%s" % infoDict['Pagination']
     elif 'Pages' in infoDict:
         dateStr = dateStr + ":%s" %infoDict['Pages']
     dateStr = re.sub("\s+","", dateStr)
     dateStr = dateStr1 + dateStr
     if not re.search("\.$", dateStr):
         dateStr = dateStr + "."
     # construct hyperlinks for PMID and PMCID (if it exists)
     idStr = "PMID: <a href=\"%s\" target=\"_blank\">%s</a>" % (htmlEscape(infoDict['url']), infoDict['pubmed'])
     if 'pmc' in infoDict:
         idStr = idStr + "; PMC: <a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/%s/\" target=\"_blank\">%s</a>" % (infoDict['pmc'], infoDict['pmc'])
+    if doi and 'doi' in infoDict:
+        idStr = ("DOI: <a href=\"https://doi.org/%s\" target=\"_blank\">%s</a>; " % (htmlEscape(infoDict['doi']), infoDict['doi'] ) ) + idStr
     # now that the pubmed link has been constructed, we can overwrite the url in infoDict with the original article URL
     # make sure the portlet that generates outlinks for PubMed didn't fail. If it did, try again until
     # it works or we give up.
+    # Note: no longer sure this is necessary - seems like NCBI is doing something different now, but at
+    # any rate urllib2 no longer seems to fetch the links list in at least some cases. Requests works.
     for try_count in range(10):
         origComment = ""
-        fetch = urllib2.urlopen(infoDict['url'])
-        doc = fetch.read()
-        m = re.search('…\s*', doc)
         # another possible detection of failed portlet
         p = re.search('Default output of portlet NCBIPageSection', doc)
         if p is None:
             break
     else:
         print "Failed to fetch complete links from NCBI after 10 tries. Try again later or just use the PubMed paper link."
     htmlLines = list()
     htmlLines.append("<p>")
") htmlLines.append("%s" % authStr) htmlLines.append("" % htmlEscape(infoDict['url'])) htmlLines.append("%s." % htmlEscape(title)) htmlLines.append("%s. %s" % (htmlEscape(journal), dateStr)) htmlLines.append("%s" % idStr) htmlLines.append("

") if plain: htmlLines = list() idStr = "PMID: %s" % infoDict['pubmed']; if 'pmc' in infoDict: idStr = idStr + "; PMC: %s" % infoDict['pmc'] + if 'doi' in infoDict: + idStr = ("DOI: %s; " % infoDict['doi']) + idStr htmlLines.append("%s %s. %s. %s %s. %s" % (authStr, title, journal, dateStr, idStr, infoDict['url'])) if not plain and origComment: htmlLines.append("%s" % origComment) sep = "\n" space = " " processLines = list() for i in htmlLines: processLines.append(textwrap.fill(i, 100)) htmlStr = sep.join(processLines) return htmlStr, authStr def main(): parser = argparse.ArgumentParser( description='Turns PubMed Ids and PubMedCentral Ids into GB formatted citations in html', epilog='example: getTrackReferences PMC3039671 21347206' ) parser.add_argument('ids', metavar='IDs', type=str, nargs='+', help='The list of PubMed and PubMedCentral Ids') parser.add_argument('-p', '--plain', action="store_true", default=0, help="Output the references in plain-text instead of html.") parser.add_argument('-v', '--verbose', action="store_true", default=0, help="Output the full author list instead of truncating after the first 10.") + parser.add_argument('-d', '--doi', action="store_true", default=0, help="Include a DOI link.") args = parser.parse_args() ids = args.ids references = dict() for i in ids: infoDict = parseInit(i) - html, authStr = makeHtml(infoDict, args.plain, args.verbose) + html, authStr = makeHtml(infoDict, args.plain, args.verbose, args.doi) references[authStr] = html print "\n" for i in sorted(references.keys()): print "%s\n" % references[i] if __name__ == "__main__": main()