1c36c9c04ec6dd169e4ae9ec0d4f2934e829aca5 jcasper Sun Mar 2 12:14:17 2025 -0800
Pushing getTrackReferences to python3, refs #35314

diff --git src/hg/encode/getTrackReferences/getTrackReferences src/hg/encode/getTrackReferences/getTrackReferences
index 2b99cb9ac57..0896cc5219d 100755
--- src/hg/encode/getTrackReferences/getTrackReferences
+++ src/hg/encode/getTrackReferences/getTrackReferences
@@ -1,203 +1,216 @@
-#!/usr/bin/env python2.7
-import sys, os, urllib2, argparse, re, cgi, textwrap, requests
+#!/usr/bin/env python
+import sys, os, urllib.request, argparse, re, html, textwrap, requests
 from xml.etree import ElementTree as ET
 
 def parsePubmed(doc, id):
     infoDict = dict()
     infoDict['url'] = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % id
     attribList = ['PubDate', 'Source', 'Title', 'Volume', 'Issue', 'Pages', 'SO', 'CollectiveName']
     for element in doc:
         if element.tag != "DocSum":
             continue
         items = element.findall("Item")
         for i in items:
             if i.attrib['Name'] == 'AuthorList':
                 infoDict['Authors'] = list()
                 for j in i:
                     infoDict['Authors'].append(j.text)
                 continue
             if i.attrib['Name'] == "ArticleIds":
                 for j in i:
                     if (j.attrib['Name'] == 'pubmed'):
                         infoDict[j.attrib['Name']] = j.text
                     if (j.attrib['Name'] == 'pmc'):
                         infoDict[j.attrib['Name']] = j.text
                     if (j.attrib['Name'] == 'doi'):
                         infoDict[j.attrib['Name']] = j.text
                 continue
             if i.attrib['Name'] in attribList:
                 infoDict[i.attrib['Name']] = i.text
     return infoDict
 
 def parsePmc(doc, id):
     for node in doc.iter("ArticleId"):
         foundPubMedId = 0
         for child in node:
             if child.tag == "IdType" and child.text == "pmid":
                 foundPubMedId = 1
             if foundPubMedId == 1 and child.tag == "Value":
                 return parseInit(child.text)
     sys.stderr.write("Unable to find pubmed id for pubmed central id: %s\n" % id)
     sys.exit()
 
 def parseInit(id):
     urlbase = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
     db = "Pubmed"
     url = urlbase + "db=%s" % db + "&id=%s" % id
     if re.match("^PMC", id):
         db = "PMC"
         id = re.sub("PMC", "", id)
         url = urlbase + "db=%s" % db + "&id=%s&version=2.0" % id
     sys.stderr.write("Accessing %s\n" % url)
-    fetch = urllib2.urlopen(url)
+    fetch = urllib.request.urlopen(url)
     doc = ET.XML(fetch.read())
     if db == "Pubmed":
         infoDict = parsePubmed(doc, id)
     elif db == "PMC":
         infoDict = parsePmc(doc, id)
     return infoDict
 
 def htmlEscape(str):
-    return cgi.escape(str).encode('ascii', 'xmlcharrefreplace')
+    return html.escape(str)
 
 def makeHtml(infoDict, plain, verbose, doi):
     authors = list()
     authcount = 0
     etal = 0
     if 'CollectiveName' in infoDict:
         authors.append(infoDict['CollectiveName'])
         authcount = 1
     for i in infoDict['Authors']:
         if authcount == 10 and not verbose:
             etal = 1
             break
         authors.append(i)
         authcount = authcount + 1
     sep = ", "
     authStr = sep.join(authors)
+    if (not plain):
         authStr = htmlEscape(authStr)
     if etal and not plain:
-        authStr = authStr + " et al"
+        authStr = authStr + (" et al")
     if etal and plain:
-        authStr = authStr + " et al"
-    authStr = authStr + "."
-    title = re.sub("\.$", "", infoDict['Title'])
+        authStr = authStr + (" et al")
+    authStr = authStr + (".")
+    title = re.sub(r"\.$", "", infoDict['Title'])
     if 'Source' in infoDict:
         journal = infoDict['Source']
     elif 'Journal' in infoDict:
         journal = infoDict['Journal']
     if 'SO' in infoDict:
         dateStr = re.sub(";:", ":", infoDict['SO'])
     else:
         dateStr1 = infoDict['PubDate']
         dateStr = ""
         if 'Volume' in infoDict:
             dateStr = dateStr + ";%s" % infoDict['Volume']
         if 'Issue' in infoDict:
             dateStr = dateStr + "(%s)" % infoDict['Issue']
         if 'Pagination' in infoDict:
             dateStr = dateStr + "%s" % infoDict['Pagination']
         elif 'Pages' in infoDict:
             dateStr = dateStr + ":%s" %infoDict['Pages']
-        dateStr = re.sub("\s+","", dateStr)
+        dateStr = re.sub(r"\s+","", dateStr)
         dateStr = dateStr1 + dateStr
-    if not re.search("\.$", dateStr):
+    if not re.search(r"\.$", dateStr):
         dateStr = dateStr + "."
     # construct hyperlinks for PMID and PMCID (if it exists)
+    if (not plain):
         idStr = "PMID: <a href=\"%s\" target=\"_blank\">%s</a>" % (htmlEscape(infoDict['url']), infoDict['pubmed'])
         if 'pmc' in infoDict:
             idStr = idStr + "; PMC: <a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/%s/\" target=\"_blank\">%s</a>" % (infoDict['pmc'], infoDict['pmc'])
         if doi and 'doi' in infoDict:
             idStr = ("DOI: <a href=\"https://doi.org/%s\" target=\"_blank\">%s</a>; " % (htmlEscape(infoDict['doi']), infoDict['doi'] ) ) + idStr
+    else:
+        idStr = "PMID: <a href=\"%s\" target=\"_blank\">%s</a>" % (infoDict['url'], infoDict['pubmed'])
+        if 'pmc' in infoDict:
+            idStr = idStr + "; PMC: <a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/%s/\" target=\"_blank\">%s</a>" % (infoDict['pmc'], infoDict['pmc'])
+        if doi and 'doi' in infoDict:
+            idStr = ("DOI: <a href=\"https://doi.org/%s\" target=\"_blank\">%s</a>; " % (infoDict['doi'], infoDict['doi'] ) ) + idStr
     # now that the pubmed link has been constructed, we can overwrite the url in infoDict with the original article URL
     # make sure the portlet that generates outlinks for PubMed didn't fail. If it did, try again until
     # it works or we give up.
     # Note: no longer sure this is necessary - seems like NCBI is doing something different now, but at
     # any rate urllib2 no longer seems to fetch the links list in at least some cases. Requests works.
     origUrl = infoDict['url']
     for try_count in range(10):
         origComment = ""
         infoDict['url'] = origUrl
         fetch = requests.get(infoDict['url'])
         try:
-            m = re.search('
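
The hunk above reduces to three mechanical python2-to-python3 substitutions: urllib2 -> urllib.request, cgi.escape -> html.escape, and raw strings for regex patterns. A minimal standalone sketch of the same mappings (function names here are illustrative only, not part of getTrackReferences):

#!/usr/bin/env python
# Sketch of the py2 -> py3 substitutions applied in the commit above;
# these helper names are hypothetical, chosen only to isolate each change.
import html
import re
import urllib.request

def fetchUrl(url):
    # py2: fetch = urllib2.urlopen(url)  ->  py3: urllib.request.urlopen(url)
    with urllib.request.urlopen(url) as fetch:
        return fetch.read()

def htmlEscape(s):
    # py2's cgi.escape(s).encode('ascii', 'xmlcharrefreplace') returned bytes
    # and left quotes alone; py3's html.escape returns str and also escapes
    # quotes by default, so strings destined for attribute values behave differently.
    return html.escape(s)

def stripTrailingPeriod(s):
    # py2 silently accepted "\." inside a plain string literal; py3 deprecates
    # unrecognized escape sequences, so regex patterns become raw strings:
    # "\.$" -> r"\.$"
    return re.sub(r"\.$", "", s)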