d827b292f2da39b290412c402752b1bd68b37d2e lrnassar Wed Oct 25 17:00:56 2023 -0700 Adding a new condition to check for another regex expression. I found that two of the broken examples had spaces in the URL, so try to match on that and translate them. I also found one which had three backslashes incorrectly, so try to fix that too. I added it to a new loop so as not to affect the original functionality. This seems to cover all of the broken cases we had noted. Refs #31963 diff --git src/hg/encode/getTrackReferences/getTrackReferences src/hg/encode/getTrackReferences/getTrackReferences index 234dd03..2b99cb9 100755 --- src/hg/encode/getTrackReferences/getTrackReferences +++ src/hg/encode/getTrackReferences/getTrackReferences @@ -1,190 +1,203 @@ #!/usr/bin/env python2.7 import sys, os, urllib2, argparse, re, cgi, textwrap, requests from xml.etree import ElementTree as ET def parsePubmed(doc, id): infoDict = dict() infoDict['url'] = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % id attribList = ['PubDate', 'Source', 'Title', 'Volume', 'Issue', 'Pages', 'SO', 'CollectiveName'] for element in doc: if element.tag != "DocSum": continue items = element.findall("Item") for i in items: if i.attrib['Name'] == 'AuthorList': infoDict['Authors'] = list() for j in i: infoDict['Authors'].append(j.text) continue if i.attrib['Name'] == "ArticleIds": for j in i: if (j.attrib['Name'] == 'pubmed'): infoDict[j.attrib['Name']] = j.text if (j.attrib['Name'] == 'pmc'): infoDict[j.attrib['Name']] = j.text if (j.attrib['Name'] == 'doi'): infoDict[j.attrib['Name']] = j.text continue if i.attrib['Name'] in attribList: infoDict[i.attrib['Name']] = i.text return infoDict def parsePmc(doc, id): for node in doc.iter("ArticleId"): foundPubMedId = 0 for child in node: if child.tag == "IdType" and child.text == "pmid": foundPubMedId = 1 if foundPubMedId == 1 and child.tag == "Value": return parseInit(child.text) sys.stderr.write("Unable to find pubmed id for pubmed central id: %s\n" % id) sys.exit() def parseInit(id): urlbase = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?" db = "Pubmed" url = urlbase + "db=%s" % db + "&id=%s" % id if re.match("^PMC", id): db = "PMC" id = re.sub("PMC", "", id) url = urlbase + "db=%s" % db + "&id=%s&version=2.0" % id sys.stderr.write("Accessing %s\n" % url) fetch = urllib2.urlopen(url) doc = ET.XML(fetch.read()) if db == "Pubmed": infoDict = parsePubmed(doc, id) elif db == "PMC": infoDict = parsePmc(doc, id) return infoDict def htmlEscape(str): return cgi.escape(str).encode('ascii', 'xmlcharrefreplace') def makeHtml(infoDict, plain, verbose, doi): authors = list() authcount = 0 etal = 0 if 'CollectiveName' in infoDict: authors.append(infoDict['CollectiveName']) authcount = 1 for i in infoDict['Authors']: if authcount == 10 and not verbose: etal = 1 break authors.append(i) authcount = authcount + 1 sep = ", " authStr = sep.join(authors) authStr = htmlEscape(authStr) if etal and not plain: authStr = authStr + " et al" if etal and plain: authStr = authStr + " et al" authStr = authStr + "." title = re.sub("\.$", "", infoDict['Title']) if 'Source' in infoDict: journal = infoDict['Source'] elif 'Journal' in infoDict: journal = infoDict['Journal'] if 'SO' in infoDict: dateStr = re.sub(";:", ":", infoDict['SO']) else: dateStr1 = infoDict['PubDate'] dateStr = "" if 'Volume' in infoDict: dateStr = dateStr + ";%s" % infoDict['Volume'] if 'Issue' in infoDict: dateStr = dateStr + "(%s)" % infoDict['Issue'] if 'Pagination' in infoDict: dateStr = dateStr + "%s" % infoDict['Pagination'] elif 'Pages' in infoDict: dateStr = dateStr + ":%s" %infoDict['Pages'] dateStr = re.sub("\s+","", dateStr) dateStr = dateStr1 + dateStr if not re.search("\.$", dateStr): dateStr = dateStr + "." # construct hyperlinks for PMID and PMCID (if it exists) idStr = "PMID: %s" % (htmlEscape(infoDict['url']), infoDict['pubmed']) if 'pmc' in infoDict: idStr = idStr + "; PMC: %s" % (infoDict['pmc'], infoDict['pmc']) if doi and 'doi' in infoDict: idStr = ("DOI: %s; " % (htmlEscape(infoDict['doi']), infoDict['doi'] ) ) + idStr # now that the pubmed link has been constructed, we can overwrite the url in infoDict with the original article URL # make sure the portlet that generates outlinks for PubMed didn't fail. If it did, try again until # it works or we give up. # Note: no longer sure this is necessary - seems like NCBI is doing something different now, but at # any rate urllib2 no longer seems to fetch the links list in at least some cases. Requests works. + origUrl = infoDict['url'] for try_count in range(10): origComment = "" + infoDict['url'] = origUrl fetch = requests.get(infoDict['url']) + try: m = re.search('

\s*

', doc) # another possible detection of failed portlet p = re.search('Default output of portlet NCBIPageSection', doc) if p is None: break + except: + try: + m = re.search('

\s*") htmlLines.append("%s" % authStr) htmlLines.append("" % htmlEscape(infoDict['url'])) htmlLines.append("%s." % htmlEscape(title)) htmlLines.append("%s. %s" % (htmlEscape(journal), dateStr)) htmlLines.append("%s" % idStr) htmlLines.append("

") if plain: htmlLines = list() idStr = "PMID: %s" % infoDict['pubmed']; if 'pmc' in infoDict: idStr = idStr + "; PMC: %s" % infoDict['pmc'] if 'doi' in infoDict: idStr = ("DOI: %s; " % infoDict['doi']) + idStr htmlLines.append("%s %s. %s. %s %s. %s" % (authStr, title, journal, dateStr, idStr, infoDict['url'])) if not plain and origComment: htmlLines.append("%s" % origComment) sep = "\n" space = " " processLines = list() for i in htmlLines: processLines.append(textwrap.fill(i, 100)) htmlStr = sep.join(processLines) return htmlStr, authStr def main(): parser = argparse.ArgumentParser( description='Turns PubMed Ids and PubMedCentral Ids into GB formatted citations in html', epilog='example: getTrackReferences PMC3039671 21347206' ) parser.add_argument('ids', metavar='IDs', type=str, nargs='+', help='The list of PubMed and PubMedCentral Ids') parser.add_argument('-p', '--plain', action="store_true", default=0, help="Output the references in plain-text instead of html.") parser.add_argument('-v', '--verbose', action="store_true", default=0, help="Output the full author list instead of truncating after the first 10.") parser.add_argument('-d', '--doi', action="store_true", default=0, help="Include a DOI link.") args = parser.parse_args() ids = args.ids references = dict() for i in ids: infoDict = parseInit(i) html, authStr = makeHtml(infoDict, args.plain, args.verbose, args.doi) references[authStr] = html print "\n" for i in sorted(references.keys()): print "%s\n" % references[i] if __name__ == "__main__": main()