d827b292f2da39b290412c402752b1bd68b37d2e
lrnassar
  Wed Oct 25 17:00:56 2023 -0700
Add a new condition that checks a second, looser regex. I found that two of the broken examples had spaces in the URL, so match on those and translate the spaces to %20. I also found one which incorrectly had three slashes, so collapse those to two. The fallback runs only after the original regex fails to match, so as not to affect the original functionality; it covers all of the broken cases we had noted. Also fix the portlet-failure check, which referenced the undefined variable doc instead of fetch.text. Refs #31963
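
A quick sketch of what the translation does (the href here is made up, for illustration only):

    # e.g. a broken outlink emitted by the portlet:
    href = "https:///www.example.org/my paper.pdf"
    fixed = href.replace("&amp;", "&").replace(" ", "%20").replace("///", "//")
    # fixed == "https://www.example.org/my%20paper.pdf"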

diff --git src/hg/encode/getTrackReferences/getTrackReferences src/hg/encode/getTrackReferences/getTrackReferences
index 234dd03..2b99cb9 100755
--- src/hg/encode/getTrackReferences/getTrackReferences
+++ src/hg/encode/getTrackReferences/getTrackReferences
@@ -1,190 +1,207 @@
 #!/usr/bin/env python2.7
 import sys, os, urllib2, argparse, re, cgi, textwrap, requests
 from xml.etree import ElementTree as ET
 
 def parsePubmed(doc, id):
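+    # Pull citation fields (authors, title, journal, article ids) out of an esummary DocSum.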
     infoDict = dict()
     infoDict['url'] = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % id
     attribList = ['PubDate', 'Source', 'Title', 'Volume', 'Issue', 'Pages', 'SO', 'CollectiveName']
     for element in doc:
         if element.tag != "DocSum":
             continue
         items = element.findall("Item")
         for i in items:
             if i.attrib['Name'] == 'AuthorList':
                 infoDict['Authors'] = list()
                 for j in i:
                     infoDict['Authors'].append(j.text)
                 continue
             if i.attrib['Name'] == "ArticleIds":
                 for j in i:
                     if (j.attrib['Name'] == 'pubmed'):
                         infoDict[j.attrib['Name']] = j.text
                     if (j.attrib['Name'] == 'pmc'):
                         infoDict[j.attrib['Name']] = j.text
                     if (j.attrib['Name'] == 'doi'):
                         infoDict[j.attrib['Name']] = j.text
                 continue
             if i.attrib['Name'] in attribList:
                 infoDict[i.attrib['Name']] = i.text
 
     return infoDict
 
 def parsePmc(doc, id):
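+    # A PMC esummary record carries the matching PubMed id; find it and re-query via parseInit.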
     for node in doc.iter("ArticleId"):
         foundPubMedId = 0
         for child in node:
             if child.tag == "IdType" and child.text == "pmid":
                 foundPubMedId = 1
             if foundPubMedId == 1 and child.tag == "Value":
                 return parseInit(child.text) 
             
     sys.stderr.write("Unable to find pubmed id for pubmed central id: %s\n" % id)
     sys.exit()
 
 def parseInit(id):
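+    # Fetch the NCBI eutils esummary record for an id; ids prefixed with PMC go to the PMC db.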
     urlbase = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
     db = "Pubmed"
     url = urlbase + "db=%s" % db + "&id=%s" % id
     if re.match("^PMC", id):
         db = "PMC"
         id = re.sub("PMC", "", id)
         url = urlbase + "db=%s" % db + "&id=%s&version=2.0" % id
     sys.stderr.write("Accessing %s\n" % url)
     fetch = urllib2.urlopen(url)
 
     doc = ET.XML(fetch.read())
     if db == "Pubmed":
         infoDict = parsePubmed(doc, id)
     elif db == "PMC":
         infoDict = parsePmc(doc, id)
 
     return infoDict
 
 def htmlEscape(str):
     return cgi.escape(str).encode('ascii', 'xmlcharrefreplace')
 
 def makeHtml(infoDict, plain, verbose, doi):
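+    # Format the fetched citation fields as a Genome Browser reference, in html or plain text.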
     authors = list()
     authcount = 0
     etal = 0
     if 'CollectiveName' in infoDict:
         authors.append(infoDict['CollectiveName'])
         authcount = 1
     for i in infoDict['Authors']:
         if authcount == 10 and not verbose:
             etal = 1
             break
         authors.append(i)
         authcount = authcount + 1
     sep = ", "
     authStr = sep.join(authors)
     authStr = htmlEscape(authStr)
     if etal and not plain:
         authStr = authStr + " <em>et al</em>"
     if etal and plain:
        authStr = authStr + " et al"
     authStr = authStr + "."
     title = re.sub("\.$", "", infoDict['Title'])
     if 'Source' in infoDict:
         journal = infoDict['Source']
     elif 'Journal' in infoDict:
         journal = infoDict['Journal']
     if 'SO' in infoDict:
         dateStr = re.sub(";:", ":", infoDict['SO'])
     else:
         dateStr1 = infoDict['PubDate']
         dateStr = ""
         if 'Volume' in infoDict:
             dateStr = dateStr + ";%s" % infoDict['Volume']
         if 'Issue' in infoDict:
             dateStr = dateStr + "(%s)" % infoDict['Issue']
         if 'Pagination' in infoDict:
             dateStr = dateStr + "%s" % infoDict['Pagination']
         elif 'Pages' in infoDict:
             dateStr = dateStr + ":%s" %infoDict['Pages']
         dateStr = re.sub("\s+","", dateStr)
         dateStr = dateStr1 + dateStr
     if not re.search("\.$", dateStr):
         dateStr = dateStr + "."
     # construct hyperlinks for PMID and PMCID (if it exists)
     idStr = "PMID: <a href=\"%s\" target=\"_blank\">%s</a>" % (htmlEscape(infoDict['url']), infoDict['pubmed'])
     if 'pmc' in infoDict:
         idStr = idStr + "; PMC: <a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/%s/\" target=\"_blank\">%s</a>" % (infoDict['pmc'], infoDict['pmc'])
     if doi and 'doi' in infoDict:
         idStr = ("DOI: <a href=\"https://doi.org/%s\" target=\"_blank\">%s</a>; " % (htmlEscape(infoDict['doi']), infoDict['doi'] ) ) + idStr
 
     # now that the pubmed link has been constructed, we can overwrite the url in infoDict with the original article URL
 
     # make sure the portlet that generates outlinks for PubMed didn't fail.  If it did, try again until
     # it works or we give up.
     # Note: no longer sure this is necessary - seems like NCBI is doing something different now, but at
     # any rate urllib2 no longer seems to fetch the links list in at least some cases.  Requests works.
+    origUrl = infoDict['url']
     for try_count in range(10):
         origComment = ""
+        # Restore the PubMed URL in case a previous retry overwrote it with a partial link.
+        infoDict['url'] = origUrl
         fetch = requests.get(infoDict['url'])
         m = re.search('<div class="full-text-links-list">\s*<a\s+(class="[^"]*"\s+)?href="(\S+)"', fetch.text)
         if m:
             if m.group(2):
+                # If the regex matched, group(2) is non-empty by construction; the check is redundant but harmless.
                 infoDict['url'] = m.group(2).replace("&amp;", "&")
             break
+        # New condition: some outlink hrefs contain literal spaces, which \S+ cannot match, and one
+        # contained three slashes after the scheme.  Retry with a looser pattern that stops at the
+        # closing quote, then percent-encode the spaces and collapse "///" into "//".  This runs
+        # only when the strict pattern above fails, so the original behavior is unchanged.
+        m = re.search('<div class="full-text-links-list">\s*<a\s+(class="[^"]*"\s+)?href="([^"]+)"', fetch.text)
+        if m:
+            if m.group(2):
+                infoDict['url'] = m.group(2).replace("&amp;", "&").replace(" ", "%20").replace("///", "//")
+            break
         else:
             origComment = "<!-- Can't find original article link for %s -->" % infoDict['url']
             #n = re.search('<div class="icons"></div>', doc) # another possible detection of failed portlet
-            p = re.search('Default output of portlet NCBIPageSection', doc)
+            p = re.search('Default output of portlet NCBIPageSection', fetch.text)
             if p is None:
                 break
     else:
         print "Failed to fetch complete links from NCBI after 10 tries.  Try again later or just use the PubMed paper link."
 
     htmlLines = list()
     htmlLines.append("<p>")
     htmlLines.append("%s" % authStr)
     htmlLines.append("<a href=\"%s\" target=\"_blank\">" % htmlEscape(infoDict['url']))
     htmlLines.append("%s</a>." % htmlEscape(title))
     htmlLines.append("<em>%s</em>. %s" % (htmlEscape(journal), dateStr))
     htmlLines.append("%s" % idStr)
     htmlLines.append("</p>")
     if plain:
         htmlLines = list()
         idStr = "PMID: %s" % infoDict['pubmed'];
         if 'pmc' in infoDict:
             idStr = idStr + "; PMC: %s" % infoDict['pmc']
         if 'doi' in infoDict:
             idStr = ("DOI: %s; " % infoDict['doi']) + idStr
         htmlLines.append("%s %s. %s. %s %s. %s" % (authStr, title, journal, dateStr, idStr, infoDict['url']))
     if not plain and origComment:
         htmlLines.append("%s" %  origComment)
     sep = "\n"
     space = " "
 
     processLines = list()
 
     for i in htmlLines:
         processLines.append(textwrap.fill(i, 100))
     htmlStr = sep.join(processLines)
     return htmlStr, authStr
 
 def main():
     parser = argparse.ArgumentParser(
         description='Turns PubMed Ids and PubMedCentral Ids into GB formatted citations in html',
         epilog='example: getTrackReferences PMC3039671 21347206'
         )
     parser.add_argument('ids', metavar='IDs', type=str, nargs='+', help='The list of PubMed and PubMedCentral Ids')
     parser.add_argument('-p', '--plain', action="store_true", default=0, help="Output the references in plain-text instead of html.")
     parser.add_argument('-v', '--verbose', action="store_true", default=0, help="Output the full author list instead of truncating after the first 10.")
     parser.add_argument('-d', '--doi', action="store_true", default=0, help="Include a DOI link.")
     args = parser.parse_args()
     ids = args.ids
     references = dict()
 
     for i in ids:
         infoDict = parseInit(i)
         html, authStr = makeHtml(infoDict, args.plain, args.verbose, args.doi)
         references[authStr] = html
     print "\n"
     for i in sorted(references.keys()):
         print "%s\n" % references[i]
 
 if __name__ == "__main__":
     main()