community-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject svn commit: r1878441 - in /comdev/projects.apache.org/trunk/scripts/cronjobs: parsecommitteeinfo.py urlutils.py
Date Wed, 03 Jun 2020 15:33:43 GMT
Author: sebb
Date: Wed Jun  3 15:33:42 2020
New Revision: 1878441

URL: http://svn.apache.org/viewvc?rev=1878441&view=rev
Log:
Use urlutils for HTTP io

Modified:
    comdev/projects.apache.org/trunk/scripts/cronjobs/parsecommitteeinfo.py
    comdev/projects.apache.org/trunk/scripts/cronjobs/urlutils.py

Modified: comdev/projects.apache.org/trunk/scripts/cronjobs/parsecommitteeinfo.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/trunk/scripts/cronjobs/parsecommitteeinfo.py?rev=1878441&r1=1878440&r2=1878441&view=diff
==============================================================================
--- comdev/projects.apache.org/trunk/scripts/cronjobs/parsecommitteeinfo.py (original)
+++ comdev/projects.apache.org/trunk/scripts/cronjobs/parsecommitteeinfo.py Wed Jun  3 15:33:42 2020
@@ -19,15 +19,12 @@ if sys.hexversion < 0x03000000:
 import io
 import os
 import os.path
-import urllib.request
 import xml.etree.ElementTree as ET
 import xml.dom.minidom as minidom
 import datetime
 import sendmail
 
-# urllib is currently broken and will fail on cert verify. Revert once box has been upgraded.
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
+from urlutils import URLget, URLexists
 
 sys.path.append("..") # module committee_info is in parent directory
 import committee_info
@@ -75,15 +72,6 @@ def handleChild(el):
             retval[k] = v
     return tag, retval
 
-# Simple-minded check of URL
-def head(url):
-    req = urllib.request.Request(url, method="HEAD")
-    try:
-        resp = urllib.request.urlopen(req)
-        return True
-    except:
-        return False
-
 pmcs = {}
 pmcDataUrls = {} # id -> url
 
@@ -100,7 +88,7 @@ for loc in xmldoc.getElementsByTagName('
     url = loc.childNodes[0].data
     try:
         if url.startswith('http'):
-            rdf = urllib.request.urlopen(url).read()
+            rdf = URLget(url).read()
         else:
             with open("../../data/%s" % url, 'r', encoding='utf-8') as f:
                 rdf = f.read()
@@ -182,7 +170,7 @@ for group in sorted(committees, key=keyo
                 committeeId = group
 
             img = "http://www.apache.org/logos/res/%s/default.png" % committeeId
-            if not skipImageTest and not head(img):
+            if not skipImageTest and not URLexists(img):
                 print("WARN: could not find logo: %s" % (img))
                 
             committeeCount += 1

Modified: comdev/projects.apache.org/trunk/scripts/cronjobs/urlutils.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/trunk/scripts/cronjobs/urlutils.py?rev=1878441&r1=1878440&r2=1878441&view=diff
==============================================================================
--- comdev/projects.apache.org/trunk/scripts/cronjobs/urlutils.py (original)
+++ comdev/projects.apache.org/trunk/scripts/cronjobs/urlutils.py Wed Jun  3 15:33:42 2020
@@ -55,14 +55,18 @@ def mod_date(t):
         return None
     return time.strftime(_HTTP_TIME_FORMAT, time.gmtime(t))
 
-def getIfNewer(url, sinceTime, encoding=None, errors=None, silent=False, debug=False):
+def getIfNewer(url, sinceTime=None, encoding=None, errors=None, silent=False, debug=False, method='GET'):
     """
-        Get a URL if it is not newer
+        Get a URL if it is newer
     
         @param url: the url to fetch (required)
-        @param sinceTime: the most recent Last-Modified string (required, format as per mod_date())
+        @param sinceTime: the most recent Last-Modified string (format as per mod_date())
         @param encoding: the encoding to use (default 'None')
         @param errors: If encoding is provided, this specifies the on-error action (e.g. 'ignore')
+        @param silent: whether to print the url and headers (default True)
+        @param debug: whether to print additional info (default False)
+        @param method: the HTTP method to use (default GET)
+
         @return: (lastMod, response)
         - lastMod: the Last-Modified string (from sinceTime if the URL is not later) may be None
         - response: the HTTPResponse (encoding == None) or TextIOBase object.
@@ -76,7 +80,7 @@ def getIfNewer(url, sinceTime, encoding=
     response = None
     try:
         if not silent: print("%s %s" % (url, headers))
-        req = Request(url, headers=headers)
+        req = Request(url, headers=headers, method=method)
         resp = urlopen(req, timeout=URL_TIMEOUT)
         # Debug - detect why json sometimes returned as HTML but no error code
         if debug and not silent: print("STATUS %s" % resp.getcode()) # Works for Py2/3
@@ -98,6 +102,24 @@ def getIfNewer(url, sinceTime, encoding=
             raise
     return lastMod, response
 
+def URLget(url, sinceTime=None, encoding=None, errors=None, silent=True, debug=False, method='GET'):
+    """
+    Get the URL response as for getIfNewer, but default to silent=True and omit lastMod from reply
+    """
+    _, response = getIfNewer(url, sinceTime=sinceTime, encoding=encoding, errors=errors, silent=silent, debug=debug, method=method)
+    return response
+
+def URLexists(url):
+    """
+    Does the URL exist?
+    Uses HEAD to check
+    """
+    try:
+        getIfNewer(url, method='HEAD', silent=True)
+        return True
+    except:
+        return False
+
 def findRelPath(relpath):
     for d in ['./','../','../../']: # we may located at same level or 1 or 2 below
         dir = join(d,relpath)
@@ -263,6 +285,12 @@ class UrlCache(object):
             return open(target, 'rb')
 
 if __name__ == '__main__':
+    print(URLexists('https://www.apache.org/'))
+    print(URLexists('https://www.apache.org/__'))
+    print(URLexists('https://__.apache.org/'))
+    resp = URLget('https://www.apache.org/')
+    print(resp.headers)
+
     try:
         fc = UrlCache(cachedir='x')
         raise Error("Expected OSError")



Mime
View raw message