community-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject svn commit: r1714281 - /comdev/projects.apache.org/scripts/cronjobs/urlutils.py
Date Sat, 14 Nov 2015 00:27:42 GMT
Author: sebb
Date: Sat Nov 14 00:27:42 2015
New Revision: 1714281

URL: http://svn.apache.org/viewvc?rev=1714281&view=rev
Log:
POA needs this too

Added:
    comdev/projects.apache.org/scripts/cronjobs/urlutils.py   (with props)

Added: comdev/projects.apache.org/scripts/cronjobs/urlutils.py
URL: http://svn.apache.org/viewvc/comdev/projects.apache.org/scripts/cronjobs/urlutils.py?rev=1714281&view=auto
==============================================================================
--- comdev/projects.apache.org/scripts/cronjobs/urlutils.py (added)
+++ comdev/projects.apache.org/scripts/cronjobs/urlutils.py Sat Nov 14 00:27:42 2015
@@ -0,0 +1,205 @@
+"""
+   Some utilities for working with URLs
+   Works with Python2 and Python3
+"""
+
+import os
+from os.path import dirname, abspath, join, getmtime
+import shutil
+import io
+# Allow for Python2/3 differences
+try:
+    from urllib.request import urlopen, Request
+    from urllib.error import HTTPError
+    _PY3 = True
+except:
+    from urllib2 import urlopen, Request
+    from urllib2 import HTTPError
+    from io import open # needed for encoding
+    _PY3 = False
+
+import time
+import calendar
+
+# time format used in Last-Modified/If-Modified-Since HTTP headers
+_HTTP_TIME_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
+
+def touchFile(f, t):
+    if _PY3:
+        os.utime(f, times=(t, t))
+    else:
+        os.utime(f, (t, t))
+
+def mod_date(t):
+    """
+        get file mod date in suitable format for If-Modified-Since
+        e.g. Thu, 15 Nov 2012 16:38:51 GMT
+    """
+    return time.strftime(_HTTP_TIME_FORMAT, time.gmtime(t))
+
+def getIfNewer(url, sinceTime, encoding=None, errors=None):
+    """
+        Get a URL if it is not newer
+    
+        @param url: the url to fetch (required)
+        @param sinceTime: the most recent Last-Modified string (required, format as per mod_date())
+        @param encoding: the encoding to use (default 'None')
+        @param errors: If encoding is provided, this specifies the on-error action (e.g. 'ignore')
+        @return: (lastMod, response)
+        - lastMod: the Last-Modified string (from sinceTime if the URL is not later) may be None
+        - response: the HTTPResponse (encoding == None) or TextIOBase object.
+         'None' if the URL is not newer
+        @raise urllib.error.HTTPError: if URL not found or other error
+    """
+    if sinceTime:
+        headers = {"If-Modified-Since" : sinceTime}
+    else:
+        headers = {}
+    response = None
+    try:
+        req = Request(url, headers=headers)
+        resp = urlopen(req)
+        try:
+            lastMod = resp.headers['Last-Modified']
+            if not lastMod: # e.g. responses to git blob-plain URLs don't seem to have dates
+                lastMod = None
+        except KeyError: # python2 raises this for missing headers
+            lastMod = None
+        if encoding:
+            response = io.TextIOWrapper(resp, encoding=encoding, errors=errors)
+        else:
+            response = resp
+    except HTTPError as err:
+        if err.code == 304:
+            lastMod = sinceTime # preserve timestamp
+        else:
+            raise
+    return lastMod, response
+
+class UrlCache(object):
+    """
+        Creates a cache for URLs
+        @param cachedir: the cache directory to use 
+            (default data/cache; this is assumed to be at the current directory, its parent or grandparent)
+        @param interval: minimum interval between checks for updates to the URL (default 300 secs)
+            if set to -1, never checks (intended for testing only)  
+            if set to 0, always checks (primarily intended for testing)
+        @return: the instance to use with the get() method
+    """
+    # get file mod_date
+    def __file_mtime(self, filename):
+        try:
+            t = getmtime(filename)
+        except FileNotFoundError:
+            t = -1 # so cannot be confused with a valid mtime
+        return t
+
+    def __init__(self, cachedir=None, interval=300):
+        __CACHE = 'data/cache'
+        self.__interval = interval
+        self.__cachedir = None
+        if cachedir:
+            self.__cachedir = cachedir
+        else:
+            self.__cachedir = __CACHE # will be overwritten if actually found
+            for d in ['./','../','../../']: # we may located at same level or 1 or 2 below
+                dir = d + __CACHE
+                if os.path.isdir(dir):
+                    self.__cachedir = dir
+                    break
+        
+        if os.path.isdir(self.__cachedir):
+            print("Cachedir: %s" % self.__cachedir)
+        else:
+            raise OSError("Could not find cache directory '%s'" % self.__cachedir)
+
+    def __getname(self, name):
+        return join(self.__cachedir, name)
+
+    def get(self, url, name, encoding=None, errors=None):
+        """
+            Check if the filename exists in the cache.
+            If it does not, or if it does and the URL has not been checked recently,
+            then try to download the URL using If-Modified-Since.
+            The URL is downloaded to a temporary file and renamed to the filename
+            to reduce the time when the file is being updated.
+            The interval parameter is used to determine how often to check if the URL has changed.
+            (this is mainly intended to avoid excess URL requests in unit testing).
+            If this is set to -1, then the URL will only be downloaded once. 
+            @param url: the url to fetch (required)
+            @param name: the name to use in the cache (required)
+            @param encoding: the encoding to use (default None)
+            @param errors: If encoding is provided, this specifies the on-error action (e.g. 'ignore')
+                        (default None)
+            @return: the opened stream, using the encoding if specified. Otherwise opened in binary mode.
+        """
+        target=self.__getname(name)
+        fileTime = self.__file_mtime(target)
+        check = self.__getname("."+name)
+        upToDate = False
+        if fileTime >= 0:
+            if self.__interval == -1:
+                print("File %s exists and URL check has been disabled" % name)
+                upToDate = True
+            elif self.__interval == 0:
+                print("File %s exists and check interval is zero" % name)
+            else:
+                checkTime = self.__file_mtime(check)
+                now = time.time()
+                diff = now - checkTime
+                if diff < self.__interval:
+                    print("Recently checked: %d < %d, skip check" % (diff, self.__interval))
+                    upToDate = True
+                else:
+                    if checkTime >= 0:
+                        print("Not recently checked: %d > %d" % (diff, self.__interval))
+                    else:
+                        print("Not recently checked")
+        else:
+            print("Not found %s " % name)
+
+        if not upToDate:
+            sinceTime = mod_date(fileTime)
+            lastMod, response = getIfNewer(url, sinceTime)
+            if response: # we have a new version
+                if lastMod:
+                    try:
+                        lastModT = calendar.timegm(time.strptime(lastMod, _HTTP_TIME_FORMAT))
+                    except ValueError:
+                        lastModT = 0
+                else:
+                    lastModT = 0
+                
+                tmpFile = target + ".tmp"
+                with open(tmpFile,'wb') as f:
+                    shutil.copyfileobj(response, f)
+                # store the last mod time as the time of the file
+                touchFile(tmpFile, lastModT)
+                os.rename(tmpFile, target) # seems to preserve file mod time
+                if lastMod:
+                    if fileTime > 0:
+                        print("Downloaded new version of %s (%s > %s)" % (name, lastMod, sinceTime))
+                    else:
+                        print("Downloaded new version of %s" % (name))
+                else:
+                    print("Downloaded new version of %s (undated)" % (name))
+            else:
+                print("Cached copy of %s is up to date (%s)" % (name, lastMod))
+
+    
+            if self.__interval > 0: # no point creating a marker file if we won't be using it
+                with open(check,'a'):
+                    os.utime(check, None) # touch the marker file
+
+        if encoding:
+            return open(target, 'r', encoding=encoding, errors=errors)
+        else:
+            return open(target, 'rb')
+
+if __name__ == '__main__':
+    fc = UrlCache(interval=-1)
+    icla_info = fc.get("https://whimsy.apache.org/public/icla-info.json","icla-info.json", encoding='utf-8')
+    print(icla_info.readline().rstrip())
+    print(icla_info.readline().rstrip())
+    print(icla_info.readline().rstrip())
+    print(icla_info.readline().rstrip())

Propchange: comdev/projects.apache.org/scripts/cronjobs/urlutils.py
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message