community-commits mailing list archives

From: s...@apache.org
Subject: svn commit: r1838432 - in /comdev/reporter.apache.org: branches/TEST_INSTALL/scripts/mailglomper2.py trunk/scripts/mailglomper2.py
Date: Mon, 20 Aug 2018 15:36:58 GMT
Author: sebb
Date: Mon Aug 20 15:36:57 2018
New Revision: 1838432

URL: http://svn.apache.org/viewvc?rev=1838432&view=rev
Log:
Only cache lists in the current index

Modified:
    comdev/reporter.apache.org/branches/TEST_INSTALL/scripts/mailglomper2.py
    comdev/reporter.apache.org/trunk/scripts/mailglomper2.py

Modified: comdev/reporter.apache.org/branches/TEST_INSTALL/scripts/mailglomper2.py
URL: http://svn.apache.org/viewvc/comdev/reporter.apache.org/branches/TEST_INSTALL/scripts/mailglomper2.py?rev=1838432&r1=1838431&r2=1838432&view=diff
==============================================================================
--- comdev/reporter.apache.org/branches/TEST_INSTALL/scripts/mailglomper2.py (original)
+++ comdev/reporter.apache.org/branches/TEST_INSTALL/scripts/mailglomper2.py Mon Aug 20 15:36:57 2018
@@ -214,7 +214,8 @@ pmcmails = committee_info.PMCmails()
 if 'empire-db' in pmcmails: # append entry
     pmcmails.append('empire')
 
-lastCheckpoint = time.time() # when output files were last saved
+# get all the mailing lists so we can drop only those that are no longer present even if the process is stopped early
+mlists = []
 for mlist in re.finditer(r"<a href='([-a-z0-9]+)/'", data):
     ml = mlist.group(1)
     pfx = ml.split('-')[0]
@@ -222,7 +223,10 @@ for mlist in re.finditer(r"<a href='([-a
     if not pfx in pmcmails:
 #         tsprint("Skipping " + ml) # temporary for checking
         continue
+    mlists.append(ml)
     
+lastCheckpoint = time.time() # when output files were last saved
+for ml in mlists:
     tsprint("Processing: " + ml)
     start = time.time()
     mls[ml] = {}
@@ -276,8 +280,15 @@ for mlist in re.finditer(r"<a href='([-a
 tsprint("Completed scanning, writing JSON files (%s)" % str(interrupted))
 with open(__MAILDATA_EXTENDED,'w+') as f:
     json.dump(mls, f, indent=1, sort_keys=True)
+
+# all the possible lists and dates
+found = [ ml + "-" + date for ml in mlists for date in months]
+obsolete = mldcache.keys() - found # drop any left over
+for key in obsolete:
+    tsprint("Dropping unused cache entry: " + key)
+    del mldcache[key]
 with open(__MAILDATA_CACHE,"w") as f:
     json.dump(mldcache, f, indent=1, sort_keys=True)
 tsprint("Dumped JSON files")
 elapsed = time.time()-startTime
-tsprint("Completed in %d seconds" % elapsed)
\ No newline at end of file
+tsprint("Completed in %d seconds" % elapsed)

Modified: comdev/reporter.apache.org/trunk/scripts/mailglomper2.py
URL: http://svn.apache.org/viewvc/comdev/reporter.apache.org/trunk/scripts/mailglomper2.py?rev=1838432&r1=1838431&r2=1838432&view=diff
==============================================================================
--- comdev/reporter.apache.org/trunk/scripts/mailglomper2.py (original)
+++ comdev/reporter.apache.org/trunk/scripts/mailglomper2.py Mon Aug 20 15:36:57 2018
@@ -214,7 +214,8 @@ pmcmails = committee_info.PMCmails()
 if 'empire-db' in pmcmails: # append entry
     pmcmails.append('empire')
 
-lastCheckpoint = time.time() # when output files were last saved
+# get all the mailing lists so we can drop only those that are no longer present even if the process is stopped early
+mlists = []
 for mlist in re.finditer(r"<a href='([-a-z0-9]+)/'", data):
     ml = mlist.group(1)
     pfx = ml.split('-')[0]
@@ -222,7 +223,10 @@ for mlist in re.finditer(r"<a href='([-a
     if not pfx in pmcmails:
 #         tsprint("Skipping " + ml) # temporary for checking
         continue
+    mlists.append(ml)
     
+lastCheckpoint = time.time() # when output files were last saved
+for ml in mlists:
     tsprint("Processing: " + ml)
     start = time.time()
     mls[ml] = {}
@@ -276,8 +280,15 @@ for mlist in re.finditer(r"<a href='([-a
 tsprint("Completed scanning, writing JSON files (%s)" % str(interrupted))
 with open(__MAILDATA_EXTENDED,'w+') as f:
     json.dump(mls, f, indent=1, sort_keys=True)
+
+# all the possible lists and dates
+found = [ ml + "-" + date for ml in mlists for date in months]
+obsolete = mldcache.keys() - found # drop any left over
+for key in obsolete:
+    tsprint("Dropping unused cache entry: " + key)
+    del mldcache[key]
 with open(__MAILDATA_CACHE,"w") as f:
     json.dump(mldcache, f, indent=1, sort_keys=True)
 tsprint("Dumped JSON files")
 elapsed = time.time()-startTime
-tsprint("Completed in %d seconds" % elapsed)
\ No newline at end of file
+tsprint("Completed in %d seconds" % elapsed)
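
For readers skimming the diff, the change works in two passes: the list names found in the current mod_mbox index are collected into mlists up front, and only after the (possibly interrupted) processing loop are cache keys of the form "<list>-<YYYYMM>" that no longer correspond to any indexed list removed. Below is a minimal, self-contained Python sketch of that pruning step; the names mlists, months and mldcache mirror those used in mailglomper2.py, while the sample data, the prune_stale helper and its signature are hypothetical and exist only for illustration.

from typing import Dict, Iterable, List


def prune_stale(cache: Dict[str, dict], mlists: Iterable[str], months: Iterable[str]) -> List[str]:
    """Drop cache entries whose mailing list is no longer in the index; return the dropped keys."""
    # Every key that is still legitimate: one per (list, month) combination.
    valid = {ml + "-" + month for ml in mlists for month in months}
    # Set difference against the dict keys view, mirroring the diff above.
    obsolete = cache.keys() - valid
    for key in sorted(obsolete):
        del cache[key]
    return sorted(obsolete)


if __name__ == "__main__":
    # Hypothetical sample data: "old-list" has been retired from the index.
    months = ["201807", "201808"]
    mlists = ["dev", "users"]
    mldcache = {
        "dev-201807": {"count": 10},
        "dev-201808": {"count": 4},
        "old-list-201806": {"count": 2},  # no longer in the index -> dropped
    }

    for key in prune_stale(mldcache, mlists, months):
        print("Dropping unused cache entry: " + key)
    print("Remaining keys:", sorted(mldcache))

Because the full set of list names is captured before the processing loop starts, an early abort still leaves every list that is present in the index untouched in the cache; only entries for lists that have genuinely disappeared from the index are dropped.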
