diff --git a/filters/coercedates.plugin b/filters/coercedates.plugin new file mode 100644 index 00000000..b5f73aac --- /dev/null +++ b/filters/coercedates.plugin @@ -0,0 +1,117 @@ +# If you don't want items to "move up" on your planet if the source feed +# updates them (and changes the update date to something newer than was +# originally used) you may be tempted to use the "ignore_in_feed: updated" +# option, but there are three important things to realize about doing this: +# +# * When you ignore the "updated" date, it will default to the +# "published" date -- but if there is no "published" date (very common +# in many RSS feeds) it will default to the current date+time. +# +# * If you purge the entire cache (perhaps because you added a filter) +# all of the "updated" dates for those items w/o a "published" date will +# be re-set to the current date+time +# +# * The "updated" date is what Venus uses to sort the list +# +# This may all seem obvious, but can be highly annoying when you deal +# with some feeds that have no "published" date and have to occasionally +# purge your cache. +# +# One solution would be to only use "ignore_in_feed: updated" on the feeds +# where you know the feed contains a "published" date for each item, and +# don't use it for feeds that only contain an "updated" date for each item +# -- but that can be tedious. +# +# So use this plugin instead +# +# This plugin will replace the "updated" and "published" dates of every item +# with whichever of the two values is the lowest, unless the item is already +# in the cache, in which case it will use the "updated" date from the item in +# the cache -- making it a safe alternative to "ignore_in_feed: updated" for +# all feeds regardless of whether the items have a "published" date or not, +# and regardless of whether the ones that do have a "published" date try to +# modify it or not. 
+# +########################################################################### + +import sys, time, os +from xml.dom import minidom +import planet +from planet import reconstitute +from planet import config +from planet.reconstitute import date +from planet.spider import filename + +log = planet.logger + +# finds the first descendant element that matches the specified +# namespace and tag name, parses it (in canonical date format), +# returns the parsed value, and removes (all of the) element(s) +def parseAndPurgeDateElement(element, ns, tagName): + result = None + # see if we have any date(s?) + kids = element.getElementsByTagNameNS(ns, tagName) + if kids: + # record the first one + result = time.strptime(kids[0].childNodes[0].nodeValue, + '%Y-%m-%dT%H:%M:%SZ') + # get rid of all of them + for trash in kids: + trash.parentNode.removeChild(trash) + return result + + +# given the identifier of an entry in the cache, fetches the +# formatted mtime of that entry (which should match the updated +# date if venus has done its job right) +# +# returns None if the entry is not in the cache +def getDateFromCache(entry): + if entry is None: + log.error("Attempted to lookup the date of 'None'") + return None + + id = entry.getElementsByTagNameNS(atomNS, 'id')[0].childNodes[0].nodeValue + if id is None: + log.error("Unable to find id in entry") + return None + + cache = os.path.join(config.cache_directory()) + file = filename(cache, id) + if os.path.exists(file): + return time.gmtime(os.stat(file).st_mtime) + return None + + +atomNS = 'http://www.w3.org/2005/Atom' +planetNS = 'http://planet.intertwingly.net/' + +# parse input stream +dom = minidom.parse(sys.stdin) + +entries = dom.getElementsByTagNameNS(atomNS, 'entry') +for e in entries: + + # get & remove our dates from the entry + updatedDate = parseAndPurgeDateElement(e, atomNS, 'updated') + pubDate = parseAndPurgeDateElement(e, atomNS, 'published') + + cacheDate = getDateFromCache(e) + + if cacheDate is not None: + 
mainDate = cacheDate + elif not updatedDate: + mainDate = pubDate + elif not pubDate: + mainDate = updatedDate + elif pubDate < updatedDate: + mainDate = pubDate + else: + mainDate = updatedDate + + # add back to the entry + reconstitute.date(e, 'published', mainDate) + reconstitute.date(e, 'updated', mainDate) + +# output the dom +print dom.toxml('utf-8') diff --git a/planet/spider.py b/planet/spider.py index 50d17393..c636f561 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -235,6 +235,15 @@ def writeCache(feed_uri, feed_info, data): if os.path.exists(cache_file): os.remove(cache_file) continue + # re-set mtime incase filters have modified it + try: + edoc = feedparser.parse(output) + mtime = calendar.timegm(edoc.entries[0].updated_parsed) + except: + log.warning("Unable to re-set mtime on %s after running filters: ", + entry.id, + sys.exc_info()[0]) + # write out and timestamp the results write(output, cache_file, mtime) diff --git a/planet/vendor/feedparser.py b/planet/vendor/feedparser.py index 76167ced..0600c814 100755 --- a/planet/vendor/feedparser.py +++ b/planet/vendor/feedparser.py @@ -1982,6 +1982,7 @@ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0 sValue = bNormalize and self.normalize(sValue) or sValue.strip() if (not sValue) and (iPropertyType == self.URI): if sNodeName == 'a': sValue = elmResult.get('href') + elif sNodeName == 'iframe': sValue = elmResult.get('src') elif sNodeName == 'img': sValue = elmResult.get('src') elif sNodeName == 'object': sValue = elmResult.get('data') if sValue: @@ -2339,7 +2340,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', - 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', + 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 
'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', @@ -2355,7 +2356,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', - 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', + 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py index 05face97..d81b2943 100644 --- a/planet/vendor/html5lib/sanitizer.py +++ b/planet/vendor/html5lib/sanitizer.py @@ -13,7 +13,7 @@ class HTMLSanitizerMixin(object): 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', - 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'keygen', 'kbd', + 'h5', 'h6', 'hr', 'i', 'iframe', 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound', @@ -43,7 +43,7 @@ class HTMLSanitizerMixin(object): 'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', - 'face', 'for', 
'form', 'frame', 'galleryimg', 'gutter', 'headers', + 'face', 'for', 'form', 'frame', 'frameborder', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', diff --git a/tests/data/filter/coercedates/a-rss-1.xml b/tests/data/filter/coercedates/a-rss-1.xml new file mode 100644 index 00000000..d2467593 --- /dev/null +++ b/tests/data/filter/coercedates/a-rss-1.xml @@ -0,0 +1,42 @@ + + + +Fake RSS Blog +http://fake.url.example.com +Fake RSS Feed For testing + +http://fake.url.example.com/feedlogo.gif +Test RSS Feed +http://fake.url.example.com + +en-us +Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution. +Blogsmith http://www.blogsmith.com/ + + + Fake Title: RSS Has No Date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-no-date.gif + Fake Person +
+ + + + Fake Title: RSS Has Changing Date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-changing-date.gif + Fake Person + 2011-12-01T11:00:00+00:00 +
+ +
diff --git a/tests/data/filter/coercedates/a-rss-2.xml b/tests/data/filter/coercedates/a-rss-2.xml new file mode 100644 index 00000000..82f48a2d --- /dev/null +++ b/tests/data/filter/coercedates/a-rss-2.xml @@ -0,0 +1,42 @@ + + + +Fake RSS Blog +http://fake.url.example.com +Fake RSS Feed For testing + +http://fake.url.example.com/feedlogo.gif +Test RSS Feed +http://fake.url.example.com + +en-us +Not Copyright 2011 Fake Feed, LLC. The contents of this headlines and excerpts feed are available for unlimited distribution. +Blogsmith http://www.blogsmith.com/ + + + Fake Title: RSS Has No Date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date + http://fake.url.example.com/rss-no-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-no-date.gif + Fake Person +
+ + + + Fake Title: RSS Has Changing Date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date + http://fake.url.example.com/rss-changing-date#comments + + Blah Blah Blah something poinient blah blah blah

]]> +
+ http://fake.url.example.com/rss-changing-date.gif + Fake Person + 2011-12-07T11:07:07+00:00 +
+ +
diff --git a/tests/data/filter/coercedates/b-atom-1.xml b/tests/data/filter/coercedates/b-atom-1.xml new file mode 100644 index 00000000..f549b6d8 --- /dev/null +++ b/tests/data/filter/coercedates/b-atom-1.xml @@ -0,0 +1,92 @@ + + + Fake Atom Feed + Fake Atom feed for testing stuff + + 2011-12-08T00:00:28Z + + + http://fake.url.example.com/feed/atom/ + + + WordPress + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Updated Date]]> + + http://fake.url.example.com/atom-changing-updated + 2011-12-05T10:06:38Z + 2011-11-09T00:00:28Z + + + + + 0 + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Published Date]]> + + http://fake.url.example.com/atom-changing-published + 2011-12-08T02:02:28Z + + + + + 0 + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom No Date]]> + + http://fake.url.example.com/atom-no-date + + + + + 0 + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Update Before Published]]> + 2011-11-11T11:11:11Z + 2011-12-12T12:12:12Z + + http://fake.url.example.com/atom-update-before-pub + + + + + 0 + + + + + diff --git a/tests/data/filter/coercedates/b-atom-2.xml b/tests/data/filter/coercedates/b-atom-2.xml new file mode 100644 index 00000000..6b71301f --- /dev/null +++ b/tests/data/filter/coercedates/b-atom-2.xml @@ -0,0 +1,92 @@ + + + Fake Atom Feed + Fake Atom feed for testing stuff + + 2011-12-08T00:00:28Z + + + http://fake.url.example.com/feed/atom/ + + + WordPress + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Updated Date]]> + + http://fake.url.example.com/atom-changing-updated + 2011-12-07T07:07:37Z + 2011-11-09T00:00:28Z + + + + + 0 + + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Changing Published Date]]> + + http://fake.url.example.com/atom-changing-published + 2011-12-13T13:13:13Z + + + + + 0 + + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom No Date]]> + + 
http://fake.url.example.com/atom-no-date + + + + + 0 + + + + + Fake Person + http://fake.url.example.com + + <![CDATA[Atom Update Before Published]]> + 2009-09-09T09:09:09Z + 2011-12-12T12:12:12Z + + http://fake.url.example.com/atom-update-before-pub + + + + + 0 + + + + + diff --git a/tests/data/filter/coercedates/config.ini b/tests/data/filter/coercedates/config.ini new file mode 100644 index 00000000..f4223bde --- /dev/null +++ b/tests/data/filter/coercedates/config.ini @@ -0,0 +1,7 @@ +[Planet] +name = test planet +cache_directory = tests/work/coercedates/cache +cache_blasklist_directory = tests/work/coercedates/cache/blacklist + +filters: coercedates.plugin + diff --git a/tests/test_filter_coercedates.py b/tests/test_filter_coercedates.py new file mode 100644 index 00000000..04259eec --- /dev/null +++ b/tests/test_filter_coercedates.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python + +import unittest, os, glob, calendar, shutil, time +from planet.spider import filename, spiderPlanet, writeCache +from planet import feedparser, config +import planet + +workdir = 'tests/work/coercedates/cache' +testfeed = 'tests/data/filter/coercedates/%s.xml' +configfile = 'tests/data/filter/coercedates/config.ini' + +class CoerceDatesTest(unittest.TestCase): + def setUp(self): + # silence errors + self.original_logger = planet.logger + # planet.getLogger('CRITICAL',None) + + try: + os.makedirs(workdir) + except: + self.tearDown() + os.makedirs(workdir) + + def tearDown(self): + shutil.rmtree(workdir) + os.removedirs(os.path.split(workdir)[0]) + planet.logger = self.original_logger + + def spiderFeed(self, feed_uri): + feed_info = feedparser.parse('') + data = feedparser.parse(feed_uri) + writeCache(feed_uri, feed_info, data) + + # no expected_date means we don't know what it should be yet + def verify_date(self, id, expected_date = None): + + file = os.path.join(workdir, id) + + # verify that the file exists + self.assertTrue(os.path.exists(file), msg=file); + + data = 
feedparser.parse(file) + + # verify published & updated dates are in sync and match expected + + self.assertEqual(data.entries[0].updated, + data.entries[0].published) + + # verify mtime is in sync + self.assertEqual(time.gmtime(os.stat(file).st_mtime), + data.entries[0].updated_parsed) + self.assertEqual(time.gmtime(os.stat(file).st_mtime), + data.entries[0].published_parsed) + + # verify meet hardcoded expectations + if expected_date is not None: + self.assertEqual(expected_date, + data.entries[0].updated) + + return data.entries[0].updated + + def test_coerce_rss(self): + config.load(configfile) + + # load first version of RSS + self.spiderFeed(testfeed % 'a-rss-1') + + rss_no_date_expected = self.verify_date('fake.url.example.com,rss-no-date') + self.verify_date('fake.url.example.com,rss-changing-date', + u'2011-12-01T11:00:00Z') + + # parse updated RSS feed + self.spiderFeed(testfeed % 'a-rss-2') + + # verify dates haven't changed + self.verify_date('fake.url.example.com,rss-no-date', + rss_no_date_expected) + self.verify_date('fake.url.example.com,rss-changing-date', + u'2011-12-01T11:00:00Z') + + def test_coerce_atom(self): + config.load(configfile) + + # load first version of Atom + self.spiderFeed(testfeed % 'b-atom-1') + + atom_no_date_expected = self.verify_date('fake.url.example.com,atom-no-date') + self.verify_date('fake.url.example.com,atom-changing-published', + u'2011-12-08T02:02:28Z') + self.verify_date('fake.url.example.com,atom-changing-updated', + u'2011-11-09T00:00:28Z') + self.verify_date('fake.url.example.com,atom-update-before-pub', + u'2011-11-11T11:11:11Z') + + # parse updated Atom feed + self.spiderFeed(testfeed % 'b-atom-2') + + # verify dates haven't changed + self.verify_date('fake.url.example.com,atom-no-date', + atom_no_date_expected) + self.verify_date('fake.url.example.com,atom-changing-published', + u'2011-12-08T02:02:28Z') + self.verify_date('fake.url.example.com,atom-changing-updated', + u'2011-11-09T00:00:28Z') + 
self.verify_date('fake.url.example.com,atom-update-before-pub', + u'2011-11-11T11:11:11Z')