NewsBlur/utils/PyRSS2Gen.py

494 lines
14 KiB
Python
Raw Permalink Normal View History

"""PyRSS2Gen - A Python library for generating RSS 2.0 feeds."""
# Library metadata.  Rebinding __name__ makes the module report itself as
# "PyRSS2Gen" regardless of the path it was imported from.
__name__ = "PyRSS2Gen"
__version__ = (1, 0, 0)
__author__ = "Andrew Dalke <dalke@dalkescientific.com>"
# Name-and-version string, i.e. "PyRSS2Gen-1.0.0"; RSS2 uses it as the
# default value of its ``generator`` argument.
_generator_name = "%s-%s" % (__name__, ".".join(str(part) for part in __version__))
import datetime
2024-04-24 09:43:56 -04:00
# Could make this the base class; will need to add 'publish'
class WriteXmlMixin:
    """Mixin that serializes an object to XML through its ``publish`` method.

    The class using this mixin must provide ``publish(handler)``; it is
    called with an ``xml.sax.saxutils.XMLGenerator`` between the
    ``startDocument`` and ``endDocument`` events.
    """

    def write_xml(self, outfile, encoding="iso-8859-1"):
        """Write this object as an XML document to *outfile*.

        outfile  -- writable file-like object handed to ``XMLGenerator``
        encoding -- encoding name handed to ``XMLGenerator``
        """
        from xml.sax import saxutils

        handler = saxutils.XMLGenerator(outfile, encoding)
        handler.startDocument()
        self.publish(handler)
        handler.endDocument()

    def to_xml(self, encoding="iso-8859-1"):
        """Return this object rendered as an XML document string.

        encoding -- forwarded unchanged to ``write_xml``
        """
        # The former ``try: import io as StringIO / except: import io`` left
        # the name ``io`` unbound on the success path, so ``io.StringIO()``
        # raised UnboundLocalError on every call.  A plain import fixes it.
        import io

        f = io.StringIO()
        self.write_xml(f, encoding)
        return f.getvalue()
2024-04-24 09:43:56 -04:00
def _element(handler, name, obj, d={}):
2020-06-13 13:13:20 -04:00
if isinstance(obj, str) or obj is None:
# special-case handling to make the API easier
# to use for the common case.
handler.startElement(name, d)
if obj is not None:
handler.characters(obj)
handler.endElement(name)
else:
# It better know how to emit the correct XML.
obj.publish(handler)
2024-04-24 09:43:56 -04:00
def _opt_element(handler, name, obj):
if obj is None:
return
_element(handler, name, obj)
def _format_date(dt):
"""convert a datetime into an RFC 822 formatted date
Input date must be in GMT.
"""
# Looks like:
# Sat, 07 Sep 2002 00:00:01 GMT
# Can't use strftime because that's locale dependent
#
# Isn't there a standard way to do this for Python? The
# rfc822 and email.Utils modules assume a timestamp. The
# following is based on the rfc822 module.
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
2024-04-24 09:43:56 -04:00
["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
dt.day,
["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month - 1],
dt.year,
dt.hour,
dt.minute,
dt.second,
)
##
# A couple of simple wrapper objects for the fields which
# take a simple value other than a string.
class IntElement:
    """Publishable wrapper pairing a tag name with an integer value.

    The value is rendered with str(), so this also works for any other
    object whose str() form is the text wanted in the XML.
    """

    element_attrs = {}

    def __init__(self, name, val):
        self.name, self.val = name, val

    def publish(self, handler):
        """Emit the element with str(val) as its character data."""
        tag = self.name
        text = str(self.val)
        handler.startElement(tag, self.element_attrs)
        handler.characters(text)
        handler.endElement(tag)
2024-04-24 09:43:56 -04:00
class DateElement:
    """Publishable wrapper pairing a tag name with a datetime.datetime.

    On publish the datetime is rendered by ``_format_date`` (a timestamp
    with a 4-digit year) and emitted as the text of the element.
    """

    def __init__(self, name, dt):
        self.name, self.dt = name, dt

    def publish(self, handler):
        """Emit the element containing the formatted date."""
        stamp = _format_date(self.dt)
        _element(handler, self.name, stamp)
2024-04-24 09:43:56 -04:00
####
2024-04-24 09:43:56 -04:00
class Category:
    """A <category> element with an optional ``domain`` attribute."""

    def __init__(self, category, domain=None):
        self.category, self.domain = category, domain

    def publish(self, handler):
        """Emit <category>, adding the domain attribute only when one is set."""
        attrs = {} if self.domain is None else {"domain": self.domain}
        _element(handler, "category", self.category, attrs)
2024-04-24 09:43:56 -04:00
class Cloud:
    """A <cloud> element; every constructor argument becomes an attribute."""

    def __init__(self, domain, port, path, registerProcedure, protocol):
        self.domain = domain
        self.port = port
        self.path = path
        self.registerProcedure = registerProcedure
        self.protocol = protocol

    def publish(self, handler):
        """Emit an empty <cloud> element carrying the five attributes."""
        fields = ("domain", "port", "path", "registerProcedure", "protocol")
        attrs = {key: getattr(self, key) for key in fields}
        # The port is the only value converted with str() before output.
        attrs["port"] = str(attrs["port"])
        _element(handler, "cloud", None, attrs)
class Image:
    """A channel <image>: url, title and link plus optional extras.

    ``width`` and ``height`` may be ints (wrapped in IntElement when
    published) or anything else ``_opt_element`` accepts.
    """

    element_attrs = {}

    def __init__(self, url, title, link, width=None, height=None, description=None):
        self.url, self.title, self.link = url, title, link
        self.width, self.height = width, height
        self.description = description

    def publish(self, handler):
        """Emit <image> with its child elements in a fixed order."""
        handler.startElement("image", self.element_attrs)

        # Required children, always written.
        for tag in ("url", "title", "link"):
            _element(handler, tag, getattr(self, tag))

        # Optional dimensions; ints are wrapped so they publish as text.
        for tag in ("width", "height"):
            size = getattr(self, tag)
            if isinstance(size, int):
                size = IntElement(tag, size)
            _opt_element(handler, tag, size)

        _opt_element(handler, "description", self.description)
        handler.endElement("image")
2024-04-24 09:43:56 -04:00
class Guid:
    """A <guid> element with an explicit isPermaLink attribute.

    isPermaLink defaults to true, which is also what a reader assumes when
    the attribute is omitted; hence plain-string guids are permalinks.
    """

    def __init__(self, guid, isPermaLink=1):
        self.guid, self.isPermaLink = guid, isPermaLink

    def publish(self, handler):
        """Emit <guid> with isPermaLink spelled as "true" or "false"."""
        flag = "true" if self.isPermaLink else "false"
        _element(handler, "guid", self.guid, {"isPermaLink": flag})
2024-04-24 09:43:56 -04:00
class TextInput:
    """A channel <textInput> element (apparently rarely used)."""

    element_attrs = {}

    def __init__(self, title, description, name, link):
        self.title, self.description = title, description
        self.name, self.link = name, link

    def publish(self, handler):
        """Emit <textInput> with its four child elements in order."""
        handler.startElement("textInput", self.element_attrs)
        for tag in ("title", "description", "name", "link"):
            _element(handler, tag, getattr(self, tag))
        handler.endElement("textInput")
2024-04-24 09:43:56 -04:00
class Enclosure:
    """An <enclosure> element described by url, length and type."""

    def __init__(self, url, length, type):
        self.url, self.length, self.type = url, length, type

    def publish(self, handler):
        """Emit an empty <enclosure> element with the three attributes."""
        attrs = {}
        attrs["url"] = self.url
        # The length is converted with str() before output.
        attrs["length"] = str(self.length)
        attrs["type"] = self.type
        _element(handler, "enclosure", None, attrs)
class Source:
    """An item's original <source>, used by aggregators."""

    def __init__(self, name, url):
        self.name, self.url = name, url

    def publish(self, handler):
        """Emit <source url=...> with the name as its text."""
        attrs = {"url": self.url}
        _element(handler, "source", self.name, attrs)
2024-04-24 09:43:56 -04:00
class SkipHours:
    """A <skipHours> element built from a list of hours, as integers."""

    element_attrs = {}

    def __init__(self, hours):
        self.hours = hours

    def publish(self, handler):
        """Emit <skipHours> with one <hour> child per entry.

        Nothing at all is written when the list is empty or None.
        """
        if not self.hours:
            return
        handler.startElement("skipHours", self.element_attrs)
        for value in self.hours:
            _element(handler, "hour", str(value))
        handler.endElement("skipHours")
2024-04-24 09:43:56 -04:00
class SkipDays:
    """A <skipDays> element built from a list of days, as strings."""

    element_attrs = {}

    def __init__(self, days):
        self.days = days

    def publish(self, handler):
        """Emit <skipDays> with one <day> child per entry.

        Nothing at all is written when the list is empty or None.
        """
        if not self.days:
            return
        handler.startElement("skipDays", self.element_attrs)
        for name in self.days:
            _element(handler, "day", name)
        handler.endElement("skipDays")
2024-04-24 09:43:56 -04:00
class RSS2(WriteXmlMixin):
    """The main RSS class.

    Stores the channel attributes, with the "category" elements under
    ".categories" and the RSS items under ".items".  ``publish`` writes the
    whole <rss><channel> tree; ``write_xml`` / ``to_xml`` are inherited
    from WriteXmlMixin.
    """

    rss_attrs = {"version": "2.0"}
    element_attrs = {}

    def __init__(
        self,
        title,
        link,
        description,
        language=None,
        copyright=None,
        managingEditor=None,
        webMaster=None,
        pubDate=None,  # a datetime, *in* *GMT*
        lastBuildDate=None,  # a datetime
        categories=None,  # list of strings or Category
        generator=_generator_name,
        docs="http://blogs.law.harvard.edu/tech/rss",
        cloud=None,  # a Cloud
        ttl=None,  # integer number of minutes
        image=None,  # an Image
        rating=None,  # a string; I don't know how it's used
        textInput=None,  # a TextInput
        skipHours=None,  # a SkipHours with a list of integers
        skipDays=None,  # a SkipDays with a list of strings
        items=None,  # list of RSSItems
    ):
        """Store the channel fields.

        title, link and description are required; everything else is
        optional and is skipped on output when left as None.  ``categories``
        and ``items`` default to fresh empty lists.
        """
        self.title = title
        self.link = link
        self.description = description
        self.language = language
        self.copyright = copyright
        self.managingEditor = managingEditor
        self.webMaster = webMaster
        self.pubDate = pubDate
        self.lastBuildDate = lastBuildDate

        if categories is None:
            categories = []
        self.categories = categories
        self.generator = generator
        self.docs = docs
        self.cloud = cloud
        self.ttl = ttl
        self.image = image
        self.rating = rating
        self.textInput = textInput
        self.skipHours = skipHours
        self.skipDays = skipDays
        if items is None:
            items = []
        self.items = items

    def publish(self, handler):
        """Emit the complete <rss><channel>...</channel></rss> tree.

        handler -- SAX-style object with startElement / characters /
                   endElement methods
        """
        handler.startElement("rss", self.rss_attrs)
        handler.startElement("channel", self.element_attrs)

        # The three required channel fields, then the subclass hook.
        _element(handler, "title", self.title)
        _element(handler, "link", self.link)
        _element(handler, "description", self.description)
        self.publish_extensions(handler)

        _opt_element(handler, "language", self.language)
        _opt_element(handler, "copyright", self.copyright)
        _opt_element(handler, "managingEditor", self.managingEditor)
        _opt_element(handler, "webMaster", self.webMaster)

        # datetimes are wrapped so they publish as formatted dates; any
        # other value is handed to _opt_element unchanged.
        pubDate = self.pubDate
        if isinstance(pubDate, datetime.datetime):
            pubDate = DateElement("pubDate", pubDate)
        _opt_element(handler, "pubDate", pubDate)

        lastBuildDate = self.lastBuildDate
        if isinstance(lastBuildDate, datetime.datetime):
            lastBuildDate = DateElement("lastBuildDate", lastBuildDate)
        _opt_element(handler, "lastBuildDate", lastBuildDate)

        # Plain strings are promoted to Category objects.
        for category in self.categories:
            if isinstance(category, str):
                category = Category(category)
            category.publish(handler)

        _opt_element(handler, "generator", self.generator)
        _opt_element(handler, "docs", self.docs)

        if self.cloud is not None:
            self.cloud.publish(handler)

        ttl = self.ttl
        if isinstance(self.ttl, int):
            ttl = IntElement("ttl", ttl)
        # The tag used to be misspelled "tt", so a ttl given as a string
        # was written as <tt>...</tt> instead of <ttl>...</ttl>.
        _opt_element(handler, "ttl", ttl)

        if self.image is not None:
            self.image.publish(handler)

        _opt_element(handler, "rating", self.rating)
        if self.textInput is not None:
            self.textInput.publish(handler)
        if self.skipHours is not None:
            self.skipHours.publish(handler)
        if self.skipDays is not None:
            self.skipDays.publish(handler)

        for item in self.items:
            item.publish(handler)

        handler.endElement("channel")
        handler.endElement("rss")

    def publish_extensions(self, handler):
        """Hook for subclasses; does nothing by default."""
        # Derived classes can hook into this to insert
        # output after the three required fields.
        pass
2024-04-24 09:43:56 -04:00
class RSSItem(WriteXmlMixin):
    """A single <item> of an RSS channel.

    At least one of ``title`` and ``description`` must be supplied.
    """

    element_attrs = {}

    def __init__(
        self,
        title=None,  # string
        link=None,  # url as string
        description=None,  # string
        author=None,  # email address as string
        categories=None,  # list of string or Category
        comments=None,  # url as string
        enclosure=None,  # an Enclosure
        guid=None,  # a unique string
        pubDate=None,  # a datetime
        source=None,  # a Source
    ):
        """Store the item fields.

        Raises TypeError when both title and description are None.
        """
        if title is None and description is None:
            raise TypeError("must define at least one of 'title' or 'description'")

        self.title, self.link = title, link
        self.description, self.author = description, author
        # A fresh list per instance when no categories are given.
        self.categories = [] if categories is None else categories
        self.comments = comments
        self.enclosure = enclosure
        self.guid = guid
        self.pubDate = pubDate
        self.source = source

    def publish(self, handler):
        """Emit <item> and its children on the SAX *handler*."""
        handler.startElement("item", self.element_attrs)

        _opt_element(handler, "title", self.title)
        _opt_element(handler, "link", self.link)
        self.publish_extensions(handler)
        for tag in ("description", "author"):
            _opt_element(handler, tag, getattr(self, tag))

        # Plain strings are promoted to Category objects.
        for entry in self.categories:
            node = Category(entry) if isinstance(entry, str) else entry
            node.publish(handler)

        _opt_element(handler, "comments", self.comments)
        enclosure = self.enclosure
        if enclosure is not None:
            enclosure.publish(handler)
        _opt_element(handler, "guid", self.guid)

        # A datetime is wrapped so it publishes as a formatted date.
        when = self.pubDate
        if isinstance(when, datetime.datetime):
            when = DateElement("pubDate", when)
        _opt_element(handler, "pubDate", when)

        origin = self.source
        if origin is not None:
            origin.publish(handler)

        handler.endElement("item")

    def publish_extensions(self, handler):
        """Hook for subclasses; does nothing by default."""
        # Derived classes can hook into this to insert
        # output after the title and link elements
        pass