Mirror of https://github.com/viq/NewsBlur.git, synced 2025-08-05 16:49:45 +00:00

Commit 66b2011707 (parent 5c1e0a1c5e): 2to3 NewsBlur/utils/
21 changed files with 3681 additions and 172 deletions
utils/PyRSS2Gen.py

@@ -19,16 +19,16 @@ class WriteXmlMixin:
     def to_xml(self, encoding = "iso-8859-1"):
         try:
-            import cStringIO as StringIO
+            import io as StringIO
         except ImportError:
-            import StringIO
-        f = StringIO.StringIO()
+            import io
+        f = io.StringIO()
         self.write_xml(f, encoding)
         return f.getvalue()


 def _element(handler, name, obj, d = {}):
-    if isinstance(obj, basestring) or obj is None:
+    if isinstance(obj, str) or obj is None:
         # special-case handling to make the API easier
         # to use for the common case.
         handler.startElement(name, d)

@@ -337,7 +337,7 @@ class RSS2(WriteXmlMixin):
         _opt_element(handler, "lastBuildDate", lastBuildDate)

         for category in self.categories:
-            if isinstance(category, basestring):
+            if isinstance(category, str):
                 category = Category(category)
             category.publish(handler)

@@ -418,7 +418,7 @@ class RSSItem(WriteXmlMixin):
         _opt_element(handler, "author", self.author)

         for category in self.categories:
-            if isinstance(category, basestring):
+            if isinstance(category, str):
                 category = Category(category)
             category.publish(handler)
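The hunks above are the standard 2to3 rewrites of the StringIO and basestring idioms. A minimal standalone sketch of the Python 3 form they converge on (illustrative code, not part of this commit; the helper names are made up):

import io

def to_xml_sketch(write_xml, encoding="iso-8859-1"):
    # io.StringIO replaces both cStringIO.StringIO and StringIO.StringIO.
    f = io.StringIO()
    write_xml(f, encoding)
    return f.getvalue()

def describe(obj):
    # Python 3 has no basestring; str now covers what str + unicode covered.
    if isinstance(obj, str) or obj is None:
        return "plain value"
    return "publishable object"

print(to_xml_sketch(lambda f, enc: f.write("<rss/>")))  # -> <rss/>
print(describe("technology"))                           # -> plain value

One behavioral note: io.StringIO holds text, so after this conversion to_xml() returns a str rather than bytes encoded with the requested encoding.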
utils/PyRSS2Gen.py.bak (new file, 443 lines)

@@ -0,0 +1,443 @@
"""PyRSS2Gen - A Python library for generating RSS 2.0 feeds."""
|
||||
|
||||
__name__ = "PyRSS2Gen"
|
||||
__version__ = (1, 0, 0)
|
||||
__author__ = "Andrew Dalke <dalke@dalkescientific.com>"
|
||||
|
||||
_generator_name = __name__ + "-" + ".".join(map(str, __version__))
|
||||
|
||||
import datetime
|
||||
|
||||
# Could make this the base class; will need to add 'publish'
|
||||
class WriteXmlMixin:
|
||||
def write_xml(self, outfile, encoding = "iso-8859-1"):
|
||||
from xml.sax import saxutils
|
||||
handler = saxutils.XMLGenerator(outfile, encoding)
|
||||
handler.startDocument()
|
||||
self.publish(handler)
|
||||
handler.endDocument()
|
||||
|
||||
def to_xml(self, encoding = "iso-8859-1"):
|
||||
try:
|
||||
import cStringIO as StringIO
|
||||
except ImportError:
|
||||
import StringIO
|
||||
f = StringIO.StringIO()
|
||||
self.write_xml(f, encoding)
|
||||
return f.getvalue()
|
||||
|
||||
|
||||
def _element(handler, name, obj, d = {}):
|
||||
if isinstance(obj, basestring) or obj is None:
|
||||
# special-case handling to make the API easier
|
||||
# to use for the common case.
|
||||
handler.startElement(name, d)
|
||||
if obj is not None:
|
||||
handler.characters(obj)
|
||||
handler.endElement(name)
|
||||
else:
|
||||
# It better know how to emit the correct XML.
|
||||
obj.publish(handler)
|
||||
|
||||
def _opt_element(handler, name, obj):
|
||||
if obj is None:
|
||||
return
|
||||
_element(handler, name, obj)
|
||||
|
||||
|
||||
def _format_date(dt):
|
||||
"""convert a datetime into an RFC 822 formatted date
|
||||
|
||||
Input date must be in GMT.
|
||||
"""
|
||||
# Looks like:
|
||||
# Sat, 07 Sep 2002 00:00:01 GMT
|
||||
# Can't use strftime because that's locale dependent
|
||||
#
|
||||
# Isn't there a standard way to do this for Python? The
|
||||
# rfc822 and email.Utils modules assume a timestamp. The
|
||||
# following is based on the rfc822 module.
|
||||
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
|
||||
["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
|
||||
dt.day,
|
||||
["Jan", "Feb", "Mar", "Apr", "May", "Jun",
|
||||
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
|
||||
dt.year, dt.hour, dt.minute, dt.second)
|
||||
|
||||
|
||||
##
|
||||
# A couple simple wrapper objects for the fields which
|
||||
# take a simple value other than a string.
|
||||
class IntElement:
|
||||
"""implements the 'publish' API for integers
|
||||
|
||||
Takes the tag name and the integer value to publish.
|
||||
|
||||
(Could be used for anything which uses str() to be published
|
||||
to text for XML.)
|
||||
"""
|
||||
element_attrs = {}
|
||||
def __init__(self, name, val):
|
||||
self.name = name
|
||||
self.val = val
|
||||
def publish(self, handler):
|
||||
handler.startElement(self.name, self.element_attrs)
|
||||
handler.characters(str(self.val))
|
||||
handler.endElement(self.name)
|
||||
|
||||
class DateElement:
|
||||
"""implements the 'publish' API for a datetime.datetime
|
||||
|
||||
Takes the tag name and the datetime to publish.
|
||||
|
||||
Converts the datetime to RFC 2822 timestamp (4-digit year).
|
||||
"""
|
||||
def __init__(self, name, dt):
|
||||
self.name = name
|
||||
self.dt = dt
|
||||
def publish(self, handler):
|
||||
_element(handler, self.name, _format_date(self.dt))
|
||||
####
|
||||
|
||||
class Category:
|
||||
"""Publish a category element"""
|
||||
def __init__(self, category, domain = None):
|
||||
self.category = category
|
||||
self.domain = domain
|
||||
def publish(self, handler):
|
||||
d = {}
|
||||
if self.domain is not None:
|
||||
d["domain"] = self.domain
|
||||
_element(handler, "category", self.category, d)
|
||||
|
||||
class Cloud:
|
||||
"""Publish a cloud"""
|
||||
def __init__(self, domain, port, path,
|
||||
registerProcedure, protocol):
|
||||
self.domain = domain
|
||||
self.port = port
|
||||
self.path = path
|
||||
self.registerProcedure = registerProcedure
|
||||
self.protocol = protocol
|
||||
def publish(self, handler):
|
||||
_element(handler, "cloud", None, {
|
||||
"domain": self.domain,
|
||||
"port": str(self.port),
|
||||
"path": self.path,
|
||||
"registerProcedure": self.registerProcedure,
|
||||
"protocol": self.protocol})
|
||||
|
||||
class Image:
|
||||
"""Publish a channel Image"""
|
||||
element_attrs = {}
|
||||
def __init__(self, url, title, link,
|
||||
width = None, height = None, description = None):
|
||||
self.url = url
|
||||
self.title = title
|
||||
self.link = link
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.description = description
|
||||
|
||||
def publish(self, handler):
|
||||
handler.startElement("image", self.element_attrs)
|
||||
|
||||
_element(handler, "url", self.url)
|
||||
_element(handler, "title", self.title)
|
||||
_element(handler, "link", self.link)
|
||||
|
||||
width = self.width
|
||||
if isinstance(width, int):
|
||||
width = IntElement("width", width)
|
||||
_opt_element(handler, "width", width)
|
||||
|
||||
height = self.height
|
||||
if isinstance(height, int):
|
||||
height = IntElement("height", height)
|
||||
_opt_element(handler, "height", height)
|
||||
|
||||
_opt_element(handler, "description", self.description)
|
||||
|
||||
handler.endElement("image")
|
||||
|
||||
class Guid:
|
||||
"""Publish a guid
|
||||
|
||||
Defaults to being a permalink, which is the assumption if it's
|
||||
omitted. Hence strings are always permalinks.
|
||||
"""
|
||||
def __init__(self, guid, isPermaLink = 1):
|
||||
self.guid = guid
|
||||
self.isPermaLink = isPermaLink
|
||||
def publish(self, handler):
|
||||
d = {}
|
||||
if self.isPermaLink:
|
||||
d["isPermaLink"] = "true"
|
||||
else:
|
||||
d["isPermaLink"] = "false"
|
||||
_element(handler, "guid", self.guid, d)
|
||||
|
||||
class TextInput:
|
||||
"""Publish a textInput
|
||||
|
||||
Apparently this is rarely used.
|
||||
"""
|
||||
element_attrs = {}
|
||||
def __init__(self, title, description, name, link):
|
||||
self.title = title
|
||||
self.description = description
|
||||
self.name = name
|
||||
self.link = link
|
||||
|
||||
def publish(self, handler):
|
||||
handler.startElement("textInput", self.element_attrs)
|
||||
_element(handler, "title", self.title)
|
||||
_element(handler, "description", self.description)
|
||||
_element(handler, "name", self.name)
|
||||
_element(handler, "link", self.link)
|
||||
handler.endElement("textInput")
|
||||
|
||||
|
||||
class Enclosure:
|
||||
"""Publish an enclosure"""
|
||||
def __init__(self, url, length, type):
|
||||
self.url = url
|
||||
self.length = length
|
||||
self.type = type
|
||||
def publish(self, handler):
|
||||
_element(handler, "enclosure", None,
|
||||
{"url": self.url,
|
||||
"length": str(self.length),
|
||||
"type": self.type,
|
||||
})
|
||||
|
||||
class Source:
|
||||
"""Publish the item's original source, used by aggregators"""
|
||||
def __init__(self, name, url):
|
||||
self.name = name
|
||||
self.url = url
|
||||
def publish(self, handler):
|
||||
_element(handler, "source", self.name, {"url": self.url})
|
||||
|
||||
class SkipHours:
|
||||
"""Publish the skipHours
|
||||
|
||||
This takes a list of hours, as integers.
|
||||
"""
|
||||
element_attrs = {}
|
||||
def __init__(self, hours):
|
||||
self.hours = hours
|
||||
def publish(self, handler):
|
||||
if self.hours:
|
||||
handler.startElement("skipHours", self.element_attrs)
|
||||
for hour in self.hours:
|
||||
_element(handler, "hour", str(hour))
|
||||
handler.endElement("skipHours")
|
||||
|
||||
class SkipDays:
|
||||
"""Publish the skipDays
|
||||
|
||||
This takes a list of days as strings.
|
||||
"""
|
||||
element_attrs = {}
|
||||
def __init__(self, days):
|
||||
self.days = days
|
||||
def publish(self, handler):
|
||||
if self.days:
|
||||
handler.startElement("skipDays", self.element_attrs)
|
||||
for day in self.days:
|
||||
_element(handler, "day", day)
|
||||
handler.endElement("skipDays")
|
||||
|
||||
class RSS2(WriteXmlMixin):
|
||||
"""The main RSS class.
|
||||
|
||||
Stores the channel attributes, with the "category" elements under
|
||||
".categories" and the RSS items under ".items".
|
||||
"""
|
||||
|
||||
rss_attrs = {"version": "2.0"}
|
||||
element_attrs = {}
|
||||
def __init__(self,
|
||||
title,
|
||||
link,
|
||||
description,
|
||||
|
||||
language = None,
|
||||
copyright = None,
|
||||
managingEditor = None,
|
||||
webMaster = None,
|
||||
pubDate = None, # a datetime, *in* *GMT*
|
||||
lastBuildDate = None, # a datetime
|
||||
|
||||
categories = None, # list of strings or Category
|
||||
generator = _generator_name,
|
||||
docs = "http://blogs.law.harvard.edu/tech/rss",
|
||||
cloud = None, # a Cloud
|
||||
ttl = None, # integer number of minutes
|
||||
|
||||
image = None, # an Image
|
||||
rating = None, # a string; I don't know how it's used
|
||||
textInput = None, # a TextInput
|
||||
skipHours = None, # a SkipHours with a list of integers
|
||||
skipDays = None, # a SkipDays with a list of strings
|
||||
|
||||
items = None, # list of RSSItems
|
||||
):
|
||||
self.title = title
|
||||
self.link = link
|
||||
self.description = description
|
||||
self.language = language
|
||||
self.copyright = copyright
|
||||
self.managingEditor = managingEditor
|
||||
|
||||
self.webMaster = webMaster
|
||||
self.pubDate = pubDate
|
||||
self.lastBuildDate = lastBuildDate
|
||||
|
||||
if categories is None:
|
||||
categories = []
|
||||
self.categories = categories
|
||||
self.generator = generator
|
||||
self.docs = docs
|
||||
self.cloud = cloud
|
||||
self.ttl = ttl
|
||||
self.image = image
|
||||
self.rating = rating
|
||||
self.textInput = textInput
|
||||
self.skipHours = skipHours
|
||||
self.skipDays = skipDays
|
||||
|
||||
if items is None:
|
||||
items = []
|
||||
self.items = items
|
||||
|
||||
def publish(self, handler):
|
||||
handler.startElement("rss", self.rss_attrs)
|
||||
handler.startElement("channel", self.element_attrs)
|
||||
_element(handler, "title", self.title)
|
||||
_element(handler, "link", self.link)
|
||||
_element(handler, "description", self.description)
|
||||
|
||||
self.publish_extensions(handler)
|
||||
|
||||
_opt_element(handler, "language", self.language)
|
||||
_opt_element(handler, "copyright", self.copyright)
|
||||
_opt_element(handler, "managingEditor", self.managingEditor)
|
||||
_opt_element(handler, "webMaster", self.webMaster)
|
||||
|
||||
pubDate = self.pubDate
|
||||
if isinstance(pubDate, datetime.datetime):
|
||||
pubDate = DateElement("pubDate", pubDate)
|
||||
_opt_element(handler, "pubDate", pubDate)
|
||||
|
||||
lastBuildDate = self.lastBuildDate
|
||||
if isinstance(lastBuildDate, datetime.datetime):
|
||||
lastBuildDate = DateElement("lastBuildDate", lastBuildDate)
|
||||
_opt_element(handler, "lastBuildDate", lastBuildDate)
|
||||
|
||||
for category in self.categories:
|
||||
if isinstance(category, basestring):
|
||||
category = Category(category)
|
||||
category.publish(handler)
|
||||
|
||||
_opt_element(handler, "generator", self.generator)
|
||||
_opt_element(handler, "docs", self.docs)
|
||||
|
||||
if self.cloud is not None:
|
||||
self.cloud.publish(handler)
|
||||
|
||||
ttl = self.ttl
|
||||
if isinstance(self.ttl, int):
|
||||
ttl = IntElement("ttl", ttl)
|
||||
_opt_element(handler, "tt", ttl)
|
||||
|
||||
if self.image is not None:
|
||||
self.image.publish(handler)
|
||||
|
||||
_opt_element(handler, "rating", self.rating)
|
||||
if self.textInput is not None:
|
||||
self.textInput.publish(handler)
|
||||
if self.skipHours is not None:
|
||||
self.skipHours.publish(handler)
|
||||
if self.skipDays is not None:
|
||||
self.skipDays.publish(handler)
|
||||
|
||||
for item in self.items:
|
||||
item.publish(handler)
|
||||
|
||||
handler.endElement("channel")
|
||||
handler.endElement("rss")
|
||||
|
||||
def publish_extensions(self, handler):
|
||||
# Derived classes can hook into this to insert
|
||||
# output after the three required fields.
|
||||
pass
|
||||
|
||||
|
||||
|
||||
class RSSItem(WriteXmlMixin):
|
||||
"""Publish an RSS Item"""
|
||||
element_attrs = {}
|
||||
def __init__(self,
|
||||
title = None, # string
|
||||
link = None, # url as string
|
||||
description = None, # string
|
||||
author = None, # email address as string
|
||||
categories = None, # list of string or Category
|
||||
comments = None, # url as string
|
||||
enclosure = None, # an Enclosure
|
||||
guid = None, # a unique string
|
||||
pubDate = None, # a datetime
|
||||
source = None, # a Source
|
||||
):
|
||||
|
||||
if title is None and description is None:
|
||||
raise TypeError(
|
||||
"must define at least one of 'title' or 'description'")
|
||||
self.title = title
|
||||
self.link = link
|
||||
self.description = description
|
||||
self.author = author
|
||||
if categories is None:
|
||||
categories = []
|
||||
self.categories = categories
|
||||
self.comments = comments
|
||||
self.enclosure = enclosure
|
||||
self.guid = guid
|
||||
self.pubDate = pubDate
|
||||
self.source = source
|
||||
# It sure does get tedious typing these names three times...
|
||||
|
||||
def publish(self, handler):
|
||||
handler.startElement("item", self.element_attrs)
|
||||
_opt_element(handler, "title", self.title)
|
||||
_opt_element(handler, "link", self.link)
|
||||
self.publish_extensions(handler)
|
||||
_opt_element(handler, "description", self.description)
|
||||
_opt_element(handler, "author", self.author)
|
||||
|
||||
for category in self.categories:
|
||||
if isinstance(category, basestring):
|
||||
category = Category(category)
|
||||
category.publish(handler)
|
||||
|
||||
_opt_element(handler, "comments", self.comments)
|
||||
if self.enclosure is not None:
|
||||
self.enclosure.publish(handler)
|
||||
_opt_element(handler, "guid", self.guid)
|
||||
|
||||
pubDate = self.pubDate
|
||||
if isinstance(pubDate, datetime.datetime):
|
||||
pubDate = DateElement("pubDate", pubDate)
|
||||
_opt_element(handler, "pubDate", pubDate)
|
||||
|
||||
if self.source is not None:
|
||||
self.source.publish(handler)
|
||||
|
||||
handler.endElement("item")
|
||||
|
||||
def publish_extensions(self, handler):
|
||||
# Derived classes can hook into this to insert
|
||||
# output after the title and link elements
|
||||
pass
|
utils/S3.py (46 lines changed)

@@ -11,13 +11,13 @@

 import base64
 import hmac
-import httplib
+import http.client
 import re
 import sha
 import sys
 import time
-import urllib
-import urlparse
+import urllib.request, urllib.parse, urllib.error
+import urllib.parse
 import xml.sax

 DEFAULT_HOST = 's3.amazonaws.com'

@@ -34,13 +34,13 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
             interesting_headers[lk] = headers[header_key].strip()

     # these keys get empty strings if they don't exist
-    if not interesting_headers.has_key('content-type'):
+    if 'content-type' not in interesting_headers:
         interesting_headers['content-type'] = ''
-    if not interesting_headers.has_key('content-md5'):
+    if 'content-md5' not in interesting_headers:
         interesting_headers['content-md5'] = ''

     # just in case someone used this. it's not necessary in this lib.
-    if interesting_headers.has_key('x-amz-date'):
+    if 'x-amz-date' in interesting_headers:
         interesting_headers['date'] = ''

     # if you're using expires for query string auth, then it trumps date

@@ -48,7 +48,7 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
     if expires:
         interesting_headers['date'] = str(expires)

-    sorted_header_keys = interesting_headers.keys()
+    sorted_header_keys = list(interesting_headers.keys())
     sorted_header_keys.sort()

     buf = "%s\n" % method

@@ -63,17 +63,17 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
         buf += "/%s" % bucket

     # add the key. even if it doesn't exist, add the slash
-    buf += "/%s" % urllib.quote_plus(key)
+    buf += "/%s" % urllib.parse.quote_plus(key)

     # handle special query string arguments

-    if query_args.has_key("acl"):
+    if "acl" in query_args:
         buf += "?acl"
-    elif query_args.has_key("torrent"):
+    elif "torrent" in query_args:
         buf += "?torrent"
-    elif query_args.has_key("logging"):
+    elif "logging" in query_args:
         buf += "?logging"
-    elif query_args.has_key("location"):
+    elif "location" in query_args:
         buf += "?location"

     return buf

@@ -83,13 +83,13 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
 def encode(aws_secret_access_key, str, urlencode=False):
     b64_hmac = base64.encodestring(hmac.new(aws_secret_access_key, str, sha).digest()).strip()
     if urlencode:
-        return urllib.quote_plus(b64_hmac)
+        return urllib.parse.quote_plus(b64_hmac)
     else:
         return b64_hmac

 def merge_meta(headers, metadata):
     final_headers = headers.copy()
-    for k in metadata.keys():
+    for k in list(metadata.keys()):
         final_headers[METADATA_PREFIX + k] = metadata[k]

     return final_headers

@@ -98,10 +98,10 @@ def merge_meta(headers, metadata):
 def query_args_hash_to_string(query_args):
     query_string = ""
     pairs = []
-    for k, v in query_args.items():
+    for k, v in list(query_args.items()):
         piece = k
         if v != None:
-            piece += "=%s" % urllib.quote_plus(str(v))
+            piece += "=%s" % urllib.parse.quote_plus(str(v))
         pairs.append(piece)

     return '&'.join(pairs)

@@ -251,7 +251,7 @@ class AWSAuthConnection:

         # add the slash after the bucket regardless
         # the key will be appended if it is non-empty
-        path += "/%s" % urllib.quote_plus(key)
+        path += "/%s" % urllib.parse.quote_plus(key)


         # build the path_argument string

@@ -264,9 +264,9 @@ class AWSAuthConnection:
         host = "%s:%d" % (server, self.port)
         while True:
             if (is_secure):
-                connection = httplib.HTTPSConnection(host)
+                connection = http.client.HTTPSConnection(host)
             else:
-                connection = httplib.HTTPConnection(host)
+                connection = http.client.HTTPConnection(host)

             final_headers = merge_meta(headers, metadata);
             # add auth header

@@ -283,7 +283,7 @@ class AWSAuthConnection:
             # (close connection)
             resp.read()
             scheme, host, path, params, query, fragment \
-                = urlparse.urlparse(location)
+                = urllib.parse.urlparse(location)
             if scheme == "http": is_secure = True
             elif scheme == "https": is_secure = False
             else: raise invalidURL("Not http/https: " + location)

@@ -291,7 +291,7 @@ class AWSAuthConnection:
             # retry with redirect

     def _add_aws_auth_header(self, headers, method, bucket, key, query_args):
-        if not headers.has_key('Date'):
+        if 'Date' not in headers:
             headers['Date'] = time.strftime("%a, %d %b %Y %X GMT", time.gmtime())

         c_string = canonical_string(method, bucket, key, query_args, headers)

@@ -400,7 +400,7 @@ class QueryStringAuthGenerator:

         url = CallingFormat.build_url_base(self.protocol, self.server, self.port, bucket, self.calling_format)

-        url += "/%s" % urllib.quote_plus(key)
+        url += "/%s" % urllib.parse.quote_plus(key)

         query_args['Signature'] = encoded_canonical
         query_args['Expires'] = expires

@@ -489,7 +489,7 @@ class GetResponse(Response):

     def get_aws_metadata(self, headers):
         metadata = {}
-        for hkey in headers.keys():
+        for hkey in list(headers.keys()):
             if hkey.lower().startswith(METADATA_PREFIX):
                 metadata[hkey[len(METADATA_PREFIX):]] = headers[hkey]
                 del headers[hkey]
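For reference, a small self-contained sketch of the Python 3 equivalents these S3.py hunks rely on: the `in` operator for dictionary membership instead of has_key(), urllib.parse for quoting, and http.client in place of httplib (the values below are placeholders, not S3 calls):

import urllib.parse
import http.client

headers = {"content-type": "text/plain"}

# dict.has_key() is gone; membership tests use the `in` operator.
if "content-md5" not in headers:
    headers["content-md5"] = ""

# urllib.quote_plus() moved to urllib.parse.quote_plus().
path = "/%s" % urllib.parse.quote_plus("my key/with spaces")

# httplib was renamed to http.client; nothing connects until request() is called.
connection = http.client.HTTPSConnection("s3.amazonaws.com")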
utils/S3.py.bak (new file, 617 lines)

@@ -0,0 +1,617 @@
#!/usr/bin/env python
|
||||
|
||||
# This software code is made available "AS IS" without warranties of any
|
||||
# kind. You may copy, display, modify and redistribute the software
|
||||
# code either by itself or as incorporated into your code; provided that
|
||||
# you do not remove any proprietary notices. Your use of this software
|
||||
# code is at your own risk and you waive any claim against Amazon
|
||||
# Digital Services, Inc. or its affiliates with respect to your use of
|
||||
# this software code. (c) 2006-2007 Amazon Digital Services, Inc. or its
|
||||
# affiliates.
|
||||
|
||||
import base64
|
||||
import hmac
|
||||
import httplib
|
||||
import re
|
||||
import sha
|
||||
import sys
|
||||
import time
|
||||
import urllib
|
||||
import urlparse
|
||||
import xml.sax
|
||||
|
||||
DEFAULT_HOST = 's3.amazonaws.com'
|
||||
PORTS_BY_SECURITY = { True: 443, False: 80 }
|
||||
METADATA_PREFIX = 'x-amz-meta-'
|
||||
AMAZON_HEADER_PREFIX = 'x-amz-'
|
||||
|
||||
# generates the aws canonical string for the given parameters
|
||||
def canonical_string(method, bucket="", key="", query_args={}, headers={}, expires=None):
|
||||
interesting_headers = {}
|
||||
for header_key in headers:
|
||||
lk = header_key.lower()
|
||||
if lk in ['content-md5', 'content-type', 'date'] or lk.startswith(AMAZON_HEADER_PREFIX):
|
||||
interesting_headers[lk] = headers[header_key].strip()
|
||||
|
||||
# these keys get empty strings if they don't exist
|
||||
if not interesting_headers.has_key('content-type'):
|
||||
interesting_headers['content-type'] = ''
|
||||
if not interesting_headers.has_key('content-md5'):
|
||||
interesting_headers['content-md5'] = ''
|
||||
|
||||
# just in case someone used this. it's not necessary in this lib.
|
||||
if interesting_headers.has_key('x-amz-date'):
|
||||
interesting_headers['date'] = ''
|
||||
|
||||
# if you're using expires for query string auth, then it trumps date
|
||||
# (and x-amz-date)
|
||||
if expires:
|
||||
interesting_headers['date'] = str(expires)
|
||||
|
||||
sorted_header_keys = interesting_headers.keys()
|
||||
sorted_header_keys.sort()
|
||||
|
||||
buf = "%s\n" % method
|
||||
for header_key in sorted_header_keys:
|
||||
if header_key.startswith(AMAZON_HEADER_PREFIX):
|
||||
buf += "%s:%s\n" % (header_key, interesting_headers[header_key])
|
||||
else:
|
||||
buf += "%s\n" % interesting_headers[header_key]
|
||||
|
||||
# append the bucket if it exists
|
||||
if bucket != "":
|
||||
buf += "/%s" % bucket
|
||||
|
||||
# add the key. even if it doesn't exist, add the slash
|
||||
buf += "/%s" % urllib.quote_plus(key)
|
||||
|
||||
# handle special query string arguments
|
||||
|
||||
if query_args.has_key("acl"):
|
||||
buf += "?acl"
|
||||
elif query_args.has_key("torrent"):
|
||||
buf += "?torrent"
|
||||
elif query_args.has_key("logging"):
|
||||
buf += "?logging"
|
||||
elif query_args.has_key("location"):
|
||||
buf += "?location"
|
||||
|
||||
return buf
|
||||
|
||||
# computes the base64'ed hmac-sha hash of the canonical string and the secret
|
||||
# access key, optionally urlencoding the result
|
||||
def encode(aws_secret_access_key, str, urlencode=False):
|
||||
b64_hmac = base64.encodestring(hmac.new(aws_secret_access_key, str, sha).digest()).strip()
|
||||
if urlencode:
|
||||
return urllib.quote_plus(b64_hmac)
|
||||
else:
|
||||
return b64_hmac
|
||||
|
||||
def merge_meta(headers, metadata):
|
||||
final_headers = headers.copy()
|
||||
for k in metadata.keys():
|
||||
final_headers[METADATA_PREFIX + k] = metadata[k]
|
||||
|
||||
return final_headers
|
||||
|
||||
# builds the query arg string
|
||||
def query_args_hash_to_string(query_args):
|
||||
query_string = ""
|
||||
pairs = []
|
||||
for k, v in query_args.items():
|
||||
piece = k
|
||||
if v != None:
|
||||
piece += "=%s" % urllib.quote_plus(str(v))
|
||||
pairs.append(piece)
|
||||
|
||||
return '&'.join(pairs)
|
||||
|
||||
|
||||
class CallingFormat:
|
||||
PATH = 1
|
||||
SUBDOMAIN = 2
|
||||
VANITY = 3
|
||||
|
||||
def build_url_base(protocol, server, port, bucket, calling_format):
|
||||
url_base = '%s://' % protocol
|
||||
|
||||
if bucket == '':
|
||||
url_base += server
|
||||
elif calling_format == CallingFormat.SUBDOMAIN:
|
||||
url_base += "%s.%s" % (bucket, server)
|
||||
elif calling_format == CallingFormat.VANITY:
|
||||
url_base += bucket
|
||||
else:
|
||||
url_base += server
|
||||
|
||||
url_base += ":%s" % port
|
||||
|
||||
if (bucket != '') and (calling_format == CallingFormat.PATH):
|
||||
url_base += "/%s" % bucket
|
||||
|
||||
return url_base
|
||||
|
||||
build_url_base = staticmethod(build_url_base)
|
||||
|
||||
|
||||
|
||||
class Location:
|
||||
DEFAULT = None
|
||||
EU = 'EU'
|
||||
|
||||
|
||||
|
||||
class AWSAuthConnection:
|
||||
def __init__(self, aws_access_key_id, aws_secret_access_key, is_secure=True,
|
||||
server=DEFAULT_HOST, port=None, calling_format=CallingFormat.SUBDOMAIN):
|
||||
|
||||
if not port:
|
||||
port = PORTS_BY_SECURITY[is_secure]
|
||||
|
||||
self.aws_access_key_id = aws_access_key_id
|
||||
self.aws_secret_access_key = aws_secret_access_key
|
||||
self.is_secure = is_secure
|
||||
self.server = server
|
||||
self.port = port
|
||||
self.calling_format = calling_format
|
||||
|
||||
def create_bucket(self, bucket, headers={}):
|
||||
return Response(self._make_request('PUT', bucket, '', {}, headers))
|
||||
|
||||
def create_located_bucket(self, bucket, location=Location.DEFAULT, headers={}):
|
||||
if location == Location.DEFAULT:
|
||||
body = ""
|
||||
else:
|
||||
body = "<CreateBucketConstraint><LocationConstraint>" + \
|
||||
location + \
|
||||
"</LocationConstraint></CreateBucketConstraint>"
|
||||
return Response(self._make_request('PUT', bucket, '', {}, headers, body))
|
||||
|
||||
def check_bucket_exists(self, bucket):
|
||||
return self._make_request('HEAD', bucket, '', {}, {})
|
||||
|
||||
def list_bucket(self, bucket, options={}, headers={}):
|
||||
return ListBucketResponse(self._make_request('GET', bucket, '', options, headers))
|
||||
|
||||
def delete_bucket(self, bucket, headers={}):
|
||||
return Response(self._make_request('DELETE', bucket, '', {}, headers))
|
||||
|
||||
def put(self, bucket, key, object, headers={}):
|
||||
if not isinstance(object, S3Object):
|
||||
object = S3Object(object)
|
||||
|
||||
return Response(
|
||||
self._make_request(
|
||||
'PUT',
|
||||
bucket,
|
||||
key,
|
||||
{},
|
||||
headers,
|
||||
object.data,
|
||||
object.metadata))
|
||||
|
||||
def get(self, bucket, key, headers={}):
|
||||
return GetResponse(
|
||||
self._make_request('GET', bucket, key, {}, headers))
|
||||
|
||||
def delete(self, bucket, key, headers={}):
|
||||
return Response(
|
||||
self._make_request('DELETE', bucket, key, {}, headers))
|
||||
|
||||
def get_bucket_logging(self, bucket, headers={}):
|
||||
return GetResponse(self._make_request('GET', bucket, '', { 'logging': None }, headers))
|
||||
|
||||
def put_bucket_logging(self, bucket, logging_xml_doc, headers={}):
|
||||
return Response(self._make_request('PUT', bucket, '', { 'logging': None }, headers, logging_xml_doc))
|
||||
|
||||
def get_bucket_acl(self, bucket, headers={}):
|
||||
return self.get_acl(bucket, '', headers)
|
||||
|
||||
def get_acl(self, bucket, key, headers={}):
|
||||
return GetResponse(
|
||||
self._make_request('GET', bucket, key, { 'acl': None }, headers))
|
||||
|
||||
def put_bucket_acl(self, bucket, acl_xml_document, headers={}):
|
||||
return self.put_acl(bucket, '', acl_xml_document, headers)
|
||||
|
||||
def put_acl(self, bucket, key, acl_xml_document, headers={}):
|
||||
return Response(
|
||||
self._make_request(
|
||||
'PUT',
|
||||
bucket,
|
||||
key,
|
||||
{ 'acl': None },
|
||||
headers,
|
||||
acl_xml_document))
|
||||
|
||||
def list_all_my_buckets(self, headers={}):
|
||||
return ListAllMyBucketsResponse(self._make_request('GET', '', '', {}, headers))
|
||||
|
||||
def get_bucket_location(self, bucket):
|
||||
return LocationResponse(self._make_request('GET', bucket, '', {'location' : None}))
|
||||
|
||||
# end public methods
|
||||
|
||||
def _make_request(self, method, bucket='', key='', query_args={}, headers={}, data='', metadata={}):
|
||||
|
||||
server = ''
|
||||
if bucket == '':
|
||||
server = self.server
|
||||
elif self.calling_format == CallingFormat.SUBDOMAIN:
|
||||
server = "%s.%s" % (bucket, self.server)
|
||||
elif self.calling_format == CallingFormat.VANITY:
|
||||
server = bucket
|
||||
else:
|
||||
server = self.server
|
||||
|
||||
path = ''
|
||||
|
||||
if (bucket != '') and (self.calling_format == CallingFormat.PATH):
|
||||
path += "/%s" % bucket
|
||||
|
||||
# add the slash after the bucket regardless
|
||||
# the key will be appended if it is non-empty
|
||||
path += "/%s" % urllib.quote_plus(key)
|
||||
|
||||
|
||||
# build the path_argument string
|
||||
# add the ? in all cases since
|
||||
# signature and credentials follow path args
|
||||
if len(query_args):
|
||||
path += "?" + query_args_hash_to_string(query_args)
|
||||
|
||||
is_secure = self.is_secure
|
||||
host = "%s:%d" % (server, self.port)
|
||||
while True:
|
||||
if (is_secure):
|
||||
connection = httplib.HTTPSConnection(host)
|
||||
else:
|
||||
connection = httplib.HTTPConnection(host)
|
||||
|
||||
final_headers = merge_meta(headers, metadata);
|
||||
# add auth header
|
||||
self._add_aws_auth_header(final_headers, method, bucket, key, query_args)
|
||||
|
||||
connection.request(method, path, data, final_headers)
|
||||
resp = connection.getresponse()
|
||||
if resp.status < 300 or resp.status >= 400:
|
||||
return resp
|
||||
# handle redirect
|
||||
location = resp.getheader('location')
|
||||
if not location:
|
||||
return resp
|
||||
# (close connection)
|
||||
resp.read()
|
||||
scheme, host, path, params, query, fragment \
|
||||
= urlparse.urlparse(location)
|
||||
if scheme == "http": is_secure = True
|
||||
elif scheme == "https": is_secure = False
|
||||
else: raise invalidURL("Not http/https: " + location)
|
||||
if query: path += "?" + query
|
||||
# retry with redirect
|
||||
|
||||
def _add_aws_auth_header(self, headers, method, bucket, key, query_args):
|
||||
if not headers.has_key('Date'):
|
||||
headers['Date'] = time.strftime("%a, %d %b %Y %X GMT", time.gmtime())
|
||||
|
||||
c_string = canonical_string(method, bucket, key, query_args, headers)
|
||||
headers['Authorization'] = \
|
||||
"AWS %s:%s" % (self.aws_access_key_id, encode(self.aws_secret_access_key, c_string))
|
||||
|
||||
|
||||
class QueryStringAuthGenerator:
|
||||
# by default, expire in 1 minute
|
||||
DEFAULT_EXPIRES_IN = 60
|
||||
|
||||
def __init__(self, aws_access_key_id, aws_secret_access_key, is_secure=True,
|
||||
server=DEFAULT_HOST, port=None, calling_format=CallingFormat.SUBDOMAIN):
|
||||
|
||||
if not port:
|
||||
port = PORTS_BY_SECURITY[is_secure]
|
||||
|
||||
self.aws_access_key_id = aws_access_key_id
|
||||
self.aws_secret_access_key = aws_secret_access_key
|
||||
if (is_secure):
|
||||
self.protocol = 'https'
|
||||
else:
|
||||
self.protocol = 'http'
|
||||
|
||||
self.is_secure = is_secure
|
||||
self.server = server
|
||||
self.port = port
|
||||
self.calling_format = calling_format
|
||||
self.__expires_in = QueryStringAuthGenerator.DEFAULT_EXPIRES_IN
|
||||
self.__expires = None
|
||||
|
||||
# for backwards compatibility with older versions
|
||||
self.server_name = "%s:%s" % (self.server, self.port)
|
||||
|
||||
def set_expires_in(self, expires_in):
|
||||
self.__expires_in = expires_in
|
||||
self.__expires = None
|
||||
|
||||
def set_expires(self, expires):
|
||||
self.__expires = expires
|
||||
self.__expires_in = None
|
||||
|
||||
def create_bucket(self, bucket, headers={}):
|
||||
return self.generate_url('PUT', bucket, '', {}, headers)
|
||||
|
||||
def list_bucket(self, bucket, options={}, headers={}):
|
||||
return self.generate_url('GET', bucket, '', options, headers)
|
||||
|
||||
def delete_bucket(self, bucket, headers={}):
|
||||
return self.generate_url('DELETE', bucket, '', {}, headers)
|
||||
|
||||
def put(self, bucket, key, object, headers={}):
|
||||
if not isinstance(object, S3Object):
|
||||
object = S3Object(object)
|
||||
|
||||
return self.generate_url(
|
||||
'PUT',
|
||||
bucket,
|
||||
key,
|
||||
{},
|
||||
merge_meta(headers, object.metadata))
|
||||
|
||||
def get(self, bucket, key, headers={}):
|
||||
return self.generate_url('GET', bucket, key, {}, headers)
|
||||
|
||||
def delete(self, bucket, key, headers={}):
|
||||
return self.generate_url('DELETE', bucket, key, {}, headers)
|
||||
|
||||
def get_bucket_logging(self, bucket, headers={}):
|
||||
return self.generate_url('GET', bucket, '', { 'logging': None }, headers)
|
||||
|
||||
def put_bucket_logging(self, bucket, logging_xml_doc, headers={}):
|
||||
return self.generate_url('PUT', bucket, '', { 'logging': None }, headers)
|
||||
|
||||
def get_bucket_acl(self, bucket, headers={}):
|
||||
return self.get_acl(bucket, '', headers)
|
||||
|
||||
def get_acl(self, bucket, key='', headers={}):
|
||||
return self.generate_url('GET', bucket, key, { 'acl': None }, headers)
|
||||
|
||||
def put_bucket_acl(self, bucket, acl_xml_document, headers={}):
|
||||
return self.put_acl(bucket, '', acl_xml_document, headers)
|
||||
|
||||
# don't really care what the doc is here.
|
||||
def put_acl(self, bucket, key, acl_xml_document, headers={}):
|
||||
return self.generate_url('PUT', bucket, key, { 'acl': None }, headers)
|
||||
|
||||
def list_all_my_buckets(self, headers={}):
|
||||
return self.generate_url('GET', '', '', {}, headers)
|
||||
|
||||
def make_bare_url(self, bucket, key=''):
|
||||
full_url = self.generate_url(self, bucket, key)
|
||||
return full_url[:full_url.index('?')]
|
||||
|
||||
def generate_url(self, method, bucket='', key='', query_args={}, headers={}):
|
||||
expires = 0
|
||||
if self.__expires_in != None:
|
||||
expires = int(time.time() + self.__expires_in)
|
||||
elif self.__expires != None:
|
||||
expires = int(self.__expires)
|
||||
else:
|
||||
raise "Invalid expires state"
|
||||
|
||||
canonical_str = canonical_string(method, bucket, key, query_args, headers, expires)
|
||||
encoded_canonical = encode(self.aws_secret_access_key, canonical_str)
|
||||
|
||||
url = CallingFormat.build_url_base(self.protocol, self.server, self.port, bucket, self.calling_format)
|
||||
|
||||
url += "/%s" % urllib.quote_plus(key)
|
||||
|
||||
query_args['Signature'] = encoded_canonical
|
||||
query_args['Expires'] = expires
|
||||
query_args['AWSAccessKeyId'] = self.aws_access_key_id
|
||||
|
||||
url += "?%s" % query_args_hash_to_string(query_args)
|
||||
|
||||
return url
|
||||
|
||||
|
||||
class S3Object:
|
||||
def __init__(self, data, metadata={}):
|
||||
self.data = data
|
||||
self.metadata = metadata
|
||||
|
||||
class Owner:
|
||||
def __init__(self, id='', display_name=''):
|
||||
self.id = id
|
||||
self.display_name = display_name
|
||||
|
||||
class ListEntry:
|
||||
def __init__(self, key='', last_modified=None, etag='', size=0, storage_class='', owner=None):
|
||||
self.key = key
|
||||
self.last_modified = last_modified
|
||||
self.etag = etag
|
||||
self.size = size
|
||||
self.storage_class = storage_class
|
||||
self.owner = owner
|
||||
|
||||
class CommonPrefixEntry:
|
||||
def __init(self, prefix=''):
|
||||
self.prefix = prefix
|
||||
|
||||
class Bucket:
|
||||
def __init__(self, name='', creation_date=''):
|
||||
self.name = name
|
||||
self.creation_date = creation_date
|
||||
|
||||
class Response:
|
||||
def __init__(self, http_response):
|
||||
self.http_response = http_response
|
||||
# you have to do this read, even if you don't expect a body.
|
||||
# otherwise, the next request fails.
|
||||
self.body = http_response.read()
|
||||
if http_response.status >= 300 and self.body:
|
||||
self.message = self.body
|
||||
else:
|
||||
self.message = "%03d %s" % (http_response.status, http_response.reason)
|
||||
|
||||
|
||||
|
||||
class ListBucketResponse(Response):
|
||||
def __init__(self, http_response):
|
||||
Response.__init__(self, http_response)
|
||||
if http_response.status < 300:
|
||||
handler = ListBucketHandler()
|
||||
xml.sax.parseString(self.body, handler)
|
||||
self.entries = handler.entries
|
||||
self.common_prefixes = handler.common_prefixes
|
||||
self.name = handler.name
|
||||
self.marker = handler.marker
|
||||
self.prefix = handler.prefix
|
||||
self.is_truncated = handler.is_truncated
|
||||
self.delimiter = handler.delimiter
|
||||
self.max_keys = handler.max_keys
|
||||
self.next_marker = handler.next_marker
|
||||
else:
|
||||
self.entries = []
|
||||
|
||||
class ListAllMyBucketsResponse(Response):
|
||||
def __init__(self, http_response):
|
||||
Response.__init__(self, http_response)
|
||||
if http_response.status < 300:
|
||||
handler = ListAllMyBucketsHandler()
|
||||
xml.sax.parseString(self.body, handler)
|
||||
self.entries = handler.entries
|
||||
else:
|
||||
self.entries = []
|
||||
|
||||
class GetResponse(Response):
|
||||
def __init__(self, http_response):
|
||||
Response.__init__(self, http_response)
|
||||
response_headers = http_response.msg # older pythons don't have getheaders
|
||||
metadata = self.get_aws_metadata(response_headers)
|
||||
self.object = S3Object(self.body, metadata)
|
||||
|
||||
def get_aws_metadata(self, headers):
|
||||
metadata = {}
|
||||
for hkey in headers.keys():
|
||||
if hkey.lower().startswith(METADATA_PREFIX):
|
||||
metadata[hkey[len(METADATA_PREFIX):]] = headers[hkey]
|
||||
del headers[hkey]
|
||||
|
||||
return metadata
|
||||
|
||||
class LocationResponse(Response):
|
||||
def __init__(self, http_response):
|
||||
Response.__init__(self, http_response)
|
||||
if http_response.status < 300:
|
||||
handler = LocationHandler()
|
||||
xml.sax.parseString(self.body, handler)
|
||||
self.location = handler.location
|
||||
|
||||
class ListBucketHandler(xml.sax.ContentHandler):
|
||||
def __init__(self):
|
||||
self.entries = []
|
||||
self.curr_entry = None
|
||||
self.curr_text = ''
|
||||
self.common_prefixes = []
|
||||
self.curr_common_prefix = None
|
||||
self.name = ''
|
||||
self.marker = ''
|
||||
self.prefix = ''
|
||||
self.is_truncated = False
|
||||
self.delimiter = ''
|
||||
self.max_keys = 0
|
||||
self.next_marker = ''
|
||||
self.is_echoed_prefix_set = False
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
if name == 'Contents':
|
||||
self.curr_entry = ListEntry()
|
||||
elif name == 'Owner':
|
||||
self.curr_entry.owner = Owner()
|
||||
elif name == 'CommonPrefixes':
|
||||
self.curr_common_prefix = CommonPrefixEntry()
|
||||
|
||||
|
||||
def endElement(self, name):
|
||||
if name == 'Contents':
|
||||
self.entries.append(self.curr_entry)
|
||||
elif name == 'CommonPrefixes':
|
||||
self.common_prefixes.append(self.curr_common_prefix)
|
||||
elif name == 'Key':
|
||||
self.curr_entry.key = self.curr_text
|
||||
elif name == 'LastModified':
|
||||
self.curr_entry.last_modified = self.curr_text
|
||||
elif name == 'ETag':
|
||||
self.curr_entry.etag = self.curr_text
|
||||
elif name == 'Size':
|
||||
self.curr_entry.size = int(self.curr_text)
|
||||
elif name == 'ID':
|
||||
self.curr_entry.owner.id = self.curr_text
|
||||
elif name == 'DisplayName':
|
||||
self.curr_entry.owner.display_name = self.curr_text
|
||||
elif name == 'StorageClass':
|
||||
self.curr_entry.storage_class = self.curr_text
|
||||
elif name == 'Name':
|
||||
self.name = self.curr_text
|
||||
elif name == 'Prefix' and self.is_echoed_prefix_set:
|
||||
self.curr_common_prefix.prefix = self.curr_text
|
||||
elif name == 'Prefix':
|
||||
self.prefix = self.curr_text
|
||||
self.is_echoed_prefix_set = True
|
||||
elif name == 'Marker':
|
||||
self.marker = self.curr_text
|
||||
elif name == 'IsTruncated':
|
||||
self.is_truncated = self.curr_text == 'true'
|
||||
elif name == 'Delimiter':
|
||||
self.delimiter = self.curr_text
|
||||
elif name == 'MaxKeys':
|
||||
self.max_keys = int(self.curr_text)
|
||||
elif name == 'NextMarker':
|
||||
self.next_marker = self.curr_text
|
||||
|
||||
self.curr_text = ''
|
||||
|
||||
def characters(self, content):
|
||||
self.curr_text += content
|
||||
|
||||
|
||||
class ListAllMyBucketsHandler(xml.sax.ContentHandler):
|
||||
def __init__(self):
|
||||
self.entries = []
|
||||
self.curr_entry = None
|
||||
self.curr_text = ''
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
if name == 'Bucket':
|
||||
self.curr_entry = Bucket()
|
||||
|
||||
def endElement(self, name):
|
||||
if name == 'Name':
|
||||
self.curr_entry.name = self.curr_text
|
||||
elif name == 'CreationDate':
|
||||
self.curr_entry.creation_date = self.curr_text
|
||||
elif name == 'Bucket':
|
||||
self.entries.append(self.curr_entry)
|
||||
|
||||
def characters(self, content):
|
||||
self.curr_text = content
|
||||
|
||||
|
||||
class LocationHandler(xml.sax.ContentHandler):
|
||||
def __init__(self):
|
||||
self.location = None
|
||||
self.state = 'init'
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
if self.state == 'init':
|
||||
if name == 'LocationConstraint':
|
||||
self.state = 'tag_location'
|
||||
self.location = ''
|
||||
else: self.state = 'bad'
|
||||
else: self.state = 'bad'
|
||||
|
||||
def endElement(self, name):
|
||||
if self.state == 'tag_location' and name == 'LocationConstraint':
|
||||
self.state = 'done'
|
||||
else: self.state = 'bad'
|
||||
|
||||
def characters(self, content):
|
||||
if self.state == 'tag_location':
|
||||
self.location += content
|
utils/exception_middleware.py

@@ -6,11 +6,11 @@ from pprint import pprint
 class ConsoleExceptionMiddleware:
     def process_exception(self, request, exception):
         exc_info = sys.exc_info()
-        print "######################## Exception #############################"
-        print '\n'.join(traceback.format_exception(*(exc_info or sys.exc_info())))
-        print "----------------------------------------------------------------"
+        print("######################## Exception #############################")
+        print(('\n'.join(traceback.format_exception(*(exc_info or sys.exc_info())))))
+        print("----------------------------------------------------------------")
         pprint(inspect.trace()[-1][0].f_locals)
-        print "################################################################"
+        print("################################################################")

         #pprint(request)
         #print "################################################################"
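The only change in this file is the print statement becoming the print() function. A tiny runnable illustration of the converted pattern (assumed example, not NewsBlur code):

import sys
import traceback

try:
    1 / 0
except ZeroDivisionError:
    exc_info = sys.exc_info()
    # print is a function in Python 3, so every bare `print "..."` needs parentheses.
    print("######################## Exception #############################")
    print('\n'.join(traceback.format_exception(*exc_info)))
    print("----------------------------------------------------------------")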
utils/exception_middleware.py.bak (new file, 16 lines)

@@ -0,0 +1,16 @@
import traceback
import sys
import inspect
from pprint import pprint

class ConsoleExceptionMiddleware:
    def process_exception(self, request, exception):
        exc_info = sys.exc_info()
        print("######################## Exception #############################")
        print('\n'.join(traceback.format_exception(*(exc_info or sys.exc_info()))))
        print("----------------------------------------------------------------")
        pprint(inspect.trace()[-1][0].f_locals)
        print("################################################################")

        #pprint(request)
        #print "################################################################"
utils/facebook_fetcher.py

@@ -86,13 +86,13 @@ class FacebookFetcher:
             social_services = MSocialServices.get_user(self.options.get('requesting_user_id'))
             facebook_api = social_services.facebook_api()
             if not facebook_api:
-                logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
+                logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
                               (self.feed.log_title[:30], self.feed.feed_address, self.options))
                 return
         else:
             usersubs = UserSubscription.objects.filter(feed=self.feed)
             if not usersubs:
-                logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' %
+                logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' %
                               (self.feed.log_title[:30], self.feed.feed_address))
                 return

@@ -108,7 +108,7 @@ class FacebookFetcher:
                 break

         if not facebook_api:
-            logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
+            logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
                           (self.feed.log_title[:30], self.feed.feed_address, usersubs[0].user.username))
             return

@@ -117,10 +117,10 @@ class FacebookFetcher:
     def fetch_page_feed(self, facebook_user, page, fields):
         try:
             stories = facebook_user.get_object(page, fields=fields)
-        except GraphAPIError, e:
+        except GraphAPIError as e:
             message = str(e).lower()
             if 'session has expired' in message:
-                logging.debug(u' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' %
+                logging.debug(' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' %
                               (self.feed.log_title[:30], self.feed.feed_address, e))
                 self.feed.save_feed_history(560, "Facebook Error: Expired token")
                 return {}

@@ -137,7 +137,7 @@ class FacebookFetcher:
             return
         message = linebreaks(page_story['message'])
         created_date = page_story['created_time']
-        if isinstance(created_date, unicode):
+        if isinstance(created_date, str):
             created_date = dateutil.parser.parse(created_date)
         fields = facebook_user.get_object(page_story['id'], fields='permalink_url,link,attachments')
         permalink = fields.get('link', fields['permalink_url'])

@@ -175,7 +175,7 @@ class FacebookFetcher:
             return
         message = linebreaks(page_story['description'])
         created_date = page_story['updated_time']
-        if isinstance(created_date, unicode):
+        if isinstance(created_date, str):
             created_date = dateutil.parser.parse(created_date)
         permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url']
         embed_html = facebook_user.get_object(page_story['id'], fields='embed_html')

@@ -206,16 +206,16 @@ class FacebookFetcher:
         page_name = self.extract_page_name()
         facebook_user = self.facebook_user()
         if not facebook_user:
-            logging.debug(u' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' %
+            logging.debug(' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' %
                           (self.feed.log_title[:30], self.feed.feed_address))
             return

         try:
             picture_data = facebook_user.get_object(page_name, fields='picture')
-        except GraphAPIError, e:
+        except GraphAPIError as e:
             message = str(e).lower()
             if 'session has expired' in message:
-                logging.debug(u' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' %
+                logging.debug(' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' %
                               (self.feed.log_title[:30], self.feed.feed_address, e))
                 return
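A standalone sketch of the two syntax changes in these hunks, `except ... as e` and the removal of the `unicode` type (GraphAPIError is stubbed here; the real class lives in the vendored facebook module):

class GraphAPIError(Exception):
    """Stand-in for the vendored facebook.GraphAPIError (illustrative only)."""

def fetch_page(page):
    try:
        raise GraphAPIError("Error validating access token: session has expired")
    except GraphAPIError as e:  # Python 2's `except GraphAPIError, e:` is a syntax error in 3
        if 'session has expired' in str(e).lower():
            return {}

print(fetch_page("tastyvegetarian"))  # -> {}

# There is no `unicode` type in Python 3; text is str, so
# isinstance(created_date, unicode) becomes isinstance(created_date, str).
created_date = "2020-01-01T00:00:00+0000"
assert isinstance(created_date, str)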
utils/facebook_fetcher.py.bak (new file, 224 lines)

@@ -0,0 +1,224 @@
import re
|
||||
import datetime
|
||||
import dateutil.parser
|
||||
from django.conf import settings
|
||||
from django.utils import feedgenerator
|
||||
from django.utils.html import linebreaks
|
||||
from apps.social.models import MSocialServices
|
||||
from apps.reader.models import UserSubscription
|
||||
from utils import log as logging
|
||||
from vendor.facebook import GraphAPIError
|
||||
|
||||
class FacebookFetcher:
|
||||
|
||||
def __init__(self, feed, options=None):
|
||||
self.feed = feed
|
||||
self.options = options or {}
|
||||
|
||||
def fetch(self):
|
||||
page_name = self.extract_page_name()
|
||||
if not page_name:
|
||||
return
|
||||
|
||||
facebook_user = self.facebook_user()
|
||||
if not facebook_user:
|
||||
return
|
||||
|
||||
# If 'video', use video API to get embed:
|
||||
# f.get_object('tastyvegetarian', fields='posts')
|
||||
# f.get_object('1992797300790726', fields='embed_html')
|
||||
feed = self.fetch_page_feed(facebook_user, page_name, 'name,about,posts,videos,photos')
|
||||
|
||||
data = {}
|
||||
data['title'] = feed.get('name', "%s on Facebook" % page_name)
|
||||
data['link'] = feed.get('link', "https://facebook.com/%s" % page_name)
|
||||
data['description'] = feed.get('about', "%s on Facebook" % page_name)
|
||||
data['lastBuildDate'] = datetime.datetime.utcnow()
|
||||
data['generator'] = 'NewsBlur Facebook API Decrapifier - %s' % settings.NEWSBLUR_URL
|
||||
data['docs'] = None
|
||||
data['feed_url'] = self.feed.feed_address
|
||||
rss = feedgenerator.Atom1Feed(**data)
|
||||
merged_data = []
|
||||
|
||||
posts = feed.get('posts', {}).get('data', None)
|
||||
if posts:
|
||||
for post in posts:
|
||||
story_data = self.page_posts_story(facebook_user, post)
|
||||
if not story_data:
|
||||
continue
|
||||
merged_data.append(story_data)
|
||||
|
||||
videos = feed.get('videos', {}).get('data', None)
|
||||
if videos:
|
||||
for video in videos:
|
||||
story_data = self.page_video_story(facebook_user, video)
|
||||
if not story_data:
|
||||
continue
|
||||
for seen_data in merged_data:
|
||||
if story_data['link'] == seen_data['link']:
|
||||
# Video wins over posts (and attachments)
|
||||
seen_data['description'] = story_data['description']
|
||||
seen_data['title'] = story_data['title']
|
||||
break
|
||||
|
||||
for story_data in merged_data:
|
||||
rss.add_item(**story_data)
|
||||
|
||||
return rss.writeString('utf-8')
|
||||
|
||||
def extract_page_name(self):
|
||||
page = None
|
||||
try:
|
||||
page_groups = re.search('facebook.com/(\w+)/?', self.feed.feed_address)
|
||||
if not page_groups:
|
||||
return
|
||||
page = page_groups.group(1)
|
||||
except IndexError:
|
||||
return
|
||||
|
||||
return page
|
||||
|
||||
def facebook_user(self):
|
||||
facebook_api = None
|
||||
social_services = None
|
||||
|
||||
if self.options.get('requesting_user_id', None):
|
||||
social_services = MSocialServices.get_user(self.options.get('requesting_user_id'))
|
||||
facebook_api = social_services.facebook_api()
|
||||
if not facebook_api:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
|
||||
(self.feed.log_title[:30], self.feed.feed_address, self.options))
|
||||
return
|
||||
else:
|
||||
usersubs = UserSubscription.objects.filter(feed=self.feed)
|
||||
if not usersubs:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' %
|
||||
(self.feed.log_title[:30], self.feed.feed_address))
|
||||
return
|
||||
|
||||
for sub in usersubs:
|
||||
social_services = MSocialServices.get_user(sub.user_id)
|
||||
if not social_services.facebook_uid:
|
||||
continue
|
||||
|
||||
facebook_api = social_services.facebook_api()
|
||||
if not facebook_api:
|
||||
continue
|
||||
else:
|
||||
break
|
||||
|
||||
if not facebook_api:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
|
||||
(self.feed.log_title[:30], self.feed.feed_address, usersubs[0].user.username))
|
||||
return
|
||||
|
||||
return facebook_api
|
||||
|
||||
def fetch_page_feed(self, facebook_user, page, fields):
|
||||
try:
|
||||
stories = facebook_user.get_object(page, fields=fields)
|
||||
except GraphAPIError, e:
|
||||
message = str(e).lower()
|
||||
if 'session has expired' in message:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' %
|
||||
(self.feed.log_title[:30], self.feed.feed_address, e))
|
||||
self.feed.save_feed_history(560, "Facebook Error: Expired token")
|
||||
return {}
|
||||
|
||||
if not stories:
|
||||
return {}
|
||||
|
||||
return stories
|
||||
|
||||
def page_posts_story(self, facebook_user, page_story):
|
||||
categories = set()
|
||||
if 'message' not in page_story:
|
||||
# Probably a story shared on the page's timeline, not a published story
|
||||
return
|
||||
message = linebreaks(page_story['message'])
|
||||
created_date = page_story['created_time']
|
||||
if isinstance(created_date, unicode):
|
||||
created_date = dateutil.parser.parse(created_date)
|
||||
fields = facebook_user.get_object(page_story['id'], fields='permalink_url,link,attachments')
|
||||
permalink = fields.get('link', fields['permalink_url'])
|
||||
attachments_html = ""
|
||||
if fields.get('attachments', None) and fields['attachments']['data']:
|
||||
for attachment in fields['attachments']['data']:
|
||||
if 'media' in attachment:
|
||||
attachments_html += "<img src=\"%s\" />" % attachment['media']['image']['src']
|
||||
if attachment.get('subattachments', None):
|
||||
for subattachment in attachment['subattachments']['data']:
|
||||
attachments_html += "<img src=\"%s\" />" % subattachment['media']['image']['src']
|
||||
|
||||
content = """<div class="NB-facebook-rss">
|
||||
<div class="NB-facebook-rss-message">%s</div>
|
||||
<div class="NB-facebook-rss-picture">%s</div>
|
||||
</div>""" % (
|
||||
message,
|
||||
attachments_html
|
||||
)
|
||||
|
||||
story = {
|
||||
'title': message,
|
||||
'link': permalink,
|
||||
'description': content,
|
||||
'categories': list(categories),
|
||||
'unique_id': "fb_post:%s" % page_story['id'],
|
||||
'pubdate': created_date,
|
||||
}
|
||||
|
||||
return story
|
||||
|
||||
def page_video_story(self, facebook_user, page_story):
|
||||
categories = set()
|
||||
if 'description' not in page_story:
|
||||
return
|
||||
message = linebreaks(page_story['description'])
|
||||
created_date = page_story['updated_time']
|
||||
if isinstance(created_date, unicode):
|
||||
created_date = dateutil.parser.parse(created_date)
|
||||
permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url']
|
||||
embed_html = facebook_user.get_object(page_story['id'], fields='embed_html')
|
||||
|
||||
if permalink.startswith('/'):
|
||||
permalink = "https://www.facebook.com%s" % permalink
|
||||
|
||||
content = """<div class="NB-facebook-rss">
|
||||
<div class="NB-facebook-rss-message">%s</div>
|
||||
<div class="NB-facebook-rss-embed">%s</div>
|
||||
</div>""" % (
|
||||
message,
|
||||
embed_html.get('embed_html', '')
|
||||
)
|
||||
|
||||
story = {
|
||||
'title': page_story.get('story', message),
|
||||
'link': permalink,
|
||||
'description': content,
|
||||
'categories': list(categories),
|
||||
'unique_id': "fb_post:%s" % page_story['id'],
|
||||
'pubdate': created_date,
|
||||
}
|
||||
|
||||
return story
|
||||
|
||||
def favicon_url(self):
|
||||
page_name = self.extract_page_name()
|
||||
facebook_user = self.facebook_user()
|
||||
if not facebook_user:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' %
|
||||
(self.feed.log_title[:30], self.feed.feed_address))
|
||||
return
|
||||
|
||||
try:
|
||||
picture_data = facebook_user.get_object(page_name, fields='picture')
|
||||
except GraphAPIError, e:
|
||||
message = str(e).lower()
|
||||
if 'session has expired' in message:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' %
|
||||
(self.feed.log_title[:30], self.feed.feed_address, e))
|
||||
return
|
||||
|
||||
if 'picture' in picture_data:
|
||||
return picture_data['picture']['data']['url']
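
The facebook_fetcher context above still carries Python 2 idioms (isinstance(created_date, unicode), except GraphAPIError, e:). A minimal sketch of the Python 3 form of the date guard used in page_posts_story()/page_video_story(); the helper name coerce_created_date is invented for illustration, the date string is made up, and dateutil is assumed available since it is imported elsewhere in this module:

import dateutil.parser

def coerce_created_date(created_date):
    # Facebook's Graph API returns ISO 8601 strings; parse only when the
    # value has not already been converted to a datetime.
    if isinstance(created_date, str):  # Python 2 spelling: isinstance(..., unicode)
        created_date = dateutil.parser.parse(created_date)
    return created_date

print(coerce_created_date("2016-05-04T12:00:00+0000"))  # 2016-05-04 12:00:00+00:00
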
|
||||
|
|
@ -2,7 +2,7 @@ import time
|
|||
import datetime
|
||||
import traceback
|
||||
import multiprocessing
|
||||
import urllib2
|
||||
import urllib.request, urllib.error, urllib.parse
|
||||
import xml.sax
|
||||
import redis
|
||||
import random
|
||||
|
@ -11,7 +11,7 @@ import re
|
|||
import requests
|
||||
import dateutil.parser
|
||||
import isodate
|
||||
import urlparse
|
||||
import urllib.parse
|
||||
from django.conf import settings
|
||||
from django.db import IntegrityError
|
||||
from django.core.cache import cache
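
The import hunks above come from 2to3's urllib fixer, which splits Python 2's urllib2 and urlparse across urllib.request, urllib.error and urllib.parse. A rough, illustrative mapping; the example URL is made up:

import urllib.request, urllib.error, urllib.parse

# Python 2                      Python 3
# urllib2.urlopen(url)      ->  urllib.request.urlopen(url)
# urllib2.HTTPError         ->  urllib.error.HTTPError
# urlparse.urlparse(url)    ->  urllib.parse.urlparse(url)
# urlparse.parse_qs(query)  ->  urllib.parse.parse_qs(query)

parts = urllib.parse.urlparse("https://www.youtube.com/feeds/videos.xml?user=example")
print(urllib.parse.parse_qs(parts.query)["user"][0])  # -> "example"
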
|
||||
|
@ -42,7 +42,7 @@ from utils.json_fetcher import JSONFetcher
|
|||
# Refresh feed code adapted from Feedjack.
|
||||
# http://feedjack.googlecode.com
|
||||
|
||||
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
|
||||
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
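
In Python 3, range() returns a lazy sequence rather than a list, so 2to3 conservatively wraps it in list(); plain tuple unpacking of range(5) would also work here. A small illustration:

# range() is a lazy sequence object in Python 3, not a list.
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)  # unpacking still works
statuses = list(range(5))  # materialize only when an actual list is needed
assert statuses == [0, 1, 2, 3, 4] and FEED_ERREXC == 4
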
|
||||
|
||||
|
||||
class FetchFeed:
|
||||
|
@ -59,7 +59,7 @@ class FetchFeed:
|
|||
"""
|
||||
start = time.time()
|
||||
identity = self.get_identity()
|
||||
log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
|
||||
log_msg = '%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
|
||||
self.feed.log_title[:30],
|
||||
self.feed.id,
|
||||
datetime.datetime.now() - self.feed.last_update)
|
||||
|
@ -74,19 +74,19 @@ class FetchFeed:
|
|||
modified = None
|
||||
etag = None
|
||||
address = qurl(address, add={"_": random.randint(0, 10000)})
|
||||
logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
|
||||
logging.debug(' ---> [%-30s] ~FBForcing fetch: %s' % (
|
||||
self.feed.log_title[:30], address))
|
||||
elif (not self.feed.fetched_once or not self.feed.known_good):
|
||||
modified = None
|
||||
etag = None
|
||||
|
||||
if self.options.get('feed_xml'):
|
||||
logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
|
||||
logging.debug(' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
|
||||
self.feed.log_title[:30], len(self.options.get('feed_xml'))))
|
||||
|
||||
if self.options.get('fpf'):
|
||||
self.fpf = self.options.get('fpf')
|
||||
logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
|
||||
logging.debug(' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
|
||||
self.feed.log_title[:30]))
|
||||
return FEED_OK, self.fpf
|
||||
|
||||
|
@ -96,21 +96,21 @@ class FetchFeed:
|
|||
except (requests.adapters.ConnectionError):
|
||||
youtube_feed = None
|
||||
if not youtube_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
|
||||
logging.debug(' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(youtube_feed)
|
||||
elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])):
|
||||
twitter_feed = self.fetch_twitter(address)
|
||||
if not twitter_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' %
|
||||
logging.debug(' ***> [%-30s] ~FRTwitter fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(twitter_feed)
|
||||
elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
|
||||
facebook_feed = self.fetch_facebook()
|
||||
if not facebook_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' %
|
||||
logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(facebook_feed)
|
||||
|
@ -140,7 +140,7 @@ class FetchFeed:
|
|||
# JSON Feed
|
||||
json_feed = self.fetch_json_feed(address, raw_feed)
|
||||
if not json_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
|
||||
logging.debug(' ***> [%-30s] ~FRJSON fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(json_feed)
|
||||
|
@ -152,8 +152,8 @@ class FetchFeed:
|
|||
response_headers=response_headers)
|
||||
if self.options.get('debug', False):
|
||||
logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
|
||||
except Exception, e:
|
||||
logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))
|
||||
except Exception as e:
|
||||
logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], str(e)[:100]))
|
||||
|
||||
if not self.fpf or self.options.get('force_fp', False):
|
||||
try:
|
||||
|
@ -161,22 +161,22 @@ class FetchFeed:
|
|||
agent=self.feed.user_agent,
|
||||
etag=etag,
|
||||
modified=modified)
|
||||
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
|
||||
logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' %
|
||||
except (TypeError, ValueError, KeyError, EOFError, MemoryError) as e:
|
||||
logging.debug(' ***> [%-30s] ~FRFeed fetch error: %s' %
|
||||
(self.feed.log_title[:30], e))
|
||||
pass
|
||||
|
||||
if not self.fpf:
|
||||
try:
|
||||
logging.debug(u' ***> [%-30s] ~FRTurning off headers...' %
|
||||
logging.debug(' ***> [%-30s] ~FRTurning off headers...' %
|
||||
(self.feed.log_title[:30]))
|
||||
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
|
||||
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
|
||||
logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' %
|
||||
except (TypeError, ValueError, KeyError, EOFError, MemoryError) as e:
|
||||
logging.debug(' ***> [%-30s] ~FRFetch failed: %s.' %
|
||||
(self.feed.log_title[:30], e))
|
||||
return FEED_ERRHTTP, None
|
||||
|
||||
logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
|
||||
logging.debug(' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
|
||||
self.feed.log_title[:30], time.time() - start))
|
||||
|
||||
return FEED_OK, self.fpf
|
||||
|
@ -217,22 +217,22 @@ class FetchFeed:
|
|||
return
|
||||
elif 'youtube.com/feeds/videos.xml?user=' in address:
|
||||
try:
|
||||
username = urlparse.parse_qs(urlparse.urlparse(address).query)['user'][0]
|
||||
username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0]
|
||||
except IndexError:
|
||||
return
|
||||
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
|
||||
try:
|
||||
channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0]
|
||||
channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0]
|
||||
except (IndexError, KeyError):
|
||||
return
|
||||
elif 'youtube.com/playlist' in address:
|
||||
try:
|
||||
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0]
|
||||
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0]
|
||||
except IndexError:
|
||||
return
|
||||
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
|
||||
try:
|
||||
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0]
|
||||
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0]
|
||||
except IndexError:
|
||||
return
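
As in the import hunks, urlparse.parse_qs becomes urllib.parse.parse_qs here. Because parse_qs() returns a dict, a missing query parameter surfaces as KeyError rather than IndexError, which is why the channel_id branch also catches KeyError. An illustrative sketch with a made-up URL:

import urllib.parse

address = "https://www.youtube.com/feeds/videos.xml?user=someuser"
params = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)
try:
    channel_id = params['channel_id'][0]
except (IndexError, KeyError):
    channel_id = None
print(channel_id)  # None: this example URL carries no channel_id parameter
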
|
||||
|
||||
|
@ -365,7 +365,7 @@ class ProcessFeed:
|
|||
if hasattr(self.fpf, 'status'):
|
||||
if self.options['verbose']:
|
||||
if self.fpf.bozo and self.fpf.status != 304:
|
||||
logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
|
||||
logging.debug(' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
|
||||
self.feed.log_title[:30],
|
||||
self.fpf.bozo_exception,
|
||||
len(self.fpf.entries)))
|
||||
|
@ -452,7 +452,7 @@ class ProcessFeed:
|
|||
if hasattr(self.fpf, 'modified') and self.fpf.modified:
|
||||
try:
|
||||
self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
self.feed.last_modified = None
|
||||
logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
|
||||
pass
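
The strptime() call above parses an RFC 1123 Last-Modified value via a %Z directive. As an aside, and not what this module does, the stdlib email.utils helper parses the same header format without locale or %Z quirks; a minimal sketch with a made-up timestamp:

from email.utils import parsedate_to_datetime

modified = "Mon, 06 Jul 2020 12:30:00 GMT"   # made-up example value
try:
    last_modified = parsedate_to_datetime(modified)
except (TypeError, ValueError):
    last_modified = None
print(last_modified)  # 2020-07-06 12:30:00+00:00
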
|
||||
|
@ -510,16 +510,16 @@ class ProcessFeed:
|
|||
start_date = story.get('published')
|
||||
if replace_guids:
|
||||
if replace_permalinks:
|
||||
new_story_guid = unicode(story.get('published'))
|
||||
new_story_guid = str(story.get('published'))
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
|
||||
logging.debug(' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
|
||||
self.feed.log_title[:30],
|
||||
story.get('guid'), new_story_guid))
|
||||
story['guid'] = new_story_guid
|
||||
else:
|
||||
new_story_guid = Feed.get_permalink(story)
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
|
||||
logging.debug(' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
|
||||
self.feed.log_title[:30],
|
||||
story.get('guid'), new_story_guid))
|
||||
story['guid'] = new_story_guid
|
||||
|
@ -532,7 +532,7 @@ class ProcessFeed:
|
|||
story_hashes.extend(story_hashes_in_unread_cutoff)
|
||||
story_hashes = list(set(story_hashes))
|
||||
if self.options['verbose'] or settings.DEBUG:
|
||||
logging.debug(u' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (
|
||||
logging.debug(' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (
|
||||
self.feed.log_title[:30],
|
||||
original_story_hash_count, len(story_hashes)-original_story_hash_count,
|
||||
len(story_hashes_in_unread_cutoff)))
|
||||
|
@ -572,17 +572,17 @@ class ProcessFeed:
|
|||
if (hub_url and self_url and not settings.DEBUG and
|
||||
self.feed.active_subscribers > 0 and
|
||||
(push_expired or not self.feed.is_push or self.options.get('force'))):
|
||||
logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
|
||||
logging.debug(' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
|
||||
self.feed.log_title[:30],
|
||||
"~SKRe-~SN" if push_expired else "", hub_url))
|
||||
try:
|
||||
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
|
||||
except TimeoutError:
|
||||
logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
|
||||
logging.debug(' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
|
||||
self.feed.log_title[:30], hub_url))
|
||||
elif (self.feed.is_push and
|
||||
(self.feed.active_subscribers <= 0 or not hub_url)):
|
||||
logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
|
||||
logging.debug(' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
|
||||
self.feed.log_title[:30]))
|
||||
self.feed.is_push = False
|
||||
self.feed = self.feed.save()
|
||||
|
@ -592,7 +592,7 @@ class ProcessFeed:
|
|||
QueueNotifications.delay(self.feed.pk, ret_values['new'])
|
||||
|
||||
# All Done
|
||||
logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
|
||||
logging.debug(' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
|
||||
self.feed.log_title[:30],
|
||||
'~FG~SB' if ret_values['new'] else '', ret_values['new'],
|
||||
'~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
|
||||
|
@ -610,7 +610,7 @@ class ProcessFeed:
|
|||
self.feed.save_feed_history(200, "OK", date=fetch_date)
|
||||
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
|
||||
logging.debug(' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
|
||||
self.feed.log_title[:30], time.time() - start))
|
||||
|
||||
return FEED_OK, ret_values
|
||||
|
@ -717,28 +717,28 @@ class Dispatcher:
|
|||
except TimeoutError:
|
||||
logging.debug(' ---> [%-30s] Unread count took too long...' % (feed.log_title[:30],))
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
|
||||
logging.debug(' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
|
||||
feed.log_title[:30], time.time() - start))
|
||||
except urllib2.HTTPError, e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read()))
|
||||
except urllib.error.HTTPError as e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (str(feed_id)[:30], e.fp.read()))
|
||||
feed_code = e.code
|
||||
feed.save_feed_history(feed_code, e.msg, e.fp.read())
|
||||
fetched_feed = None
|
||||
except Feed.DoesNotExist, e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30]))
|
||||
except Feed.DoesNotExist as e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (str(feed_id)[:30]))
|
||||
continue
|
||||
except SoftTimeLimitExceeded, e:
|
||||
except SoftTimeLimitExceeded as e:
|
||||
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
|
||||
ret_feed = FEED_ERREXC
|
||||
fetched_feed = None
|
||||
feed_code = 559
|
||||
feed.save_feed_history(feed_code, 'Timeout', e)
|
||||
except TimeoutError, e:
|
||||
except TimeoutError as e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30]))
|
||||
feed_code = 505
|
||||
feed.save_feed_history(feed_code, 'Timeout', e)
|
||||
fetched_feed = None
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
|
@ -778,20 +778,20 @@ class Dispatcher:
|
|||
(ret_feed == FEED_OK or
|
||||
(ret_feed == FEED_SAME and feed.stories_last_month > 10)))):
|
||||
|
||||
logging.debug(u' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
|
||||
logging.debug(' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
|
||||
page_importer = PageImporter(feed)
|
||||
try:
|
||||
page_data = page_importer.fetch_page()
|
||||
page_duration = time.time() - start_duration
|
||||
except SoftTimeLimitExceeded, e:
|
||||
except SoftTimeLimitExceeded as e:
|
||||
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
|
||||
page_data = None
|
||||
feed.save_feed_history(557, 'Timeout', e)
|
||||
except TimeoutError, e:
|
||||
except TimeoutError as e:
|
||||
logging.debug(' ---> [%-30s] ~FRPage fetch timed out...' % (feed.log_title[:30]))
|
||||
page_data = None
|
||||
feed.save_page_history(555, 'Timeout', '')
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
|
@ -805,7 +805,7 @@ class Dispatcher:
|
|||
settings.RAVEN_CLIENT.captureException()
|
||||
|
||||
feed = self.refresh_feed(feed.pk)
|
||||
logging.debug(u' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
|
||||
logging.debug(' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
|
||||
force = self.options['force']
|
||||
if random.random() > .99:
|
||||
force = True
|
||||
|
@ -813,13 +813,13 @@ class Dispatcher:
|
|||
try:
|
||||
icon_importer.save()
|
||||
icon_duration = time.time() - start_duration
|
||||
except SoftTimeLimitExceeded, e:
|
||||
except SoftTimeLimitExceeded as e:
|
||||
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
|
||||
feed.save_feed_history(558, 'Timeout', e)
|
||||
except TimeoutError, e:
|
||||
except TimeoutError as e:
|
||||
logging.debug(' ---> [%-30s] ~FRIcon fetch timed out...' % (feed.log_title[:30]))
|
||||
feed.save_page_history(556, 'Timeout', '')
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
|
@ -830,7 +830,7 @@ class Dispatcher:
|
|||
settings.RAVEN_CLIENT):
|
||||
settings.RAVEN_CLIENT.captureException()
|
||||
else:
|
||||
logging.debug(u' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]'))
|
||||
logging.debug(' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]'))
|
||||
|
||||
feed = self.refresh_feed(feed.pk)
|
||||
delta = time.time() - start_time
|
||||
|
@ -845,7 +845,7 @@ class Dispatcher:
|
|||
if ret_entries and ret_entries['new']:
|
||||
self.publish_to_subscribers(feed, ret_entries['new'])
|
||||
|
||||
done_msg = (u'%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
|
||||
done_msg = ('%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
|
||||
identity, feed.log_title[:30], delta,
|
||||
feed.pk, self.feed_trans[ret_feed],))
|
||||
logging.debug(done_msg)
|
||||
|
@ -899,14 +899,14 @@ class Dispatcher:
|
|||
.read_preference(pymongo.ReadPreference.PRIMARY)
|
||||
missing_stories = Feed.format_stories(missing_stories, feed.pk)
|
||||
stories = missing_stories + stories
|
||||
logging.debug(u' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
|
||||
logging.debug(' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
|
||||
cache.set("S:%s" % feed.pk, stories, 60)
|
||||
logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
|
||||
logging.debug(' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
|
||||
feed.log_title[:30], len(stories), user_subs.count(),
|
||||
feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
|
||||
self.calculate_feed_scores_with_stories(user_subs, stories)
|
||||
elif self.options.get('mongodb_replication_lag'):
|
||||
logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
|
||||
logging.debug(' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
|
||||
feed.log_title[:30], self.options.get('mongodb_replication_lag')))
|
||||
|
||||
@timelimit(10)
|
||||
|
|
933
utils/feed_fetcher.py.bak
Normal file
|
@ -0,0 +1,933 @@
|
|||
import time
|
||||
import datetime
|
||||
import traceback
|
||||
import multiprocessing
|
||||
import urllib2
|
||||
import xml.sax
|
||||
import redis
|
||||
import random
|
||||
import pymongo
|
||||
import re
|
||||
import requests
|
||||
import dateutil.parser
|
||||
import isodate
|
||||
import urlparse
|
||||
from django.conf import settings
|
||||
from django.db import IntegrityError
|
||||
from django.core.cache import cache
|
||||
from apps.reader.models import UserSubscription
|
||||
from apps.rss_feeds.models import Feed, MStory
|
||||
from apps.rss_feeds.page_importer import PageImporter
|
||||
from apps.rss_feeds.icon_importer import IconImporter
|
||||
from apps.notifications.tasks import QueueNotifications, MUserFeedNotification
|
||||
from apps.push.models import PushSubscription
|
||||
from apps.statistics.models import MAnalyticsFetcher, MStatistics
|
||||
from utils import feedparser
|
||||
from utils.story_functions import pre_process_story, strip_tags, linkify
|
||||
from utils import log as logging
|
||||
from utils.feed_functions import timelimit, TimeoutError
|
||||
from qurl import qurl
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
from django.utils import feedgenerator
|
||||
from django.utils.html import linebreaks
|
||||
from django.utils.encoding import smart_unicode
|
||||
from utils import json_functions as json
|
||||
from celery.exceptions import SoftTimeLimitExceeded
|
||||
from utils.twitter_fetcher import TwitterFetcher
|
||||
from utils.facebook_fetcher import FacebookFetcher
|
||||
from utils.json_fetcher import JSONFetcher
|
||||
# from utils.feed_functions import mail_feed_error_to_admin
|
||||
|
||||
|
||||
# Refresh feed code adapted from Feedjack.
|
||||
# http://feedjack.googlecode.com
|
||||
|
||||
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
|
||||
|
||||
|
||||
class FetchFeed:
|
||||
def __init__(self, feed_id, options):
|
||||
self.feed = Feed.get_by_id(feed_id)
|
||||
self.options = options
|
||||
self.fpf = None
|
||||
self.raw_feed = None
|
||||
|
||||
@timelimit(30)
|
||||
def fetch(self):
|
||||
"""
|
||||
Uses requests to download the feed, parsing it in feedparser. Will be storified later.
|
||||
"""
|
||||
start = time.time()
|
||||
identity = self.get_identity()
|
||||
log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
|
||||
self.feed.log_title[:30],
|
||||
self.feed.id,
|
||||
datetime.datetime.now() - self.feed.last_update)
|
||||
logging.debug(log_msg)
|
||||
|
||||
etag = self.feed.etag
|
||||
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
|
||||
address = self.feed.feed_address
|
||||
|
||||
if (self.options.get('force') or random.random() <= .01):
|
||||
self.options['force'] = True
|
||||
modified = None
|
||||
etag = None
|
||||
address = qurl(address, add={"_": random.randint(0, 10000)})
|
||||
logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
|
||||
self.feed.log_title[:30], address))
|
||||
elif (not self.feed.fetched_once or not self.feed.known_good):
|
||||
modified = None
|
||||
etag = None
|
||||
|
||||
if self.options.get('feed_xml'):
|
||||
logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
|
||||
self.feed.log_title[:30], len(self.options.get('feed_xml'))))
|
||||
|
||||
if self.options.get('fpf'):
|
||||
self.fpf = self.options.get('fpf')
|
||||
logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
|
||||
self.feed.log_title[:30]))
|
||||
return FEED_OK, self.fpf
|
||||
|
||||
if 'youtube.com' in address:
|
||||
try:
|
||||
youtube_feed = self.fetch_youtube(address)
|
||||
except (requests.adapters.ConnectionError):
|
||||
youtube_feed = None
|
||||
if not youtube_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(youtube_feed)
|
||||
elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])):
|
||||
twitter_feed = self.fetch_twitter(address)
|
||||
if not twitter_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(twitter_feed)
|
||||
elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
|
||||
facebook_feed = self.fetch_facebook()
|
||||
if not facebook_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(facebook_feed)
|
||||
|
||||
if not self.fpf:
|
||||
try:
|
||||
headers = self.feed.fetch_headers()
|
||||
if etag:
|
||||
headers['If-None-Match'] = etag
|
||||
if modified:
|
||||
# format into an RFC 1123-compliant timestamp. We can't use
|
||||
# time.strftime() since the %a and %b directives can be affected
|
||||
# by the current locale, but RFC 2616 states that dates must be
|
||||
# in English.
|
||||
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
|
||||
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
|
||||
headers['If-Modified-Since'] = modified_header
|
||||
if etag or modified:
|
||||
headers['A-IM'] = 'feed'
|
||||
raw_feed = requests.get(address, headers=headers)
|
||||
if raw_feed.status_code >= 400:
|
||||
logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
|
||||
raw_feed = requests.get(self.feed.feed_address, headers=self.feed.fetch_headers(fake=True))
|
||||
|
||||
if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
|
||||
# JSON Feed
|
||||
json_feed = self.fetch_json_feed(address, raw_feed)
|
||||
if not json_feed:
|
||||
logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
|
||||
(self.feed.log_title[:30], address))
|
||||
return FEED_ERRHTTP, None
|
||||
self.fpf = feedparser.parse(json_feed)
|
||||
elif raw_feed.content and raw_feed.status_code < 400:
|
||||
response_headers = raw_feed.headers
|
||||
response_headers['Content-Location'] = raw_feed.url
|
||||
self.raw_feed = smart_unicode(raw_feed.content)
|
||||
self.fpf = feedparser.parse(self.raw_feed,
|
||||
response_headers=response_headers)
|
||||
if self.options.get('debug', False):
|
||||
logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
|
||||
except Exception, e:
|
||||
logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))
|
||||
|
||||
if not self.fpf or self.options.get('force_fp', False):
|
||||
try:
|
||||
self.fpf = feedparser.parse(address,
|
||||
agent=self.feed.user_agent,
|
||||
etag=etag,
|
||||
modified=modified)
|
||||
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
|
||||
logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' %
|
||||
(self.feed.log_title[:30], e))
|
||||
pass
|
||||
|
||||
if not self.fpf:
|
||||
try:
|
||||
logging.debug(u' ***> [%-30s] ~FRTurning off headers...' %
|
||||
(self.feed.log_title[:30]))
|
||||
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
|
||||
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
|
||||
logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' %
|
||||
(self.feed.log_title[:30], e))
|
||||
return FEED_ERRHTTP, None
|
||||
|
||||
logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
|
||||
self.feed.log_title[:30], time.time() - start))
|
||||
|
||||
return FEED_OK, self.fpf
|
||||
|
||||
def get_identity(self):
|
||||
identity = "X"
|
||||
|
||||
current_process = multiprocessing.current_process()
|
||||
if current_process._identity:
|
||||
identity = current_process._identity[0]
|
||||
|
||||
return identity
|
||||
|
||||
def fetch_twitter(self, address=None):
|
||||
twitter_fetcher = TwitterFetcher(self.feed, self.options)
|
||||
return twitter_fetcher.fetch(address)
|
||||
|
||||
def fetch_facebook(self):
|
||||
facebook_fetcher = FacebookFetcher(self.feed, self.options)
|
||||
return facebook_fetcher.fetch()
|
||||
|
||||
def fetch_json_feed(self, address, headers):
|
||||
json_fetcher = JSONFetcher(self.feed, self.options)
|
||||
return json_fetcher.fetch(address, headers)
|
||||
|
||||
def fetch_youtube(self, address):
|
||||
username = None
|
||||
channel_id = None
|
||||
list_id = None
|
||||
|
||||
if 'gdata.youtube.com' in address:
|
||||
try:
|
||||
username_groups = re.search('gdata.youtube.com/feeds/\w+/users/(\w+)/', address)
|
||||
if not username_groups:
|
||||
return
|
||||
username = username_groups.group(1)
|
||||
except IndexError:
|
||||
return
|
||||
elif 'youtube.com/feeds/videos.xml?user=' in address:
|
||||
try:
|
||||
username = urlparse.parse_qs(urlparse.urlparse(address).query)['user'][0]
|
||||
except IndexError:
|
||||
return
|
||||
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
|
||||
try:
|
||||
channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0]
|
||||
except (IndexError, KeyError):
|
||||
return
|
||||
elif 'youtube.com/playlist' in address:
|
||||
try:
|
||||
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0]
|
||||
except IndexError:
|
||||
return
|
||||
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
|
||||
try:
|
||||
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0]
|
||||
except IndexError:
|
||||
return
|
||||
|
||||
if channel_id:
|
||||
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id, verify=False)
|
||||
channel_json = requests.get("https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" %
|
||||
(channel_id, settings.YOUTUBE_API_KEY))
|
||||
channel = json.decode(channel_json.content)
|
||||
try:
|
||||
username = channel['items'][0]['snippet']['title']
|
||||
description = channel['items'][0]['snippet']['description']
|
||||
except (IndexError, KeyError):
|
||||
return
|
||||
elif list_id:
|
||||
playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" %
|
||||
(list_id, settings.YOUTUBE_API_KEY))
|
||||
playlist = json.decode(playlist_json.content)
|
||||
try:
|
||||
username = playlist['items'][0]['snippet']['title']
|
||||
description = playlist['items'][0]['snippet']['description']
|
||||
except (IndexError, KeyError):
|
||||
return
|
||||
channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
|
||||
elif username:
|
||||
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username, verify=False)
|
||||
description = "YouTube videos uploaded by %s" % username
|
||||
else:
|
||||
return
|
||||
|
||||
if list_id:
|
||||
playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" %
|
||||
(list_id, settings.YOUTUBE_API_KEY))
|
||||
playlist = json.decode(playlist_json.content)
|
||||
try:
|
||||
video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']]
|
||||
except (IndexError, KeyError):
|
||||
return
|
||||
else:
|
||||
if video_ids_xml.status_code != 200:
|
||||
return
|
||||
video_ids_soup = BeautifulSoup(video_ids_xml.content)
|
||||
channel_url = video_ids_soup.find('author').find('uri').getText()
|
||||
video_ids = []
|
||||
for video_id in video_ids_soup.findAll('yt:videoid'):
|
||||
video_ids.append(video_id.getText())
|
||||
|
||||
videos_json = requests.get("https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" %
|
||||
(','.join(video_ids), settings.YOUTUBE_API_KEY))
|
||||
videos = json.decode(videos_json.content)
|
||||
if 'error' in videos:
|
||||
logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
|
||||
return
|
||||
|
||||
data = {}
|
||||
data['title'] = ("%s's YouTube Videos" % username if 'Uploads' not in username else username)
|
||||
data['link'] = channel_url
|
||||
data['description'] = description
|
||||
data['lastBuildDate'] = datetime.datetime.utcnow()
|
||||
data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL
|
||||
data['docs'] = None
|
||||
data['feed_url'] = address
|
||||
rss = feedgenerator.Atom1Feed(**data)
|
||||
|
||||
for video in videos['items']:
|
||||
thumbnail = video['snippet']['thumbnails'].get('maxres')
|
||||
if not thumbnail:
|
||||
thumbnail = video['snippet']['thumbnails'].get('high')
|
||||
if not thumbnail:
|
||||
thumbnail = video['snippet']['thumbnails'].get('medium')
|
||||
duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds
|
||||
if duration_sec >= 3600:
|
||||
hours = (duration_sec / 3600)
|
||||
minutes = (duration_sec - (hours*3600)) / 60
|
||||
seconds = duration_sec - (hours*3600) - (minutes*60)
|
||||
duration = "%s:%s:%s" % (hours, '{0:02d}'.format(minutes), '{0:02d}'.format(seconds))
|
||||
else:
|
||||
minutes = duration_sec / 60
|
||||
seconds = duration_sec - (minutes*60)
|
||||
duration = "%s:%s" % ('{0:02d}'.format(minutes), '{0:02d}'.format(seconds))
|
||||
content = """<div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe></div>
|
||||
<div class="NB-youtube-stats"><small>
|
||||
<b>From:</b> <a href="%s">%s</a><br />
|
||||
<b>Duration:</b> %s<br />
|
||||
</small></div><hr>
|
||||
<div class="NB-youtube-description">%s</div>
|
||||
<img src="%s" style="display:none" />""" % (
|
||||
("https://www.youtube.com/embed/" + video['id']),
|
||||
channel_url, username,
|
||||
duration,
|
||||
linkify(linebreaks(video['snippet']['description'])),
|
||||
thumbnail['url'] if thumbnail else "",
|
||||
)
|
||||
|
||||
link = "http://www.youtube.com/watch?v=%s" % video['id']
|
||||
story_data = {
|
||||
'title': video['snippet']['title'],
|
||||
'link': link,
|
||||
'description': content,
|
||||
'author_name': username,
|
||||
'categories': [],
|
||||
'unique_id': "tag:youtube.com,2008:video:%s" % video['id'],
|
||||
'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']),
|
||||
}
|
||||
rss.add_item(**story_data)
|
||||
|
||||
return rss.writeString('utf-8')
|
||||
|
||||
|
||||
class ProcessFeed:
|
||||
def __init__(self, feed_id, fpf, options, raw_feed=None):
|
||||
self.feed_id = feed_id
|
||||
self.options = options
|
||||
self.fpf = fpf
|
||||
self.raw_feed = raw_feed
|
||||
|
||||
def refresh_feed(self):
|
||||
self.feed = Feed.get_by_id(self.feed_id)
|
||||
if self.feed_id != self.feed.pk:
|
||||
logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk))
|
||||
self.feed_id = self.feed.pk
|
||||
|
||||
def process(self):
|
||||
""" Downloads and parses a feed.
|
||||
"""
|
||||
start = time.time()
|
||||
self.refresh_feed()
|
||||
|
||||
ret_values = dict(new=0, updated=0, same=0, error=0)
|
||||
|
||||
if hasattr(self.fpf, 'status'):
|
||||
if self.options['verbose']:
|
||||
if self.fpf.bozo and self.fpf.status != 304:
|
||||
logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
|
||||
self.feed.log_title[:30],
|
||||
self.fpf.bozo_exception,
|
||||
len(self.fpf.entries)))
|
||||
|
||||
if self.fpf.status == 304:
|
||||
self.feed = self.feed.save()
|
||||
self.feed.save_feed_history(304, "Not modified")
|
||||
return FEED_SAME, ret_values
|
||||
|
||||
# 302 and 307: Temporary redirect: ignore
|
||||
# 301 and 308: Permanent redirect: save it (after 10 tries)
|
||||
if self.fpf.status == 301 or self.fpf.status == 308:
|
||||
if self.fpf.href.endswith('feedburner.com/atom.xml'):
|
||||
return FEED_ERRHTTP, ret_values
|
||||
redirects, non_redirects = self.feed.count_redirects_in_history('feed')
|
||||
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10-len(redirects)))
|
||||
if len(redirects) >= 10 or len(non_redirects) == 0:
|
||||
address = self.fpf.href
|
||||
if self.options['force'] and address:
|
||||
address = qurl(address, remove=['_'])
|
||||
self.feed.feed_address = address
|
||||
if not self.feed.known_good:
|
||||
self.feed.fetched_once = True
|
||||
logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.log_title[:30], self.fpf.status))
|
||||
self.feed = self.feed.schedule_feed_fetch_immediately()
|
||||
if not self.fpf.entries:
|
||||
self.feed = self.feed.save()
|
||||
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
|
||||
return FEED_ERRHTTP, ret_values
|
||||
if self.fpf.status >= 400:
|
||||
logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.log_title[:30], self.fpf.status))
|
||||
fixed_feed = None
|
||||
if not self.feed.known_good:
|
||||
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
|
||||
if not fixed_feed:
|
||||
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
|
||||
else:
|
||||
self.feed = feed
|
||||
self.feed = self.feed.save()
|
||||
return FEED_ERRHTTP, ret_values
|
||||
|
||||
if not self.fpf:
|
||||
logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.log_title[:30]))
|
||||
self.feed.save_feed_history(551, "Broken feed")
|
||||
return FEED_ERRHTTP, ret_values
|
||||
|
||||
if self.fpf and not self.fpf.entries:
|
||||
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
|
||||
logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries)))
|
||||
fixed_feed = None
|
||||
if not self.feed.known_good:
|
||||
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
|
||||
if not fixed_feed:
|
||||
self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
|
||||
else:
|
||||
self.feed = feed
|
||||
self.feed = self.feed.save()
|
||||
return FEED_ERRPARSE, ret_values
|
||||
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
|
||||
logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries)))
|
||||
fixed_feed = None
|
||||
if not self.feed.known_good:
|
||||
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
|
||||
if not fixed_feed:
|
||||
self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception)
|
||||
else:
|
||||
self.feed = feed
|
||||
self.feed = self.feed.save()
|
||||
return FEED_ERRPARSE, ret_values
|
||||
|
||||
# the feed has changed (or it is the first time we parse it)
|
||||
# saving the etag and last_modified fields
|
||||
original_etag = self.feed.etag
|
||||
self.feed.etag = self.fpf.get('etag')
|
||||
if self.feed.etag:
|
||||
self.feed.etag = self.feed.etag[:255]
|
||||
# some times this is None (it never should) *sigh*
|
||||
if self.feed.etag is None:
|
||||
self.feed.etag = ''
|
||||
if self.feed.etag != original_etag:
|
||||
self.feed.save(update_fields=['etag'])
|
||||
|
||||
original_last_modified = self.feed.last_modified
|
||||
if hasattr(self.fpf, 'modified') and self.fpf.modified:
|
||||
try:
|
||||
self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
|
||||
except Exception, e:
|
||||
self.feed.last_modified = None
|
||||
logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
|
||||
pass
|
||||
if self.feed.last_modified != original_last_modified:
|
||||
self.feed.save(update_fields=['last_modified'])
|
||||
|
||||
self.fpf.entries = self.fpf.entries[:100]
|
||||
|
||||
original_title = self.feed.feed_title
|
||||
if self.fpf.feed.get('title'):
|
||||
self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
|
||||
if self.feed.feed_title != original_title:
|
||||
self.feed.save(update_fields=['feed_title'])
|
||||
|
||||
tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
|
||||
if tagline:
|
||||
original_tagline = self.feed.data.feed_tagline
|
||||
self.feed.data.feed_tagline = smart_unicode(tagline)
|
||||
if self.feed.data.feed_tagline != original_tagline:
|
||||
self.feed.data.save(update_fields=['feed_tagline'])
|
||||
|
||||
if not self.feed.feed_link_locked:
|
||||
new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
|
||||
if self.options['force'] and new_feed_link:
|
||||
new_feed_link = qurl(new_feed_link, remove=['_'])
|
||||
if new_feed_link != self.feed.feed_link:
|
||||
logging.debug(" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.log_title[:30], self.feed.feed_link, new_feed_link))
|
||||
redirects, non_redirects = self.feed.count_redirects_in_history('page')
|
||||
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10-len(redirects)))
|
||||
if len(redirects) >= 10 or len(non_redirects) == 0:
|
||||
self.feed.feed_link = new_feed_link
|
||||
self.feed.save(update_fields=['feed_link'])
|
||||
|
||||
# Determine if stories aren't valid and replace broken guids
|
||||
guids_seen = set()
|
||||
permalinks_seen = set()
|
||||
for entry in self.fpf.entries:
|
||||
guids_seen.add(entry.get('guid'))
|
||||
permalinks_seen.add(Feed.get_permalink(entry))
|
||||
guid_difference = len(guids_seen) != len(self.fpf.entries)
|
||||
single_guid = len(guids_seen) == 1
|
||||
replace_guids = single_guid and guid_difference
|
||||
permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
|
||||
single_permalink = len(permalinks_seen) == 1
|
||||
replace_permalinks = single_permalink and permalink_difference
|
||||
|
||||
# Compare new stories to existing stories, adding and updating
|
||||
start_date = datetime.datetime.utcnow()
|
||||
story_hashes = []
|
||||
stories = []
|
||||
for entry in self.fpf.entries:
|
||||
story = pre_process_story(entry, self.fpf.encoding)
|
||||
if not story['title'] and not story['story_content']: continue
|
||||
if story.get('published') < start_date:
|
||||
start_date = story.get('published')
|
||||
if replace_guids:
|
||||
if replace_permalinks:
|
||||
new_story_guid = unicode(story.get('published'))
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
|
||||
self.feed.log_title[:30],
|
||||
story.get('guid'), new_story_guid))
|
||||
story['guid'] = new_story_guid
|
||||
else:
|
||||
new_story_guid = Feed.get_permalink(story)
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
|
||||
self.feed.log_title[:30],
|
||||
story.get('guid'), new_story_guid))
|
||||
story['guid'] = new_story_guid
|
||||
story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
|
||||
stories.append(story)
|
||||
story_hashes.append(story.get('story_hash'))
|
||||
|
||||
original_story_hash_count = len(story_hashes)
|
||||
story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[:original_story_hash_count]
|
||||
story_hashes.extend(story_hashes_in_unread_cutoff)
|
||||
story_hashes = list(set(story_hashes))
|
||||
if self.options['verbose'] or settings.DEBUG:
|
||||
logging.debug(u' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (
|
||||
self.feed.log_title[:30],
|
||||
original_story_hash_count, len(story_hashes)-original_story_hash_count,
|
||||
len(story_hashes_in_unread_cutoff)))
|
||||
|
||||
|
||||
existing_stories = dict((s.story_hash, s) for s in MStory.objects(
|
||||
story_hash__in=story_hashes,
|
||||
# story_date__gte=start_date,
|
||||
# story_feed_id=self.feed.pk
|
||||
))
|
||||
# if len(existing_stories) == 0:
|
||||
# existing_stories = dict((s.story_hash, s) for s in MStory.objects(
|
||||
# story_date__gte=start_date,
|
||||
# story_feed_id=self.feed.pk
|
||||
# ))
|
||||
|
||||
ret_values = self.feed.add_update_stories(stories, existing_stories,
|
||||
verbose=self.options['verbose'],
|
||||
updates_off=self.options['updates_off'])
|
||||
|
||||
# PubSubHubbub
|
||||
if (hasattr(self.fpf, 'feed') and
|
||||
hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
|
||||
hub_url = None
|
||||
self_url = self.feed.feed_address
|
||||
for link in self.fpf.feed.links:
|
||||
if link['rel'] == 'hub' and not hub_url:
|
||||
hub_url = link['href']
|
||||
elif link['rel'] == 'self':
|
||||
self_url = link['href']
|
||||
push_expired = False
|
||||
if self.feed.is_push:
|
||||
try:
|
||||
push_expired = self.feed.push.lease_expires < datetime.datetime.now()
|
||||
except PushSubscription.DoesNotExist:
|
||||
self.feed.is_push = False
|
||||
if (hub_url and self_url and not settings.DEBUG and
|
||||
self.feed.active_subscribers > 0 and
|
||||
(push_expired or not self.feed.is_push or self.options.get('force'))):
|
||||
logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
|
||||
self.feed.log_title[:30],
|
||||
"~SKRe-~SN" if push_expired else "", hub_url))
|
||||
try:
|
||||
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
|
||||
except TimeoutError:
|
||||
logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
|
||||
self.feed.log_title[:30], hub_url))
|
||||
elif (self.feed.is_push and
|
||||
(self.feed.active_subscribers <= 0 or not hub_url)):
|
||||
logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
|
||||
self.feed.log_title[:30]))
|
||||
self.feed.is_push = False
|
||||
self.feed = self.feed.save()
|
||||
|
||||
# Push notifications
|
||||
if ret_values['new'] > 0 and MUserFeedNotification.feed_has_users(self.feed.pk) > 0:
|
||||
QueueNotifications.delay(self.feed.pk, ret_values['new'])
|
||||
|
||||
# All Done
|
||||
logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
|
||||
self.feed.log_title[:30],
|
||||
'~FG~SB' if ret_values['new'] else '', ret_values['new'],
|
||||
'~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
|
||||
'~SB' if ret_values['same'] else '', ret_values['same'],
|
||||
'~FR~SB' if ret_values['error'] else '', ret_values['error'],
|
||||
len(self.fpf.entries)))
|
||||
self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force'])
|
||||
fetch_date = datetime.datetime.now()
|
||||
if ret_values['new']:
|
||||
if not getattr(settings, 'TEST_DEBUG', False):
|
||||
self.feed.trim_feed()
|
||||
self.feed.expire_redis()
|
||||
if MStatistics.get('raw_feed', None) == self.feed.pk:
|
||||
self.feed.save_raw_feed(self.raw_feed, fetch_date)
|
||||
self.feed.save_feed_history(200, "OK", date=fetch_date)
|
||||
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
|
||||
self.feed.log_title[:30], time.time() - start))
|
||||
|
||||
return FEED_OK, ret_values
|
||||
|
||||
|
||||
class Dispatcher:
|
||||
def __init__(self, options, num_threads):
|
||||
self.options = options
|
||||
self.feed_stats = {
|
||||
FEED_OK:0,
|
||||
FEED_SAME:0,
|
||||
FEED_ERRPARSE:0,
|
||||
FEED_ERRHTTP:0,
|
||||
FEED_ERREXC:0}
|
||||
self.feed_trans = {
|
||||
FEED_OK:'ok',
|
||||
FEED_SAME:'unchanged',
|
||||
FEED_ERRPARSE:'cant_parse',
|
||||
FEED_ERRHTTP:'http_error',
|
||||
FEED_ERREXC:'exception'}
|
||||
self.feed_keys = sorted(self.feed_trans.keys())
|
||||
self.num_threads = num_threads
|
||||
self.time_start = datetime.datetime.utcnow()
|
||||
self.workers = []
|
||||
|
||||
def refresh_feed(self, feed_id):
|
||||
"""Update feed, since it may have changed"""
|
||||
return Feed.get_by_id(feed_id)
|
||||
|
||||
def process_feed_wrapper(self, feed_queue):
|
||||
delta = None
|
||||
current_process = multiprocessing.current_process()
|
||||
identity = "X"
|
||||
feed = None
|
||||
|
||||
if current_process._identity:
|
||||
identity = current_process._identity[0]
|
||||
|
||||
for feed_id in feed_queue:
|
||||
start_duration = time.time()
|
||||
feed_fetch_duration = None
|
||||
feed_process_duration = None
|
||||
page_duration = None
|
||||
icon_duration = None
|
||||
feed_code = None
|
||||
ret_entries = None
|
||||
start_time = time.time()
|
||||
ret_feed = FEED_ERREXC
|
||||
try:
|
||||
feed = self.refresh_feed(feed_id)
|
||||
|
||||
skip = False
|
||||
if self.options.get('fake'):
|
||||
skip = True
|
||||
weight = "-"
|
||||
quick = "-"
|
||||
rand = "-"
|
||||
elif (self.options.get('quick') and not self.options['force'] and
|
||||
feed.known_good and feed.fetched_once and not feed.is_push):
|
||||
weight = feed.stories_last_month * feed.num_subscribers
|
||||
random_weight = random.randint(1, max(weight, 1))
|
||||
quick = float(self.options.get('quick', 0))
|
||||
rand = random.random()
|
||||
if random_weight < 1000 and rand < quick:
|
||||
skip = True
|
||||
elif False and feed.feed_address.startswith("http://news.google.com/news"):
|
||||
skip = True
|
||||
weight = "-"
|
||||
quick = "-"
|
||||
rand = "-"
|
||||
if skip:
|
||||
logging.debug(' ---> [%-30s] ~BGFaking fetch, skipping (%s/month, %s subs, %s < %s)...' % (
|
||||
feed.log_title[:30],
|
||||
weight,
|
||||
feed.num_subscribers,
|
||||
rand, quick))
|
||||
continue
|
||||
|
||||
ffeed = FetchFeed(feed_id, self.options)
|
||||
ret_feed, fetched_feed = ffeed.fetch()
|
||||
|
||||
feed_fetch_duration = time.time() - start_duration
|
||||
raw_feed = ffeed.raw_feed
|
||||
|
||||
if ((fetched_feed and ret_feed == FEED_OK) or self.options['force']):
|
||||
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
|
||||
ret_feed, ret_entries = pfeed.process()
|
||||
feed = pfeed.feed
|
||||
feed_process_duration = time.time() - start_duration
|
||||
|
||||
if (ret_entries and ret_entries['new']) or self.options['force']:
|
||||
start = time.time()
|
||||
if not feed.known_good or not feed.fetched_once:
|
||||
feed.known_good = True
|
||||
feed.fetched_once = True
|
||||
feed = feed.save()
|
||||
if self.options['force'] or random.random() <= 0.02:
|
||||
logging.debug(' ---> [%-30s] ~FBPerforming feed cleanup...' % (feed.log_title[:30],))
|
||||
start_cleanup = time.time()
|
||||
feed.sync_redis()
|
||||
logging.debug(' ---> [%-30s] ~FBDone with feed cleanup. Took ~SB%.4s~SN sec.' % (feed.log_title[:30], time.time() - start_cleanup))
|
||||
try:
|
||||
self.count_unreads_for_subscribers(feed)
|
||||
except TimeoutError:
|
||||
logging.debug(' ---> [%-30s] Unread count took too long...' % (feed.log_title[:30],))
|
||||
if self.options['verbose']:
|
||||
logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
|
||||
feed.log_title[:30], time.time() - start))
|
||||
except urllib2.HTTPError, e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read()))
|
||||
feed_code = e.code
|
||||
feed.save_feed_history(feed_code, e.msg, e.fp.read())
|
||||
fetched_feed = None
|
||||
except Feed.DoesNotExist, e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30]))
|
||||
continue
|
||||
except SoftTimeLimitExceeded, e:
|
||||
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
|
||||
ret_feed = FEED_ERREXC
|
||||
fetched_feed = None
|
||||
feed_code = 559
|
||||
feed.save_feed_history(feed_code, 'Timeout', e)
|
||||
except TimeoutError, e:
|
||||
logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30]))
|
||||
feed_code = 505
|
||||
feed.save_feed_history(feed_code, 'Timeout', e)
|
||||
fetched_feed = None
|
||||
except Exception, e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
ret_feed = FEED_ERREXC
|
||||
feed = Feed.get_by_id(getattr(feed, 'pk', feed_id))
|
||||
if not feed: continue
|
||||
feed.save_feed_history(500, "Error", tb)
|
||||
feed_code = 500
|
||||
fetched_feed = None
|
||||
# mail_feed_error_to_admin(feed, e, local_vars=locals())
|
||||
if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
|
||||
settings.RAVEN_CLIENT):
|
||||
settings.RAVEN_CLIENT.captureException()
|
||||
|
||||
if not feed_code:
|
||||
if ret_feed == FEED_OK:
|
||||
feed_code = 200
|
||||
elif ret_feed == FEED_SAME:
|
||||
feed_code = 304
|
||||
elif ret_feed == FEED_ERRHTTP:
|
||||
feed_code = 400
|
||||
if ret_feed == FEED_ERREXC:
|
||||
feed_code = 500
|
||||
elif ret_feed == FEED_ERRPARSE:
|
||||
feed_code = 550
|
||||
|
||||
if not feed: continue
|
||||
feed = self.refresh_feed(feed.pk)
|
||||
if not feed: continue
|
||||
|
||||
if ((self.options['force']) or
|
||||
(random.random() > .9) or
|
||||
(fetched_feed and
|
||||
feed.feed_link and
|
||||
feed.has_page and
|
||||
(ret_feed == FEED_OK or
|
||||
(ret_feed == FEED_SAME and feed.stories_last_month > 10)))):
|
||||
|
||||
logging.debug(u' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
|
||||
page_importer = PageImporter(feed)
|
||||
try:
|
||||
page_data = page_importer.fetch_page()
|
||||
page_duration = time.time() - start_duration
|
||||
except SoftTimeLimitExceeded, e:
|
||||
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
|
||||
page_data = None
|
||||
feed.save_feed_history(557, 'Timeout', e)
|
||||
except TimeoutError, e:
|
||||
logging.debug(' ---> [%-30s] ~FRPage fetch timed out...' % (feed.log_title[:30]))
|
||||
page_data = None
|
||||
feed.save_page_history(555, 'Timeout', '')
|
||||
except Exception, e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
feed.save_page_history(550, "Page Error", tb)
|
||||
fetched_feed = None
|
||||
page_data = None
|
||||
# mail_feed_error_to_admin(feed, e, local_vars=locals())
|
||||
if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
|
||||
settings.RAVEN_CLIENT):
|
||||
settings.RAVEN_CLIENT.captureException()
|
||||
|
||||
feed = self.refresh_feed(feed.pk)
|
||||
logging.debug(u' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
|
||||
force = self.options['force']
|
||||
if random.random() > .99:
|
||||
force = True
|
||||
icon_importer = IconImporter(feed, page_data=page_data, force=force)
|
||||
try:
|
||||
icon_importer.save()
|
||||
icon_duration = time.time() - start_duration
|
||||
except SoftTimeLimitExceeded, e:
|
||||
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
|
||||
feed.save_feed_history(558, 'Timeout', e)
|
||||
except TimeoutError, e:
|
||||
logging.debug(' ---> [%-30s] ~FRIcon fetch timed out...' % (feed.log_title[:30]))
|
||||
feed.save_page_history(556, 'Timeout', '')
|
||||
except Exception, e:
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
tb = traceback.format_exc()
|
||||
logging.error(tb)
|
||||
logging.debug('[%d] ! -------------------------' % (feed_id,))
|
||||
# feed.save_feed_history(560, "Icon Error", tb)
|
||||
# mail_feed_error_to_admin(feed, e, local_vars=locals())
|
||||
if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
|
||||
settings.RAVEN_CLIENT):
|
||||
settings.RAVEN_CLIENT.captureException()
|
||||
else:
|
||||
logging.debug(u' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]'))
|
||||
|
||||
feed = self.refresh_feed(feed.pk)
|
||||
delta = time.time() - start_time
|
||||
|
||||
feed.last_load_time = round(delta)
|
||||
feed.fetched_once = True
|
||||
try:
|
||||
feed = feed.save(update_fields=['last_load_time', 'fetched_once'])
|
||||
except IntegrityError:
|
||||
logging.debug(" ***> [%-30s] ~FRIntegrityError on feed: %s" % (feed.log_title[:30], feed.feed_address,))
|
||||
|
||||
if ret_entries and ret_entries['new']:
|
||||
self.publish_to_subscribers(feed, ret_entries['new'])
|
||||
|
||||
done_msg = (u'%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
|
||||
identity, feed.log_title[:30], delta,
|
||||
feed.pk, self.feed_trans[ret_feed],))
|
||||
logging.debug(done_msg)
|
||||
total_duration = time.time() - start_duration
|
||||
MAnalyticsFetcher.add(feed_id=feed.pk, feed_fetch=feed_fetch_duration,
|
||||
feed_process=feed_process_duration,
|
||||
page=page_duration, icon=icon_duration,
|
||||
total=total_duration, feed_code=feed_code)
|
||||
|
||||
self.feed_stats[ret_feed] += 1
|
||||
|
||||
if len(feed_queue) == 1:
|
||||
return feed
|
||||
|
||||
# time_taken = datetime.datetime.utcnow() - self.time_start
|
||||
|
||||
def publish_to_subscribers(self, feed, new_count):
|
||||
try:
|
||||
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
|
||||
listeners_count = r.publish(str(feed.pk), 'story:new_count:%s' % new_count)
|
||||
if listeners_count:
|
||||
logging.debug(" ---> [%-30s] ~FMPublished to %s subscribers" % (feed.log_title[:30], listeners_count))
|
||||
except redis.ConnectionError:
|
||||
logging.debug(" ***> [%-30s] ~BMRedis is unavailable for real-time." % (feed.log_title[:30],))
|
||||
|
||||
def count_unreads_for_subscribers(self, feed):
|
||||
user_subs = UserSubscription.objects.filter(feed=feed,
|
||||
active=True,
|
||||
user__profile__last_seen_on__gte=feed.unread_cutoff)\
|
||||
.order_by('-last_read_date')
|
||||
|
||||
if not user_subs.count():
|
||||
return
|
||||
|
||||
for sub in user_subs:
|
||||
if not sub.needs_unread_recalc:
|
||||
sub.needs_unread_recalc = True
|
||||
sub.save()
|
||||
|
||||
if self.options['compute_scores']:
|
||||
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
|
||||
stories = MStory.objects(story_feed_id=feed.pk,
|
||||
story_date__gte=feed.unread_cutoff)
|
||||
stories = Feed.format_stories(stories, feed.pk)
|
||||
story_hashes = r.zrangebyscore('zF:%s' % feed.pk, int(feed.unread_cutoff.strftime('%s')),
|
||||
int(time.time() + 60*60*24))
|
||||
missing_story_hashes = set(story_hashes) - set([s['story_hash'] for s in stories])
|
||||
if missing_story_hashes:
|
||||
missing_stories = MStory.objects(story_feed_id=feed.pk,
|
||||
story_hash__in=missing_story_hashes)\
|
||||
.read_preference(pymongo.ReadPreference.PRIMARY)
|
||||
missing_stories = Feed.format_stories(missing_stories, feed.pk)
|
||||
stories = missing_stories + stories
|
||||
logging.debug(u' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
|
||||
cache.set("S:%s" % feed.pk, stories, 60)
|
||||
logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
|
||||
feed.log_title[:30], len(stories), user_subs.count(),
|
||||
feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
|
||||
self.calculate_feed_scores_with_stories(user_subs, stories)
|
||||
elif self.options.get('mongodb_replication_lag'):
|
||||
logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
|
||||
feed.log_title[:30], self.options.get('mongodb_replication_lag')))
|
||||
|
||||
@timelimit(10)
|
||||
def calculate_feed_scores_with_stories(self, user_subs, stories):
|
||||
for sub in user_subs:
|
||||
silent = False if self.options['verbose'] >= 2 else True
|
||||
sub.calculate_feed_scores(silent=silent, stories=stories)
|
||||
|
||||
def add_jobs(self, feeds_queue, feeds_count=1):
|
||||
""" adds a feed processing job to the pool
|
||||
"""
|
||||
self.feeds_queue = feeds_queue
|
||||
self.feeds_count = feeds_count
|
||||
|
||||
def run_jobs(self):
|
||||
if self.options['single_threaded']:
|
||||
return self.process_feed_wrapper(self.feeds_queue[0])
|
||||
else:
|
||||
for i in range(self.num_threads):
|
||||
feed_queue = self.feeds_queue[i]
|
||||
self.workers.append(multiprocessing.Process(target=self.process_feed_wrapper,
|
||||
args=(feed_queue,)))
|
||||
for i in range(self.num_threads):
|
||||
self.workers[i].start()
|
|
@@ -3,8 +3,8 @@ import threading
 import sys
 import traceback
 import pprint
-import urllib
-import urlparse
+import urllib.request, urllib.parse, urllib.error
+import urllib.parse
 import random
 import warnings
 from django.core.mail import mail_admins
@@ -35,12 +35,12 @@ def timelimit(timeout):
 c = Dispatch()
 c.join(timeout)
 if c.isAlive():
-raise TimeoutError, 'took too long'
+raise TimeoutError('took too long')
 if c.error:
 tb = ''.join(traceback.format_exception(c.error[0], c.error[1], c.error[2]))
 logging.debug(tb)
 mail_admins('Error in timeout: %s' % c.error[0], tb)
-raise c.error[0], c.error[1], c.error[2]
+raise c.error[0](c.error[1]).with_traceback(c.error[2])
 return c.result
 return _2
 return _1
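The hunk above is the heart of the 2to3 conversion for this helper: the comma-style raise and the three-argument re-raise only exist in Python 2. A minimal sketch of the two idioms, using an invented wrapper error for illustration rather than anything from the NewsBlur code:

import sys

def fetch():
    # Python 2 spelled this: raise TimeoutError, 'took too long'
    # Python 3 instantiates the exception like any other call:
    raise TimeoutError('took too long')

try:
    fetch()
except TimeoutError as e:
    tb = sys.exc_info()[2]
    # Python 2 spelled this: raise type, value, traceback
    # Python 3 attaches the traceback explicitly:
    wrapped = RuntimeError('fetch failed: %s' % e).with_traceback(tb)
    print(type(wrapped).__name__, wrapped)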
@@ -67,7 +67,7 @@ def levenshtein_distance(first, second):
 distance_matrix[i][0] = i
 for j in range(second_length):
 distance_matrix[0][j]=j
-for i in xrange(1, first_length):
+for i in range(1, first_length):
 for j in range(1, second_length):
 deletion = distance_matrix[i-1][j] + 1
 insertion = distance_matrix[i][j-1] + 1
@@ -111,7 +111,7 @@ def _do_timesince(d, chunks, now=None):
 
 def relative_timesince(value):
 if not value:
-return u''
+return ''
 
 chunks = (
 (60 * 60 * 24, lambda n: ungettext('day', 'days', n)),
@@ -124,7 +124,7 @@ def relative_timesince(value):
 
 def relative_timeuntil(value):
 if not value:
-return u''
+return ''
 
 chunks = (
 (60 * 60, lambda n: ungettext('hour', 'hours', n)),
@@ -171,7 +171,7 @@ def format_relative_date(date, future=False):
 def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
 obj_identifier = obj
 if isinstance(obj, dict):
-obj_identifier = obj.keys()[0]
+obj_identifier = list(obj.keys())[0]
 
 if ((not in_folder or in_folder == " ") and
 not parent and
@@ -183,7 +183,7 @@ def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
 child_folder_names = []
 for item in folders:
 if isinstance(item, dict):
-child_folder_names.append(item.keys()[0])
+child_folder_names.append(list(item.keys())[0])
 if isinstance(obj, dict) and in_folder.lower() == parent.lower():
 if obj_identifier not in child_folder_names:
 folders.append(obj)
@@ -191,7 +191,7 @@ def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
 
 for k, v in enumerate(folders):
 if isinstance(v, dict):
-for f_k, f_v in v.items():
+for f_k, f_v in list(v.items()):
 if f_k.lower() == in_folder.lower() and obj_identifier not in f_v and not added:
 f_v.append(obj)
 added = True
@@ -216,7 +216,7 @@ def mail_feed_error_to_admin(feed, e, local_vars=None, subject=None):
 ## {{{ http://code.activestate.com/recipes/576611/ (r11)
 from operator import itemgetter
 from heapq import nlargest
-from itertools import repeat, ifilter
+from itertools import repeat
 
 class Counter(dict):
 '''Dict subclass for counting hashable objects. Sometimes called a bag
@@ -253,8 +253,8 @@ class Counter(dict):
 
 '''
 if n is None:
-return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
-return nlargest(n, self.iteritems(), key=itemgetter(1))
+return sorted(iter(list(self.items())), key=itemgetter(1), reverse=True)
+return nlargest(n, iter(list(self.items())), key=itemgetter(1))
 
 def elements(self):
 '''Iterator over elements repeating each as many times as its count.
@@ -267,7 +267,7 @@ class Counter(dict):
 elements() will ignore it.
 
 '''
-for elem, count in self.iteritems():
+for elem, count in list(self.items()):
 for _ in repeat(None, count):
 yield elem
 
@@ -295,7 +295,7 @@ class Counter(dict):
 if hasattr(iterable, 'iteritems'):
 if self:
 self_get = self.get
-for elem, count in iterable.iteritems():
+for elem, count in list(iterable.items()):
 self[elem] = self_get(elem, 0) + count
 else:
 dict.update(self, iterable) # fast path when counter is empty
@@ -393,7 +393,7 @@ class Counter(dict):
 result = Counter()
 if len(self) < len(other):
 self, other = other, self
-for elem in ifilter(self.__contains__, other):
+for elem in filter(self.__contains__, other):
 newcount = _min(self[elem], other[elem])
 if newcount > 0:
 result[elem] = newcount
@@ -402,9 +402,9 @@ class Counter(dict):
 
 if __name__ == '__main__':
 import doctest
-print doctest.testmod()
+print((doctest.testmod()))
 ## end of http://code.activestate.com/recipes/576611/ }}}
 
 def chunks(l, n):
-for i in xrange(0, len(l), n):
+for i in range(0, len(l), n):
 yield l[i:i+n]
410 utils/feed_functions.py.bak Normal file
@@ -0,0 +1,410 @@
|
|||
import datetime
|
||||
import threading
|
||||
import sys
|
||||
import traceback
|
||||
import pprint
|
||||
import urllib.request, urllib.parse, urllib.error
|
||||
import urllib.parse
|
||||
import random
|
||||
import warnings
|
||||
from django.core.mail import mail_admins
|
||||
from django.utils.translation import ungettext
|
||||
from django.utils.encoding import smart_unicode
|
||||
from utils import log as logging
|
||||
|
||||
|
||||
class TimeoutError(Exception): pass
|
||||
def timelimit(timeout):
|
||||
"""borrowed from web.py"""
|
||||
def _1(function):
|
||||
def _2(*args, **kw):
|
||||
class Dispatch(threading.Thread):
|
||||
def __init__(self):
|
||||
threading.Thread.__init__(self)
|
||||
self.result = None
|
||||
self.error = None
|
||||
|
||||
self.setDaemon(True)
|
||||
self.start()
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
self.result = function(*args, **kw)
|
||||
except:
|
||||
self.error = sys.exc_info()
|
||||
c = Dispatch()
|
||||
c.join(timeout)
|
||||
if c.isAlive():
|
||||
raise TimeoutError('took too long')
|
||||
if c.error:
|
||||
tb = ''.join(traceback.format_exception(c.error[0], c.error[1], c.error[2]))
|
||||
logging.debug(tb)
|
||||
mail_admins('Error in timeout: %s' % c.error[0], tb)
|
||||
raise c.error[0](c.error[1]).with_traceback(c.error[2])
|
||||
return c.result
|
||||
return _2
|
||||
return _1
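A usage sketch for the decorator above (illustrative only: the function name and the 2-second limit are invented, and it assumes the module's Django imports resolve in your environment):

from utils.feed_functions import timelimit, TimeoutError

@timelimit(2)
def slow_fetch():
    import time
    time.sleep(10)          # simulate a hung network call
    return 'finished'

try:
    result = slow_fetch()
except TimeoutError:
    # The worker thread is abandoned after 2 seconds and this is raised instead.
    result = None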
|
||||
|
||||
|
||||
def utf8encode(tstr):
|
||||
""" Encodes a unicode string in utf-8
|
||||
"""
|
||||
msg = "utf8encode is deprecated. Use django.utils.encoding.smart_unicode instead."
|
||||
warnings.warn(msg, DeprecationWarning)
|
||||
return smart_unicode(tstr)
|
||||
|
||||
# From: http://www.poromenos.org/node/87
|
||||
def levenshtein_distance(first, second):
|
||||
"""Find the Levenshtein distance between two strings."""
|
||||
if len(first) > len(second):
|
||||
first, second = second, first
|
||||
if len(second) == 0:
|
||||
return len(first)
|
||||
first_length = len(first) + 1
|
||||
second_length = len(second) + 1
|
||||
distance_matrix = [[0] * second_length for x in range(first_length)]
|
||||
for i in range(first_length):
|
||||
distance_matrix[i][0] = i
|
||||
for j in range(second_length):
|
||||
distance_matrix[0][j]=j
|
||||
for i in range(1, first_length):
|
||||
for j in range(1, second_length):
|
||||
deletion = distance_matrix[i-1][j] + 1
|
||||
insertion = distance_matrix[i][j-1] + 1
|
||||
substitution = distance_matrix[i-1][j-1]
|
||||
if first[i-1] != second[j-1]:
|
||||
substitution += 1
|
||||
distance_matrix[i][j] = min(insertion, deletion, substitution)
|
||||
return distance_matrix[first_length-1][second_length-1]
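A quick, hand-checked sanity test of the distance function above (assuming the module imports cleanly as utils.feed_functions):

from utils.feed_functions import levenshtein_distance

assert levenshtein_distance('kitten', 'sitting') == 3  # two substitutions + one insertion
assert levenshtein_distance('', 'abc') == 3            # three insertions
assert levenshtein_distance('same', 'same') == 0       # identical strings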
|
||||
|
||||
def _do_timesince(d, chunks, now=None):
|
||||
"""
|
||||
Started as a copy of django.util.timesince.timesince, but modified to
|
||||
only output one time unit, and use months as the maximum unit of measure.
|
||||
|
||||
Takes two datetime objects and returns the time between d and now
|
||||
as a nicely formatted string, e.g. "10 minutes". If d occurs after now,
|
||||
then "0 minutes" is returned.
|
||||
|
||||
Units used are months, weeks, days, hours, and minutes.
|
||||
Seconds and microseconds are ignored.
|
||||
"""
|
||||
# Convert datetime.date to datetime.datetime for comparison
|
||||
if d.__class__ is not datetime.datetime:
|
||||
d = datetime.datetime(d.year, d.month, d.day)
|
||||
|
||||
if not now:
|
||||
now = datetime.datetime.utcnow()
|
||||
|
||||
# ignore microsecond part of 'd' since we removed it from 'now'
|
||||
delta = now - (d - datetime.timedelta(0, 0, d.microsecond))
|
||||
since = delta.days * 24 * 60 * 60 + delta.seconds
|
||||
if since > 10:
|
||||
for i, (seconds, name) in enumerate(chunks):
|
||||
count = since // seconds
|
||||
if count != 0:
|
||||
break
|
||||
s = '%(number)d %(type)s' % {'number': count, 'type': name(count)}
|
||||
else:
|
||||
s = 'just a second'
|
||||
return s
|
||||
|
||||
def relative_timesince(value):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
chunks = (
|
||||
(60 * 60 * 24, lambda n: ungettext('day', 'days', n)),
|
||||
(60 * 60, lambda n: ungettext('hour', 'hours', n)),
|
||||
(60, lambda n: ungettext('minute', 'minutes', n)),
|
||||
(1, lambda n: ungettext('second', 'seconds', n)),
|
||||
(0, lambda n: 'just now'),
|
||||
)
|
||||
return _do_timesince(value, chunks)
|
||||
|
||||
def relative_timeuntil(value):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
chunks = (
|
||||
(60 * 60, lambda n: ungettext('hour', 'hours', n)),
|
||||
(60, lambda n: ungettext('minute', 'minutes', n))
|
||||
)
|
||||
|
||||
now = datetime.datetime.utcnow()
|
||||
|
||||
return _do_timesince(now, chunks, value)
|
||||
|
||||
def seconds_timesince(value):
|
||||
if not value:
|
||||
return 0
|
||||
now = datetime.datetime.utcnow()
|
||||
delta = now - value
|
||||
|
||||
return delta.days * 24 * 60 * 60 + delta.seconds
|
||||
|
||||
def format_relative_date(date, future=False):
|
||||
if not date or date < datetime.datetime(2010, 1, 1):
|
||||
return "Soon"
|
||||
|
||||
now = datetime.datetime.utcnow()
|
||||
diff = abs(now - date)
|
||||
if diff < datetime.timedelta(minutes=60):
|
||||
minutes = diff.seconds / 60
|
||||
return "%s minute%s %s" % (minutes,
|
||||
'' if minutes == 1 else 's',
|
||||
'' if future else 'ago')
|
||||
elif datetime.timedelta(minutes=60) <= diff < datetime.timedelta(minutes=90):
|
||||
return "1 hour %s" % ('' if future else 'ago')
|
||||
elif diff < datetime.timedelta(hours=24):
|
||||
dec = (diff.seconds / 60 + 15) % 60
|
||||
if dec >= 30:
|
||||
return "%s.5 hours %s" % ((((diff.seconds / 60) + 15) / 60),
|
||||
'' if future else 'ago')
|
||||
else:
|
||||
return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60),
|
||||
'' if future else 'ago')
|
||||
else:
|
||||
days = ((diff.seconds / 60) / 60 / 24)
|
||||
return "%s day%s %s" % (days, '' if days == 1 else 's', '' if future else 'ago')
|
||||
|
||||
def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
|
||||
obj_identifier = obj
|
||||
if isinstance(obj, dict):
|
||||
obj_identifier = list(obj.keys())[0]
|
||||
|
||||
if ((not in_folder or in_folder == " ") and
|
||||
not parent and
|
||||
not isinstance(obj, dict) and
|
||||
obj_identifier not in folders):
|
||||
folders.append(obj)
|
||||
return folders
|
||||
|
||||
child_folder_names = []
|
||||
for item in folders:
|
||||
if isinstance(item, dict):
|
||||
child_folder_names.append(list(item.keys())[0])
|
||||
if isinstance(obj, dict) and in_folder.lower() == parent.lower():
|
||||
if obj_identifier not in child_folder_names:
|
||||
folders.append(obj)
|
||||
return folders
|
||||
|
||||
for k, v in enumerate(folders):
|
||||
if isinstance(v, dict):
|
||||
for f_k, f_v in list(v.items()):
|
||||
if f_k.lower() == in_folder.lower() and obj_identifier not in f_v and not added:
|
||||
f_v.append(obj)
|
||||
added = True
|
||||
folders[k][f_k] = add_object_to_folder(obj, in_folder, f_v, f_k, added)
|
||||
|
||||
return folders
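A small illustration of how the folder helper above behaves; the feed ids and the "News" folder name are made up, and it assumes the module imports cleanly:

from utils.feed_functions import add_object_to_folder

folders = [1, 2, {'News': [3, 4]}]
folders = add_object_to_folder(5, '', folders)       # top-level feed
folders = add_object_to_folder(6, 'News', folders)   # nested inside the "News" folder
assert folders == [1, 2, {'News': [3, 4, 6]}, 5]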
|
||||
|
||||
def mail_feed_error_to_admin(feed, e, local_vars=None, subject=None):
|
||||
# Mail the admins with the error
|
||||
if not subject:
|
||||
subject = "Feed update error"
|
||||
exc_info = sys.exc_info()
|
||||
subject = '%s: %s' % (subject, repr(e))
|
||||
message = 'Traceback:\n%s\n\nFeed:\n%s\nLocals:\n%s' % (
|
||||
'\n'.join(traceback.format_exception(*exc_info)),
|
||||
pprint.pformat(feed.__dict__),
|
||||
pprint.pformat(local_vars)
|
||||
)
|
||||
# print message
|
||||
mail_admins(subject, message)
|
||||
|
||||
## {{{ http://code.activestate.com/recipes/576611/ (r11)
|
||||
from operator import itemgetter
|
||||
from heapq import nlargest
|
||||
from itertools import repeat
|
||||
|
||||
class Counter(dict):
|
||||
'''Dict subclass for counting hashable objects. Sometimes called a bag
|
||||
or multiset. Elements are stored as dictionary keys and their counts
|
||||
are stored as dictionary values.
|
||||
|
||||
>>> Counter('zyzygy')
|
||||
Counter({'y': 3, 'z': 2, 'g': 1})
|
||||
|
||||
'''
|
||||
|
||||
def __init__(self, iterable=None, **kwds):
|
||||
'''Create a new, empty Counter object. And if given, count elements
|
||||
from an input iterable. Or, initialize the count from another mapping
|
||||
of elements to their counts.
|
||||
|
||||
>>> c = Counter() # a new, empty counter
|
||||
>>> c = Counter('gallahad') # a new counter from an iterable
|
||||
>>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
|
||||
>>> c = Counter(a=4, b=2) # a new counter from keyword args
|
||||
|
||||
'''
|
||||
self.update(iterable, **kwds)
|
||||
|
||||
def __missing__(self, key):
|
||||
return 0
|
||||
|
||||
def most_common(self, n=None):
|
||||
'''List the n most common elements and their counts from the most
|
||||
common to the least. If n is None, then list all element counts.
|
||||
|
||||
>>> Counter('abracadabra').most_common(3)
|
||||
[('a', 5), ('r', 2), ('b', 2)]
|
||||
|
||||
'''
|
||||
if n is None:
|
||||
return sorted(iter(self.items()), key=itemgetter(1), reverse=True)
|
||||
return nlargest(n, iter(self.items()), key=itemgetter(1))
|
||||
|
||||
def elements(self):
|
||||
'''Iterator over elements repeating each as many times as its count.
|
||||
|
||||
>>> c = Counter('ABCABC')
|
||||
>>> sorted(c.elements())
|
||||
['A', 'A', 'B', 'B', 'C', 'C']
|
||||
|
||||
If an element's count has been set to zero or is a negative number,
|
||||
elements() will ignore it.
|
||||
|
||||
'''
|
||||
for elem, count in self.items():
|
||||
for _ in repeat(None, count):
|
||||
yield elem
|
||||
|
||||
# Override dict methods where the meaning changes for Counter objects.
|
||||
|
||||
@classmethod
|
||||
def fromkeys(cls, iterable, v=None):
|
||||
raise NotImplementedError(
|
||||
'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
|
||||
|
||||
def update(self, iterable=None, **kwds):
|
||||
'''Like dict.update() but add counts instead of replacing them.
|
||||
|
||||
Source can be an iterable, a dictionary, or another Counter instance.
|
||||
|
||||
>>> c = Counter('which')
|
||||
>>> c.update('witch') # add elements from another iterable
|
||||
>>> d = Counter('watch')
|
||||
>>> c.update(d) # add elements from another counter
|
||||
>>> c['h'] # four 'h' in which, witch, and watch
|
||||
4
|
||||
|
||||
'''
|
||||
if iterable is not None:
|
||||
if hasattr(iterable, 'iteritems'):
|
||||
if self:
|
||||
self_get = self.get
|
||||
for elem, count in iterable.items():
|
||||
self[elem] = self_get(elem, 0) + count
|
||||
else:
|
||||
dict.update(self, iterable) # fast path when counter is empty
|
||||
else:
|
||||
self_get = self.get
|
||||
for elem in iterable:
|
||||
self[elem] = self_get(elem, 0) + 1
|
||||
if kwds:
|
||||
self.update(kwds)
|
||||
|
||||
def copy(self):
|
||||
'Like dict.copy() but returns a Counter instance instead of a dict.'
|
||||
return Counter(self)
|
||||
|
||||
def __delitem__(self, elem):
|
||||
'Like dict.__delitem__() but does not raise KeyError for missing values.'
|
||||
if elem in self:
|
||||
dict.__delitem__(self, elem)
|
||||
|
||||
def __repr__(self):
|
||||
if not self:
|
||||
return '%s()' % self.__class__.__name__
|
||||
items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
|
||||
return '%s({%s})' % (self.__class__.__name__, items)
|
||||
|
||||
# Multiset-style mathematical operations discussed in:
|
||||
# Knuth TAOCP Volume II section 4.6.3 exercise 19
|
||||
# and at http://en.wikipedia.org/wiki/Multiset
|
||||
#
|
||||
# Outputs guaranteed to only include positive counts.
|
||||
#
|
||||
# To strip negative and zero counts, add-in an empty counter:
|
||||
# c += Counter()
|
||||
|
||||
def __add__(self, other):
|
||||
'''Add counts from two counters.
|
||||
|
||||
>>> Counter('abbb') + Counter('bcc')
|
||||
Counter({'b': 4, 'c': 2, 'a': 1})
|
||||
|
||||
|
||||
'''
|
||||
if not isinstance(other, Counter):
|
||||
return NotImplemented
|
||||
result = Counter()
|
||||
for elem in set(self) | set(other):
|
||||
newcount = self[elem] + other[elem]
|
||||
if newcount > 0:
|
||||
result[elem] = newcount
|
||||
return result
|
||||
|
||||
def __sub__(self, other):
|
||||
''' Subtract count, but keep only results with positive counts.
|
||||
|
||||
>>> Counter('abbbc') - Counter('bccd')
|
||||
Counter({'b': 2, 'a': 1})
|
||||
|
||||
'''
|
||||
if not isinstance(other, Counter):
|
||||
return NotImplemented
|
||||
result = Counter()
|
||||
for elem in set(self) | set(other):
|
||||
newcount = self[elem] - other[elem]
|
||||
if newcount > 0:
|
||||
result[elem] = newcount
|
||||
return result
|
||||
|
||||
def __or__(self, other):
|
||||
'''Union is the maximum of value in either of the input counters.
|
||||
|
||||
>>> Counter('abbb') | Counter('bcc')
|
||||
Counter({'b': 3, 'c': 2, 'a': 1})
|
||||
|
||||
'''
|
||||
if not isinstance(other, Counter):
|
||||
return NotImplemented
|
||||
_max = max
|
||||
result = Counter()
|
||||
for elem in set(self) | set(other):
|
||||
newcount = _max(self[elem], other[elem])
|
||||
if newcount > 0:
|
||||
result[elem] = newcount
|
||||
return result
|
||||
|
||||
def __and__(self, other):
|
||||
''' Intersection is the minimum of corresponding counts.
|
||||
|
||||
>>> Counter('abbb') & Counter('bcc')
|
||||
Counter({'b': 1})
|
||||
|
||||
'''
|
||||
if not isinstance(other, Counter):
|
||||
return NotImplemented
|
||||
_min = min
|
||||
result = Counter()
|
||||
if len(self) < len(other):
|
||||
self, other = other, self
|
||||
for elem in filter(self.__contains__, other):
|
||||
newcount = _min(self[elem], other[elem])
|
||||
if newcount > 0:
|
||||
result[elem] = newcount
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import doctest
|
||||
print(doctest.testmod())
|
||||
## end of http://code.activestate.com/recipes/576611/ }}}
|
||||
|
||||
def chunks(l, n):
|
||||
for i in range(0, len(l), n):
|
||||
yield l[i:i+n]
|
|
@@ -46,16 +46,16 @@ Also Jason Diamond, Brian Lalor for bug reporting and patches"""
|
|||
|
||||
_debug = 0
|
||||
|
||||
import sgmllib, urllib, urlparse, re, sys, robotparser
|
||||
import sgmllib, urllib.request, urllib.parse, urllib.error, urllib.parse, re, sys, urllib.robotparser
|
||||
import requests
|
||||
from StringIO import StringIO
|
||||
from io import StringIO
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
|
||||
# Python 2.3 now comes with this module by default, otherwise you can download it
|
||||
try:
|
||||
import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
|
||||
import xmlrpc.client # http://www.pythonware.com/products/xmlrpc/
|
||||
except ImportError:
|
||||
xmlrpclib = None
|
||||
|
||||
|
@@ -67,28 +67,28 @@ if not dict:
|
|||
return rc
|
||||
|
||||
def _debuglog(message):
|
||||
if _debug: print message
|
||||
if _debug: print(message)
|
||||
|
||||
class URLGatekeeper:
|
||||
"""a class to track robots.txt rules across multiple servers"""
|
||||
def __init__(self):
|
||||
self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
|
||||
self.urlopener = urllib.FancyURLopener()
|
||||
self.urlopener = urllib.request.FancyURLopener()
|
||||
self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)"
|
||||
_debuglog(self.urlopener.version)
|
||||
self.urlopener.addheaders = [('User-Agent', self.urlopener.version)]
|
||||
# self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')]
|
||||
robotparser.URLopener.version = self.urlopener.version
|
||||
robotparser.URLopener.addheaders = self.urlopener.addheaders
|
||||
urllib.robotparser.URLopener.version = self.urlopener.version
|
||||
urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders
|
||||
|
||||
def _getrp(self, url):
|
||||
protocol, domain = urlparse.urlparse(url)[:2]
|
||||
if self.rpcache.has_key(domain):
|
||||
protocol, domain = urllib.parse.urlparse(url)[:2]
|
||||
if domain in self.rpcache:
|
||||
return self.rpcache[domain]
|
||||
baseurl = '%s://%s' % (protocol, domain)
|
||||
robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
|
||||
robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt')
|
||||
_debuglog('fetching %s' % robotsurl)
|
||||
rp = robotparser.RobotFileParser(robotsurl)
|
||||
rp = urllib.robotparser.RobotFileParser(robotsurl)
|
||||
try:
|
||||
rp.read()
|
||||
except:
|
||||
|
@@ -119,7 +119,7 @@ class BaseParser(sgmllib.SGMLParser):
|
|||
|
||||
def normalize_attrs(self, attrs):
|
||||
def cleanattr(v):
|
||||
v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
|
||||
v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v)
|
||||
if not v: return
|
||||
v = v.strip()
|
||||
v = v.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"').replace('&', '&')
|
||||
|
@@ -130,7 +130,7 @@ class BaseParser(sgmllib.SGMLParser):
|
|||
|
||||
def do_base(self, attrs):
|
||||
attrsD = dict(self.normalize_attrs(attrs))
|
||||
if not attrsD.has_key('href'): return
|
||||
if 'href' not in attrsD: return
|
||||
self.baseuri = attrsD['href']
|
||||
|
||||
def error(self, *a, **kw): pass # we're not picky
|
||||
|
@@ -143,18 +143,18 @@ class LinkParser(BaseParser):
|
|||
'application/x-atom+xml')
|
||||
def do_link(self, attrs):
|
||||
attrsD = dict(self.normalize_attrs(attrs))
|
||||
if not attrsD.has_key('rel'): return
|
||||
if 'rel' not in attrsD: return
|
||||
rels = attrsD['rel'].split()
|
||||
if 'alternate' not in rels: return
|
||||
if attrsD.get('type') not in self.FEED_TYPES: return
|
||||
if not attrsD.has_key('href'): return
|
||||
self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
|
||||
if 'href' not in attrsD: return
|
||||
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
|
||||
|
||||
class ALinkParser(BaseParser):
|
||||
def start_a(self, attrs):
|
||||
attrsD = dict(self.normalize_attrs(attrs))
|
||||
if not attrsD.has_key('href'): return
|
||||
self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
|
||||
if 'href' not in attrsD: return
|
||||
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
|
||||
|
||||
def makeFullURI(uri):
|
||||
if not uri: return
|
||||
|
@@ -218,7 +218,7 @@ def couldBeFeedData(data):
|
|||
|
||||
def isFeed(uri):
|
||||
_debuglog('seeing if %s is a feed' % uri)
|
||||
protocol = urlparse.urlparse(uri)
|
||||
protocol = urllib.parse.urlparse(uri)
|
||||
if protocol[0] not in ('http', 'https'): return 0
|
||||
try:
|
||||
data = _gatekeeper.get(uri, check=False)
|
||||
|
@@ -233,7 +233,7 @@ def sortFeeds(feed1Info, feed2Info):
|
|||
def getFeedsFromSyndic8(uri):
|
||||
feeds = []
|
||||
try:
|
||||
server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
|
||||
server = xmlrpc.client.Server('http://www.syndic8.com/xmlrpc.php')
|
||||
feedids = server.syndic8.FindFeeds(uri)
|
||||
infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
|
||||
infolist.sort(sortFeeds)
|
||||
|
@@ -270,7 +270,7 @@ def feeds(uri, all=False, querySyndic8=False, _recurs=None):
|
|||
except:
|
||||
outfeeds = []
|
||||
_debuglog('found %s feeds through LINK tags' % len(outfeeds))
|
||||
outfeeds = filter(isFeed, outfeeds)
|
||||
outfeeds = list(filter(isFeed, outfeeds))
|
||||
if all or not outfeeds:
|
||||
# no LINK tags, look for regular <A> links that point to feeds
|
||||
_debuglog('no LINK tags, looking at A tags')
|
||||
|
@@ -281,16 +281,16 @@ def feeds(uri, all=False, querySyndic8=False, _recurs=None):
|
|||
_debuglog('no LINK tags, looking at local links')
|
||||
locallinks = getLocalLinks(links, fulluri)
|
||||
# look for obvious feed links on the same server
|
||||
outfeeds.extend(filter(isFeed, filter(isFeedLink, locallinks)))
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks)))))
|
||||
if all or not outfeeds:
|
||||
# look harder for feed links on the same server
|
||||
outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, locallinks)))
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks)))))
|
||||
if all or not outfeeds:
|
||||
# look for obvious feed links on another server
|
||||
outfeeds.extend(filter(isFeed, filter(isFeedLink, links)))
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links)))))
|
||||
if all or not outfeeds:
|
||||
# look harder for feed links on another server
|
||||
outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, links)))
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links)))))
|
||||
if all or not outfeeds:
|
||||
_debuglog('no A tags, guessing')
|
||||
suffixes = [ # filenames used by popular software:
|
||||
|
@@ -302,12 +302,12 @@ def feeds(uri, all=False, querySyndic8=False, _recurs=None):
|
|||
'index.xml', # MT
|
||||
'index.rss' # Slash
|
||||
]
|
||||
outfeeds.extend(filter(isFeed, [urlparse.urljoin(fulluri, x) for x in suffixes]))
|
||||
outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])))
|
||||
if (all or not outfeeds) and querySyndic8:
|
||||
# still no luck, search Syndic8 for feeds (requires xmlrpclib)
|
||||
_debuglog('still no luck, searching Syndic8')
|
||||
outfeeds.extend(getFeedsFromSyndic8(uri))
|
||||
if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
|
||||
if hasattr(__builtins__, 'set') or 'set' in __builtins__:
|
||||
outfeeds = list(set(outfeeds))
|
||||
return outfeeds
|
||||
|
||||
|
@@ -317,7 +317,7 @@ def feed(uri):
|
|||
#todo: give preference to certain feed formats
|
||||
feedlist = feeds(uri)
|
||||
if feedlist:
|
||||
feeds_no_comments = filter(lambda f: 'comments' not in f.lower(), feedlist)
|
||||
feeds_no_comments = [f for f in feedlist if 'comments' not in f.lower()]
|
||||
if feeds_no_comments:
|
||||
return feeds_no_comments[0]
|
||||
return feedlist[0]
|
||||
|
@@ -338,25 +338,25 @@ def test():
|
|||
count += 1
|
||||
links = getLinks(data, uri)
|
||||
if not links:
|
||||
print '\n*** FAILED ***', uri, 'could not find link'
|
||||
print(('\n*** FAILED ***', uri, 'could not find link'))
|
||||
failed.append(uri)
|
||||
elif len(links) > 1:
|
||||
print '\n*** FAILED ***', uri, 'found too many links'
|
||||
print(('\n*** FAILED ***', uri, 'found too many links'))
|
||||
failed.append(uri)
|
||||
else:
|
||||
atomdata = urllib.urlopen(links[0]).read()
|
||||
atomdata = urllib.request.urlopen(links[0]).read()
|
||||
if atomdata.find('<link rel="alternate"') == -1:
|
||||
print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
|
||||
print(('\n*** FAILED ***', uri, 'retrieved something that is not a feed'))
|
||||
failed.append(uri)
|
||||
else:
|
||||
backlink = atomdata.split('href="').pop().split('"')[0]
|
||||
if backlink != uri:
|
||||
print '\n*** FAILED ***', uri, 'retrieved wrong feed'
|
||||
print(('\n*** FAILED ***', uri, 'retrieved wrong feed'))
|
||||
failed.append(uri)
|
||||
if data.find('<link rel="next" href="') == -1: break
|
||||
uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
|
||||
print
|
||||
print count, 'tests executed,', len(failed), 'failed'
|
||||
uri = urllib.parse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
|
||||
print()
|
||||
print((count, 'tests executed,', len(failed), 'failed'))
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = sys.argv[1:]
|
||||
|
@ -370,4 +370,4 @@ if __name__ == '__main__':
|
|||
if uri == 'test':
|
||||
test()
|
||||
else:
|
||||
print "\n".join(getFeeds(uri))
|
||||
print(("\n".join(getFeeds(uri))))
|
||||
|
373 utils/feedfinder.py.bak Normal file
@@ -0,0 +1,373 @@
|
|||
"""feedfinder: Find the Web feed for a Web page
|
||||
http://www.aaronsw.com/2002/feedfinder/
|
||||
|
||||
Usage:
|
||||
feed(uri) - returns feed found for a URI
|
||||
feeds(uri) - returns all feeds found for a URI
|
||||
|
||||
>>> import feedfinder
|
||||
>>> feedfinder.feed('scripting.com')
|
||||
'http://scripting.com/rss.xml'
|
||||
>>>
|
||||
>>> feedfinder.feeds('scripting.com')
|
||||
['http://delong.typepad.com/sdj/atom.xml',
|
||||
'http://delong.typepad.com/sdj/index.rdf',
|
||||
'http://delong.typepad.com/sdj/rss.xml']
|
||||
>>>
|
||||
|
||||
Can also use from the command line. Feeds are returned one per line:
|
||||
|
||||
$ python feedfinder.py diveintomark.org
|
||||
http://diveintomark.org/xml/atom.xml
|
||||
|
||||
How it works:
|
||||
0. At every step, feeds are minimally verified to make sure they are really feeds.
|
||||
1. If the URI points to a feed, it is simply returned; otherwise
|
||||
the page is downloaded and the real fun begins.
|
||||
2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
|
||||
3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
|
||||
".atom"
|
||||
4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
|
||||
5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
|
||||
".atom"
|
||||
6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
|
||||
7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
|
||||
8. As a last ditch effort, we search Syndic8 for feeds matching the URI
|
||||
"""
|
||||
|
||||
__version__ = "1.371"
|
||||
__date__ = "2006-04-24"
|
||||
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
|
||||
__author__ = "Mark Pilgrim (http://diveintomark.org)"
|
||||
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
|
||||
__license__ = "Python"
|
||||
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
|
||||
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
|
||||
|
||||
_debug = 0
|
||||
|
||||
import sgmllib, urllib.request, urllib.parse, urllib.error, urllib.parse, re, sys, urllib.robotparser
|
||||
import requests
|
||||
from io import StringIO
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
|
||||
# Python 2.3 now comes with this module by default, otherwise you can download it
|
||||
try:
|
||||
import xmlrpc.client # http://www.pythonware.com/products/xmlrpc/
|
||||
except ImportError:
|
||||
xmlrpclib = None
|
||||
|
||||
if not dict:
|
||||
def dict(aList):
|
||||
rc = {}
|
||||
for k, v in aList:
|
||||
rc[k] = v
|
||||
return rc
|
||||
|
||||
def _debuglog(message):
|
||||
if _debug: print(message)
|
||||
|
||||
class URLGatekeeper:
|
||||
"""a class to track robots.txt rules across multiple servers"""
|
||||
def __init__(self):
|
||||
self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
|
||||
self.urlopener = urllib.request.FancyURLopener()
|
||||
self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)"
|
||||
_debuglog(self.urlopener.version)
|
||||
self.urlopener.addheaders = [('User-Agent', self.urlopener.version)]
|
||||
# self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')]
|
||||
urllib.robotparser.URLopener.version = self.urlopener.version
|
||||
urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders
|
||||
|
||||
def _getrp(self, url):
|
||||
protocol, domain = urllib.parse.urlparse(url)[:2]
|
||||
if domain in self.rpcache:
|
||||
return self.rpcache[domain]
|
||||
baseurl = '%s://%s' % (protocol, domain)
|
||||
robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt')
|
||||
_debuglog('fetching %s' % robotsurl)
|
||||
rp = urllib.robotparser.RobotFileParser(robotsurl)
|
||||
try:
|
||||
rp.read()
|
||||
except:
|
||||
pass
|
||||
self.rpcache[domain] = rp
|
||||
return rp
|
||||
|
||||
def can_fetch(self, url):
|
||||
rp = self._getrp(url)
|
||||
allow = rp.can_fetch(self.urlopener.version, url)
|
||||
_debuglog("gatekeeper of %s says %s" % (url, allow))
|
||||
return allow
|
||||
|
||||
def get(self, url, check=False):
|
||||
if check and not self.can_fetch(url): return ''
|
||||
try:
|
||||
return requests.get(url, headers=dict(self.urlopener.addheaders)).content
|
||||
except:
|
||||
return ''
|
||||
|
||||
_gatekeeper = URLGatekeeper()
|
||||
|
||||
class BaseParser(sgmllib.SGMLParser):
|
||||
def __init__(self, baseuri):
|
||||
sgmllib.SGMLParser.__init__(self)
|
||||
self.links = []
|
||||
self.baseuri = baseuri
|
||||
|
||||
def normalize_attrs(self, attrs):
|
||||
def cleanattr(v):
|
||||
v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v)
|
||||
if not v: return
|
||||
v = v.strip()
|
||||
v = v.replace('<', '<').replace('>', '>').replace(''', "'").replace('"', '"').replace('&', '&')
|
||||
return v
|
||||
attrs = [(k.lower(), cleanattr(v)) for k, v in attrs if cleanattr(v)]
|
||||
attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs if cleanattr(v)]
|
||||
return attrs
|
||||
|
||||
def do_base(self, attrs):
|
||||
attrsD = dict(self.normalize_attrs(attrs))
|
||||
if 'href' not in attrsD: return
|
||||
self.baseuri = attrsD['href']
|
||||
|
||||
def error(self, *a, **kw): pass # we're not picky
|
||||
|
||||
class LinkParser(BaseParser):
|
||||
FEED_TYPES = ('application/rss+xml',
|
||||
'text/xml',
|
||||
'application/atom+xml',
|
||||
'application/x.atom+xml',
|
||||
'application/x-atom+xml')
|
||||
def do_link(self, attrs):
|
||||
attrsD = dict(self.normalize_attrs(attrs))
|
||||
if 'rel' not in attrsD: return
|
||||
rels = attrsD['rel'].split()
|
||||
if 'alternate' not in rels: return
|
||||
if attrsD.get('type') not in self.FEED_TYPES: return
|
||||
if 'href' not in attrsD: return
|
||||
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
|
||||
|
||||
class ALinkParser(BaseParser):
|
||||
def start_a(self, attrs):
|
||||
attrsD = dict(self.normalize_attrs(attrs))
|
||||
if 'href' not in attrsD: return
|
||||
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
|
||||
|
||||
def makeFullURI(uri):
|
||||
if not uri: return
|
||||
uri = uri.strip()
|
||||
if uri.startswith('feed://'):
|
||||
uri = 'http://' + uri.split('feed://', 1).pop()
|
||||
for x in ['http', 'https']:
|
||||
if uri.startswith('%s://' % x):
|
||||
return uri
|
||||
return 'http://%s' % uri
|
||||
|
||||
def getLinks(data, baseuri):
|
||||
p = LinkParser(baseuri)
|
||||
p.feed(data)
|
||||
return p.links
|
||||
|
||||
def getLinksLXML(data, baseuri):
|
||||
parser = etree.HTMLParser(recover=True)
|
||||
tree = etree.parse(StringIO(data), parser)
|
||||
links = []
|
||||
for link in tree.findall('.//link'):
|
||||
if link.attrib.get('type') in LinkParser.FEED_TYPES:
|
||||
href = link.attrib['href']
|
||||
if href: links.append(href)
|
||||
return links
|
||||
|
||||
def getALinks(data, baseuri):
|
||||
p = ALinkParser(baseuri)
|
||||
p.feed(data)
|
||||
return p.links
|
||||
|
||||
def getLocalLinks(links, baseuri):
|
||||
found_links = []
|
||||
if not baseuri: return found_links
|
||||
baseuri = baseuri.lower()
|
||||
for l in links:
|
||||
try:
|
||||
if l.lower().startswith(baseuri):
|
||||
found_links.append(l)
|
||||
except (AttributeError, UnicodeDecodeError):
|
||||
pass
|
||||
return found_links
|
||||
|
||||
def isFeedLink(link):
|
||||
return link[-4:].lower() in ('.rss', '.rdf', '.xml', '.atom')
|
||||
|
||||
def isXMLRelatedLink(link):
|
||||
link = link.lower()
|
||||
return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
|
||||
|
||||
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
|
||||
def tryBrokenRedirect(data):
|
||||
if '<newLocation' in data:
|
||||
newuris = r_brokenRedirect.findall(data)
|
||||
if newuris and newuris[0]: return newuris[0].strip()
|
||||
|
||||
def couldBeFeedData(data):
|
||||
data = data.lower()
|
||||
if data.count('<html'): return 0
|
||||
return data.count('<rss') + data.count('<rdf') + data.count('<feed')
|
||||
|
||||
def isFeed(uri):
|
||||
_debuglog('seeing if %s is a feed' % uri)
|
||||
protocol = urllib.parse.urlparse(uri)
|
||||
if protocol[0] not in ('http', 'https'): return 0
|
||||
try:
|
||||
data = _gatekeeper.get(uri, check=False)
|
||||
except (KeyError, UnicodeDecodeError):
|
||||
return False
|
||||
count = couldBeFeedData(data)
|
||||
return count
|
||||
|
||||
def sortFeeds(feed1Info, feed2Info):
|
||||
return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])
|
||||
|
||||
def getFeedsFromSyndic8(uri):
|
||||
feeds = []
|
||||
try:
|
||||
server = xmlrpc.client.Server('http://www.syndic8.com/xmlrpc.php')
|
||||
feedids = server.syndic8.FindFeeds(uri)
|
||||
infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
|
||||
infolist.sort(sortFeeds)
|
||||
feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
|
||||
_debuglog('found %s feeds through Syndic8' % len(feeds))
|
||||
except:
|
||||
pass
|
||||
return feeds
|
||||
|
||||
def feeds(uri, all=False, querySyndic8=False, _recurs=None):
|
||||
if _recurs is None: _recurs = [uri]
|
||||
fulluri = makeFullURI(uri)
|
||||
try:
|
||||
data = _gatekeeper.get(fulluri, check=False)
|
||||
except:
|
||||
return []
|
||||
# is this already a feed?
|
||||
if couldBeFeedData(data):
|
||||
return [fulluri]
|
||||
newuri = tryBrokenRedirect(data)
|
||||
if newuri and newuri not in _recurs:
|
||||
_recurs.append(newuri)
|
||||
return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
|
||||
# nope, it's a page, try LINK tags first
|
||||
_debuglog('looking for LINK tags')
|
||||
try:
|
||||
outfeeds = getLinks(data, fulluri)
|
||||
except:
|
||||
outfeeds = []
|
||||
if not outfeeds:
|
||||
_debuglog('using lxml to look for LINK tags')
|
||||
try:
|
||||
outfeeds = getLinksLXML(data, fulluri)
|
||||
except:
|
||||
outfeeds = []
|
||||
_debuglog('found %s feeds through LINK tags' % len(outfeeds))
|
||||
outfeeds = list(filter(isFeed, outfeeds))
|
||||
if all or not outfeeds:
|
||||
# no LINK tags, look for regular <A> links that point to feeds
|
||||
_debuglog('no LINK tags, looking at A tags')
|
||||
try:
|
||||
links = getALinks(data, fulluri)
|
||||
except:
|
||||
links = []
|
||||
_debuglog('no LINK tags, looking at local links')
|
||||
locallinks = getLocalLinks(links, fulluri)
|
||||
# look for obvious feed links on the same server
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks)))))
|
||||
if all or not outfeeds:
|
||||
# look harder for feed links on the same server
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks)))))
|
||||
if all or not outfeeds:
|
||||
# look for obvious feed links on another server
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links)))))
|
||||
if all or not outfeeds:
|
||||
# look harder for feed links on another server
|
||||
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links)))))
|
||||
if all or not outfeeds:
|
||||
_debuglog('no A tags, guessing')
|
||||
suffixes = [ # filenames used by popular software:
|
||||
'feed/', # obvious
|
||||
'atom.xml', # blogger, TypePad
|
||||
'index.atom', # MT, apparently
|
||||
'index.rdf', # MT
|
||||
'rss.xml', # Dave Winer/Manila
|
||||
'index.xml', # MT
|
||||
'index.rss' # Slash
|
||||
]
|
||||
outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])))
|
||||
if (all or not outfeeds) and querySyndic8:
|
||||
# still no luck, search Syndic8 for feeds (requires xmlrpclib)
|
||||
_debuglog('still no luck, searching Syndic8')
|
||||
outfeeds.extend(getFeedsFromSyndic8(uri))
|
||||
if hasattr(__builtins__, 'set') or 'set' in __builtins__:
|
||||
outfeeds = list(set(outfeeds))
|
||||
return outfeeds
|
||||
|
||||
getFeeds = feeds # backwards-compatibility
|
||||
|
||||
def feed(uri):
|
||||
#todo: give preference to certain feed formats
|
||||
feedlist = feeds(uri)
|
||||
if feedlist:
|
||||
feeds_no_comments = [f for f in feedlist if 'comments' not in f.lower()]
|
||||
if feeds_no_comments:
|
||||
return feeds_no_comments[0]
|
||||
return feedlist[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
##### test harness ######
|
||||
|
||||
def test():
|
||||
uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
|
||||
failed = []
|
||||
count = 0
|
||||
while 1:
|
||||
data = _gatekeeper.get(uri)
|
||||
if data.find('Atom autodiscovery test') == -1: break
|
||||
sys.stdout.write('.')
|
||||
sys.stdout.flush()
|
||||
count += 1
|
||||
links = getLinks(data, uri)
|
||||
if not links:
|
||||
print('\n*** FAILED ***', uri, 'could not find link')
|
||||
failed.append(uri)
|
||||
elif len(links) > 1:
|
||||
print('\n*** FAILED ***', uri, 'found too many links')
|
||||
failed.append(uri)
|
||||
else:
|
||||
atomdata = urllib.request.urlopen(links[0]).read()
|
||||
if atomdata.find('<link rel="alternate"') == -1:
|
||||
print('\n*** FAILED ***', uri, 'retrieved something that is not a feed')
|
||||
failed.append(uri)
|
||||
else:
|
||||
backlink = atomdata.split('href="').pop().split('"')[0]
|
||||
if backlink != uri:
|
||||
print('\n*** FAILED ***', uri, 'retrieved wrong feed')
|
||||
failed.append(uri)
|
||||
if data.find('<link rel="next" href="') == -1: break
|
||||
uri = urllib.parse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
|
||||
print()
|
||||
print(count, 'tests executed,', len(failed), 'failed')
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = sys.argv[1:]
|
||||
if args and args[0] == '--debug':
|
||||
_debug = 1
|
||||
args.pop(0)
|
||||
if args:
|
||||
uri = args[0]
|
||||
else:
|
||||
uri = 'http://diveintomark.org/'
|
||||
if uri == 'test':
|
||||
test()
|
||||
else:
|
||||
print("\n".join(getFeeds(uri)))
|
|
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from __future__ import print_function
-
 
 __version__ = "0.0.3"
 
@@ -145,7 +145,7 @@ def url_feed_prob(url):
 if "georss" in url:
 return -1
 kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
-for p, t in zip(range(len(kw), 0, -1), kw):
+for p, t in zip(list(range(len(kw), 0, -1)), kw):
 if t in url:
 return p
 return 0
@@ -156,10 +156,10 @@ def sort_urls(feeds):
 
 
 if __name__ == "__main__":
-print(find_feeds("www.preposterousuniverse.com/blog/"))
-print(find_feeds("http://xkcd.com"))
-print(find_feeds("dan.iel.fm/atom.xml"))
-print(find_feeds("dan.iel.fm", check_all=True))
-print(find_feeds("kapadia.github.io"))
-print(find_feeds("blog.jonathansick.ca"))
-print(find_feeds("asdasd"))
+print((find_feeds("www.preposterousuniverse.com/blog/")))
+print((find_feeds("http://xkcd.com")))
+print((find_feeds("dan.iel.fm/atom.xml")))
+print((find_feeds("dan.iel.fm", check_all=True)))
+print((find_feeds("kapadia.github.io")))
+print((find_feeds("blog.jonathansick.ca")))
+print((find_feeds("asdasd")))
165 utils/feedfinder2.py.bak Executable file
@@ -0,0 +1,165 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
|
||||
__version__ = "0.0.3"
|
||||
|
||||
try:
|
||||
__FEEDFINDER2_SETUP__
|
||||
except NameError:
|
||||
__FEEDFINDER2_SETUP__ = False
|
||||
|
||||
if not __FEEDFINDER2_SETUP__:
|
||||
__all__ = ["find_feeds"]
|
||||
|
||||
import logging
|
||||
import requests
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
from six.moves.urllib import parse as urlparse
|
||||
|
||||
|
||||
def coerce_url(url):
|
||||
url = url.strip()
|
||||
if url.startswith("feed://"):
|
||||
return "http://{0}".format(url[7:])
|
||||
for proto in ["http://", "https://"]:
|
||||
if url.startswith(proto):
|
||||
return url
|
||||
return "http://{0}".format(url)
|
||||
|
||||
|
||||
class FeedFinder(object):
|
||||
|
||||
def __init__(self, user_agent=None):
|
||||
if user_agent is None:
|
||||
user_agent = "NewsBlur Feed Finder"
|
||||
self.user_agent = user_agent
|
||||
|
||||
def get_feed(self, url, skip_user_agent=False):
|
||||
try:
|
||||
r = requests.get(url, headers={"User-Agent": self.user_agent if not skip_user_agent else None})
|
||||
except Exception as e:
|
||||
logging.warn("Error while getting '{0}'".format(url))
|
||||
logging.warn("{0}".format(e))
|
||||
return None
|
||||
if not skip_user_agent and r.status_code == 403:
|
||||
return self.get_feed(url, skip_user_agent=True)
|
||||
return r.text
|
||||
|
||||
def is_feed_data(self, text):
|
||||
data = text.lower()
|
||||
if data and data[:100].count("<html"):
|
||||
return False
|
||||
return data.count("<rss")+data.count("<rdf")+data.count("<feed")+data.count("jsonfeed.org")
|
||||
|
||||
def is_feed(self, url):
|
||||
text = self.get_feed(url)
|
||||
if text is None:
|
||||
return False
|
||||
return self.is_feed_data(text)
|
||||
|
||||
def is_feed_url(self, url):
|
||||
return any(map(url.lower().endswith,
|
||||
[".rss", ".rdf", ".xml", ".atom", ".json"]))
|
||||
|
||||
def is_feedlike_url(self, url):
|
||||
return any(map(url.lower().count,
|
||||
["rss", "rdf", "xml", "atom", "feed", "json"]))
|
||||
|
||||
|
||||
def find_feeds(url, check_all=False, user_agent=None):
|
||||
finder = FeedFinder(user_agent=user_agent)
|
||||
|
||||
# Format the URL properly.
|
||||
url = coerce_url(url)
|
||||
|
||||
# Download the requested URL.
|
||||
feed_text = finder.get_feed(url)
|
||||
if feed_text is None:
|
||||
return []
|
||||
|
||||
# Check if it is already a feed.
|
||||
if finder.is_feed_data(feed_text):
|
||||
return [url]
|
||||
|
||||
# Look for <link> tags.
|
||||
logging.info("Looking for <link> tags.")
|
||||
try:
|
||||
tree = BeautifulSoup(feed_text)
|
||||
except ValueError:
|
||||
return []
|
||||
links = []
|
||||
for link in tree.findAll("link"):
|
||||
if link.get("type") in ["application/rss+xml",
|
||||
"text/xml",
|
||||
"application/atom+xml",
|
||||
"application/x.atom+xml",
|
||||
"application/x-atom+xml",
|
||||
"application/json"]:
|
||||
links.append(urlparse.urljoin(url, link.get("href", "")))
|
||||
|
||||
# Check the detected links.
|
||||
urls = list(filter(finder.is_feed, links))
|
||||
logging.info("Found {0} feed <link> tags.".format(len(urls)))
|
||||
if len(urls) and not check_all:
|
||||
return sort_urls(urls)
|
||||
|
||||
# Look for <a> tags.
|
||||
logging.info("Looking for <a> tags.")
|
||||
local, remote = [], []
|
||||
for a in tree.findAll("a"):
|
||||
href = a.get("href", None)
|
||||
if href is None:
|
||||
continue
|
||||
if "://" not in href and finder.is_feed_url(href):
|
||||
local.append(href)
|
||||
if finder.is_feedlike_url(href):
|
||||
remote.append(href)
|
||||
|
||||
# Check the local URLs.
|
||||
local = [urlparse.urljoin(url, l) for l in local]
|
||||
urls += list(filter(finder.is_feed, local))
|
||||
logging.info("Found {0} local <a> links to feeds.".format(len(urls)))
|
||||
if len(urls) and not check_all:
|
||||
return sort_urls(urls)
|
||||
|
||||
# Check the remote URLs.
|
||||
remote = [urlparse.urljoin(url, l) for l in remote]
|
||||
urls += list(filter(finder.is_feed, remote))
|
||||
logging.info("Found {0} remote <a> links to feeds.".format(len(urls)))
|
||||
if len(urls) and not check_all:
|
||||
return sort_urls(urls)
|
||||
|
||||
# Guessing potential URLs.
|
||||
fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml",
|
||||
"index.rss", "index.json"]
|
||||
urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f)
|
||||
for f in fns]))
|
||||
return sort_urls(urls)
|
||||
|
||||
|
||||
def url_feed_prob(url):
|
||||
if "comments" in url:
|
||||
return -2
|
||||
if "georss" in url:
|
||||
return -1
|
||||
kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
|
||||
for p, t in zip(list(range(len(kw), 0, -1)), kw):
|
||||
if t in url:
|
||||
return p
|
||||
return 0
|
||||
|
||||
|
||||
def sort_urls(feeds):
|
||||
return sorted(list(set(feeds)), key=url_feed_prob, reverse=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(find_feeds("www.preposterousuniverse.com/blog/"))
|
||||
print(find_feeds("http://xkcd.com"))
|
||||
print(find_feeds("dan.iel.fm/atom.xml"))
|
||||
print(find_feeds("dan.iel.fm", check_all=True))
|
||||
print(find_feeds("kapadia.github.io"))
|
||||
print(find_feeds("blog.jonathansick.ca"))
|
||||
print(find_feeds("asdasd"))
|
|
@@ -3,7 +3,7 @@
 from PIL import Image
 from PIL import ImageOps as PILOps
 from PIL.ExifTags import TAGS
-from StringIO import StringIO
+from io import StringIO
 from vendor import reseekfile
 
 PROFILE_PICTURE_SIZES = {
180 utils/json_functions.py.bak Normal file
@@ -0,0 +1,180 @@
|
|||
#-*- coding: utf-8 -*-
|
||||
from django.db import models
|
||||
from django.utils.functional import Promise
|
||||
from django.utils.encoding import force_unicode, smart_unicode
|
||||
import json
|
||||
from decimal import Decimal
|
||||
from django.core import serializers
|
||||
from django.conf import settings
|
||||
from django.http import HttpResponse, HttpResponseForbidden, Http404
|
||||
from django.core.mail import mail_admins
from django.db.models.query import QuerySet
from mongoengine.queryset.queryset import QuerySet as MongoQuerySet
from bson.objectid import ObjectId
import sys
import datetime


def decode(data):
    if not data:
        return data
    return json.loads(data)


def encode(data, *args, **kwargs):
    if type(data) == QuerySet:  # Careful, ValuesQuerySet is a dict
        # Django models
        return serializers.serialize("json", data, *args, **kwargs)
    else:
        return json_encode(data, *args, **kwargs)


def json_encode(data, *args, **kwargs):
    """
    The main issue with django's default json serializer is that properties
    that have been added to an object dynamically are ignored (and it also
    has problems with some models).
    """

    def _any(data):
        ret = None
        # Oops, we used to check if it is of type list, but that fails
        # e.g. in the case of django.newforms.utils.ErrorList, which extends
        # the type "list". Oh man, that was a dumb mistake!
        if hasattr(data, 'canonical'):
            ret = _any(data.canonical())
        elif isinstance(data, list):
            ret = _list(data)
        elif isinstance(data, set):
            ret = _list(list(data))
        # Same as for lists above.
        elif isinstance(data, dict):
            ret = _dict(data)
        elif isinstance(data, (Decimal, ObjectId)):
            # json.dumps() can't handle Decimal
            ret = str(data)
        elif isinstance(data, models.query.QuerySet):
            # Actually it's the same as a list ...
            ret = _list(data)
        elif isinstance(data, MongoQuerySet):
            # Actually it's the same as a list ...
            ret = _list(data)
        elif isinstance(data, models.Model):
            ret = _model(data)
        # here we need to encode the string as unicode (otherwise we get utf-16 in the json-response)
        elif isinstance(data, basestring):
            ret = smart_unicode(data)
        elif isinstance(data, Exception):
            ret = unicode(data)
        # see http://code.djangoproject.com/ticket/5868
        elif isinstance(data, Promise):
            ret = force_unicode(data)
        elif isinstance(data, datetime.datetime) or isinstance(data, datetime.date):
            ret = str(data)
        elif hasattr(data, 'to_json'):
            ret = data.to_json()
        else:
            ret = data
        return ret

    def _model(data):
        ret = {}
        # If we only have a model, we only want to encode the fields.
        for f in data._meta.fields:
            ret[f.attname] = _any(getattr(data, f.attname))
        # And additionally encode arbitrary properties that had been added.
        fields = dir(data.__class__) + ret.keys()
        add_ons = [k for k in dir(data) if k not in fields]
        for k in add_ons:
            ret[k] = _any(getattr(data, k))
        return ret

    def _list(data):
        ret = []
        for v in data:
            ret.append(_any(v))
        return ret

    def _dict(data):
        ret = {}
        for k, v in data.items():
            ret[str(k)] = _any(v)
        return ret

    if hasattr(data, 'to_json'):
        data = data.to_json()
    ret = _any(data)

    return json.dumps(ret)


def json_view(func):
    def wrap(request, *a, **kw):
        response = func(request, *a, **kw)
        return json_response(request, response)

    if isinstance(func, HttpResponse):
        return func
    else:
        return wrap


def json_response(request, response=None):
    code = 200

    if isinstance(response, HttpResponseForbidden):
        return response

    try:
        if isinstance(response, dict):
            response = dict(response)
            if 'result' not in response:
                response['result'] = 'ok'
            authenticated = request.user.is_authenticated
            response['authenticated'] = authenticated
            if authenticated:
                response['user_id'] = request.user.pk
    except KeyboardInterrupt:
        # Allow keyboard interrupts through for debugging.
        raise
    except Http404:
        raise Http404
    except Exception as e:
        # Mail the admins with the error
        exc_info = sys.exc_info()
        subject = 'JSON view error: %s' % request.path
        try:
            request_repr = repr(request)
        except:
            request_repr = 'Request repr() unavailable'
        import traceback
        message = 'Traceback:\n%s\n\nRequest:\n%s' % (
            '\n'.join(traceback.format_exception(*exc_info)),
            request_repr,
        )

        response = {'result': 'error',
                    'text': unicode(e)}
        code = 500
        if not settings.DEBUG:
            mail_admins(subject, message, fail_silently=True)
        else:
            print '\n'.join(traceback.format_exception(*exc_info))

    json = json_encode(response)
    return HttpResponse(json, content_type='application/json', status=code)


def main():
    test = {
        1: True,
        2: u"string",
        3: 30,
        4: u"юнікод, ўўў, © ™ ® ё ² § $ ° ќо́",
        5: "utf-8: \xd1\x9e, \xc2\xa9 \xe2\x84\xa2 \xc2\xae \xd1\x91 \xd0\xba\xcc\x81\xd0\xbe\xcc\x81",
    }
    json_test = json_encode(test)

    print test, json_test


if __name__ == '__main__':
    main()
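For orientation, the sketch below shows how the json_view decorator and json_response defined above are typically consumed from a Django view. It is a hypothetical example, not code from this commit: the view name is invented and the import path assumes this module is importable as utils.json_functions.

# Hypothetical usage sketch, not part of this commit.
import datetime

from utils.json_functions import json_view  # assumed module path


@json_view
def ping(request):
    # json_response() adds 'result', 'authenticated' and (when logged in)
    # 'user_id' to returned dicts before json_encode() serializes them;
    # datetime values are stringified inside _any().
    return {'now': datetime.datetime.now()}

Because wrap() funnels every return value through json_response(), an uncaught exception in the view comes back as a {'result': 'error'} payload with HTTP 500, and the admins are mailed unless settings.DEBUG is on.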
10
utils/log.py
@@ -5,7 +5,7 @@ import time

 from django.core.handlers.wsgi import WSGIRequest
 from django.conf import settings
-from django.utils.encoding import smart_unicode
+from django.utils.encoding import smart_str

 from user_functions import extract_user_agent
 from apps.statistics.rstats import RStats

@@ -22,7 +22,7 @@ def getlogger():


 def user(u, msg, request=None, warn_color=True):
-    msg = smart_unicode(msg)
+    msg = smart_text(msg)
     if not u:
         return debug(msg)

@@ -72,19 +72,19 @@ def cipher(msg):


 def debug(msg):
-    msg = smart_unicode(msg)
+    msg = smart_text(msg)
     logger = getlogger()
     logger.debug(colorize(msg))


 def info(msg):
-    msg = smart_unicode(msg)
+    msg = smart_text(msg)
     logger = getlogger()
     logger.info(colorize(msg))


 def error(msg):
-    msg = smart_unicode(msg)
+    msg = smart_text(msg)
     logger = getlogger()
     logger.error(msg)
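As a rough sanity check of why this substitution works on Python 3 (assuming a Django release that still ships both helpers; smart_text was removed in Django 4.0): smart_str and smart_text both coerce their argument to the native str type, so the log helpers keep accepting bytes or text. The snippet below is illustrative, not part of the commit.

# Illustrative check, not part of this commit.
from django.utils.encoding import smart_str, smart_text

assert smart_str(b'caf\xc3\xa9') == smart_text(b'caf\xc3\xa9') == u'café'
assert isinstance(smart_str(42), str)  # non-strings are coerced via str()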
utils/s3_utils.py

@@ -49,7 +49,7 @@ def list_backup_in_s3():
     bucket = conn.get_bucket(BUCKET_NAME)

     for i, key in enumerate(bucket.get_all_keys()):
-        print "[%s] %s" % (i, key.name)
+        print("[%s] %s" % (i, key.name))

 def delete_all_backups():
     #FIXME: validate filename exists

@@ -57,13 +57,13 @@ def delete_all_backups():
     bucket = conn.get_bucket(BUCKET_NAME)

     for i, key in enumerate(bucket.get_all_keys()):
-        print "deleting %s" % (key.name)
+        print("deleting %s" % (key.name))
         key.delete()

 if __name__ == '__main__':
     import sys
     if len(sys.argv) < 3:
-        print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
+        print('Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0]))
     else:
         if sys.argv[1] == 'set':
             save_file_in_s3(sys.argv[2])

@@ -74,7 +74,7 @@ if __name__ == '__main__':
         elif sys.argv[1] == 'delete':
             delete_all_backups()
         else:
-            print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
+            print('Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0]))


 class S3Store:
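The __main__ block touched by these hunks doubles as a small backup CLI (get/set/list/delete). As a hypothetical driver, the same functions can also be called directly; the module path and the backup filename below are assumptions, not taken from the commit.

# Hypothetical driver, not part of this commit. Assumes Django settings with
# the S3 credentials are importable, as the module itself requires.
from utils.s3_utils import save_file_in_s3, get_file_from_s3, list_backup_in_s3

save_file_in_s3('backup_db_2020-01-01.sql.gz')   # same as the 'set' branch
get_file_from_s3('backup_db_2020-01-01.sql.gz')  # same as the 'get' branch
list_backup_in_s3()                              # same as the 'list' branch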
148
utils/s3_utils.py.bak
Normal file
@@ -0,0 +1,148 @@
import os
import sys
import time
import mimetypes
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from utils.image_functions import ImageOps

if '/srv/newsblur' not in ' '.join(sys.path):
    sys.path.append("/srv/newsblur")

os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
from django.conf import settings

ACCESS_KEY = settings.S3_ACCESS_KEY
SECRET = settings.S3_SECRET
BUCKET_NAME = settings.S3_BACKUP_BUCKET  # Note that you need to create this bucket first

import ssl

_old_match_hostname = ssl.match_hostname

def _new_match_hostname(cert, hostname):
    if hostname.endswith('.s3.amazonaws.com'):
        pos = hostname.find('.s3.amazonaws.com')
        hostname = hostname[:pos].replace('.', '') + hostname[pos:]
    return _old_match_hostname(cert, hostname)

ssl.match_hostname = _new_match_hostname

def save_file_in_s3(filename):
    conn = S3Connection(ACCESS_KEY, SECRET)
    bucket = conn.get_bucket(BUCKET_NAME)
    k = Key(bucket)
    k.key = filename

    k.set_contents_from_filename(filename)

def get_file_from_s3(filename):
    conn = S3Connection(ACCESS_KEY, SECRET)
    bucket = conn.get_bucket(BUCKET_NAME)
    k = Key(bucket)
    k.key = filename

    k.get_contents_to_filename(filename)

def list_backup_in_s3():
    conn = S3Connection(ACCESS_KEY, SECRET)
    bucket = conn.get_bucket(BUCKET_NAME)

    for i, key in enumerate(bucket.get_all_keys()):
        print "[%s] %s" % (i, key.name)

def delete_all_backups():
    #FIXME: validate filename exists
    conn = S3Connection(ACCESS_KEY, SECRET)
    bucket = conn.get_bucket(BUCKET_NAME)

    for i, key in enumerate(bucket.get_all_keys()):
        print "deleting %s" % (key.name)
        key.delete()

if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
    else:
        if sys.argv[1] == 'set':
            save_file_in_s3(sys.argv[2])
        elif sys.argv[1] == 'get':
            get_file_from_s3(sys.argv[2])
        elif sys.argv[1] == 'list':
            list_backup_in_s3()
        elif sys.argv[1] == 'delete':
            delete_all_backups()
        else:
            print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])


class S3Store:

    def __init__(self, bucket_name=settings.S3_AVATARS_BUCKET_NAME):
        if settings.DEBUG:
            import ssl

            try:
                _create_unverified_https_context = ssl._create_unverified_context
            except AttributeError:
                # Legacy Python that doesn't verify HTTPS certificates by default
                pass
            else:
                # Handle target environment that doesn't support HTTPS verification
                ssl._create_default_https_context = _create_unverified_https_context

        self.s3 = S3Connection(ACCESS_KEY, SECRET)
        self.bucket = self.create_bucket(bucket_name)

    def create_bucket(self, bucket_name):
        return self.s3.create_bucket(bucket_name)

    def save_profile_picture(self, user_id, filename, image_body):
        content_type, extension = self._extract_content_type(filename)
        if not content_type or not extension:
            return

        image_name = 'profile_%s.%s' % (int(time.time()), extension)

        image = ImageOps.resize_image(image_body, 'fullsize', fit_to_size=False)
        if image:
            key = 'avatars/%s/large_%s' % (user_id, image_name)
            self._save_object(key, image, content_type=content_type)

        image = ImageOps.resize_image(image_body, 'thumbnail', fit_to_size=True)
        if image:
            key = 'avatars/%s/thumbnail_%s' % (user_id, image_name)
            self._save_object(key, image, content_type=content_type)

        return image and image_name

    def _extract_content_type(self, filename):
        content_type = mimetypes.guess_type(filename)[0]
        extension = None

        if content_type == 'image/jpeg':
            extension = 'jpg'
        elif content_type == 'image/png':
            extension = 'png'
        elif content_type == 'image/gif':
            extension = 'gif'

        return content_type, extension

    def _make_key(self):
        return Key(bucket=self.bucket)

    def _save_object(self, key, file_object, content_type=None):
        k = self._make_key()
        k.key = key
        file_object.seek(0)

        if content_type:
            k.set_contents_from_file(file_object, headers={
                'Content-Type': content_type,
            })
        else:
            k.set_contents_from_file(file_object)
        k.set_acl('public-read')
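To show how the S3Store class at the bottom of this file is meant to be driven, here is a hypothetical caller. The helper name, the file path handling, and the assumption that image_body is a file-like object accepted by ImageOps.resize_image are mine, not the commit's. Note that, as written above, DEBUG mode disables HTTPS certificate verification and ssl.match_hostname is monkey-patched to tolerate dotted bucket names.

# Hypothetical caller, not part of this commit.
from utils.s3_utils import S3Store  # assumed module path

def upload_avatar(user_id, path):
    store = S3Store()  # defaults to settings.S3_AVATARS_BUCKET_NAME
    with open(path, 'rb') as f:
        # Returns the generated image name, or a falsy value when the
        # content type is not jpeg/png/gif or the resize fails.
        return store.save_profile_picture(user_id, path, f)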