2to3 NewsBlur/utils/

jmath1 2020-06-13 13:13:20 -04:00
parent 5c1e0a1c5e
commit 66b2011707
21 changed files with 3681 additions and 172 deletions

View file

@ -19,16 +19,16 @@ class WriteXmlMixin:
def to_xml(self, encoding = "iso-8859-1"):
try:
import cStringIO as StringIO
import io as StringIO
except ImportError:
import StringIO
f = StringIO.StringIO()
import io
f = io.StringIO()
self.write_xml(f, encoding)
return f.getvalue()
def _element(handler, name, obj, d = {}):
if isinstance(obj, basestring) or obj is None:
if isinstance(obj, str) or obj is None:
# special-case handling to make the API easier
# to use for the common case.
handler.startElement(name, d)
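
For context, the converted to_xml amounts to the sketch below under Python 3 (io.StringIO replaces the old cStringIO/StringIO fallback; the standalone name to_xml_py3 is only for illustration):

    import io
    from xml.sax import saxutils

    def to_xml_py3(publishable, encoding="iso-8859-1"):
        # XMLGenerator accepts a text stream in Python 3, so io.StringIO
        # stands in for the removed cStringIO/StringIO modules.
        f = io.StringIO()
        handler = saxutils.XMLGenerator(f, encoding)
        handler.startDocument()
        publishable.publish(handler)   # any object implementing the 'publish' API
        handler.endDocument()
        return f.getvalue()
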
@ -337,7 +337,7 @@ class RSS2(WriteXmlMixin):
_opt_element(handler, "lastBuildDate", lastBuildDate)
for category in self.categories:
if isinstance(category, basestring):
if isinstance(category, str):
category = Category(category)
category.publish(handler)
@ -418,7 +418,7 @@ class RSSItem(WriteXmlMixin):
_opt_element(handler, "author", self.author)
for category in self.categories:
if isinstance(category, basestring):
if isinstance(category, str):
category = Category(category)
category.publish(handler)
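
The basestring fixer behind the hunks above: Python 2's basestring matched both str and unicode, while in Python 3 all text is str. A minimal illustration (the value is hypothetical):

    value = "Technology"   # hypothetical category value
    # Python 2: isinstance(value, basestring) matched str and unicode.
    # Python 3: all text is str; bytes intentionally do not match.
    if isinstance(value, str):
        print("treat %r as a plain category name" % value)
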

443 utils/PyRSS2Gen.py.bak Normal file
View file

@ -0,0 +1,443 @@
"""PyRSS2Gen - A Python library for generating RSS 2.0 feeds."""
__name__ = "PyRSS2Gen"
__version__ = (1, 0, 0)
__author__ = "Andrew Dalke <dalke@dalkescientific.com>"
_generator_name = __name__ + "-" + ".".join(map(str, __version__))
import datetime
# Could make this the base class; will need to add 'publish'
class WriteXmlMixin:
def write_xml(self, outfile, encoding = "iso-8859-1"):
from xml.sax import saxutils
handler = saxutils.XMLGenerator(outfile, encoding)
handler.startDocument()
self.publish(handler)
handler.endDocument()
def to_xml(self, encoding = "iso-8859-1"):
try:
import cStringIO as StringIO
except ImportError:
import StringIO
f = StringIO.StringIO()
self.write_xml(f, encoding)
return f.getvalue()
def _element(handler, name, obj, d = {}):
if isinstance(obj, basestring) or obj is None:
# special-case handling to make the API easier
# to use for the common case.
handler.startElement(name, d)
if obj is not None:
handler.characters(obj)
handler.endElement(name)
else:
# It better know how to emit the correct XML.
obj.publish(handler)
def _opt_element(handler, name, obj):
if obj is None:
return
_element(handler, name, obj)
def _format_date(dt):
"""convert a datetime into an RFC 822 formatted date
Input date must be in GMT.
"""
# Looks like:
# Sat, 07 Sep 2002 00:00:01 GMT
# Can't use strftime because that's locale dependent
#
# Isn't there a standard way to do this for Python? The
# rfc822 and email.Utils modules assume a timestamp. The
# following is based on the rfc822 module.
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
dt.day,
["Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
dt.year, dt.hour, dt.minute, dt.second)
##
# A couple simple wrapper objects for the fields which
# take a simple value other than a string.
class IntElement:
"""implements the 'publish' API for integers
Takes the tag name and the integer value to publish.
(Could be used for anything which uses str() to be published
to text for XML.)
"""
element_attrs = {}
def __init__(self, name, val):
self.name = name
self.val = val
def publish(self, handler):
handler.startElement(self.name, self.element_attrs)
handler.characters(str(self.val))
handler.endElement(self.name)
class DateElement:
"""implements the 'publish' API for a datetime.datetime
Takes the tag name and the datetime to publish.
Converts the datetime to RFC 2822 timestamp (4-digit year).
"""
def __init__(self, name, dt):
self.name = name
self.dt = dt
def publish(self, handler):
_element(handler, self.name, _format_date(self.dt))
####
class Category:
"""Publish a category element"""
def __init__(self, category, domain = None):
self.category = category
self.domain = domain
def publish(self, handler):
d = {}
if self.domain is not None:
d["domain"] = self.domain
_element(handler, "category", self.category, d)
class Cloud:
"""Publish a cloud"""
def __init__(self, domain, port, path,
registerProcedure, protocol):
self.domain = domain
self.port = port
self.path = path
self.registerProcedure = registerProcedure
self.protocol = protocol
def publish(self, handler):
_element(handler, "cloud", None, {
"domain": self.domain,
"port": str(self.port),
"path": self.path,
"registerProcedure": self.registerProcedure,
"protocol": self.protocol})
class Image:
"""Publish a channel Image"""
element_attrs = {}
def __init__(self, url, title, link,
width = None, height = None, description = None):
self.url = url
self.title = title
self.link = link
self.width = width
self.height = height
self.description = description
def publish(self, handler):
handler.startElement("image", self.element_attrs)
_element(handler, "url", self.url)
_element(handler, "title", self.title)
_element(handler, "link", self.link)
width = self.width
if isinstance(width, int):
width = IntElement("width", width)
_opt_element(handler, "width", width)
height = self.height
if isinstance(height, int):
height = IntElement("height", height)
_opt_element(handler, "height", height)
_opt_element(handler, "description", self.description)
handler.endElement("image")
class Guid:
"""Publish a guid
Defaults to being a permalink, which is the assumption if it's
omitted. Hence strings are always permalinks.
"""
def __init__(self, guid, isPermaLink = 1):
self.guid = guid
self.isPermaLink = isPermaLink
def publish(self, handler):
d = {}
if self.isPermaLink:
d["isPermaLink"] = "true"
else:
d["isPermaLink"] = "false"
_element(handler, "guid", self.guid, d)
class TextInput:
"""Publish a textInput
Apparently this is rarely used.
"""
element_attrs = {}
def __init__(self, title, description, name, link):
self.title = title
self.description = description
self.name = name
self.link = link
def publish(self, handler):
handler.startElement("textInput", self.element_attrs)
_element(handler, "title", self.title)
_element(handler, "description", self.description)
_element(handler, "name", self.name)
_element(handler, "link", self.link)
handler.endElement("textInput")
class Enclosure:
"""Publish an enclosure"""
def __init__(self, url, length, type):
self.url = url
self.length = length
self.type = type
def publish(self, handler):
_element(handler, "enclosure", None,
{"url": self.url,
"length": str(self.length),
"type": self.type,
})
class Source:
"""Publish the item's original source, used by aggregators"""
def __init__(self, name, url):
self.name = name
self.url = url
def publish(self, handler):
_element(handler, "source", self.name, {"url": self.url})
class SkipHours:
"""Publish the skipHours
This takes a list of hours, as integers.
"""
element_attrs = {}
def __init__(self, hours):
self.hours = hours
def publish(self, handler):
if self.hours:
handler.startElement("skipHours", self.element_attrs)
for hour in self.hours:
_element(handler, "hour", str(hour))
handler.endElement("skipHours")
class SkipDays:
"""Publish the skipDays
This takes a list of days as strings.
"""
element_attrs = {}
def __init__(self, days):
self.days = days
def publish(self, handler):
if self.days:
handler.startElement("skipDays", self.element_attrs)
for day in self.days:
_element(handler, "day", day)
handler.endElement("skipDays")
class RSS2(WriteXmlMixin):
"""The main RSS class.
Stores the channel attributes, with the "category" elements under
".categories" and the RSS items under ".items".
"""
rss_attrs = {"version": "2.0"}
element_attrs = {}
def __init__(self,
title,
link,
description,
language = None,
copyright = None,
managingEditor = None,
webMaster = None,
pubDate = None, # a datetime, *in* *GMT*
lastBuildDate = None, # a datetime
categories = None, # list of strings or Category
generator = _generator_name,
docs = "http://blogs.law.harvard.edu/tech/rss",
cloud = None, # a Cloud
ttl = None, # integer number of minutes
image = None, # an Image
rating = None, # a string; I don't know how it's used
textInput = None, # a TextInput
skipHours = None, # a SkipHours with a list of integers
skipDays = None, # a SkipDays with a list of strings
items = None, # list of RSSItems
):
self.title = title
self.link = link
self.description = description
self.language = language
self.copyright = copyright
self.managingEditor = managingEditor
self.webMaster = webMaster
self.pubDate = pubDate
self.lastBuildDate = lastBuildDate
if categories is None:
categories = []
self.categories = categories
self.generator = generator
self.docs = docs
self.cloud = cloud
self.ttl = ttl
self.image = image
self.rating = rating
self.textInput = textInput
self.skipHours = skipHours
self.skipDays = skipDays
if items is None:
items = []
self.items = items
def publish(self, handler):
handler.startElement("rss", self.rss_attrs)
handler.startElement("channel", self.element_attrs)
_element(handler, "title", self.title)
_element(handler, "link", self.link)
_element(handler, "description", self.description)
self.publish_extensions(handler)
_opt_element(handler, "language", self.language)
_opt_element(handler, "copyright", self.copyright)
_opt_element(handler, "managingEditor", self.managingEditor)
_opt_element(handler, "webMaster", self.webMaster)
pubDate = self.pubDate
if isinstance(pubDate, datetime.datetime):
pubDate = DateElement("pubDate", pubDate)
_opt_element(handler, "pubDate", pubDate)
lastBuildDate = self.lastBuildDate
if isinstance(lastBuildDate, datetime.datetime):
lastBuildDate = DateElement("lastBuildDate", lastBuildDate)
_opt_element(handler, "lastBuildDate", lastBuildDate)
for category in self.categories:
if isinstance(category, basestring):
category = Category(category)
category.publish(handler)
_opt_element(handler, "generator", self.generator)
_opt_element(handler, "docs", self.docs)
if self.cloud is not None:
self.cloud.publish(handler)
ttl = self.ttl
if isinstance(self.ttl, int):
ttl = IntElement("ttl", ttl)
_opt_element(handler, "tt", ttl)
if self.image is not None:
self.image.publish(handler)
_opt_element(handler, "rating", self.rating)
if self.textInput is not None:
self.textInput.publish(handler)
if self.skipHours is not None:
self.skipHours.publish(handler)
if self.skipDays is not None:
self.skipDays.publish(handler)
for item in self.items:
item.publish(handler)
handler.endElement("channel")
handler.endElement("rss")
def publish_extensions(self, handler):
# Derived classes can hook into this to insert
# output after the three required fields.
pass
class RSSItem(WriteXmlMixin):
"""Publish an RSS Item"""
element_attrs = {}
def __init__(self,
title = None, # string
link = None, # url as string
description = None, # string
author = None, # email address as string
categories = None, # list of string or Category
comments = None, # url as string
enclosure = None, # an Enclosure
guid = None, # a unique string
pubDate = None, # a datetime
source = None, # a Source
):
if title is None and description is None:
raise TypeError(
"must define at least one of 'title' or 'description'")
self.title = title
self.link = link
self.description = description
self.author = author
if categories is None:
categories = []
self.categories = categories
self.comments = comments
self.enclosure = enclosure
self.guid = guid
self.pubDate = pubDate
self.source = source
# It sure does get tedious typing these names three times...
def publish(self, handler):
handler.startElement("item", self.element_attrs)
_opt_element(handler, "title", self.title)
_opt_element(handler, "link", self.link)
self.publish_extensions(handler)
_opt_element(handler, "description", self.description)
_opt_element(handler, "author", self.author)
for category in self.categories:
if isinstance(category, basestring):
category = Category(category)
category.publish(handler)
_opt_element(handler, "comments", self.comments)
if self.enclosure is not None:
self.enclosure.publish(handler)
_opt_element(handler, "guid", self.guid)
pubDate = self.pubDate
if isinstance(pubDate, datetime.datetime):
pubDate = DateElement("pubDate", pubDate)
_opt_element(handler, "pubDate", pubDate)
if self.source is not None:
self.source.publish(handler)
handler.endElement("item")
def publish_extensions(self, handler):
# Derived classes can hook into this to insert
# output after the title and link elements
pass
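
Since the .bak above preserves the whole library, a short usage sketch of its API may help; the feed values are made up:

    import datetime
    import PyRSS2Gen   # the module preserved above

    rss = PyRSS2Gen.RSS2(
        title="Example Channel",
        link="https://example.com/",
        description="A sample feed",
        lastBuildDate=datetime.datetime.utcnow(),
        items=[PyRSS2Gen.RSSItem(
            title="Hello world",
            link="https://example.com/hello",
            guid=PyRSS2Gen.Guid("https://example.com/hello"),
            pubDate=datetime.datetime(2020, 6, 13))])
    print(rss.to_xml("utf-8"))
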

View file

@ -11,13 +11,13 @@
import base64
import hmac
import httplib
import http.client
import re
import sha
import sys
import time
import urllib
import urlparse
import urllib.request, urllib.parse, urllib.error
import urllib.parse
import xml.sax
DEFAULT_HOST = 's3.amazonaws.com'
@ -34,13 +34,13 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
interesting_headers[lk] = headers[header_key].strip()
# these keys get empty strings if they don't exist
if not interesting_headers.has_key('content-type'):
if 'content-type' not in interesting_headers:
interesting_headers['content-type'] = ''
if not interesting_headers.has_key('content-md5'):
if 'content-md5' not in interesting_headers:
interesting_headers['content-md5'] = ''
# just in case someone used this. it's not necessary in this lib.
if interesting_headers.has_key('x-amz-date'):
if 'x-amz-date' in interesting_headers:
interesting_headers['date'] = ''
# if you're using expires for query string auth, then it trumps date
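
The has_key fixer shown above, as one self-contained pattern (the header names are just examples):

    headers = {"content-type": "text/plain"}   # example headers
    # Python 2: headers.has_key('content-md5')
    # Python 3: dict.has_key() is gone; use the 'in' operator instead.
    if "content-md5" not in headers:
        headers["content-md5"] = ""
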
@ -48,7 +48,7 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
if expires:
interesting_headers['date'] = str(expires)
sorted_header_keys = interesting_headers.keys()
sorted_header_keys = list(interesting_headers.keys())
sorted_header_keys.sort()
buf = "%s\n" % method
@ -63,17 +63,17 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
buf += "/%s" % bucket
# add the key. even if it doesn't exist, add the slash
buf += "/%s" % urllib.quote_plus(key)
buf += "/%s" % urllib.parse.quote_plus(key)
# handle special query string arguments
if query_args.has_key("acl"):
if "acl" in query_args:
buf += "?acl"
elif query_args.has_key("torrent"):
elif "torrent" in query_args:
buf += "?torrent"
elif query_args.has_key("logging"):
elif "logging" in query_args:
buf += "?logging"
elif query_args.has_key("location"):
elif "location" in query_args:
buf += "?location"
return buf
@ -83,13 +83,13 @@ def canonical_string(method, bucket="", key="", query_args={}, headers={}, expir
def encode(aws_secret_access_key, str, urlencode=False):
b64_hmac = base64.encodestring(hmac.new(aws_secret_access_key, str, sha).digest()).strip()
if urlencode:
return urllib.quote_plus(b64_hmac)
return urllib.parse.quote_plus(b64_hmac)
else:
return b64_hmac
def merge_meta(headers, metadata):
final_headers = headers.copy()
for k in metadata.keys():
for k in list(metadata.keys()):
final_headers[METADATA_PREFIX + k] = metadata[k]
return final_headers
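
Note that 2to3 only rewrites names it knows about: the encode() helper above still references the removed sha module and base64.encodestring, and hmac in Python 3 requires bytes. A hedged sketch of a fully ported signer (the function name and the utf-8 encodings are assumptions, not part of this commit):

    import base64
    import hashlib
    import hmac
    import urllib.parse

    def encode_py3(aws_secret_access_key, canonical, urlencode=False):
        # sha -> hashlib.sha1, base64.encodestring -> base64.b64encode,
        # and both the key and the message must be bytes under Python 3.
        digest = hmac.new(aws_secret_access_key.encode("utf-8"),
                          canonical.encode("utf-8"),
                          hashlib.sha1).digest()
        b64_hmac = base64.b64encode(digest).decode("ascii")
        if urlencode:
            return urllib.parse.quote_plus(b64_hmac)
        return b64_hmac
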
@ -98,10 +98,10 @@ def merge_meta(headers, metadata):
def query_args_hash_to_string(query_args):
query_string = ""
pairs = []
for k, v in query_args.items():
for k, v in list(query_args.items()):
piece = k
if v != None:
piece += "=%s" % urllib.quote_plus(str(v))
piece += "=%s" % urllib.parse.quote_plus(str(v))
pairs.append(piece)
return '&'.join(pairs)
@ -251,7 +251,7 @@ class AWSAuthConnection:
# add the slash after the bucket regardless
# the key will be appended if it is non-empty
path += "/%s" % urllib.quote_plus(key)
path += "/%s" % urllib.parse.quote_plus(key)
# build the path_argument string
@ -264,9 +264,9 @@ class AWSAuthConnection:
host = "%s:%d" % (server, self.port)
while True:
if (is_secure):
connection = httplib.HTTPSConnection(host)
connection = http.client.HTTPSConnection(host)
else:
connection = httplib.HTTPConnection(host)
connection = http.client.HTTPConnection(host)
final_headers = merge_meta(headers, metadata);
# add auth header
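
httplib becomes http.client in Python 3; the connection setup above reduces to the pattern below (host and port are placeholders):

    import http.client

    host = "s3.amazonaws.com:443"   # placeholder host:port
    is_secure = True
    if is_secure:
        connection = http.client.HTTPSConnection(host)
    else:
        connection = http.client.HTTPConnection(host)
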
@ -283,7 +283,7 @@ class AWSAuthConnection:
# (close connection)
resp.read()
scheme, host, path, params, query, fragment \
= urlparse.urlparse(location)
= urllib.parse.urlparse(location)
if scheme == "http": is_secure = True
elif scheme == "https": is_secure = False
else: raise invalidURL("Not http/https: " + location)
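
The redirect handling relies on the urlparse module now living at urllib.parse; a quick standalone check (the URL is made up):

    import urllib.parse

    location = "https://bucket.s3.amazonaws.com/key?acl"   # hypothetical redirect target
    scheme, host, path, params, query, fragment = urllib.parse.urlparse(location)
    print(scheme, host, path)
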
@ -291,7 +291,7 @@ class AWSAuthConnection:
# retry with redirect
def _add_aws_auth_header(self, headers, method, bucket, key, query_args):
if not headers.has_key('Date'):
if 'Date' not in headers:
headers['Date'] = time.strftime("%a, %d %b %Y %X GMT", time.gmtime())
c_string = canonical_string(method, bucket, key, query_args, headers)
@ -400,7 +400,7 @@ class QueryStringAuthGenerator:
url = CallingFormat.build_url_base(self.protocol, self.server, self.port, bucket, self.calling_format)
url += "/%s" % urllib.quote_plus(key)
url += "/%s" % urllib.parse.quote_plus(key)
query_args['Signature'] = encoded_canonical
query_args['Expires'] = expires
@ -489,7 +489,7 @@ class GetResponse(Response):
def get_aws_metadata(self, headers):
metadata = {}
for hkey in headers.keys():
for hkey in list(headers.keys()):
if hkey.lower().startswith(METADATA_PREFIX):
metadata[hkey[len(METADATA_PREFIX):]] = headers[hkey]
del headers[hkey]
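
Here the list() around headers.keys() is required rather than cosmetic: the loop deletes keys while iterating, and mutating a dict during iteration over a live view raises RuntimeError in Python 3. A self-contained version of the pattern:

    METADATA_PREFIX = "x-amz-meta-"
    headers = {"x-amz-meta-owner": "alice", "content-type": "text/plain"}   # example headers
    metadata = {}
    for hkey in list(headers.keys()):   # snapshot the keys before deleting
        if hkey.lower().startswith(METADATA_PREFIX):
            metadata[hkey[len(METADATA_PREFIX):]] = headers[hkey]
            del headers[hkey]
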

617 utils/S3.py.bak Normal file
View file

@ -0,0 +1,617 @@
#!/usr/bin/env python
# This software code is made available "AS IS" without warranties of any
# kind. You may copy, display, modify and redistribute the software
# code either by itself or as incorporated into your code; provided that
# you do not remove any proprietary notices. Your use of this software
# code is at your own risk and you waive any claim against Amazon
# Digital Services, Inc. or its affiliates with respect to your use of
# this software code. (c) 2006-2007 Amazon Digital Services, Inc. or its
# affiliates.
import base64
import hmac
import httplib
import re
import sha
import sys
import time
import urllib
import urlparse
import xml.sax
DEFAULT_HOST = 's3.amazonaws.com'
PORTS_BY_SECURITY = { True: 443, False: 80 }
METADATA_PREFIX = 'x-amz-meta-'
AMAZON_HEADER_PREFIX = 'x-amz-'
# generates the aws canonical string for the given parameters
def canonical_string(method, bucket="", key="", query_args={}, headers={}, expires=None):
interesting_headers = {}
for header_key in headers:
lk = header_key.lower()
if lk in ['content-md5', 'content-type', 'date'] or lk.startswith(AMAZON_HEADER_PREFIX):
interesting_headers[lk] = headers[header_key].strip()
# these keys get empty strings if they don't exist
if not interesting_headers.has_key('content-type'):
interesting_headers['content-type'] = ''
if not interesting_headers.has_key('content-md5'):
interesting_headers['content-md5'] = ''
# just in case someone used this. it's not necessary in this lib.
if interesting_headers.has_key('x-amz-date'):
interesting_headers['date'] = ''
# if you're using expires for query string auth, then it trumps date
# (and x-amz-date)
if expires:
interesting_headers['date'] = str(expires)
sorted_header_keys = interesting_headers.keys()
sorted_header_keys.sort()
buf = "%s\n" % method
for header_key in sorted_header_keys:
if header_key.startswith(AMAZON_HEADER_PREFIX):
buf += "%s:%s\n" % (header_key, interesting_headers[header_key])
else:
buf += "%s\n" % interesting_headers[header_key]
# append the bucket if it exists
if bucket != "":
buf += "/%s" % bucket
# add the key. even if it doesn't exist, add the slash
buf += "/%s" % urllib.quote_plus(key)
# handle special query string arguments
if query_args.has_key("acl"):
buf += "?acl"
elif query_args.has_key("torrent"):
buf += "?torrent"
elif query_args.has_key("logging"):
buf += "?logging"
elif query_args.has_key("location"):
buf += "?location"
return buf
# computes the base64'ed hmac-sha hash of the canonical string and the secret
# access key, optionally urlencoding the result
def encode(aws_secret_access_key, str, urlencode=False):
b64_hmac = base64.encodestring(hmac.new(aws_secret_access_key, str, sha).digest()).strip()
if urlencode:
return urllib.quote_plus(b64_hmac)
else:
return b64_hmac
def merge_meta(headers, metadata):
final_headers = headers.copy()
for k in metadata.keys():
final_headers[METADATA_PREFIX + k] = metadata[k]
return final_headers
# builds the query arg string
def query_args_hash_to_string(query_args):
query_string = ""
pairs = []
for k, v in query_args.items():
piece = k
if v != None:
piece += "=%s" % urllib.quote_plus(str(v))
pairs.append(piece)
return '&'.join(pairs)
class CallingFormat:
PATH = 1
SUBDOMAIN = 2
VANITY = 3
def build_url_base(protocol, server, port, bucket, calling_format):
url_base = '%s://' % protocol
if bucket == '':
url_base += server
elif calling_format == CallingFormat.SUBDOMAIN:
url_base += "%s.%s" % (bucket, server)
elif calling_format == CallingFormat.VANITY:
url_base += bucket
else:
url_base += server
url_base += ":%s" % port
if (bucket != '') and (calling_format == CallingFormat.PATH):
url_base += "/%s" % bucket
return url_base
build_url_base = staticmethod(build_url_base)
class Location:
DEFAULT = None
EU = 'EU'
class AWSAuthConnection:
def __init__(self, aws_access_key_id, aws_secret_access_key, is_secure=True,
server=DEFAULT_HOST, port=None, calling_format=CallingFormat.SUBDOMAIN):
if not port:
port = PORTS_BY_SECURITY[is_secure]
self.aws_access_key_id = aws_access_key_id
self.aws_secret_access_key = aws_secret_access_key
self.is_secure = is_secure
self.server = server
self.port = port
self.calling_format = calling_format
def create_bucket(self, bucket, headers={}):
return Response(self._make_request('PUT', bucket, '', {}, headers))
def create_located_bucket(self, bucket, location=Location.DEFAULT, headers={}):
if location == Location.DEFAULT:
body = ""
else:
body = "<CreateBucketConstraint><LocationConstraint>" + \
location + \
"</LocationConstraint></CreateBucketConstraint>"
return Response(self._make_request('PUT', bucket, '', {}, headers, body))
def check_bucket_exists(self, bucket):
return self._make_request('HEAD', bucket, '', {}, {})
def list_bucket(self, bucket, options={}, headers={}):
return ListBucketResponse(self._make_request('GET', bucket, '', options, headers))
def delete_bucket(self, bucket, headers={}):
return Response(self._make_request('DELETE', bucket, '', {}, headers))
def put(self, bucket, key, object, headers={}):
if not isinstance(object, S3Object):
object = S3Object(object)
return Response(
self._make_request(
'PUT',
bucket,
key,
{},
headers,
object.data,
object.metadata))
def get(self, bucket, key, headers={}):
return GetResponse(
self._make_request('GET', bucket, key, {}, headers))
def delete(self, bucket, key, headers={}):
return Response(
self._make_request('DELETE', bucket, key, {}, headers))
def get_bucket_logging(self, bucket, headers={}):
return GetResponse(self._make_request('GET', bucket, '', { 'logging': None }, headers))
def put_bucket_logging(self, bucket, logging_xml_doc, headers={}):
return Response(self._make_request('PUT', bucket, '', { 'logging': None }, headers, logging_xml_doc))
def get_bucket_acl(self, bucket, headers={}):
return self.get_acl(bucket, '', headers)
def get_acl(self, bucket, key, headers={}):
return GetResponse(
self._make_request('GET', bucket, key, { 'acl': None }, headers))
def put_bucket_acl(self, bucket, acl_xml_document, headers={}):
return self.put_acl(bucket, '', acl_xml_document, headers)
def put_acl(self, bucket, key, acl_xml_document, headers={}):
return Response(
self._make_request(
'PUT',
bucket,
key,
{ 'acl': None },
headers,
acl_xml_document))
def list_all_my_buckets(self, headers={}):
return ListAllMyBucketsResponse(self._make_request('GET', '', '', {}, headers))
def get_bucket_location(self, bucket):
return LocationResponse(self._make_request('GET', bucket, '', {'location' : None}))
# end public methods
def _make_request(self, method, bucket='', key='', query_args={}, headers={}, data='', metadata={}):
server = ''
if bucket == '':
server = self.server
elif self.calling_format == CallingFormat.SUBDOMAIN:
server = "%s.%s" % (bucket, self.server)
elif self.calling_format == CallingFormat.VANITY:
server = bucket
else:
server = self.server
path = ''
if (bucket != '') and (self.calling_format == CallingFormat.PATH):
path += "/%s" % bucket
# add the slash after the bucket regardless
# the key will be appended if it is non-empty
path += "/%s" % urllib.quote_plus(key)
# build the path_argument string
# add the ? in all cases since
# signature and credentials follow path args
if len(query_args):
path += "?" + query_args_hash_to_string(query_args)
is_secure = self.is_secure
host = "%s:%d" % (server, self.port)
while True:
if (is_secure):
connection = httplib.HTTPSConnection(host)
else:
connection = httplib.HTTPConnection(host)
final_headers = merge_meta(headers, metadata);
# add auth header
self._add_aws_auth_header(final_headers, method, bucket, key, query_args)
connection.request(method, path, data, final_headers)
resp = connection.getresponse()
if resp.status < 300 or resp.status >= 400:
return resp
# handle redirect
location = resp.getheader('location')
if not location:
return resp
# (close connection)
resp.read()
scheme, host, path, params, query, fragment \
= urlparse.urlparse(location)
if scheme == "http": is_secure = True
elif scheme == "https": is_secure = False
else: raise invalidURL("Not http/https: " + location)
if query: path += "?" + query
# retry with redirect
def _add_aws_auth_header(self, headers, method, bucket, key, query_args):
if not headers.has_key('Date'):
headers['Date'] = time.strftime("%a, %d %b %Y %X GMT", time.gmtime())
c_string = canonical_string(method, bucket, key, query_args, headers)
headers['Authorization'] = \
"AWS %s:%s" % (self.aws_access_key_id, encode(self.aws_secret_access_key, c_string))
class QueryStringAuthGenerator:
# by default, expire in 1 minute
DEFAULT_EXPIRES_IN = 60
def __init__(self, aws_access_key_id, aws_secret_access_key, is_secure=True,
server=DEFAULT_HOST, port=None, calling_format=CallingFormat.SUBDOMAIN):
if not port:
port = PORTS_BY_SECURITY[is_secure]
self.aws_access_key_id = aws_access_key_id
self.aws_secret_access_key = aws_secret_access_key
if (is_secure):
self.protocol = 'https'
else:
self.protocol = 'http'
self.is_secure = is_secure
self.server = server
self.port = port
self.calling_format = calling_format
self.__expires_in = QueryStringAuthGenerator.DEFAULT_EXPIRES_IN
self.__expires = None
# for backwards compatibility with older versions
self.server_name = "%s:%s" % (self.server, self.port)
def set_expires_in(self, expires_in):
self.__expires_in = expires_in
self.__expires = None
def set_expires(self, expires):
self.__expires = expires
self.__expires_in = None
def create_bucket(self, bucket, headers={}):
return self.generate_url('PUT', bucket, '', {}, headers)
def list_bucket(self, bucket, options={}, headers={}):
return self.generate_url('GET', bucket, '', options, headers)
def delete_bucket(self, bucket, headers={}):
return self.generate_url('DELETE', bucket, '', {}, headers)
def put(self, bucket, key, object, headers={}):
if not isinstance(object, S3Object):
object = S3Object(object)
return self.generate_url(
'PUT',
bucket,
key,
{},
merge_meta(headers, object.metadata))
def get(self, bucket, key, headers={}):
return self.generate_url('GET', bucket, key, {}, headers)
def delete(self, bucket, key, headers={}):
return self.generate_url('DELETE', bucket, key, {}, headers)
def get_bucket_logging(self, bucket, headers={}):
return self.generate_url('GET', bucket, '', { 'logging': None }, headers)
def put_bucket_logging(self, bucket, logging_xml_doc, headers={}):
return self.generate_url('PUT', bucket, '', { 'logging': None }, headers)
def get_bucket_acl(self, bucket, headers={}):
return self.get_acl(bucket, '', headers)
def get_acl(self, bucket, key='', headers={}):
return self.generate_url('GET', bucket, key, { 'acl': None }, headers)
def put_bucket_acl(self, bucket, acl_xml_document, headers={}):
return self.put_acl(bucket, '', acl_xml_document, headers)
# don't really care what the doc is here.
def put_acl(self, bucket, key, acl_xml_document, headers={}):
return self.generate_url('PUT', bucket, key, { 'acl': None }, headers)
def list_all_my_buckets(self, headers={}):
return self.generate_url('GET', '', '', {}, headers)
def make_bare_url(self, bucket, key=''):
full_url = self.generate_url(self, bucket, key)
return full_url[:full_url.index('?')]
def generate_url(self, method, bucket='', key='', query_args={}, headers={}):
expires = 0
if self.__expires_in != None:
expires = int(time.time() + self.__expires_in)
elif self.__expires != None:
expires = int(self.__expires)
else:
raise "Invalid expires state"
canonical_str = canonical_string(method, bucket, key, query_args, headers, expires)
encoded_canonical = encode(self.aws_secret_access_key, canonical_str)
url = CallingFormat.build_url_base(self.protocol, self.server, self.port, bucket, self.calling_format)
url += "/%s" % urllib.quote_plus(key)
query_args['Signature'] = encoded_canonical
query_args['Expires'] = expires
query_args['AWSAccessKeyId'] = self.aws_access_key_id
url += "?%s" % query_args_hash_to_string(query_args)
return url
class S3Object:
def __init__(self, data, metadata={}):
self.data = data
self.metadata = metadata
class Owner:
def __init__(self, id='', display_name=''):
self.id = id
self.display_name = display_name
class ListEntry:
def __init__(self, key='', last_modified=None, etag='', size=0, storage_class='', owner=None):
self.key = key
self.last_modified = last_modified
self.etag = etag
self.size = size
self.storage_class = storage_class
self.owner = owner
class CommonPrefixEntry:
def __init__(self, prefix=''):
self.prefix = prefix
class Bucket:
def __init__(self, name='', creation_date=''):
self.name = name
self.creation_date = creation_date
class Response:
def __init__(self, http_response):
self.http_response = http_response
# you have to do this read, even if you don't expect a body.
# otherwise, the next request fails.
self.body = http_response.read()
if http_response.status >= 300 and self.body:
self.message = self.body
else:
self.message = "%03d %s" % (http_response.status, http_response.reason)
class ListBucketResponse(Response):
def __init__(self, http_response):
Response.__init__(self, http_response)
if http_response.status < 300:
handler = ListBucketHandler()
xml.sax.parseString(self.body, handler)
self.entries = handler.entries
self.common_prefixes = handler.common_prefixes
self.name = handler.name
self.marker = handler.marker
self.prefix = handler.prefix
self.is_truncated = handler.is_truncated
self.delimiter = handler.delimiter
self.max_keys = handler.max_keys
self.next_marker = handler.next_marker
else:
self.entries = []
class ListAllMyBucketsResponse(Response):
def __init__(self, http_response):
Response.__init__(self, http_response)
if http_response.status < 300:
handler = ListAllMyBucketsHandler()
xml.sax.parseString(self.body, handler)
self.entries = handler.entries
else:
self.entries = []
class GetResponse(Response):
def __init__(self, http_response):
Response.__init__(self, http_response)
response_headers = http_response.msg # older pythons don't have getheaders
metadata = self.get_aws_metadata(response_headers)
self.object = S3Object(self.body, metadata)
def get_aws_metadata(self, headers):
metadata = {}
for hkey in headers.keys():
if hkey.lower().startswith(METADATA_PREFIX):
metadata[hkey[len(METADATA_PREFIX):]] = headers[hkey]
del headers[hkey]
return metadata
class LocationResponse(Response):
def __init__(self, http_response):
Response.__init__(self, http_response)
if http_response.status < 300:
handler = LocationHandler()
xml.sax.parseString(self.body, handler)
self.location = handler.location
class ListBucketHandler(xml.sax.ContentHandler):
def __init__(self):
self.entries = []
self.curr_entry = None
self.curr_text = ''
self.common_prefixes = []
self.curr_common_prefix = None
self.name = ''
self.marker = ''
self.prefix = ''
self.is_truncated = False
self.delimiter = ''
self.max_keys = 0
self.next_marker = ''
self.is_echoed_prefix_set = False
def startElement(self, name, attrs):
if name == 'Contents':
self.curr_entry = ListEntry()
elif name == 'Owner':
self.curr_entry.owner = Owner()
elif name == 'CommonPrefixes':
self.curr_common_prefix = CommonPrefixEntry()
def endElement(self, name):
if name == 'Contents':
self.entries.append(self.curr_entry)
elif name == 'CommonPrefixes':
self.common_prefixes.append(self.curr_common_prefix)
elif name == 'Key':
self.curr_entry.key = self.curr_text
elif name == 'LastModified':
self.curr_entry.last_modified = self.curr_text
elif name == 'ETag':
self.curr_entry.etag = self.curr_text
elif name == 'Size':
self.curr_entry.size = int(self.curr_text)
elif name == 'ID':
self.curr_entry.owner.id = self.curr_text
elif name == 'DisplayName':
self.curr_entry.owner.display_name = self.curr_text
elif name == 'StorageClass':
self.curr_entry.storage_class = self.curr_text
elif name == 'Name':
self.name = self.curr_text
elif name == 'Prefix' and self.is_echoed_prefix_set:
self.curr_common_prefix.prefix = self.curr_text
elif name == 'Prefix':
self.prefix = self.curr_text
self.is_echoed_prefix_set = True
elif name == 'Marker':
self.marker = self.curr_text
elif name == 'IsTruncated':
self.is_truncated = self.curr_text == 'true'
elif name == 'Delimiter':
self.delimiter = self.curr_text
elif name == 'MaxKeys':
self.max_keys = int(self.curr_text)
elif name == 'NextMarker':
self.next_marker = self.curr_text
self.curr_text = ''
def characters(self, content):
self.curr_text += content
class ListAllMyBucketsHandler(xml.sax.ContentHandler):
def __init__(self):
self.entries = []
self.curr_entry = None
self.curr_text = ''
def startElement(self, name, attrs):
if name == 'Bucket':
self.curr_entry = Bucket()
def endElement(self, name):
if name == 'Name':
self.curr_entry.name = self.curr_text
elif name == 'CreationDate':
self.curr_entry.creation_date = self.curr_text
elif name == 'Bucket':
self.entries.append(self.curr_entry)
def characters(self, content):
self.curr_text = content
class LocationHandler(xml.sax.ContentHandler):
def __init__(self):
self.location = None
self.state = 'init'
def startElement(self, name, attrs):
if self.state == 'init':
if name == 'LocationConstraint':
self.state = 'tag_location'
self.location = ''
else: self.state = 'bad'
else: self.state = 'bad'
def endElement(self, name):
if self.state == 'tag_location' and name == 'LocationConstraint':
self.state = 'done'
else: self.state = 'bad'
def characters(self, content):
if self.state == 'tag_location':
self.location += content
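
A hypothetical usage sketch of the client preserved above (bucket, key and credentials are made up; under Python 3 the converted module, not this .bak, would be imported):

    conn = AWSAuthConnection("AKIAEXAMPLE", "example-secret-key")
    resp = conn.put("example-bucket", "hello.txt", S3Object("hello world"))
    print(resp.message)
    listing = conn.list_bucket("example-bucket")
    for entry in listing.entries:
        print(entry.key, entry.size)
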

View file

@ -6,11 +6,11 @@ from pprint import pprint
class ConsoleExceptionMiddleware:
def process_exception(self, request, exception):
exc_info = sys.exc_info()
print "######################## Exception #############################"
print '\n'.join(traceback.format_exception(*(exc_info or sys.exc_info())))
print "----------------------------------------------------------------"
print("######################## Exception #############################")
print(('\n'.join(traceback.format_exception(*(exc_info or sys.exc_info())))))
print("----------------------------------------------------------------")
pprint(inspect.trace()[-1][0].f_locals)
print "################################################################"
print("################################################################")
#pprint(request)
#print "################################################################"

View file

@ -0,0 +1,16 @@
import traceback
import sys
import inspect
from pprint import pprint
class ConsoleExceptionMiddleware:
def process_exception(self, request, exception):
exc_info = sys.exc_info()
print("######################## Exception #############################")
print('\n'.join(traceback.format_exception(*(exc_info or sys.exc_info()))))
print("----------------------------------------------------------------")
pprint(inspect.trace()[-1][0].f_locals)
print("################################################################")
#pprint(request)
#print "################################################################"

View file

@ -86,13 +86,13 @@ class FacebookFetcher:
social_services = MSocialServices.get_user(self.options.get('requesting_user_id'))
facebook_api = social_services.facebook_api()
if not facebook_api:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
(self.feed.log_title[:30], self.feed.feed_address, self.options))
return
else:
usersubs = UserSubscription.objects.filter(feed=self.feed)
if not usersubs:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' %
logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' %
(self.feed.log_title[:30], self.feed.feed_address))
return
@ -108,7 +108,7 @@ class FacebookFetcher:
break
if not facebook_api:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
(self.feed.log_title[:30], self.feed.feed_address, usersubs[0].user.username))
return
@ -117,10 +117,10 @@ class FacebookFetcher:
def fetch_page_feed(self, facebook_user, page, fields):
try:
stories = facebook_user.get_object(page, fields=fields)
except GraphAPIError, e:
except GraphAPIError as e:
message = str(e).lower()
if 'session has expired' in message:
logging.debug(u' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' %
logging.debug(' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' %
(self.feed.log_title[:30], self.feed.feed_address, e))
self.feed.save_feed_history(560, "Facebook Error: Expired token")
return {}
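
The except GraphAPIError, e form is Python 2 only; Python 3 requires the as keyword. The pattern in isolation, with a built-in exception so it runs anywhere:

    try:
        int("not a number")
    except ValueError as e:   # Python 2 also accepted: except ValueError, e:
        message = str(e).lower()
        print(message)
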
@ -137,7 +137,7 @@ class FacebookFetcher:
return
message = linebreaks(page_story['message'])
created_date = page_story['created_time']
if isinstance(created_date, unicode):
if isinstance(created_date, str):
created_date = dateutil.parser.parse(created_date)
fields = facebook_user.get_object(page_story['id'], fields='permalink_url,link,attachments')
permalink = fields.get('link', fields['permalink_url'])
@ -175,7 +175,7 @@ class FacebookFetcher:
return
message = linebreaks(page_story['description'])
created_date = page_story['updated_time']
if isinstance(created_date, unicode):
if isinstance(created_date, str):
created_date = dateutil.parser.parse(created_date)
permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url']
embed_html = facebook_user.get_object(page_story['id'], fields='embed_html')
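
unicode does not exist in Python 3, so the timestamp check becomes isinstance(created_date, str); a standalone sketch (the timestamp value is made up, and dateutil is already a dependency of this fetcher):

    import dateutil.parser

    created_date = "2020-06-13T13:13:20+0000"   # hypothetical Graph API created_time
    # Python 2: isinstance(created_date, unicode); Python 3: all text is str.
    if isinstance(created_date, str):
        created_date = dateutil.parser.parse(created_date)
    print(created_date.isoformat())
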
@ -206,16 +206,16 @@ class FacebookFetcher:
page_name = self.extract_page_name()
facebook_user = self.facebook_user()
if not facebook_user:
logging.debug(u' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' %
logging.debug(' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' %
(self.feed.log_title[:30], self.feed.feed_address))
return
try:
picture_data = facebook_user.get_object(page_name, fields='picture')
except GraphAPIError, e:
except GraphAPIError as e:
message = str(e).lower()
if 'session has expired' in message:
logging.debug(u' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' %
logging.debug(' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' %
(self.feed.log_title[:30], self.feed.feed_address, e))
return

View file

@ -0,0 +1,224 @@
import re
import datetime
import dateutil.parser
from django.conf import settings
from django.utils import feedgenerator
from django.utils.html import linebreaks
from apps.social.models import MSocialServices
from apps.reader.models import UserSubscription
from utils import log as logging
from vendor.facebook import GraphAPIError
class FacebookFetcher:
def __init__(self, feed, options=None):
self.feed = feed
self.options = options or {}
def fetch(self):
page_name = self.extract_page_name()
if not page_name:
return
facebook_user = self.facebook_user()
if not facebook_user:
return
# If 'video', use video API to get embed:
# f.get_object('tastyvegetarian', fields='posts')
# f.get_object('1992797300790726', fields='embed_html')
feed = self.fetch_page_feed(facebook_user, page_name, 'name,about,posts,videos,photos')
data = {}
data['title'] = feed.get('name', "%s on Facebook" % page_name)
data['link'] = feed.get('link', "https://facebook.com/%s" % page_name)
data['description'] = feed.get('about', "%s on Facebook" % page_name)
data['lastBuildDate'] = datetime.datetime.utcnow()
data['generator'] = 'NewsBlur Facebook API Decrapifier - %s' % settings.NEWSBLUR_URL
data['docs'] = None
data['feed_url'] = self.feed.feed_address
rss = feedgenerator.Atom1Feed(**data)
merged_data = []
posts = feed.get('posts', {}).get('data', None)
if posts:
for post in posts:
story_data = self.page_posts_story(facebook_user, post)
if not story_data:
continue
merged_data.append(story_data)
videos = feed.get('videos', {}).get('data', None)
if videos:
for video in videos:
story_data = self.page_video_story(facebook_user, video)
if not story_data:
continue
for seen_data in merged_data:
if story_data['link'] == seen_data['link']:
# Video wins over posts (and attachments)
seen_data['description'] = story_data['description']
seen_data['title'] = story_data['title']
break
for story_data in merged_data:
rss.add_item(**story_data)
return rss.writeString('utf-8')
def extract_page_name(self):
page = None
try:
page_groups = re.search('facebook.com/(\w+)/?', self.feed.feed_address)
if not page_groups:
return
page = page_groups.group(1)
except IndexError:
return
return page
def facebook_user(self):
facebook_api = None
social_services = None
if self.options.get('requesting_user_id', None):
social_services = MSocialServices.get_user(self.options.get('requesting_user_id'))
facebook_api = social_services.facebook_api()
if not facebook_api:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
(self.feed.log_title[:30], self.feed.feed_address, self.options))
return
else:
usersubs = UserSubscription.objects.filter(feed=self.feed)
if not usersubs:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No subscriptions' %
(self.feed.log_title[:30], self.feed.feed_address))
return
for sub in usersubs:
social_services = MSocialServices.get_user(sub.user_id)
if not social_services.facebook_uid:
continue
facebook_api = social_services.facebook_api()
if not facebook_api:
continue
else:
break
if not facebook_api:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s: No facebook API for %s' %
(self.feed.log_title[:30], self.feed.feed_address, usersubs[0].user.username))
return
return facebook_api
def fetch_page_feed(self, facebook_user, page, fields):
try:
stories = facebook_user.get_object(page, fields=fields)
except GraphAPIError, e:
message = str(e).lower()
if 'session has expired' in message:
logging.debug(u' ***> [%-30s] ~FRFacebook page failed/expired, disconnecting facebook: %s: %s' %
(self.feed.log_title[:30], self.feed.feed_address, e))
self.feed.save_feed_history(560, "Facebook Error: Expired token")
return {}
if not stories:
return {}
return stories
def page_posts_story(self, facebook_user, page_story):
categories = set()
if 'message' not in page_story:
# Probably a story shared on the page's timeline, not a published story
return
message = linebreaks(page_story['message'])
created_date = page_story['created_time']
if isinstance(created_date, unicode):
created_date = dateutil.parser.parse(created_date)
fields = facebook_user.get_object(page_story['id'], fields='permalink_url,link,attachments')
permalink = fields.get('link', fields['permalink_url'])
attachments_html = ""
if fields.get('attachments', None) and fields['attachments']['data']:
for attachment in fields['attachments']['data']:
if 'media' in attachment:
attachments_html += "<img src=\"%s\" />" % attachment['media']['image']['src']
if attachment.get('subattachments', None):
for subattachment in attachment['subattachments']['data']:
attachments_html += "<img src=\"%s\" />" % subattachment['media']['image']['src']
content = """<div class="NB-facebook-rss">
<div class="NB-facebook-rss-message">%s</div>
<div class="NB-facebook-rss-picture">%s</div>
</div>""" % (
message,
attachments_html
)
story = {
'title': message,
'link': permalink,
'description': content,
'categories': list(categories),
'unique_id': "fb_post:%s" % page_story['id'],
'pubdate': created_date,
}
return story
def page_video_story(self, facebook_user, page_story):
categories = set()
if 'description' not in page_story:
return
message = linebreaks(page_story['description'])
created_date = page_story['updated_time']
if isinstance(created_date, unicode):
created_date = dateutil.parser.parse(created_date)
permalink = facebook_user.get_object(page_story['id'], fields='permalink_url')['permalink_url']
embed_html = facebook_user.get_object(page_story['id'], fields='embed_html')
if permalink.startswith('/'):
permalink = "https://www.facebook.com%s" % permalink
content = """<div class="NB-facebook-rss">
<div class="NB-facebook-rss-message">%s</div>
<div class="NB-facebook-rss-embed">%s</div>
</div>""" % (
message,
embed_html.get('embed_html', '')
)
story = {
'title': page_story.get('story', message),
'link': permalink,
'description': content,
'categories': list(categories),
'unique_id': "fb_post:%s" % page_story['id'],
'pubdate': created_date,
}
return story
def favicon_url(self):
page_name = self.extract_page_name()
facebook_user = self.facebook_user()
if not facebook_user:
logging.debug(u' ***> [%-30s] ~FRFacebook icon failed, disconnecting facebook: %s' %
(self.feed.log_title[:30], self.feed.feed_address))
return
try:
picture_data = facebook_user.get_object(page_name, fields='picture')
except GraphAPIError, e:
message = str(e).lower()
if 'session has expired' in message:
logging.debug(u' ***> [%-30s] ~FRFacebook icon failed/expired, disconnecting facebook: %s: %s' %
(self.feed.log_title[:30], self.feed.feed_address, e))
return
if 'picture' in picture_data:
return picture_data['picture']['data']['url']

View file

@ -2,7 +2,7 @@ import time
import datetime
import traceback
import multiprocessing
import urllib2
import urllib.request, urllib.error, urllib.parse
import xml.sax
import redis
import random
@ -11,7 +11,7 @@ import re
import requests
import dateutil.parser
import isodate
import urlparse
import urllib.parse
from django.conf import settings
from django.db import IntegrityError
from django.core.cache import cache
@ -42,7 +42,7 @@ from utils.json_fetcher import JSONFetcher
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = list(range(5))
class FetchFeed:
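
The list(range(5)) wrapper above is the range fixer being cautious; since the result is only tuple-unpacked, plain range() would also work in Python 3 (a sketch, not what 2to3 emits):

    # Tuple unpacking works on a lazy range object just as well as on a list.
    FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
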
@ -59,7 +59,7 @@ class FetchFeed:
"""
start = time.time()
identity = self.get_identity()
log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
log_msg = '%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
self.feed.log_title[:30],
self.feed.id,
datetime.datetime.now() - self.feed.last_update)
@ -74,19 +74,19 @@ class FetchFeed:
modified = None
etag = None
address = qurl(address, add={"_": random.randint(0, 10000)})
logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
logging.debug(' ---> [%-30s] ~FBForcing fetch: %s' % (
self.feed.log_title[:30], address))
elif (not self.feed.fetched_once or not self.feed.known_good):
modified = None
etag = None
if self.options.get('feed_xml'):
logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
logging.debug(' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
self.feed.log_title[:30], len(self.options.get('feed_xml'))))
if self.options.get('fpf'):
self.fpf = self.options.get('fpf')
logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
logging.debug(' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
self.feed.log_title[:30]))
return FEED_OK, self.fpf
@ -96,21 +96,21 @@ class FetchFeed:
except (requests.adapters.ConnectionError):
youtube_feed = None
if not youtube_feed:
logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
logging.debug(' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(youtube_feed)
elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])):
twitter_feed = self.fetch_twitter(address)
if not twitter_feed:
logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' %
logging.debug(' ***> [%-30s] ~FRTwitter fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(twitter_feed)
elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
facebook_feed = self.fetch_facebook()
if not facebook_feed:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' %
logging.debug(' ***> [%-30s] ~FRFacebook fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(facebook_feed)
@ -140,7 +140,7 @@ class FetchFeed:
# JSON Feed
json_feed = self.fetch_json_feed(address, raw_feed)
if not json_feed:
logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
logging.debug(' ***> [%-30s] ~FRJSON fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(json_feed)
@ -152,8 +152,8 @@ class FetchFeed:
response_headers=response_headers)
if self.options.get('debug', False):
logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
except Exception, e:
logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))
except Exception as e:
logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], str(e)[:100]))
if not self.fpf or self.options.get('force_fp', False):
try:
@ -161,22 +161,22 @@ class FetchFeed:
agent=self.feed.user_agent,
etag=etag,
modified=modified)
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' %
except (TypeError, ValueError, KeyError, EOFError, MemoryError) as e:
logging.debug(' ***> [%-30s] ~FRFeed fetch error: %s' %
(self.feed.log_title[:30], e))
pass
if not self.fpf:
try:
logging.debug(u' ***> [%-30s] ~FRTurning off headers...' %
logging.debug(' ***> [%-30s] ~FRTurning off headers...' %
(self.feed.log_title[:30]))
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' %
except (TypeError, ValueError, KeyError, EOFError, MemoryError) as e:
logging.debug(' ***> [%-30s] ~FRFetch failed: %s.' %
(self.feed.log_title[:30], e))
return FEED_ERRHTTP, None
logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
logging.debug(' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
self.feed.log_title[:30], time.time() - start))
return FEED_OK, self.fpf
@ -217,22 +217,22 @@ class FetchFeed:
return
elif 'youtube.com/feeds/videos.xml?user=' in address:
try:
username = urlparse.parse_qs(urlparse.urlparse(address).query)['user'][0]
username = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['user'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
try:
channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0]
channel_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['channel_id'][0]
except (IndexError, KeyError):
return
elif 'youtube.com/playlist' in address:
try:
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0]
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['list'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
try:
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0]
list_id = urllib.parse.parse_qs(urllib.parse.urlparse(address).query)['playlist_id'][0]
except IndexError:
return
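
The YouTube address parsing above now goes through urllib.parse; in isolation (the feed address is a made-up example):

    import urllib.parse

    address = "https://www.youtube.com/feeds/videos.xml?channel_id=UC0000000000"   # hypothetical
    query = urllib.parse.urlparse(address).query
    channel_id = urllib.parse.parse_qs(query)["channel_id"][0]
    print(channel_id)
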
@ -365,7 +365,7 @@ class ProcessFeed:
if hasattr(self.fpf, 'status'):
if self.options['verbose']:
if self.fpf.bozo and self.fpf.status != 304:
logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
logging.debug(' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
self.feed.log_title[:30],
self.fpf.bozo_exception,
len(self.fpf.entries)))
@ -452,7 +452,7 @@ class ProcessFeed:
if hasattr(self.fpf, 'modified') and self.fpf.modified:
try:
self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
except Exception, e:
except Exception as e:
self.feed.last_modified = None
logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
pass
@ -510,16 +510,16 @@ class ProcessFeed:
start_date = story.get('published')
if replace_guids:
if replace_permalinks:
new_story_guid = unicode(story.get('published'))
new_story_guid = str(story.get('published'))
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
logging.debug(' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
self.feed.log_title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
else:
new_story_guid = Feed.get_permalink(story)
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
logging.debug(' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
self.feed.log_title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
@ -532,7 +532,7 @@ class ProcessFeed:
story_hashes.extend(story_hashes_in_unread_cutoff)
story_hashes = list(set(story_hashes))
if self.options['verbose'] or settings.DEBUG:
logging.debug(u' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (
logging.debug(' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (
self.feed.log_title[:30],
original_story_hash_count, len(story_hashes)-original_story_hash_count,
len(story_hashes_in_unread_cutoff)))
@ -572,17 +572,17 @@ class ProcessFeed:
if (hub_url and self_url and not settings.DEBUG and
self.feed.active_subscribers > 0 and
(push_expired or not self.feed.is_push or self.options.get('force'))):
logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
logging.debug(' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
self.feed.log_title[:30],
"~SKRe-~SN" if push_expired else "", hub_url))
try:
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
except TimeoutError:
logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
logging.debug(' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
self.feed.log_title[:30], hub_url))
elif (self.feed.is_push and
(self.feed.active_subscribers <= 0 or not hub_url)):
logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
logging.debug(' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
self.feed.log_title[:30]))
self.feed.is_push = False
self.feed = self.feed.save()
@ -592,7 +592,7 @@ class ProcessFeed:
QueueNotifications.delay(self.feed.pk, ret_values['new'])
# All Done
logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
logging.debug(' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
self.feed.log_title[:30],
'~FG~SB' if ret_values['new'] else '', ret_values['new'],
'~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
@ -610,7 +610,7 @@ class ProcessFeed:
self.feed.save_feed_history(200, "OK", date=fetch_date)
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
logging.debug(' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
self.feed.log_title[:30], time.time() - start))
return FEED_OK, ret_values
@ -717,28 +717,28 @@ class Dispatcher:
except TimeoutError:
logging.debug(' ---> [%-30s] Unread count took too long...' % (feed.log_title[:30],))
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
logging.debug(' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
feed.log_title[:30], time.time() - start))
except urllib2.HTTPError, e:
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read()))
except urllib.error.HTTPError as e:
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (str(feed_id)[:30], e.fp.read()))
feed_code = e.code
feed.save_feed_history(feed_code, e.msg, e.fp.read())
fetched_feed = None
except Feed.DoesNotExist, e:
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30]))
except Feed.DoesNotExist as e:
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (str(feed_id)[:30]))
continue
except SoftTimeLimitExceeded, e:
except SoftTimeLimitExceeded as e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
ret_feed = FEED_ERREXC
fetched_feed = None
feed_code = 559
feed.save_feed_history(feed_code, 'Timeout', e)
except TimeoutError, e:
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30]))
feed_code = 505
feed.save_feed_history(feed_code, 'Timeout', e)
fetched_feed = None
except Exception, e:
except Exception as e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
@@ -778,20 +778,20 @@ class Dispatcher:
(ret_feed == FEED_OK or
(ret_feed == FEED_SAME and feed.stories_last_month > 10)))):
logging.debug(u' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
logging.debug(' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
page_importer = PageImporter(feed)
try:
page_data = page_importer.fetch_page()
page_duration = time.time() - start_duration
except SoftTimeLimitExceeded, e:
except SoftTimeLimitExceeded as e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
page_data = None
feed.save_feed_history(557, 'Timeout', e)
except TimeoutError, e:
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRPage fetch timed out...' % (feed.log_title[:30]))
page_data = None
feed.save_page_history(555, 'Timeout', '')
except Exception, e:
except Exception as e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
@@ -805,7 +805,7 @@ class Dispatcher:
settings.RAVEN_CLIENT.captureException()
feed = self.refresh_feed(feed.pk)
logging.debug(u' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
logging.debug(' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
force = self.options['force']
if random.random() > .99:
force = True
@@ -813,13 +813,13 @@ class Dispatcher:
try:
icon_importer.save()
icon_duration = time.time() - start_duration
except SoftTimeLimitExceeded, e:
except SoftTimeLimitExceeded as e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
feed.save_feed_history(558, 'Timeout', e)
except TimeoutError, e:
except TimeoutError as e:
logging.debug(' ---> [%-30s] ~FRIcon fetch timed out...' % (feed.log_title[:30]))
feed.save_page_history(556, 'Timeout', '')
except Exception, e:
except Exception as e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
@@ -830,7 +830,7 @@ class Dispatcher:
settings.RAVEN_CLIENT):
settings.RAVEN_CLIENT.captureException()
else:
logging.debug(u' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]'))
logging.debug(' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]'))
feed = self.refresh_feed(feed.pk)
delta = time.time() - start_time
@@ -845,7 +845,7 @@ class Dispatcher:
if ret_entries and ret_entries['new']:
self.publish_to_subscribers(feed, ret_entries['new'])
done_msg = (u'%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
done_msg = ('%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
identity, feed.log_title[:30], delta,
feed.pk, self.feed_trans[ret_feed],))
logging.debug(done_msg)
@@ -899,14 +899,14 @@ class Dispatcher:
.read_preference(pymongo.ReadPreference.PRIMARY)
missing_stories = Feed.format_stories(missing_stories, feed.pk)
stories = missing_stories + stories
logging.debug(u' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
logging.debug(' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
cache.set("S:%s" % feed.pk, stories, 60)
logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
logging.debug(' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
feed.log_title[:30], len(stories), user_subs.count(),
feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
self.calculate_feed_scores_with_stories(user_subs, stories)
elif self.options.get('mongodb_replication_lag'):
logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
logging.debug(' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
feed.log_title[:30], self.options.get('mongodb_replication_lag')))
@timelimit(10)
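Aside: the hunks above repeatedly apply the two most common 2to3 rewrites in this file, `except X, e:` to `except X as e:` and `unicode()` to `str()`, plus the `urllib2` to `urllib.error` move. A minimal, self-contained sketch of the resulting Python 3 pattern (the `fetch_or_log` helper and example URL are illustrative only, not NewsBlur code):

import urllib.request
import urllib.error

def fetch_or_log(url):
    # Python 3 exception syntax; Python 2 wrote "except urllib2.HTTPError, e:"
    try:
        return urllib.request.urlopen(url, timeout=10).read()
    except urllib.error.HTTPError as e:
        # str() replaces the removed unicode() builtin when truncating for log lines
        print(' ---> [%-30s] HTTP error: %s' % (str(url)[:30], e.code))
    except urllib.error.URLError as e:
        print(' ---> [%-30s] Fetch failed: %s' % (str(url)[:30], e.reason))
    return None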

933
utils/feed_fetcher.py.bak Normal file
View file

@@ -0,0 +1,933 @@
import time
import datetime
import traceback
import multiprocessing
import urllib2
import xml.sax
import redis
import random
import pymongo
import re
import requests
import dateutil.parser
import isodate
import urlparse
from django.conf import settings
from django.db import IntegrityError
from django.core.cache import cache
from apps.reader.models import UserSubscription
from apps.rss_feeds.models import Feed, MStory
from apps.rss_feeds.page_importer import PageImporter
from apps.rss_feeds.icon_importer import IconImporter
from apps.notifications.tasks import QueueNotifications, MUserFeedNotification
from apps.push.models import PushSubscription
from apps.statistics.models import MAnalyticsFetcher, MStatistics
from utils import feedparser
from utils.story_functions import pre_process_story, strip_tags, linkify
from utils import log as logging
from utils.feed_functions import timelimit, TimeoutError
from qurl import qurl
from BeautifulSoup import BeautifulSoup
from django.utils import feedgenerator
from django.utils.html import linebreaks
from django.utils.encoding import smart_unicode
from utils import json_functions as json
from celery.exceptions import SoftTimeLimitExceeded
from utils.twitter_fetcher import TwitterFetcher
from utils.facebook_fetcher import FacebookFetcher
from utils.json_fetcher import JSONFetcher
# from utils.feed_functions import mail_feed_error_to_admin
# Refresh feed code adapted from Feedjack.
# http://feedjack.googlecode.com
FEED_OK, FEED_SAME, FEED_ERRPARSE, FEED_ERRHTTP, FEED_ERREXC = range(5)
class FetchFeed:
def __init__(self, feed_id, options):
self.feed = Feed.get_by_id(feed_id)
self.options = options
self.fpf = None
self.raw_feed = None
@timelimit(30)
def fetch(self):
"""
Uses requests to download the feed, parsing it in feedparser. Will be storified later.
"""
start = time.time()
identity = self.get_identity()
log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
self.feed.log_title[:30],
self.feed.id,
datetime.datetime.now() - self.feed.last_update)
logging.debug(log_msg)
etag = self.feed.etag
modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
address = self.feed.feed_address
if (self.options.get('force') or random.random() <= .01):
self.options['force'] = True
modified = None
etag = None
address = qurl(address, add={"_": random.randint(0, 10000)})
logging.debug(u' ---> [%-30s] ~FBForcing fetch: %s' % (
self.feed.log_title[:30], address))
elif (not self.feed.fetched_once or not self.feed.known_good):
modified = None
etag = None
if self.options.get('feed_xml'):
logging.debug(u' ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
self.feed.log_title[:30], len(self.options.get('feed_xml'))))
if self.options.get('fpf'):
self.fpf = self.options.get('fpf')
logging.debug(u' ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
self.feed.log_title[:30]))
return FEED_OK, self.fpf
if 'youtube.com' in address:
try:
youtube_feed = self.fetch_youtube(address)
except (requests.adapters.ConnectionError):
youtube_feed = None
if not youtube_feed:
logging.debug(u' ***> [%-30s] ~FRYouTube fetch failed: %s.' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(youtube_feed)
elif re.match(r'(https?)?://twitter.com/\w+/?', qurl(address, remove=['_'])):
twitter_feed = self.fetch_twitter(address)
if not twitter_feed:
logging.debug(u' ***> [%-30s] ~FRTwitter fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(twitter_feed)
elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
facebook_feed = self.fetch_facebook()
if not facebook_feed:
logging.debug(u' ***> [%-30s] ~FRFacebook fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(facebook_feed)
if not self.fpf:
try:
headers = self.feed.fetch_headers()
if etag:
headers['If-None-Match'] = etag
if modified:
# format into an RFC 1123-compliant timestamp. We can't use
# time.strftime() since the %a and %b directives can be affected
# by the current locale, but RFC 2616 states that dates must be
# in English.
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
headers['If-Modified-Since'] = modified_header
if etag or modified:
headers['A-IM'] = 'feed'
raw_feed = requests.get(address, headers=headers)
if raw_feed.status_code >= 400:
logging.debug(" ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
raw_feed = requests.get(self.feed.feed_address, headers=self.feed.fetch_headers(fake=True))
if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
# JSON Feed
json_feed = self.fetch_json_feed(address, raw_feed)
if not json_feed:
logging.debug(u' ***> [%-30s] ~FRJSON fetch failed: %s' %
(self.feed.log_title[:30], address))
return FEED_ERRHTTP, None
self.fpf = feedparser.parse(json_feed)
elif raw_feed.content and raw_feed.status_code < 400:
response_headers = raw_feed.headers
response_headers['Content-Location'] = raw_feed.url
self.raw_feed = smart_unicode(raw_feed.content)
self.fpf = feedparser.parse(self.raw_feed,
response_headers=response_headers)
if self.options.get('debug', False):
logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
except Exception, e:
logging.debug(" ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))
if not self.fpf or self.options.get('force_fp', False):
try:
self.fpf = feedparser.parse(address,
agent=self.feed.user_agent,
etag=etag,
modified=modified)
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
logging.debug(u' ***> [%-30s] ~FRFeed fetch error: %s' %
(self.feed.log_title[:30], e))
pass
if not self.fpf:
try:
logging.debug(u' ***> [%-30s] ~FRTurning off headers...' %
(self.feed.log_title[:30]))
self.fpf = feedparser.parse(address, agent=self.feed.user_agent)
except (TypeError, ValueError, KeyError, EOFError, MemoryError), e:
logging.debug(u' ***> [%-30s] ~FRFetch failed: %s.' %
(self.feed.log_title[:30], e))
return FEED_ERRHTTP, None
logging.debug(u' ---> [%-30s] ~FYFeed fetch in ~FM%.4ss' % (
self.feed.log_title[:30], time.time() - start))
return FEED_OK, self.fpf
def get_identity(self):
identity = "X"
current_process = multiprocessing.current_process()
if current_process._identity:
identity = current_process._identity[0]
return identity
def fetch_twitter(self, address=None):
twitter_fetcher = TwitterFetcher(self.feed, self.options)
return twitter_fetcher.fetch(address)
def fetch_facebook(self):
facebook_fetcher = FacebookFetcher(self.feed, self.options)
return facebook_fetcher.fetch()
def fetch_json_feed(self, address, headers):
json_fetcher = JSONFetcher(self.feed, self.options)
return json_fetcher.fetch(address, headers)
def fetch_youtube(self, address):
username = None
channel_id = None
list_id = None
if 'gdata.youtube.com' in address:
try:
username_groups = re.search('gdata.youtube.com/feeds/\w+/users/(\w+)/', address)
if not username_groups:
return
username = username_groups.group(1)
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?user=' in address:
try:
username = urlparse.parse_qs(urlparse.urlparse(address).query)['user'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?channel_id=' in address:
try:
channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0]
except (IndexError, KeyError):
return
elif 'youtube.com/playlist' in address:
try:
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0]
except IndexError:
return
elif 'youtube.com/feeds/videos.xml?playlist_id' in address:
try:
list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0]
except IndexError:
return
if channel_id:
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id, verify=False)
channel_json = requests.get("https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" %
(channel_id, settings.YOUTUBE_API_KEY))
channel = json.decode(channel_json.content)
try:
username = channel['items'][0]['snippet']['title']
description = channel['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
elif list_id:
playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" %
(list_id, settings.YOUTUBE_API_KEY))
playlist = json.decode(playlist_json.content)
try:
username = playlist['items'][0]['snippet']['title']
description = playlist['items'][0]['snippet']['description']
except (IndexError, KeyError):
return
channel_url = "https://www.youtube.com/playlist?list=%s" % list_id
elif username:
video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username, verify=False)
description = "YouTube videos uploaded by %s" % username
else:
return
if list_id:
playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" %
(list_id, settings.YOUTUBE_API_KEY))
playlist = json.decode(playlist_json.content)
try:
video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']]
except (IndexError, KeyError):
return
else:
if video_ids_xml.status_code != 200:
return
video_ids_soup = BeautifulSoup(video_ids_xml.content)
channel_url = video_ids_soup.find('author').find('uri').getText()
video_ids = []
for video_id in video_ids_soup.findAll('yt:videoid'):
video_ids.append(video_id.getText())
videos_json = requests.get("https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" %
(','.join(video_ids), settings.YOUTUBE_API_KEY))
videos = json.decode(videos_json.content)
if 'error' in videos:
logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos))
return
data = {}
data['title'] = ("%s's YouTube Videos" % username if 'Uploads' not in username else username)
data['link'] = channel_url
data['description'] = description
data['lastBuildDate'] = datetime.datetime.utcnow()
data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL
data['docs'] = None
data['feed_url'] = address
rss = feedgenerator.Atom1Feed(**data)
for video in videos['items']:
thumbnail = video['snippet']['thumbnails'].get('maxres')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('high')
if not thumbnail:
thumbnail = video['snippet']['thumbnails'].get('medium')
duration_sec = isodate.parse_duration(video['contentDetails']['duration']).seconds
if duration_sec >= 3600:
hours = (duration_sec / 3600)
minutes = (duration_sec - (hours*3600)) / 60
seconds = duration_sec - (hours*3600) - (minutes*60)
duration = "%s:%s:%s" % (hours, '{0:02d}'.format(minutes), '{0:02d}'.format(seconds))
else:
minutes = duration_sec / 60
seconds = duration_sec - (minutes*60)
duration = "%s:%s" % ('{0:02d}'.format(minutes), '{0:02d}'.format(seconds))
content = """<div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe></div>
<div class="NB-youtube-stats"><small>
<b>From:</b> <a href="%s">%s</a><br />
<b>Duration:</b> %s<br />
</small></div><hr>
<div class="NB-youtube-description">%s</div>
<img src="%s" style="display:none" />""" % (
("https://www.youtube.com/embed/" + video['id']),
channel_url, username,
duration,
linkify(linebreaks(video['snippet']['description'])),
thumbnail['url'] if thumbnail else "",
)
link = "http://www.youtube.com/watch?v=%s" % video['id']
story_data = {
'title': video['snippet']['title'],
'link': link,
'description': content,
'author_name': username,
'categories': [],
'unique_id': "tag:youtube.com,2008:video:%s" % video['id'],
'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']),
}
rss.add_item(**story_data)
return rss.writeString('utf-8')
class ProcessFeed:
def __init__(self, feed_id, fpf, options, raw_feed=None):
self.feed_id = feed_id
self.options = options
self.fpf = fpf
self.raw_feed = raw_feed
def refresh_feed(self):
self.feed = Feed.get_by_id(self.feed_id)
if self.feed_id != self.feed.pk:
logging.debug(" ***> Feed has changed: from %s to %s" % (self.feed_id, self.feed.pk))
self.feed_id = self.feed.pk
def process(self):
""" Downloads and parses a feed.
"""
start = time.time()
self.refresh_feed()
ret_values = dict(new=0, updated=0, same=0, error=0)
if hasattr(self.fpf, 'status'):
if self.options['verbose']:
if self.fpf.bozo and self.fpf.status != 304:
logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
self.feed.log_title[:30],
self.fpf.bozo_exception,
len(self.fpf.entries)))
if self.fpf.status == 304:
self.feed = self.feed.save()
self.feed.save_feed_history(304, "Not modified")
return FEED_SAME, ret_values
# 302 and 307: Temporary redirect: ignore
# 301 and 308: Permanent redirect: save it (after 10 tries)
if self.fpf.status == 301 or self.fpf.status == 308:
if self.fpf.href.endswith('feedburner.com/atom.xml'):
return FEED_ERRHTTP, ret_values
redirects, non_redirects = self.feed.count_redirects_in_history('feed')
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10-len(redirects)))
if len(redirects) >= 10 or len(non_redirects) == 0:
address = self.fpf.href
if self.options['force'] and address:
address = qurl(address, remove=['_'])
self.feed.feed_address = address
if not self.feed.known_good:
self.feed.fetched_once = True
logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.log_title[:30], self.fpf.status))
self.feed = self.feed.schedule_feed_fetch_immediately()
if not self.fpf.entries:
self.feed = self.feed.save()
self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
return FEED_ERRHTTP, ret_values
if self.fpf.status >= 400:
logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.log_title[:30], self.fpf.status))
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(self.fpf.status, "HTTP Error")
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRHTTP, ret_values
if not self.fpf:
logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.log_title[:30]))
self.feed.save_feed_history(551, "Broken feed")
return FEED_ERRHTTP, ret_values
if self.fpf and not self.fpf.entries:
if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries)))
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.log_title[:30], len(self.fpf.entries)))
fixed_feed = None
if not self.feed.known_good:
fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
if not fixed_feed:
self.feed.save_feed_history(553, 'Not an RSS feed', self.fpf.bozo_exception)
else:
self.feed = feed
self.feed = self.feed.save()
return FEED_ERRPARSE, ret_values
# the feed has changed (or it is the first time we parse it)
# saving the etag and last_modified fields
original_etag = self.feed.etag
self.feed.etag = self.fpf.get('etag')
if self.feed.etag:
self.feed.etag = self.feed.etag[:255]
# some times this is None (it never should) *sigh*
if self.feed.etag is None:
self.feed.etag = ''
if self.feed.etag != original_etag:
self.feed.save(update_fields=['etag'])
original_last_modified = self.feed.last_modified
if hasattr(self.fpf, 'modified') and self.fpf.modified:
try:
self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
except Exception, e:
self.feed.last_modified = None
logging.debug("Broken mtime %s: %s" % (self.feed.last_modified, e))
pass
if self.feed.last_modified != original_last_modified:
self.feed.save(update_fields=['last_modified'])
self.fpf.entries = self.fpf.entries[:100]
original_title = self.feed.feed_title
if self.fpf.feed.get('title'):
self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
if self.feed.feed_title != original_title:
self.feed.save(update_fields=['feed_title'])
tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
if tagline:
original_tagline = self.feed.data.feed_tagline
self.feed.data.feed_tagline = smart_unicode(tagline)
if self.feed.data.feed_tagline != original_tagline:
self.feed.data.save(update_fields=['feed_tagline'])
if not self.feed.feed_link_locked:
new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
if self.options['force'] and new_feed_link:
new_feed_link = qurl(new_feed_link, remove=['_'])
if new_feed_link != self.feed.feed_link:
logging.debug(" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.log_title[:30], self.feed.feed_link, new_feed_link))
redirects, non_redirects = self.feed.count_redirects_in_history('page')
self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10-len(redirects)))
if len(redirects) >= 10 or len(non_redirects) == 0:
self.feed.feed_link = new_feed_link
self.feed.save(update_fields=['feed_link'])
# Determine if stories aren't valid and replace broken guids
guids_seen = set()
permalinks_seen = set()
for entry in self.fpf.entries:
guids_seen.add(entry.get('guid'))
permalinks_seen.add(Feed.get_permalink(entry))
guid_difference = len(guids_seen) != len(self.fpf.entries)
single_guid = len(guids_seen) == 1
replace_guids = single_guid and guid_difference
permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
single_permalink = len(permalinks_seen) == 1
replace_permalinks = single_permalink and permalink_difference
# Compare new stories to existing stories, adding and updating
start_date = datetime.datetime.utcnow()
story_hashes = []
stories = []
for entry in self.fpf.entries:
story = pre_process_story(entry, self.fpf.encoding)
if not story['title'] and not story['story_content']: continue
if story.get('published') < start_date:
start_date = story.get('published')
if replace_guids:
if replace_permalinks:
new_story_guid = unicode(story.get('published'))
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
self.feed.log_title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
else:
new_story_guid = Feed.get_permalink(story)
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
self.feed.log_title[:30],
story.get('guid'), new_story_guid))
story['guid'] = new_story_guid
story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
stories.append(story)
story_hashes.append(story.get('story_hash'))
original_story_hash_count = len(story_hashes)
story_hashes_in_unread_cutoff = self.feed.story_hashes_in_unread_cutoff[:original_story_hash_count]
story_hashes.extend(story_hashes_in_unread_cutoff)
story_hashes = list(set(story_hashes))
if self.options['verbose'] or settings.DEBUG:
logging.debug(u' ---> [%-30s] ~FBFound ~SB%s~SN guids, adding ~SB%s~SN/%s guids from db' % (
self.feed.log_title[:30],
original_story_hash_count, len(story_hashes)-original_story_hash_count,
len(story_hashes_in_unread_cutoff)))
existing_stories = dict((s.story_hash, s) for s in MStory.objects(
story_hash__in=story_hashes,
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
))
# if len(existing_stories) == 0:
# existing_stories = dict((s.story_hash, s) for s in MStory.objects(
# story_date__gte=start_date,
# story_feed_id=self.feed.pk
# ))
ret_values = self.feed.add_update_stories(stories, existing_stories,
verbose=self.options['verbose'],
updates_off=self.options['updates_off'])
# PubSubHubbub
if (hasattr(self.fpf, 'feed') and
hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
hub_url = None
self_url = self.feed.feed_address
for link in self.fpf.feed.links:
if link['rel'] == 'hub' and not hub_url:
hub_url = link['href']
elif link['rel'] == 'self':
self_url = link['href']
push_expired = False
if self.feed.is_push:
try:
push_expired = self.feed.push.lease_expires < datetime.datetime.now()
except PushSubscription.DoesNotExist:
self.feed.is_push = False
if (hub_url and self_url and not settings.DEBUG and
self.feed.active_subscribers > 0 and
(push_expired or not self.feed.is_push or self.options.get('force'))):
logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
self.feed.log_title[:30],
"~SKRe-~SN" if push_expired else "", hub_url))
try:
PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
except TimeoutError:
logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
self.feed.log_title[:30], hub_url))
elif (self.feed.is_push and
(self.feed.active_subscribers <= 0 or not hub_url)):
logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
self.feed.log_title[:30]))
self.feed.is_push = False
self.feed = self.feed.save()
# Push notifications
if ret_values['new'] > 0 and MUserFeedNotification.feed_has_users(self.feed.pk) > 0:
QueueNotifications.delay(self.feed.pk, ret_values['new'])
# All Done
logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
self.feed.log_title[:30],
'~FG~SB' if ret_values['new'] else '', ret_values['new'],
'~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
'~SB' if ret_values['same'] else '', ret_values['same'],
'~FR~SB' if ret_values['error'] else '', ret_values['error'],
len(self.fpf.entries)))
self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force'])
fetch_date = datetime.datetime.now()
if ret_values['new']:
if not getattr(settings, 'TEST_DEBUG', False):
self.feed.trim_feed()
self.feed.expire_redis()
if MStatistics.get('raw_feed', None) == self.feed.pk:
self.feed.save_raw_feed(self.raw_feed, fetch_date)
self.feed.save_feed_history(200, "OK", date=fetch_date)
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
self.feed.log_title[:30], time.time() - start))
return FEED_OK, ret_values
class Dispatcher:
def __init__(self, options, num_threads):
self.options = options
self.feed_stats = {
FEED_OK:0,
FEED_SAME:0,
FEED_ERRPARSE:0,
FEED_ERRHTTP:0,
FEED_ERREXC:0}
self.feed_trans = {
FEED_OK:'ok',
FEED_SAME:'unchanged',
FEED_ERRPARSE:'cant_parse',
FEED_ERRHTTP:'http_error',
FEED_ERREXC:'exception'}
self.feed_keys = sorted(self.feed_trans.keys())
self.num_threads = num_threads
self.time_start = datetime.datetime.utcnow()
self.workers = []
def refresh_feed(self, feed_id):
"""Update feed, since it may have changed"""
return Feed.get_by_id(feed_id)
def process_feed_wrapper(self, feed_queue):
delta = None
current_process = multiprocessing.current_process()
identity = "X"
feed = None
if current_process._identity:
identity = current_process._identity[0]
for feed_id in feed_queue:
start_duration = time.time()
feed_fetch_duration = None
feed_process_duration = None
page_duration = None
icon_duration = None
feed_code = None
ret_entries = None
start_time = time.time()
ret_feed = FEED_ERREXC
try:
feed = self.refresh_feed(feed_id)
skip = False
if self.options.get('fake'):
skip = True
weight = "-"
quick = "-"
rand = "-"
elif (self.options.get('quick') and not self.options['force'] and
feed.known_good and feed.fetched_once and not feed.is_push):
weight = feed.stories_last_month * feed.num_subscribers
random_weight = random.randint(1, max(weight, 1))
quick = float(self.options.get('quick', 0))
rand = random.random()
if random_weight < 1000 and rand < quick:
skip = True
elif False and feed.feed_address.startswith("http://news.google.com/news"):
skip = True
weight = "-"
quick = "-"
rand = "-"
if skip:
logging.debug(' ---> [%-30s] ~BGFaking fetch, skipping (%s/month, %s subs, %s < %s)...' % (
feed.log_title[:30],
weight,
feed.num_subscribers,
rand, quick))
continue
ffeed = FetchFeed(feed_id, self.options)
ret_feed, fetched_feed = ffeed.fetch()
feed_fetch_duration = time.time() - start_duration
raw_feed = ffeed.raw_feed
if ((fetched_feed and ret_feed == FEED_OK) or self.options['force']):
pfeed = ProcessFeed(feed_id, fetched_feed, self.options, raw_feed=raw_feed)
ret_feed, ret_entries = pfeed.process()
feed = pfeed.feed
feed_process_duration = time.time() - start_duration
if (ret_entries and ret_entries['new']) or self.options['force']:
start = time.time()
if not feed.known_good or not feed.fetched_once:
feed.known_good = True
feed.fetched_once = True
feed = feed.save()
if self.options['force'] or random.random() <= 0.02:
logging.debug(' ---> [%-30s] ~FBPerforming feed cleanup...' % (feed.log_title[:30],))
start_cleanup = time.time()
feed.sync_redis()
logging.debug(' ---> [%-30s] ~FBDone with feed cleanup. Took ~SB%.4s~SN sec.' % (feed.log_title[:30], time.time() - start_cleanup))
try:
self.count_unreads_for_subscribers(feed)
except TimeoutError:
logging.debug(' ---> [%-30s] Unread count took too long...' % (feed.log_title[:30],))
if self.options['verbose']:
logging.debug(u' ---> [%-30s] ~FBTIME: unread count in ~FM%.4ss' % (
feed.log_title[:30], time.time() - start))
except urllib2.HTTPError, e:
logging.debug(' ---> [%-30s] ~FRFeed throws HTTP error: ~SB%s' % (unicode(feed_id)[:30], e.fp.read()))
feed_code = e.code
feed.save_feed_history(feed_code, e.msg, e.fp.read())
fetched_feed = None
except Feed.DoesNotExist, e:
logging.debug(' ---> [%-30s] ~FRFeed is now gone...' % (unicode(feed_id)[:30]))
continue
except SoftTimeLimitExceeded, e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
ret_feed = FEED_ERREXC
fetched_feed = None
feed_code = 559
feed.save_feed_history(feed_code, 'Timeout', e)
except TimeoutError, e:
logging.debug(' ---> [%-30s] ~FRFeed fetch timed out...' % (feed.log_title[:30]))
feed_code = 505
feed.save_feed_history(feed_code, 'Timeout', e)
fetched_feed = None
except Exception, e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
ret_feed = FEED_ERREXC
feed = Feed.get_by_id(getattr(feed, 'pk', feed_id))
if not feed: continue
feed.save_feed_history(500, "Error", tb)
feed_code = 500
fetched_feed = None
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
settings.RAVEN_CLIENT):
settings.RAVEN_CLIENT.captureException()
if not feed_code:
if ret_feed == FEED_OK:
feed_code = 200
elif ret_feed == FEED_SAME:
feed_code = 304
elif ret_feed == FEED_ERRHTTP:
feed_code = 400
if ret_feed == FEED_ERREXC:
feed_code = 500
elif ret_feed == FEED_ERRPARSE:
feed_code = 550
if not feed: continue
feed = self.refresh_feed(feed.pk)
if not feed: continue
if ((self.options['force']) or
(random.random() > .9) or
(fetched_feed and
feed.feed_link and
feed.has_page and
(ret_feed == FEED_OK or
(ret_feed == FEED_SAME and feed.stories_last_month > 10)))):
logging.debug(u' ---> [%-30s] ~FYFetching page: %s' % (feed.log_title[:30], feed.feed_link))
page_importer = PageImporter(feed)
try:
page_data = page_importer.fetch_page()
page_duration = time.time() - start_duration
except SoftTimeLimitExceeded, e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
page_data = None
feed.save_feed_history(557, 'Timeout', e)
except TimeoutError, e:
logging.debug(' ---> [%-30s] ~FRPage fetch timed out...' % (feed.log_title[:30]))
page_data = None
feed.save_page_history(555, 'Timeout', '')
except Exception, e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
feed.save_page_history(550, "Page Error", tb)
fetched_feed = None
page_data = None
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
settings.RAVEN_CLIENT):
settings.RAVEN_CLIENT.captureException()
feed = self.refresh_feed(feed.pk)
logging.debug(u' ---> [%-30s] ~FYFetching icon: %s' % (feed.log_title[:30], feed.feed_link))
force = self.options['force']
if random.random() > .99:
force = True
icon_importer = IconImporter(feed, page_data=page_data, force=force)
try:
icon_importer.save()
icon_duration = time.time() - start_duration
except SoftTimeLimitExceeded, e:
logging.debug(" ---> [%-30s] ~BR~FWTime limit hit!~SB~FR Moving on to next feed..." % feed)
feed.save_feed_history(558, 'Timeout', e)
except TimeoutError, e:
logging.debug(' ---> [%-30s] ~FRIcon fetch timed out...' % (feed.log_title[:30]))
feed.save_page_history(556, 'Timeout', '')
except Exception, e:
logging.debug('[%d] ! -------------------------' % (feed_id,))
tb = traceback.format_exc()
logging.error(tb)
logging.debug('[%d] ! -------------------------' % (feed_id,))
# feed.save_feed_history(560, "Icon Error", tb)
# mail_feed_error_to_admin(feed, e, local_vars=locals())
if (not settings.DEBUG and hasattr(settings, 'RAVEN_CLIENT') and
settings.RAVEN_CLIENT):
settings.RAVEN_CLIENT.captureException()
else:
logging.debug(u' ---> [%-30s] ~FBSkipping page fetch: (%s on %s stories) %s' % (feed.log_title[:30], self.feed_trans[ret_feed], feed.stories_last_month, '' if feed.has_page else ' [HAS NO PAGE]'))
feed = self.refresh_feed(feed.pk)
delta = time.time() - start_time
feed.last_load_time = round(delta)
feed.fetched_once = True
try:
feed = feed.save(update_fields=['last_load_time', 'fetched_once'])
except IntegrityError:
logging.debug(" ***> [%-30s] ~FRIntegrityError on feed: %s" % (feed.log_title[:30], feed.feed_address,))
if ret_entries and ret_entries['new']:
self.publish_to_subscribers(feed, ret_entries['new'])
done_msg = (u'%2s ---> [%-30s] ~FYProcessed in ~FM~SB%.4ss~FY~SN (~FB%s~FY) [%s]' % (
identity, feed.log_title[:30], delta,
feed.pk, self.feed_trans[ret_feed],))
logging.debug(done_msg)
total_duration = time.time() - start_duration
MAnalyticsFetcher.add(feed_id=feed.pk, feed_fetch=feed_fetch_duration,
feed_process=feed_process_duration,
page=page_duration, icon=icon_duration,
total=total_duration, feed_code=feed_code)
self.feed_stats[ret_feed] += 1
if len(feed_queue) == 1:
return feed
# time_taken = datetime.datetime.utcnow() - self.time_start
def publish_to_subscribers(self, feed, new_count):
try:
r = redis.Redis(connection_pool=settings.REDIS_PUBSUB_POOL)
listeners_count = r.publish(str(feed.pk), 'story:new_count:%s' % new_count)
if listeners_count:
logging.debug(" ---> [%-30s] ~FMPublished to %s subscribers" % (feed.log_title[:30], listeners_count))
except redis.ConnectionError:
logging.debug(" ***> [%-30s] ~BMRedis is unavailable for real-time." % (feed.log_title[:30],))
def count_unreads_for_subscribers(self, feed):
user_subs = UserSubscription.objects.filter(feed=feed,
active=True,
user__profile__last_seen_on__gte=feed.unread_cutoff)\
.order_by('-last_read_date')
if not user_subs.count():
return
for sub in user_subs:
if not sub.needs_unread_recalc:
sub.needs_unread_recalc = True
sub.save()
if self.options['compute_scores']:
r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
stories = MStory.objects(story_feed_id=feed.pk,
story_date__gte=feed.unread_cutoff)
stories = Feed.format_stories(stories, feed.pk)
story_hashes = r.zrangebyscore('zF:%s' % feed.pk, int(feed.unread_cutoff.strftime('%s')),
int(time.time() + 60*60*24))
missing_story_hashes = set(story_hashes) - set([s['story_hash'] for s in stories])
if missing_story_hashes:
missing_stories = MStory.objects(story_feed_id=feed.pk,
story_hash__in=missing_story_hashes)\
.read_preference(pymongo.ReadPreference.PRIMARY)
missing_stories = Feed.format_stories(missing_stories, feed.pk)
stories = missing_stories + stories
logging.debug(u' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.log_title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
cache.set("S:%s" % feed.pk, stories, 60)
logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
feed.log_title[:30], len(stories), user_subs.count(),
feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
self.calculate_feed_scores_with_stories(user_subs, stories)
elif self.options.get('mongodb_replication_lag'):
logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
feed.log_title[:30], self.options.get('mongodb_replication_lag')))
@timelimit(10)
def calculate_feed_scores_with_stories(self, user_subs, stories):
for sub in user_subs:
silent = False if self.options['verbose'] >= 2 else True
sub.calculate_feed_scores(silent=silent, stories=stories)
def add_jobs(self, feeds_queue, feeds_count=1):
""" adds a feed processing job to the pool
"""
self.feeds_queue = feeds_queue
self.feeds_count = feeds_count
def run_jobs(self):
if self.options['single_threaded']:
return self.process_feed_wrapper(self.feeds_queue[0])
else:
for i in range(self.num_threads):
feed_queue = self.feeds_queue[i]
self.workers.append(multiprocessing.Process(target=self.process_feed_wrapper,
args=(feed_queue,)))
for i in range(self.num_threads):
self.workers[i].start()

View file

@@ -3,8 +3,8 @@ import threading
import sys
import traceback
import pprint
import urllib
import urlparse
import urllib.request, urllib.parse, urllib.error
import urllib.parse
import random
import warnings
from django.core.mail import mail_admins
@@ -35,12 +35,12 @@ def timelimit(timeout):
c = Dispatch()
c.join(timeout)
if c.isAlive():
raise TimeoutError, 'took too long'
raise TimeoutError('took too long')
if c.error:
tb = ''.join(traceback.format_exception(c.error[0], c.error[1], c.error[2]))
logging.debug(tb)
mail_admins('Error in timeout: %s' % c.error[0], tb)
raise c.error[0], c.error[1], c.error[2]
raise c.error[0](c.error[1]).with_traceback(c.error[2])
return c.result
return _2
return _1
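The `timelimit` hunk above also covers the trickiest mechanical change in this commit: re-raising a captured `sys.exc_info()` tuple. 2to3 emits `raise c.error[0](c.error[1]).with_traceback(c.error[2])`; an equivalent and arguably more idiomatic Python 3 form re-raises the captured exception value directly. A rough sketch under that assumption (`run_with_timeout` is a hypothetical stand-in, not the project's decorator):

import sys
import threading

def run_with_timeout(func, timeout, *args):
    state = {}
    def worker():
        try:
            state['result'] = func(*args)
        except Exception:
            state['error'] = sys.exc_info()   # capture (type, value, traceback) in the thread
    t = threading.Thread(target=worker, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        raise TimeoutError('took too long')
    if 'error' in state:
        exc_type, exc_value, exc_tb = state['error']
        # Python 2: raise exc_type, exc_value, exc_tb
        raise exc_value.with_traceback(exc_tb)
    return state.get('result')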
@@ -67,7 +67,7 @@ def levenshtein_distance(first, second):
distance_matrix[i][0] = i
for j in range(second_length):
distance_matrix[0][j]=j
for i in xrange(1, first_length):
for i in range(1, first_length):
for j in range(1, second_length):
deletion = distance_matrix[i-1][j] + 1
insertion = distance_matrix[i][j-1] + 1
@@ -111,7 +111,7 @@ def _do_timesince(d, chunks, now=None):
def relative_timesince(value):
if not value:
return u''
return ''
chunks = (
(60 * 60 * 24, lambda n: ungettext('day', 'days', n)),
@@ -124,7 +124,7 @@ def relative_timesince(value):
def relative_timeuntil(value):
if not value:
return u''
return ''
chunks = (
(60 * 60, lambda n: ungettext('hour', 'hours', n)),
@@ -171,7 +171,7 @@ def format_relative_date(date, future=False):
def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
obj_identifier = obj
if isinstance(obj, dict):
obj_identifier = obj.keys()[0]
obj_identifier = list(obj.keys())[0]
if ((not in_folder or in_folder == " ") and
not parent and
@@ -183,7 +183,7 @@ def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
child_folder_names = []
for item in folders:
if isinstance(item, dict):
child_folder_names.append(item.keys()[0])
child_folder_names.append(list(item.keys())[0])
if isinstance(obj, dict) and in_folder.lower() == parent.lower():
if obj_identifier not in child_folder_names:
folders.append(obj)
@@ -191,7 +191,7 @@ def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
for k, v in enumerate(folders):
if isinstance(v, dict):
for f_k, f_v in v.items():
for f_k, f_v in list(v.items()):
if f_k.lower() == in_folder.lower() and obj_identifier not in f_v and not added:
f_v.append(obj)
added = True
@@ -216,7 +216,7 @@ def mail_feed_error_to_admin(feed, e, local_vars=None, subject=None):
## {{{ http://code.activestate.com/recipes/576611/ (r11)
from operator import itemgetter
from heapq import nlargest
from itertools import repeat, ifilter
from itertools import repeat
class Counter(dict):
'''Dict subclass for counting hashable objects. Sometimes called a bag
@@ -253,8 +253,8 @@ class Counter(dict):
'''
if n is None:
return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
return nlargest(n, self.iteritems(), key=itemgetter(1))
return sorted(iter(list(self.items())), key=itemgetter(1), reverse=True)
return nlargest(n, iter(list(self.items())), key=itemgetter(1))
def elements(self):
'''Iterator over elements repeating each as many times as its count.
@@ -267,7 +267,7 @@ class Counter(dict):
elements() will ignore it.
'''
for elem, count in self.iteritems():
for elem, count in list(self.items()):
for _ in repeat(None, count):
yield elem
@@ -295,7 +295,7 @@ class Counter(dict):
if hasattr(iterable, 'iteritems'):
if self:
self_get = self.get
for elem, count in iterable.iteritems():
for elem, count in list(iterable.items()):
self[elem] = self_get(elem, 0) + count
else:
dict.update(self, iterable) # fast path when counter is empty
@@ -393,7 +393,7 @@ class Counter(dict):
result = Counter()
if len(self) < len(other):
self, other = other, self
for elem in ifilter(self.__contains__, other):
for elem in filter(self.__contains__, other):
newcount = _min(self[elem], other[elem])
if newcount > 0:
result[elem] = newcount
@@ -402,9 +402,9 @@
if __name__ == '__main__':
import doctest
print doctest.testmod()
print((doctest.testmod()))
## end of http://code.activestate.com/recipes/576611/ }}}
def chunks(l, n):
for i in xrange(0, len(l), n):
for i in range(0, len(l), n):
yield l[i:i+n]
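The `Counter` and folder hunks above all trace back to one Python 3 change: `dict.keys()`, `dict.items()` and `dict.values()` return views, `iteritems()`/`xrange()` are gone, and views cannot be indexed. 2to3's mechanical `list(...)`/`iter(...)` wrapping is safe but sometimes redundant; a hand-written equivalent looks like this (the sample dict is illustrative):

counts = {'a': 4, 'b': 2, 'c': 1}

# Python 2: counts.keys()[0] and counts.iteritems()
first_key = list(counts.keys())[0]                 # views must be materialised before indexing
for elem, count in counts.items():                 # items() view replaces iteritems()
    print(elem, count)

# sorted()/nlargest() accept the view directly; no iter()/list() wrapper is needed
most_common = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

for i in range(0, 10, 4):                          # xrange() is simply range() now
    print(i)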

410
utils/feed_functions.py.bak Normal file
View file

@@ -0,0 +1,410 @@
import datetime
import threading
import sys
import traceback
import pprint
import urllib.request, urllib.parse, urllib.error
import urllib.parse
import random
import warnings
from django.core.mail import mail_admins
from django.utils.translation import ungettext
from django.utils.encoding import smart_unicode
from utils import log as logging
class TimeoutError(Exception): pass
def timelimit(timeout):
"""borrowed from web.py"""
def _1(function):
def _2(*args, **kw):
class Dispatch(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.result = None
self.error = None
self.setDaemon(True)
self.start()
def run(self):
try:
self.result = function(*args, **kw)
except:
self.error = sys.exc_info()
c = Dispatch()
c.join(timeout)
if c.isAlive():
raise TimeoutError('took too long')
if c.error:
tb = ''.join(traceback.format_exception(c.error[0], c.error[1], c.error[2]))
logging.debug(tb)
mail_admins('Error in timeout: %s' % c.error[0], tb)
raise c.error[0](c.error[1]).with_traceback(c.error[2])
return c.result
return _2
return _1
def utf8encode(tstr):
""" Encodes a unicode string in utf-8
"""
msg = "utf8encode is deprecated. Use django.utils.encoding.smart_unicode instead."
warnings.warn(msg, DeprecationWarning)
return smart_unicode(tstr)
# From: http://www.poromenos.org/node/87
def levenshtein_distance(first, second):
"""Find the Levenshtein distance between two strings."""
if len(first) > len(second):
first, second = second, first
if len(second) == 0:
return len(first)
first_length = len(first) + 1
second_length = len(second) + 1
distance_matrix = [[0] * second_length for x in range(first_length)]
for i in range(first_length):
distance_matrix[i][0] = i
for j in range(second_length):
distance_matrix[0][j]=j
for i in range(1, first_length):
for j in range(1, second_length):
deletion = distance_matrix[i-1][j] + 1
insertion = distance_matrix[i][j-1] + 1
substitution = distance_matrix[i-1][j-1]
if first[i-1] != second[j-1]:
substitution += 1
distance_matrix[i][j] = min(insertion, deletion, substitution)
return distance_matrix[first_length-1][second_length-1]
def _do_timesince(d, chunks, now=None):
"""
Started as a copy of django.util.timesince.timesince, but modified to
only output one time unit, and use months as the maximum unit of measure.
Takes two datetime objects and returns the time between d and now
as a nicely formatted string, e.g. "10 minutes". If d occurs after now,
then "0 minutes" is returned.
Units used are months, weeks, days, hours, and minutes.
Seconds and microseconds are ignored.
"""
# Convert datetime.date to datetime.datetime for comparison
if d.__class__ is not datetime.datetime:
d = datetime.datetime(d.year, d.month, d.day)
if not now:
now = datetime.datetime.utcnow()
# ignore microsecond part of 'd' since we removed it from 'now'
delta = now - (d - datetime.timedelta(0, 0, d.microsecond))
since = delta.days * 24 * 60 * 60 + delta.seconds
if since > 10:
for i, (seconds, name) in enumerate(chunks):
count = since // seconds
if count != 0:
break
s = '%(number)d %(type)s' % {'number': count, 'type': name(count)}
else:
s = 'just a second'
return s
def relative_timesince(value):
if not value:
return ''
chunks = (
(60 * 60 * 24, lambda n: ungettext('day', 'days', n)),
(60 * 60, lambda n: ungettext('hour', 'hours', n)),
(60, lambda n: ungettext('minute', 'minutes', n)),
(1, lambda n: ungettext('second', 'seconds', n)),
(0, lambda n: 'just now'),
)
return _do_timesince(value, chunks)
def relative_timeuntil(value):
if not value:
return ''
chunks = (
(60 * 60, lambda n: ungettext('hour', 'hours', n)),
(60, lambda n: ungettext('minute', 'minutes', n))
)
now = datetime.datetime.utcnow()
return _do_timesince(now, chunks, value)
def seconds_timesince(value):
if not value:
return 0
now = datetime.datetime.utcnow()
delta = now - value
return delta.days * 24 * 60 * 60 + delta.seconds
def format_relative_date(date, future=False):
if not date or date < datetime.datetime(2010, 1, 1):
return "Soon"
now = datetime.datetime.utcnow()
diff = abs(now - date)
if diff < datetime.timedelta(minutes=60):
minutes = diff.seconds / 60
return "%s minute%s %s" % (minutes,
'' if minutes == 1 else 's',
'' if future else 'ago')
elif datetime.timedelta(minutes=60) <= diff < datetime.timedelta(minutes=90):
return "1 hour %s" % ('' if future else 'ago')
elif diff < datetime.timedelta(hours=24):
dec = (diff.seconds / 60 + 15) % 60
if dec >= 30:
return "%s.5 hours %s" % ((((diff.seconds / 60) + 15) / 60),
'' if future else 'ago')
else:
return "%s hours %s" % ((((diff.seconds / 60) + 15) / 60),
'' if future else 'ago')
else:
days = ((diff.seconds / 60) / 60 / 24)
return "%s day%s %s" % (days, '' if days == 1 else 's', '' if future else 'ago')
def add_object_to_folder(obj, in_folder, folders, parent='', added=False):
obj_identifier = obj
if isinstance(obj, dict):
obj_identifier = list(obj.keys())[0]
if ((not in_folder or in_folder == " ") and
not parent and
not isinstance(obj, dict) and
obj_identifier not in folders):
folders.append(obj)
return folders
child_folder_names = []
for item in folders:
if isinstance(item, dict):
child_folder_names.append(list(item.keys())[0])
if isinstance(obj, dict) and in_folder.lower() == parent.lower():
if obj_identifier not in child_folder_names:
folders.append(obj)
return folders
for k, v in enumerate(folders):
if isinstance(v, dict):
for f_k, f_v in list(v.items()):
if f_k.lower() == in_folder.lower() and obj_identifier not in f_v and not added:
f_v.append(obj)
added = True
folders[k][f_k] = add_object_to_folder(obj, in_folder, f_v, f_k, added)
return folders
def mail_feed_error_to_admin(feed, e, local_vars=None, subject=None):
# Mail the admins with the error
if not subject:
subject = "Feed update error"
exc_info = sys.exc_info()
subject = '%s: %s' % (subject, repr(e))
message = 'Traceback:\n%s\n\Feed:\n%s\nLocals:\n%s' % (
'\n'.join(traceback.format_exception(*exc_info)),
pprint.pformat(feed.__dict__),
pprint.pformat(local_vars)
)
# print message
mail_admins(subject, message)
## {{{ http://code.activestate.com/recipes/576611/ (r11)
from operator import itemgetter
from heapq import nlargest
from itertools import repeat
class Counter(dict):
'''Dict subclass for counting hashable objects. Sometimes called a bag
or multiset. Elements are stored as dictionary keys and their counts
are stored as dictionary values.
>>> Counter('zyzygy')
Counter({'y': 3, 'z': 2, 'g': 1})
'''
def __init__(self, iterable=None, **kwds):
'''Create a new, empty Counter object. And if given, count elements
from an input iterable. Or, initialize the count from another mapping
of elements to their counts.
>>> c = Counter() # a new, empty counter
>>> c = Counter('gallahad') # a new counter from an iterable
>>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
>>> c = Counter(a=4, b=2) # a new counter from keyword args
'''
self.update(iterable, **kwds)
def __missing__(self, key):
return 0
def most_common(self, n=None):
'''List the n most common elements and their counts from the most
common to the least. If n is None, then list all element counts.
>>> Counter('abracadabra').most_common(3)
[('a', 5), ('r', 2), ('b', 2)]
'''
if n is None:
return sorted(iter(self.items()), key=itemgetter(1), reverse=True)
return nlargest(n, iter(self.items()), key=itemgetter(1))
def elements(self):
'''Iterator over elements repeating each as many times as its count.
>>> c = Counter('ABCABC')
>>> sorted(c.elements())
['A', 'A', 'B', 'B', 'C', 'C']
If an element's count has been set to zero or is a negative number,
elements() will ignore it.
'''
for elem, count in self.items():
for _ in repeat(None, count):
yield elem
# Override dict methods where the meaning changes for Counter objects.
@classmethod
def fromkeys(cls, iterable, v=None):
raise NotImplementedError(
'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
def update(self, iterable=None, **kwds):
'''Like dict.update() but add counts instead of replacing them.
Source can be an iterable, a dictionary, or another Counter instance.
>>> c = Counter('which')
>>> c.update('witch') # add elements from another iterable
>>> d = Counter('watch')
>>> c.update(d) # add elements from another counter
>>> c['h'] # four 'h' in which, witch, and watch
4
'''
if iterable is not None:
if hasattr(iterable, 'iteritems'):
if self:
self_get = self.get
for elem, count in iterable.items():
self[elem] = self_get(elem, 0) + count
else:
dict.update(self, iterable) # fast path when counter is empty
else:
self_get = self.get
for elem in iterable:
self[elem] = self_get(elem, 0) + 1
if kwds:
self.update(kwds)
def copy(self):
'Like dict.copy() but returns a Counter instance instead of a dict.'
return Counter(self)
def __delitem__(self, elem):
'Like dict.__delitem__() but does not raise KeyError for missing values.'
if elem in self:
dict.__delitem__(self, elem)
def __repr__(self):
if not self:
return '%s()' % self.__class__.__name__
items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
return '%s({%s})' % (self.__class__.__name__, items)
# Multiset-style mathematical operations discussed in:
# Knuth TAOCP Volume II section 4.6.3 exercise 19
# and at http://en.wikipedia.org/wiki/Multiset
#
# Outputs guaranteed to only include positive counts.
#
# To strip negative and zero counts, add-in an empty counter:
# c += Counter()
def __add__(self, other):
'''Add counts from two counters.
>>> Counter('abbb') + Counter('bcc')
Counter({'b': 4, 'c': 2, 'a': 1})
'''
if not isinstance(other, Counter):
return NotImplemented
result = Counter()
for elem in set(self) | set(other):
newcount = self[elem] + other[elem]
if newcount > 0:
result[elem] = newcount
return result
def __sub__(self, other):
''' Subtract count, but keep only results with positive counts.
>>> Counter('abbbc') - Counter('bccd')
Counter({'b': 2, 'a': 1})
'''
if not isinstance(other, Counter):
return NotImplemented
result = Counter()
for elem in set(self) | set(other):
newcount = self[elem] - other[elem]
if newcount > 0:
result[elem] = newcount
return result
def __or__(self, other):
'''Union is the maximum of value in either of the input counters.
>>> Counter('abbb') | Counter('bcc')
Counter({'b': 3, 'c': 2, 'a': 1})
'''
if not isinstance(other, Counter):
return NotImplemented
_max = max
result = Counter()
for elem in set(self) | set(other):
newcount = _max(self[elem], other[elem])
if newcount > 0:
result[elem] = newcount
return result
def __and__(self, other):
''' Intersection is the minimum of corresponding counts.
>>> Counter('abbb') & Counter('bcc')
Counter({'b': 1})
'''
if not isinstance(other, Counter):
return NotImplemented
_min = min
result = Counter()
if len(self) < len(other):
self, other = other, self
for elem in filter(self.__contains__, other):
newcount = _min(self[elem], other[elem])
if newcount > 0:
result[elem] = newcount
return result
if __name__ == '__main__':
import doctest
print(doctest.testmod())
## end of http://code.activestate.com/recipes/576611/ }}}
def chunks(l, n):
for i in range(0, len(l), n):
yield l[i:i+n]

View file

@@ -46,16 +46,16 @@ Also Jason Diamond, Brian Lalor for bug reporting and patches"""
_debug = 0
import sgmllib, urllib, urlparse, re, sys, robotparser
import sgmllib, urllib.request, urllib.parse, urllib.error, urllib.parse, re, sys, urllib.robotparser
import requests
from StringIO import StringIO
from io import StringIO
from lxml import etree
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
import xmlrpc.client # http://www.pythonware.com/products/xmlrpc/
except ImportError:
xmlrpclib = None
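The import hunk above folds `urllib`, `urllib2`, `urlparse` and `robotparser` into the Python 3 `urllib` package. A short sketch of where the pieces this file relies on now live (the example URL and user agent are placeholders; `FancyURLopener` still exists in `urllib.request`, but `build_opener()` is the usual modern replacement):

import urllib.parse
import urllib.request
import urllib.robotparser

base = 'http://example.com/blog/'
print(urllib.parse.urlparse(base)[:2])                      # ('http', 'example.com'); was urlparse.urlparse
robots_url = urllib.parse.urljoin(base, 'robots.txt')       # was urlparse.urljoin
rp = urllib.robotparser.RobotFileParser(robots_url)         # was robotparser.RobotFileParser
opener = urllib.request.build_opener()                      # alternative to urllib.request.FancyURLopener
opener.addheaders = [('User-Agent', 'feedfinder-sketch')]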
@@ -67,28 +67,28 @@ if not dict:
return rc
def _debuglog(message):
if _debug: print message
if _debug: print(message)
class URLGatekeeper:
"""a class to track robots.txt rules across multiple servers"""
def __init__(self):
self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
self.urlopener = urllib.FancyURLopener()
self.urlopener = urllib.request.FancyURLopener()
self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)"
_debuglog(self.urlopener.version)
self.urlopener.addheaders = [('User-Agent', self.urlopener.version)]
# self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')]
robotparser.URLopener.version = self.urlopener.version
robotparser.URLopener.addheaders = self.urlopener.addheaders
urllib.robotparser.URLopener.version = self.urlopener.version
urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders
def _getrp(self, url):
protocol, domain = urlparse.urlparse(url)[:2]
if self.rpcache.has_key(domain):
protocol, domain = urllib.parse.urlparse(url)[:2]
if domain in self.rpcache:
return self.rpcache[domain]
baseurl = '%s://%s' % (protocol, domain)
robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt')
_debuglog('fetching %s' % robotsurl)
rp = robotparser.RobotFileParser(robotsurl)
rp = urllib.robotparser.RobotFileParser(robotsurl)
try:
rp.read()
except:
@@ -119,7 +119,7 @@ class BaseParser(sgmllib.SGMLParser):
def normalize_attrs(self, attrs):
def cleanattr(v):
v = sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v)
v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v)
if not v: return
v = v.strip()
v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
@@ -130,7 +130,7 @@ class BaseParser(sgmllib.SGMLParser):
def do_base(self, attrs):
attrsD = dict(self.normalize_attrs(attrs))
if not attrsD.has_key('href'): return
if 'href' not in attrsD: return
self.baseuri = attrsD['href']
def error(self, *a, **kw): pass # we're not picky
@@ -143,18 +143,18 @@ class LinkParser(BaseParser):
'application/x-atom+xml')
def do_link(self, attrs):
attrsD = dict(self.normalize_attrs(attrs))
if not attrsD.has_key('rel'): return
if 'rel' not in attrsD: return
rels = attrsD['rel'].split()
if 'alternate' not in rels: return
if attrsD.get('type') not in self.FEED_TYPES: return
if not attrsD.has_key('href'): return
self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
if 'href' not in attrsD: return
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
class ALinkParser(BaseParser):
def start_a(self, attrs):
attrsD = dict(self.normalize_attrs(attrs))
if not attrsD.has_key('href'): return
self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))
if 'href' not in attrsD: return
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
def makeFullURI(uri):
if not uri: return
@@ -218,7 +218,7 @@ def couldBeFeedData(data):
def isFeed(uri):
_debuglog('seeing if %s is a feed' % uri)
protocol = urlparse.urlparse(uri)
protocol = urllib.parse.urlparse(uri)
if protocol[0] not in ('http', 'https'): return 0
try:
data = _gatekeeper.get(uri, check=False)
@@ -233,7 +233,7 @@ def sortFeeds(feed1Info, feed2Info):
def getFeedsFromSyndic8(uri):
feeds = []
try:
server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
server = xmlrpc.client.Server('http://www.syndic8.com/xmlrpc.php')
feedids = server.syndic8.FindFeeds(uri)
infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
infolist.sort(sortFeeds)
@@ -270,7 +270,7 @@ def feeds(uri, all=False, querySyndic8=False, _recurs=None):
except:
outfeeds = []
_debuglog('found %s feeds through LINK tags' % len(outfeeds))
outfeeds = filter(isFeed, outfeeds)
outfeeds = list(filter(isFeed, outfeeds))
if all or not outfeeds:
# no LINK tags, look for regular <A> links that point to feeds
_debuglog('no LINK tags, looking at A tags')
@@ -281,16 +281,16 @@ def feeds(uri, all=False, querySyndic8=False, _recurs=None):
_debuglog('no LINK tags, looking at local links')
locallinks = getLocalLinks(links, fulluri)
# look for obvious feed links on the same server
outfeeds.extend(filter(isFeed, filter(isFeedLink, locallinks)))
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks)))))
if all or not outfeeds:
# look harder for feed links on the same server
outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, locallinks)))
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks)))))
if all or not outfeeds:
# look for obvious feed links on another server
outfeeds.extend(filter(isFeed, filter(isFeedLink, links)))
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links)))))
if all or not outfeeds:
# look harder for feed links on another server
outfeeds.extend(filter(isFeed, filter(isXMLRelatedLink, links)))
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links)))))
if all or not outfeeds:
_debuglog('no A tags, guessing')
suffixes = [ # filenames used by popular software:
@@ -302,12 +302,12 @@ def feeds(uri, all=False, querySyndic8=False, _recurs=None):
'index.xml', # MT
'index.rss' # Slash
]
outfeeds.extend(filter(isFeed, [urlparse.urljoin(fulluri, x) for x in suffixes]))
outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])))
if (all or not outfeeds) and querySyndic8:
# still no luck, search Syndic8 for feeds (requires xmlrpclib)
_debuglog('still no luck, searching Syndic8')
outfeeds.extend(getFeedsFromSyndic8(uri))
if hasattr(__builtins__, 'set') or __builtins__.has_key('set'):
if hasattr(__builtins__, 'set') or 'set' in __builtins__:
outfeeds = list(set(outfeeds))
return outfeeds
@@ -317,7 +317,7 @@ def feed(uri):
#todo: give preference to certain feed formats
feedlist = feeds(uri)
if feedlist:
feeds_no_comments = filter(lambda f: 'comments' not in f.lower(), feedlist)
feeds_no_comments = [f for f in feedlist if 'comments' not in f.lower()]
if feeds_no_comments:
return feeds_no_comments[0]
return feedlist[0]
@@ -338,25 +338,25 @@ def test():
count += 1
links = getLinks(data, uri)
if not links:
print '\n*** FAILED ***', uri, 'could not find link'
print(('\n*** FAILED ***', uri, 'could not find link'))
failed.append(uri)
elif len(links) > 1:
print '\n*** FAILED ***', uri, 'found too many links'
print(('\n*** FAILED ***', uri, 'found too many links'))
failed.append(uri)
else:
atomdata = urllib.urlopen(links[0]).read()
atomdata = urllib.request.urlopen(links[0]).read()
if atomdata.find('<link rel="alternate"') == -1:
print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
print(('\n*** FAILED ***', uri, 'retrieved something that is not a feed'))
failed.append(uri)
else:
backlink = atomdata.split('href="').pop().split('"')[0]
if backlink != uri:
print '\n*** FAILED ***', uri, 'retrieved wrong feed'
print(('\n*** FAILED ***', uri, 'retrieved wrong feed'))
failed.append(uri)
if data.find('<link rel="next" href="') == -1: break
uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
print
print count, 'tests executed,', len(failed), 'failed'
uri = urllib.parse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
print()
print((count, 'tests executed,', len(failed), 'failed'))
if __name__ == '__main__':
args = sys.argv[1:]
@@ -370,4 +370,4 @@ if __name__ == '__main__':
if uri == 'test':
test()
else:
print "\n".join(getFeeds(uri))
print(("\n".join(getFeeds(uri))))

373
utils/feedfinder.py.bak Normal file
View file

@@ -0,0 +1,373 @@
"""feedfinder: Find the Web feed for a Web page
http://www.aaronsw.com/2002/feedfinder/
Usage:
feed(uri) - returns feed found for a URI
feeds(uri) - returns all feeds found for a URI
>>> import feedfinder
>>> feedfinder.feed('scripting.com')
'http://scripting.com/rss.xml'
>>>
>>> feedfinder.feeds('scripting.com')
['http://delong.typepad.com/sdj/atom.xml',
'http://delong.typepad.com/sdj/index.rdf',
'http://delong.typepad.com/sdj/rss.xml']
>>>
Can also use from the command line. Feeds are returned one per line:
$ python feedfinder.py diveintomark.org
http://diveintomark.org/xml/atom.xml
How it works:
0. At every step, feeds are minimally verified to make sure they are really feeds.
1. If the URI points to a feed, it is simply returned; otherwise
the page is downloaded and the real fun begins.
2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or
".atom"
4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or
".atom"
6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
7. Try some guesses about common places for feeds (index.xml, atom.xml, etc.).
8. As a last ditch effort, we search Syndic8 for feeds matching the URI
"""
__version__ = "1.371"
__date__ = "2006-04-24"
__maintainer__ = "Aaron Swartz (me@aaronsw.com)"
__author__ = "Mark Pilgrim (http://diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim; 2006 Aaron Swartz"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
_debug = 0
import sgmllib, urllib.request, urllib.parse, urllib.error, urllib.parse, re, sys, urllib.robotparser
import requests
from io import StringIO
from lxml import etree
# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
import xmlrpc.client # http://www.pythonware.com/products/xmlrpc/
except ImportError:
xmlrpclib = None
if not dict:
def dict(aList):
rc = {}
for k, v in aList:
rc[k] = v
return rc
def _debuglog(message):
if _debug: print(message)
class URLGatekeeper:
"""a class to track robots.txt rules across multiple servers"""
def __init__(self):
self.rpcache = {} # a dictionary of RobotFileParser objects, by domain
self.urlopener = urllib.request.FancyURLopener()
self.urlopener.version = "NewsBlur Feed Finder (Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3)"
_debuglog(self.urlopener.version)
self.urlopener.addheaders = [('User-Agent', self.urlopener.version)]
# self.urlopener.addheaders = [('User-Agent', self.urlopener.version), ('Accept', '*')]
urllib.robotparser.URLopener.version = self.urlopener.version
urllib.robotparser.URLopener.addheaders = self.urlopener.addheaders
def _getrp(self, url):
protocol, domain = urllib.parse.urlparse(url)[:2]
if domain in self.rpcache:
return self.rpcache[domain]
baseurl = '%s://%s' % (protocol, domain)
robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt')
_debuglog('fetching %s' % robotsurl)
rp = urllib.robotparser.RobotFileParser(robotsurl)
try:
rp.read()
except:
pass
self.rpcache[domain] = rp
return rp
def can_fetch(self, url):
rp = self._getrp(url)
allow = rp.can_fetch(self.urlopener.version, url)
_debuglog("gatekeeper of %s says %s" % (url, allow))
return allow
def get(self, url, check=False):
if check and not self.can_fetch(url): return ''
try:
return requests.get(url, headers=dict(self.urlopener.addheaders)).content
except:
return ''
_gatekeeper = URLGatekeeper()
class BaseParser(sgmllib.SGMLParser):
def __init__(self, baseuri):
sgmllib.SGMLParser.__init__(self)
self.links = []
self.baseuri = baseuri
def normalize_attrs(self, attrs):
def cleanattr(v):
v = sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v)
if not v: return
v = v.strip()
v = v.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
return v
attrs = [(k.lower(), cleanattr(v)) for k, v in attrs if cleanattr(v)]
attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs if cleanattr(v)]
return attrs
def do_base(self, attrs):
attrsD = dict(self.normalize_attrs(attrs))
if 'href' not in attrsD: return
self.baseuri = attrsD['href']
def error(self, *a, **kw): pass # we're not picky
class LinkParser(BaseParser):
FEED_TYPES = ('application/rss+xml',
'text/xml',
'application/atom+xml',
'application/x.atom+xml',
'application/x-atom+xml')
def do_link(self, attrs):
attrsD = dict(self.normalize_attrs(attrs))
if 'rel' not in attrsD: return
rels = attrsD['rel'].split()
if 'alternate' not in rels: return
if attrsD.get('type') not in self.FEED_TYPES: return
if 'href' not in attrsD: return
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
class ALinkParser(BaseParser):
def start_a(self, attrs):
attrsD = dict(self.normalize_attrs(attrs))
if 'href' not in attrsD: return
self.links.append(urllib.parse.urljoin(self.baseuri, attrsD['href']))
def makeFullURI(uri):
if not uri: return
uri = uri.strip()
if uri.startswith('feed://'):
uri = 'http://' + uri.split('feed://', 1).pop()
for x in ['http', 'https']:
if uri.startswith('%s://' % x):
return uri
return 'http://%s' % uri
def getLinks(data, baseuri):
p = LinkParser(baseuri)
p.feed(data)
return p.links
def getLinksLXML(data, baseuri):
parser = etree.HTMLParser(recover=True)
tree = etree.parse(StringIO(data), parser)
links = []
for link in tree.findall('.//link'):
if link.attrib.get('type') in LinkParser.FEED_TYPES:
href = link.attrib['href']
if href: links.append(href)
return links
def getALinks(data, baseuri):
p = ALinkParser(baseuri)
p.feed(data)
return p.links
def getLocalLinks(links, baseuri):
found_links = []
if not baseuri: return found_links
baseuri = baseuri.lower()
for l in links:
try:
if l.lower().startswith(baseuri):
found_links.append(l)
except (AttributeError, UnicodeDecodeError):
pass
return found_links
def isFeedLink(link):
return link[-4:].lower() in ('.rss', '.rdf', '.xml', '.atom')
def isXMLRelatedLink(link):
link = link.lower()
return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')
r_brokenRedirect = re.compile('<newLocation[^>]*>(.*?)</newLocation>', re.S)
def tryBrokenRedirect(data):
if '<newLocation' in data:
newuris = r_brokenRedirect.findall(data)
if newuris and newuris[0]: return newuris[0].strip()
def couldBeFeedData(data):
data = data.lower()
if data.count('<html'): return 0
return data.count('<rss') + data.count('<rdf') + data.count('<feed')
def isFeed(uri):
_debuglog('seeing if %s is a feed' % uri)
protocol = urllib.parse.urlparse(uri)
if protocol[0] not in ('http', 'https'): return 0
try:
data = _gatekeeper.get(uri, check=False)
except (KeyError, UnicodeDecodeError):
return False
count = couldBeFeedData(data)
return count
def sortFeeds(feed1Info, feed2Info):
return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])
def getFeedsFromSyndic8(uri):
feeds = []
try:
server = xmlrpc.client.Server('http://www.syndic8.com/xmlrpc.php')
feedids = server.syndic8.FindFeeds(uri)
infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
infolist.sort(sortFeeds)
feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
_debuglog('found %s feeds through Syndic8' % len(feeds))
except:
pass
return feeds
def feeds(uri, all=False, querySyndic8=False, _recurs=None):
if _recurs is None: _recurs = [uri]
fulluri = makeFullURI(uri)
try:
data = _gatekeeper.get(fulluri, check=False)
except:
return []
# is this already a feed?
if couldBeFeedData(data):
return [fulluri]
newuri = tryBrokenRedirect(data)
if newuri and newuri not in _recurs:
_recurs.append(newuri)
return feeds(newuri, all=all, querySyndic8=querySyndic8, _recurs=_recurs)
# nope, it's a page, try LINK tags first
_debuglog('looking for LINK tags')
try:
outfeeds = getLinks(data, fulluri)
except:
outfeeds = []
if not outfeeds:
_debuglog('using lxml to look for LINK tags')
try:
outfeeds = getLinksLXML(data, fulluri)
except:
outfeeds = []
_debuglog('found %s feeds through LINK tags' % len(outfeeds))
outfeeds = list(filter(isFeed, outfeeds))
if all or not outfeeds:
# no LINK tags, look for regular <A> links that point to feeds
_debuglog('no LINK tags, looking at A tags')
try:
links = getALinks(data, fulluri)
except:
links = []
_debuglog('no LINK tags, looking at local links')
locallinks = getLocalLinks(links, fulluri)
# look for obvious feed links on the same server
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, locallinks)))))
if all or not outfeeds:
# look harder for feed links on the same server
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, locallinks)))))
if all or not outfeeds:
# look for obvious feed links on another server
outfeeds.extend(list(filter(isFeed, list(filter(isFeedLink, links)))))
if all or not outfeeds:
# look harder for feed links on another server
outfeeds.extend(list(filter(isFeed, list(filter(isXMLRelatedLink, links)))))
if all or not outfeeds:
_debuglog('no A tags, guessing')
suffixes = [ # filenames used by popular software:
'feed/', # obvious
'atom.xml', # blogger, TypePad
'index.atom', # MT, apparently
'index.rdf', # MT
'rss.xml', # Dave Winer/Manila
'index.xml', # MT
'index.rss' # Slash
]
outfeeds.extend(list(filter(isFeed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])))
if (all or not outfeeds) and querySyndic8:
# still no luck, search Syndic8 for feeds (requires xmlrpclib)
_debuglog('still no luck, searching Syndic8')
outfeeds.extend(getFeedsFromSyndic8(uri))
if hasattr(__builtins__, 'set') or 'set' in __builtins__:
outfeeds = list(set(outfeeds))
return outfeeds
getFeeds = feeds # backwards-compatibility
def feed(uri):
#todo: give preference to certain feed formats
feedlist = feeds(uri)
if feedlist:
feeds_no_comments = [f for f in feedlist if 'comments' not in f.lower()]
if feeds_no_comments:
return feeds_no_comments[0]
return feedlist[0]
else:
return None
##### test harness ######
def test():
uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
failed = []
count = 0
while 1:
data = _gatekeeper.get(uri)
if data.find('Atom autodiscovery test') == -1: break
sys.stdout.write('.')
sys.stdout.flush()
count += 1
links = getLinks(data, uri)
if not links:
print('\n*** FAILED ***', uri, 'could not find link')
failed.append(uri)
elif len(links) > 1:
print('\n*** FAILED ***', uri, 'found too many links')
failed.append(uri)
else:
atomdata = urllib.request.urlopen(links[0]).read()
if atomdata.find('<link rel="alternate"') == -1:
print('\n*** FAILED ***', uri, 'retrieved something that is not a feed')
failed.append(uri)
else:
backlink = atomdata.split('href="').pop().split('"')[0]
if backlink != uri:
print('\n*** FAILED ***', uri, 'retrieved wrong feed')
failed.append(uri)
if data.find('<link rel="next" href="') == -1: break
uri = urllib.parse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
print()
print(count, 'tests executed,', len(failed), 'failed')
if __name__ == '__main__':
args = sys.argv[1:]
if args and args[0] == '--debug':
_debug = 1
args.pop(0)
if args:
uri = args[0]
else:
uri = 'http://diveintomark.org/'
if uri == 'test':
test()
else:
print("\n".join(getFeeds(uri)))

View file

@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
__version__ = "0.0.3"
@@ -145,7 +145,7 @@ def url_feed_prob(url):
if "georss" in url:
return -1
kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
for p, t in zip(range(len(kw), 0, -1), kw):
for p, t in zip(list(range(len(kw), 0, -1)), kw):
if t in url:
return p
return 0
@@ -156,10 +156,10 @@ def sort_urls(feeds):
if __name__ == "__main__":
print(find_feeds("www.preposterousuniverse.com/blog/"))
print(find_feeds("http://xkcd.com"))
print(find_feeds("dan.iel.fm/atom.xml"))
print(find_feeds("dan.iel.fm", check_all=True))
print(find_feeds("kapadia.github.io"))
print(find_feeds("blog.jonathansick.ca"))
print(find_feeds("asdasd"))
print((find_feeds("www.preposterousuniverse.com/blog/")))
print((find_feeds("http://xkcd.com")))
print((find_feeds("dan.iel.fm/atom.xml")))
print((find_feeds("dan.iel.fm", check_all=True)))
print((find_feeds("kapadia.github.io")))
print((find_feeds("blog.jonathansick.ca")))
print((find_feeds("asdasd")))

165
utils/feedfinder2.py.bak Executable file
View file

@@ -0,0 +1,165 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__version__ = "0.0.3"
try:
__FEEDFINDER2_SETUP__
except NameError:
__FEEDFINDER2_SETUP__ = False
if not __FEEDFINDER2_SETUP__:
__all__ = ["find_feeds"]
import logging
import requests
from BeautifulSoup import BeautifulSoup
from six.moves.urllib import parse as urlparse
def coerce_url(url):
url = url.strip()
if url.startswith("feed://"):
return "http://{0}".format(url[7:])
for proto in ["http://", "https://"]:
if url.startswith(proto):
return url
return "http://{0}".format(url)
class FeedFinder(object):
def __init__(self, user_agent=None):
if user_agent is None:
user_agent = "NewsBlur Feed Finder"
self.user_agent = user_agent
def get_feed(self, url, skip_user_agent=False):
try:
r = requests.get(url, headers={"User-Agent": self.user_agent if not skip_user_agent else None})
except Exception as e:
logging.warn("Error while getting '{0}'".format(url))
logging.warn("{0}".format(e))
return None
if not skip_user_agent and r.status_code == 403:
return self.get_feed(url, skip_user_agent=True)
return r.text
def is_feed_data(self, text):
data = text.lower()
if data and data[:100].count("<html"):
return False
return data.count("<rss")+data.count("<rdf")+data.count("<feed")+data.count("jsonfeed.org")
def is_feed(self, url):
text = self.get_feed(url)
if text is None:
return False
return self.is_feed_data(text)
def is_feed_url(self, url):
return any(map(url.lower().endswith,
[".rss", ".rdf", ".xml", ".atom", ".json"]))
def is_feedlike_url(self, url):
return any(map(url.lower().count,
["rss", "rdf", "xml", "atom", "feed", "json"]))
def find_feeds(url, check_all=False, user_agent=None):
finder = FeedFinder(user_agent=user_agent)
# Format the URL properly.
url = coerce_url(url)
# Download the requested URL.
feed_text = finder.get_feed(url)
if feed_text is None:
return []
# Check if it is already a feed.
if finder.is_feed_data(feed_text):
return [url]
# Look for <link> tags.
logging.info("Looking for <link> tags.")
try:
tree = BeautifulSoup(feed_text)
except ValueError:
return []
links = []
for link in tree.findAll("link"):
if link.get("type") in ["application/rss+xml",
"text/xml",
"application/atom+xml",
"application/x.atom+xml",
"application/x-atom+xml",
"application/json"]:
links.append(urlparse.urljoin(url, link.get("href", "")))
# Check the detected links.
urls = list(filter(finder.is_feed, links))
logging.info("Found {0} feed <link> tags.".format(len(urls)))
if len(urls) and not check_all:
return sort_urls(urls)
# Look for <a> tags.
logging.info("Looking for <a> tags.")
local, remote = [], []
for a in tree.findAll("a"):
href = a.get("href", None)
if href is None:
continue
if "://" not in href and finder.is_feed_url(href):
local.append(href)
if finder.is_feedlike_url(href):
remote.append(href)
# Check the local URLs.
local = [urlparse.urljoin(url, l) for l in local]
urls += list(filter(finder.is_feed, local))
logging.info("Found {0} local <a> links to feeds.".format(len(urls)))
if len(urls) and not check_all:
return sort_urls(urls)
# Check the remote URLs.
remote = [urlparse.urljoin(url, l) for l in remote]
urls += list(filter(finder.is_feed, remote))
logging.info("Found {0} remote <a> links to feeds.".format(len(urls)))
if len(urls) and not check_all:
return sort_urls(urls)
# Guessing potential URLs.
fns = ["atom.xml", "index.atom", "index.rdf", "rss.xml", "index.xml",
"index.rss", "index.json"]
urls += list(filter(finder.is_feed, [urlparse.urljoin(url, f)
for f in fns]))
return sort_urls(urls)
def url_feed_prob(url):
if "comments" in url:
return -2
if "georss" in url:
return -1
kw = ["atom", "rss", "rdf", ".xml", "feed", "json"]
for p, t in zip(list(range(len(kw), 0, -1)), kw):
if t in url:
return p
return 0
def sort_urls(feeds):
return sorted(list(set(feeds)), key=url_feed_prob, reverse=True)
if __name__ == "__main__":
print(find_feeds("www.preposterousuniverse.com/blog/"))
print(find_feeds("http://xkcd.com"))
print(find_feeds("dan.iel.fm/atom.xml"))
print(find_feeds("dan.iel.fm", check_all=True))
print(find_feeds("kapadia.github.io"))
print(find_feeds("blog.jonathansick.ca"))
print(find_feeds("asdasd"))

View file

@@ -3,7 +3,7 @@
from PIL import Image
from PIL import ImageOps as PILOps
from PIL.ExifTags import TAGS
from StringIO import StringIO
from io import StringIO
from vendor import reseekfile
PROFILE_PICTURE_SIZES = {

180
utils/json_functions.py.bak Normal file
View file

@@ -0,0 +1,180 @@
#-*- coding: utf-8 -*-
from django.db import models
from django.utils.functional import Promise
from django.utils.encoding import force_unicode, smart_unicode
import json
from decimal import Decimal
from django.core import serializers
from django.conf import settings
from django.http import HttpResponse, HttpResponseForbidden, Http404
from django.core.mail import mail_admins
from django.db.models.query import QuerySet
from mongoengine.queryset.queryset import QuerySet as MongoQuerySet
from bson.objectid import ObjectId
import sys
import datetime
def decode(data):
if not data:
return data
return json.loads(data)
def encode(data, *args, **kwargs):
if type(data) == QuerySet: # Careful, ValuesQuerySet is a dict
# Django models
return serializers.serialize("json", data, *args, **kwargs)
else:
return json_encode(data, *args, **kwargs)
def json_encode(data, *args, **kwargs):
"""
The main issues with django's default json serializer is that properties that
had been added to an object dynamically are being ignored (and it also has
problems with some models).
"""
def _any(data):
ret = None
# Opps, we used to check if it is of type list, but that fails
# i.e. in the case of django.newforms.utils.ErrorList, which extends
# the type "list". Oh man, that was a dumb mistake!
if hasattr(data, 'canonical'):
ret = _any(data.canonical())
elif isinstance(data, list):
ret = _list(data)
elif isinstance(data, set):
ret = _list(list(data))
# Same as for lists above.
elif isinstance(data, dict):
ret = _dict(data)
elif isinstance(data, (Decimal, ObjectId)):
# json.dumps() cant handle Decimal
ret = str(data)
elif isinstance(data, models.query.QuerySet):
# Actually its the same as a list ...
ret = _list(data)
elif isinstance(data, MongoQuerySet):
# Actually its the same as a list ...
ret = _list(data)
elif isinstance(data, models.Model):
ret = _model(data)
# here we need to encode the string as unicode (otherwise we get utf-16 in the json-response)
elif isinstance(data, basestring):
ret = smart_unicode(data)
elif isinstance(data, Exception):
ret = unicode(data)
# see http://code.djangoproject.com/ticket/5868
elif isinstance(data, Promise):
ret = force_unicode(data)
elif isinstance(data, datetime.datetime) or isinstance(data, datetime.date):
ret = str(data)
elif hasattr(data, 'to_json'):
ret = data.to_json()
else:
ret = data
return ret
def _model(data):
ret = {}
# If we only have a model, we only want to encode the fields.
for f in data._meta.fields:
ret[f.attname] = _any(getattr(data, f.attname))
# And additionally encode arbitrary properties that had been added.
fields = dir(data.__class__) + ret.keys()
add_ons = [k for k in dir(data) if k not in fields]
for k in add_ons:
ret[k] = _any(getattr(data, k))
return ret
def _list(data):
ret = []
for v in data:
ret.append(_any(v))
return ret
def _dict(data):
ret = {}
for k, v in data.items():
ret[str(k)] = _any(v)
return ret
if hasattr(data, 'to_json'):
data = data.to_json()
ret = _any(data)
return json.dumps(ret)
def json_view(func):
def wrap(request, *a, **kw):
response = func(request, *a, **kw)
return json_response(request, response)
if isinstance(func, HttpResponse):
return func
else:
return wrap
def json_response(request, response=None):
code = 200
if isinstance(response, HttpResponseForbidden):
return response
try:
if isinstance(response, dict):
response = dict(response)
if 'result' not in response:
response['result'] = 'ok'
authenticated = request.user.is_authenticated
response['authenticated'] = authenticated
if authenticated:
response['user_id'] = request.user.pk
except KeyboardInterrupt:
# Allow keyboard interrupts through for debugging.
raise
except Http404:
raise Http404
except Exception as e:
# Mail the admins with the error
exc_info = sys.exc_info()
subject = 'JSON view error: %s' % request.path
try:
request_repr = repr(request)
except:
request_repr = 'Request repr() unavailable'
import traceback
message = 'Traceback:\n%s\n\nRequest:\n%s' % (
'\n'.join(traceback.format_exception(*exc_info)),
request_repr,
)
response = {'result': 'error',
'text': unicode(e)}
code = 500
if not settings.DEBUG:
mail_admins(subject, message, fail_silently=True)
else:
print '\n'.join(traceback.format_exception(*exc_info))
json = json_encode(response)
return HttpResponse(json, content_type='application/json', status=code)
def main():
test = {
1: True,
2: u"string",
3: 30,
4: u"юнікод, ўўў, © ™ ® ё ² § $ ° ќо́",
5: "utf-8: \xd1\x9e, \xc2\xa9 \xe2\x84\xa2 \xc2\xae \xd1\x91 \xd0\xba\xcc\x81\xd0\xbe\xcc\x81",
}
json_test = json_encode(test)
print test, json_test
if __name__ == '__main__':
main()
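json_encode() above hand-walks the data structure so that Decimal, ObjectId, dates and lazy Django objects serialize cleanly. For comparison, a minimal sketch of the same idea using a json.JSONEncoder subclass; this is an illustration, not the project's encoder:

    import json
    import datetime
    from decimal import Decimal

    class SketchEncoder(json.JSONEncoder):
        """Fallback conversion for the non-JSON-native types handled above."""
        def default(self, obj):
            if isinstance(obj, (Decimal, datetime.datetime, datetime.date)):
                return str(obj)
            if hasattr(obj, 'to_json'):
                return obj.to_json()
            return super(SketchEncoder, self).default(obj)

    json.dumps({'price': Decimal('9.99'), 'when': datetime.date(2020, 6, 13)},
               cls=SketchEncoder)
    # '{"price": "9.99", "when": "2020-06-13"}'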

View file

@@ -5,7 +5,7 @@ import time
from django.core.handlers.wsgi import WSGIRequest
from django.conf import settings
from django.utils.encoding import smart_unicode
from django.utils.encoding import smart_str
from user_functions import extract_user_agent
from apps.statistics.rstats import RStats
@@ -22,7 +22,7 @@ def getlogger():
def user(u, msg, request=None, warn_color=True):
msg = smart_unicode(msg)
msg = smart_text(msg)
if not u:
return debug(msg)
@@ -72,19 +72,19 @@ def cipher(msg):
def debug(msg):
msg = smart_unicode(msg)
msg = smart_text(msg)
logger = getlogger()
logger.debug(colorize(msg))
def info(msg):
msg = smart_unicode(msg)
msg = smart_text(msg)
logger = getlogger()
logger.info(colorize(msg))
def error(msg):
msg = smart_unicode(msg)
msg = smart_text(msg)
logger = getlogger()
logger.error(msg)

View file

@@ -49,7 +49,7 @@ def list_backup_in_s3():
bucket = conn.get_bucket(BUCKET_NAME)
for i, key in enumerate(bucket.get_all_keys()):
print "[%s] %s" % (i, key.name)
print("[%s] %s" % (i, key.name))
def delete_all_backups():
#FIXME: validate filename exists
@@ -57,13 +57,13 @@ def delete_all_backups():
bucket = conn.get_bucket(BUCKET_NAME)
for i, key in enumerate(bucket.get_all_keys()):
print "deleting %s" % (key.name)
print("deleting %s" % (key.name))
key.delete()
if __name__ == '__main__':
import sys
if len(sys.argv) < 3:
print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
print('Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0]))
else:
if sys.argv[1] == 'set':
save_file_in_s3(sys.argv[2])
@@ -74,7 +74,7 @@ if __name__ == '__main__':
elif sys.argv[1] == 'delete':
delete_all_backups()
else:
print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
print('Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0]))
class S3Store:

148
utils/s3_utils.py.bak Normal file
View file

@@ -0,0 +1,148 @@
import os
import sys
import time
import mimetypes
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from utils.image_functions import ImageOps
if '/srv/newsblur' not in ' '.join(sys.path):
sys.path.append("/srv/newsblur")
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
from django.conf import settings
ACCESS_KEY = settings.S3_ACCESS_KEY
SECRET = settings.S3_SECRET
BUCKET_NAME = settings.S3_BACKUP_BUCKET # Note that you need to create this bucket first
import ssl
_old_match_hostname = ssl.match_hostname
def _new_match_hostname(cert, hostname):
if hostname.endswith('.s3.amazonaws.com'):
pos = hostname.find('.s3.amazonaws.com')
hostname = hostname[:pos].replace('.', '') + hostname[pos:]
return _old_match_hostname(cert, hostname)
ssl.match_hostname = _new_match_hostname
def save_file_in_s3(filename):
conn = S3Connection(ACCESS_KEY, SECRET)
bucket = conn.get_bucket(BUCKET_NAME)
k = Key(bucket)
k.key = filename
k.set_contents_from_filename(filename)
def get_file_from_s3(filename):
conn = S3Connection(ACCESS_KEY, SECRET)
bucket = conn.get_bucket(BUCKET_NAME)
k = Key(bucket)
k.key = filename
k.get_contents_to_filename(filename)
def list_backup_in_s3():
conn = S3Connection(ACCESS_KEY, SECRET)
bucket = conn.get_bucket(BUCKET_NAME)
for i, key in enumerate(bucket.get_all_keys()):
print "[%s] %s" % (i, key.name)
def delete_all_backups():
#FIXME: validate filename exists
conn = S3Connection(ACCESS_KEY, SECRET)
bucket = conn.get_bucket(BUCKET_NAME)
for i, key in enumerate(bucket.get_all_keys()):
print "deleting %s" % (key.name)
key.delete()
if __name__ == '__main__':
import sys
if len(sys.argv) < 3:
print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
else:
if sys.argv[1] == 'set':
save_file_in_s3(sys.argv[2])
elif sys.argv[1] == 'get':
get_file_from_s3(sys.argv[2])
elif sys.argv[1] == 'list':
list_backup_in_s3()
elif sys.argv[1] == 'delete':
delete_all_backups()
else:
print 'Usage: %s <get/set/list/delete> <backup_filename>' % (sys.argv[0])
class S3Store:
def __init__(self, bucket_name=settings.S3_AVATARS_BUCKET_NAME):
if settings.DEBUG:
import ssl
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
# Legacy Python that doesn't verify HTTPS certificates by default
pass
else:
# Handle target environment that doesn't support HTTPS verification
ssl._create_default_https_context = _create_unverified_https_context
self.s3 = S3Connection(ACCESS_KEY, SECRET)
self.bucket = self.create_bucket(bucket_name)
def create_bucket(self, bucket_name):
return self.s3.create_bucket(bucket_name)
def save_profile_picture(self, user_id, filename, image_body):
content_type, extension = self._extract_content_type(filename)
if not content_type or not extension:
return
image_name = 'profile_%s.%s' % (int(time.time()), extension)
image = ImageOps.resize_image(image_body, 'fullsize', fit_to_size=False)
if image:
key = 'avatars/%s/large_%s' % (user_id, image_name)
self._save_object(key, image, content_type=content_type)
image = ImageOps.resize_image(image_body, 'thumbnail', fit_to_size=True)
if image:
key = 'avatars/%s/thumbnail_%s' % (user_id, image_name)
self._save_object(key, image, content_type=content_type)
return image and image_name
def _extract_content_type(self, filename):
content_type = mimetypes.guess_type(filename)[0]
extension = None
if content_type == 'image/jpeg':
extension = 'jpg'
elif content_type == 'image/png':
extension = 'png'
elif content_type == 'image/gif':
extension = 'gif'
return content_type, extension
def _make_key(self):
return Key(bucket=self.bucket)
def _save_object(self, key, file_object, content_type=None):
k = self._make_key()
k.key = key
file_object.seek(0)
if content_type:
k.set_contents_from_file(file_object, headers={
'Content-Type': content_type,
})
else:
k.set_contents_from_file(file_object)
k.set_acl('public-read')
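The module-level helpers above wrap boto's S3Connection/Key API for backups. A usage sketch (the filename is hypothetical, and Django settings must already be configured the way the module expects at import time):

    from utils.s3_utils import save_file_in_s3, get_file_from_s3, list_backup_in_s3

    save_file_in_s3('backup.tgz')    # uploads the local file to S3_BACKUP_BUCKET
    list_backup_in_s3()              # prints "[index] key name" for every stored key
    get_file_from_s3('backup.tgz')   # downloads the key back to the same local path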